def test_detect_language_mixed_languages(self):
    # A text combining Spanish, German and English passages must be
    # reported as 'English' when checked against self.languages.
    sample = (
        " # spanish Lionel Andrés Messi Cuccittini (Rosario, 24 de junio de "
        "1987), conocido como Leo Messi, es un futbolista argentino11 que "
        "juega como delantero en el Fútbol Club Barcelona y en la selección "
        "argentina, de la que es capitán. "
        "# german Messi spielt seit seinem 14. Lebensjahr für den FC "
        "Barcelona. Mit 24 Jahren wurde er Rekordtorschütze des FC Barcelona, "
        "mit 25 der jüngste Spieler in der La-Liga-Geschichte, der 200 Tore "
        "erzielte. "
        "# english Four score and seven years ago our fathers brought forth "
        "on this continent, a new nation, conceived in Liberty, and dedicated "
        "to the proposition that all men are created equal. Now we are "
        "engaged in a great civil war, testing whether that nation, or any "
        "nation so conceived and so dedicated, can long endure. We are met on "
        "a great battle-field of that war. We have come to dedicate a portion "
        "of that field, as a final resting place for those who here gave "
        "their lives that that nation might live. "
    )
    self.assertEqual(detect_language(sample, self.languages), 'English')
def get_name_fields(layer, fields, field_types, min_completeness=0.8, min_uniqueness=0.8):
    """Return the fields of ``layer`` that look like columns of names.

    A field is kept when all of the following hold: its lower-cased name
    does not contain "pc", its type is "OFTString", the share of truthy
    values exceeds ``min_completeness``, the share of distinct values
    exceeds ``min_uniqueness``, it has more than 10 distinct values,
    ``isGibberish`` rejects the value set and ``isNameList`` accepts it.

    Args:
        layer: object handed to ``get_values_list`` to read a field's values.
        fields: field names; ``field_types[i]`` is the type of ``fields[i]``.
        field_types: type label per field.
        min_completeness: exclusive lower bound on the truthy-value ratio.
        min_uniqueness: exclusive lower bound on the distinct-value ratio.

    Returns:
        A list of dicts with keys "name", "uniqueness" and "language",
        sorted by descending uniqueness.
    """
    print("starting get_name_field with:")
    print(fields)
    print(field_types)

    name_fields = []
    for i, field in enumerate(fields):
        field_lower = field.lower()
        field_type = field_types[i]
        values = get_values_list(layer, field)

        # The ratios below divide by the number of values; a field without
        # any values can never qualify, so skip it instead of dividing by 0.
        number_of_values = len(values)
        if number_of_values == 0:
            continue

        values_set = set(values)
        number_of_unique_values = len(values_set)
        number_of_filled_values = len([value for value in values if value])
        completeness = float(number_of_filled_values) / float(number_of_values)
        uniqueness = float(number_of_unique_values) / float(number_of_values)

        # Cheap checks come first so the value-set predicates only run
        # for plausible candidates.
        if ("pc" not in field_lower
                and field_type == "OFTString"
                and completeness > min_completeness
                and uniqueness > min_uniqueness
                and not isGibberish(values_set)
                and number_of_unique_values > 10
                and isNameList(values_set)):
            language = detect_language(values)
            name_fields.append({
                "name": field,
                "uniqueness": uniqueness,
                "language": language
            })

    # sort namefields by uniqueness, most unique first
    name_fields = sorted(name_fields, key=lambda namefield: -1 * namefield["uniqueness"])
    print("name_fields are", name_fields)
    return name_fields
def post(self, request):
    # Bind the POSTed data to the form; the language is detected only when
    # the form validates, otherwise the template receives language=None.
    # NOTE(review): reads the raw form.data['phrase'] rather than
    # form.cleaned_data - confirm this is intentional.
    form = DetectLanguageForm(request.POST)
    language = detect_language(form.data['phrase']) if form.is_valid() else None
    context = {'form': form, 'language': language}
    return render(request, self.template_name, context)
def parse_conversation(self, response):
    """Scrapy callback: write one conversation to the corpus and follow its pages.

    Speakers and utterance texts are extracted from ``response``. When the
    page holds more than one utterance, every non-empty utterance that is
    not detected as English is wrapped in an ``<utt uid="...">`` element;
    the ``<s>...</s>`` block is written to ``self.text_file`` only when more
    than one utterance was kept. Finally a request is yielded for every
    pagination link, handled by this same callback.

    :param response: the scrapy response of a conversation page
    :return: generator of ``scrapy.Request`` objects for the other pages
    """
    speakers = response.css(
        'body > div.container > div.msgBlock > table > tr > td > b'
    ).extract()
    # Build a real list: on Python 3 ``filter`` returns an iterator, which
    # supports neither ``len()`` nor indexing.
    utterances = [
        text for text in response.css('td[id=post]::text').extract()
        if text != '\r' and '[Message edited by' not in text
    ]

    # Check if conversation contains more than 1 utterance
    if len(utterances) > 1:
        parts = ["<s>"]
        clean_utterances_count = 0
        # zip stops at the shorter sequence, so a surplus speaker can no
        # longer index past the end of ``utterances``.
        for speaker, utterance in zip(speakers, utterances):
            parsed_clean_utterance = self.clean_utterance(utterance)
            # Skip empty and whitespace-only utterances. (``"".isspace()``
            # is False, so testing ``isspace()`` alone lets "" through.)
            if not parsed_clean_utterance.strip():
                continue
            # NOTE(review): only utterances NOT detected as English are
            # kept - confirm this is the intended corpus language filter.
            if detect_language(parsed_clean_utterance) != 'English':
                clean_utterances_count += 1
                parts.append(
                    '<utt uid="' + str(self.generate_uid(speaker))
                    + '">' + parsed_clean_utterance + '</utt>')
        if clean_utterances_count > 1:
            parts.append("</s>\n")
            self.text_file.write("".join(parts))

    # Visit all pages of the current post
    next_page_urls = response.css(
        'body > div.container > div.msgBlock > table > tr.msgHeader > td > div.fnavhead > div.fnavnum > a::attr(href)'
    ).extract()
    for next_url in next_page_urls:
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_conversation)
def f_lan(s):
    """
    :param s: string to be processed
    :return: boolean (s is detected as English or French)
    """
    # French is accepted as well: some reviews are actually English but the
    # detection is biased toward French.
    accepted = {'English', 'French'}
    detected = detect_language(s)
    return detected in accepted
def generate_output(language):
    """Re-check the language of a cleaned tweet CSV and write the kept rows.

    Reads ``input/vw_clean_<code>.csv`` where ``<code>`` is
    ``language[0].lower()``. A row is kept when its text has more than one
    whitespace-separated token, ``lang_detector.detect_language`` returns
    the expected language ('english' for code 'en', otherwise 'spanish')
    and the text does not start with 'rt'. The text of kept rows is passed
    through ``preprocessor.process_tweet`` and has 'volkswagen' and 'vw'
    replaced by a space. The result is written to
    ``output/vw_clean_<code>_rechecked.csv``.

    On ``KeyboardInterrupt`` the rows collected so far are written and the
    process exits.

    :param language: sequence whose first element is the language code
    """
    input_lang = language[0].lower()
    vw_in = pd.read_csv('input/vw_clean_' + input_lang + '.csv')
    lang = 'english' if input_lang == 'en' else 'spanish'
    output_path = 'output/vw_clean_' + input_lang + '_rechecked.csv'
    print('Processing dataset in', lang)

    # Rows are collected in a list and turned into a frame once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    kept_rows = []

    def write_output():
        print('Generating file in', lang)
        pd.DataFrame(kept_rows).to_csv(output_path)

    try:
        for index, row in vw_in.iterrows():
            raw_text = str(row['text'])
            if len(raw_text.split()) <= 1:
                continue
            if lang_detector.detect_language(raw_text) != lang:
                continue
            if raw_text.lower().startswith('rt'):  # we avoid RT
                continue
            text = preprocessor.process_tweet(raw_text).replace('volkswagen', ' ').replace('vw', ' ')
            kept_rows.append({
                'id': row['id'],
                'created_at': row['created_at'],
                'name': row['name'],
                'screen_name': row['screen_name'],
                # NOTE(review): 'verified' is filled from followers_count;
                # this looks like a copy/paste slip - confirm against the
                # input schema before changing it.
                'verified': row['followers_count'],
                'friends_count': row['friends_count'],
                'text': text,
                'description': row['description'],
                'lang': row['lang'],
                'time_zone': row['time_zone'],
                'location': row['location'],
            })
    except KeyboardInterrupt:
        # Keep the partial result when the run is interrupted.
        write_output()
        sys.exit()

    write_output()
def f_lan(s):
    """
    :param s: string to be processed
    :return: boolean (s is English)
    """
    # some reviews are French, so only texts detected as English pass
    accepted = {'English'}
    detected = detect_language(s)
    return detected in accepted
def f_lan(s):
    """
    :param s: string to be processed
    :return: boolean (s is English)
    """
    # Some reviews are French but have been incorrectly labeled as EN, so
    # only 'English' is accepted (an alternative that also accepted
    # 'French' was left disabled).
    accepted = {'English'}
    detected = detect_language(s)
    return detected in accepted
def check_lang(self, text, lang_list={'English', 'French'}):
    """
    Function for language detection.

    @param text (string): text to be processed.
    @param lang_list (set): allowed languages for the text
        (Default: English + French). The default set is only read,
        never modified.
    @return boolean: True if the language detected for text is one of
        the languages in lang_list.
    """
    # Detect on the ``text`` parameter; the earlier body referenced an
    # undefined name ``s`` and raised NameError on every call.
    detected = detect_language(text)
    return detected in lang_list
def post(self, request):
    # Bind the POSTed data to the form; the language is detected only when
    # the form validates, otherwise the template receives language=None.
    # NOTE(review): reads the raw form.data['phrase'] rather than
    # form.cleaned_data - confirm this is intentional.
    form = DetectLanguageForm(request.POST)
    language = detect_language(form.data['phrase']) if form.is_valid() else None
    context = {'form': form, 'language': language}
    return render(request, self.template_name, context)
def spell_checker(text, depth):
    """Return "<word> - <correction>" entries for the words of ``text``.

    Each word from ``get_words_from_text(text)`` has its language detected;
    words whose language is 'unknown' are skipped. For the rest,
    ``get_correct_word(word, lang, depth)`` is consulted and an entry is
    recorded whenever its result differs from the word.
    """
    report = []
    for token in get_words_from_text(text):
        token_lang = detect_language(token)
        if token_lang == 'unknown':
            continue
        suggestion = get_correct_word(token, token_lang, depth)
        if suggestion != token:
            report.append(token + " - " + suggestion)
    return report
def clean(df):
    """Clean the 'tweet' and 'date' columns and keep only English tweets.

    Steps, in order: strip ``$``-prefixed and ``@``-prefixed tokens, drop
    rows whose 'date' equals the literal 'date', convert string dates with
    ``to_datetime``, drop the first column, remove URLs and punctuation,
    lower-case, collapse whitespace, blank out tweets not detected as
    English, then drop rows containing nulls.

    The steps before the first-column drop modify the frame passed in;
    from that point on a new frame is used.

    :param df: DataFrame with 'tweet' and 'date' columns
    :return: the cleaned DataFrame with a fresh 0..n-1 index
    """
    import re
    from language_detector import detect_language

    # remove $ and @mentions
    df['tweet'] = df['tweet'].map(lambda x: re.sub(r'\$[A-Za-z0-9]*', '', x))
    df['tweet'] = df['tweet'].map(lambda x: re.sub(r'\@[A-Za-z0-9]*', '', x))

    # to datetime format (rows whose date is the literal 'date' are dropped first)
    df.drop(df[df['date'] == 'date'].index, inplace=True)
    df['date'] = df['date'].apply(lambda x: to_datetime(x) if isinstance(x, str) else x)

    # drop first column
    df = df.drop(df.columns[0], axis=1)

    # remove https: links; regex=True is explicit because pandas >= 2.0
    # treats the pattern as a literal string by default
    df['tweet'] = df['tweet'].str.replace(
        r'https?://[^\s<>"]+|www\.[^\s<>"]+', "", regex=True)

    # remove punctuation
    df['tweet'] = df['tweet'].str.replace(r'[^\w\s]', '', regex=True)

    # to lowercase
    df['tweet'] = df['tweet'].map(lambda s: s.lower() if isinstance(s, str) else s)

    # delete line breaks; assigned back instead of an in-place replace on
    # the column selection, which is not guaranteed to update the frame
    df['tweet'] = df['tweet'].replace(r'\s+|\\n', ' ', regex=True)
    df = df.reset_index(drop=True)

    # drop all foreign languages: blank them, turn blanks into NaN, drop
    df['tweet'] = df['tweet'].map(lambda x: '' if detect_language(x) != 'English' else x)
    df['tweet'] = df['tweet'].replace('', np.nan)

    # drop nulls
    df = df.dropna()
    df = df.reset_index(drop=True)
    return df
def test_detect_language_english_with_module_language_specification(self):
    # English prose checked against the module-level LANGUAGES
    # specification must be reported as 'English'.
    sample = (
        " Shakespeare was born and brought up in Stratford-upon-Avon, "
        "Warwickshire. At the age of 18, he married Anne Hathaway, with whom "
        "he had three children: Susanna, and twins Hamnet and Judith. "
        "Sometime between 1585 and 1592, he began a successful career in "
        "London as an actor, writer, and part-owner of a playing company "
        "called the Lord Chamberlain's Men, later known as the King's Men. "
    )
    self.assertEqual(detect_language(sample, LANGUAGES), 'English')
def test_detect_language_english(self):
    # English text checked against self.languages must be reported as
    # 'English'.
    sample = (
        " Messi plays since he was 14 years for the FC Barcelona . At 24, he "
        "was top scorer of FC Barcelona , 25 the youngest player in La Liga "
        "history , the 200 goals scored . Meanwhile, Messi is the only player "
        "over 300 achieved top-flight and is therefore scorer La Liga . "
    )
    self.assertEqual(detect_language(sample, self.languages), 'English')
def test_detect_language_english(self):
    # English text checked against self.languages must be reported as
    # 'English'.
    sample = (
        " Messi plays since he was 14 years for the FC Barcelona . At 24, he "
        "was top scorer of FC Barcelona , 25 the youngest player in La Liga "
        "history , the 200 goals scored . Meanwhile, Messi is the only player "
        "over 300 achieved top-flight and is therefore scorer La Liga . "
    )
    self.assertEqual(detect_language(sample, self.languages), 'English')
def test_detect_language_german(self):
    # German text checked against self.languages must be reported as
    # 'German'.
    sample = (
        " Messi spielt seit seinem 14. Lebensjahr für den FC Barcelona. Mit "
        "24 Jahren wurde er Rekordtorschütze des FC Barcelona, mit 25 der "
        "jüngste Spieler in der La-Liga-Geschichte, der 200 Tore erzielte. "
        "Inzwischen hat Messi als einziger Spieler mehr als 300 Erstligatore "
        "erzielt und ist damit Rekordtorschütze der Primera División. "
    )
    self.assertEqual(detect_language(sample, self.languages), 'German')
def test_detect_language_german(self):
    # German text checked against self.languages must be reported as
    # 'German'.
    sample = (
        " Messi spielt seit seinem 14. Lebensjahr für den FC Barcelona. Mit "
        "24 Jahren wurde er Rekordtorschütze des FC Barcelona, mit 25 der "
        "jüngste Spieler in der La-Liga-Geschichte, der 200 Tore erzielte. "
        "Inzwischen hat Messi als einziger Spieler mehr als 300 Erstligatore "
        "erzielt und ist damit Rekordtorschütze der Primera División. "
    )
    self.assertEqual(detect_language(sample, self.languages), 'German')
def test_detect_language_english(self):
    # English text checked against self.languages must be reported as
    # 'English'.
    sample = (
        " Four score and seven years ago our fathers brought forth "
        "on this continent, a new nation, conceived in Liberty, and dedicated "
        "to the proposition that all men are created equal. Now we are "
        "engaged in a great civil war, testing whether that nation, or any "
        "nation so conceived and so dedicated, can long endure. We are met on "
        "a great battle-field of that war. We have come to dedicate a portion "
        "of that field, as a final resting place for those who here gave "
        "their lives that that nation might live. "
    )
    self.assertEqual(detect_language(sample, self.languages), 'English')
def test_detect_language_spanish(self):
    # Spanish text checked against self.languages must be reported as
    # 'Spanish'.
    sample = (
        " Lionel Andrés Messi Cuccittini (Rosario, 24 de junio de 1987), "
        "conocido como Leo Messi, es un futbolista argentino11 que juega como "
        "delantero en el Fútbol Club Barcelona y en la selección argentina, "
        "de la que es capitán. Considerado con frecuencia el mejor jugador "
        "del mundo y calificado en el ámbito deportivo como el más grande de "
        "todos los tiempos, Messi es el único futbolista en la historia que "
        "ha ganado cinco veces el FIFA Balón de Oro –cuatro de ellos en forma "
        "consecutiva– y el primero en recibir tres Botas de Oro. "
    )
    self.assertEqual(detect_language(sample, self.languages), 'Spanish')
def test_detect_language_spanish(self):
    # Spanish text checked against self.languages must be reported as
    # 'Spanish'.
    sample = (
        " Lionel Andrés Messi Cuccittini (Rosario, 24 de junio de 1987), "
        "conocido como Leo Messi, es un futbolista argentino11 que juega como "
        "delantero en el Fútbol Club Barcelona y en la selección argentina, "
        "de la que es capitán. Considerado con frecuencia el mejor jugador "
        "del mundo y calificado en el ámbito deportivo como el más grande de "
        "todos los tiempos, Messi es el único futbolista en la historia que "
        "ha ganado cinco veces el FIFA Balón de Oro –cuatro de ellos en forma "
        "consecutiva– y el primero en recibir tres Botas de Oro. "
    )
    self.assertEqual(detect_language(sample, self.languages), 'Spanish')
def test_detect_language_english(self):
    # NOTE: You will first need to define a new "English" language
    # in the languages.py module.
    #
    # English text checked against the module-level LANGUAGES must be
    # reported as 'English'.
    sample = (
        " # english Lionel Andrés 'Leo' Messi is an Argentine professional "
        "footballer who plays as a forward for Spanish club FC Barcelona and "
        "the Argentina national team. Often considered the best player in the "
        "world and rated by many in the sport as the greatest of all time, "
        "Messi is the only football player in history to win five FIFA "
        "Ballons, four of which he won consecutively, and the first player to "
        "win three European Golden Shoes. "
    )
    self.assertEqual(detect_language(sample, LANGUAGES), 'English')
def test_detect_language_english(self):
    # NOTE: You will first need to define a new "English" language
    # in the languages.py module.
    #
    # English text checked against the module-level LANGUAGES must be
    # reported as 'English'.
    sample = (
        " # english Lionel Andrés 'Leo' Messi is an Argentine professional "
        "footballer who plays as a forward for Spanish club FC Barcelona and "
        "the Argentina national team. Often considered the best player in the "
        "world and rated by many in the sport as the greatest of all time, "
        "Messi is the only football player in history to win five FIFA "
        "Ballons, four of which he won consecutively, and the first player to "
        "win three European Golden Shoes. "
    )
    self.assertEqual(detect_language(sample, LANGUAGES), 'English')
def test_detect_language_mixed_languages(self):
    # A text combining a Spanish and a German passage must be reported as
    # 'Spanish' when checked against self.languages.
    sample = (
        " # spanish Lionel Andrés Messi Cuccittini (Rosario, 24 de junio de "
        "1987), conocido como Leo Messi, es un futbolista argentino11 que "
        "juega como delantero en el Fútbol Club Barcelona y en la selección "
        "argentina, de la que es capitán. "
        "# german Messi spielt seit seinem 14. Lebensjahr für den FC "
        "Barcelona. Mit 24 Jahren wurde er Rekordtorschütze des FC Barcelona, "
        "mit 25 der jüngste Spieler in der La-Liga-Geschichte, der 200 Tore "
        "erzielte. "
    )
    self.assertEqual(detect_language(sample, self.languages), 'Spanish')
def test_detect_language_mixed_languages(self):
    # A text combining a Spanish and a German passage must be reported as
    # 'Spanish' when checked against self.languages.
    sample = (
        " # spanish Lionel Andrés Messi Cuccittini (Rosario, 24 de junio de "
        "1987), conocido como Leo Messi, es un futbolista argentino11 que "
        "juega como delantero en el Fútbol Club Barcelona y en la selección "
        "argentina, de la que es capitán. "
        "# german Messi spielt seit seinem 14. Lebensjahr für den FC "
        "Barcelona. Mit 24 Jahren wurde er Rekordtorschütze des FC Barcelona, "
        "mit 25 der jüngste Spieler in der La-Liga-Geschichte, der 200 Tore "
        "erzielte. "
    )
    self.assertEqual(detect_language(sample, self.languages), 'Spanish')
def update_languages(df: pd.DataFrame) -> pd.DataFrame:
    """
    Fill the "lang" column of the DF from each row's title.

    For every row the first two characters of ``detect_language(title)``
    are stored; if anything goes wrong for a row, ``None`` is stored for
    that row instead.

    Args:
        df (pd.DataFrame): The dataframe, with a column "title".

    Returns:
        pd.DataFrame: The same DF object, with the column "lang" set.
    """

    def _language_prefix(row):
        # Best effort: any failure for a single row yields None.
        try:
            return detect_language(row.title)[0:2]
        except Exception:
            return None

    df['lang'] = [_language_prefix(row) for _, row in df.iterrows()]
    return df
def get_name_fields(layer, fields, field_types, min_completeness=0.8, min_uniqueness=0.8):
    """Return the fields of ``layer`` that look like columns of names.

    Values are read with ``layer.get_fields(field)`` and text values are
    lower-cased. A field is kept when all of the following hold: its
    lower-cased name does not contain "pc", its type is "OFTString", the
    share of truthy values exceeds ``min_completeness``, the share of
    distinct values exceeds ``min_uniqueness``, it has more than 10
    distinct values, ``isGibberish`` rejects the value set and
    ``isNameList`` accepts it.

    Args:
        layer: object exposing ``get_fields(field)``.
        fields: field names; ``field_types[i]`` is the type of ``fields[i]``.
        field_types: type label per field.
        min_completeness: exclusive lower bound on the truthy-value ratio.
        min_uniqueness: exclusive lower bound on the distinct-value ratio.

    Returns:
        A list of dicts with keys "name", "uniqueness" and "language",
        sorted by descending uniqueness.
    """
    # Single-argument print(...) calls behave the same as the Python 2
    # print statement and are also valid on Python 3.
    print("starting get_name_field with:")
    print(fields)
    print(field_types)

    name_fields = []
    for i, field in enumerate(fields):
        field_lower = field.lower()
        field_type = field_types[i]
        # `unicode` is the Python 2 text type; it is only evaluated for
        # values that are not `str`.
        values = [
            value.lower() if isinstance(value, str) or isinstance(value, unicode) else value
            for value in layer.get_fields(field)
        ]

        # The ratios below divide by the number of values; a field without
        # any values can never qualify, so skip it instead of dividing by 0.
        number_of_values = len(values)
        if number_of_values == 0:
            continue

        values_set = set(values)
        number_of_unique_values = len(values_set)
        number_of_filled_values = len([value for value in values if value])
        completeness = float(number_of_filled_values) / float(number_of_values)
        uniqueness = float(number_of_unique_values) / float(number_of_values)

        if ("pc" not in field_lower
                and field_type == "OFTString"
                and completeness > min_completeness
                and uniqueness > min_uniqueness
                and not isGibberish(values_set)
                and number_of_unique_values > 10
                and isNameList(values_set)):
            language = detect_language(values)
            name_fields.append({"name": field, "uniqueness": uniqueness, "language": language})

    # sort namefields by uniqueness, most unique first
    name_fields = sorted(name_fields, key=lambda namefield: -1 * namefield["uniqueness"])
    print("name_fields are %s" % (name_fields,))
    return name_fields
def get_repo_info(full_name):
    """Build an item record for the repository named ``full_name``.

    Labels are the repository topics plus the lower-cased first entry of
    ``repo.get_languages()`` when it is not already a topic. Categories
    hold "language:zh" or "language:en" when the README is detected as
    Mandarin or English respectively, and are empty otherwise.

    Returns a dict with the keys "ItemId", "Timestamp", "Labels",
    "Categories" and "Comment".
    """
    repo = github_client.get_repo(full_name)

    # Fetch labels.
    labels = list(repo.get_topics())
    language_stats = list(repo.get_languages().items())
    if language_stats:
        main_language = language_stats[0][0].lower()
        if main_language not in labels:
            labels.append(main_language)

    # Fetch categories.
    readme_text = repo.get_readme().decoded_content.decode("utf-8")
    spoken_language = detect_language(readme_text)
    category_tags = (("Mandarin", "language:zh"), ("English", "language:en"))
    categories = [tag for name, tag in category_tags if spoken_language == name]

    item_id = full_name.replace("/", ":").lower()
    timestamp = str(repo.updated_at)
    return {
        "ItemId": item_id,
        "Timestamp": timestamp,
        "Labels": labels,
        "Categories": categories,
        "Comment": repo.description,
    }
def test_detect_language_english_with_module_language_specification(self):
    # The "english" sample, checked against the module-level LANGUAGES,
    # must be detected as English (compared case-insensitively).
    detected = detect_language(self.texts["english"], LANGUAGES).lower()
    self.assertEqual(detected, 'english')
#coding=utf-8
from language_detector import detect_language

# Detect the language of a single CJK character and print the result.
text = u"中"
language = detect_language(text)
# print(...) with one argument behaves the same on Python 2 and is valid
# on Python 3, where the bare print statement is a syntax error.
print(language)
def test_detect_language_hindi_with_module_language_specification(self):
    # The "hindi" sample, checked against the module-level LANGUAGES,
    # must be detected as Hindi (compared case-insensitively).
    detected = detect_language(self.texts["hindi"], LANGUAGES).lower()
    self.assertEqual(detected, 'hindi')
def test_detect_language_mixed_with_our_language_specification(self):
    # The "mostly-spanish" sample, checked against self.languages, must
    # be detected as Spanish (compared case-insensitively).
    detected = detect_language(self.texts["mostly-spanish"], self.languages).lower()
    self.assertEqual(detected, 'spanish')
def test_detect_language_mixed_with_module_language_specification(self):
    # The "mostly-spanish" sample, checked against the module-level
    # LANGUAGES, must be detected as Spanish (compared case-insensitively).
    detected = detect_language(self.texts["mostly-spanish"], LANGUAGES).lower()
    self.assertEqual(detected, 'spanish')
def main(file, userselection, n_pois, output, desc):
    """Run the review pipeline: clean descriptions, fit LDA topics, rank results.

    Args:
        file: CSV path read with ``pd.read_csv``; rows are de-duplicated on
            the 'Tripadvisor' column and a 'description' column is cleaned.
        userselection: passed to ``user_selection`` and, as ``relevant``,
            to ``best_results``.
        n_pois: passed through to ``best_results``.
        output: unless equal to "default", the results are also written to
            "<output>_<selection>.csv".
        desc: '|'-separated CSV path; merged into the results on
            'Tripadvisor' == 'name' (inner join).

    Returns:
        The merged results frame.
    """
    df_reviews = pd.read_csv(file).drop_duplicates(subset=['Tripadvisor'], keep='first')
    # Lower-case all descriptions
    df_reviews['description'] = df_reviews['description'].str.lower()
    # Remove every run of digits
    df_reviews['description'] = df_reviews['description'].apply(
        lambda x: re.sub(r'\d+', '', x))
    # Convert HTML character references (e.g. &amp;) back to characters
    df_reviews['description'] = df_reviews['description'].apply(
        lambda x: (html.unescape(x)))
    # Remove punctuation
    df_reviews['description'] = df_reviews['description'].apply(
        lambda x: x.translate(str.maketrans('', '', string.punctuation)))
    # Remove accents
    df_reviews['description'] = df_reviews['description'].apply(
        lambda x: unidecode.unidecode(x))
    # Remove specific characters and words
    df_reviews['description'] = df_reviews['description'].apply(
        lambda x: re.sub("description", '', x))
    df_reviews['description'] = df_reviews['description'].apply(
        lambda x: re.sub("wikipedia", '', x))
    df_reviews['description'] = df_reviews['description'].apply(
        lambda x: re.sub("'s", '', x))
    # Tokenize; keep lemmatized tokens that are not English stop words and
    # that are individually detected as English
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    df_reviews['tokens'] = df_reviews['description'].apply(lambda x: [
        lemmatizer.lemmatize(word) for word in word_tokenize(x)
        if not word in stop_words and detect_language(word) == 'English'
    ])
    # Get corpus and vectorize it into word counts
    corpus = get_corpus(df_reviews)
    seg_list = split_to_words(corpus)
    vectorizer_model = CountVectorizer(stop_words=stop_words,
                                       analyzer='word',
                                       max_features=2000)
    vec_docs = vectorizer_model.fit_transform(seg_list)
    tf_feature_names = vectorizer_model.get_feature_names()
    no_topics = 10
    no_top_words = 5
    lda = LatentDirichletAllocation(n_components=no_topics,
                                    max_iter=5,
                                    learning_method='online',
                                    learning_offset=40.,
                                    random_state=0).fit(vec_docs)
    # NOTE(review): display_topics is called twice and only the second
    # result is used; the model is also fitted again by fit_transform
    # below - confirm whether both are needed.
    display_topics(lda, tf_feature_names, no_top_words)
    lda_results = lda.fit_transform(vec_docs)
    df_reviews = get_topics(lda_results, df_reviews)
    topic_dict = display_topics(lda, tf_feature_names, no_top_words)
    # Reshape the topic dict to long form ('variable', 'value') so it can
    # be joined on the 'topics' column
    h = pd.DataFrame.from_dict(topic_dict, orient='index').transpose().melt()
    df_reviews = df_reviews.merge(h,
                                  left_on='topics',
                                  right_on='variable',
                                  how='left')
    df_reviews = df_reviews.drop(columns=['topics', 'variable', 'tokens'])
    # One-hot encode the 'value' column as keyword_* columns, then sum them
    # per group of identical non-keyword columns
    df_reviews = pd.get_dummies(df_reviews,
                                prefix=['keyword'],
                                columns=['value']).drop_duplicates()
    cols = [col for col in df_reviews.columns if 'keyword' not in col]
    df_reviews = df_reviews.groupby(cols).sum().reset_index()
    df_reviews = review_rate(df_reviews)
    selection = user_selection(userselection)
    results = best_results(dict_user(df_reviews, selection),
                           df_reviews,
                           n_pois,
                           relevant=userselection)
    # Persist the results unless the caller asked for the "default" output
    if output != "default":
        results.to_csv("{output}_{sufix}.csv".format(output=output,
                                                     sufix=selection),
                       index=False)
    desc_ = pd.read_csv(desc, sep="|")
    results = results.merge(desc_,
                            left_on='Tripadvisor',
                            right_on='name',
                            how='inner')
    return results
def test_detect_language_ambiguous(self):
    # For this sentence the detector is expected to return the set of
    # both candidate languages rather than a single name.
    sample = " A giant dog chased seven cats. "
    expected = {'Spanish', 'English'}
    self.assertEqual(detect_language(sample, self.languages), expected)
def is_sent_english(sent):
    """Return True when ``ld.detect_language`` labels the sentence 'english'."""
    # NOTE(review): the comparison is case-sensitive against lowercase
    # 'english' - confirm the label casing returned by ``ld``.
    detected = ld.detect_language(sent)
    return detected == 'english'
def test_detect_language_english_with_module_language_specification(self):
    # The "english" sample, checked against the module-level LANGUAGES,
    # must be detected as English (compared case-insensitively).
    detected = detect_language(self.texts["english"], LANGUAGES).lower()
    self.assertEqual(detected, 'english')
def test_detect_language_german_with_module_language_specification(self):
    # The "german" sample, checked against the module-level LANGUAGES,
    # must be detected as German (compared case-insensitively).
    detected = detect_language(self.texts["german"], LANGUAGES).lower()
    self.assertEqual(detected, 'german')
def test_detect_language_german_with_our_language_specification(self):
    # The "german" sample, checked against self.languages, must be
    # detected as German (compared case-insensitively).
    detected = detect_language(self.texts["german"], self.languages).lower()
    self.assertEqual(detected, 'german')
def test_detect_language_spanish_with_our_language_specification(self):
    # The "spanish" sample, checked against self.languages, must be
    # detected as Spanish (compared case-insensitively).
    detected = detect_language(self.texts["spanish"], self.languages).lower()
    self.assertEqual(detected, 'spanish')
def test_detect_language_english_with_our_language_specification(self):
    # The "english" sample, checked against self.languages, must be
    # detected as English (compared case-insensitively).
    detected = detect_language(self.texts["english"], self.languages).lower()
    self.assertEqual(detected, 'english')
def test_detect_language_german_with_our_language_specification(self):
    # The "german" sample, checked against self.languages, must be
    # detected as German (compared case-insensitively).
    detected = detect_language(self.texts["german"], self.languages).lower()
    self.assertEqual(detected, 'german')
def test_detect_language_german_with_module_language_specification(self):
    # The "german" sample, checked against the module-level LANGUAGES,
    # must be detected as German (compared case-insensitively).
    detected = detect_language(self.texts["german"], LANGUAGES).lower()
    self.assertEqual(detected, 'german')
def is_line_in_english(line):
    """Tell whether a line is in English.

    - Input: line - string.
    - Return: True if ``ld.detect_language`` labels the line 'english',
      False otherwise.
    """
    # NOTE(review): the comparison is case-sensitive against lowercase
    # 'english' - confirm the label casing returned by ``ld``.
    detected = ld.detect_language(line)
    return detected == 'english'