def lang_features(story_sentences: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    lang_list = []
    wtl = WhatTheLang()
    for sent_dict in story_sentences:
        text = sent_dict["text"]
        try:
            lang = wtl.predict_lang(text)
            if not isinstance(lang, str):
                lang = "UKN"
        except:
            lang = "UKN"
        try:
            if len(text) <= 10:
                is_nonsense = False
            else:
                is_nonsense = nonsense(text)
        except:
            is_nonsense = True
        is_eng = isAscii(text)
        lang_dict = dict(sentence_id=sent_dict["id"],
                         lang=lang,
                         nonsense=is_nonsense,
                         ascii_chars=is_eng)
        lang_list.append(lang_dict)
    return lang_list
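# A minimal usage sketch for lang_features above. Assumptions: `nonsense` and
# `isAscii` are project helpers not shown here, so the stubs below are purely
# illustrative stand-ins; the two sentences are invented sample data; the
# imports used by lang_features (typing, whatthelang) are already in scope.
def isAscii(s):
    # naive stand-in: True when every character is plain ASCII
    return all(ord(ch) < 128 for ch in s)

def nonsense(s):
    # placeholder: the real project supplies its own gibberish detector
    return False

example_sentences = [
    {"id": 1, "text": "The quick brown fox jumps over the lazy dog."},
    {"id": 2, "text": "Der schnelle braune Fuchs springt über den faulen Hund."},
]
# Each result dict carries sentence_id, lang, nonsense and ascii_chars.
print(lang_features(example_sentences))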
def translate(text, target="en", src=False):
    wtl = WhatTheLang()
    if (wtl.predict_lang(text) != target):
        info("translating...")
        translator = Translator()
        translated = translator.translate(text, dest=target)
        text = translated.text
        if (src):
            return text, translated.src
    return text
def whatlang(text, label):
    try:
        wtl = WhatTheLang()
        result = wtl.predict_lang(text)
        if result == label:
            return True
    except Exception as e:
        print(e)
        pass
    return False
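# A small hedged usage sketch for the whatlang helper above; the sample
# sentences and expected ISO codes are invented for illustration, and
# whatthelang is assumed to be installed and importable.
samples = [
    ("This is clearly an English sentence.", "en"),
    ("Ceci est une phrase écrite en français.", "fr"),
]
for sample_text, expected_code in samples:
    # Prints True when the prediction matches the expected label, else False.
    print(expected_code, whatlang(sample_text, expected_code))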
def translate(self, text=None, target="en", src=False): if (text == None): text = self.text wtl = WhatTheLang() if (wtl.predict_lang(text) != target): self.info("translating...") translator = Translator() translated = translator.translate(text, dest=target) self.translated = translated.text if (src): self.lang = translated.src else: self.translated = text
def translate(self, text=None, target="de", src=False): global translation_engines global translation_tokenizers if (text == None): text = self.text wtl = WhatTheLang() try: langcode = wtl.predict_lang(text) if (langcode != target): try: mname = f'Helsinki-NLP/opus-mt-{langcode}-{target}' # model name model = None tok = None if (mname in translation_engines.keys()): model = translation_engines[mname] tok = translation_tokenizers[mname] else: model = MarianMTModel.from_pretrained(mname) tok = MarianSentencePieceTokenizer.from_pretrained( mname) translation_engines[mname] = model translation_tokenizers[mname] = tok batch = tok.prepare_translation_batch( src_texts=[text]) # don't need tgt_text for inference gen = model.generate( **batch) # for forward pass: model(**batch) words: List[str] = tok.decode_batch( gen, skip_special_tokens=True) self.translated = words[0] if (src): self.lang = langcode self.info("translated local...") except Exception as e: print(e) self.info("translating...") translator = Translator() translated = translator.translate(text, dest=target) self.translated = translated.text if (src): self.lang = translated.src else: self.translated = text except: self.translated = text
def create_training_samples_sents():
    wtl = WhatTheLang()
    data = get_data_as_df()
    label_map = get_label_map(data)
    ready_data = []
    for author, text, lang in tqdm(
            list(zip(data["author"], data["text"], data["lang"]))):
        # Get correct language, if inconsistent correct by hand
        #pred_lang = wtl.predict_lang(text)
        if lang not in ["de", "fr", "en", "it", "da"]:
            continue
        ready_data.append(text)
        """
        # Get correct tokenizer and embedder
        sentences = sent_tokenizer(text, lang)
        ready_data.append({
            "response": label_map[author],
            "document": document
        })
        """
    """
    s = 0
    cv = CountVectorizer()
    word_count_vector = cv.fit_transform(ready_data)
    print(word_count_vector.shape)
    print(get_label_map(data))"""
    return ready_data
def generate_lang_id_report(ref, outs, model="wtl", min_length=5,
                            print_lines=False, print_line_numbers=False):
    if model == "wtl":
        wtl = WhatTheLang()
    lang_id_reports = []
    lang_id_lines_reports = []
    lang_id_line_numbers_reports = []
    for out in outs:
        langs = defaultdict(int)
        lang_lines = defaultdict(list)
        lang_line_numbers = defaultdict(list)
        for i, sentence in enumerate(out, start=1):
            line = corpus_utils.list2str(sentence)
            if len(sentence) >= int(min_length):
                if model == "langid":
                    (lang, prob) = langid.classify(line)
                elif model == "wtl":
                    lang = wtl.predict_lang(line)
                else:
                    raise NotImplementedError(
                        f"Unknown model for language identification: '{model}'.")
                langs[lang] += 1
                if print_line_numbers:
                    lang_line_numbers[lang].append(i)
                if print_lines:
                    lang_lines[lang].append(line)
            else:
                langs["shorter than min_length"] += 1
                if print_line_numbers:
                    lang_line_numbers["shorter than min_length"].append(i)
                if print_lines:
                    lang_lines["shorter than min_length"].append(line)
        lang_id_reports.append(langs)
        lang_id_lines_reports.append(lang_lines)
        lang_id_line_numbers_reports.append(lang_line_numbers)
    reporter = reporters.LangIDreport(model, lang_id_reports,
                                      lang_id_lines_reports,
                                      lang_id_line_numbers_reports,
                                      print_lines, print_line_numbers)
    reporter.generate_report()
    return reporter
def main(
    dump_file: Path,
    output_file,
    corpus_file: Path,
    limit=None,
    sample_size=None,
    random_state=None,
):
    logger.info(f'Reading file {dump_file}')
    df_github = pd.read_csv(dump_file, nrows=limit)
    if sample_size is not None:
        logger.info(f'Selecting {sample_size} random samples...')
        df_github = df_github.sample(sample_size)
    wtl = WhatTheLang()
    bodies = df_github['body'].tolist()
    batch_size = 512
    list_df = [
        bodies[i:i + batch_size] for i in range(0, len(bodies), batch_size)
    ]
    langs = []
    logger.info('Recognizing language...')
    for df in tqdm(list_df):
        langs += wtl.predict_lang(df)
    df_github['lang'] = langs
    df_english = df_github[df_github['lang'] == 'en']
    df_english['body'] = df_english['body'].swifter.apply(clean_text)
    df_lm = pd.DataFrame({'issue_body': df_english['body']})
    logger.info(f'Saving {len(df_lm)} rows')
    df_lm.to_json(output_file, orient='records', lines=True)
    corpus_file.write_text('\n'.join(df_english['body']),
                           encoding='utf8',
                           errors='strict')
def load_links(self):
    global loaded_df

    if loaded_df is not None:
        df = loaded_df
    else:
        try:
            df = pd.read_csv("train_data/links.csv")
        except FileNotFoundError:
            print("Downloading links data...")
            df = pd.read_json("https://api.fakenewsdetector.org/links/all")
            df.to_csv("train_data/links.csv")

        df.dropna(subset=["title", "content"], inplace=True, how="all")
        df["category_id"] = df['verified_category_id'].fillna(
            df['category_id'])
        df["clickbait_title"] = df['verified_clickbait_title'].fillna(
            df['clickbait_title'])
        df = df.fillna('')

        # Limiting
        df = df[0:5000]

        df["title_content"] = self.join_text_and_content(df)

        print("Detecting language and limiting links...")
        wtl = WhatTheLang()
        df["lang"] = [
            wtl.predict_lang(text[0:50]) for text in df["title_content"]
        ]
        df = df[df["lang"] == 'en'][0:500].append(
            df[df["lang"] == 'es'][0:500]).append(
                df[df["lang"] == 'pt'][0:500])
        print(df[["title", "lang"]].groupby(['lang']).agg(['count']).T)

        loaded_df = df

    df = df.loc[self.filter]
    df = df.copy()
    return df
def filter_non_english_abstracts(papers):
    """
    Remove abstracts that are not written in English

    :param papers: DataFrame with papers
    """
    wtl = WhatTheLang()
    abstracts = papers["abstract"].apply(lambda text: re.sub(r"[\r\n]+", " ", text))
    wtl_lang = abstracts.apply(wtl.predict_lang)
    langid_lang = abstracts.apply(lambda a: langid.classify(a)[0])
    filtered_papers = papers[(wtl_lang == "en") & (langid_lang == "en")]
    logging.info(f"Removed {len(papers) - len(filtered_papers)} papers "
                 f"with non-English abstracts")
    return filtered_papers
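# A hedged usage sketch for filter_non_english_abstracts above. The two
# abstracts are invented sample data; pandas, langid, whatthelang, re and
# logging are assumed to be imported as the function requires.
import pandas as pd

sample_papers = pd.DataFrame({
    "title": ["Paper A", "Paper B"],
    "abstract": [
        "We study language identification for short scientific abstracts.",
        "Wir untersuchen Spracherkennung für kurze wissenschaftliche Texte.",
    ],
})
# Only rows that both detectors agree are English survive the filter.
english_papers = filter_non_english_abstracts(sample_papers)
print(len(english_papers), "English abstracts kept")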
def create_training_samples():
    wtl = WhatTheLang()
    data = get_data_as_df()
    label_map = get_label_map(data)
    tokenizers = get_tokenizers()
    word_embeddings = get_word_embeddings()
    ready_data = []
    for author, text, lang in tqdm(
            list(zip(data["author"], data["text"], data["lang"]))):
        # Get correct language, if inconsistent correct by hand
        #pred_lang = wtl.predict_lang(text)
        if lang not in ["de", "fr", "en", "it", "da"]:
            continue
        # Get correct tokenizer and embedder
        tokenizer = tokenizers[lang]
        embedder = word_embeddings[lang]
        # Vectorize each word
        tokenized_words = apply_transforms(
            [str(word) for word in tokenizer(text)])
        document = None
        good, bad = 0, 0
        for word in tokenized_words:
            if word in embedder:
                embedding = torch.tensor(embedder[word]).reshape(1, 300)
                good += 1
                """
                else:
                    # this is an unknown word
                    embedding = torch.ones(1, 300)
                    bad += 1
                """
                # use identity checks so tensors are never compared to None with ==
                if document is not None:
                    document = torch.cat([document, embedding], 0)
                else:
                    document = embedding
        # cut off all docs > 100:
        if document is None:
            continue
        # pad with zeros
        #print(f"Good-Ratio: {good/(good+bad)}")
        ready_data.append({
            "response": label_map[author],
            "document": document
        })
    return ready_data
#
# - ## Importing libraries and loading the file

# In[1]:

import time
from random import randint
import json, pandas
from kafka import SimpleProducer, KafkaClient
from kafka import KafkaProducer
import treetaggerwrapper

tagger = treetaggerwrapper.TreeTagger(TAGLANG='fr')

from whatthelang import WhatTheLang
wtl = WhatTheLang()

producer = KafkaProducer(
    bootstrap_servers='localhost:9092',
    value_serializer=lambda v: json.dumps(v).encode('utf-8'),
    linger_ms=10)

# Maximum wait time set to 10 seconds
ATTENTE_MAX = 1

fichier = open('Data/new_comment_2018.json', 'rb')
#fichier = open('Data/comment_2018_echantillon.json', 'rb')
lignes = fichier.readlines()


def comment_to_lemme(comment):
def detect(text):
    wtl = WhatTheLang()
    return wtl.predict_lang(text)
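# A minimal usage sketch for detect above; the sample string is invented.
# Note the design cost: a fresh WhatTheLang instance is built on every call,
# so a long-lived instance may be preferable for large batches.
print(detect("El rápido zorro marrón salta sobre el perro perezoso."))  # likely 'es'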
def detect(self):
    wtl = WhatTheLang()
    self.lang = wtl.predict_lang(self.text)
from whatthelang import WhatTheLang

wtl = WhatTheLang()
print(
    wtl.predict_lang([
        "അമ്മ", "पिता", "teacher", "അമ്മ", "पिता", "teacher", "അമ്മ", "पिता",
        "teacher", "അമ്മ", "पिता", "teacher"
    ]))
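# For comparison, predict_lang also accepts a single string and returns one
# language code rather than a list (the sample sentence is invented).
print(wtl.predict_lang("This sentence is written in English."))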
def load_df(stop_words, spell):
    """Return pandas data frame with normalized and filtered reviews"""
    filename = "data/df_filtered.jbl"
    if os.path.exists(filename):
        df = joblib.load(filename)
        return df

    wtl = WhatTheLang()
    stemmer = RSLPStemmer()

    # Raw data
    logging.info("Loading data from CSV file")
    df = pd.read_csv("data/data.csv").drop_duplicates()
    logging.info("Total rows: %d", len(df))
    df["text"] = df.apply(lambda row: f"{row.title} {row.text}", axis=1)
    del df["title"]

    logging.info("Running LEIA")
    s = SentimentIntensityAnalyzer()
    df["leia"] = df_parallel_apply(
        df, lambda row: leia_sentiment(row.text, analyzer=s))

    # Normalize text and remove stop words
    logging.info("Normalize text and remove stop words")
    df["text"] = df_parallel_apply(df, lambda row: normalize_text(row.text))
    df["text"] = df_parallel_apply(df,
                                   lambda row: spell_check(row.text, spell),
                                   series=128)
    df["text"] = df_parallel_apply(
        df, lambda row: remove_words(row.text, stop_words))
    common_stop_words = {
        k: v
        for k, v in sorted(REMOVED_STOP_WORDS.items(),
                           key=lambda item: -item[1])
    }
    logging.info("Common stop words: %s",
                 list(itertools.islice(common_stop_words, 50)))

    # Filter text with at least 10 characters and 5 words
    logging.info("Filter text with at least 10 characters and 5 words")
    df = df[(df["text"].str.len() > 10) & (df["text"].str.count(" ") >= 4)]
    logging.info("Total rows: %d", len(df))

    # Filter for portuguese text only
    logging.info("Filter for portuguese text only")
    df["language"] = df_parallel_apply(df,
                                       lambda row: language(row.text, wtl),
                                       backend="threading")
    logging.info(df[df["language"] == "en"].head())
    logging.info(df[df["language"] == "pt"].head())
    df = df[df["language"] == "pt"]
    logging.info("Total rows: %d", len(df))

    # Add sentiment and length columns
    df["sentiment"] = df_parallel_apply(
        df, lambda row: sentiment(row.rating, row.source))
    leia_accuracy = accuracy_score(df["leia"].values, df["sentiment"].values)
    logging.info("Leia accuracy: %.2f%%", leia_accuracy * 100)

    wordcloud_plot(df, "before")

    # Text Stemming
    logging.info("Stemming text")
    df["text"] = df_parallel_apply(
        df, lambda row: stemmed_text(row.text, stemmer=stemmer))

    # Filter text with at least 10 characters and 5 words
    logging.info("Filter text with at least 10 characters and 5 words")
    df = df[(df["text"].str.len() > 10) & (df["text"].str.count(" ") >= 4)]
    logging.info("Total rows: %d", len(df))

    wordcloud_plot(df, "after")

    df['length'] = df.apply(lambda row: len(row.text), axis=1)
    data_plots(df)

    joblib.dump(df, filename)
    logging.info("Saved '%s'", filename)
    return df