def lang_features(story_sentences: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    lang_list = []

    wtl = WhatTheLang()

    for sent_dict in story_sentences:

        text = sent_dict["text"]

        # Fall back to "UKN" when the language cannot be predicted.
        try:
            lang = wtl.predict_lang(text)

            if not isinstance(lang, str):
                lang = "UKN"

        except Exception:
            lang = "UKN"

        # Very short strings cannot be judged reliably, so skip the nonsense check for them.
        try:
            if len(text) <= 10:
                is_nonsense = False
            else:
                is_nonsense = nonsense(text)
        except Exception:
            is_nonsense = True

        is_eng = isAscii(text)

        lang_dict = dict(sentence_id=sent_dict["id"], lang=lang, nonsense=is_nonsense, ascii_chars=is_eng)

        lang_list.append(lang_dict)

    return lang_list
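# Illustrative usage sketch (the sample sentences below are made up); each input
# dict only needs an "id" and a "text" key:
sample_sentences = [
    {"id": 1, "text": "The quick brown fox jumps over the lazy dog."},
    {"id": 2, "text": "qwrtzpsdfg hjklyxcvbnm"},
]
for row in lang_features(sample_sentences):
    print(row)  # e.g. {'sentence_id': 1, 'lang': 'en', 'nonsense': False, 'ascii_chars': True}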
def translate(text, target="en", src=False):
    """Translate text into the target language; optionally also return the source language."""
    wtl = WhatTheLang()
    detected = wtl.predict_lang(text)
    if detected != target:
        info("translating...")
        translator = Translator()
        translated = translator.translate(text, dest=target)
        text = translated.text
        detected = translated.src
    if src:
        return text, detected
    return text
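# Illustrative calls (googletrans' Translator and an info() logger are assumed to be
# imported elsewhere, as in the snippet above):
english_text = translate("Bonjour tout le monde", target="en")
english_text, source_lang = translate("Bonjour tout le monde", target="en", src=True)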
def whatlang(text, label):
    # Return True when the predicted language matches the expected label.
    try:
        wtl = WhatTheLang()
        result = wtl.predict_lang(text)
        if result == label:
            return True
    except Exception as e:
        print(e)
    return False
    def translate(self, text=None, target="en", src=False):
        if text is None:
            text = self.text
        wtl = WhatTheLang()
        if wtl.predict_lang(text) != target:
            self.info("translating...")
            translator = Translator()
            translated = translator.translate(text, dest=target)
            self.translated = translated.text
            if src:
                self.lang = translated.src
        else:
            self.translated = text
    def translate(self, text=None, target="de", src=False):
        global translation_engines
        global translation_tokenizers
        if text is None:
            text = self.text
        wtl = WhatTheLang()
        try:
            langcode = wtl.predict_lang(text)
            if (langcode != target):
                try:
                    mname = f'Helsinki-NLP/opus-mt-{langcode}-{target}'  # model name
                    model = None
                    tok = None

                    if (mname in translation_engines.keys()):
                        model = translation_engines[mname]
                        tok = translation_tokenizers[mname]
                    else:
                        model = MarianMTModel.from_pretrained(mname)
                        tok = MarianSentencePieceTokenizer.from_pretrained(
                            mname)

                        translation_engines[mname] = model
                        translation_tokenizers[mname] = tok

                    batch = tok.prepare_translation_batch(
                        src_texts=[text])  # don't need tgt_text for inference
                    gen = model.generate(
                        **batch)  # for forward pass: model(**batch)
                    words: List[str] = tok.decode_batch(
                        gen, skip_special_tokens=True)
                    self.translated = words[0]

                    if (src):
                        self.lang = langcode

                    self.info("translated local...")

                except Exception as e:
                    print(e)
                    self.info("translating...")
                    translator = Translator()
                    translated = translator.translate(text, dest=target)
                    self.translated = translated.text
                    if (src):
                        self.lang = translated.src

            else:
                self.translated = text
        except Exception:
            self.translated = text
def create_training_samples_sents():

    wtl = WhatTheLang()
    data = get_data_as_df()
    label_map = get_label_map(data)

    ready_data = []
    for author, text, lang in tqdm(
            list(zip(data["author"], data["text"], data["lang"]))):
        # Get correct language, if inconsistent correct by hand
        #pred_lang = wtl.predict_lang(text)
        if lang not in ["de", "fr", "en", "it", "da"]:
            continue
        ready_data.append(text)
        """
        # Get correct tokenizer and embedder
        sentences = sent_tokenizer(text, lang)

        
        ready_data.append({
            "response": label_map[author],
            "document": document
        })
        """
        """
    s = 0
    cv = CountVectorizer()
    word_count_vector = cv.fit_transform(ready_data)
    print(word_count_vector.shape)
    print(get_label_map(data))"""
    return ready_data
def generate_lang_id_report(ref, outs,
                            model="wtl",
                            min_length=5,
                            print_lines=False,
                            print_line_numbers=False):
    if model=="wtl":
        wtl = WhatTheLang()
    lang_id_reports=[]
    lang_id_lines_reports=[]
    lang_id_line_numbers_reports=[]
    for out in outs:
        langs = defaultdict(int)
        lang_lines = defaultdict(list)
        lang_line_numbers = defaultdict(list)
        for i, sentence in enumerate(out, start=1):
            line = corpus_utils.list2str(sentence)
            if len(sentence) >= int(min_length):
                if model=="langid":
                    (lang, prob) = langid.classify(line)
                elif model=="wtl":
                    lang = wtl.predict_lang(line)
                else:
                    raise NotImplementedError(f"Unknown model for language identification: '{model}'.")
                langs[lang] +=1
                if print_line_numbers:
                    lang_line_numbers[lang].append(i)
                if print_lines:    
                    lang_lines[lang].append(line)
            else:
                langs["shorter than min_length"] +=1
                if print_line_numbers:
                    lang_line_numbers["shorter than min_length"].append(i)
                if print_lines:
                    lang_lines["shorter than min_length"].append(line)
        lang_id_reports.append(langs)  
        lang_id_lines_reports.append(lang_lines)
        lang_id_line_numbers_reports.append(lang_line_numbers)

    reporter = reporters.LangIDreport(model, lang_id_reports, lang_id_lines_reports, lang_id_line_numbers_reports,print_lines,print_line_numbers)
    reporter.generate_report()
    return reporter
def main(
    dump_file: Path,
    output_file,
    corpus_file: Path,
    limit=None,
    sample_size=None,
    random_state=None,
):
    logger.info(f'Reading file {dump_file}')
    df_github = pd.read_csv(dump_file, nrows=limit)

    if sample_size is not None:
        logger.info(f'Selecting {sample_size} random samples...')
        df_github = df_github.sample(sample_size)

    wtl = WhatTheLang()
    bodies = df_github['body'].tolist()
    batch_size = 512
    # Split the issue bodies into batches so predict_lang can run on chunks of 512
    batches = [
        bodies[i:i + batch_size] for i in range(0, len(bodies), batch_size)
    ]
    langs = []

    logger.info('Recognizing language...')
    for batch in tqdm(batches):
        langs += wtl.predict_lang(batch)

    df_github['lang'] = langs

    # Keep only the English issues; copy to avoid mutating a slice of df_github
    df_english = df_github[df_github['lang'] == 'en'].copy()
    df_english['body'] = df_english['body'].swifter.apply(clean_text)

    df_lm = pd.DataFrame({'issue_body': df_english['body']})

    logger.info(f'Saving {len(df_lm)} rows')

    df_lm.to_json(output_file, orient='records', lines=True)
    corpus_file.write_text('\n'.join(df_english['body']),
                           encoding='utf8',
                           errors='strict')
    def load_links(self):
        global loaded_df
        if loaded_df is not None:
            df = loaded_df
        else:
            try:
                df = pd.read_csv("train_data/links.csv")
            except FileNotFoundError:
                print("Downloading links data...")
                df = pd.read_json("https://api.fakenewsdetector.org/links/all")
                df.to_csv("train_data/links.csv")

            df.dropna(subset=["title", "content"], inplace=True, how="all")
            df["category_id"] = df['verified_category_id'].fillna(
                df['category_id'])

            df["clickbait_title"] = df['verified_clickbait_title'].fillna(
                df['clickbait_title'])

            df = df.fillna('')

            # Limiting
            df = df[0:5000]
            df["title_content"] = self.join_text_and_content(df)
            print("Detecting language and limiting links...")
            wtl = WhatTheLang()
            df["lang"] = [
                wtl.predict_lang(text[0:50]) for text in df["title_content"]
            ]
            df = df[df["lang"] == 'en'][0:500].append(
                df[df["lang"] == 'es'][0:500]).append(
                    df[df["lang"] == 'pt'][0:500])
            print(df[["title", "lang"]].groupby(['lang']).agg(['count']).T)

        loaded_df = df
        df = df.loc[self.filter]
        df = df.copy()

        return df
def filter_non_english_abstracts(papers):
    """
    Remove abstracts that are not written in English
    :param papers: DataFrame with papers
    """
    wtl = WhatTheLang()
    abstracts = papers["abstract"].apply(lambda text: re.sub(r"[\r\n]+", " ", text))
    wtl_lang = abstracts.apply(wtl.predict_lang)
    langid_lang = abstracts.apply(lambda a: langid.classify(a)[0])
    filtered_papers = papers[(wtl_lang == "en") & (langid_lang == "en")]
    logging.info(f"Removed {len(papers) - len(filtered_papers)} papers "
                 f"with non english abstracts")
    return filtered_papers
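# Illustrative call with a made-up two-row DataFrame; the second abstract is German,
# so both whatthelang and langid should flag it as non-English:
import pandas as pd
toy_papers = pd.DataFrame({
    "title": ["Paper A", "Paper B"],
    "abstract": ["We study language identification for short abstracts.",
                 "Wir untersuchen die Spracherkennung für kurze Zusammenfassungen."],
})
english_only = filter_non_english_abstracts(toy_papers)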
def create_training_samples():

    wtl = WhatTheLang()
    data = get_data_as_df()
    label_map = get_label_map(data)
    tokenizers = get_tokenizers()
    word_embeddings = get_word_embeddings()

    ready_data = []
    for author, text, lang in tqdm(
            list(zip(data["author"], data["text"], data["lang"]))):
        # Get correct language, if inconsistent correct by hand
        #pred_lang = wtl.predict_lang(text)
        if lang not in ["de", "fr", "en", "it", "da"]:
            continue
        # Get correct tokenizer and embedder
        tokenizer = tokenizers[lang]
        embedder = word_embeddings[lang]
        # Vectorize each word
        tokenized_words = apply_transforms(
            [str(word) for word in tokenizer(text)])
        document = None
        good, bad = 0, 0
        for word in tokenized_words:
            if word in embedder:
                embedding = torch.tensor(embedder[word]).reshape(1, 300)
                good += 1
                """
                else: # this is an unknown word
                    embedding = torch.ones(1, 300)
                    bad += 1
                """
                if document is not None:
                    document = torch.cat([document, embedding], 0)
                else:
                    document = embedding
        # cut off all docs > 100:
        if document is None:
            continue
        # pad with zeros
        #print(f"Good-Ratio: {good/(good+bad)}")
        ready_data.append({
            "response": label_map[author],
            "document": document
        })
    return ready_data

# -  ## Import libraries and load the file

# In[1]:

import time
from random import randint
import json, pandas
from kafka import SimpleProducer, KafkaClient
from kafka import KafkaProducer
import treetaggerwrapper
tagger = treetaggerwrapper.TreeTagger(TAGLANG='fr')

from whatthelang import WhatTheLang
wtl = WhatTheLang()

producer = KafkaProducer(
    bootstrap_servers='localhost:9092',
    value_serializer=lambda v: json.dumps(v).encode('utf-8'),
    linger_ms=10)

# Maximum wait time, in seconds
ATTENTE_MAX = 1

fichier = open('Data/new_comment_2018.json', 'rb')
#fichier = open('Data/comment_2018_echantillon.json', 'rb')
lignes = fichier.readlines()


def comment_to_lemme(comment):
def detect(text):
    wtl = WhatTheLang()
    return wtl.predict_lang(text)
    def detect(self):
        wtl = WhatTheLang()
        self.lang = wtl.predict_lang(self.text)
from whatthelang import WhatTheLang
wtl = WhatTheLang()
print(
    wtl.predict_lang([
        "അമ്മ", "पिता", "teacher", "അമ്മ", "पिता", "teacher", "അമ്മ", "पिता",
        "teacher", "അമ്മ", "पिता", "teacher"
    ]))
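# predict_lang also accepts a single string, in which case it returns a single
# language code instead of a list (illustrative):
print(wtl.predict_lang("teacher"))  # e.g. 'en'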
def load_df(stop_words, spell):
    """Return pandas data frame with normalized and filtered reviews"""

    filename = "data/df_filtered.jbl"

    if os.path.exists(filename):
        df = joblib.load(filename)
        return df

    wtl = WhatTheLang()
    stemmer = RSLPStemmer()

    # Raw data
    logging.info("Loading data from CSV file")
    df = pd.read_csv("data/data.csv").drop_duplicates()
    logging.info("Total rows: %d", len(df))

    df["text"] = df.apply(lambda row: f"{row.title} {row.text}", axis=1)
    del df["title"]

    logging.info("Running LEIA")
    s = SentimentIntensityAnalyzer()
    df["leia"] = df_parallel_apply(
        df, lambda row: leia_sentiment(row.text, analyzer=s))

    # Normalize text and remove stop words
    logging.info("Normalize text and remove stop words")
    df["text"] = df_parallel_apply(df, lambda row: normalize_text(row.text))
    df["text"] = df_parallel_apply(df,
                                   lambda row: spell_check(row.text, spell),
                                   series=128)
    df["text"] = df_parallel_apply(
        df, lambda row: remove_words(row.text, stop_words))

    common_stop_words = {
        k: v
        for k, v in sorted(REMOVED_STOP_WORDS.items(),
                           key=lambda item: -item[1])
    }
    logging.info("Common stop words: %s",
                 list(itertools.islice(common_stop_words, 50)))

    # Filter text with at least 10 characters and 5 words
    logging.info("Filter text with at least 10 characters and 5 words")
    df = df[(df["text"].str.len() > 10) & (df["text"].str.count(" ") >= 4)]
    logging.info("Total rows: %d", len(df))

    # Filter for Portuguese text only
    logging.info("Filter for Portuguese text only")
    df["language"] = df_parallel_apply(df,
                                       lambda row: language(row.text, wtl),
                                       backend="threading")

    logging.info(df[df["language"] == "en"].head())
    logging.info(df[df["language"] == "pt"].head())

    df = df[df["language"] == "pt"]
    logging.info("Total rows: %d", len(df))

    # Add sentiment and length columns
    df["sentiment"] = df_parallel_apply(
        df, lambda row: sentiment(row.rating, row.source))

    leia_accuracy = accuracy_score(df["leia"].values, df["sentiment"].values)
    logging.info("Leia accuracy: %.2f%%", leia_accuracy * 100)

    wordcloud_plot(df, "before")

    # Text Stemming
    logging.info("Stemming text")
    df["text"] = df_parallel_apply(
        df, lambda row: stemmed_text(row.text, stemmer=stemmer))

    # Filter text with at least 10 characters and 5 words
    logging.info("Filter text with at least 10 characters and 5 words")
    df = df[(df["text"].str.len() > 10) & (df["text"].str.count(" ") >= 4)]
    logging.info("Total rows: %d", len(df))

    wordcloud_plot(df, "after")

    df['length'] = df.apply(lambda row: len(row.text), axis=1)

    data_plots(df)

    joblib.dump(df, filename)
    logging.info("Saved '%s'", filename)

    return df