Ejemplo n.º 1
0
def lang_features(story_sentences: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Compute language-detection features for each sentence.

    Args:
        story_sentences: dicts that must contain at least "id" and "text" keys.

    Returns:
        One dict per sentence with keys: ``sentence_id``, ``lang`` (predicted
        language code, or "UKN" when detection fails), ``nonsense`` (gibberish
        flag) and ``ascii_chars`` (True when the text is pure ASCII).
    """
    lang_list = []
    wtl = WhatTheLang()

    for sent_dict in story_sentences:
        text = sent_dict["text"]

        # predict_lang can return non-string sentinels or raise on odd input;
        # normalise every failure to "UKN".  Narrowed from a bare `except:` so
        # SystemExit / KeyboardInterrupt are no longer swallowed.
        try:
            lang = wtl.predict_lang(text)
            if not isinstance(lang, str):
                lang = "UKN"
        except Exception:
            lang = "UKN"

        # Very short strings give the gibberish detector nothing to work with,
        # so treat them as sensible; on detector failure assume nonsense.
        try:
            is_nonsense = False if len(text) <= 10 else nonsense(text)
        except Exception:
            is_nonsense = True

        is_eng = isAscii(text)

        lang_list.append(
            dict(sentence_id=sent_dict["id"], lang=lang,
                 nonsense=is_nonsense, ascii_chars=is_eng))

    return lang_list
Ejemplo n.º 2
0
def translate(text, target="en", src=False):
    """Translate *text* into *target* unless it is already in that language.

    Args:
        text: input string.
        target: ISO language code to translate into.
        src: when True, also return the (detected) source language code.

    Returns:
        The possibly-translated text, or ``(text, source_lang)`` when
        ``src`` is True.
    """
    wtl = WhatTheLang()
    detected = wtl.predict_lang(text)
    if detected != target:
        info("translating...")
        translator = Translator()
        translated = translator.translate(text, dest=target)
        text = translated.text
        detected = translated.src
    # BUG FIX: the original returned a (text, src) tuple only on the
    # translation path; when the text was already in the target language a
    # caller doing `text, lang = translate(t, src=True)` crashed on a bare
    # string.  Return a consistent shape for both paths.
    if src:
        return text, detected
    return text
Ejemplo n.º 3
0
def whatlang(text, label):
    """Return True when WhatTheLang identifies *text* as language *label*.

    Any detection error is printed and treated as a non-match.
    """
    try:
        detected = WhatTheLang().predict_lang(text)
        return detected == label
    except Exception as err:
        print(err)
        return False
Ejemplo n.º 4
0
 def translate(self, text=None, target="en", src=False):
     """Translate *text* (default: ``self.text``) into *target*.

     Stores the translated text on ``self.translated``; when *src* is True
     and a translation happened, also records the detected source language
     on ``self.lang``.  If the text is already in *target*, it is stored
     unchanged.
     """
     # Idiom fix: identity comparison for None instead of `== None`.
     if text is None:
         text = self.text
     wtl = WhatTheLang()
     if wtl.predict_lang(text) != target:
         self.info("translating...")
         translator = Translator()
         translated = translator.translate(text, dest=target)
         self.translated = translated.text
         if src:
             self.lang = translated.src
     else:
         self.translated = text
Ejemplo n.º 5
0
    def translate(self, text=None, target="de", src=False):
        """Translate *text* (default: ``self.text``) into *target*.

        Tries a local Helsinki-NLP Marian model first, caching loaded
        models/tokenizers in the module-level ``translation_engines`` /
        ``translation_tokenizers`` dicts; on any local failure it falls back
        to the online ``Translator``.  The result is stored on
        ``self.translated`` and, when *src* is True, the detected source
        language on ``self.lang``.  If language detection itself fails, the
        original text is stored untranslated.
        """
        global translation_engines
        global translation_tokenizers
        if (text == None):
            text = self.text
        wtl = WhatTheLang()
        try:
            langcode = wtl.predict_lang(text)
            if (langcode != target):
                try:
                    # Marian model names follow the opus-mt-{src}-{tgt} scheme.
                    mname = f'Helsinki-NLP/opus-mt-{langcode}-{target}'  # model name
                    model = None
                    tok = None

                    # Reuse a previously loaded model/tokenizer pair when
                    # possible; from_pretrained loads are expensive.
                    if (mname in translation_engines.keys()):
                        model = translation_engines[mname]
                        tok = translation_tokenizers[mname]
                    else:
                        model = MarianMTModel.from_pretrained(mname)
                        tok = MarianSentencePieceTokenizer.from_pretrained(
                            mname)

                        translation_engines[mname] = model
                        translation_tokenizers[mname] = tok

                    batch = tok.prepare_translation_batch(
                        src_texts=[text])  # don't need tgt_text for inference
                    gen = model.generate(
                        **batch)  # for forward pass: model(**batch)
                    words: List[str] = tok.decode_batch(
                        gen, skip_special_tokens=True)
                    self.translated = words[0]

                    if (src):
                        self.lang = langcode

                    self.info("translated local...")

                except Exception as e:
                    # Local translation failed (e.g. no model exists for this
                    # language pair) — fall back to the online translator.
                    print(e)
                    self.info("translating...")
                    translator = Translator()
                    translated = translator.translate(text, dest=target)
                    self.translated = translated.text
                    if (src):
                        self.lang = translated.src

            else:
                self.translated = text
        except:
            # Language detection failed; keep the text untranslated.
            self.translated = text
Ejemplo n.º 6
0
def generate_lang_id_report(ref, outs,
                            model="wtl",
                            min_length=5,
                            print_lines=False,
                            print_line_numbers=False):
    """Build per-output language-identification reports.

    Args:
        ref: reference corpus (unused here; kept for interface parity).
        outs: iterable of outputs, each a list of tokenized sentences.
        model: "wtl" (WhatTheLang) or "langid".
        min_length: sentences with fewer tokens are binned separately.
        print_lines / print_line_numbers: collect matching lines / indices.

    Returns:
        A ``reporters.LangIDreport`` with the generated report.

    Raises:
        NotImplementedError: for an unknown *model*.
    """
    # BUG FIX: the original raised for an unknown model only inside the
    # per-sentence loop, and only when a sentence passed the length filter —
    # an invalid model with all-short input silently produced a report.
    # Validate once, up front.
    if model not in ("wtl", "langid"):
        raise NotImplementedError(f"Unknown model for language identification: '{model}'.")
    wtl = WhatTheLang() if model == "wtl" else None

    lang_id_reports = []
    lang_id_lines_reports = []
    lang_id_line_numbers_reports = []
    for out in outs:
        langs = defaultdict(int)
        lang_lines = defaultdict(list)
        lang_line_numbers = defaultdict(list)
        for i, sentence in enumerate(out, start=1):
            line = corpus_utils.list2str(sentence)
            if len(sentence) >= int(min_length):
                if model == "langid":
                    (lang, prob) = langid.classify(line)
                else:
                    lang = wtl.predict_lang(line)
                langs[lang] += 1
                if print_line_numbers:
                    lang_line_numbers[lang].append(i)
                if print_lines:
                    lang_lines[lang].append(line)
            else:
                langs["shorter than min_length"] += 1
                if print_line_numbers:
                    lang_line_numbers["shorter than min_length"].append(i)
                if print_lines:
                    lang_lines["shorter than min_length"].append(line)
        lang_id_reports.append(langs)
        lang_id_lines_reports.append(lang_lines)
        lang_id_line_numbers_reports.append(lang_line_numbers)

    reporter = reporters.LangIDreport(model, lang_id_reports,
                                      lang_id_lines_reports,
                                      lang_id_line_numbers_reports,
                                      print_lines, print_line_numbers)
    reporter.generate_report()
    return reporter
Ejemplo n.º 7
0
def main(
    dump_file: Path,
    output_file,
    corpus_file: Path,
    limit=None,
    sample_size=None,
    random_state=None,
):
    """Extract English GitHub issue bodies from a CSV dump.

    Reads *dump_file*, optionally samples *sample_size* rows, detects each
    body's language in batches, keeps only English rows, cleans them, and
    writes the result to *output_file* (JSON lines) and *corpus_file*
    (one body per line).
    """
    logger.info(f'Reading file {dump_file}')
    df_github = pd.read_csv(dump_file, nrows=limit)

    if sample_size is not None:
        logger.info(f'Selecting {sample_size} random samples...')
        df_github = df_github.sample(sample_size)

    wtl = WhatTheLang()
    bodies = df_github['body'].tolist()
    # Batch the texts so predict_lang gets fixed-size chunks.
    batch_size = 512
    batches = [
        bodies[i:i + batch_size] for i in range(0, len(bodies), batch_size)
    ]
    langs = []

    logger.info('Recognizing language...')
    for batch in tqdm(batches):
        langs += wtl.predict_lang(batch)

    df_github['lang'] = langs

    # BUG FIX: take an explicit copy — the original assigned into a slice
    # view of df_github, which triggers pandas' SettingWithCopyWarning and
    # may silently fail to persist the cleaned bodies.
    df_english = df_github[df_github['lang'] == 'en'].copy()
    df_english['body'] = df_english['body'].swifter.apply(clean_text)

    df_lm = pd.DataFrame({'issue_body': df_english['body']})

    logger.info(f'Saving {len(df_lm)} rows')

    df_lm.to_json(output_file, orient='records', lines=True)
    corpus_file.write_text('\n'.join(df_english['body']),
                           encoding='utf8',
                           errors='strict')
Ejemplo n.º 8
0
    def load_links(self):
        """Load the links training data, preparing and caching it on first use.

        Uses the module-level ``loaded_df`` cache; on a cold start it reads
        the local CSV (downloading it if absent), fills missing verified
        fields, detects each row's language and keeps up to 500 rows each of
        English, Spanish and Portuguese.  Returns a copy filtered by
        ``self.filter``.
        """
        global loaded_df
        if loaded_df is not None:
            df = loaded_df
        else:
            try:
                df = pd.read_csv("train_data/links.csv")
            except FileNotFoundError:
                print("Downloading links data...")
                df = pd.read_json("https://api.fakenewsdetector.org/links/all")
                df.to_csv("train_data/links.csv")

            # Drop rows where BOTH title and content are missing.
            df.dropna(subset=["title", "content"], inplace=True, how="all")
            # Prefer human-verified labels, falling back to the raw ones.
            df["category_id"] = df['verified_category_id'].fillna(
                df['category_id'])
            df["clickbait_title"] = df['verified_clickbait_title'].fillna(
                df['clickbait_title'])

            df = df.fillna('')

            # Limiting
            df = df[0:5000]
            df["title_content"] = self.join_text_and_content(df)
            print("Detecting language and limiting links...")
            wtl = WhatTheLang()
            # Only the first 50 chars are needed for reliable detection.
            df["lang"] = [
                wtl.predict_lang(text[0:50]) for text in df["title_content"]
            ]
            # FIX: DataFrame.append was deprecated in pandas 1.4 and removed
            # in 2.0 — build the balanced subset with pd.concat instead.
            df = pd.concat([
                df[df["lang"] == 'en'][0:500],
                df[df["lang"] == 'es'][0:500],
                df[df["lang"] == 'pt'][0:500],
            ])
            print(df[["title", "lang"]].groupby(['lang']).agg(['count']).T)

        loaded_df = df
        df = df.loc[self.filter]
        df = df.copy()

        return df
Ejemplo n.º 9
0
def detect(text):
    """Return the language code WhatTheLang predicts for *text*."""
    detector = WhatTheLang()
    return detector.predict_lang(text)
Ejemplo n.º 10
0
 def detect(self):
     """Detect the language of ``self.text`` and store it on ``self.lang``."""
     self.lang = WhatTheLang().predict_lang(self.text)
Ejemplo n.º 11
0
from whatthelang import WhatTheLang

wtl = WhatTheLang()
# Batch prediction: passing a list of texts yields one language code per entry.
samples = ["അമ്മ", "पिता", "teacher"] * 4
print(wtl.predict_lang(samples))