def main(log_path):
    trainer = BARTTrainer(init='bart.large')
    trainer.load_model(f'{log_path}/best_model.pt')

    src_texts, tgt_texts = load_data('test')
    trainer.load_data(split='dev',
                      src_texts=src_texts,
                      tgt_texts=tgt_texts,
                      src_max_len=SRC_MAX_LEN,
                      tgt_max_len=TGT_MAX_LEN)

    test_nll = trainer.evaluate()
    test_ppl = math.exp(test_nll)
    print(f'Test NLL: {test_nll}; Test PPL: {test_ppl}')

    gen_file = open(f'{log_path}/test.hypo', 'w')
    gold_file = open(f'{log_path}/test.gold', 'w')
    formatted_file = open(f'{log_path}/test.log', 'w')
    for src, tgt in zip(src_texts, tgt_texts):
        gen_text = trainer.generate([src])[0]

        gen_text = cleantext.clean(gen_text, extra_spaces=True)
        tgt = cleantext.clean(tgt, extra_spaces=True)

        print(gen_text, file=gen_file)
        print(tgt, file=gold_file)

        print(f'CHAT_HISTORY:\n{src}', file=formatted_file)
        print(f'\nGROUND TRUTH:\n{tgt}', file=formatted_file)
        print(f'\nGENERATION:\n{gen_text}', file=formatted_file)
        print('=' * 100, '\n\n', file=formatted_file)
Example #2
def test_remove_trail_leading_whitespace():
    text_input = b"Sehr geehrte Damen und Herren,\\r\\n\\r\\nich m\\xf6chte Sie bitten, zu folgendem Fall Stellung zu nehmen. Ich habe einen Fotoautomaten f\\xfcr biometrische Passfotos benutzt, der mein Gesicht nicht erkannt hat. Es besteht die Vermutung, dass dieser Fotoautomat vom BSI zertifiziert ist (Zertifikat BSI-DSZ-CC-0985-2018).\\r\\n\\r\\nDer Fotoautomat steht in  .\\r\\n\\r\\n\\r\\n\\t\\r\\n\\t\\tOrt des Automats: \\r\\n\\t\\r\\n\\r\\n\\r\\n\\r\\n \\r\\n\\t\\r\\n\\t\\tMarke: \\r\\n\\t\\r\\n\\r\\n\\r\\n\\r\\n\\r\\nHier noch Text von Anna Lena.\\r\\n\\r\\nMit freundlichen Gr\\xfc\\xdfen"
    text_input = text_input.decode("unicode_escape")
    text_output = """Sehr geehrte Damen und Herren,

ich möchte Sie bitten, zu folgendem Fall Stellung zu nehmen. Ich habe einen Fotoautomaten für biometrische Passfotos benutzt, der mein Gesicht nicht erkannt hat. Es besteht die Vermutung, dass dieser Fotoautomat vom BSI zertifiziert ist (Zertifikat BSI-DSZ-CC-0985-2018).

Der Fotoautomat steht in .

Ort des Automats:

Marke:

Hier noch Text von Anna Lena.

Mit freundlichen Grüßen"""

    print(
        cleantext.clean(
            text_input,
            lower=False,
            lang="de",
            no_line_breaks=False,
            keep_two_line_breaks=True,
        ))

    assert text_output == cleantext.clean(
        text_input,
        lower=False,
        lang="de",
        no_line_breaks=False,
        keep_two_line_breaks=True,
    )
Example #3
def test_whitespace():
    assert cleantext.clean(" peter", normalize_whitespace=False) == " peter"
    assert cleantext.clean(" peter", normalize_whitespace=True) == "peter"
    assert (cleantext.clean(" pet\n\ner",
                            normalize_whitespace=True,
                            no_line_breaks=True) == "pet er")
    assert (cleantext.clean(" pet\n\ner",
                            normalize_whitespace=True,
                            no_line_breaks=False) == "pet\ner")
Example #4
def test_empty_string():
    """Test for clean function

    :param text: raw text
    :return:
    """

    with pytest.raises(CleanTextEmptyString):
        clean('')
Example #5
def process_page(soup):
    orgs = []
    for org in soup.findAll('div', class_='company-item'):
        title = clean(org.find('a').get_text(), lower=True)
        link = org.find('a')['href']

        head_title = clean(
            org.select_one('.company-item-info>dl>dt').get_text(), lower=True)
        head_name = clean(
            org.select_one('.company-item-info>dl>dd').get_text(), lower=True)

        address = clean(org.find('address').get_text(), lower=True)

        inn = clean(
            org.find(text="ИНН").parent.parent.findNext('dd').get_text(),
            lower=True)
        ogrn = clean(
            org.find(text="ОГРН").parent.parent.findNext('dd').get_text(),
            lower=True)
        reg_date = clean(org.find(
            text="Дата регистрации").parent.parent.findNext('dd').get_text(),
                         lower=True)
        reg_cap_dirty = org.find(text="Уставный капитал")
        reg_cap = clean(reg_cap_dirty.parent.parent.findNext('dd').get_text(),
                        lower=True) if reg_cap_dirty is not None else None

        status = org.select_one('.warning-text, .attention-text')
        if status is not None:
            status = clean(status.get_text(), lower=True)

        main_activity = clean(
            org.find(text="Основной вид деятельности").parent.parent.findNext(
                'dd').get_text(),
            lower=True)
        main_activity_code = int(
            re.findall(r'\d+', main_activity.replace('.',
                                                     ''))[0].ljust(6, '0'))

        # clean(..., lower=True) lowercased the status, so compare in lowercase
        if status == 'организация ликвидирована':
            return (orgs, False)

        orgs.append({
            'title': title,
            'link': link,
            'head_title': head_title,
            'head_name': head_name,
            'inn': inn,
            'ogrn': ogrn,
            'reg_date': reg_date,
            'reg_cap': reg_cap,
            'status': status,
            'main_activity': main_activity,
            'main_activity_code': main_activity_code,
            'address': address
        })

    return (orgs, True)
Example #6
def clean_dict(d):
    for k, v in d.items():
        if isinstance(v, str):
            d[k] = clean(v, lower=False, no_line_breaks=True)
        elif isinstance(v, list):
            d[k] = list(
                map(
                    lambda x: clean(x, lower=False, no_line_breaks=True)
                    if isinstance(x, str) else x, v))
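
A minimal usage sketch for clean_dict (the sample dict below is hypothetical); the function mutates the dict in place, cleaning string values and string items inside lists:

record = {
    'title': '  Hello\nWorld ',
    'tags': ['  foo ', 42],
    'count': 3,
}
clean_dict(record)
print(record)  # typically: {'title': 'Hello World', 'tags': ['foo', 42], 'count': 3}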
Example #7
def main():
    os.makedirs(DATA_DIR, exist_ok=True)
    if not os.path.exists(
            os.path.join(DATA_DIR, 'COVID-Dialogue-Dataset-English.txt')):
        os.system(f'wget -P {DATA_DIR} https://raw.githubusercontent.com/UCSD'
                  f'-AI4H/COVID-Dialogue/master/COVID-Dialogue-Dataset'
                  f'-English.txt')

    dialogues_texts_dirty = open(
        os.path.join(DATA_DIR,
                     'COVID-Dialogue-Dataset-English.txt')).read().split('id=')

    dialogues = []
    for text in dialogues_texts_dirty:
        text = text[text.find('Description'):].strip()

        description = text[len('Description\n'):text.find('\nDialogue')]
        description = cleantext.clean(description,
                                      extra_spaces=True,
                                      lowercase=True)

        text = text[text.find('\nPatient:'):]

        utterances, last_person, valid = [], 'None', True
        for x in re.finditer('Doctor:|Patient:', text):
            if x.group() == last_person:
                valid = False
                break
            else:
                last_person = x.group()

            utterance = text[x.end():].split('Patient:')[0].split('Doctor:')[0]
            utterances.append(
                cleantext.clean(utterance, extra_spaces=True, lowercase=True))

        if valid and utterances:
            dialogues.append({
                'description': description,
                'utterances': utterances
            })

    print('#dialogs:', len(dialogues))

    random.seed(11111)
    random.shuffle(dialogues)

    train_size = int(0.8 * len(dialogues))
    dev_size = int(0.1 * len(dialogues))

    pickle.dump(dialogues[:train_size], open(f'{DATA_DIR}/train.pickle', 'wb'))
    pickle.dump(dialogues[train_size:train_size + dev_size],
                open(f'{DATA_DIR}/dev.pickle', 'wb'))
    pickle.dump(dialogues[train_size + dev_size:],
                open(f'{DATA_DIR}/test.pickle', 'wb'))

    print_fairseq_format()
Example #8
def test_remove_trail_leading_whitespace():
    text_input = """
    Sehr geehrte Damen und Herren,

ich möchte Sie bitten, zu folgendem Fall Stellung zu nehmen. Ich habe einen Fotoautomaten für biometrische Passfotos benutzt, der mein Gesicht nicht erkannt hat. Es besteht die Vermutung, dass dieser Fotoautomat vom BSI zertifiziert ist (Zertifikat BSI-DSZ-CC-0985-2018).

Der Fotoautomat steht in  19061  Berlin.



		Marke: Fotofix





		Ort des Automats: Bezirksamt / Bürgeramt / Bürgerbüro





Mit freundlichen Grüßen,
Johannes dfdfd
    """

    text_output = """Sehr geehrte Damen und Herren,

ich möchte Sie bitten, zu folgendem Fall Stellung zu nehmen. Ich habe einen Fotoautomaten für biometrische Passfotos benutzt, der mein Gesicht nicht erkannt hat. Es besteht die Vermutung, dass dieser Fotoautomat vom BSI zertifiziert ist (Zertifikat BSI-DSZ-CC-0985-2018).

Der Fotoautomat steht in 19061 Berlin.

Marke: Fotofix

Ort des Automats: Bezirksamt / Bürgeramt / Bürgerbüro

Mit freundlichen Grüßen,
Johannes dfdfd"""

    print(
        cleantext.clean(
            text_input,
            lower=False,
            lang="de",
            no_line_breaks=False,
            keep_two_line_breaks=True,
        )
    )

    assert text_output == cleantext.clean(
        text_input,
        lower=False,
        lang="de",
        no_line_breaks=False,
        keep_two_line_breaks=True,
    )
Example #9
def split_proximity(text):
    # TODO: add not / - option
    text = text.replace("*", "").replace(":", "").replace("'", '"')
    tokens = smart_split(text)

    for t in tokens:
        t_cl = clean(t, lang="de", lower=False, no_punct=False)
        t_cl_p = clean(t, lang="de", lower=False, no_punct=True)

        if t_cl.lower() == "or":
            continue
        if " " in t or '"' in t_cl:
            yield "' " + t_cl_p.replace(" ", " <-> ") + " '"
        else:
            yield t_cl_p + ":*"
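
A hedged usage sketch for split_proximity, assuming smart_split is django.utils.text.smart_split (which keeps quoted phrases together); the exact tokens depend on clean()'s behaviour, so the result is printed rather than asserted:

query = '"zwei worte" OR einzeln'
# Joins the yielded proximity terms into one tsquery-style string,
# roughly: ' zwei <-> worte ' | einzeln:*
print(" | ".join(split_proximity(query)))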
Example #10
def convert_text_into_sentences(text):
    assert isinstance(text, str)

    # Normalize a text.
    text = clean(
        text,
        fix_unicode=True,
        to_ascii=True,
        lower=True,
        no_line_breaks=False,
        no_urls=True,
        no_emails=True,
        no_phone_numbers=True,
        no_numbers=False,
        no_digits=False,
        no_currency_symbols=False,
        no_punct=False)

    # Remove tags.
    text = re.sub('<[^<]+?>', '', text)
    # Remove invalid characters.
    text = re.sub('[#%\'\(\)\*\+\-\\\/:;<=>@^_`|~\[\]]+', '', text)
    # Convert a text into sentences.
    sentences = sent_tokenize(text)

    return sentences
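
A hedged usage sketch for convert_text_into_sentences; it assumes clean comes from clean-text, sent_tokenize from nltk.tokenize, re is imported at module level, and the NLTK punkt model is available:

# One-time setup if the sentence tokenizer model is missing:
# import nltk; nltk.download('punkt')
sample = "Visit https://example.com for details. <b>Prices</b> start at 5 USD!"
for sentence in convert_text_into_sentences(sample):
    print(sentence)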
Example #11
def pre_procee_2(txt):
#     print(type(txt),txt)
    txt = re.sub(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});'," ",txt)
    txt = clean(pre_process_1(txt),
                fix_unicode=True, # fix various unicode errors
                to_ascii=True, # transliterate to closest ASCII representation
                lower=True, # lowercase text
                no_line_breaks=True, # fully strip line breaks as opposed to only normalizing them
                no_urls=True, # replace all URLs with a special token
                no_emails=True, # replace all email addresses with a special token
                no_phone_numbers=True, # replace all phone numbers with a special token
                no_numbers=True, # replace all numbers with a special token
                no_digits=False, # replace all digits with a special token
                no_currency_symbols=True, # replace all currency symbols with a special token
                no_punct=False, # fully remove punctuation
                replace_with_url="URL>",
                replace_with_email="EMAIL",
                replace_with_phone_number="PHONE",
                replace_with_number="NUMBER",
                replace_with_digit="0",
                replace_with_currency_symbol="CURR",
                lang="en") # set to 'de' for German special handling)
#     txt = [porter_stemmer.stem(i) for i in word_tokenize(str(txt).lower())]
    txt = [i for i in word_tokenize(str(txt).lower()) if i not in stop_lst]
    # filter the token list directly instead of re-tokenizing its string repr
    txt = [w for w in txt if w not in stop_words]
    txt = ' '.join(txt)
    txt = txt.translate(str.maketrans('', '', string.punctuation))
    txt = re.sub(r'[^\w\s]','',txt)
    txt = re.sub(r'_+','',txt)
    txt = re.sub(r"\s+"," ",txt)
    txt = ''.join(txt)
    txt = txt.strip()
    return txt
Example #12
def clean():
    for t in gen(skip=True):
        filename = get_txt_file(t)
        with open(filename, "r") as f:
            old_txt = f.read()
        new_txt = old_txt
        new_txt = cleantext.clean(new_txt, fix_unicode=True, to_ascii=True, lower=False, no_urls=True)

        new_txt = re.sub(r"For more information about District economic conditions,? visit: URL", "", new_txt)
        new_txt = new_txt.replace("%-", " percent to ").replace("%", " percent")
        new_txt = new_txt.replace(" & ", " and ")
        # new_txt = new_txt.replace("&", " and ") # ex. R&D
        # new_txt = re.sub(r" -(?=[\d.])", " minus ", new_txt)
        # new_txt = new_txt.replace("+", " plus ")
        new_txt = new_txt.replace("=", " equals ") # only one instance

        new_txt = re.sub(r"[<>~*]", "", new_txt)
        new_txt = re.sub(r"\-\-+", " , ", new_txt)
        new_txt = re.sub(r"\?(?=[\w])", "? ", new_txt).replace(" ?", "?")
        new_txt = re.sub(r"\s+,", ",", new_txt).replace(",,", ",")
        new_txt = re.sub(r"\s+\.(?=[^0-9])", " ", new_txt)
        new_txt = new_txt.replace("...", " ")
        new_txt = new_txt.replace("..", ".").replace(",.", ",") # ".," is legitimate
        new_txt = new_txt.replace("[", "").replace("]", "") # parentheses are legitimate

        new_txt = re.sub(r"\s+", " ", new_txt)
        new_txt = new_txt.strip()
        with open(filename, "w") as f:
            f.write(new_txt)
Example #13
def Encrypt(p):

    c = []

    with open(inputfile, mode='r') as f:  # Read plaintext from file (p.csv)
        global words
        file_ = f.read()  # clean() expects a single string, not a list of lines
        words = cleantext.clean(file_, numbers=True, punct=True)

    with open(outputfile, mode='w') as f_:

        fieldnames = ['x', 'y', 'z']
        writer = csv.DictWriter(f_, fieldnames=fieldnames)
        writer.writeheader()
        print(" \n Encryption of Plaintext (" + inputfile +
              ") -> Cyphertext (" + outputfile + "+) : \n")

        for index, char in enumerate(words):

            counter_ = Counter(p)

            x = ((index * index) + 1)
            y = alpha.index(char.lower()) * x
            z = counter_[char]
            w = (x, y, z)
            c.append(w)

            writer.writerow({'x': x, 'y': y, 'z': z})

    print(c)
Example #14
def main(log_path, wiki_sup=True):
    supervisor = pickle.load(open('supervisions/supervisor.pickle', 'rb')) \
        if wiki_sup else None
    dataset = MANewsDataset(split='test',
                            supervisor=supervisor,
                            n_wiki_words=N_WIKI_WORDS)
    test_examples = [example for example in dataset]

    bart = BART.load_from_checkpoint(
        init=MODEL_INIT,
        checkpoint_path=f'{log_path}/best_model.ckpt').to('cuda')
    bart.eval()

    src_file = open(f'{log_path}/test.source', 'w')
    gold_file = open(f'{log_path}/test.gold', 'w')
    hypo_file = open(f'{log_path}/test.hypo', 'w')

    for i in trange(0, len(test_examples), BATCH_SIZE, desc=f'Generating'):
        batch_examples = test_examples[i:i + BATCH_SIZE]

        gen_texts = bart.generate(
            src_texts=[example['src'] for example in batch_examples],
            max_len=MAX_LEN,
            min_len=MIN_LEN,
            beam_size=BEAM_SIZE,
            len_penalty=LEN_PENALTY,
            no_repeat_ngram_size=NO_REPEAT_NGRAM_SIZE)

        for example, gen_text in zip(batch_examples, gen_texts):
            print(example['src'].replace('\n\n', ' ||| '), file=src_file)
            print(example['tgt'], file=gold_file)
            print(cleantext.clean(gen_text, extra_spaces=True, lowercase=True),
                  file=hypo_file)
Example #15
def post_extract():
    # 1. write temp file to disk
    f = request.files['file']  # uploaded file (via form / REST client)
    temp = tempfile.NamedTemporaryFile(prefix="jargonbuster_", delete=False)
    extractedText = ""
    info = ""
    try:
        f.save(temp)
        extractor = TikaExtractor(temp)  #PDFMinerExtractor(temp)
        temp.close()
        extractedText = extractor.extractText()
        info = extractor.extractInfo()

        # 3. denoise: do some basic cleaning (e.g. linebreaks)
        extractedText = clean(extractedText, no_line_breaks=True, lang="en")
        # remove remains from word breaks (like "re- miniscence")
        extractedText = re.sub(r'([a-z])\- ([a-z])', r'\1\2', extractedText)
        extractedText = re.sub(r'\.\d+\s+([a-z])+', r'\1', extractedText)
        extractedText = re.sub(r'\[[^]]*\]', r'', extractedText)
        #extractedText = re.sub(r'https?:\/\/.\S+', r'', extractedText)
        # use auto correct
        #spell = Speller (lang="en")
        #extractedText = spell(extractedText)

    finally:
        temp.close()
        os.unlink(temp.name)

    response = jsonify(text=extractedText, info=info)

    return response
Example #16
def getTokenized(text):
    firstIndex = text.find("p id=\"speakable-summary\"") + 25
    secondIndex = text.find("<footer class", firstIndex)
    content = text[firstIndex:secondIndex]

    cleanr = re.compile('<.*?>')
    content = re.sub(cleanr, '', content)
    content = content.replace('\n', ' ')
    content = content.replace('\t', ' ')
    content = html.unescape(content)

    content = clean(
        content,
        fix_unicode=True,  # fix various unicode errors
        to_ascii=True,  # transliterate to closest ASCII representation
        lower=False,  # lowercase text
        no_line_breaks=False,  # fully strip line breaks as opposed to only normalizing them
        no_urls=False,  # replace all URLs with a special token
        no_emails=False,  # replace all email addresses with a special token
        no_phone_numbers=False,  # replace all phone numbers with a special token
        no_numbers=False,  # replace all numbers with a special token
        no_digits=False,  # replace all digits with a special token
        no_currency_symbols=False,  # replace all currency symbols with a special token
        no_punct=False,  # fully remove punctuation
    )

    tokenizer = RegexpTokenizer(r'([%$&\-\+]?\b[^\s]+\b[%$&]?)')
    content = tokenizer.tokenize(content)

    return content
Example #17
def getTokenized(text):
    firstIndex = text.find('body":"')+7
    secondIndex = 0
    if "Field Level Media" in text:
        secondIndex = text.find("Field Level Media", firstIndex)
    else:
        secondIndex = text.find('","attribution"', firstIndex)
    content = text[firstIndex:secondIndex]


    content = clean(content,
            fix_unicode=True,               # fix various unicode errors
            to_ascii=True,                  # transliterate to closest ASCII representation
            lower=False,                     # lowercase text
            no_line_breaks=False,           # fully strip line breaks as opposed to only normalizing them
            no_urls=False,                  # replace all URLs with a special token
            no_emails=False,                # replace all email addresses with a special token
            no_phone_numbers=False,         # replace all phone numbers with a special token
            no_numbers=False,               # replace all numbers with a special token
            no_digits=False,                # replace all digits with a special token
            no_currency_symbols=False,      # replace all currency symbols with a special token
            no_punct=False,                 # fully remove punctuation
        )
    cleanr = re.compile('<.*?>')
    content = re.sub(cleanr, '', content)
    content = content.replace('\n',' ')
    content = content.replace('\t',' ')


    tokenizer = RegexpTokenizer(r'([%$&\-\+]?\b[^\s]+\b[%$&]?)')
    content = tokenizer.tokenize(content)
    return content
Example #18
 def clean_text(self, text):
     return clean(
         text,
         fix_unicode=True,  # fix various unicode errors
         to_ascii=True,  # transliterate to closest ASCII representation
         lower=True,  # lowercase text
         no_line_breaks=False,  # fully strip line breaks as opposed to only normalizing them
         no_urls=True,  # replace all URLs with a special token
         no_emails=True,  # replace all email addresses with a special token
         no_phone_numbers=True,  # replace all phone numbers with a special token
         no_numbers=True,  # replace all numbers with a special token
         no_digits=True,  # replace all digits with a special token
         no_currency_symbols=True,  # replace all currency symbols with a special token
         no_punct=True,  # fully remove punctuation
         replace_with_url="<URL>",
         replace_with_email="<EMAIL>",
         replace_with_phone_number="<PHONE>",
         replace_with_number="<NUMBER>",
         replace_with_digit="0",
         replace_with_currency_symbol="<CUR>",
         lang="en"  # set to 'de' for German special handling
     )
Example #19
	def get_lexicon(self, file):

		vocab = []

		for line in self.Load.read_file(file):
			
			#Use clean-text
			line = clean(line,
							fix_unicode = True,
							to_ascii = False,
							lower = True,
							no_line_breaks = True,
							no_urls = True,
							no_emails = True,
							no_phone_numbers = True,
							no_numbers = True,
							no_digits = True,
							no_currency_symbols = True,
							no_punct = True,
							replace_with_punct = "",
							replace_with_url = "<URL>",
							replace_with_email = "<EMAIL>",						
							replace_with_phone_number = "<PHONE>",
							replace_with_number = "<NUMBER>",
							replace_with_digit = "0",
							replace_with_currency_symbol = "<CUR>"
							)

			line = line.split()
			vocab += line

		return set(vocab)
Example #20
def cleaning(text):
    text = text.strip()

    # regular cleaning
    text = clean(text,
                 fix_unicode=True,
                 to_ascii=False,
                 lower=True,
                 no_line_breaks=True,
                 no_urls=True,
                 no_emails=True,
                 no_phone_numbers=True,
                 no_numbers=False,
                 no_digits=False,
                 no_currency_symbols=True,
                 no_punct=False,
                 replace_with_url="",
                 replace_with_email="",
                 replace_with_phone_number="",
                 replace_with_number="",
                 replace_with_digit="0",
                 replace_with_currency_symbol="",
                 )

    # normalizing
    normalizer = hazm.Normalizer()
    text = normalizer.normalize(text)

    # removing weird patterns
    weird_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u'\U00010000-\U0010ffff'
                               u"\u200d"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\u3030"
                               u"\ufe0f"
                               u"\u2069"
                               u"\u2066"
                               # u"\u200c"
                               u"\u2068"
                               u"\u2067"
                               "]+", flags=re.UNICODE)

    text = weird_pattern.sub(r'', text)

    # removing extra spaces, hashtags
    text = re.sub("#", "", text)
    text = re.sub(r"\s+", " ", text)
    text = re.sub("a", "ِ", text)
    return text
Example #21
    def load(self, line):

        #Tokenize zho
        if self.language == "zho" and self.zho_split == True:

            line = [
                x for x in self.tk.cut(line, cut_all=True, HMM=True) if x != ""
            ]
            line = " ".join(line)

        #Use clean-text
        line = clean(line,
                     fix_unicode=True,
                     to_ascii=False,
                     lower=True,
                     no_line_breaks=True,
                     no_urls=True,
                     no_emails=True,
                     no_phone_numbers=True,
                     no_numbers=True,
                     no_digits=True,
                     no_currency_symbols=True,
                     no_punct=True,
                     replace_with_punct="",
                     replace_with_url="<URL>",
                     replace_with_email="<EMAIL>",
                     replace_with_phone_number="<PHONE>",
                     replace_with_number="<NUMBER>",
                     replace_with_digit="0",
                     replace_with_currency_symbol="<CUR>")

        line = self.r.tagRawSentenceHash(rawLine=line)
        #Array of tuples (LEX, POS, CAT)

        return np.array(line)
Example #22
def trim(line, vecdict, vec_len, threshold=0.6):
    line = clean(line)
    words = importance(line, vecdict, vec_len)
    if words == []:
        return ""
    max_val = max(map(lambda x: x[1], words))
    return " ".join(map(lambda x: x[0], filter(lambda x: x[1]/max_val > threshold, words)))
Example #23
 def get_phone_string(self, text):
     utt = clean(text,
                 fix_unicode=True,
                 to_ascii=False,
                 lower=False,
                 lang=self.clean_lang)
     utt = self.expand_abbrevations(utt)  # keep the expanded result (strings are immutable)
     utt = utt.replace("_SIL_", "~")
     phones = phonemizer.phonemize(utt,
                                   language_switch='remove-flags',
                                   backend="espeak",
                                   language=self.g2p_lang,
                                   preserve_punctuation=True,
                                   strip=True,
                                   punctuation_marks=';:,.!?¡¿—…"«»“”~',
                                   with_stress=self.use_stress).replace(";", ",") \
         .replace(":", ",").replace('"', ",").replace("-", ",").replace("-", ",").replace("\n", " ") \
         .replace("\t", " ").replace("¡", "").replace("¿", "").replace(",", "~")
     phones = re.sub("~+", "~", phones)
     if not self.use_prosody:
         phones = phones.replace("ˌ", "").replace("ː", "").replace(
             "ˑ", "").replace("˘", "").replace("|", "").replace("‖", "")
     if not self.use_word_boundaries:
         phones = phones.replace(" ", "")
     return phones + "#"
Example #24
def stats():
    q = request.args.get("q")
    if q is None:
        return jsonify({})
    q = cleantext.clean(q, lang="de")
    # for qs like "token" remove the quotes to count
    counting_q = q.replace('"', "").replace("'", "")

    query, page, jurisdiction, max_year, min_year = build_query()

    all_results = (
        query.join(Document)
        .filter(Document.year >= trends_min_year)
        .search(q)
        .with_entities(Document.year, DocumentPage.content)
    )
    d = defaultdict(int)

    for r in all_results:
        year = r.year
        count = r.content.lower().count(counting_q)
        d[year] += count

    for year_tup in get_year_totals():
        d[year_tup[0]] /= year_tup[1]

    # fix NSU
    if q.lower() == "nsu":
        for y in range(trends_min_year, 2009):
            d[y] = 0

    return jsonify([q, d])
Example #25
def getEachResumeText(current_batch,
                      pdf_reader,
                      bookmarks,
                      bookmarks_len,
                      start_bookmark=0):
    start_page = pdf_reader.getDestinationPageNumber(bookmarks[start_bookmark])
    # at last bookmark set end_page to last page of the document
    if bookmarks_len == start_bookmark:
        end_page = pdf_reader.getNumPages()
    # else set it to next bookmark
    else:
        end_bookmark = start_bookmark + 1
        end_page = pdf_reader.getDestinationPageNumber(bookmarks[end_bookmark])

    # iterate over a complete resume to extract text
    clean_extracted = ''
    for page in range(start_page, end_page):
        with pdfplumber.open(current_batch) as pdf:
            current_page = pdf.pages[page]
            extracted = current_page.extract_text()
            cleaned = clean(extracted,
                            lower=True,
                            no_line_breaks=True,
                            no_phone_numbers=True,
                            no_emails=True,
                            no_urls=True,
                            no_numbers=True,
                            no_digits=True)
            clean_extracted += cleaned
    # helper function to save what's extracted by pdfplumber in a text file
    saveCleanText(clean_extracted)
    return clean_extracted, start_page
Example #26
def cleanup_text(data):

    email_headers = re.compile(r'^from:.*(?:\r?\n(?!\r?\n).*)*', re.IGNORECASE)
    clean_data = list(map(lambda x: email_headers.sub('', x).strip(), data[:]))  # remove email headers
    clean_data = list(map(lambda x: re.sub(r"\r?\n\r?\n.*(?:\r?\n(?!\r?\n).*)*$",'', x).strip(), clean_data[:])) # remove ending signatures
    #clean_data = list(map(lambda x: re.sub(r"[a-zA-Z0-9_.\$\-]*@(\w*\.)*\w*",'',x).strip(), clean_data[:])) # remove emails

    ## using clean-text library
# =============================================================================
#     # usage:
#     clean("some input",
#         fix_unicode=True,               # fix various unicode errors
#         to_ascii=True,                  # transliterate to closest ASCII representation
#         lower=True,                     # lowercase text
#         no_line_breaks=False,           # fully strip line breaks as opposed to only normalizing them
#         no_urls=False,                  # replace all URLs with a special token
#         no_emails=False,                # replace all email addresses with a special token
#         no_phone_numbers=False,         # replace all phone numbers with a special token
#         no_numbers=False,               # replace all numbers with a special token
#         no_digits=False,                # replace all digits with a special token
#         no_currency_symbols=False,      # replace all currency symbols with a special token
#         no_punct=False,                 # remove punctuations
#         replace_with_punct="",          # instead of removing punctuations you may replace them
#         replace_with_url="<URL>",
#         replace_with_email="<EMAIL>",
#         replace_with_phone_number="<PHONE>",
#         replace_with_number="<NUMBER>",
#         replace_with_digit="0",
#         replace_with_currency_symbol="<CUR>",
#         lang="en"                       # set to 'de' for German special handling
#     )
# =============================================================================

    clean_data = list(map(lambda x: clean(x, no_urls=True, no_emails=True, no_digits=True, no_currency_symbols=True, no_punct=True).strip(), clean_data[:]))
    return clean_data
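
A hedged usage sketch for cleanup_text (the sample message is made up); it takes a list of raw message strings and returns a cleaned list:

raw_messages = [
    "From: alice@example.com\nSubject: meeting\n\nLet us meet at 5pm.\n\n--\nAlice\nACME Corp",
]
for cleaned in cleanup_text(raw_messages):
    print(cleaned)  # header block and trailing signature removed, then clean-text applied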
Example #27
def clean_text(text):
    if isinstance(text, float):
        return ""
    text = re.sub(r'\([^)]*\)', '', text)
    cleaned_text = clean(
        text,
        fix_unicode=True,  # fix various unicode errors
        to_ascii=True,  # transliterate to closest ASCII representation
        lower=True,  # lowercase text
        no_line_breaks=True,  # fully strip line breaks as opposed to only normalizing them
        no_urls=True,  # replace all URLs with a special token
        no_emails=True,  # replace all email addresses with a special token
        no_phone_numbers=True,  # replace all phone numbers with a special token
        no_numbers=True,  # replace all numbers with a special token
        no_digits=True,  # replace all digits with a special token
        no_currency_symbols=True,  # replace all currency symbols with a special token
        no_punct=False,  # fully remove punctuation
        replace_with_url='[URL]',
        replace_with_email='[EMAIL]',
        replace_with_phone_number='[PHONE]',
        replace_with_number="[NUMBER]",
        replace_with_digit="0",
        replace_with_currency_symbol="[CUR]",
        lang="en"  # set to 'de' for German special handling
    )
    cleaned_text = cleaned_text.strip()
    return cleaned_text
Example #28
def cl(x):
    return clean(x,
                 no_urls=True,
                 no_digits=True,
                 no_punct=True,
                 no_line_breaks=True,
                 lang='de')
Example #29
def text_cleaner(string2):

    polish_stopwords = pd.read_csv(
        '/home/erazer/PycharmProjects/jigsaw/nlp_utils/polish_stopwords.txt',
        header=None,
        names=['words']).words.to_list()
    from cleantext import clean
    cleaned_string = clean(string2,
                           no_urls=True,
                           no_digits=True,
                           no_line_breaks=True,
                           no_emails=True,
                           no_phone_numbers=True,
                           no_numbers=True,
                           no_currency_symbols=True,
                           no_punct=True,
                           replace_with_digit="",
                           replace_with_url="",
                           replace_with_email="",
                           replace_with_currency_symbol="",
                           replace_with_number="",
                           replace_with_phone_number="")

    filtered_sentence = [
        elem for elem in cleaned_string.split(' ')
        if elem not in polish_stopwords
    ]
    # optional: ' '.join(filtered_sentence) would return a single string instead of a list of words
    return filtered_sentence
Example #30
def my_clean(text):
    emojiPattern = re.compile(
        pattern="["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "]+",
        flags=re.UNICODE)
    newText = emojiPattern.sub(r'', text)
    newText = clean(newText,
                    fix_unicode=True,
                    to_ascii=True,
                    lower=True,
                    no_line_breaks=True,
                    lang="en")
    newText = re.sub(r'http\S+', '', newText)
    # reuse the emoji pattern compiled above instead of recompiling it
    newText = emojiPattern.sub(r'', newText)
    newText = re.sub(r'[^\w\s]', '', newText)

    return newText
Example #31
def text_processing(text):
    text = re.sub(r'<.*>', '', text)
    text = clean(
        text,
        fix_unicode=True,  # fix various unicode errors
        to_ascii=True,  # transliterate to closest ASCII representation
        lower=True,  # lowercase text
        no_line_breaks=False,  # fully strip line breaks as opposed to only normalizing them
        no_urls=False,  # replace all URLs with a special token
        no_emails=False,  # replace all email addresses with a special token
        no_phone_numbers=False,  # replace all phone numbers with a special token
        no_numbers=False,  # replace all numbers with a special token
        no_digits=False,  # replace all digits with a special token
        no_currency_symbols=False,  # replace all currency symbols with a special token
        no_punct=False,  # fully remove punctuation
        replace_with_url="<URL>",
        replace_with_email="<EMAIL>",
        replace_with_phone_number="<PHONE>",
        replace_with_number="<NUMBER>",
        replace_with_digit="0",
        replace_with_currency_symbol="<CUR>",
    )

    url = re.findall(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\), ]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        text)

    text = re.sub(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\), ]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        '', text)
    text = re.sub(r'www\S*', '', text)
    return text, url
Example #32
def read_train(train_file):
    lines = []
    y = []
    vectorizer = CountVectorizer(min_df=3)
    tf_idf = TfidfTransformer()

    for parts in utils.read_train(train_file):
        is_blocked = parts[8]
        desc = cleantext.clean(parts[4], False)
        lines.append(desc)
        y.append(int(is_blocked))

    X_counts = vectorizer.fit_transform(lines)  # raw token counts
    X_nb = tf_idf.fit_transform(X_counts)  # tf-idf features
    X_log = binarize(X_counts)  # binarized (0/1) features

    return X_nb, X_log, numpy.asarray(y)
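
The X_nb / X_log naming suggests the tf-idf matrix is meant for a Naive Bayes classifier and the binarized counts for a logistic regression; a minimal sketch under that assumption (the models and the train-file path are illustrative, not from the original code):

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

X_nb, X_log, y = read_train('train.tsv')  # placeholder path
MultinomialNB().fit(X_nb, y)
LogisticRegression(max_iter=1000).fit(X_log, y)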