def main(log_path):
    trainer = BARTTrainer(init='bart.large')
    trainer.load_model(f'{log_path}/best_model.pt')

    src_texts, tgt_texts = load_data('test')
    # the test texts are fed through the trainer's dev slot so that
    # trainer.evaluate() runs over them
    trainer.load_data(split='dev',
                      src_texts=src_texts,
                      tgt_texts=tgt_texts,
                      src_max_len=SRC_MAX_LEN,
                      tgt_max_len=TGT_MAX_LEN)

    test_nll = trainer.evaluate()
    test_ppl = math.exp(test_nll)
    print(f'Test NLL: {test_nll}; Test PPL: {test_ppl}')

    gen_file = open(f'{log_path}/test.hypo', 'w')
    gold_file = open(f'{log_path}/test.gold', 'w')
    formatted_file = open(f'{log_path}/test.log', 'w')

    for src, tgt in zip(src_texts, tgt_texts):
        gen_text = trainer.generate([src])[0]
        gen_text = cleantext.clean(gen_text, extra_spaces=True)
        tgt = cleantext.clean(tgt, extra_spaces=True)

        print(gen_text, file=gen_file)
        print(tgt, file=gold_file)

        print(f'CHAT_HISTORY:\n{src}', file=formatted_file)
        print(f'\nGROUND TRUTH:\n{tgt}', file=formatted_file)
        print(f'\nGENERATION:\n{gen_text}', file=formatted_file)
        print('=' * 100, '\n\n', file=formatted_file)
def test_remove_trail_leading_whitespace():
    text_input = b"Sehr geehrte Damen und Herren,\\r\\n\\r\\nich m\\xf6chte Sie bitten, zu folgendem Fall Stellung zu nehmen. Ich habe einen Fotoautomaten f\\xfcr biometrische Passfotos benutzt, der mein Gesicht nicht erkannt hat. Es besteht die Vermutung, dass dieser Fotoautomat vom BSI zertifiziert ist (Zertifikat BSI-DSZ-CC-0985-2018).\\r\\n\\r\\nDer Fotoautomat steht in .\\r\\n\\r\\n\\r\\n\\t\\r\\n\\t\\tOrt des Automats: \\r\\n\\t\\r\\n\\r\\n\\r\\n\\r\\n \\r\\n\\t\\r\\n\\t\\tMarke: \\r\\n\\t\\r\\n\\r\\n\\r\\n\\r\\n\\r\\nHier noch Text von Anna Lena.\\r\\n\\r\\nMit freundlichen Gr\\xfc\\xdfen"
    text_input = text_input.decode("unicode_escape")
    text_output = """Sehr geehrte Damen und Herren,

ich möchte Sie bitten, zu folgendem Fall Stellung zu nehmen. Ich habe einen Fotoautomaten für biometrische Passfotos benutzt, der mein Gesicht nicht erkannt hat. Es besteht die Vermutung, dass dieser Fotoautomat vom BSI zertifiziert ist (Zertifikat BSI-DSZ-CC-0985-2018).

Der Fotoautomat steht in .

Ort des Automats:

Marke:

Hier noch Text von Anna Lena.

Mit freundlichen Grüßen"""
    print(
        cleantext.clean(
            text_input,
            lower=False,
            lang="de",
            no_line_breaks=False,
            keep_two_line_breaks=True,
        ))
    assert text_output == cleantext.clean(
        text_input,
        lower=False,
        lang="de",
        no_line_breaks=False,
        keep_two_line_breaks=True,
    )
def test_whitespace():
    assert cleantext.clean(" peter", normalize_whitespace=False) == " peter"
    assert cleantext.clean(" peter", normalize_whitespace=True) == "peter"
    assert (cleantext.clean(" pet\n\ner",
                            normalize_whitespace=True,
                            no_line_breaks=True) == "pet er")
    assert (cleantext.clean(" pet\n\ner",
                            normalize_whitespace=True,
                            no_line_breaks=False) == "pet\ner")
def test_empty_string():
    """clean should raise CleanTextEmptyString when given an empty string."""
    with pytest.raises(CleanTextEmptyString):
        clean('')
def process_page(soup):
    orgs = []
    for org in soup.findAll('div', class_='company-item'):
        title = clean(org.find('a').get_text(), lower=True)
        link = org.find('a')['href']
        head_title = clean(
            org.select_one('.company-item-info>dl>dt').get_text(), lower=True)
        head_name = clean(
            org.select_one('.company-item-info>dl>dd').get_text(), lower=True)
        address = clean(org.find('address').get_text(), lower=True)
        # "ИНН" is the taxpayer ID, "ОГРН" the state registration number
        inn = clean(
            org.find(text="ИНН").parent.parent.findNext('dd').get_text(),
            lower=True)
        ogrn = clean(
            org.find(text="ОГРН").parent.parent.findNext('dd').get_text(),
            lower=True)
        reg_date = clean(org.find(
            text="Дата регистрации").parent.parent.findNext('dd').get_text(),
                         lower=True)
        reg_cap_dirty = org.find(text="Уставный капитал")
        reg_cap = clean(reg_cap_dirty.parent.parent.findNext('dd').get_text(),
                        lower=True) if reg_cap_dirty is not None else None
        status = org.select_one('.warning-text, .attention-text')
        if status is not None:
            status = clean(status.get_text(), lower=True)
        main_activity = clean(
            org.find(text="Основной вид деятельности").parent.parent.findNext(
                'dd').get_text(),
            lower=True)
        main_activity_code = int(
            re.findall(r'\d+',
                       main_activity.replace('.', ''))[0].ljust(6, '0'))
        # status was cleaned with lower=True, so compare against the
        # lowercased literal ("organization liquidated")
        if status == 'организация ликвидирована':
            return (orgs, False)
        orgs.append({
            'title': title,
            'link': link,
            'head_title': head_title,
            'head_name': head_name,
            'inn': inn,
            'ogrn': ogrn,
            'reg_date': reg_date,
            'reg_cap': reg_cap,
            'status': status,
            'main_activity': main_activity,
            'main_activity_code': main_activity_code,
            'address': address
        })
    return (orgs, True)
def clean_dict(d):
    for k, v in d.items():
        if isinstance(v, str):
            d[k] = clean(v, lower=False, no_line_breaks=True)
        elif isinstance(v, list):
            # clean string items, leave non-string items untouched
            d[k] = list(
                map(
                    lambda x: clean(x, lower=False, no_line_breaks=True)
                    if isinstance(x, str) else x, v))
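# A minimal usage sketch for clean_dict above; the sample record is made up.
# clean_dict mutates the dict in place: string values are cleaned, string
# items inside list values are cleaned, and non-string items pass through.
record = {
    'title': 'Hello\nWorld  ',
    'tags': ['foo\nbar', 42],
}
clean_dict(record)
print(record)  # expected: {'title': 'Hello World', 'tags': ['foo bar', 42]}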
def main():
    os.makedirs(DATA_DIR, exist_ok=True)

    if not os.path.exists(
            os.path.join(DATA_DIR, 'COVID-Dialogue-Dataset-English.txt')):
        os.system(f'wget -P {DATA_DIR} https://raw.githubusercontent.com/UCSD'
                  f'-AI4H/COVID-Dialogue/master/COVID-Dialogue-Dataset'
                  f'-English.txt')

    dialogues_texts_dirty = open(
        os.path.join(DATA_DIR,
                     'COVID-Dialogue-Dataset-English.txt')).read().split('id=')

    dialogues = []
    for text in dialogues_texts_dirty:
        text = text[text.find('Description'):].strip()

        description = text[len('Description\n'):text.find('\nDialogue')]
        description = cleantext.clean(description,
                                      extra_spaces=True,
                                      lowercase=True)

        text = text[text.find('\nPatient:'):]

        utterances, last_person, valid = [], 'None', True
        for x in re.finditer('Doctor:|Patient:', text):
            if x.group() == last_person:
                valid = False
                break
            else:
                last_person = x.group()

            utterance = text[x.end():].split('Patient:')[0].split('Doctor:')[0]
            utterances.append(
                cleantext.clean(utterance, extra_spaces=True, lowercase=True))

        if valid and utterances:
            dialogues.append({
                'description': description,
                'utterances': utterances
            })

    print('#dialogs:', len(dialogues))

    random.seed(11111)
    random.shuffle(dialogues)

    train_size = int(0.8 * len(dialogues))
    dev_size = int(0.1 * len(dialogues))

    pickle.dump(dialogues[:train_size], open(f'{DATA_DIR}/train.pickle', 'wb'))
    pickle.dump(dialogues[train_size:train_size + dev_size],
                open(f'{DATA_DIR}/dev.pickle', 'wb'))
    pickle.dump(dialogues[train_size + dev_size:],
                open(f'{DATA_DIR}/test.pickle', 'wb'))

    print_fairseq_format()
def test_remove_trail_leading_whitespace():
    text_input = """
Sehr geehrte Damen und Herren,

ich möchte Sie bitten, zu folgendem Fall Stellung zu nehmen. Ich habe einen Fotoautomaten für biometrische Passfotos benutzt, der mein Gesicht nicht erkannt hat. Es besteht die Vermutung, dass dieser Fotoautomat vom BSI zertifiziert ist (Zertifikat BSI-DSZ-CC-0985-2018).

Der Fotoautomat steht in 19061 Berlin.

Marke: Fotofix

Ort des Automats: Bezirksamt / Bürgeramt / Bürgerbüro

Mit freundlichen Grüßen,

Johannes dfdfd
"""
    text_output = """Sehr geehrte Damen und Herren,

ich möchte Sie bitten, zu folgendem Fall Stellung zu nehmen. Ich habe einen Fotoautomaten für biometrische Passfotos benutzt, der mein Gesicht nicht erkannt hat. Es besteht die Vermutung, dass dieser Fotoautomat vom BSI zertifiziert ist (Zertifikat BSI-DSZ-CC-0985-2018).

Der Fotoautomat steht in 19061 Berlin.

Marke: Fotofix

Ort des Automats: Bezirksamt / Bürgeramt / Bürgerbüro

Mit freundlichen Grüßen,

Johannes dfdfd"""
    print(
        cleantext.clean(
            text_input,
            lower=False,
            lang="de",
            no_line_breaks=False,
            keep_two_line_breaks=True,
        )
    )
    assert text_output == cleantext.clean(
        text_input,
        lower=False,
        lang="de",
        no_line_breaks=False,
        keep_two_line_breaks=True,
    )
def split_proximity(text):
    # TODO: add not / - option
    text = text.replace("*", "").replace(":", "").replace("'", '"')
    tokens = smart_split(text)
    for t in tokens:
        t_cl = clean(t, lang="de", lower=False, no_punct=False)
        t_cl_p = clean(t, lang="de", lower=False, no_punct=True)
        if t_cl.lower() == "or":
            continue
        if " " in t or '"' in t_cl:
            # quoted phrase -> tsquery proximity group
            yield "' " + t_cl_p.replace(" ", " <-> ") + " '"
        else:
            # plain token -> prefix match
            yield t_cl_p + ":*"
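# Hedged usage sketch for split_proximity, assuming smart_split behaves like
# django.utils.text.smart_split (a whitespace split that keeps quoted phrases
# together). Quoted phrases become Postgres tsquery proximity groups, plain
# tokens become prefix matches:
#
#   list(split_proximity('Berlin "Tempelhofer Feld"'))
#   # -> ['Berlin:*', "' Tempelhofer <-> Feld '"]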
def convert_text_into_sentences(text):
    assert isinstance(text, str)

    # Normalize the text.
    text = clean(
        text,
        fix_unicode=True,
        to_ascii=True,
        lower=True,
        no_line_breaks=False,
        no_urls=True,
        no_emails=True,
        no_phone_numbers=True,
        no_numbers=False,
        no_digits=False,
        no_currency_symbols=False,
        no_punct=False)

    # Remove tags.
    text = re.sub('<[^<]+?>', '', text)

    # Remove invalid characters.
    text = re.sub('[#%\'\(\)\*\+\-\\\/:;<=>@^_`|~\[\]]+', '', text)

    # Split the text into sentences.
    sentences = sent_tokenize(text)

    return sentences
def pre_process_2(txt):
    # print(type(txt), txt)
    # strip HTML tags and entities
    txt = re.sub(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});', " ", txt)
    txt = clean(pre_process_1(txt),
                fix_unicode=True,           # fix various unicode errors
                to_ascii=True,              # transliterate to closest ASCII representation
                lower=True,                 # lowercase text
                no_line_breaks=True,        # fully strip line breaks as opposed to only normalizing them
                no_urls=True,               # replace all URLs with a special token
                no_emails=True,             # replace all email addresses with a special token
                no_phone_numbers=True,      # replace all phone numbers with a special token
                no_numbers=True,            # replace all numbers with a special token
                no_digits=False,            # replace all digits with a special token
                no_currency_symbols=True,   # replace all currency symbols with a special token
                no_punct=False,             # fully remove punctuation
                replace_with_url="URL",
                replace_with_email="EMAIL",
                replace_with_phone_number="PHONE",
                replace_with_number="NUMBER",
                replace_with_digit="0",
                replace_with_currency_symbol="CURR",
                lang="en")                  # set to 'de' for German special handling
    # txt = [porter_stemmer.stem(i) for i in word_tokenize(str(txt).lower())]
    # drop stopwords from both lists in a single tokenization pass
    tokens = [w for w in word_tokenize(txt.lower())
              if w not in stop_lst and w not in stop_words]
    txt = ' '.join(tokens)
    txt = txt.translate(str.maketrans('', '', string.punctuation))
    txt = re.sub(r'[^\w\s]', '', txt)
    txt = re.sub(r'_+', '', txt)
    txt = re.sub(r"\s+", " ", txt)
    txt = txt.strip()
    return txt
def clean():
    for t in gen(skip=True):
        filename = get_txt_file(t)
        with open(filename, "r") as f:
            old_txt = f.read()
        new_txt = old_txt
        new_txt = cleantext.clean(new_txt,
                                  fix_unicode=True,
                                  to_ascii=True,
                                  lower=False,
                                  no_urls=True)
        new_txt = re.sub(
            r"For more information about District economic conditions,? visit: URL",
            "", new_txt)
        new_txt = new_txt.replace("%-", " percent to ").replace("%", " percent")
        new_txt = new_txt.replace(" & ", " and ")
        # new_txt = new_txt.replace("&", " and ")  # ex. R&D
        # new_txt = re.sub(r" -(?=[\d.])", " minus ", new_txt)
        # new_txt = new_txt.replace("+", " plus ")
        new_txt = new_txt.replace("=", " equals ")  # only one instance
        new_txt = re.sub(r"[<>~*]", "", new_txt)
        new_txt = re.sub(r"\-\-+", " , ", new_txt)
        new_txt = re.sub(r"\?(?=[\w])", "? ", new_txt).replace(" ?", "?")
        new_txt = re.sub(r"\s+,", ",", new_txt).replace(",,", ",")
        new_txt = re.sub(r"\s+\.(?=[^0-9])", " ", new_txt)
        new_txt = new_txt.replace("...", " ")
        new_txt = new_txt.replace("..", ".").replace(",.", ",")  # ".," is legitimate
        new_txt = new_txt.replace("[", "").replace("]", "")  # parentheses are legitimate
        new_txt = re.sub(r"\s+", " ", new_txt)
        new_txt = new_txt.strip()
        with open(filename, "w") as f:
            f.write(new_txt)
def Encrypt(p):
    c = []
    with open(inputfile, mode='r') as f:
        # Read plaintext from file (p.csv); clean expects a single string
        global words
        file_ = f.read()
        words = cleantext.clean(file_, numbers=True, punct=True)
    with open(outputfile, mode='w') as f_:
        fieldnames = ['x', 'y', 'z']
        writer = csv.DictWriter(f_, fieldnames=fieldnames)
        writer.writeheader()
        print(" \n Encryption of Plaintext (" + inputfile +
              ") -> Ciphertext (" + outputfile + ") : \n")
        counter_ = Counter(p)  # character frequencies, constant across the loop
        for index, char in enumerate(words):
            x = ((index * index) + 1)
            y = alpha.index(char.lower()) * x
            z = counter_[char]
            w = (x, y, z)
            c.append(w)
            writer.writerow({'x': x, 'y': y, 'z': z})
    print(c)
def main(log_path, wiki_sup=True):
    supervisor = pickle.load(open('supervisions/supervisor.pickle', 'rb')) \
        if wiki_sup else None

    dataset = MANewsDataset(split='test',
                            supervisor=supervisor,
                            n_wiki_words=N_WIKI_WORDS)
    test_examples = [example for example in dataset]

    bart = BART.load_from_checkpoint(
        init=MODEL_INIT,
        checkpoint_path=f'{log_path}/best_model.ckpt').to('cuda')
    bart.eval()

    src_file = open(f'{log_path}/test.source', 'w')
    gold_file = open(f'{log_path}/test.gold', 'w')
    hypo_file = open(f'{log_path}/test.hypo', 'w')

    for i in trange(0, len(test_examples), BATCH_SIZE, desc='Generating'):
        batch_examples = test_examples[i:i + BATCH_SIZE]

        gen_texts = bart.generate(
            src_texts=[example['src'] for example in batch_examples],
            max_len=MAX_LEN,
            min_len=MIN_LEN,
            beam_size=BEAM_SIZE,
            len_penalty=LEN_PENALTY,
            no_repeat_ngram_size=NO_REPEAT_NGRAM_SIZE)

        for example, gen_text in zip(batch_examples, gen_texts):
            print(example['src'].replace('\n\n', ' ||| '), file=src_file)
            print(example['tgt'], file=gold_file)
            print(cleantext.clean(gen_text, extra_spaces=True, lowercase=True),
                  file=hypo_file)
def post_extract():
    # 1. write temp file to disk
    f = request.files['file']  # uploaded file (via form / REST client)
    temp = tempfile.NamedTemporaryFile(prefix="jargonbuster_", delete=False)
    extractedText = ""
    info = ""
    try:
        f.save(temp)
        # 2. extract text and metadata
        extractor = TikaExtractor(temp)  # PDFMinerExtractor(temp)
        temp.close()
        extractedText = extractor.extractText()
        info = extractor.extractInfo()
        # 3. denoise: do some basic cleaning (e.g. linebreaks)
        extractedText = clean(extractedText, no_line_breaks=True, lang="en")
        # remove remains from word breaks (like "re- miniscence")
        extractedText = re.sub(r'([a-z])\- ([a-z])', r'\1\2', extractedText)
        # remove footnote-number artifacts like ".12 word", keeping the word
        extractedText = re.sub(r'\.\d+\s+([a-z]+)', r'\1', extractedText)
        extractedText = re.sub(r'\[[^]]*\]', r'', extractedText)
        # extractedText = re.sub(r'https?:\/\/.\S+', r'', extractedText)
        # use auto correct
        # spell = Speller(lang="en")
        # extractedText = spell(extractedText)
    finally:
        temp.close()
        os.unlink(temp.name)
    response = jsonify(text=extractedText, info=info)
    return response
def getTokenized(text): firstIndex = text.find("p id=\"speakable-summary\"") + 25 secondIndex = text.find("<footer class", firstIndex) content = text[firstIndex:secondIndex] cleanr = re.compile('<.*?>') content = re.sub(cleanr, '', content) content = content.replace('\n', ' ') content = content.replace('\t', ' ') content = html.unescape(content) content = clean( content, fix_unicode=True, # fix various unicode errors to_ascii=True, # transliterate to closest ASCII representation lower=False, # lowercase text no_line_breaks= False, # fully strip line breaks as opposed to only normalizing them no_urls=False, # replace all URLs with a special token no_emails=False, # replace all email addresses with a special token no_phone_numbers=False, # replace all phone numbers with a special token no_numbers=False, # replace all numbers with a special token no_digits=False, # replace all digits with a special token no_currency_symbols= False, # replace all currency symbols with a special token no_punct=False, # fully remove punctuation ) tokenizer = RegexpTokenizer(r'([%$&\-\+]?\b[^\s]+\b[%$&]?)') content = tokenizer.tokenize(content) return content
def getTokenized(text):
    firstIndex = text.find('body":"') + 7
    secondIndex = 0
    if "Field Level Media" in text:
        secondIndex = text.find("Field Level Media", firstIndex)
    else:
        secondIndex = text.find('","attribution"', firstIndex)
    content = text[firstIndex:secondIndex]

    content = clean(
        content,
        fix_unicode=True,           # fix various unicode errors
        to_ascii=True,              # transliterate to closest ASCII representation
        lower=False,                # lowercase text
        no_line_breaks=False,       # fully strip line breaks as opposed to only normalizing them
        no_urls=False,              # replace all URLs with a special token
        no_emails=False,            # replace all email addresses with a special token
        no_phone_numbers=False,     # replace all phone numbers with a special token
        no_numbers=False,           # replace all numbers with a special token
        no_digits=False,            # replace all digits with a special token
        no_currency_symbols=False,  # replace all currency symbols with a special token
        no_punct=False,             # fully remove punctuation
    )

    cleanr = re.compile('<.*?>')
    content = re.sub(cleanr, '', content)
    content = content.replace('\n', ' ')
    content = content.replace('\t', ' ')

    tokenizer = RegexpTokenizer(r'([%$&\-\+]?\b[^\s]+\b[%$&]?)')
    content = tokenizer.tokenize(content)
    return content
def clean_text(self, text):
    return clean(
        text,
        fix_unicode=True,           # fix various unicode errors
        to_ascii=True,              # transliterate to closest ASCII representation
        lower=True,                 # lowercase text
        no_line_breaks=False,       # fully strip line breaks as opposed to only normalizing them
        no_urls=True,               # replace all URLs with a special token
        no_emails=True,             # replace all email addresses with a special token
        no_phone_numbers=True,      # replace all phone numbers with a special token
        no_numbers=True,            # replace all numbers with a special token
        no_digits=True,             # replace all digits with a special token
        no_currency_symbols=True,   # replace all currency symbols with a special token
        no_punct=True,              # fully remove punctuation
        replace_with_url="<URL>",
        replace_with_email="<EMAIL>",
        replace_with_phone_number="<PHONE>",
        replace_with_number="<NUMBER>",
        replace_with_digit="0",
        replace_with_currency_symbol="<CUR>",
        lang="en"                   # set to 'de' for German special handling
    )
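# Hedged sketch of what the settings above produce on a made-up sample.
# The exact output depends on the installed clean-text version: lowercasing
# runs after the replacements (so "<URL>" may come back as "<url>"), and
# no_punct=True may also affect characters inside the tokens.
#
#   self.clean_text("Mail john@doe.com about https://example.com, price $25")
#   # -> roughly: "mail <email> about <url> price <cur> <number>"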
def get_lexicon(self, file):
    vocab = []
    for line in self.Load.read_file(file):
        # Use clean-text
        line = clean(line,
                     fix_unicode=True,
                     to_ascii=False,
                     lower=True,
                     no_line_breaks=True,
                     no_urls=True,
                     no_emails=True,
                     no_phone_numbers=True,
                     no_numbers=True,
                     no_digits=True,
                     no_currency_symbols=True,
                     no_punct=True,
                     replace_with_punct="",
                     replace_with_url="<URL>",
                     replace_with_email="<EMAIL>",
                     replace_with_phone_number="<PHONE>",
                     replace_with_number="<NUMBER>",
                     replace_with_digit="0",
                     replace_with_currency_symbol="<CUR>")
        vocab += line.split()
    return set(vocab)
def cleaning(text):
    text = text.strip()

    # regular cleaning
    text = clean(text,
                 fix_unicode=True,
                 to_ascii=False,
                 lower=True,
                 no_line_breaks=True,
                 no_urls=True,
                 no_emails=True,
                 no_phone_numbers=True,
                 no_numbers=False,
                 no_digits=False,
                 no_currency_symbols=True,
                 no_punct=False,
                 replace_with_url="",
                 replace_with_email="",
                 replace_with_phone_number="",
                 replace_with_number="",
                 replace_with_digit="0",
                 replace_with_currency_symbol="")

    # normalizing
    normalizer = hazm.Normalizer()
    text = normalizer.normalize(text)

    # removing weird patterns
    weird_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u'\U00010000-\U0010ffff'
                               u"\u200d"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\u3030"
                               u"\ufe0f"
                               u"\u2069"
                               u"\u2066"
                               # u"\u200c"
                               u"\u2068"
                               u"\u2067"
                               "]+",
                               flags=re.UNICODE)
    text = weird_pattern.sub(r'', text)

    # removing extra spaces, hashtags
    text = re.sub("#", "", text)
    text = re.sub("\s+", " ", text)
    text = re.sub("a", "ِ", text)  # maps stray Latin 'a' to the Arabic kasra diacritic

    return text
def load(self, line):
    # Tokenize zho
    if self.language == "zho" and self.zho_split:
        line = [
            x for x in self.tk.cut(line, cut_all=True, HMM=True) if x != ""
        ]
        line = " ".join(line)

    # Use clean-text
    line = clean(line,
                 fix_unicode=True,
                 to_ascii=False,
                 lower=True,
                 no_line_breaks=True,
                 no_urls=True,
                 no_emails=True,
                 no_phone_numbers=True,
                 no_numbers=True,
                 no_digits=True,
                 no_currency_symbols=True,
                 no_punct=True,
                 replace_with_punct="",
                 replace_with_url="<URL>",
                 replace_with_email="<EMAIL>",
                 replace_with_phone_number="<PHONE>",
                 replace_with_number="<NUMBER>",
                 replace_with_digit="0",
                 replace_with_currency_symbol="<CUR>")

    line = self.r.tagRawSentenceHash(rawLine=line)

    # Array of tuples (LEX, POS, CAT)
    return np.array(line)
def trim(line, vecdict, vec_len, threshold=0.6):
    line = clean(line)
    words = importance(line, vecdict, vec_len)
    if not words:
        return ""
    max_val = max(map(lambda x: x[1], words))
    # keep only words whose score is within `threshold` of the top score
    return " ".join(
        map(lambda x: x[0],
            filter(lambda x: x[1] / max_val > threshold, words)))
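# Worked example for trim; the scores are hypothetical. If importance()
# returned [('markets', 0.9), ('today', 0.3), ('rallied', 0.8)] with
# threshold=0.6, words survive only when score / 0.9 > 0.6, so 'markets'
# (1.0) and 'rallied' (0.89) are kept and trim returns 'markets rallied'.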
def get_phone_string(self, text):
    utt = clean(text,
                fix_unicode=True,
                to_ascii=False,
                lower=False,
                lang=self.clean_lang)
    utt = self.expand_abbrevations(utt)
    utt = utt.replace("_SIL_", "~")
    phones = phonemizer.phonemize(utt,
                                  language_switch='remove-flags',
                                  backend="espeak",
                                  language=self.g2p_lang,
                                  preserve_punctuation=True,
                                  strip=True,
                                  punctuation_marks=';:,.!?¡¿—…"«»“”~',
                                  with_stress=self.use_stress).replace(";", ",") \
        .replace(":", ",").replace('"', ",").replace("-", ",").replace("-", ",").replace("\n", " ") \
        .replace("\t", " ").replace("¡", "").replace("¿", "").replace(",", "~")
    phones = re.sub("~+", "~", phones)
    if not self.use_prosody:
        phones = phones.replace("ˌ", "").replace("ː", "").replace(
            "ˑ", "").replace("˘", "").replace("|", "").replace("‖", "")
    if not self.use_word_boundaries:
        phones = phones.replace(" ", "")
    return phones + "#"
def stats():
    q = request.args.get("q")
    if q is None:
        return jsonify({})
    q = cleantext.clean(q, lang="de")

    # for qs like "token" remove the quotes to count
    counting_q = q.replace('"', "").replace("'", "")

    query, page, jurisdiction, max_year, min_year = build_query()
    all_results = (
        query.join(Document)
        .filter(Document.year >= trends_min_year)
        .search(q)
        .with_entities(Document.year, DocumentPage.content)
    )

    d = defaultdict(int)
    for r in all_results:
        year = r.year
        count = r.content.lower().count(counting_q)
        d[year] += count

    # normalize counts by the per-year totals
    for year_tup in get_year_totals():
        d[year_tup[0]] /= year_tup[1]

    # fix NSU
    if q.lower() == "nsu":
        for y in range(trends_min_year, 2009):
            d[y] = 0

    return jsonify([q, d])
def getEachResumeText(current_batch, pdf_reader, bookmarks, bookmarks_len,
                      start_bookmark=0):
    start_page = pdf_reader.getDestinationPageNumber(bookmarks[start_bookmark])
    # at the last bookmark, set end_page to the last page of the document
    if bookmarks_len == start_bookmark:
        end_page = pdf_reader.getNumPages()
    # else set it to the next bookmark
    else:
        end_bookmark = start_bookmark + 1
        end_page = pdf_reader.getDestinationPageNumber(bookmarks[end_bookmark])

    # iterate over a complete resume to extract text
    clean_extracted = ''
    with pdfplumber.open(current_batch) as pdf:
        for page in range(start_page, end_page):
            current_page = pdf.pages[page]
            extracted = current_page.extract_text()
            cleaned = clean(extracted,
                            lower=True,
                            no_line_breaks=True,
                            no_phone_numbers=True,
                            no_emails=True,
                            no_urls=True,
                            no_numbers=True,
                            no_digits=True)
            clean_extracted += cleaned

    # helper function to save what's extracted by pdfplumber in a text file
    saveCleanText(clean_extracted)
    return clean_extracted, start_page
def cleanup_text(data):
    email_headers = re.compile(r'^from:.*(?:\r?\n(?!\r?\n).*)*', re.IGNORECASE)
    # remove email headers
    clean_data = list(map(lambda x: email_headers.sub('', x).strip(), data[:]))
    # remove ending signatures
    clean_data = list(
        map(
            lambda x: re.sub(r"\r?\n\r?\n.*(?:\r?\n(?!\r?\n).*)*$", '', x).
            strip(), clean_data[:]))
    # clean_data = list(map(lambda x: re.sub(r"[a-zA-Z0-9_.\$\-]*@(\w*\.)*\w*", '', x).strip(), clean_data[:]))  # remove emails

    ## using clean-text library
    # =============================================================================
    # # usage:
    # clean("some input",
    #       fix_unicode=True,           # fix various unicode errors
    #       to_ascii=True,              # transliterate to closest ASCII representation
    #       lower=True,                 # lowercase text
    #       no_line_breaks=False,       # fully strip line breaks as opposed to only normalizing them
    #       no_urls=False,              # replace all URLs with a special token
    #       no_emails=False,            # replace all email addresses with a special token
    #       no_phone_numbers=False,     # replace all phone numbers with a special token
    #       no_numbers=False,           # replace all numbers with a special token
    #       no_digits=False,            # replace all digits with a special token
    #       no_currency_symbols=False,  # replace all currency symbols with a special token
    #       no_punct=False,             # remove punctuations
    #       replace_with_punct="",      # instead of removing punctuations you may replace them
    #       replace_with_url="<URL>",
    #       replace_with_email="<EMAIL>",
    #       replace_with_phone_number="<PHONE>",
    #       replace_with_number="<NUMBER>",
    #       replace_with_digit="0",
    #       replace_with_currency_symbol="<CUR>",
    #       lang="en"                   # set to 'de' for German special handling
    # )
    # =============================================================================
    clean_data = list(
        map(
            lambda x: clean(x,
                            no_urls=True,
                            no_emails=True,
                            no_digits=True,
                            no_currency_symbols=True,
                            no_punct=True).strip(), clean_data[:]))
    return clean_data
def clean_text(text):
    if isinstance(text, float):
        return ""
    # drop parenthesized spans
    text = re.sub(r'\([^)]*\)', '', text)
    cleaned_text = clean(
        text,
        fix_unicode=True,           # fix various unicode errors
        to_ascii=True,              # transliterate to closest ASCII representation
        lower=True,                 # lowercase text
        no_line_breaks=True,        # fully strip line breaks as opposed to only normalizing them
        no_urls=True,               # replace all URLs with a special token
        no_emails=True,             # replace all email addresses with a special token
        no_phone_numbers=True,      # replace all phone numbers with a special token
        no_numbers=True,            # replace all numbers with a special token
        no_digits=True,             # replace all digits with a special token
        no_currency_symbols=True,   # replace all currency symbols with a special token
        no_punct=False,             # fully remove punctuation
        replace_with_url='[URL]',
        replace_with_email='[EMAIL]',
        replace_with_phone_number='[PHONE]',
        replace_with_number="[NUMBER]",
        replace_with_digit="0",
        replace_with_currency_symbol="[CUR]",
        lang="en"                   # set to 'de' for German special handling
    )
    return cleaned_text.strip()
def cl(x):
    return clean(x,
                 no_urls=True,
                 no_digits=True,
                 no_punct=True,
                 no_line_breaks=True,
                 lang='de')
def text_cleaner(string2):
    polish_stopwords = pd.read_csv(
        '/home/erazer/PycharmProjects/jigsaw/nlp_utils/polish_stopwords.txt',
        header=None,
        names=['words']).words.to_list()

    from cleantext import clean
    cleaned_string = clean(string2,
                           no_urls=True,
                           no_digits=True,
                           no_line_breaks=True,
                           no_emails=True,
                           no_phone_numbers=True,
                           no_numbers=True,
                           no_currency_symbols=True,
                           no_punct=True,
                           replace_with_digit="",
                           replace_with_url="",
                           replace_with_email="",
                           replace_with_currency_symbol="",
                           replace_with_number="",
                           replace_with_phone_number="")
    filtered_sentence = [
        elem for elem in cleaned_string.split(' ')
        if elem not in polish_stopwords
    ]
    # optional: join back into a single string instead of returning a list
    # of separate words, e.g. ' '.join(filtered_sentence)
    return filtered_sentence
def my_clean(text):
    # emoji ranges are stripped both before and after the clean-text pass
    emojiPattern = re.compile(
        pattern="["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "]+",
        flags=re.UNICODE)
    newText = emojiPattern.sub(r'', text)
    newText = clean(newText,
                    fix_unicode=True,
                    to_ascii=True,
                    lower=True,
                    no_line_breaks=True,
                    lang="en")
    newText = re.sub(r'http\S+', '', newText)
    newText = emojiPattern.sub(r'', newText)  # reuse the compiled pattern
    newText = re.sub(r'[^\w\s]', '', newText)
    return newText
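# Hedged example for my_clean; the sample tweet is made up. to_ascii=True
# already drops or transliterates most emoji, so the second emoji pass is a
# safety net:
#
#   my_clean('Loving Zürich 😀 http://t.co/abc !!')
#   # -> roughly: 'loving zurich' (URL removed, accents folded, punctuation stripped)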
def text_processing(text):
    # strip tags (non-greedy, so multiple tags on one line are handled)
    text = re.sub(r'<.*?>', '', text)
    text = clean(
        text,
        fix_unicode=True,           # fix various unicode errors
        to_ascii=True,              # transliterate to closest ASCII representation
        lower=True,                 # lowercase text
        no_line_breaks=False,       # fully strip line breaks as opposed to only normalizing them
        no_urls=False,              # replace all URLs with a special token
        no_emails=False,            # replace all email addresses with a special token
        no_phone_numbers=False,     # replace all phone numbers with a special token
        no_numbers=False,           # replace all numbers with a special token
        no_digits=False,            # replace all digits with a special token
        no_currency_symbols=False,  # replace all currency symbols with a special token
        no_punct=False,             # fully remove punctuation
        replace_with_url="<URL>",
        replace_with_email="<EMAIL>",
        replace_with_phone_number="<PHONE>",
        replace_with_number="<NUMBER>",
        replace_with_digit="0",
        replace_with_currency_symbol="<CUR>",
    )
    url_pattern = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\), ]'
                   r'|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    url = re.findall(url_pattern, text)
    text = re.sub(url_pattern, '', text)
    text = re.sub(r'www\S*', '', text)
    return text, url
def read_train(train_file):
    lines = []
    y = []
    vectorizer = CountVectorizer(min_df=3)
    tf_idf = TfidfTransformer()
    for parts in utils.read_train(train_file):
        is_blocked = parts[8]
        desc = cleantext.clean(parts[4], False)
        lines.append(desc)
        y.append(int(is_blocked))
    X_counts = vectorizer.fit_transform(lines)  # bag-of-words counts
    X_nb = tf_idf.fit_transform(X_counts)       # tf-idf features
    X_log = binarize(X_counts)                  # binary features
    return X_nb, X_log, numpy.asarray(y)