def main(log_path):
    trainer = BARTTrainer(init='bart.large')
    trainer.load_model(f'{log_path}/best_model.pt')

    src_texts, tgt_texts = load_data('test')
    # the test texts are fed through the trainer's dev slot so that
    # trainer.evaluate() runs over them
    trainer.load_data(split='dev',
                      src_texts=src_texts,
                      tgt_texts=tgt_texts,
                      src_max_len=SRC_MAX_LEN,
                      tgt_max_len=TGT_MAX_LEN)

    test_nll = trainer.evaluate()
    test_ppl = math.exp(test_nll)
    print(f'Test NLL: {test_nll}; Test PPL: {test_ppl}')

    gen_file = open(f'{log_path}/test.hypo', 'w')
    gold_file = open(f'{log_path}/test.gold', 'w')
    formatted_file = open(f'{log_path}/test.log', 'w')

    for src, tgt in zip(src_texts, tgt_texts):
        gen_text = trainer.generate([src])[0]
        gen_text = cleantext.clean(gen_text, extra_spaces=True)
        tgt = cleantext.clean(tgt, extra_spaces=True)

        print(gen_text, file=gen_file)
        print(tgt, file=gold_file)

        print(f'CHAT_HISTORY:\n{src}', file=formatted_file)
        print(f'\nGROUND TRUTH:\n{tgt}', file=formatted_file)
        print(f'\nGENERATION:\n{gen_text}', file=formatted_file)
        print('=' * 100, '\n\n', file=formatted_file)
def test_remove_trail_leading_whitespace():
    text_input = b"Sehr geehrte Damen und Herren,\\r\\n\\r\\nich m\\xf6chte Sie bitten, zu folgendem Fall Stellung zu nehmen. Ich habe einen Fotoautomaten f\\xfcr biometrische Passfotos benutzt, der mein Gesicht nicht erkannt hat. Es besteht die Vermutung, dass dieser Fotoautomat vom BSI zertifiziert ist (Zertifikat BSI-DSZ-CC-0985-2018).\\r\\n\\r\\nDer Fotoautomat steht in .\\r\\n\\r\\n\\r\\n\\t\\r\\n\\t\\tOrt des Automats: \\r\\n\\t\\r\\n\\r\\n\\r\\n\\r\\n \\r\\n\\t\\r\\n\\t\\tMarke: \\r\\n\\t\\r\\n\\r\\n\\r\\n\\r\\n\\r\\nHier noch Text von Anna Lena.\\r\\n\\r\\nMit freundlichen Gr\\xfc\\xdfen"
    text_input = text_input.decode("unicode_escape")
    text_output = """Sehr geehrte Damen und Herren,

ich möchte Sie bitten, zu folgendem Fall Stellung zu nehmen. Ich habe einen Fotoautomaten für biometrische Passfotos benutzt, der mein Gesicht nicht erkannt hat. Es besteht die Vermutung, dass dieser Fotoautomat vom BSI zertifiziert ist (Zertifikat BSI-DSZ-CC-0985-2018).

Der Fotoautomat steht in .

Ort des Automats:

Marke:

Hier noch Text von Anna Lena.

Mit freundlichen Grüßen"""
    print(
        cleantext.clean(
            text_input,
            lower=False,
            lang="de",
            no_line_breaks=False,
            keep_two_line_breaks=True,
        ))
    assert text_output == cleantext.clean(
        text_input,
        lower=False,
        lang="de",
        no_line_breaks=False,
        keep_two_line_breaks=True,
    )
def test_whitespace():
    assert cleantext.clean(" peter", normalize_whitespace=False) == " peter"
    assert cleantext.clean(" peter", normalize_whitespace=True) == "peter"
    assert (cleantext.clean(" pet\n\ner",
                            normalize_whitespace=True,
                            no_line_breaks=True) == "pet er")
    assert (cleantext.clean(" pet\n\ner",
                            normalize_whitespace=True,
                            no_line_breaks=False) == "pet\ner")
def test_empty_string():
    """clean should raise CleanTextEmptyString when given an empty string."""
    with pytest.raises(CleanTextEmptyString):
        clean('')
def process_page(soup):
    orgs = []
    for org in soup.findAll('div', class_='company-item'):
        title = clean(org.find('a').get_text(), lower=True)
        link = org.find('a')['href']
        head_title = clean(
            org.select_one('.company-item-info>dl>dt').get_text(), lower=True)
        head_name = clean(
            org.select_one('.company-item-info>dl>dd').get_text(), lower=True)
        address = clean(org.find('address').get_text(), lower=True)
        # "ИНН" is the taxpayer ID, "ОГРН" the state registration number
        inn = clean(
            org.find(text="ИНН").parent.parent.findNext('dd').get_text(),
            lower=True)
        ogrn = clean(
            org.find(text="ОГРН").parent.parent.findNext('dd').get_text(),
            lower=True)
        reg_date = clean(org.find(
            text="Дата регистрации").parent.parent.findNext('dd').get_text(),
                         lower=True)
        reg_cap_dirty = org.find(text="Уставный капитал")
        reg_cap = clean(reg_cap_dirty.parent.parent.findNext('dd').get_text(),
                        lower=True) if reg_cap_dirty is not None else None
        status = org.select_one('.warning-text, .attention-text')
        if status is not None:
            status = clean(status.get_text(), lower=True)
        main_activity = clean(
            org.find(text="Основной вид деятельности").parent.parent.findNext(
                'dd').get_text(),
            lower=True)
        main_activity_code = int(
            re.findall(r'\d+',
                       main_activity.replace('.', ''))[0].ljust(6, '0'))
        # status was cleaned with lower=True, so compare against the
        # lowercased literal ("organization liquidated")
        if status == 'организация ликвидирована':
            return (orgs, False)
        orgs.append({
            'title': title,
            'link': link,
            'head_title': head_title,
            'head_name': head_name,
            'inn': inn,
            'ogrn': ogrn,
            'reg_date': reg_date,
            'reg_cap': reg_cap,
            'status': status,
            'main_activity': main_activity,
            'main_activity_code': main_activity_code,
            'address': address
        })
    return (orgs, True)
def clean_dict(d):
    for k, v in d.items():
        if isinstance(v, str):
            d[k] = clean(v, lower=False, no_line_breaks=True)
        elif isinstance(v, list):
            # clean string items, leave non-string items untouched
            d[k] = list(
                map(
                    lambda x: clean(x, lower=False, no_line_breaks=True)
                    if isinstance(x, str) else x, v))
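# A minimal usage sketch for clean_dict above; the sample record is made up.
# clean_dict mutates the dict in place: string values are cleaned, string
# items inside list values are cleaned, and non-string items pass through.
record = {
    'title': 'Hello\nWorld  ',
    'tags': ['foo\nbar', 42],
}
clean_dict(record)
print(record)  # expected: {'title': 'Hello World', 'tags': ['foo bar', 42]}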
def main():
    os.makedirs(DATA_DIR, exist_ok=True)

    if not os.path.exists(
            os.path.join(DATA_DIR, 'COVID-Dialogue-Dataset-English.txt')):
        os.system(f'wget -P {DATA_DIR} https://raw.githubusercontent.com/UCSD'
                  f'-AI4H/COVID-Dialogue/master/COVID-Dialogue-Dataset'
                  f'-English.txt')

    dialogues_texts_dirty = open(
        os.path.join(DATA_DIR,
                     'COVID-Dialogue-Dataset-English.txt')).read().split('id=')

    dialogues = []
    for text in dialogues_texts_dirty:
        text = text[text.find('Description'):].strip()

        description = text[len('Description\n'):text.find('\nDialogue')]
        description = cleantext.clean(description,
                                      extra_spaces=True,
                                      lowercase=True)

        text = text[text.find('\nPatient:'):]

        utterances, last_person, valid = [], 'None', True
        for x in re.finditer('Doctor:|Patient:', text):
            if x.group() == last_person:
                valid = False
                break
            else:
                last_person = x.group()

            utterance = text[x.end():].split('Patient:')[0].split('Doctor:')[0]
            utterances.append(
                cleantext.clean(utterance, extra_spaces=True, lowercase=True))

        if valid and utterances:
            dialogues.append({
                'description': description,
                'utterances': utterances
            })

    print('#dialogs:', len(dialogues))

    random.seed(11111)
    random.shuffle(dialogues)

    train_size = int(0.8 * len(dialogues))
    dev_size = int(0.1 * len(dialogues))

    pickle.dump(dialogues[:train_size], open(f'{DATA_DIR}/train.pickle', 'wb'))
    pickle.dump(dialogues[train_size:train_size + dev_size],
                open(f'{DATA_DIR}/dev.pickle', 'wb'))
    pickle.dump(dialogues[train_size + dev_size:],
                open(f'{DATA_DIR}/test.pickle', 'wb'))

    print_fairseq_format()
def test_remove_trail_leading_whitespace():
    text_input = """
Sehr geehrte Damen und Herren,

ich möchte Sie bitten, zu folgendem Fall Stellung zu nehmen. Ich habe einen Fotoautomaten für biometrische Passfotos benutzt, der mein Gesicht nicht erkannt hat. Es besteht die Vermutung, dass dieser Fotoautomat vom BSI zertifiziert ist (Zertifikat BSI-DSZ-CC-0985-2018).

Der Fotoautomat steht in 19061 Berlin.

Marke: Fotofix

Ort des Automats: Bezirksamt / Bürgeramt / Bürgerbüro

Mit freundlichen Grüßen,

Johannes dfdfd
"""
    text_output = """Sehr geehrte Damen und Herren,

ich möchte Sie bitten, zu folgendem Fall Stellung zu nehmen. Ich habe einen Fotoautomaten für biometrische Passfotos benutzt, der mein Gesicht nicht erkannt hat. Es besteht die Vermutung, dass dieser Fotoautomat vom BSI zertifiziert ist (Zertifikat BSI-DSZ-CC-0985-2018).

Der Fotoautomat steht in 19061 Berlin.

Marke: Fotofix

Ort des Automats: Bezirksamt / Bürgeramt / Bürgerbüro

Mit freundlichen Grüßen,

Johannes dfdfd"""
    print(
        cleantext.clean(
            text_input,
            lower=False,
            lang="de",
            no_line_breaks=False,
            keep_two_line_breaks=True,
        )
    )
    assert text_output == cleantext.clean(
        text_input,
        lower=False,
        lang="de",
        no_line_breaks=False,
        keep_two_line_breaks=True,
    )
def split_proximity(text):
    # TODO: add not / - option
    text = text.replace("*", "").replace(":", "").replace("'", '"')
    tokens = smart_split(text)
    for t in tokens:
        t_cl = clean(t, lang="de", lower=False, no_punct=False)
        t_cl_p = clean(t, lang="de", lower=False, no_punct=True)
        if t_cl.lower() == "or":
            continue
        if " " in t or '"' in t_cl:
            # quoted phrase -> tsquery proximity group
            yield "' " + t_cl_p.replace(" ", " <-> ") + " '"
        else:
            # plain token -> prefix match
            yield t_cl_p + ":*"
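# Hedged usage sketch for split_proximity, assuming smart_split behaves like
# django.utils.text.smart_split (a whitespace split that keeps quoted phrases
# together). Quoted phrases become Postgres tsquery proximity groups, plain
# tokens become prefix matches:
#
#   list(split_proximity('Berlin "Tempelhofer Feld"'))
#   # -> ['Berlin:*', "' Tempelhofer <-> Feld '"]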
def convert_text_into_sentences(text):
    assert isinstance(text, str)

    # Normalize the text.
    text = clean(
        text,
        fix_unicode=True,
        to_ascii=True,
        lower=True,
        no_line_breaks=False,
        no_urls=True,
        no_emails=True,
        no_phone_numbers=True,
        no_numbers=False,
        no_digits=False,
        no_currency_symbols=False,
        no_punct=False)

    # Remove tags.
    text = re.sub('<[^<]+?>', '', text)

    # Remove invalid characters.
    text = re.sub('[#%\'\(\)\*\+\-\\\/:;<=>@^_`|~\[\]]+', '', text)

    # Split the text into sentences.
    sentences = sent_tokenize(text)

    return sentences
def pre_process_2(txt):
    # print(type(txt), txt)
    # strip HTML tags and entities
    txt = re.sub(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});', " ", txt)
    txt = clean(pre_process_1(txt),
                fix_unicode=True,           # fix various unicode errors
                to_ascii=True,              # transliterate to closest ASCII representation
                lower=True,                 # lowercase text
                no_line_breaks=True,        # fully strip line breaks as opposed to only normalizing them
                no_urls=True,               # replace all URLs with a special token
                no_emails=True,             # replace all email addresses with a special token
                no_phone_numbers=True,      # replace all phone numbers with a special token
                no_numbers=True,            # replace all numbers with a special token
                no_digits=False,            # replace all digits with a special token
                no_currency_symbols=True,   # replace all currency symbols with a special token
                no_punct=False,             # fully remove punctuation
                replace_with_url="URL",
                replace_with_email="EMAIL",
                replace_with_phone_number="PHONE",
                replace_with_number="NUMBER",
                replace_with_digit="0",
                replace_with_currency_symbol="CURR",
                lang="en")                  # set to 'de' for German special handling
    # txt = [porter_stemmer.stem(i) for i in word_tokenize(str(txt).lower())]
    # drop stopwords from both lists in a single tokenization pass
    tokens = [w for w in word_tokenize(txt.lower())
              if w not in stop_lst and w not in stop_words]
    txt = ' '.join(tokens)
    txt = txt.translate(str.maketrans('', '', string.punctuation))
    txt = re.sub(r'[^\w\s]', '', txt)
    txt = re.sub(r'_+', '', txt)
    txt = re.sub(r"\s+", " ", txt)
    txt = txt.strip()
    return txt
def clean():
    for t in gen(skip=True):
        filename = get_txt_file(t)
        with open(filename, "r") as f:
            old_txt = f.read()
        new_txt = old_txt
        new_txt = cleantext.clean(new_txt,
                                  fix_unicode=True,
                                  to_ascii=True,
                                  lower=False,
                                  no_urls=True)
        new_txt = re.sub(
            r"For more information about District economic conditions,? visit: URL",
            "", new_txt)
        new_txt = new_txt.replace("%-", " percent to ").replace("%", " percent")
        new_txt = new_txt.replace(" & ", " and ")
        # new_txt = new_txt.replace("&", " and ")  # ex. R&D
        # new_txt = re.sub(r" -(?=[\d.])", " minus ", new_txt)
        # new_txt = new_txt.replace("+", " plus ")
        new_txt = new_txt.replace("=", " equals ")  # only one instance
        new_txt = re.sub(r"[<>~*]", "", new_txt)
        new_txt = re.sub(r"\-\-+", " , ", new_txt)
        new_txt = re.sub(r"\?(?=[\w])", "? ", new_txt).replace(" ?", "?")
        new_txt = re.sub(r"\s+,", ",", new_txt).replace(",,", ",")
        new_txt = re.sub(r"\s+\.(?=[^0-9])", " ", new_txt)
        new_txt = new_txt.replace("...", " ")
        new_txt = new_txt.replace("..", ".").replace(",.", ",")  # ".," is legitimate
        new_txt = new_txt.replace("[", "").replace("]", "")  # parentheses are legitimate
        new_txt = re.sub(r"\s+", " ", new_txt)
        new_txt = new_txt.strip()
        with open(filename, "w") as f:
            f.write(new_txt)
def Encrypt(p):
    c = []
    with open(inputfile, mode='r') as f:
        # Read plaintext from file (p.csv); clean expects a single string
        global words
        file_ = f.read()
        words = cleantext.clean(file_, numbers=True, punct=True)
    with open(outputfile, mode='w') as f_:
        fieldnames = ['x', 'y', 'z']
        writer = csv.DictWriter(f_, fieldnames=fieldnames)
        writer.writeheader()
        print(" \n Encryption of Plaintext (" + inputfile +
              ") -> Ciphertext (" + outputfile + ") : \n")
        counter_ = Counter(p)  # character frequencies, constant across the loop
        for index, char in enumerate(words):
            x = ((index * index) + 1)
            y = alpha.index(char.lower()) * x
            z = counter_[char]
            w = (x, y, z)
            c.append(w)
            writer.writerow({'x': x, 'y': y, 'z': z})
    print(c)
def main(log_path, wiki_sup=True):
    supervisor = pickle.load(open('supervisions/supervisor.pickle', 'rb')) \
        if wiki_sup else None

    dataset = MANewsDataset(split='test',
                            supervisor=supervisor,
                            n_wiki_words=N_WIKI_WORDS)
    test_examples = [example for example in dataset]

    bart = BART.load_from_checkpoint(
        init=MODEL_INIT,
        checkpoint_path=f'{log_path}/best_model.ckpt').to('cuda')
    bart.eval()

    src_file = open(f'{log_path}/test.source', 'w')
    gold_file = open(f'{log_path}/test.gold', 'w')
    hypo_file = open(f'{log_path}/test.hypo', 'w')

    for i in trange(0, len(test_examples), BATCH_SIZE, desc='Generating'):
        batch_examples = test_examples[i:i + BATCH_SIZE]

        gen_texts = bart.generate(
            src_texts=[example['src'] for example in batch_examples],
            max_len=MAX_LEN,
            min_len=MIN_LEN,
            beam_size=BEAM_SIZE,
            len_penalty=LEN_PENALTY,
            no_repeat_ngram_size=NO_REPEAT_NGRAM_SIZE)

        for example, gen_text in zip(batch_examples, gen_texts):
            print(example['src'].replace('\n\n', ' ||| '), file=src_file)
            print(example['tgt'], file=gold_file)
            print(cleantext.clean(gen_text, extra_spaces=True, lowercase=True),
                  file=hypo_file)
def post_extract():
    # 1. write temp file to disk
    f = request.files['file']  # uploaded file (via form / REST client)
    temp = tempfile.NamedTemporaryFile(prefix="jargonbuster_", delete=False)
    extractedText = ""
    info = ""
    try:
        f.save(temp)
        # 2. extract text and metadata
        extractor = TikaExtractor(temp)  # PDFMinerExtractor(temp)
        temp.close()
        extractedText = extractor.extractText()
        info = extractor.extractInfo()
        # 3. denoise: do some basic cleaning (e.g. linebreaks)
        extractedText = clean(extractedText, no_line_breaks=True, lang="en")
        # remove remains from word breaks (like "re- miniscence")
        extractedText = re.sub(r'([a-z])\- ([a-z])', r'\1\2', extractedText)
        # remove footnote-number artifacts like ".12 word", keeping the word
        extractedText = re.sub(r'\.\d+\s+([a-z]+)', r'\1', extractedText)
        extractedText = re.sub(r'\[[^]]*\]', r'', extractedText)
        # extractedText = re.sub(r'https?:\/\/.\S+', r'', extractedText)
        # use auto correct
        # spell = Speller(lang="en")
        # extractedText = spell(extractedText)
    finally:
        temp.close()
        os.unlink(temp.name)
    response = jsonify(text=extractedText, info=info)
    return response
def getTokenized(text): firstIndex = text.find("p id=\"speakable-summary\"") + 25 secondIndex = text.find("<footer class", firstIndex) content = text[firstIndex:secondIndex] cleanr = re.compile('<.*?>') content = re.sub(cleanr, '', content) content = content.replace('\n', ' ') content = content.replace('\t', ' ') content = html.unescape(content) content = clean( content, fix_unicode=True, # fix various unicode errors to_ascii=True, # transliterate to closest ASCII representation lower=False, # lowercase text no_line_breaks= False, # fully strip line breaks as opposed to only normalizing them no_urls=False, # replace all URLs with a special token no_emails=False, # replace all email addresses with a special token no_phone_numbers=False, # replace all phone numbers with a special token no_numbers=False, # replace all numbers with a special token no_digits=False, # replace all digits with a special token no_currency_symbols= False, # replace all currency symbols with a special token no_punct=False, # fully remove punctuation ) tokenizer = RegexpTokenizer(r'([%$&\-\+]?\b[^\s]+\b[%$&]?)') content = tokenizer.tokenize(content) return content
def getTokenized(text):
    firstIndex = text.find('body":"') + 7
    secondIndex = 0
    if "Field Level Media" in text:
        secondIndex = text.find("Field Level Media", firstIndex)
    else:
        secondIndex = text.find('","attribution"', firstIndex)
    content = text[firstIndex:secondIndex]

    content = clean(
        content,
        fix_unicode=True,           # fix various unicode errors
        to_ascii=True,              # transliterate to closest ASCII representation
        lower=False,                # lowercase text
        no_line_breaks=False,       # fully strip line breaks as opposed to only normalizing them
        no_urls=False,              # replace all URLs with a special token
        no_emails=False,            # replace all email addresses with a special token
        no_phone_numbers=False,     # replace all phone numbers with a special token
        no_numbers=False,           # replace all numbers with a special token
        no_digits=False,            # replace all digits with a special token
        no_currency_symbols=False,  # replace all currency symbols with a special token
        no_punct=False,             # fully remove punctuation
    )

    cleanr = re.compile('<.*?>')
    content = re.sub(cleanr, '', content)
    content = content.replace('\n', ' ')
    content = content.replace('\t', ' ')

    tokenizer = RegexpTokenizer(r'([%$&\-\+]?\b[^\s]+\b[%$&]?)')
    content = tokenizer.tokenize(content)
    return content
def clean_text(self, text):
    return clean(
        text,
        fix_unicode=True,           # fix various unicode errors
        to_ascii=True,              # transliterate to closest ASCII representation
        lower=True,                 # lowercase text
        no_line_breaks=False,       # fully strip line breaks as opposed to only normalizing them
        no_urls=True,               # replace all URLs with a special token
        no_emails=True,             # replace all email addresses with a special token
        no_phone_numbers=True,      # replace all phone numbers with a special token
        no_numbers=True,            # replace all numbers with a special token
        no_digits=True,             # replace all digits with a special token
        no_currency_symbols=True,   # replace all currency symbols with a special token
        no_punct=True,              # fully remove punctuation
        replace_with_url="<URL>",
        replace_with_email="<EMAIL>",
        replace_with_phone_number="<PHONE>",
        replace_with_number="<NUMBER>",
        replace_with_digit="0",
        replace_with_currency_symbol="<CUR>",
        lang="en"                   # set to 'de' for German special handling
    )
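# Hedged sketch of what the settings above produce on a made-up sample.
# The exact output depends on the installed clean-text version: lowercasing
# runs after the replacements (so "<URL>" may come back as "<url>"), and
# no_punct=True may also affect characters inside the tokens.
#
#   self.clean_text("Mail john@doe.com about https://example.com, price $25")
#   # -> roughly: "mail <email> about <url> price <cur> <number>"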
def get_lexicon(self, file):
    vocab = []
    for line in self.Load.read_file(file):
        # Use clean-text
        line = clean(line,
                     fix_unicode=True,
                     to_ascii=False,
                     lower=True,
                     no_line_breaks=True,
                     no_urls=True,
                     no_emails=True,
                     no_phone_numbers=True,
                     no_numbers=True,
                     no_digits=True,
                     no_currency_symbols=True,
                     no_punct=True,
                     replace_with_punct="",
                     replace_with_url="<URL>",
                     replace_with_email="<EMAIL>",
                     replace_with_phone_number="<PHONE>",
                     replace_with_number="<NUMBER>",
                     replace_with_digit="0",
                     replace_with_currency_symbol="<CUR>")
        vocab += line.split()
    return set(vocab)
def cleaning(text):
    text = text.strip()

    # regular cleaning
    text = clean(text,
                 fix_unicode=True,
                 to_ascii=False,
                 lower=True,
                 no_line_breaks=True,
                 no_urls=True,
                 no_emails=True,
                 no_phone_numbers=True,
                 no_numbers=False,
                 no_digits=False,
                 no_currency_symbols=True,
                 no_punct=False,
                 replace_with_url="",
                 replace_with_email="",
                 replace_with_phone_number="",
                 replace_with_number="",
                 replace_with_digit="0",
                 replace_with_currency_symbol="")

    # normalizing
    normalizer = hazm.Normalizer()
    text = normalizer.normalize(text)

    # removing weird patterns
    weird_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u'\U00010000-\U0010ffff'
                               u"\u200d"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\u3030"
                               u"\ufe0f"
                               u"\u2069"
                               u"\u2066"
                               # u"\u200c"
                               u"\u2068"
                               u"\u2067"
                               "]+",
                               flags=re.UNICODE)
    text = weird_pattern.sub(r'', text)

    # removing extra spaces, hashtags
    text = re.sub("#", "", text)
    text = re.sub("\s+", " ", text)
    text = re.sub("a", "ِ", text)  # maps stray Latin 'a' to the Arabic kasra diacritic

    return text
def load(self, line):
    # Tokenize zho
    if self.language == "zho" and self.zho_split:
        line = [
            x for x in self.tk.cut(line, cut_all=True, HMM=True) if x != ""
        ]
        line = " ".join(line)

    # Use clean-text
    line = clean(line,
                 fix_unicode=True,
                 to_ascii=False,
                 lower=True,
                 no_line_breaks=True,
                 no_urls=True,
                 no_emails=True,
                 no_phone_numbers=True,
                 no_numbers=True,
                 no_digits=True,
                 no_currency_symbols=True,
                 no_punct=True,
                 replace_with_punct="",
                 replace_with_url="<URL>",
                 replace_with_email="<EMAIL>",
                 replace_with_phone_number="<PHONE>",
                 replace_with_number="<NUMBER>",
                 replace_with_digit="0",
                 replace_with_currency_symbol="<CUR>")

    line = self.r.tagRawSentenceHash(rawLine=line)

    # Array of tuples (LEX, POS, CAT)
    return np.array(line)
def trim(line, vecdict, vec_len, threshold=0.6):
    line = clean(line)
    words = importance(line, vecdict, vec_len)
    if not words:
        return ""
    max_val = max(map(lambda x: x[1], words))
    # keep only words whose score is within `threshold` of the top score
    return " ".join(
        map(lambda x: x[0],
            filter(lambda x: x[1] / max_val > threshold, words)))
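# Worked example for trim; the scores are hypothetical. If importance()
# returned [('markets', 0.9), ('today', 0.3), ('rallied', 0.8)] with
# threshold=0.6, words survive only when score / 0.9 > 0.6, so 'markets'
# (1.0) and 'rallied' (0.89) are kept and trim returns 'markets rallied'.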
def get_phone_string(self, text):
    utt = clean(text,
                fix_unicode=True,
                to_ascii=False,
                lower=False,
                lang=self.clean_lang)
    utt = self.expand_abbrevations(utt)
    utt = utt.replace("_SIL_", "~")
    phones = phonemizer.phonemize(utt,
                                  language_switch='remove-flags',
                                  backend="espeak",
                                  language=self.g2p_lang,
                                  preserve_punctuation=True,
                                  strip=True,
                                  punctuation_marks=';:,.!?¡¿—…"«»“”~',
                                  with_stress=self.use_stress).replace(";", ",") \
        .replace(":", ",").replace('"', ",").replace("-", ",").replace("-", ",").replace("\n", " ") \
        .replace("\t", " ").replace("¡", "").replace("¿", "").replace(",", "~")
    phones = re.sub("~+", "~", phones)
    if not self.use_prosody:
        phones = phones.replace("ˌ", "").replace("ː", "").replace(
            "ˑ", "").replace("˘", "").replace("|", "").replace("‖", "")
    if not self.use_word_boundaries:
        phones = phones.replace(" ", "")
    return phones + "#"
def stats():
    q = request.args.get("q")
    if q is None:
        return jsonify({})
    q = cleantext.clean(q, lang="de")

    # for qs like "token" remove the quotes to count
    counting_q = q.replace('"', "").replace("'", "")

    query, page, jurisdiction, max_year, min_year = build_query()
    all_results = (
        query.join(Document)
        .filter(Document.year >= trends_min_year)
        .search(q)
        .with_entities(Document.year, DocumentPage.content)
    )

    d = defaultdict(int)
    for r in all_results:
        year = r.year
        count = r.content.lower().count(counting_q)
        d[year] += count

    # normalize counts by the per-year totals
    for year_tup in get_year_totals():
        d[year_tup[0]] /= year_tup[1]

    # fix NSU
    if q.lower() == "nsu":
        for y in range(trends_min_year, 2009):
            d[y] = 0

    return jsonify([q, d])
def getEachResumeText(current_batch, pdf_reader, bookmarks, bookmarks_len,
                      start_bookmark=0):
    start_page = pdf_reader.getDestinationPageNumber(bookmarks[start_bookmark])
    # at the last bookmark, set end_page to the last page of the document
    if bookmarks_len == start_bookmark:
        end_page = pdf_reader.getNumPages()
    # else set it to the next bookmark
    else:
        end_bookmark = start_bookmark + 1
        end_page = pdf_reader.getDestinationPageNumber(bookmarks[end_bookmark])

    # iterate over a complete resume to extract text
    clean_extracted = ''
    with pdfplumber.open(current_batch) as pdf:
        for page in range(start_page, end_page):
            current_page = pdf.pages[page]
            extracted = current_page.extract_text()
            cleaned = clean(extracted,
                            lower=True,
                            no_line_breaks=True,
                            no_phone_numbers=True,
                            no_emails=True,
                            no_urls=True,
                            no_numbers=True,
                            no_digits=True)
            clean_extracted += cleaned

    # helper function to save what's extracted by pdfplumber in a text file
    saveCleanText(clean_extracted)
    return clean_extracted, start_page
def cleanup_text(data):
    email_headers = re.compile(r'^from:.*(?:\r?\n(?!\r?\n).*)*', re.IGNORECASE)
    # remove email headers
    clean_data = list(map(lambda x: email_headers.sub('', x).strip(), data[:]))
    # remove ending signatures
    clean_data = list(
        map(
            lambda x: re.sub(r"\r?\n\r?\n.*(?:\r?\n(?!\r?\n).*)*$", '', x).
            strip(), clean_data[:]))
    # clean_data = list(map(lambda x: re.sub(r"[a-zA-Z0-9_.\$\-]*@(\w*\.)*\w*", '', x).strip(), clean_data[:]))  # remove emails

    ## using clean-text library
    # =============================================================================
    # # usage:
    # clean("some input",
    #       fix_unicode=True,           # fix various unicode errors
    #       to_ascii=True,              # transliterate to closest ASCII representation
    #       lower=True,                 # lowercase text
    #       no_line_breaks=False,       # fully strip line breaks as opposed to only normalizing them
    #       no_urls=False,              # replace all URLs with a special token
    #       no_emails=False,            # replace all email addresses with a special token
    #       no_phone_numbers=False,     # replace all phone numbers with a special token
    #       no_numbers=False,           # replace all numbers with a special token
    #       no_digits=False,            # replace all digits with a special token
    #       no_currency_symbols=False,  # replace all currency symbols with a special token
    #       no_punct=False,             # remove punctuations
    #       replace_with_punct="",      # instead of removing punctuations you may replace them
    #       replace_with_url="<URL>",
    #       replace_with_email="<EMAIL>",
    #       replace_with_phone_number="<PHONE>",
    #       replace_with_number="<NUMBER>",
    #       replace_with_digit="0",
    #       replace_with_currency_symbol="<CUR>",
    #       lang="en"                   # set to 'de' for German special handling
    # )
    # =============================================================================
    clean_data = list(
        map(
            lambda x: clean(x,
                            no_urls=True,
                            no_emails=True,
                            no_digits=True,
                            no_currency_symbols=True,
                            no_punct=True).strip(), clean_data[:]))
    return clean_data
def clean_text(text):
    if isinstance(text, float):
        return ""
    # drop parenthesized spans
    text = re.sub(r'\([^)]*\)', '', text)
    cleaned_text = clean(
        text,
        fix_unicode=True,           # fix various unicode errors
        to_ascii=True,              # transliterate to closest ASCII representation
        lower=True,                 # lowercase text
        no_line_breaks=True,        # fully strip line breaks as opposed to only normalizing them
        no_urls=True,               # replace all URLs with a special token
        no_emails=True,             # replace all email addresses with a special token
        no_phone_numbers=True,      # replace all phone numbers with a special token
        no_numbers=True,            # replace all numbers with a special token
        no_digits=True,             # replace all digits with a special token
        no_currency_symbols=True,   # replace all currency symbols with a special token
        no_punct=False,             # fully remove punctuation
        replace_with_url='[URL]',
        replace_with_email='[EMAIL]',
        replace_with_phone_number='[PHONE]',
        replace_with_number="[NUMBER]",
        replace_with_digit="0",
        replace_with_currency_symbol="[CUR]",
        lang="en"                   # set to 'de' for German special handling
    )
    return cleaned_text.strip()
def cl(x):
    return clean(x,
                 no_urls=True,
                 no_digits=True,
                 no_punct=True,
                 no_line_breaks=True,
                 lang='de')
def text_cleaner(string2):
    polish_stopwords = pd.read_csv(
        '/home/erazer/PycharmProjects/jigsaw/nlp_utils/polish_stopwords.txt',
        header=None,
        names=['words']).words.to_list()

    from cleantext import clean
    cleaned_string = clean(string2,
                           no_urls=True,
                           no_digits=True,
                           no_line_breaks=True,
                           no_emails=True,
                           no_phone_numbers=True,
                           no_numbers=True,
                           no_currency_symbols=True,
                           no_punct=True,
                           replace_with_digit="",
                           replace_with_url="",
                           replace_with_email="",
                           replace_with_currency_symbol="",
                           replace_with_number="",
                           replace_with_phone_number="")
    filtered_sentence = [
        elem for elem in cleaned_string.split(' ')
        if elem not in polish_stopwords
    ]
    # optional: join back into a single string instead of returning a list
    # of separate words, e.g. ' '.join(filtered_sentence)
    return filtered_sentence
def my_clean(text):
    # emoji ranges are stripped both before and after the clean-text pass
    emojiPattern = re.compile(
        pattern="["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "]+",
        flags=re.UNICODE)
    newText = emojiPattern.sub(r'', text)
    newText = clean(newText,
                    fix_unicode=True,
                    to_ascii=True,
                    lower=True,
                    no_line_breaks=True,
                    lang="en")
    newText = re.sub(r'http\S+', '', newText)
    newText = emojiPattern.sub(r'', newText)  # reuse the compiled pattern
    newText = re.sub(r'[^\w\s]', '', newText)
    return newText
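# Hedged example for my_clean; the sample tweet is made up. to_ascii=True
# already drops or transliterates most emoji, so the second emoji pass is a
# safety net:
#
#   my_clean('Loving Zürich 😀 http://t.co/abc !!')
#   # -> roughly: 'loving zurich' (URL removed, accents folded, punctuation stripped)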
def text_processing(text):
    # strip tags (non-greedy, so multiple tags on one line are handled)
    text = re.sub(r'<.*?>', '', text)
    text = clean(
        text,
        fix_unicode=True,           # fix various unicode errors
        to_ascii=True,              # transliterate to closest ASCII representation
        lower=True,                 # lowercase text
        no_line_breaks=False,       # fully strip line breaks as opposed to only normalizing them
        no_urls=False,              # replace all URLs with a special token
        no_emails=False,            # replace all email addresses with a special token
        no_phone_numbers=False,     # replace all phone numbers with a special token
        no_numbers=False,           # replace all numbers with a special token
        no_digits=False,            # replace all digits with a special token
        no_currency_symbols=False,  # replace all currency symbols with a special token
        no_punct=False,             # fully remove punctuation
        replace_with_url="<URL>",
        replace_with_email="<EMAIL>",
        replace_with_phone_number="<PHONE>",
        replace_with_number="<NUMBER>",
        replace_with_digit="0",
        replace_with_currency_symbol="<CUR>",
    )
    url_pattern = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\), ]'
                   r'|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    url = re.findall(url_pattern, text)
    text = re.sub(url_pattern, '', text)
    text = re.sub(r'www\S*', '', text)
    return text, url
def read_train(train_file):
    lines = []
    y = []
    vectorizer = CountVectorizer(min_df=3)
    tf_idf = TfidfTransformer()
    for parts in utils.read_train(train_file):
        is_blocked = parts[8]
        desc = cleantext.clean(parts[4], False)
        lines.append(desc)
        y.append(int(is_blocked))
    X_counts = vectorizer.fit_transform(lines)  # bag-of-words counts
    X_nb = tf_idf.fit_transform(X_counts)       # tf-idf features
    X_log = binarize(X_counts)                  # binary features
    return X_nb, X_log, numpy.asarray(y)