def get_text(soup: bs4.BeautifulSoup) -> str:
    """Get the raw text."""
    text = soup.getText()
    # translate newlines back from BigQuery
    text = re.sub(r'\n\n+', '\n', text)
    # translate double quotes back from BigQuery
    text = re.sub(r'xxxdblqte', ' " ', text)
    return normalize_whitespace(text)
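# A minimal usage sketch for get_text above, not from the original source: the sample
# markup and expected output are illustrative, and normalize_whitespace is assumed to
# come from textacy.preprocess, as in the other snippets in this listing.
import bs4

soup = bs4.BeautifulSoup(
    "<p>line one\n\n\nline two said xxxdblqte hi xxxdblqte</p>", "html.parser"
)
print(get_text(soup))  # roughly: 'line one\nline two said " hi "'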
def _clean_content(self, content):
    # strip out link markup, e.g. [foo](http://foo.com)
    content = REDDIT_LINK_RE.sub(r'\1', content)
    # clean up basic HTML cruft: unescape entities left in reddit comment bodies
    content = content.replace('&gt;', '>').replace('&lt;', '<')
    # strip out text markup, e.g. * for bold text
    content = content.replace('`', '').replace('*', '').replace('~', '')
    # normalize whitespace
    return preprocess.normalize_whitespace(content)
def _clean_content(self, content):
    # strip out link markup, e.g. [foo](http://foo.com)
    content = REDDIT_LINK_RE.sub(r'\1', content)
    # clean up basic HTML cruft: unescape entities left in reddit comment bodies
    content = content.replace('&gt;', '>').replace('&lt;', '<')
    # strip out text markup, e.g. * for bold text
    content = content.replace('`', '').replace('*', '').replace('~', '')
    # normalize whitespace
    return normalize_whitespace(content)
def clean_sentence(sentences):
    c = sentences.replace('-', ' ')  # people use '-' to concatenate words
    c = normalize_whitespace(c)
    c = preprocess_text(c, lowercase=True, no_numbers=True, no_punct=True,
                        no_contractions=True)
    return c
def _clean_content(self, content):
    # strip out link markup, e.g. [foo](http://foo.com)
    content = REDDIT_LINK_RE.sub(r"\1", content)
    # clean up basic HTML cruft: unescape entities left in reddit comment bodies
    content = content.replace("&gt;", ">").replace("&lt;", "<")
    # strip out text markup, e.g. * for bold text
    content = content.replace("`", "").replace("*", "").replace("~", "")
    # normalize whitespace
    return normalize_whitespace(content)
def clean_tweet(self, text):
    # FIX BAD UNICODE
    text = preprocess.fix_bad_unicode(text)
    # GET TEXT ONLY FROM HTML
    text = BeautifulSoup(text, features='lxml').getText()
    # UNPACK CONTRACTIONS
    text = preprocess.unpack_contractions(text)
    # REPLACE URLS
    text = preprocess.replace_urls(text)
    # REPLACE EMAILS
    text = preprocess.replace_emails(text)
    # REPLACE PHONE NUMBERS
    text = preprocess.replace_phone_numbers(text)
    # REPLACE NUMBERS
    text = preprocess.replace_numbers(text)
    # REPLACE CURRENCY SYMBOLS
    text = preprocess.replace_currency_symbols(text)
    # REMOVE ACCENTS
    text = preprocess.remove_accents(text)
    # CONVERT EMOTICONS AND EMOJIS TO TEXT
    words = text.split()
    reformed = [self.SMILEY[word] if word in self.SMILEY else word for word in words]
    text = " ".join(reformed)
    text = emoji.demojize(text)
    text = text.replace(":", " ")
    text = ' '.join(text.split())
    # SPLIT ATTACHED WORDS (CamelCase)
    text = ' '.join(re.findall('[A-Z][^A-Z]*', text))
    # SPLIT UNDERSCORE WORDS
    text = text.replace('_', ' ')
    # REMOVE PUNCTUATION
    text = preprocess.remove_punct(text)
    # REMOVE DIGITS
    text = re.sub(r'\d', '', text)
    # REMOVE WORDS SHORTER THAN 3 CHARACTERS
    text = re.sub(r'\b\w{1,2}\b', '', text)
    # NORMALIZE WHITESPACE
    text = preprocess.normalize_whitespace(text)
    return text
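# A hypothetical driver for clean_tweet above. The TweetCleaner class name and SMILEY
# mapping are illustrative, not from the original source; assumes textacy<0.8 (where
# textacy.preprocess still exists) plus the emoji, beautifulsoup4, and lxml packages
# that the method itself relies on.
import re

import emoji
from bs4 import BeautifulSoup
from textacy import preprocess


class TweetCleaner:
    # minimal emoticon-to-word mapping; extend as needed
    SMILEY = {":)": "smile", ":(": "sad", ":D": "laugh"}

    clean_tweet = clean_tweet  # attach the function defined above as a method


print(TweetCleaner().clean_tweet("Loved it :) see https://example.com CoolStuff!!"))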
def fulltext_extractor(self, d, clean_text=True):
    if 'fullText' in d:
        fulltext = d['fullText']
        if clean_text:
            fulltext = preprocess.normalize_whitespace(fulltext)
            fulltext = preprocess_text_by_config(fulltext, self.textacy_defs)
        return fulltext
    else:
        return d
def preprocess(text, fix_unicode=True, normalize_white_space=False, lowercase=False,
               transliterate=False, no_urls=False, no_emails=False,
               no_phone_numbers=False, no_numbers=False, no_currency_symbols=False,
               no_punct=False, no_contractions=False, no_accents=False):
    if normalize_white_space:
        text = pp.normalize_whitespace(text)
    text = pp.preprocess_text(text, fix_unicode, lowercase, transliterate, no_urls,
                              no_emails, no_phone_numbers, no_numbers,
                              no_currency_symbols, no_punct, no_contractions,
                              no_accents)
    return text
def preprocess_sentence(sent):
    # TODO: check language?
    s = preprocess.normalize_whitespace(sent)
    return preprocess.preprocess_text(s, lowercase=True, transliterate=True,
                                      no_urls=True, no_phone_numbers=True,
                                      no_numbers=True, no_currency_symbols=True,
                                      no_contractions=True, no_accents=True)
def tokenizer(sentences):
    y = []
    if isinstance(sentences, str):
        sentences = [sentences]
    for comment in sentences:
        comment = my_preprocess(comment)
        txt = preprocess.normalize_whitespace(comment)
        txt = preprocess.preprocess_text(txt, fix_unicode=True, lowercase=True,
                                         transliterate=True, no_urls=True,
                                         no_emails=True, no_phone_numbers=True,
                                         no_numbers=True, no_currency_symbols=True,
                                         no_punct=True, no_contractions=True,
                                         no_accents=True)
        y.append(txt)
    return y
def question_mark_pos(line):
    """Get the list of positions of question marks in line."""
    text = normalize_whitespace(line)
    # replace runs of question marks with just one
    text = re.sub(r"\?{1,}\s{0,}\?{1,}", r"?", text)
    # tokenize into words and question marks
    r = re.compile(r"(?u)\b\w+\b|\?")
    word_qmark = r.findall(text)
    pos_qmark = get_position_list("?", word_qmark)
    count_qmark = len(pos_qmark)
    if pos_qmark[0] == 0:
        count_word = len(word_qmark)
        pos_qmark = [len(word_qmark)]
    else:
        count_word = len(word_qmark) - count_qmark
        pos_qmark = [i - j for j, i in enumerate(pos_qmark, start=1)]
        pos_qmark = pos_qmark + [count_word]
    return pos_qmark
def test_normalize_whitespace():
    text = "Hello, world! Hello...\t \tworld?\n\nHello:\r\n\n\nWorld. "
    proc_text = "Hello, world! Hello... world?\nHello:\nWorld."
    assert preprocess.normalize_whitespace(text) == proc_text
def clean_text(text):
    text = re.sub(r'[^a-zA-Z0-9ßöäüÖÄÜ_.:,;?!()&@/€\- ]', "", text)
    text = pp.normalize_whitespace(text)
    return text
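# Illustrative call to clean_text above (the example string and expected output are not
# from the original source; pp is assumed to be textacy.preprocess, as elsewhere in this
# listing). Characters outside the whitelist, such as the en dash, are dropped.
print(clean_text("Müller zahlt 12,50 € – sofort!"))  # roughly: 'Müller zahlt 12,50 € sofort!'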
def test_normalize_whitespace(self):
    text = "Hello, world! Hello...\t \tworld?\n\nHello:\r\n\n\nWorld. "
    proc_text = "Hello, world! Hello... world?\nHello:\nWorld."
    self.assertEqual(preprocess.normalize_whitespace(text), proc_text)
def label(corpus, output, kg_url, test_size, numdocs=None):
    """Generate fastText-compatible text files.

    Single-label version.
    """
    docs = read_corpus(corpus)
    if numdocs:
        docs = docs[:numdocs]
    kg = get_lemmatized_kg(kg_url)
    X, y = prepare_corpus(docs, kg)

    # group documents by their top label
    by_labels = defaultdict(list)
    for doc, tls in zip(X, y):
        label, count = tls[0]
        by_labels[label].append((count, doc))

    # balance the classes: keep at most max_docs documents per label
    counts = [len(v) for v in by_labels.values()]
    max_docs = min(counts)

    X, y = [], []
    for label, counteddocs in by_labels.items():
        docs = sorted(counteddocs, key=lambda d: d[0], reverse=True)
        docs = [d[1] for d in docs]
        docs = docs[:max_docs]
        X.extend(docs)
        y.extend([label] * max_docs)

    X = [
        normalize_whitespace((' </s> '.join(sents)).replace('dignr', ''))
        for sents in X
    ]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size,
                                                        random_state=0)

    train_path = output + '-train'
    test_path = output + '-test'
    with open(train_path, 'w') as f:
        for label, text in zip(y_train, X_train):
            ls = '__label__' + label.replace(' ', '_').lower()
            line = "{} {}".format(ls, text)
            f.write(line)
            f.write('\n')
    with open(test_path, 'w') as f:
        for label, text in zip(y_test, X_test):
            ls = '__label__' + label.replace(' ', '_').lower()
            line = "{} {}".format(ls, text)
            f.write(line)
            f.write('\n')

    logger.info("Wrote train file: %s", train_path)
    logger.info("Wrote test file: %s", test_path)