def preprocessing(text):
    '''Preprocesses a text using standard gensim techniques:
    removes stopwords, strips short words (1-2 characters), strips
    numbers, strips http addresses, strips non-ASCII characters
    (emoji etc.), lowercases everything, strips extra spaces,
    punctuation, non-alphanumeric symbols. Also performs stemming.

    input: text: a string
    returns: the preprocessed string.
    '''
    text = text.lower()
    text = preprocess.remove_stopwords(text)  # remove stop words
    text = preprocess.strip_short(text)       # get rid of short words
    text = preprocess.strip_numeric(text)     # get rid of numbers
    # BUG FIX: the original pattern r'(http.*\s)|(http.*$)' is greedy and
    # deleted everything from the first "http" up to the LAST whitespace
    # in the text. Match only the URL token itself, consistent with
    # raw_text_preprocess elsewhere in this file.
    text = re.sub(r'http\S+', '', text)
    # Drop non-ASCII characters (emoji, accented letters, ...).
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    text = preprocess.strip_multiple_whitespaces(text)
    text = preprocess.strip_punctuation(text)
    text = preprocess.strip_non_alphanum(text)
    # Stopwords and short fragments can reappear once punctuation is
    # stripped (e.g. "it's" -> "it s"), so run these filters again.
    text = preprocess.remove_stopwords(text)
    text = preprocess.strip_short(text)
    # Stem every remaining word.
    stemmed_words = [stemmer.stem(word) for word in text.split()]
    return ' '.join(stemmed_words)
def raw_text_preprocess(d):
    """Clean raw text for downstream processing.

    Drops URLs, keeps only alphanumeric characters, lowercases,
    separates letter/digit runs, removes 1-character words and pure
    numbers, then tokenizes with ViTokenizer (Vietnamese tokenizer).

    input: d: a raw text string
    returns: the cleaned, tokenized string.
    """
    cleaned = re.sub(r"http\S+", "", d)
    cleaned = strip_non_alphanum(cleaned).lower().strip()
    cleaned = split_alphanum(cleaned)
    cleaned = strip_short(cleaned, minsize=2)
    cleaned = strip_numeric(cleaned)
    return ViTokenizer.tokenize(cleaned)
def raw_text_preprocess(raw):
    """Clean raw text: strip URLs and non-alphanumerics, lowercase,
    split letter/digit runs, drop 1-char words and numbers, then
    tokenize with ViTokenizer.

    NOTE(review): this is a duplicate definition — it shadows the
    earlier raw_text_preprocess in this file; consider removing one.

    input: raw: a raw text string
    returns: the cleaned, tokenized string.
    """
    text = re.sub(r"http\S+", "", raw)
    text = strip_non_alphanum(text).lower().strip()
    # Apply the remaining normalization steps in order.
    pipeline = (
        split_alphanum,
        lambda s: strip_short(s, minsize=2),
        strip_numeric,
        ViTokenizer.tokenize,
    )
    for step in pipeline:
        text = step(text)
    return text
def __init__(self):
    """Load the spaCy English model and build the gensim filter chain
    that is applied to text before further processing."""
    self.nlp = spacy.load("en_core_web_sm")
    # Order matters: lowercase first, then strip noise, stopwords and
    # tags, and finally drop words shorter than 4 characters.
    lowercase = lambda x: x.lower()
    drop_short = lambda s: strip_short(s, minsize=4)
    self.filter = [
        lowercase,
        strip_multiple_whitespaces,
        strip_numeric,
        strip_non_alphanum,
        strip_punctuation,
        remove_stopwords,
        strip_tags,
        drop_short,
    ]
def gensim_clean_string(textIn, _strip_tags=True, _split_alphanumeric=True,
                        _strip_nonalphanumeric=True, _strip_muliple_whitespace=True,
                        _strip_short=True, _short_charcount_min=3,
                        _strip_punctuation=False, _convert_to_lower=False):
    """Clean *textIn* with a configurable chain of gensim filters.

    Each boolean flag enables one filter; filters are applied in the
    order: tags, non-alphanumerics, whitespace collapsing, alphanumeric
    splitting, short-word stripping, punctuation stripping, lowercasing.

    input: textIn: the string to clean; flags select which filters run;
           _short_charcount_min: minimum word length kept by strip_short.
    returns: the cleaned string.

    NOTE: the parameter name _strip_muliple_whitespace (sic) is kept
    misspelled for backward compatibility with keyword callers.
    """
    cleaner = textIn
    if _strip_tags:
        cleaner = strip_tags(cleaner)
    if _strip_nonalphanumeric:
        cleaner = strip_non_alphanum(cleaner)
    if _strip_muliple_whitespace:
        cleaner = strip_multiple_whitespaces(cleaner)
    if _split_alphanumeric:
        cleaner = split_alphanum(cleaner)
    if _strip_short:
        cleaner = strip_short(cleaner, minsize=_short_charcount_min)
    # BUG FIX: _strip_punctuation was accepted but never acted on.
    # Default is False, so existing callers are unaffected.
    if _strip_punctuation:
        cleaner = strip_punctuation(cleaner)
    if _convert_to_lower:
        cleaner = cleaner.lower()
    return cleaner
def save_word_dict(text):
    """Split *text* into sentences, normalize each one (stopwords
    removed, stemmed, short words and punctuation stripped, lowercased,
    word-tokenized), and build a gensim Dictionary over the result.

    input: text: a string of one or more sentences
    returns: [dictionary, list of token lists, raw sentence list].
    """
    sentences = tokenize.sent_tokenize(text)
    proc_text = []
    for sent in sentences:
        cleaned = remove_stopwords(sent)
        cleaned = stem_text(cleaned)
        cleaned = strip_short(cleaned)
        cleaned = strip_punctuation(cleaned)
        tokens = word_tokenize(cleaned.lower())
        # Skip sentences that cleaned down to nothing.
        if tokens:
            proc_text.append(tokens)
    dictionary = corpora.Dictionary(proc_text)
    return [dictionary, proc_text, sentences]
def testStripShort(self):
    """strip_short with minsize=3 drops tokens shorter than 3 chars
    ("du" and "59")."""
    result = strip_short("salut les amis du 59", 3)
    self.assertEqual(result, "salut les amis")