import re

from gensim.parsing import preprocessing as preprocess
# The original snippet leaves `stemmer` undefined; gensim's own Porter stemmer is one plausible choice.
from gensim.parsing.porter import PorterStemmer

stemmer = PorterStemmer()

def preprocessing(text):
    '''Preprocesses a text using standard gensim techniques:
    removes stopwords, strips short words (1-2 characters), strips numbers,
    strips http addresses, strips Unicode from emoji etc., lowercases everything,
    strips extra spaces, punctuation, and non-alphanumeric symbols. Also performs stemming.

    input:
        text: a string
    returns:
        the preprocessed string.
    '''
    text = text.lower()
    text = preprocess.remove_stopwords(text)  # remove stop words
    text = preprocess.strip_short(text)  # get rid of short words (fewer than 3 characters)
    text = preprocess.strip_numeric(text)  # get rid of numbers
    p = re.compile(r'http\S+')  # the original greedy pattern (http.*\s) could swallow everything up to the last space
    text = p.sub('', text)  # strip http(s) addresses
    p = re.compile(r'[^\x00-\x7F]+')
    text = p.sub('', text)  # strip non-ASCII characters (emoji etc.)
    text = preprocess.strip_multiple_whitespaces(text)
    text = preprocess.strip_punctuation(text)
    text = preprocess.strip_non_alphanum(text)
    text = preprocess.remove_stopwords(text)  # second pass, after punctuation removal exposes new tokens
    text = preprocess.strip_short(text)
    # stemming: reduce each word to its stem
    words = text.split()
    stemmed_words = [stemmer.stem(word) for word in words]
    text = ' '.join(stemmed_words)

    return text
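
A quick usage sketch of the pipeline above (the exact stems depend on gensim's Porter implementation, so the output is indicative):

sample = "Check out https://example.com 2 GREAT tips for preprocessing!!"
print(preprocessing(sample))
# -> roughly: "check great tip preprocess"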
Example #3
import re

from gensim.parsing.preprocessing import (strip_non_alphanum, split_alphanum,
                                          strip_short, strip_numeric)
from pyvi import ViTokenizer  # Vietnamese word tokenizer

def raw_text_preprocess(raw):
    raw = re.sub(r"http\S+", "", raw)  # strip URLs
    raw = strip_non_alphanum(raw).lower().strip()
    raw = split_alphanum(raw)  # insert spaces between letter/digit runs, e.g. "abc123" -> "abc 123"
    raw = strip_short(raw, minsize=2)  # drop single-character tokens
    raw = strip_numeric(raw)  # drop digits
    raw = ViTokenizer.tokenize(raw)  # joins multi-syllable Vietnamese words with underscores
    return raw
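
A hedged call sketch: pyvi's tokenizer joins compound Vietnamese words with underscores, so the output looks roughly like this:

print(raw_text_preprocess("Xem tại http://example.vn: trường đại học bách khoa 2021"))
# -> roughly: "xem tại trường đại_học bách_khoa"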
Example #4
import spacy
from gensim.parsing.preprocessing import (strip_multiple_whitespaces, strip_numeric,
                                          strip_non_alphanum, strip_punctuation,
                                          remove_stopwords, strip_tags, strip_short)

# Method of a preprocessing class; the enclosing class definition is omitted in the source.
def __init__(self):
    self.nlp = spacy.load("en_core_web_sm")
    self.filter = [
        lambda x: x.lower(),
        strip_multiple_whitespaces,
        strip_numeric,
        strip_non_alphanum,
        strip_punctuation,
        remove_stopwords,
        strip_tags,
        lambda s: strip_short(s, minsize=4),
    ]
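
Each entry in self.filter maps a string to a string, so the whole chain can be run with gensim's preprocess_string, which applies the filters in order and then splits the result into tokens. A minimal sketch, assuming `pp` is an instance of the class above:

from gensim.parsing.preprocessing import preprocess_string

tokens = preprocess_string("<b>Thirteen   DANCING robots!</b>", filters=pp.filter)
# -> roughly: ['thirteen', 'dancing', 'robots']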
Example #5

from gensim.parsing.preprocessing import (strip_tags, strip_non_alphanum,
                                          strip_multiple_whitespaces, split_alphanum,
                                          strip_short, strip_punctuation)

def gensim_clean_string(textIn, _strip_tags=True, _split_alphanumeric=True, _strip_nonalphanumeric=True,
                        _strip_multiple_whitespace=True, _strip_short=True, _short_charcount_min=3,
                        _strip_punctuation=False, _convert_to_lower=False):
    cleaner = textIn
    if _strip_tags:
        cleaner = strip_tags(cleaner)
    if _strip_nonalphanumeric:
        cleaner = strip_non_alphanum(cleaner)
    if _strip_multiple_whitespace:
        cleaner = strip_multiple_whitespaces(cleaner)
    if _split_alphanumeric:
        cleaner = split_alphanum(cleaner)
    if _strip_short:
        cleaner = strip_short(cleaner, minsize=_short_charcount_min)
    if _strip_punctuation:
        cleaner = strip_punctuation(cleaner)  # the original accepted this flag but never acted on it
    if _convert_to_lower:
        cleaner = cleaner.lower()

    return cleaner
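
A usage sketch showing how the flags compose (output approximate):

html = "<p>Model X9 ran 3 tests</p>"
print(gensim_clean_string(html, _convert_to_lower=True))
# tags stripped, "X9" split into "X 9", then 1- and 2-character tokens dropped:
# -> roughly: "model ran tests"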
Example #6
from gensim import corpora
from gensim.parsing.preprocessing import (remove_stopwords, stem_text,
                                          strip_short, strip_punctuation)
from nltk import tokenize
from nltk.tokenize import word_tokenize

def save_word_dict(text):
    proc_text = []

    sentences = tokenize.sent_tokenize(text)

    for sentence in sentences:
        sentence_without_stops = remove_stopwords(sentence)
        sentence_without_stops = stem_text(sentence_without_stops)  # lowercase + Porter stem
        sentence_without_stops = strip_short(sentence_without_stops)
        sentence_without_stops = strip_punctuation(sentence_without_stops)

        proc_sentence = word_tokenize(sentence_without_stops.lower())

        # skip sentences that are empty after cleaning
        if len(proc_sentence) == 0:
            continue
        proc_text.append(proc_sentence)

    dictionary = corpora.Dictionary(proc_text)
    return [dictionary, proc_text, sentences]
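
The returned Dictionary maps each token to an integer id and can build bag-of-words vectors. A minimal sketch (assumes nltk's punkt data is downloaded; token ids depend on insertion order):

dictionary, proc_text, sentences = save_word_dict("Cats chase mice. Dogs chase cats.")
print(dictionary.token2id)               # e.g. {'cat': 0, 'chase': 1, ...}
print(dictionary.doc2bow(proc_text[0]))  # bag-of-words counts for the first sentence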
Example #7
# Method of a unittest.TestCase; words shorter than 3 characters ("du", "59") are removed.
def testStripShort(self):
    self.assertEqual(strip_short("salut les amis du 59", 3), "salut les amis")
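
For reference, strip_short's default minsize is 3, so the explicit argument above matches the default behavior:

from gensim.parsing.preprocessing import strip_short

print(strip_short("salut les amis du 59"))
# -> "salut les amis"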