import re
import unicodedata

import contractions
import nltk
import spacy
import unidecode
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer

# Shared NLP resources, loaded once at import time.
nlp = spacy.load('en_core_web_lg')
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')

# Extra entries for the `contractions` package.
# NOTE(review): "2st" and "3th" look like typos for "2nd" and "3rd" -- confirm
# against the data before changing; they are kept as written here.
contractions.add("n't", "not")
contractions.add("1st", "first")
contractions.add("2st", "second")
contractions.add("3th", "third")


def lemmatize_text(text):
    """Lemmatize ``text`` with the module-level spaCy pipeline.

    Args:
        text: The text to lemmatize. It is passed straight to ``nlp``, so a
            single string is expected.

    Returns:
        The token lemmas joined by single spaces. A token whose lemma is the
        ``'-PRON-'`` placeholder keeps its original text instead.
    """
    doc = nlp(text)
    # '-PRON-' is a placeholder lemma rather than a real word (presumably the
    # spaCy 2.x pronoun lemma), so fall back to the surface form for it.
    return ' '.join(
        token.lemma_ if token.lemma_ != '-PRON-' else token.text
        for token in doc
    )
def expandText(txt):
    """Expand contractions in ``txt`` after registering extra informal forms.

    The extra forms are registered with the ``contractions`` package on every
    call (registration is global to that package), then ``contractions.fix``
    is applied.

    Args:
        txt: The text to expand.

    Returns:
        The result of ``contractions.fix(txt)``.
    """
    # Apostrophe-less spellings and chat shorthand to register before fixing.
    # NOTE(review): mapping "its" -> "it is" presumably also rewrites the
    # possessive "its" -- confirm this is intended.
    extra_contractions = (
        ("wont", "will not"),
        ("dont", "do not"),
        ("doesnt", "does not"),
        ("dn't", "did not"),
        ("cant", "can not"),
        ("its", "it is"),
        ("idk", "i do not know"),
    )
    for short_form, expansion in extra_contractions:
        contractions.add(short_form, expansion)

    return contractions.fix(txt)
# An elided French l', d', j' or n' (straight or typographic apostrophe) that
# is directly followed by a non-space character. The lookahead leaves that
# character unconsumed so back-to-back elisions are all expanded in one pass.
_FRENCH_ELISION_RE = re.compile(r"\b([ldjn])['’](?=\S)")


def expand_contractions(tweets):
    """Expand common English and French contractions in each tweet.

    Each tweet goes through ``contractions.fix`` (with a few French entries
    registered first), is lower-cased, and then has the remaining French
    elisions expanded with a regular expression.

    Args:
        tweets: iterable of tweet strings

    Returns:
        list of lower-cased tweets with the contractions expanded

    References:
        https://github.com/kootenpv/contractions
    """
    # The library is designed for English contractions, so only the French
    # ones have to be added. Both apostrophe variants are registered.
    french_contractions = (
        ("c'est", "cest"),
        ("c’est", "cest"),
        ("qu'il", "que il"),
        ("qu’il", "que il"),
        ("s'il", "si il"),
        ("s’il", "si il"),
    )
    for short_form, expansion in french_contractions:
        contractions.add(short_form, expansion)

    # The remaining French elisions are expanded by the regular expression:
    #   l’intelligence --> le intelligence
    #   d’bananes      --> de bananes
    #   j’avais        --> je avais
    #   n’aurait       --> ne aurait
    return [
        _FRENCH_ELISION_RE.sub(r"\1e ", contractions.fix(tweet).lower())
        for tweet in tweets
    ]
def test_add():
    """Check that an entry registered with ``contractions.add`` is applied by ``contractions.fix``."""
    short_form, expansion = 'mychange', 'my change'
    contractions.add(short_form, expansion)
    expanded = contractions.fix(short_form)
    assert expanded == expansion
def update_acronyms():
    """Pass every item of ``common_acronym_list.acronym_list`` to ``con.add``.

    NOTE(review): ``con`` and ``common_acronym_list`` are not defined in the
    visible part of this file; ``con`` is presumably an alias for the
    ``contractions`` package -- confirm.

    Returns:
        Always ``True``.
    """
    acronym_items = common_acronym_list.acronym_list.items()
    for acronym, expansion in acronym_items:
        con.add(acronym, expansion)
    return True