def lima(word, words):
    """Lemmatize ``word`` using the POS tag it receives inside ``words``.

    ``words`` is tagged as a whole with ``pos_tag`` and the resulting
    (token, tag) pairs are turned into a dict, so when a token occurs more
    than once the tag of its last occurrence is the one that is used.

    Args:
        word: The token to lemmatize.
        words: The token sequence that provides tagging context.

    Returns:
        Whatever ``lemmatize`` yields for ``word`` with the part of speech
        produced by ``get_wordnet_pos``.
    """
    lemmatiser = wnl()
    tag_by_token = dict(pos_tag(words))
    # .get() hands None to get_wordnet_pos when ``word`` is not in ``words``.
    wordnet_pos = get_wordnet_pos(tag_by_token.get(word))
    return lemmatiser.lemmatize(word, wordnet_pos)
def clean_text(text):
    """Normalise a tweet-like string and lemmatize every word as a verb.

    Steps, in order:
      1. lower-case the text;
      2. blank out @mentions, ``scheme://`` URLs and a leading "rt" marker;
      3. delete every character found in ``string.punctuation``;
      4. blank out any remaining character that is not an ASCII letter,
         digit, space or tab, plus leftover "http" fragments;
      5. lemmatize each whitespace-separated word with ``pos="v"``.

    Step 2 has to run before step 3: once '@', ':' and '/' have been
    deleted, the mention and URL patterns can no longer match and their
    text would leak into the output. Lower-casing first lets the "rt"
    marker match regardless of the case it was typed in.

    Args:
        text: The raw string to clean.

    Returns:
        str: The cleaned, lemmatized words joined by single spaces.
    """
    # One lemmatizer for the whole call instead of one per word.
    lemmatizer = wnl()

    lowered = text.lower()
    # \b keeps "^rt" from eating the start of ordinary words such as "rtfm".
    without_refs = regex.sub(
        r"(@[A-Za-z0-9]+)|(\w+:\/\/\S+)|^rt\b", " ", lowered
    )
    without_punct = ''.join(
        char for char in without_refs if char not in string.punctuation
    )
    words = regex.sub(r"([^A-Za-z0-9 \t])|http.+?", " ", without_punct)

    return ' '.join(
        lemmatizer.lemmatize(word, pos="v") for word in words.split()
    )
def clean_text_data(data):
    """Lower-case, strip non-letters from, and lemmatize each text in ``data``.

    Every text is split on single spaces. A word is dropped when its raw
    form, its lower-cased form, or its letters-only form is in ``stop``;
    checking only the raw form let capitalised or punctuated stop words
    such as "The" or "the," through. Surviving words are reduced to their
    ASCII letters and lemmatized. A word made up solely of non-letters
    becomes an empty string and is still emitted, so the output can contain
    consecutive spaces.

    Args:
        data: Iterable of strings.

    Returns:
        list: One cleaned string per input text, words joined by a space.
    """
    non_letters = re.compile("[^a-zA-Z]")
    # One lemmatizer for the whole call instead of one per token.
    lemmatizer = wnl()

    cleaned_texts = []
    for text in data:
        kept = []
        for word in text.split(" "):
            lowered = word.lower()
            token = non_letters.sub("", lowered)
            if word in stop or lowered in stop or token in stop:
                continue
            kept.append(lemmatizer.lemmatize(token))
        cleaned_texts.append(" ".join(kept))
    return cleaned_texts
def __init__(self):
    """Set up the error containers, the rule table and the NLP tooling.

    Creates a LanguageTool checker for 'en-US', a WordNet lemmatizer and
    the spaCy 'en' pipeline, then configures root logging to write
    DEBUG-level records to "log_file.log", opened in 'w' mode.
    """
    # Pronoun -> list of verbs.
    # NOTE(review): presumably the auxiliaries accepted after each pronoun;
    # confirm against the methods that read this table.
    self.rule_dic = {
        'i': [
            'am', 'could', 'should', 'have', 'did', 'had', 'will',
            'was', 'can', 'shall', 'may', 'might', 'must', 'would',
        ],
        'he': [
            'is', 'could', 'should', 'did', 'has', 'will', 'had',
            'was', 'can', 'shall', 'may', 'might', 'must', 'would',
        ],
        'you': [
            'are', 'had', 'could', 'should', 'did', 'have', 'will',
            'were', 'can', 'shall', 'may', 'might', 'must', 'would',
        ],
    }
    self.error_list = list()
    self.error_dic = {}

    self.tool = language_check.LanguageTool('en-US')
    self.lemmatizer = wnl()
    self.nlp = spacy.load('en')

    logging.basicConfig(
        filename="log_file.log",
        format='%(asctime)s %(message)s',
        filemode='w',
        level=logging.DEBUG,
    )
def PreprocessCSV(csvfile, outputfile):
    """Normalise the comments of a labelled CSV and collect their vocabulary.

    Reads the "Insult" and "Comment" columns of ``csvfile``. Each comment is
    lower-cased, stripped of surrounding double quotes, has a fixed set of
    whitespace / high-byte characters replaced by spaces and the symbols
    ``$%^&*[]`` removed, is tokenised with ``wt``, reduced to its purely
    alphabetic tokens and lemmatized. The token "auselessflag" is appended
    to every comment so that no comment ends up blank. The result is written
    to ``outputfile`` with the columns ordered Insult, Comment.

    Args:
        csvfile: Path of the input CSV; must contain the columns "Insult"
            and "Comment".
        outputfile: Path the cleaned CSV is written to (no index column).

    Returns:
        list: The distinct lemmatized tokens seen across all comments.
    """
    print("Start preprocessing %s ..." % csvfile)

    dataframe = pandas.read_csv(csvfile, usecols=["Insult", "Comment"])
    # Select by name, not position: ``usecols`` does not reorder columns, so
    # positional access silently swaps labels and text whenever the file
    # lists "Comment" before "Insult".
    labels = dataframe["Insult"].tolist()
    sents = dataframe["Comment"].tolist()

    # Characters that are replaced by a single space in every comment.
    blank_chars = ("\t", "\n", "\xa0", "\xc2", "\xc8", "\xec", "\x80", "\xa6")
    # One lemmatizer for the whole file instead of one per token.
    lemmatizer = wnl()

    voc = []
    newsents = []
    for sent in sents:
        sent = sent.strip("\"").lower()
        for char in blank_chars:
            sent = sent.replace(char, " ")
        sent = re.sub(r"[$%^&*\[\]]", "", sent)

        # Keep only purely alphabetic tokens, lemmatized.
        newtks = [lemmatizer.lemmatize(tk) for tk in wt(sent) if tk.isalpha()]
        voc.extend(newtks)

        # In case of blank, add a useless flag at the end.
        newsents.append(" ".join(newtks) + " " + "auselessflag")

    # Write the output file.
    col_order = ["Insult", "Comment"]
    dataframe2 = pandas.DataFrame({"Insult": labels, "Comment": newsents})
    dataframe2.to_csv(outputfile, index=False, columns=col_order)

    wordlist = list(FreqDist(voc).keys())
    print(
        "file \"%s\" is preprocessed, and there are %d keys in the return wordlist."
        % (csvfile, len(wordlist)))
    return wordlist
def preProcessData(tuple):
    """Clean, tokenise and lemmatize each tweet of (tweet, sentiment) pairs.

    For every tweet the function removes '#' characters, @usernames,
    http/https URLs, a fixed set of punctuation marks and HTML tags,
    collapses runs of a repeated lower-case letter to a single letter,
    strips leading non-letters, splits camelCase, squeezes whitespace,
    drops English stop words and lemmatizes the remaining tokens.

    NOTE(review): as far as the visible code goes, the cleaned tokens are
    neither stored nor returned; the only observable output is a debug
    print of the tokens of the seventh pair (index 6).

    Args:
        tuple: Iterable of ``(tweet, sentiment)`` pairs. The name shadows
            the builtin but is kept for keyword-argument compatibility.
    """
    # Load the stop words once (not once per tweet) and use a set so each
    # membership test is O(1).
    stop_words = set(nltk.corpus.stopwords.words('english'))
    lemmatizer = wnl()

    for count, (tweet, sentiment) in enumerate(tuple):
        # Remove hashtag markers
        tweet = re.sub('#', '', tweet)
        # Remove usernames like @Rahul
        tweet = re.sub(r'@[\w\d_]*', '', tweet)
        # Remove URLs; "s?" also covers https://, which the old pattern missed
        tweet = re.sub(r'https?://[\w\d\./]*', '', tweet)
        # Remove punctuation
        tweet = re.sub(r'[%\.\'\"\?:,;!-]', ' ', tweet)
        # Remove HTML tags
        tweet = re.sub('<.*?>', '', tweet)
        # Collapse repeated lower-case letters ("sooo" -> "so")
        tweet = re.sub(r'([a-z])\1+', r'\1', tweet)
        # Strip leading characters that are not letters
        tweet = re.sub(r'^[^a-zA-Z]+', ' ', tweet)
        # Convert camelCase into space-separated words
        tweet = re.sub(r"([a-z])([A-Z])", r"\g<1> \g<2>", tweet)
        # Remove additional white space
        tweet = re.sub(r'[\s]+', ' ', tweet)

        # Remove stop words. Build a new list: removing from the list while
        # iterating over it skips the element after every removal, so
        # adjacent stop words used to survive.
        tweet = [word for word in tweet.split() if word not in stop_words]

        # Lemmatize words
        tweet = [lemmatizer.lemmatize(word) for word in tweet]

        if count == 6:
            # Debug output for a single sample; the call form of print
            # behaves the same on Python 2 and Python 3 for one argument.
            print(tweet)
else: uncontracted.append(x) elif x.lower() in contractions.keys(): uncontracted.append(contractions[x.lower()]) elif x in contractions.keys(): uncontracted.append(contractions[x]) else: uncontracted.append(x) return (" ".join(uncontracted)) # In[17]: from nltk.stem import WordNetLemmatizer as wnl lemmatizer = wnl() # In[18]: def nltk_tag_to_wordnet_tag(nltk_tag): if nltk_tag.startswith('J'): return wordnet.ADJ elif nltk_tag.startswith('V'): return wordnet.VERB elif nltk_tag.startswith('N'): return wordnet.NOUN elif nltk_tag.startswith('R'): return wordnet.ADV else: return None
ranscendent cinematic experience. Thank you to everybody at Fox and New Regency … my entire team. I have to thank everyone from the very onset of my career … To my parents; none of this would be possible without you. And to my friends, I love you dearly; you know who you are. And lastly, I just want to say this: Making The Revenant was about man's relationship to the natural world. A world that we collectively felt in 2015 as the hottest year in recorded history. Our production needed to move to the southern tip of this planet just to be able to find snow. Climate change is real, it is happening right now. It is the most urgent threat facing our entire species, and we need to work collectively together and stop procrastinating. We need to support leaders around the world who do not speak for the big polluters, but who speak for all of humanity, for the indigenous people of the world, for the billions and billions of underprivileged people out there who would be most affected by this. For our children’s children, and for those people out there whose voices have been drowned out by the politics of greed. I thank you all for this amazing award tonight. Let us not take this planet for granted. I do not take tonight for granted. Thank you so very much.""" s = nk.sent_tokenize(paragraph) lm = wnl() #Lemmatization that better meaningful word gives as output for i in range(len(s)): words = nk.word_tokenize(s[i]) nwords = [lm.lemmatize(word) for word in words] s[i] = ' '.join(nwords)
def wordLemmatizer(tweet_words):
    """Lemmatize every word of ``tweet_words`` with the default part of speech.

    Args:
        tweet_words: Iterable of word strings.

    Returns:
        list: The lemma of each word, in input order.
    """
    # Build the lemmatizer once per call rather than once per word.
    lemmatize = wnl().lemmatize
    return [lemmatize(word) for word in tweet_words]
def __init__(self):
    """Prepare the tokenizer, the lemmatizer and the delimiter/stop-word sets.

    Attributes set:
        delims: The punctuation characters '.', ',', '!', ':' and ';'.
        stopwords: A set holding the single word "the".
        tokenizer: The object loaded from the English punkt pickle.
        wnl: A WordNet lemmatizer instance.
    """
    self.delims = set(".,!:;")
    self.stopwords = {"the"}

    punkt_resource = 'tokenizers/punkt/english.pickle'
    self.tokenizer = data.load(punkt_resource)
    self.wnl = wnl()
def lemmatizer_list(row):
    """Lemmatize the tokens stored under ``row['tokenized_text']``.

    Each token is lemmatized with the part of speech that
    ``get_wordnet_pos`` reports for that token on its own.

    Args:
        row: Mapping-like object with a 'tokenized_text' entry holding an
            iterable of tokens.

    Returns:
        list: The lemmatized tokens, in their original order.
    """
    lemmatize = wnl().lemmatize
    lemmas = []
    for token in row['tokenized_text']:
        lemmas.append(lemmatize(token, get_wordnet_pos(token)))
    return lemmas
sys.path.append("..") from Parsers.Stemming.lovins import stem as lovins_stem from nltk.stem.porter import PorterStemmer as porter from import LancasterStemmer as lancs from nltk.stem.snowball import EnglishStemmer as snowball from nltk.stem import WordNetLemmatizer as wnl #Source : reqVersion = (3,0) curVersion = sys.version_info _lancs = lancs() _porter = porter() _snowball = snowball() _wnl = wnl() def removeEscapeChars(textString): if curVersion < reqVersion: return textString.encode('string_escape').decode('string_escape') #Python 2 else: return textString.encode('encode_escape').decode("unicode_escape") #Python 3 def getASCIIChars(textString): return unicode(string, 'ascii', 'ignore') #Source: def detectEncoding(textString): try: return chardet.detect(textString) except UnicodeDecodeError:
# generate the list of words f = open('word_freq_final.txt', 'r') wordList = [] count = 0 for line in f: wordList.append(line.split(',')[0]) wordList.sort() """g = open('wordlist_final.txt', 'w') for i in range(len(wordList)): g.write(str(i) + "-" + wordList[i] + "\n")""" cwd = os.getcwd() valid_ext = ".jpg" d = enchant.Dict("en") wnl = wnl() # files of data set images # the data set must be contained in a folder (titled "imgs") within the working directory paths = ["imgs/1_early-renaissance" , "imgs/2_high-renaissance" , "imgs/3_mannerism-late-renaissance" , "imgs/4_northern-renaissance" , "imgs/5_baroque" , "imgs/6_rococo" , "imgs/7_romanticism" , "imgs/8_impressionism" , "imgs/9_post-impressionism" , "imgs/10_realism" , "imgs/11_art-nouveau-modern" , "imgs/12_cubism"