def word_freq(self, top_n=10, group=True, bi=True):
    """Show word frequencies and word clouds.

    Parameters
    ----------
    top_n : int
        Number of top-frequency words to show.
    group : bool
        If True, produce frequencies and word clouds for each token
        list in the list (requires labels).
    bi : bool
        Whether to show bigrams as well.
    """
    if group and self.labels is None:
        group = False
    from nltk import FreqDist as fd
    import pandas as pd

    if not group:
        tks = [t for l in self.lst_tk_lsts for t in l if isinstance(t, str)]
        ndist = pd.Series(fd(tks)).sort_values(ascending=False)
        sm = ndist.sum()
        ndist = ndist.map(lambda x: str(round(x / sm * 100, 2)) + '%')
        word_cloud(' '.join(tks))
        print(ndist[:top_n])
        if bi:
            df_bi = self.vectorize(method='count', ngram_range=(2, 2),
                                   stop_words='english', return_df=True)
            if 'label' in df_bi.columns:
                df_bi.drop(columns=['label'], inplace=True)
            sr = df_bi.sum().sort_values(ascending=False)
            total = sr.sum()
            sr = sr.map(lambda x: str(round(x / total * 100, 1)) + '%')
            print(sr[:top_n])
        return None

    if bi:
        df_bi = self.vectorize(method='count', ngram_range=(2, 2),
                               stop_words='english', return_df=True)
        df = df_bi.groupby('label').sum()
    sr = pd.Series(self.lst_tk_lsts, index=self.labels)
    for i in set(self.labels):
        print(i, ':')
        sr0 = sr[i].to_list()
        tks = [t for k in sr0 for t in k]
        ndist = pd.Series(fd(tks)).sort_values(ascending=False)
        sm = ndist.sum()
        ndist = ndist.map(lambda x: str(round(x / sm * 100, 2)) + '%')
        word_cloud(' '.join(tks))
        print(ndist[:top_n], '\n')
        if bi:
            sr1 = df.loc[i, :].sort_values(ascending=False)
            total = sr1.sum()
            sr1 = sr1.map(lambda x: str(round(x / total * 100, 1)) + '%')
            print(sr1[:top_n], '\n')
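# Hedged sketch: the heart of word_freq above is turning an nltk FreqDist
# into a percentage table. A standalone toy of that single step (the token
# list here is illustrative, not from the original class):
def _freq_percent_demo():
    from nltk import FreqDist
    import pandas as pd
    tokens = ['spam', 'ham', 'spam', 'eggs', 'spam']
    dist = pd.Series(FreqDist(tokens)).sort_values(ascending=False)
    total = dist.sum()
    print(dist.map(lambda x: str(round(x / total * 100, 2)) + '%'))
    # -> spam 60.0%, ham 20.0%, eggs 20.0%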
def __getSpeaketDict(self):
    # Build per-speaker word statistics; fd is nltk.FreqDist and
    # word_tokenize comes from nltk (module-level imports).
    for speaker in self.speakerData:
        data = self.speakerData[speaker]['statement'].strip()
        self.totalTrainData += data + " "
        words = dict(fd(word_tokenize(data)))
        self.speakerData[speaker]["words"] = words
        self.speakerData[speaker]["wordCount"] = sum(words.values())
        self.speakerData[speaker]["classVocabSize"] = len(words)
        self.uniqueList.append(words.keys())
from nltk import FreqDist as fd
from pandas import DataFrame as df


def list_to_fd_DF(lt, t):
    """Turn a list of items into a frequency DataFrame, one row per item."""
    lt_fd = fd(lt)
    id_ = []
    row = []
    for col in lt_fd:
        row.append([col, lt_fd[col]])
        id_.append(col)
    return df(row, index=id_,
              columns=['FileName_{0}'.format(t), 'Count_{0}'.format(t)])
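# Hedged usage sketch for list_to_fd_DF (the values are illustrative):
if __name__ == '__main__':
    print(list_to_fd_DF(['a.txt', 'b.txt', 'a.txt'], 'train'))
    # -> rows indexed by item, columns FileName_train and Count_train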
from nltk import FreqDist as fd
from nltk.tokenize import word_tokenize as wt


def fristXpq(sDat, max_len, x=0):
    """Collect [ngram, count] pairs for every n-gram size from 1 to max_len.

    x == 0 keeps all n-grams; otherwise only the x most common per size.
    """
    token = wt(sDat.lower())
    lt = []
    if len(sDat) < max_len:
        max_len = len(sDat)
    for i in range(1, max_len + 1):
        if i > 1:
            # textToWordList (defined elsewhere) builds the i-gram list.
            text = textToWordList(token, i)
        else:
            text = token
        if x == 0:
            fdist = fd(text).most_common()
        else:
            fdist = fd(text).most_common(x)
        lt.extend([[w, c] for w, c in fdist])
    return lt
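# textToWordList is not shown in this file. A plausible sketch of what it
# does, judging from its use above (an assumption, not the original code):
def textToWordList_sketch(tokens, n):
    # Join each run of n consecutive tokens into a space-separated n-gram.
    return [' '.join(tokens[j:j + n]) for j in range(len(tokens) - n + 1)]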
import nltk
from nltk import FreqDist as fd
from nltk.corpus import udhr


def guess_lang(text):
    """Guess the language of the text. This version covers only Spanish,
    German and English, and the sample needs to be quite big, but it
    could be enhanced."""
    Spanish = udhr.words('Spanish-Latin1')
    German = udhr.words('German_Deutsch-Latin1')
    English = udhr.words('English-Latin1')
    spanfd = fd(Spanish)
    gerfd = fd(German)
    enfd = fd(English)
    small_spanfd = {}
    small_gerfd = {}
    small_enfd = {}
    text_fd = fd(nltk.regexp_tokenize(text.lower(), r'\w+'))
    # Restrict each reference distribution to words that occur in the text.
    for key in spanfd:
        if key in text_fd:
            small_spanfd[key] = spanfd[key]
    for key in enfd:
        if key in text_fd:
            small_enfd[key] = enfd[key]
    for key in gerfd:
        if key in text_fd:
            small_gerfd[key] = gerfd[key]
    # cor() is a correlation helper defined elsewhere in this project.
    corwithspan = cor(small_spanfd, text_fd)
    corwithen = cor(small_enfd, text_fd)
    corwithger = cor(small_gerfd, text_fd)
    best = max(abs(corwithspan), abs(corwithen), abs(corwithger))
    if abs(corwithspan) == abs(corwithen) == abs(corwithger):
        print("I don't know...")
    elif best == abs(corwithspan):
        print("It's Spanish!")
    elif best == abs(corwithen):
        print("It's English!")
    else:
        print("It's German!")
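# cor() is not shown above. One plausible reading is a Pearson correlation
# over the counts of the shared vocabulary; a sketch under that assumption:
def cor_sketch(ref_fd, text_fd):
    from statistics import correlation  # Python 3.10+
    shared = sorted(set(ref_fd) & set(text_fd))
    return correlation([ref_fd[w] for w in shared],
                       [text_fd[w] for w in shared])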
def arrange(devdict, option=0):
    global exportlist
    exportlist = []
    print('getting dict')
    for x, y in devdict.items():  # x is the index key, y is its value
        combrole = []
        combtag = []
        combtag2 = []
        tagcount = 0
        for z in y[1:]:  # grab the roles
            combrole.append(z[0])
            if z[1] != 'NoTag':
                tagcount += 1
            for a in z[3:]:  # grab the tags
                combtag.append(a)
        if combtag:
            combtagfd = fd(combtag)  # fd = nltk.FreqDist
            for b, c in combtagfd.items():  # b is the tag, c its frequency
                combtag2.append(b)
                combtag2.append(str(c))
        if option == 1:
            if len(combrole) > 1:
                combrole = [' | '.join(combrole)]
        elif option == 0:
            # Pad the role list to a fixed width of 54 columns.
            if len(combrole) < 54:
                combrole.extend([''] * (54 - len(combrole)))
        exportlist.append([y[0][0]] + combrole + y[0][1:] +
                          [str(tagcount)] + combtag2)
    print('done')
          mode='rb') as f:
    word_tokenized_lowered = pickle.load(f)
with open('{}/data/pos_dicts.pickle'.format(mod_path), mode='rb') as file:
    pos_tag_dict, pos_tag_dict_univ = pickle.load(file)
with open('{}/data/abbrev_dict.pickle'.format(mod_path), mode='rb') as file:
    abbrevs_orig = pickle.load(file)
with open('{}/data/sig_dict.pickle'.format(mod_path), mode='rb') as file:
    sig_dict = pickle.load(file)

brown = word_tokenized_lowered[:1161192]
# Weight the 101st-5000th most common Brown words by log(N / freq).
brown_common = {
    word: log(1161192 / freq)
    for word, freq in fd(brown).most_common(5000)[100:]
}
words = [w for w, freq in fd(brown).most_common()]
names_lower = {w.lower() for w in names.words()}


def expand_EXPN(nsw, i, text, user_abbrevs={}):
    """Expand abbreviations to the best possible match.
    If there are no close matches, return nsw."""
    try:
        if user_abbrevs:
            abbrevs = create_user_abbrevs(user_abbrevs)
        else:
            abbrevs = abbrevs_orig
        if nsw in ['St.', 'st.', 'St']:
            if i < len(text):
import random

from nltk import FreqDist as fd
from nltk.corpus import movie_reviews as mr
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

documents = [(list(mr.words(fileid)), category)
             for category in mr.categories()
             for fileid in mr.fileids(category)]
random.shuffle(documents)

all_words = fd(w.lower() for w in mr.words())
word_features = list(all_words.keys())[:3000]


def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = w in words  # boolean flag, not a one-element set
    return features


featuresets = [(find_features(rev), category) for (rev, category) in documents]
training_set = featuresets[:2500]
testing_set = featuresets[2500:]
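# Hedged sketch: the sklearn imports above suggest wrapping these estimators
# for NLTK-style feature dicts via SklearnClassifier, a common pattern that
# is assumed here rather than shown in the original:
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.classify.util import accuracy

mnb = SklearnClassifier(MultinomialNB())
mnb.train(training_set)
print('MultinomialNB accuracy:', accuracy(mnb, testing_set))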
def __genBagOfWords(self):
    self.bagOfWords = dict(fd(word_tokenize(self.totalTrainData.strip())))
    self.vocabSize = len(self.bagOfWords)
    self.totalwords = sum(self.bagOfWords.values())
        if t.lower() not in stops:
            pos = pos_tag([t])
            clean_word = lemmatizer.lemmatize(t, pos=get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words


clean_words_train = [(clean_tweets(tweet), sentiment)
                     for tweet, sentiment in tweet_words]
clean_words_test = [clean_tweets(tweet) for tweet in test_tweet_words]
print(clean_words_train)

all_words = []
for tweet in clean_words_train:
    all_words += tweet[0]

freq = fd(all_words)
common = freq.most_common(200)
features = [i[0] for i in common]


def get_feature_dict(words):
    current_features = {}
    words_set = set(words)
    for w in features:
        current_features[w] = w in words_set
    return current_features


training_data = [(get_feature_dict(tweet), sentiment)
                 for tweet, sentiment in clean_words_train]
testing_data = [get_feature_dict(tweet) for tweet in clean_words_test]
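# Hedged follow-on sketch: with featuresets in NLTK's (features, label) form,
# a classifier can be trained directly (an assumed next step, not shown above):
from nltk import NaiveBayesClassifier

classifier = NaiveBayesClassifier.train(training_data)
predictions = [classifier.classify(feats) for feats in testing_data]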
from nltk import FreqDist as fd
from nltk.tokenize import word_tokenize, sent_tokenize


def createVector(filename, last):
    # --- Lexical features ---
    with open('text/' + filename, 'r') as doc:
        DocString = doc.read()
    data = []
    charactercount = len(DocString)
    alphebetcount = 0
    uppercasecount = 0
    digitcount = 0
    whitespacecount = 0
    tab_and_spacecount = 0
    # TODO: frequency of special characters
    # --- Word-based features ---
    totalcharinwordsC = 0
    # Vocabulary-richness measures, not yet implemented:
    Yule_K_measure = 0      # Yule's K
    Simpson_D_measure = 0   # Simpson's D
    Sichel_S_measure = 0    # Sichel's S
    Brunet_W_measure = 0    # Brunet's W
    Honore_R_measure = 0    # Honore's R
    Freq_dist_Wordlen = 0   # need 20 of these
    punclist = [0, 0, 0, 0, 0, 0, 0, 0]  # , . ? ! : ; ' "
    for x in DocString:
        if x.isalpha():
            alphebetcount += 1
        if x.isdigit():
            digitcount += 1
        if x.isupper():
            uppercasecount += 1
        if x.isspace():
            whitespacecount += 1
        if x == ',':
            punclist[0] += 1
        if x == '.':
            punclist[1] += 1
        if x == '?':
            punclist[2] += 1
        if x == '!':
            punclist[3] += 1
        if x == ':':
            punclist[4] += 1
        if x == ';':
            punclist[5] += 1
        if x == '\'':
            punclist[6] += 1
        if x == '"':
            punclist[7] += 1
    # Per-letter A-Z counts (case-insensitive), plus tab/space counts.
    letcur = [0] * 26
    for line in DocString.splitlines():
        tab_and_spacecount += line.count('\t')
        tab_and_spacecount += line.count(' ')
        for i in range(26):
            letcur[i] += line.count(chr(i + 65)) + line.count(chr(i + 97))
    words = word_tokenize(DocString, language='english')
    totalwordcount = len(words)
    totalshortwords = len([x for x in words if len(x) < 4])
    for x in words:
        totalcharinwordsC += len(x)
    avgwordlen = totalcharinwordsC / len(words)
    sents = sent_tokenize(DocString, language='english')
    avg_sent_len_chars = sum(len(x) for x in sents) / len(sents)
    avg_sent_len_word = (sum(len(word_tokenize(x, language='english'))
                             for x in sents) / len(sents))
    freqdist = fd(words)
    total_dif_words = len(freqdist)
    freq_once_ocur_words = len(freqdist.hapaxes())  # Hapax legomena
    freq_twice_ocur_words = sum(1 for w in freqdist
                                if freqdist[w] == 2)  # Hapax dislegomena
    # Counts of words occurring exactly 3..20 times.
    freqwords = [0] * 18
    for x in freqdist:
        if 3 <= freqdist[x] <= 20:
            freqwords[freqdist[x] - 3] += 1
    # Richness measures and syntactic features (frequency of punctuation and
    # function words) will be loaded from the research doc.
    # Structural features.
    data += ([charactercount, alphebetcount, uppercasecount, digitcount,
              whitespacecount, tab_and_spacecount] + letcur +
             [totalwordcount, total_dif_words, totalshortwords,
              totalcharinwordsC, freq_twice_ocur_words, freq_once_ocur_words]
             + freqwords + punclist)
    with open('profiles/' + last + '.txt', 'a') as out:
        out.write('\n')
        for a in data:
            out.write(str(a) + ',')
        out.write(' ')
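# Hedged usage sketch: createVector reads text/<filename> and appends one
# comma-separated feature row to profiles/<last>.txt. The file names below
# are illustrative and must exist for the call to succeed:
createVector('sample01.txt', 'doe')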
        elif n:
            middle = n.group(1)
            end = n.group(2)
            exp += infer_spaces(middle) + " " + ends[end]
        else:
            return word
        return exp
    except (KeyboardInterrupt, SystemExit):
        raise
    except Exception:
        return word


# Build a cost dict, assuming Zipf's law and cost = -math.log(probability).
brown = word_tokenized_lowered[:1161192]
words = [w for w, freq in fd(brown).most_common()]
wordcost = dict(
    (k, log((i + 1) * log(len(words)))) for i, k in enumerate(words))
maxword = max(len(x) for x in words)


def infer_spaces(s):
    """Use dynamic programming to infer the locations of spaces in a
    string without spaces."""

    # Find the best match for the first i characters, assuming costs have
    # been built for the first i - 1 characters.
    # Returns a pair (match_cost, match_length).
    def best_match(i):
        candidates = enumerate(reversed(cost[max(0, i - maxword):i]))
        return min((c + wordcost.get(s[i - k - 1:i], 9e999), k + 1)
                   for k, c in candidates)