def freq_dst(self, posCorpus, negCorpus):
    # Create frequency distributions for the words in each corpus
    posFreqDist = FreqDist()
    for word in posCorpus.words():
        posFreqDist.inc(word)
    negFreqDist = FreqDist()
    for word in negCorpus.words():
        negFreqDist.inc(word)

    # Frequency distributions with Laplace smoothing
    global posLapFreq
    posLapFreq = nltk.probability.LaplaceProbDist(posFreqDist)
    global negLapFreq
    negLapFreq = nltk.probability.LaplaceProbDist(negFreqDist)

    # Get bigrams
    posBigrams = nltk.bigrams(posCorpus.words())
    negBigrams = nltk.bigrams(negCorpus.words())

    # Get no. of words per corpus
    posWordLen = len(posCorpus.words())
    negWordLen = len(negCorpus.words())

    # Laplace-smoothed frequency distributions for the bigrams
    global posBiFreq
    posBiFreq = nltk.probability.LaplaceProbDist(nltk.FreqDist(posBigrams))
    global negBiFreq
    negBiFreq = nltk.probability.LaplaceProbDist(nltk.FreqDist(negBigrams))
def to_bigram(self, termpos):
    words = [elem[0] for elem in termpos]
    pos_tags = [elem[1] for elem in termpos]
    b_words = nltk.bigrams(words)
    b_pos = nltk.bigrams(pos_tags)
    return (b_words, b_pos)
def similarity(paper1, paper2):
    score = []
    stops = nltk.corpus.stopwords.words('english')  # stopwords to weed out

    ## compare the titles and score the word cosine similarity
    title1 = paper1[1]
    title2 = paper2[1]
    tokens1 = [w for w in nltk.word_tokenize(title1) if w not in stops]
    tokens2 = [w for w in nltk.word_tokenize(title2) if w not in stops]
    fd1 = nltk.FreqDist(tokens1)
    fd2 = nltk.FreqDist(tokens2)
    keys = list(set(list(fd1.keys()) + list(fd2.keys())))
    scoretemp = 0
    for key in keys:
        scoretemp += fd1[key] * fd2[key]
    a = numpy.linalg.norm(numpy.asarray(list(fd1.values()))) * numpy.linalg.norm(numpy.asarray(list(fd2.values())))
    if a:
        score.append(1 - scoretemp / a)
    else:
        score.append(0)

    ## compare the abstracts and score single word cosine similarity
    abstract1 = paper1[3]
    abstract2 = paper2[3]
    tokens1 = [w for w in nltk.word_tokenize(abstract1) if w not in stops]
    tokens2 = [w for w in nltk.word_tokenize(abstract2) if w not in stops]
    fd1 = nltk.FreqDist(tokens1)
    fd2 = nltk.FreqDist(tokens2)
    keys = list(set(list(fd1.keys()) + list(fd2.keys())))
    scoretemp = 0
    for key in keys:
        scoretemp += fd1[key] * fd2[key]
    a = numpy.linalg.norm(numpy.asarray(list(fd1.values()))) * numpy.linalg.norm(numpy.asarray(list(fd2.values())))
    if a:
        score.append(1 - scoretemp / a)
    else:
        score.append(0)

    ## compare the abstracts and score bigram cosine similarity
    tokens1 = nltk.word_tokenize(abstract1)
    tokens2 = nltk.word_tokenize(abstract2)
    bgsall1 = nltk.bigrams(tokens1)
    bgsall2 = nltk.bigrams(tokens2)
    bgs1 = [bg for bg in bgsall1 if bg[0] not in stops and bg[1] not in stops]
    bgs2 = [bg for bg in bgsall2 if bg[0] not in stops and bg[1] not in stops]
    fd1 = nltk.FreqDist(bgs1)
    fd2 = nltk.FreqDist(bgs2)
    keys = list(set(list(fd1.keys()) + list(fd2.keys())))
    scoretemp = 0
    for key in keys:
        scoretemp += fd1[key] * fd2[key]
    # print(fd1.values())
    a = numpy.linalg.norm(numpy.asarray(list(fd1.values()))) * numpy.linalg.norm(numpy.asarray(list(fd2.values())))
    if a:
        score.append(1 - scoretemp / a)
    else:
        score.append(0)

    ## total score is the sum of the three scores
    return sum(score)
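A quick way to exercise `similarity` is with two made-up paper tuples; the indices follow the function's own layout (title at position 1, abstract at position 3), and all of the data below is hypothetical, not from the original:

# Hypothetical usage sketch (assumes nltk and numpy are imported as above).
paper_a = (1, "Neural topic models", None, "We propose a neural approach to topic modeling.")
paper_b = (2, "Topic models revisited", None, "A classical view of topic modeling.")
print(similarity(paper_a, paper_b))  # sum of the three per-field scores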
def test():
    uniDictList = [{} for x in range(6)]
    biDictList = [{} for x in range(6)]
    vocabSize = [0 for x in range(6)]
    totalSize = [0 for x in range(6)]
    biVocabSize = [0 for x in range(6)]
    bitotalSize = [0 for x in range(6)]
    numList = [0 for x in range(6)]
    numCorrect = total = 0

    # randomly split the set: 90% train, 10% test
    for entry in entryList:
        if random.random() > 0.10:
            entry.test = 0
        else:
            entry.test = 1

    # compute train dictionaries
    for entry in entryList:
        if entry.test == 0:
            for word in entry.review.split():
                uniDictList[entry.rating][word] = uniDictList[entry.rating].get(word, 0) + 1
            for bigram in bigrams(entry.review.split()):
                biDictList[entry.rating][bigram] = biDictList[entry.rating].get(bigram, 0) + 1
            numList[entry.rating] += 1
    print numList
    totalCount = reduce(lambda x, y: x + y, numList)

    # compute dictionary stats
    for x in xrange(1, 6):
        vocabSize[x] = len(uniDictList[x].keys())
        totalSize[x] = reduce(lambda x, y: x + y, uniDictList[x].values())
        biVocabSize[x] = len(biDictList[x].keys())
        bitotalSize[x] = reduce(lambda x, y: x + y, biDictList[x].values())

    # testing
    for entry in entryList:
        if entry.test == 1:
            rankProb = [0 for x in range(6)]
            for x in range(1, 6):
                for word in entry.review.split():
                    rankProb[x] += math.log(uniDictList[x].get(word, 1)) - math.log(vocabSize[x] + totalSize[x])
                for bigram in bigrams(entry.review.split()):
                    rankProb[x] += math.log(biDictList[x].get(bigram, 1)) - math.log(biVocabSize[x] + bitotalSize[x])
            rankProb = map(lambda x: x * numList[entry.rating] / totalCount, rankProb)
            entry.pRating = rankProb.index(max(rankProb[1:6]))
            if entry.pRating == entry.rating:
                numCorrect += 1
            total += 1
            print bigrams(entry.review.split())
    return [numCorrect, total]
def estimateLikelihood(self):
    uniqBigrams = set()
    uniqCount = 0
    for tweet in self._focusTweets['aae']:
        tweet = tweet.split('\t')
        for bigram in nltk.bigrams(tweet):
            try:
                dummy = self._biDict[bigram]  # skip bigrams outside the vocabulary
                self._likelihood['aae'][bigram] += 1
                self._likelihood['aae']['__BITOTAL__'] += 1
                if bigram not in uniqBigrams:
                    uniqBigrams.add(bigram)
                    uniqCount += 1
            except:
                continue
    self._likelihood['aae']['__BITOTAL__'] += uniqCount  ## Adding vocab to total for add-one smoothing!!
    sys.stderr.write("Likelihood Bigram Entries AAE:" + str(len(self._likelihood['aae'])) + "\n")

    uniqBigrams = set()
    uniqCount = 0
    for tweet in self._focusTweets['mse']:
        tweet = tweet.split('\t')
        for bigram in nltk.bigrams(tweet):
            try:
                dummy = self._biDict[bigram]
                self._likelihood['mse'][bigram] += 1
                self._likelihood['mse']['__BITOTAL__'] += 1
                if bigram not in uniqBigrams:
                    uniqBigrams.add(bigram)
                    uniqCount += 1
            except:
                continue
    self._likelihood['mse']['__BITOTAL__'] += uniqCount
    sys.stderr.write("Likelihood Bigram Entries MSE:" + str(len(self._likelihood['mse'])) + "\n")
def find_colloc(data):
    # find the most common collocations
    def check(wb, tb):
        if len(wb[0]) <= 1 or len(wb[1]) <= 2:
            return False
        try:
            if detect(wb[0]) != "ar" or detect(wb[1]) != "ar":
                return False
        except:
            return False
        if tb in [("NN", "NN"), ("NN", "DTNN"), ("NNP", "NNP")]:
            return True
        return False

    bigrams = FreqDist()
    for d in data:
        tokens = d["tokens"]
        words_bigrams = nltk.bigrams([t[0] for t in tokens])
        tags_bigrams = nltk.bigrams([t[1] for t in tokens])
        for wb, tb in zip(words_bigrams, tags_bigrams):
            if check(wb, tb):
                bigrams[wb] += 1
    return bigrams
def main():
    text = open('holmes.txt').read()
    tokens = nltk.wordpunct_tokenize(text)
    charList = []
    for word in tokens:
        for char in word:
            charList.append(char)
    fDistChars = nltk.FreqDist(charList)
    fDistWords = nltk.FreqDist(tokens)

    print("Answer to 1A, there are {} character types in the book, namely: \n{}".format(len(fDistChars), sorted(fDistChars)))
    print("\nAnswer to 1B, there are {} word types in the book, namely: \n{}".format(len(fDistWords), sorted(fDistWords)))

    bigramChars = nltk.bigrams(charList)
    trigramChars = nltk.trigrams(charList)
    print("\nAnswer to 1C, the 20 most common characters are: \nUnigrams: \n{}\nBigrams: \n{}\nTrigrams: \n{}".format(
        most_common(charList), most_common(bigramChars), most_common(trigramChars)))

    bigramWords = nltk.bigrams(tokens)
    trigramWords = nltk.trigrams(tokens)
    print("\nAnswer to 1D, the 20 most common words are: \nUnigrams: \n{}\nBigrams: \n{}\nTrigrams: \n{}".format(
        most_common(tokens), most_common(bigramWords), most_common(trigramWords)))

    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(tokens)
    scoredPMI = finder.score_ngrams(bigram_measures.pmi)
    scoredCHI = finder.score_ngrams(bigram_measures.chi_sq)
    print("\nAnswer to 2, the 20 most likely collocations are:\nPMI:\n{} \nChi's square\n{}".format(scoredPMI[:20], scoredCHI[:20]))
    print("\nSpearmans correlation = {}".format(nltk.metrics.spearman.spearman_correlation(scoredPMI, scoredCHI)))
def textsimilarity(text1, text2):
    score = []
    stops = nltk.corpus.stopwords.words('english')  # stopwords to weed out
    stops = stops + ['we', ',', '.', '(', ')', 'using', 'new', 'propose', 'investigate']
    stops = stops + ['-', 'show', 'infer', 'novel', 'method']

    # get tokens and bigrams from the text, either a string or a list of keywords
    if type(text1) is not list:
        alltokens = nltk.word_tokenize(text1.lower())
        allpairs = [list(pair) for pair in nltk.bigrams(alltokens)]
        tokens1 = [token for token in alltokens if token not in stops]
        pairs1 = [" ".join(bg) for bg in allpairs if bg[0] not in stops and bg[1] not in stops]
    else:
        alltokens = []
        allpairs = []
        for el in text1:
            atokens = nltk.word_tokenize(el.lower())
            alltokens += atokens
            apairs = [list(pair) for pair in nltk.bigrams(atokens)]
            allpairs += apairs
        tokens1 = [token for token in alltokens if token not in stops]
        pairs1 = [" ".join(bg) for bg in allpairs if bg[0] not in stops and bg[1] not in stops]

    if type(text2) is not list:
        tokens = nltk.word_tokenize(text2.lower())
        allpairs = [list(pair) for pair in nltk.bigrams(tokens)]
        tokens2 = [token for token in tokens if token not in stops]
        pairs2 = [" ".join(bg) for bg in allpairs if bg[0] not in stops and bg[1] not in stops]
    else:
        alltokens = []   # reset the accumulators so text1's tokens don't leak into text2's
        allpairs = []
        for el in text2:
            atokens = nltk.word_tokenize(el.lower())
            alltokens += atokens
            apairs = [list(pair) for pair in nltk.bigrams(atokens)]
            allpairs += apairs
        tokens2 = [token for token in alltokens if token not in stops]
        pairs2 = [" ".join(bg) for bg in allpairs if bg[0] not in stops and bg[1] not in stops]

    ### score single word cosine similarity
    ## fd1 = nltk.FreqDist(tokens1)
    ## fd2 = nltk.FreqDist(tokens2)
    ## keys = list(set(list(fd1.keys()) + list(fd2.keys())))
    ## scoretemp = 0
    ## for key in keys:
    ##     scoretemp += fd1[key] * fd2[key]
    ## score.append(1 - scoretemp / (numpy.linalg.norm(numpy.asarray(list(fd1.values()))) * numpy.linalg.norm(numpy.asarray(list(fd2.values())))))

    #### score bigram cosine similarity
    ## fd1 = nltk.FreqDist(pairs1)
    ## fd2 = nltk.FreqDist(pairs2)
    ## keys = list(set(list(fd1.keys()) + list(fd2.keys())))
    ## scoretemp = 0
    ## for key in keys:
    ##     scoretemp += fd1[key] * fd2[key]
    ## score.append(1 - scoretemp / (numpy.linalg.norm(numpy.asarray(list(fd1.values()))) * numpy.linalg.norm(numpy.asarray(list(fd2.values())))))

    # count shared unigrams and shared bigrams
    score.append(sum(1 for token in tokens1 if token in tokens2))
    score.append(sum(1 for pair in pairs1 if pair in pairs2))
    print('done')

    ## total score is the sum of the scores
    return sum(score)
def main():
    # Corpus locations
    # for training data
    posTrainCorpus = 'C:/Users/Abhinav/Desktop/Course work/NLP/txt_sentoken/pos_train'
    negTrainCorpus = 'C:/Users/Abhinav/Desktop/Course work/NLP/txt_sentoken/neg_train'
    # for test data
    posTestCorpus = 'C:/Users/Abhinav/Desktop/Course work/NLP/txt_sentoken/pos_test'
    negTestCorpus = 'C:/Users/Abhinav/Desktop/Course work/NLP/txt_sentoken/neg_test'

    # Create plain text corpora for the training data
    posCorpus = PlaintextCorpusReader(posTrainCorpus, '.*')
    negCorpus = PlaintextCorpusReader(negTrainCorpus, '.*')

    # Create plain text corpora for the test data
    posTstCorpus = PlaintextCorpusReader(posTestCorpus, '.*')
    negTstCorpus = PlaintextCorpusReader(negTestCorpus, '.*')

    # Get bigrams
    posBigrams = nltk.bigrams(posCorpus.words())
    negBigrams = nltk.bigrams(negCorpus.words())

    # Get no. of words per corpus
    posWordLen = len(posCorpus.words())
    negWordLen = len(negCorpus.words())

    # Create an object of Lang_Model_Classifier
    obj1 = Lang_Model_Classifier()
    obj1.freq_dst(posCorpus, negCorpus)

    # For negative test data
    for filename in os.listdir(negTestCorpus):
        wordSet = negTstCorpus.words(filename)
        print '**Unigram**'
        unigr = obj1.perp(wordSet)
        print unigr
        print '**Bigram**'
        bigr = obj1.perpBi(nltk.bigrams(wordSet))
        print bigr

    # For positive test data
    for filename in os.listdir(posTestCorpus):
        wordSet2 = posTstCorpus.words(filename)
        print '**Unigram**'
        posunigr = obj1.perp(wordSet2)
        print posunigr
        print '**Bigram**'
        posbigr = obj1.perpBi(nltk.bigrams(wordSet2))
        print posbigr
def hybrid_cfdist():
    sherlock_corpus = PlaintextCorpusReader(CORPUS_ROOT_SHERLOCK, '.*', encoding='utf-8')
    sherlock_bigrams = nltk.bigrams(sherlock_corpus.words())
    pokemon_corpus = PlaintextCorpusReader(CORPUS_ROOT_POKEMON, '.*', encoding='utf-8')
    pokemon_bigrams = nltk.bigrams(pokemon_corpus.words())
    # materialize the generators before concatenating (nltk.bigrams returns an iterator in NLTK 3)
    return nltk.ConditionalFreqDist(list(sherlock_bigrams) + list(pokemon_bigrams))
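If the two corpus-root constants point at real plain-text folders, the merged conditional frequency distribution can drive a simple greedy generator, in the style of the genesis example later in this listing; this is a sketch of my own, not the original author's code:

# Hypothetical usage: follow the most frequent successor for 10 steps.
cfd = hybrid_cfdist()
word = 'the'  # any word present in either corpus
for _ in range(10):
    print(word, end=' ')
    word = cfd[word].max()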
def wordlistfun(filename):
    minlength = 2
    lmtzr = nltk.stem.wordnet.WordNetLemmatizer()
    wordlist = []
    wordfreq = []
    hashlist = []
    hashfreq = []
    with open(filename, "r") as f:
        count_all = Counter()
        count_hash = Counter()
        count_only = Counter()
        count_bi = Counter()
        count_only2 = Counter()
        count_bigramonly = Counter()
        count_bigramstop = Counter()
        for line in f:
            try:
                tweet = json.loads(line)
                # Create a list with all the terms
                terms_stop = [term for term in preprocess(tweet["text"])
                              if term.lower() not in stop]
                # Update the counters
                terms_hash = [term for term in preprocess(tweet["text"])
                              if term.lower().startswith("#")]
                terms_only = [term for term in preprocess(tweet["text"])
                              if term.lower() not in stop
                              and not term.lower().startswith(("#", "@"))]
                # mind the ((double brackets)): startswith() takes a tuple
                # (not a list) if we pass several prefixes
                terms_only2 = [term.encode("unicode-escape") for term in preprocess(tweet["text"])
                               if term.lower() not in stop
                               and not term.lower().startswith(("#", "@"))
                               and not term.lower().startswith(("htt", r"\u"))
                               and term.lower() not in [r"(?:(?:\d+,?)+(?:\.?\d+)?)"]
                               and len(term) > minlength]
                terms_bigramstop = bigrams(terms_stop)
                terms_bigramonly = bigrams(terms_only2)
                count_all.update(terms_stop)
                count_hash.update(terms_hash)
                count_only.update(terms_only)
                count_only2.update(terms_only2)
                count_bigramonly.update(terms_bigramonly)
                count_bigramstop.update(terms_bigramstop)
            except:
                pass
    wordlist, wordfreq = zip(*count_only2.most_common())
    hashlist, hashfreq = zip(*count_hash.most_common())
    return wordlist, wordfreq, hashlist, hashfreq
def do_ir2(db, param):
    print 'Computing IR2', db, param, '...'

    def words(text):
        stopwords = set(nltk.corpus.stopwords.words('english'))
        return [w for w in nltk.word_tokenize(text.lower())
                if w not in string.punctuation and w not in stopwords]

    class BigramsCorpus:
        def __init__(self, db, collection):
            self.client = MongoClient()[db][collection]

        def __iter__(self):
            for doc in self.client.find():
                yield [doc['_id']]

        def __len__(self):
            return self.client.count()

    bigram_corpus = BigramsCorpus('cordis', 'bi_grams')
    bigrams = Dictionary(bigram_corpus)
    project = {'$project': {'_id': 0, 'title': 1, 'reference': 1}}
    a = [project]
    project_corpus = MongoCorpus('cordis', 'projects', aggregate=a)
    n = max(bigrams.keys())
    dataset = []
    for doc in project_corpus:
        temp = bigrams.doc2bow([' '.join(x) for x in nltk.bigrams(words(doc['title']))])
        x = [0] * (n + 1)
        for bi, _ in temp:
            x[bi] = 1
        dataset.append(x)
    alg = KMeans(n_clusters=int(param))
    alg.fit(dataset)
    clusters = defaultdict(list)
    for i, doc in enumerate(project_corpus):
        temp = bigrams.doc2bow([' '.join(x) for x in nltk.bigrams(words(doc['title']))])
        x = [0] * (n + 1)
        for bi, _ in temp:
            x[bi] = 1
        p = alg.predict([x])
        clusters[p[0]].append(doc['reference'])
    mongo_clusters = []
    for k, v in clusters.items():
        mongo_clusters.append({'cluster': k, 'projects': v})
    # Mongo raises this error here: InvalidDocument: Cannot encode object: 0
    print mongo_clusters
    # Save to a Mongo collection
    mongo = MongoClient()['g8']['ir2']
    mongo.insert_many(mongo_clusters)
    print 'Done!'
def how_is_often_used_in_text():
    from nltk.corpus import brown
    brown_learned_text = brown.words(categories="learned")
    print sorted(set(b for (a, b) in nltk.bigrams(brown_learned_text) if a == "often"))

    # or use the tagged words for the actual POS tags
    brown_learned_tagged = brown.tagged_words(categories="learned", simplify_tags=True)
    fd = nltk.FreqDist([b[1] for (a, b) in nltk.bigrams(brown_learned_tagged) if a[0] == "often"])
    fd.tabulate()
def extract_bigrams(articleList, commentCount):
    featureMatrix = np.zeros([commentCount, 100])

    index = 0
    stemmer = SnowballStemmer("english", ignore_stopwords=True)
    bagOfWords = []
    for art in articleList.items():
        for comm in art[1]:
            mywords = words(comm.body)
            mywords = known_words(mywords)
            # Remove stopwords
            filtered_words = [w for w in mywords if not w in stopwords.words('english')]
            # Stemming
            stemmed_words = [stemmer.stem(w) for w in filtered_words]
            bagOfWords += stemmed_words
            bagOfWords.append("\n")

    tempVector = dict()

    # Create your bigrams
    bgs = nltk.bigrams(bagOfWords)
    fdist = nltk.FreqDist(bgs)

    for k in fdist.keys()[:100]:
        tempVector[k] = 0

    theKeys = tempVector.keys()

    for art in articleList.items():
        for comm in art[1]:
            mywords = words(comm.body)
            mywords = known_words(mywords)
            # Remove stopwords
            filtered_words = [w for w in mywords if not w in stopwords.words('english')]
            # Stemming
            stemmed_words = [stemmer.stem(w) for w in filtered_words]
            bgs = nltk.bigrams(stemmed_words)
            for word in (w for w in bgs if tempVector.has_key(w)):
                keyInd = theKeys.index(word)
                featureMatrix[index][keyInd] += 1
            index += 1
            if index % 100 == 0:
                print "extracted", index, "features"
            if index >= commentCount:
                break

    print "non-zero", np.count_nonzero(featureMatrix)
    print "Percentage filled:%.2f" % (float(np.count_nonzero(featureMatrix)) / (featureMatrix.shape[0] * featureMatrix.shape[1]))
    return featureMatrix
def get_joint_entropy(string1, string2):
    first_bigram = list(nltk.bigrams(string1.lower()))
    second_bigram = list(nltk.bigrams(string2.lower()))
    combo = first_bigram + second_bigram
    bigram_dict = collections.Counter(combo)
    for i in bigram_dict:
        if i in first_bigram and i in second_bigram:
            value = float(bigram_dict[i]) / float(len(combo))
            yield value
def get_joint_entropy(string1, string2):
    bigram1 = list(nltk.bigrams(string1.lower()))
    bigram2 = list(nltk.bigrams(string2.lower()))
    combo = bigram1 + bigram2
    bigram_dict = collections.Counter(combo)
    for i in bigram_dict:
        if i in bigram1 and i in bigram2:
            value = float(bigram_dict[i]) / float(len(combo))
            yield value
def featureSets(data):
    # data accepted as (rating, list of words)
    fs = []
    for (r, words) in data:
        nicewords = [word.lower() for word in words
                     if not isStopWord(word) and not isPunctuation(word)]
        for bigram in nltk.bigrams(nicewords):
            fs.append((BigramClassifier.features(bigram), r))
    return fs
    # unreachable leftover from an earlier version:
    # return [(BigramClassifier.features(bigram), r) for bigram in nltk.bigrams(words)]
def bigrami(documents, dg1, gg1, dg2, gg2):
    bigram = []
    stopwords = nltk.corpus.stopwords.words('english')
    for i in range(dg1, gg1):
        bigram.append([w for w in bigrams(documents[i][0])])
    for i in range(dg2, gg2):
        bigram.append([w for w in bigrams(documents[i][0])])
    result = []
    [result.extend(w) for w in bigram]
    # keep only bigrams whose words are neither stopwords nor punctuation
    # (nije_interpunkcija means "is not punctuation")
    result = [w for w in result
              if w[0] not in stopwords and w[1] not in stopwords
              and w[0] and nije_interpunkcija(w[0]) and nije_interpunkcija(w[1])]
    result = nltk.FreqDist(result)
    return result.keys()
def joint_entropy(string1, string2):
    x = []
    bi1 = list(nltk.bigrams(string1.lower()))
    bi2 = list(nltk.bigrams(string2.lower()))
    combo = bi1 + bi2
    yes = list(set(combo))
    for i in yes:
        if i in bi1 and i in bi2:
            count = (float(bi1.count(i)) + float(bi2.count(i))) / float(len(combo))
            x.append(count)
    calc = sum(i * np.log2(i) for i in x) * -1
    return calc
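For a quick sanity check of `joint_entropy` (assuming `nltk` and `numpy as np` are imported as the snippet implies), two overlapping strings give a small positive value:

# Hypothetical inputs: ('h','e') and ('e','l') are the bigrams shared by both strings.
print(joint_entropy('hello', 'help'))  # ~1.03 bits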
def exercise_bigrams():
    sent = ["In", "the", "beginning", "God", "created", "the heaven", "and", "the earth"]
    print list(nltk.bigrams(sent))

    text = nltk.corpus.genesis.words("english-kjv.txt")
    bigrams = nltk.bigrams(text)
    cfd = nltk.ConditionalFreqDist(bigrams)
    # generate 15 words, always picking the most likely successor
    word = "living"
    for i in range(15):
        print word
        word = cfd[word].max()
def jacquard_bigram(query):
    final = []
    for a in file('enwiktionary.a.list'):
        a = a.rstrip()
        bigram = set(nltk.bigrams(a))
        q_bigram = set(nltk.bigrams(query))
        intersect = q_bigram.intersection(bigram)
        union = q_bigram.union(bigram)
        sim = float(len(intersect)) / len(union)
        final.append([a, sim])
    final_sorted = sorted(final, key=lambda sim: sim[1], reverse=True)
    print final_sorted[:10]
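Stripped of the file loop, the same character-bigram Jaccard measure can be checked on two spellings directly (my own sketch, built from the sets the function computes internally):

# Hypothetical inputs: Jaccard similarity on character bigrams.
a, b = set(nltk.bigrams('receive')), set(nltk.bigrams('recieve'))
print(float(len(a & b)) / len(a | b))  # 3 shared of 9 total -> ~0.33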
def main():
    OUT = open("../output.txt", "w")
    OUT.close()
    INP = open("../data/test.hyp1-hyp2-ref", "r")
    inp = INP.read()
    for sent in inp.split("\n")[:-1]:
        h1 = sent.split(" ||| ")[0].split(" ")
        h2 = sent.split(" ||| ")[1].split(" ")
        ref = sent.split(" ||| ")[2].split(" ")
        h1p = process(h1)
        h2p = process(h2)
        refp = process(ref)
        #print(h1c, h2c, refc)
        #h1_match = word_matches(h1, rset)
        #h2_match = word_matches(h2, rset)
        h1c = Counter(h1)
        h2c = Counter(h2)
        refc = Counter(ref)
        h1_bigrams = nltk.bigrams(h1)
        h2_bigrams = nltk.bigrams(h2)
        ref_bigrams = nltk.bigrams(ref)
        h1_trigrams = nltk.trigrams(h1)
        h2_trigrams = nltk.trigrams(h2)
        ref_trigrams = nltk.trigrams(ref)
        #print(h_bigrams, ref_bigrams)
        h1_bigramsc = Counter(h1_bigrams)
        h2_bigramsc = Counter(h2_bigrams)
        ref_bigramsc = Counter(ref_bigrams)
        h1_trigramsc = Counter(h1_trigrams)
        h2_trigramsc = Counter(h2_trigrams)
        ref_trigramsc = Counter(ref_trigrams)
        # pool unigram, bigram and trigram counts
        h1_allc = h1c + h1_bigramsc + h1_trigramsc
        h2_allc = h2c + h2_bigramsc + h2_trigramsc
        ref_allc = refc + ref_bigramsc + ref_trigramsc
        h1_precision = precision(h1_allc, ref_allc)
        h2_precision = precision(h2_allc, ref_allc)
        h1_recall = recall(h1_allc, ref_allc)
        h2_recall = recall(h2_allc, ref_allc)
        h1_meteor = meteor(h1_precision, h1_recall)
        h2_meteor = meteor(h2_precision, h2_recall)
        OUT = open("../output.txt", "a")
        if h1_meteor > h2_meteor:
            OUT.write("-1\n")
        elif h1_meteor == h2_meteor:
            OUT.write("0\n")
        else:
            OUT.write("1\n")
        OUT.close()
def construct_features(self, sentences, use_smoothing=True):
    print 'creating features...'
    if not use_smoothing:
        self.set_lambda(0)
    data = []
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    for i, sent in enumerate(sentences):
        print i
        term, tpos, posf, bterm, btpos, bposf = (0, 0, 0, 0, 0, 0)
        tokenized_tagged = nltk.pos_tag(nltk.word_tokenize(sent))
        for token, p in tokenized_tagged:
            # unigrams
            try:
                new_token = regex.sub(u'', token).decode('utf-8')
                if not new_token == u'' and not new_token in stopwords.words('english'):
                    term += self.Term_Freq[new_token] / self.N_Term
                    # I think we need a different normalizer here
                    posf += self.POS_Freq[p] / self.N_Term
                    tpos += self.Term_Freq[(new_token, p)] / self.N_Term
            except:
                pass
        # normalize with respect to sentence length
        term /= len(sent)
        posf /= len(sent)
        tpos /= len(sent)
        # bigrams
        words = [elem[0] for elem in tokenized_tagged]
        pos_tags = [elem[1] for elem in tokenized_tagged]
        b_words = nltk.bigrams(words)
        b_pos = nltk.bigrams(pos_tags)
        if len(b_words) > 0:
            for b_w, b_p in zip(b_words, b_pos):
                bterm += self.BTerm_Freq[b_w] / self.BN_Term
                bposf += self.BPOS_Freq[b_p] / self.BN_Term
                btpos += self.BTPOS_Freq[(b_w, b_p)] / self.BN_Term
            # normalize
            bterm /= len(b_words)
            bposf /= len(b_pos)
            btpos /= len(b_words)
        data.append([term, posf, tpos, bterm, bposf, btpos])
    return np.asarray(data)
def window_bigrams(problem):
    """Get the bigrams of window size 5; really don't know what size is good."""
    tokenized = nltk.tag.untag(problem.tagged)
    out = {}
    for index in problem.head_indices:
        upperbound = index + 5
        lowerbound = max(0, index - 5)
        words_after = tokenized[index + 1 : upperbound + 1]
        words_before = tokenized[lowerbound:index]
        bigrams_before = nltk.bigrams(words_before)
        bigrams_after = nltk.bigrams(words_after)
        bigrams_before.extend(bigrams_after)
        windowfeatures = dict([("wbigram({}&{})".format(w[0], w[1]), True) for w in bigrams_before])
        out.update(windowfeatures)
    return out
def window_bigrams_with_tags(problem):
    """Get the tagged bigrams, just three before and three after"""
    tagged = [nltk.tag.tuple2str(tup) for tup in problem.tagged]
    out = {}
    for index in problem.head_indices:
        ## window of WIDTH before
        lowerbound = max(0, index - 3)
        bigrams_before = nltk.bigrams(tagged[lowerbound:index])
        ## and WIDTH after
        upperbound = index + 3
        bigrams_after = nltk.bigrams(tagged[index + 1 : upperbound + 1])
        bigrams_before.extend(bigrams_after)
        windowfeatures = dict([("wbigram({}&{})".format(w[0], w[1]), True) for w in bigrams_before])
        out.update(windowfeatures)
    return out
def train(self, lFileList):
    """Trains the Naive Bayes Sentiment Classifier."""
    # lFileList = self.loop_files()
    # flag will be -1 for a negative word
    # flag will be 1 for a positive word
    # iterate through the files
    for sFilename in lFileList:
        # checks positive files
        if sFilename[7] == '5':
            # load and tokenize the file
            file = self.loadFile('movies_reviews/' + sFilename)
            tokens = self.tokenize(file)
            # extract bigrams
            bigrams_text = nltk.bigrams(tokens)
            # iterate through the bigrams
            for (w, x) in bigrams_text:
                # skip bigrams containing punctuation or stopwords
                if w.lower() not in punctuation_stopwords and x.lower() not in punctuation_stopwords:
                    if (w.lower(), x.lower()) not in self.poswordsfreq:
                        # first occurrence of this bigram
                        self.poswordsfreq[(w.lower(), x.lower())] = 1
                        self.total_positive += 1
                    else:
                        # bigram already in the dict: increment
                        self.poswordsfreq[(w.lower(), x.lower())] += 1
                        self.total_positive += 1
        # checks negative files
        elif sFilename[7] == '1':
            # load and tokenize the file
            file = self.loadFile('movies_reviews/' + sFilename)
            #print file
            tokens = self.tokenize(file)
            # extract bigrams
            bigrams_text = nltk.bigrams(tokens)
            #print bigrams_text
            # iterate through the bigrams
            for (w, x) in bigrams_text:
                # skip bigrams containing punctuation or stopwords
                if w.lower() not in punctuation_stopwords and x.lower() not in punctuation_stopwords:
                    #print sFilename
                    if (w.lower(), x.lower()) not in self.negwordsfreq:
                        # first occurrence of this bigram
                        self.negwordsfreq[(w.lower(), x.lower())] = 1
                        self.total_negative += 1
                    else:
                        # bigram already in the dict: increment
                        self.negwordsfreq[(w.lower(), x.lower())] += 1
                        self.total_negative += 1
def getRedCarpetInfo(tweets):
    bestDressed = dict()
    worstDressed = dict()
    mostTalkedAbout = dict()
    rivalries = dict()
    for tweet in tweets:
        if regExRivalries.search(tweet):
            filteredSentences = ' '.join(word for word in tweet.split()
                                         if word.lower() not in stop
                                         and word.lower() not in blacklistWords
                                         and word.lower() not in wordsToIgnoreRedCarpet)
            unigrams = wordTokenizer.tokenize(filteredSentences)
            for bigram in nltk.bigrams(unigrams):
                posTags = nltk.pos_tag(bigram)
                noun = 0
                for (data, tag) in posTags:
                    if tag == 'NNP':
                        noun += 1
                # two adjacent proper nouns are treated as a person's name
                if noun == 2:
                    name = "%s %s" % bigram
                    addToDictionary(name, rivalries)
        if regExBestDress.search(tweet):
            filteredSentences = ' '.join(word for word in tweet.split()
                                         if word.lower() not in stop
                                         and word.lower() not in blacklistWords
                                         and word.lower() not in wordsToIgnoreRedCarpet)
            unigrams = wordTokenizer.tokenize(filteredSentences)
            for bigram in nltk.bigrams(unigrams):
                posTags = nltk.pos_tag(bigram)
                noun = 0
                for (data, tag) in posTags:
                    if tag == 'NNP':
                        noun += 1
                if noun == 2:
                    name = "%s %s" % bigram
                    addToDictionary(name, bestDressed)
        if regExWorstDress.search(tweet):
            filteredSentences = ' '.join(word for word in tweet.split()
                                         if word.lower() not in stop
                                         and word.lower() not in blacklistWords
                                         and word.lower() not in wordsToIgnoreRedCarpet)
            unigrams = wordTokenizer.tokenize(filteredSentences)
            for bigram in nltk.bigrams(unigrams):
                posTags = nltk.pos_tag(bigram)
                noun = 0
                for (data, tag) in posTags:
                    if tag == 'NNP':
                        noun += 1
                if noun == 2:
                    name = "%s %s" % bigram
                    addToDictionary(name, worstDressed)
    getTopN(bestDressed, bestDressedList, 5)
    getTopN(worstDressed, worstDressedList, 5)
    getTopN(rivalries, rivalriesList, 2)
def worst_dressed(year):
    # returns up to 10 (but usually many fewer) of the worst dressed attendees
    worst_pat = re.compile(r"worst dressed", re.I)
    worst_dressed_tweets = []
    if year == 2013 or year == "2013":
        for text in tweets_2013_texts:
            if worst_pat.search(text):
                worst_dressed_tweets.append(text)
    elif year == 2015 or year == "2015":
        for text in tweets_2015_texts:
            if worst_pat.search(text):
                worst_dressed_tweets.append(text)
    else:
        pass
    possible_worst_bigrams = Counter()
    for tweet in worst_dressed_tweets:
        tweet = tweet.translate(translate_table)
        tokens = nltk.word_tokenize(tweet)
        bigrams = nltk.bigrams(tokens)
        for bigram in bigrams:
            if bigram[0].istitle() and bigram[1].istitle():
                possible_worst_bigrams[bigram] += 1
    p_worst_dressed = possible_worst_bigrams.most_common(10)
    worst_dressed = []
    for i in p_worst_dressed:
        if i[0][0] not in red_stop and i[0][1] not in red_stop:
            worst_dressed.append(i[0][0] + " " + i[0][1])
    return worst_dressed
def best_dressed(year):
    # returns up to 10 of the best dressed attendees
    best_pat = re.compile(r"best dressed", re.I)
    best_dressed_tweets = []
    if year == 2013 or year == "2013":
        for text in tweets_2013_texts:
            if best_pat.search(text):
                best_dressed_tweets.append(text)
    elif year == 2015 or year == "2015":
        for text in tweets_2015_texts:
            if best_pat.search(text):
                best_dressed_tweets.append(text)
    else:
        pass
    possible_best_bigrams = Counter()
    for tweet in best_dressed_tweets:
        tweet = tweet.translate(translate_table)
        tokens = nltk.word_tokenize(tweet)
        bigrams = nltk.bigrams(tokens)
        for bigram in bigrams:
            if bigram[0].istitle() and bigram[1].istitle():
                possible_best_bigrams[bigram] += 1
    p_best_dressed = possible_best_bigrams.most_common(10)
    best_dressed = []
    for i in p_best_dressed:
        # keep names found in either name list, unless stoplisted
        if (i[0][0] in female_names or i[0][0] in male_names) and i[0][0] not in red_stop:
            best_dressed.append(i[0][0] + " " + i[0][1])
    return best_dressed
def ngramify(self, word_list, stop):
    # creates an ngram from a word_list based on class settings
    mode = self.mode
    pos = self.inclued_pos
    word = self.include_word
    stopset = set(stopwords.words("english"))
    stopset.remove("not")
    if stop:
        if word and pos:
            selection = [(w.lower(), p) for w, p in word_list if w.lower() not in stopset]
        elif word:
            selection = [w.lower() for w, p in word_list if w.lower() not in stopset]
        elif pos:
            selection = [p for w, p in word_list if w.lower() not in stopset]
    else:
        if word and pos:
            selection = [(w.lower(), p) for w, p in word_list]
        elif word:
            selection = [w.lower() for w, p in word_list]
        elif pos:
            selection = [p for w, p in word_list]
    if mode == "unigrams":
        word_list = selection
    elif mode == "bigrams":
        word_list = nltk.bigrams(selection)
    elif mode == "trigrams":
        word_list = nltk.trigrams(selection)
    return word_list
# Generate plot grid
fig, ax = plt.subplots(*grid, figsize=(15, 10))
rows, cols = grid

# Extract word clouds from clusters
for index, label in enumerate(np.unique(cluster_data[f"Labels_{c}"])):
    # Get the full text to be analyzed
    text = " ".join(cluster_data[cluster_data[f"Labels_{c}"] == label]["Obiettivo / Motivazione"].map(lemmatizer))

    # Extract unigram and bigram frequencies from the text as a dictionary
    tokenizer = RegexpTokenizer(r'\w+')
    sent_words = tokenizer.tokenize(text)
    freq_monogram = FreqDist(sent_words)
    freq_bigram = FreqDist(bigrams(sent_words))
    dict_monogram = dict(freq_monogram)
    dict_bigram = {" ".join(k): v for k, v in dict(freq_bigram).items()}
    dict_token = {**dict_monogram, **dict_bigram}

    # Keep only valid words, remove stopwords, and store the frequencies for analysis
    clean_dict = {k: v for k, v in dict_token.items() if validate_token(k, enriched_stopwords)}
    freq_df = pd.DataFrame([(k, v) for k, v in clean_dict.items()], columns=["ngram", "freq"]).nlargest(100, ['freq'])
    freq_df.to_csv(f"frequencies/frequencies_{c}_{label}.csv")

    # Generate and plot the word cloud
    wordcloud = WordCloud(stopwords=enriched_stopwords, background_color="white").generate_from_frequencies(clean_dict)
    if c == 2:
        ax[index % c].imshow(wordcloud, interpolation='bilinear')
        ax[index % c].axis("off")
def go_pos_context(pos_model_path, meter_model_path, corpus_path):
    from nltk import bigrams

    #line = 'Laut zerspringt der Weiherspiegel.'
    #print(line)
    pos_model = joblib.load(pos_model_path)
    meter_model = joblib.load(meter_model_path)
    corpus = json.load(open(corpus_path, 'r'))
    #get_pos_meter_mapping(pos_model, meter_model, line)
    pos_dict = {}
    counter = 0
    for idx, doc in corpus.items():
        counter += 1
        if counter > 20000:
            break
        lines = doc['lines']
        for line in lines:
            mp = get_pos_meter_mapping(pos_model, meter_model, line)
            # count, for each POS-tag pair, the meter of the second element
            for tuple1, tuple2 in bigrams(mp):
                pos1 = tuple1[0]
                meter1 = tuple1[1]
                pos2 = tuple2[0]
                meter2 = tuple2[1]
                #print(pos, meter)
                cnt = pos_dict.setdefault("_".join([pos1, pos2]), Counter())
                cnt[meter2] += 1
    #print(pos_dict)
    ranking = []
    for pos, contours in pos_dict.items():
        ps = pos.split('_')
        pos1 = ps[0]
        pos2 = ps[1]
        print(pos1, pos2, contours)
        plus1 = 1
        minus1 = 1
        plus2 = 1
        minus2 = 1
        amphi = 1
        dibrach = 1
        spondee = 1
        for c in contours:
            if len(c) == 1:
                if pos2 == 'VM':
                    print(pos1, pos2, c, contours[c])
                if c[0] == '+':
                    plus1 += float(contours[c])
                elif c[0] == '-':
                    minus1 += float(contours[c])
            #if len(c) > 1:
            #    print(pos, c, contours[c])
            #    for prom in c:
            #        if prom == '+':
            #            plus2 += float(contours[c]/len(c))
            #        if prom == '-':
            #            minus2 += float(contours[c]/len(c))
            #if len(c) == 2:
            #    print(pos, c, c[0], contours[c])
            #    if c[0] == '+' and c[1] == '-':
            #        plus2 = float(contours[c])
            #    elif c[0] == '-' and c[1] == '+':
            #        minus2 = float(contours[c])
            #    elif c[0] == '-' and c[1] == '-':
            #        dibrach = float(contours[c])
            #    elif c[0] == '+' and c[1] == '+':
            #        spondee = float(contours[c])
            #if len(c) == 3:
            #    print(pos, c, c[0], contours[c])
            #    if c[0] == '+' and c[1] == '-' and c[2] == '-':
            #        plus = float(contours[c])
            #    elif c[0] == '-' and c[1] == '-' and c[2] == '+':
            #        minus = float(contours[c])
            #    elif c[0] == '-' and c[1] == '+' and c[2] == '-':
            #        amphi = float(contours[c])
        if pos2 == 'VM':
            plus = plus1 + plus2
            minus = minus1 + minus2
            print("_".join([pos1, pos2]), round(plus, 2), round(minus, 2))
            ranking.append((round(plus / minus, 2), "_".join([pos1, pos2])))
        #einsilber = plus1/minus1
        #zweisilber = (plus2+minus2+spondee)/dibrach
        #print(pos, round(einsilber, 2), round(zweisilber, 2))
        #ranking.append((round(einsilber, 2), pos))
    s = sorted(ranking)
    s.reverse()
    print(s)
import nltk
from nltk.probability import *
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import codecs
import sys

# from urllib import request
# url = 'https://www.gutenberg.org/cache/epub/2707/pg2707.txt'
# raw = response.read().decode('utf8')

f = codecs.open('text.txt', encoding='utf8')
lines = f.readlines()
all_text = ' '.join(lines).lower()

tokenizer = nltk.RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(all_text)

sys.stderr.write('Finding bigrams...' + '\n')
bigrams = nltk.bigrams(tokens)
for b in bigrams:
    print(b)

#tokens = nltk.word_tokenize(all_text)
english_stopwords = stopwords.words('english')
stopwords_set = set(english_stopwords)
filtered_tokens = [w for w in tokens if w not in stopwords_set]

lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(w) for w in filtered_tokens]

#print(len(tokens))
fd = nltk.FreqDist(filtered_tokens)
print(fd.most_common(10))
print(len(tokens))
print(len(lemmatized))
args = parser.parse_args()
FILE_LENGTH = args.flen
STOPWORDS = set(stopwords.words('english') + list(string.punctuation))
stemmer = SnowballStemmer("english")

with open(args.input, 'r') as f, open(f'{args.output}', 'w') as outp:
    for line in tqdm(f, total=FILE_LENGTH, mininterval=10.0, maxinterval=20.0):
        raw = json.loads(line)
        doc = {}
        doc["id"] = raw["_id"]
        doc["contents"] = "".join(raw["text"])
        if args.bigrams:
            tokens = filter(lambda word: word.lower() not in STOPWORDS,
                            word_tokenize(doc["contents"]))
            if args.stem:
                tokens = map(stemmer.stem, tokens)
            bigram_doc = bigrams(tokens)
            bigram_doc = " ".join(["".join(bigram) for bigram in bigram_doc])
            doc["contents"] += " " + bigram_doc
        doc["wikipedia_id"] = raw["wikipedia_id"]
        doc["wikipedia_title"] = raw["wikipedia_title"]
        doc["categories"] = raw["categories"]
        _ = outp.write(json.dumps(doc))
        _ = outp.write('\n')
def addNGrams(search_query_performance_df, columns):
    if functions.dfIsEmpty(search_query_performance_df):
        return

    n_gram_dict = {}
    for i, row in search_query_performance_df.fillna(0).iterrows():
        impressions = float(row['impressions'])
        clicks = float(row['clicks'])
        conversions = float(row['conversions'])
        conversion_value = float(row['conversion_value'])
        cost = float(row['cost'])
        text = (row['query'])

        # tidy up punctuation
        puncts = [",", ".", "!", "?", ":"]
        for punct in puncts:
            text = text.replace(punct, "")

        text = word_tokenize(text)

        bigram = bigrams(text)
        bigram_vec = []
        for gram in bigram:
            bigram_vec.append(gram)

        trigram = trigrams(text)
        trigram_vec = []
        for gram in trigram:
            trigram_vec.append(gram)

        total_gram_vec = bigram_vec + trigram_vec

        for gram in total_gram_vec:
            if gram not in n_gram_dict.keys():
                n_gram_dict[gram] = {'impressions': impressions,
                                     # 'avg_pos_mult': impressions * avg_pos,
                                     'gram_count': 1,
                                     'clicks': clicks,
                                     'cost': cost,
                                     'conversions': conversions,
                                     'conversion_value': conversion_value}
            else:
                n_gram_dict[gram]['impressions'] += impressions
                # n_gram_dict[gram]['avg_pos_mult'] += impressions * avg_pos
                n_gram_dict[gram]['gram_count'] += 1
                n_gram_dict[gram]['clicks'] += clicks
                n_gram_dict[gram]['cost'] += cost
                n_gram_dict[gram]['conversions'] += conversions
                n_gram_dict[gram]['conversion_value'] += conversion_value

    ### compute averages and statistics per n-gram
    n_gram_df_data = {}
    for gram in n_gram_dict.keys():
        impressions = n_gram_dict[gram]['impressions']
        count = n_gram_dict[gram]['gram_count']
        # avg_pos = n_gram_dict[gram]['avg_pos_mult'] / count
        clicks = n_gram_dict[gram]['clicks']
        conversions = n_gram_dict[gram]['conversions']
        cost = n_gram_dict[gram]['cost']
        conversion_value = n_gram_dict[gram]['conversion_value']

        try:
            cpa = cost / conversions
        except ZeroDivisionError:
            cpa = 0
        try:
            roas = conversion_value / cost
        except ZeroDivisionError:
            roas = 0
        try:
            ctr = clicks / impressions
        except ZeroDivisionError:
            ctr = 0
        try:
            conversion_rate = conversions / clicks
        except ZeroDivisionError:
            conversion_rate = 0
        try:
            average_cpc = cost / clicks
        except ZeroDivisionError:
            average_cpc = 0

        # standard error of the CTR from the click/no-click variance
        if clicks != 0 and clicks != 1:
            std = np.sqrt(clicks * (1 - ctr) ** 2 +
                          (impressions - clicks) * ctr ** 2) / (impressions - 1)
            standard_error = std / np.sqrt(impressions)
        else:
            standard_error = 0
        min_result = ctr - standard_error * 2
        max_result = ctr + standard_error * 2

        n_gram_df_data[gram] = {'n_gram_count': count, 'impressions': impressions,
                                'ctr': ctr, 'conversion_rate': conversion_rate,
                                'average_cpc': average_cpc, 'ctr_significance': standard_error,
                                'conversions': conversions, 'cost': cost,
                                'conversion_value': conversion_value, 'cpa': cpa,
                                'roas': roas, 'clicks': clicks}

    df = pd.DataFrame(n_gram_df_data)
    df = df.T
    df["ctr_significance"] = df["ctr_significance"].replace(r'^\s*$', 0, regex=True).astype("float")
    return df
import nltk
from nltk.corpus import treebank

treebank_tagged = treebank.tagged_words(tagset='universal')
tagpairs = nltk.bigrams(treebank_tagged)
preceders_noun = [x[1] for (x, y) in tagpairs if y[1] == 'NOUN']
freqdist = nltk.FreqDist(preceders_noun)
print([tag for (tag, _) in freqdist.most_common()])
## Create collocations with intervening words (gapped n-grams)
finder = BigramCollocationFinder.from_words(brown.words(), window_size=2)
finder.apply_word_filter(lambda x: not x.isalpha())
finder.apply_freq_filter(10)
finder.nbest(bigram_measures.pmi, 10)

## Finders
scored = finder.score_ngrams(bigram_measures.raw_freq)
scored[:10]

```{note}
How to get the document frequency of the bigrams???
```

unigram_freq = nltk.FreqDist(brown.words())
bigram_freq = nltk.FreqDist('_'.join(x) for x in nltk.bigrams(brown.words()))

unigram_freq_per_file = [nltk.FreqDist(words)
                         for words in [brown.words(fileids=f) for f in brown.fileids()]]
bigram_freq_per_file = [nltk.FreqDist('_'.join(x) for x in nltk.bigrams(words))
                        for words in [brown.words(fileids=f) for f in brown.fileids()]]

## Function to get unigram dispersion
def createUnigramDipsersionDist(uni_freq, uni_freq_per_file):
    len(uni_freq_per_file)
    unigram_dispersion = {}
    for fid in uni_freq_per_file:
        for w, f in fid.items():
            if w in unigram_dispersion:
                unigram_dispersion[w] += 1
def getTrainingAndTestData(tweets, K, k, method, feature_set):
    add_ngram_feat = feature_set.get('ngram', 1)
    add_negtn_feat = feature_set.get('negtn', False)

    from functools import wraps

    procTweets = [(processAll(text, subject=subj, query=quer), sent)
                  for (text, sent, subj, quer) in tweets]

    stemmer = nltk.stem.PorterStemmer()

    all_tweets = []  # DATADICT: all_tweets = [ (words, sentiment), ... ]
    for (text, sentiment) in procTweets:
        words = [word if (word[0:2] == '__') else word.lower()
                 for word in text.split()
                 if len(word) >= 3]
        words = [stemmer.stem(w) for w in words]  # DATADICT: words = [ 'word1', 'word2', ... ]
        all_tweets.append((words, sentiment))

    # train_tweets = all_tweets[:int(len(all_tweets)*ratio)]  # DATADICT: train_tweets = [ (words, sentiment), ... ]
    # test_tweets = all_tweets[int(len(all_tweets)*ratio):]   # DATADICT: test_tweets = [ (words, sentiment), ... ]
    train_tweets = [x for i, x in enumerate(all_tweets) if i % K != k]
    test_tweets = [x for i, x in enumerate(all_tweets) if i % K == k]

    unigrams_fd = nltk.FreqDist()
    if add_ngram_feat > 1:
        n_grams_fd = nltk.FreqDist()

    for (words, sentiment) in train_tweets:
        words_uni = words
        unigrams_fd.update(words)

        if add_ngram_feat >= 2:
            words_bi = [','.join(map(str, bg)) for bg in nltk.bigrams(words)]
            n_grams_fd.update(words_bi)

        if add_ngram_feat >= 3:
            words_tri = [','.join(map(str, tg)) for tg in nltk.trigrams(words)]
            n_grams_fd.update(words_tri)

    sys.stderr.write('\nlen( unigrams ) = ' + str(len(unigrams_fd.keys())))

    #unigrams_sorted = nltk.FreqDist(unigrams).keys()
    unigrams_sorted = unigrams_fd.keys()
    #bigrams_sorted = nltk.FreqDist(bigrams).keys()
    #trigrams_sorted = nltk.FreqDist(trigrams).keys()

    if add_ngram_feat > 1:
        sys.stderr.write('\nlen( n_grams ) = ' + str(len(n_grams_fd)))
        ngrams_sorted = [k for (k, v) in n_grams_fd.items() if v > 1]
        sys.stderr.write('\nlen( ngrams_sorted ) = ' + str(len(ngrams_sorted)))

    def get_word_features(words):
        bag = {}
        words_uni = ['has(%s)' % ug for ug in words]

        if (add_ngram_feat >= 2):
            words_bi = ['has(%s)' % ','.join(map(str, bg)) for bg in nltk.bigrams(words)]
        else:
            words_bi = []

        if (add_ngram_feat >= 3):
            words_tri = ['has(%s)' % ','.join(map(str, tg)) for tg in nltk.trigrams(words)]
        else:
            words_tri = []

        for f in words_uni + words_bi + words_tri:
            bag[f] = 1

        #bag = collections.Counter(words_uni+words_bi+words_tri)
        return bag

    negtn_regex = re.compile(r"""(?:
        ^(?:never|no|nothing|nowhere|noone|none|not|
            havent|hasnt|hadnt|cant|couldnt|shouldnt|
            wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint
        )$
    )
    |
    n't""", re.X)

    def get_negation_features(words):
        INF = 0.0
        negtn = [bool(negtn_regex.search(w)) for w in words]

        # left-to-right decaying negation scores
        left = [0.0] * len(words)
        prev = 0.0
        for i in range(0, len(words)):
            if (negtn[i]):
                prev = 1.0
            left[i] = prev
            prev = max(0.0, prev - 0.1)

        # right-to-left decaying negation scores
        right = [0.0] * len(words)
        prev = 0.0
        for i in reversed(range(0, len(words))):
            if (negtn[i]):
                prev = 1.0
            right[i] = prev
            prev = max(0.0, prev - 0.1)

        return dict(zip(['neg_l(' + w + ')' for w in words] +
                        ['neg_r(' + w + ')' for w in words],
                        left + right))

    def counter(func):  # http://stackoverflow.com/questions/13512391/to-count-no-times-a-function-is-called
        @wraps(func)
        def tmp(*args, **kwargs):
            tmp.count += 1
            return func(*args, **kwargs)
        tmp.count = 0
        return tmp

    @counter  # http://stackoverflow.com/questions/13512391/to-count-no-times-a-function-is-called
    def extract_features(words):
        features = {}

        word_features = get_word_features(words)
        features.update(word_features)

        if add_negtn_feat:
            negation_features = get_negation_features(words)
            features.update(negation_features)

        sys.stderr.write('\rfeatures extracted for ' + str(extract_features.count) + ' tweets')
        return features

    extract_features.count = 0

    if ('1step' == method):
        # Apply NLTK's Lazy Map
        v_train = nltk.classify.apply_features(extract_features, train_tweets)
        v_test = nltk.classify.apply_features(extract_features, test_tweets)
        return (v_train, v_test)

    elif ('2step' == method):
        isObj = lambda sent: sent in ['neg', 'pos']
        makeObj = lambda sent: 'obj' if isObj(sent) else sent

        train_tweets_obj = [(words, makeObj(sent)) for (words, sent) in train_tweets]
        test_tweets_obj = [(words, makeObj(sent)) for (words, sent) in test_tweets]

        train_tweets_sen = [(words, sent) for (words, sent) in train_tweets if isObj(sent)]
        test_tweets_sen = [(words, sent) for (words, sent) in test_tweets if isObj(sent)]

        v_train_obj = nltk.classify.apply_features(extract_features, train_tweets_obj)
        v_train_sen = nltk.classify.apply_features(extract_features, train_tweets_sen)
        v_test_obj = nltk.classify.apply_features(extract_features, test_tweets_obj)
        v_test_sen = nltk.classify.apply_features(extract_features, test_tweets_sen)

        test_truth = [sent for (words, sent) in test_tweets]

        return (v_train_obj, v_train_sen, v_test_obj, v_test_sen, test_truth)

    else:
        return nltk.classify.apply_features(extract_features, all_tweets)
term_freq_hash = counting_terms(term_type='terms_hash', fname='listener_results.json', nwords=10)

# COUNTING JUST TERMS
term_freq_only = counting_terms(term_type='terms_only', fname='listener_results.json', nwords=10)

###################### BIGRAMS ######################
# CREATE BIGRAMS
tokens = df['text_clean_ngrams'].apply(nltk.word_tokenize)
# Flatten the nested list
flat_tokens = [term for sublist in tokens for term in sublist]
bgs = nltk.bigrams(flat_tokens)

# FREQUENCY DISTRIBUTION FOR ALL BIGRAMS
fdist = nltk.FreqDist(bgs)
for k, v in fdist.items():
    print(k, v)
fdist_10 = fdist.most_common(10)
print(fdist_10)

# CONVERT TO DF AND SORT
labels = ['bigram', 'Weight']
df_bigrams = pd.DataFrame([tuple_item for tuple_item in fdist.items()], columns=labels)
df_bigrams[['Source_Name', 'Target_Name']] = pd.DataFrame([tuple_item for tuple_item in df_bigrams.bigram])
def best_word_features_com(words, best_words):
    d1 = dict([(word, True) for word in words if word in best_words])
    d2 = dict([(word, True) for word in nltk.bigrams(words) if word in best_words])
    # note: dict(d1, **d2) only accepts d2's tuple keys on CPython 2;
    # on Python 3 use d3 = dict(d1); d3.update(d2)
    d3 = dict(d1, **d2)
    return d3
if os.path.exists('fdist.pkl'):  # assumed condition: load the cached distribution when present
    f = open('fdist.pkl', 'rb')
    fdist = pickle.load(f)
    f.close()
else:
    f = open('lyrics.pkl', 'rb')
    data = pickle.load(f)
    f.close()
    lis = []
    cnt = 0
    for _ in data:
        js = [__ for __ in data[_]]
        lis += js
        cnt += 1
        print(cnt)
    bigram = list(nltk.bigrams(lis))
    fdist = nltk.ConditionalFreqDist(bigram)
    f = open('fdist.pkl', 'wb')
    pickle.dump(fdist, f, -1)
    f.close()

f = open('test_data.pkl', 'rb')
test_data = pickle.load(f)
f.close()

Ans = []
cnt = 0
for in_data, out_data in test_data:
    ans = ''.join(in_data)
    bg = in_data[-1][-1]
def find_nltk_bigrams(my_str):
    split_str = my_str.split()
    bigram_str = bigrams(split_str)
    return [item for item in bigram_str]
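A one-line check with a hypothetical input:

print(find_nltk_bigrams("the quick brown fox"))
# [('the', 'quick'), ('quick', 'brown'), ('brown', 'fox')]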
filtered_sentence = [w for w in word_tokens if not w in stop_words]  # assumed assignment target; the start of this line was cut off
text = "".join([w for w in str(filtered_sentence) if w not in string.punctuation])
word_tokens = re.split('\W+', text)
for w in word_tokens:
    #print(ps.stem(w))
    #stem_text = stem_text.join([ps.stem(w)])
    stem_text = stem_text + ps.lemmatize(w) + " "
p["filtered_sentence"].iloc[i] = stem_text
word2vec_tokenize = word_tokenize(p["filtered_sentence"].iloc[i])

#%% bigrams of a single document
mystring = p.iloc[i, 4]
msystring = mystring.split(" ")
list(nltk.bigrams(msystring))

#%% concatenate all documents
megastring = ""
for i in range(len(p)):
    megastring = megastring + str(p.iloc[i, 4]) + ""

#%% collocations
from nltk.collocations import BigramCollocationFinder

def bi(text):
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(word_tokenize(text))
    finder.apply_freq_filter(3)
    finder.nbest(bigram_measures.pmi, 5)
    return finder.ngram_fd.items()
# Group by stemmed word
stem_word_index.setdefault(stemmed_word, [])
stem_word_index[stemmed_word].append(word)

# Calculate coefficients ==================================================================================
coef_threshold = 0.0
dice_stemmed_word_data = []
mim_stemmed_word_data = []
emim_stemmed_word_data = []
chi_sqr_stemmed_word_data = []
counter = 0
for stemmed_word, words in stem_word_index.items():
    # create bigrams from the words sharing this stem
    bigrams = list(nltk.bigrams(words))
    for word_a, word_b in bigrams:
        # Look up the filenames containing each word in word_files_index
        files_a = word_files_index[word_a]
        files_b = word_files_index[word_b]
        files_a_sliced_b = list(set(files_b) & set(files_a))

        # Using the Dice coefficient
        dice_coef = float(len(files_a_sliced_b)) / (len(files_a) + len(files_b))
        if (dice_coef > coef_threshold):
            dice_stemmed_word_data.append((stemmed_word, word_a, word_b, dice_coef))

        # Using the MIM coefficient
        mim_coef = float(len(files_a_sliced_b)) / (len(files_a) * len(files_b))
import nltk
from nltk import bigrams
from nltk.tokenize import word_tokenize

f = open("sample.txt", "r")
dataf = f.read().replace('\n', ' ')
delimiters = ['(', ')', ';', ',', '.', '/']
for i in delimiters:
    dataf = dataf.replace(i, '')
print(dataf)
data = word_tokenize(dataf)
bigrams = list(bigrams(data))
print(bigrams)
print('\n')
# # By default a FreqDist is not sorted.
# print(list(freq_brown.keys())[:20])
#
# # If we sort it and print it, it will give the top words, but without the frequencies
# fdist1 = sorted(freq_brown, key=freq_brown.__getitem__, reverse=True)
# print(fdist1[0:20])
#
# # prints the most common words with frequency; same result as the previous one, with frequency
# print(freq_brown.most_common(20))

# An nltk.ConditionalFreqDist() counts frequencies of pairs.
# When given a list of bigrams, it maps each first word of a bigram
# to a FreqDist over the second words of the bigram.
cfreq_brown_2gram = nltk.ConditionalFreqDist(nltk.bigrams(brown.words()))
# print(cfreq_brown_2gram)

# conditions() in a ConditionalFreqDist are like keys() in a dictionary
# print(cfreq_brown_2gram.conditions())

# the cfreq_brown_2gram entry for "my" is a FreqDist.
# print(cfreq_brown_2gram["my"])

# here are the words that can follow after "my".
# We first access the FreqDist associated with "my",
# then the keys in that FreqDist
# print(cfreq_brown_2gram["my"].keys())

# here are the 20 most frequent words to come after "my", with their frequencies
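The snippet breaks off before the final call; given the FreqDist API used above, the natural continuation (my completion, kept commented like the rest) would be:

# print(cfreq_brown_2gram["my"].most_common(20))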
def main(file1, file2):
    file1_input = codecs.open(file1, "r", "utf-8")  # opens "file1", read-only ("r"), in "utf-8" encoding
    file2_input = codecs.open(file2, "r", "utf-8")  # opens "file2", read-only ("r"), in "utf-8" encoding
    nome1 = splitterFileName(file1_input)  # file name without the extension
    nome2 = splitterFileName(file2_input)
    riga1 = file1_input.read()
    riga2 = file2_input.read()
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')  # model used to split the file into sentences
    frasi_file1 = sent_tokenizer.tokenize(riga1)  # sentences of file1
    frasi_file2 = sent_tokenizer.tokenize(riga2)  # sentences of file2
    lunghezza_corpus_file1, listaToken_file1, POS_token_tag_file1 = CorpusTokensPOS(frasi_file1)  # corpus length, tokens and POS tags
    lunghezza_corpus_file2, listaToken_file2, POS_token_tag_file2 = CorpusTokensPOS(frasi_file2)
    lista_Solo_POS_file1 = estraiSoloTagPOS(POS_token_tag_file1)  # extracts only the PoS tags, without the tokens
    lista_Solo_POS_file2 = estraiSoloTagPOS(POS_token_tag_file2)
    token_20_file1, aggettivi_20_file1, verbi_20_file1, POS_10_file1, trigrammi_10_file1 = analisiFrequenze(
        listaToken_file1, POS_token_tag_file1, lista_Solo_POS_file1)  # frequency analysis
    token_20_file2, aggettivi_20_file2, verbi_20_file2, POS_10_file2, trigrammi_10_file2 = analisiFrequenze(
        listaToken_file2, POS_token_tag_file2, lista_Solo_POS_file2)
    bigrammi_POS_file1 = bigrams(lista_Solo_POS_file1)  # extracts pairs (<token-PoS, token-PoS>)
    bigrammi_POS_file2 = bigrams(lista_Solo_POS_file2)
    lista_10_bigrammi_PCong_file1, lista_10_bigrammi_PCond_file1 = probabilita(
        lista_Solo_POS_file1, bigrammi_POS_file1)  # computes the probabilities
    lista_10_bigrammi_PCong_file2, lista_10_bigrammi_PCond_file2 = probabilita(
        lista_Solo_POS_file2, bigrammi_POS_file2)
    lista_SOST, lista_SOST_uguali = listaAggSost(POS_token_tag_file1, POS_token_tag_file2)
    LMI_AGG_SOST_1 = calcolaLMI(listaToken_file1, POS_token_tag_file1, lista_SOST)
    LMI_AGG_SOST_2 = calcolaLMI(listaToken_file2, POS_token_tag_file2, lista_SOST)
    anlisi_linguistica_file1 = estraiEnitaNominateDiLuoghi(POS_token_tag_file1)  # named-entity analysis
    anlisi_linguistica_file2 = estraiEnitaNominateDiLuoghi(POS_token_tag_file2)
    luoghi_10_file1 = nltk.FreqDist(anlisi_linguistica_file1["GPE"]).most_common(20)  # extracts the 20 most frequent place names
    luoghi_10_file2 = nltk.FreqDist(anlisi_linguistica_file2["GPE"]).most_common(20)

    # ANALYSIS RESULTS **********************************
    # 20 TOKENS
    print "\nThe comparison is carried out on two corpora (", nome1, ".txt ,", nome2, ".txt) which contain: blogs written by male authors and blogs written by female authors.\n"

    # LIST = [frequency, token]
    print "\n\n- THE 20 MOST FREQUENT TOKENS (NO PUNCTUATION) -\n"
    print nome1, "\t\t\t\t\t\t", nome2
    for elemento1, elemento2 in zip(token_20_file1, token_20_file2):
        print " Token --> %-20s Freq --> %-20s" % (elemento1[1], elemento1[0]), "Token --> %-20s Freq --> %-20s" % (elemento2[1], elemento2[0])

    # ADJECTIVES
    # LIST = [frequency, token]
    print "\n\n- THE 20 MOST FREQUENT ADJECTIVES -\n"
    print nome1, "\t\t\t\t\t\t", nome2
    for elemento1, elemento2 in zip(aggettivi_20_file1, aggettivi_20_file2):
        print " Token --> %-20s Freq --> %-20s" % (elemento1[1][0], elemento1[0]), "Token --> %-20s Freq --> %-20s" % (elemento2[1][0], elemento2[0])

    # VERBS
    # LIST = [frequency, token]
    print "\n\n- THE 20 MOST FREQUENT VERBS -\n"
    print nome1, "\t\t\t\t\t\t", nome2
    for elemento1, elemento2 in zip(verbi_20_file1, verbi_20_file2):
        print " Token --> %-20s Freq --> %-20s" % (elemento1[1][0], elemento1[0]), "Token --> %-20s Freq --> %-20s" % (elemento2[1][0], elemento2[0])

    # 10 POS TAGS
    # LIST = [frequency, POS]
    print "\n\n- THE 10 MOST FREQUENT POS TAGS -\n"
    print nome1, "\t\t\t\t\t\t", nome2
    for elemento1, elemento2 in zip(POS_10_file1, POS_10_file2):
        print " PoS --> %-20s Freq --> %-20s" % (elemento1[1], elemento1[0]), "Token --> %-20s Freq --> %-20s" % (elemento2[1], elemento2[0])

    # 10 POS-TAG TRIGRAMS
    # LIST = [frequency, [POS, POS, POS]]
    print "\n\n- THE 10 MOST FREQUENT POS-TAG TRIGRAMS -\n"
    print nome1, "\t\t\t\t\t\t\t\t", nome2
    for elemento1, elemento2 in zip(trigrammi_10_file1, trigrammi_10_file2):
        print " Trigram --> %-3s - %-3s - %-20s Freq --> %-20s" % (elemento1[1][0], elemento1[1][1], elemento1[1][2], elemento1[0]), " Trigram --> %-3s - %-3s - %-20s Freq --> %-20s" % (elemento2[1][0], elemento2[1][1], elemento2[1][2], elemento2[0])

    # 10 BIGRAMS + PROBABILITY
    # LIST = [probability, [POS, POS]]
    # JOINT
    print "\n\n- THE 10 POS-TAG BIGRAMS WITH MAXIMUM JOINT PROBABILITY -\n"
    print nome1, "\t\t\t\t\t\t\t\t\t", nome2
    for elemento1, elemento2 in zip(lista_10_bigrammi_PCong_file1, lista_10_bigrammi_PCong_file2):
        print " Bigram --> %-3s - %-20s Joint P. --> %-0s %-20s" % (elemento1[1][0], elemento1[1][1], "%1.2f" % elemento1[0], "%"), " Bigram --> %-3s - %-20s Joint P. --> %-0s %-20s" % (elemento2[1][0], elemento2[1][1], "%1.2f" % elemento2[0], "%")

    # CONDITIONAL
    print "\n\n- THE 10 POS-TAG BIGRAMS WITH MAXIMUM CONDITIONAL PROBABILITY -\n"
    print nome1, "\t\t\t\t\t\t\t\t\t", nome2
    for elemento1, elemento2 in zip(lista_10_bigrammi_PCond_file1, lista_10_bigrammi_PCond_file2):
        print " Bigram --> %-3s - %-20s Conditional P. --> %-0s %-20s" % (elemento1[1][0], elemento1[1][1], "%1.2f" % elemento1[0], "%"), " Bigram --> %-3s - %-20s Conditional P. --> %-0s %-20s" % (elemento2[1][0], elemento2[1][1], "%1.2f" % elemento2[0], "%")

    # 20 NOUNS AND THEIR ADJECTIVES
    # LIST = [NOUN, [ADJ, LMI]]
    print "\n\n- THE 20 MOST FREQUENT NOUNS, AND FOR EACH ONE ITS ADJECTIVES RANKED BY LOCAL MUTUAL INFORMATION -\n"
    print "\nTHE TOTAL NUMBER OF NOUNS IS 20,", "BUT ONLY", len(lista_SOST), "WILL BE PRINTED, BECAUSE", len(lista_SOST_uguali), "NOUNS APPEAR AMONG THE 10 MOST FREQUENT NOUNS OF BOTH CORPORA. THESE NOUNS ARE:"
    for uguale in lista_SOST_uguali:
        print "-", uguale
    print "\n\t\t", nome1, "------------------------------------------------------", nome2
    for lista1, lista2 in zip(LMI_AGG_SOST_1, LMI_AGG_SOST_2):
        print "\n", "\t\t\t---------------------------NOUN:", lista1[0], "---------------------------"
        for listaValori1, listaValori2 in zip(lista1[1:], lista2[1:]):
            for valore1, valore2 in zip(listaValori1, listaValori2):
                print " \nADJ --> %-10s LMI --> %-10s" % (valore1[0], valore1[1]), "\t\t\t\tADJ --> %-10s LMI --> %-10s" % (valore2[0], valore2[1])

    # 20 PLACE NAMES
    print "\n\n- THE 20 MOST FREQUENT PLACE NAMES -\n"
    print nome1, ":", "\t\t\t\t", nome2, ":"
    for elemento1, elemento2 in zip(luoghi_10_file1, luoghi_10_file2):
        print "%-20s Freq --> %-20s" % (elemento1[0], elemento1[1]), "%-20s Freq --> %-20s" % (elemento2[0], elemento2[1])
import nltk
import nltk.book as book
import string

# Let's do frequency analysis for book 1 (Moby Dick)
text1 = book.text1
concatenated_text = ''.join(text1)
dis = nltk.FreqDist(concatenated_text)
# dis.plot()

# Now we can get the bigrams
aux = nltk.bigrams(text1)
bigram_frequency = nltk.FreqDist(aux)

# Now we print the most common bigrams
print('Most common bigrams')
print(bigram_frequency.most_common(20))
""" return top 50 common non-content words used in the four columns combined """ STOPLIST = set(nltk.corpus.stopwords.words()) def is_content_word(word): return word.lower() not in STOPLIST and word[0].isalpha() dist = nltk.FreqDist([w.lower() for w in vocab if is_content_word(w)]) freq2=dist.most_common(50) # Oops, the words here are not informative. I will try bigrams instead. """ bigrams, b_dict returns a dictionary of bigrams each row; b_vocab gives the whole bigrams vocaburary """ b_dict={} bivocab=[] for index, row in templist.items(): filtered_temp =[b for b in list(nltk.bigrams(row)) if is_content_word(b[0]) and is_content_word(b[1])] b_dict.update({index: filtered_temp}) bivocab+=filtered_temp dist1 = nltk.FreqDist([b for b in bivocab]) freq0 = dist1.most_common(50) biig, biigfreq=zip(*freq0) fig, ax = plt.subplots() index = np.arange(len(biig)) bar_width = 0.25 opacity = 0.8
def get_bigrams_list(text):
    """Return the bigrams of a token sequence as a list of pairs."""
    return list(nltk.bigrams(text))
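# Hypothetical usage (input is illustrative only):
#   get_bigrams_list(['the', 'quick', 'brown', 'fox'])
#   -> [('the', 'quick'), ('quick', 'brown'), ('brown', 'fox')]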
import json
from collections import Counter, defaultdict
from nltk import bigrams

plt.figure(figsize=(30, 10))
plt.bar(height, freq)
plt.xticks(height, labels)
plt.ylabel("Occurrences")
plt.xlabel("Word/Text")
plt.show()

# Bigrams -- terms adjacent to each other that occur frequently
with open('collection.json', 'r') as f:
    count_all = Counter()
    for line in f:
        tweet = json.loads(line)
        if "text" in tweet:
            terms = [term for term in tk.tokenize(tweet['text'])
                     if term not in stop
                     and not term.startswith('http')
                     and not term.startswith('@')]
            term_pairs = bigrams(terms)
            count_all.update(term_pairs)
    print(count_all.most_common(20))

# Co-occurrences (within tweets)
com = defaultdict(lambda: defaultdict(int))
with open('collection.json', 'r') as f:
    count_all = Counter()
    for line in f:
        tweet = json.loads(line)
        if "text" in tweet:
            terms = [term for term in tk.tokenize(tweet['text'])
                     if term not in stop
                     and not term.startswith('http')
                     and not term.startswith('@')]
            for i in range(len(terms) - 1):
                for j in range(i + 1, len(terms)):
                    t1, t2 = sorted([terms[i], terms[j]])
                    if t1 != t2:
                        com[t1][t2] += 1
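# A minimal follow-up sketch (assuming `com` built as above): flatten the
# nested counts to surface the most frequent co-occurring term pairs.
com_max = []
for t1 in com:
    for t2, count in com[t1].items():
        com_max.append(((t1, t2), count))
terms_max = sorted(com_max, key=lambda pair: pair[1], reverse=True)
print(terms_max[:5])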
# -*- coding: utf-8 -*- """ Created on Mon Jun 12 19:09:52 2017 @author: chetan """ # Count terms only once, equivalent to Document Frequency terms_single = set(terms_all) # Count hashtags only terms_hash = [ term for term in preprocess(tweet['text']) if term.startswith('#') ] # Count terms only (no hashtags, no mentions) terms_only = [ term for term in preprocess(tweet['text']) if term not in stop and not term.startswith(('#', '@')) ] # mind the ((double brackets)) # startswith() takes a tuple (not a list) if # we pass a list of inputs from nltk import bigrams terms_bigram = bigrams(terms_stop)
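# A small follow-up sketch (assuming `terms_bigram` from above): counting the
# bigram pairs surfaces the most frequent adjacent term combinations.
from collections import Counter
count_bigram = Counter(terms_bigram)
print(count_bigram.most_common(10))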
locations = re.sub(r'[^\x00-\x7F]+', "", location) #apply tokenization, lemmatization, bigrams, and stemmer to look at different sequences of terms; this will determine the best features tokens = [ word for sent in nltk.sent_tokenize(str(cleaned_tweets)) for word in nltk.word_tokenize(sent) ] for token in sorted(set(tokens))[:30]: print 'tokens are: ' + token + ' [' + str(tokens.count(token)) + ']' lemmatizer = nltk.WordNetLemmatizer() lemm_tokens = [lemmatizer.lemmatize(t) for t in tokens] for token in sorted(set(lemm_tokens))[:30]: print 'lemm are: ' + token + ', [' + str(lemm_tokens.count(token)) + ']' bigrams = [" ".join(pair) for pair in nltk.bigrams(tokens)] # bigramslist = re.sub(',', '', str(bigrams)) print 'bigrams: ', bigrams[:10] stemmer = SnowballStemmer("english") stemmed_tokens = [stemmer.stem(t) for t in tokens] for token in sorted(set(stemmed_tokens))[:30]: print 'stems are: ' + token + ' [' + str(stemmed_tokens.count(token)) + ']' # n = 3 # trigrams = ngrams(str(tokens).split(), n) # for grams in sorted(set(trigrams))[:20]: # print 'tri grams are:', grams trigrams = [" ".join(pair) for pair in nltk.trigrams(tokens)] # trigramslist = re.sub(',', '', str(trigrams))
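# A small follow-up sketch (assuming the `bigrams` list built above): ranking
# the joined bigrams by frequency is one way to compare candidate features.
bigram_freq = nltk.FreqDist(bigrams)
for gram, count in bigram_freq.most_common(10):
    print 'bigram: ' + gram + ' [' + str(count) + ']'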
# Finding transition probabilities
tagged_words = []
all_tags = []
# nltk.corpus.brown.tagged_sents(tagset='universal')[0]
for sent in nltk.corpus.brown.tagged_sents(tagset='universal'):
    tagged_words.append(("START", "START"))
    all_tags.append("START")
    for (word, tag) in sent:
        all_tags.append(tag)
        tagged_words.append((tag, word))
    tagged_words.append(("END", "END"))
    all_tags.append("END")

cfd_tags = nltk.ConditionalFreqDist(nltk.bigrams(all_tags))
cpd_tags = nltk.ConditionalProbDist(cfd_tags, nltk.MLEProbDist)
print("Count('DET','NOUN') =", cfd_tags['DET']['NOUN'])
print("P('NOUN'|'DET') =", cpd_tags['DET'].prob('NOUN'))

# Finding emission probabilities
cfd_tagwords = nltk.ConditionalFreqDist(tagged_words)
cpd_tagwords = nltk.ConditionalProbDist(cfd_tagwords, nltk.MLEProbDist)
print("Count('DET','the') =", cfd_tagwords['DET']['the'])
print("P('the'|'DET') =", cpd_tagwords['DET'].prob('the'))
# p56
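# A minimal sketch (not from the original) of what these two distributions
# give you: under the HMM, P(words, tags) is the product of transition and
# emission probabilities. The sentence and tags below are illustrative, and
# since MLE assigns zero to unseen events, this assumes every transition and
# emission here was observed in Brown.
import math
example_words = ['the', 'dog', 'ran']
example_tags = ['DET', 'NOUN', 'VERB']
logp = 0.0
prev_tag = 'START'
for word, tag in zip(example_words, example_tags):
    logp += math.log(cpd_tags[prev_tag].prob(tag))   # transition P(tag | prev)
    logp += math.log(cpd_tagwords[tag].prob(word))   # emission   P(word | tag)
    prev_tag = tag
logp += math.log(cpd_tags[prev_tag].prob('END'))
print("log P(words, tags) =", logp)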
# # use different tagset on tagged corpora # print(nltk.corpus.brown.tagged_words(tagset='universal')) # print(nltk.corpus.treebank.tagged_words(tagset='universal')) # # tagged corpora for various language in NLTK ## nltk.download('sinica_treebank') # print(nltk.corpus.sinica_treebank.tagged_words()) # # 2.3 a universal part-of-speech tagset from nltk.corpus import brown brown_news_tagged = brown.tagged_words(categories='news', tagset='universal') tag_fd = nltk.FreqDist(tag for (word, tag) in brown_news_tagged) print(tag_fd.most_common()) # # 2.4 nouns word_tag_pairs = nltk.bigrams(brown_news_tagged) noun_preceders = [a[1] for (a, b) in word_tag_pairs if b[1] == 'NOUN'] fdist = nltk.FreqDist(noun_preceders) print([tag for (tag, _) in fdist.most_common()]) # # 2.5 verbs wsj = nltk.corpus.treebank.tagged_words(tagset='universal') word_tag_fd = nltk.FreqDist(wsj) print([wt[0] for (wt, _) in word_tag_fd.most_common() if wt[1] == 'VERB']) cfd1 = nltk.ConditionalFreqDist(wsj) print(cfd1['yield'].most_common()) print(cfd1['cut'].most_common())
FILE2 = ['doyle-case-27.txt']

for doc in FILE1:  # FILE1 is assumed to be defined earlier in the script
    with open(doc, 'r') as file:
        text = ''.join(file.readlines()).lower().split()

# Set up dictionary with single-word counts
dict1 = {}
dict_of_dicts = {}
totalWords = 0
for word in text:
    dict1[word] = 0
    dict_of_dicts[word] = {}
for word in text:
    dict1[word] = dict1[word] + 1

# Set up dictionary with bigram counts
bigrams1 = list(nltk.bigrams(text))
for big in bigrams1:
    dict_of_dicts[big[0]][big[1]] = 0
for big in bigrams1:
    dict_of_dicts[big[0]][big[1]] = dict_of_dicts[big[0]][big[1]] + 1

# Convert bigram counts to conditional probabilities p(curr | prev)
for prev in dict1:
    for curr in dict_of_dicts[prev]:
        dict_of_dicts[prev][curr] = dict_of_dicts[prev][curr] / dict1[prev]

# for i in range(100):
#     key1 = random.choice(list(dict_of_dicts.keys()))
#     key2 = random.choice(list(dict_of_dicts[key1].keys()))
#     print('p(' + str(key2) + '|' + str(key1) + ') = ' + str(dict_of_dicts[key1][key2]))
# print(bigrams1[2][0])
# print(dict_of_dicts)

for doc in FILE2:
    with open(doc, 'r') as file:
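# A minimal sketch (not in the original) that puts the p(curr|prev) table to
# work: generate text by repeatedly sampling the next word in proportion to
# its conditional probability. The start word is illustrative; Python 3.6+
# is assumed for random.choices.
import random

def generate_text(start_word, length=20):
    word = start_word
    output = [word]
    for _ in range(length):
        successors = dict_of_dicts.get(word)
        if not successors:
            break
        word = random.choices(list(successors.keys()),
                              weights=list(successors.values()))[0]
        output.append(word)
    return ' '.join(output)

print(generate_text('the'))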
import codecs
import json
from collections import OrderedDict
from subprocess import PIPE, Popen

from nltk import bigrams, trigrams
from nltk.tokenize import LineTokenizer, WhitespaceTokenizer, sent_tokenize

# FORBIDDEN_CHARS is assumed to be defined elsewhere in the project.


def _update_ngram_database(notes_directory, ngram_db_dir):

    line_tokenizer = LineTokenizer(blanklines='discard')
    word_tokenizer = WhitespaceTokenizer()

    grep_command = 'find {} | grep ".note$"'.format(notes_directory)
    proc = Popen(grep_command, stdout=PIPE, stderr=PIPE, shell=True)
    output, err = proc.communicate()
    all_notes_files = output.decode().split('\n')

    '''
    Create master list of all raw tokens. Will look like:
        tokens = {
            'unigrams': ['all', 'unigrams'],
            'bigrams': [('all', 'bigrams')],
            'trigrams': [('all', 'the', 'trigrams')]
        }
    '''
    tokens = {
        'unigrams': [],
        'bigrams': [],
        'trigrams': []
    }

    for note_file in all_notes_files:
        if not note_file:
            continue
        with codecs.open(note_file, mode="r", encoding="utf-8") \
                as note_file_object:
            note_file_content = note_file_object.read()

        note_file_content = note_file_content.lower()
        lines = line_tokenizer.tokenize(note_file_content)

        for line in lines:
            sentences = sent_tokenize(line)
            for sentence in sentences:
                sentence_safe_split = []
                all_words = word_tokenizer.tokenize(sentence)
                for word in all_words:
                    # Skip any word with a forbidden character
                    if any([char in word for char in FORBIDDEN_CHARS]):
                        continue
                    has_letters = False
                    for char in word:
                        if char.isalpha():
                            has_letters = True
                            break
                    if word and has_letters:
                        sentence_safe_split.append(word)

                tokens['unigrams'].extend(sentence_safe_split)
                tokens['bigrams'].extend(bigrams(sentence_safe_split))
                tokens['trigrams'].extend(trigrams(sentence_safe_split))

    '''
    Squash the list of tokens into a dict that tracks the
    number of occurrences of each token. Will look like:
        tokens = {
            'unigrams': {
                'foo': 17,
                'bar': 42,
                ...
            },
            ...
        }
    '''
    for token_type in tokens.keys():
        all_tokens_of_type = tokens[token_type]
        weighted_tokens = {}
        for single_token in all_tokens_of_type:
            if not isinstance(single_token, str):
                single_token = ' '.join(single_token)
            if not weighted_tokens.get(single_token):
                weighted_tokens[single_token] = 1
            else:
                weighted_tokens[single_token] = weighted_tokens[single_token] + 1
        tokens[token_type] = OrderedDict(sorted(
            weighted_tokens.items(), key=lambda t: t[1], reverse=True))

    # Write Unigrams to Disk
    unigrams_json_file_path = ngram_db_dir + '/unigrams.json'
    unigrams_text_file_path = ngram_db_dir + '/unigrams.txt'
    with open(unigrams_json_file_path, 'w') as unigrams_json_file_object:
        json.dump(tokens['unigrams'], unigrams_json_file_object)
    with codecs.open(unigrams_text_file_path, mode="w", encoding="utf-8") \
            as unigrams_text_file_object:
        for unigram, frequency in tokens['unigrams'].items():
            unigrams_text_file_object.write(unigram + '\n')

    # Write Bigrams to Disk
    bigrams_json_file_path = ngram_db_dir + '/bigrams.json'
    bigrams_text_file_path = ngram_db_dir + '/bigrams.txt'
    with open(bigrams_json_file_path, 'w') as bigrams_json_file_object:
        json.dump(tokens['bigrams'], bigrams_json_file_object)
    with codecs.open(bigrams_text_file_path, mode="w", encoding="utf-8") \
            as bigrams_text_file_object:
        for bigram, frequency in tokens['bigrams'].items():
            bigrams_text_file_object.write(bigram + '\n')

    # Write Trigrams to Disk
    trigrams_json_file_path = ngram_db_dir + '/trigrams.json'
    trigrams_text_file_path = ngram_db_dir + '/trigrams.txt'
    with open(trigrams_json_file_path, 'w') as trigrams_json_file_object:
        json.dump(tokens['trigrams'], trigrams_json_file_object)
    with codecs.open(trigrams_text_file_path, mode="w", encoding="utf-8") \
            as trigrams_text_file_object:
        for trigram, frequency in tokens['trigrams'].items():
            trigrams_text_file_object.write(trigram + '\n')
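# Hypothetical usage (paths are illustrative only): rebuild the n-gram
# database from a directory of .note files.
#   _update_ngram_database('/home/user/notes', '/home/user/ngram_db')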
from nltk import bigrams, word_tokenize

def extract_bigrams(text):
    """Return the bigrams of `text` as space-joined strings."""
    tokens = word_tokenize(text)
    return [gram[0] + ' ' + gram[1] for gram in bigrams(tokens)]
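# Hypothetical usage (input is illustrative only):
#   extract_bigrams('the quick brown fox')
#   -> ['the quick', 'quick brown', 'brown fox']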
for word in data:
    tokens.append(word[:-1])  # presumably strips the trailing newline
print "after read tokens"

# count the frequency of each word
for token in tokens:
    words[token] = words.get(token, 0) + 1
words[("UNK", "UNK")] = 0
print "after words"

# dic = defaultdict(lambda: defaultdict(lambda: 1))
# make the matrix
dic = {}
one = list(bigrams(tokens))
print "after bigram"
n = 1812418
compute()
pre(" At the ")
pre(" it is ")
"""with open('dic', 'wb') as file:
    pickle.dump(dic, file)"""
elapsed_time = time.time() - start_time
print "time elapsed ", elapsed_time
import nltk

def generate_model(cfdist, word, num=15):
    # Greedily emit the most likely successor of each word.
    for i in range(num):
        print word,
        word = cfdist[word].max()

text = nltk.corpus.genesis.words('english-kjv.txt')
bigrams = nltk.bigrams(text)
cfd = nltk.ConditionalFreqDist(bigrams)
# cfd.plot()
# print(cfd)
generate_model(cfd, 'living')