def freq_dst(self,posCorpus,negCorpus):
         
        #Creates frequency distribution for words in corpus
        posFreqDist = FreqDist()
        for word in posCorpus.words():
            posFreqDist.inc(word)

        negFreqDist = FreqDist()
        for word in negCorpus.words():
            negFreqDist.inc(word)
 
        #Frequency Distributions with Laplace Smoothing 
        global posLapFreq
        posLapFreq = nltk.probability.LaplaceProbDist(posFreqDist) 
        global negLapFreq
        negLapFreq = nltk.probability.LaplaceProbDist(negFreqDist)

        #GetBigrams
        posBigrams = nltk.bigrams(posCorpus.words())
        negBigrams = nltk.bigrams(negCorpus.words())

        #Get no. of words per corpus
        posWordLen = len(posCorpus.words())
        negWordLen = len(negCorpus.words())


        #FreqDist for Bigrams
        global posBiFreq
        posBiFreq = nltk.probability.LaplaceProbDist(nltk.FreqDist(posBigrams))
        global negBiFreq
        negBiFreq = nltk.probability.LaplaceProbDist(nltk.FreqDist(negBigrams))
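# The method above relies on FreqDist.inc(), which NLTK 3 removed. A minimal
# sketch of the same Laplace-smoothed unigram/bigram setup with the current
# API (assuming plain word lists instead of the corpus readers used above):
import nltk
from nltk.probability import FreqDist, LaplaceProbDist

def laplace_models(pos_words, neg_words):
    # FreqDist now accepts any iterable directly
    pos_uni, neg_uni = FreqDist(pos_words), FreqDist(neg_words)
    pos_bi = FreqDist(nltk.bigrams(pos_words))
    neg_bi = FreqDist(nltk.bigrams(neg_words))
    # Add-one (Laplace) smoothed probability distributions
    return (LaplaceProbDist(pos_uni), LaplaceProbDist(neg_uni),
            LaplaceProbDist(pos_bi), LaplaceProbDist(neg_bi))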
Example 2
  def to_bigram(self, termpos):
    words = [elem[0] for elem in termpos]
    pos_tags = [elem[1] for elem in termpos]

    b_words = nltk.bigrams(words)
    b_pos = nltk.bigrams(pos_tags)
    return (b_words, b_pos)
Example 3
def similarity(paper1,paper2):
    score=[]
    stops=nltk.corpus.stopwords.words('english') #stopwords to weed out

##compare the titles and score the word cosine similarity 
    title1 = paper1[1]
    title2 = paper2[1]
    tokens1=[w for w in nltk.word_tokenize(title1) if w not in stops]
    tokens2=[w for w in nltk.word_tokenize(title2) if w not in stops]
    fd1=nltk.FreqDist(tokens1)
    fd2=nltk.FreqDist(tokens2)
    keys=list(set(list(fd1.keys())+list(fd2.keys())))
    scoretemp=0
    for key in keys:
      scoretemp += fd1[key]*fd2[key]
    a = numpy.linalg.norm(numpy.asarray(list(fd1.values())))*numpy.linalg.norm(numpy.asarray(list(fd2.values())))
    if a:
      score.append(1-scoretemp/a)
    else:
      score.append(0)
    
##compare the abstracts and score single word cosine similarity 
    abstract1 = paper1[3]
    abstract2 = paper2[3]
    tokens1=[w for w in nltk.word_tokenize(abstract1) if w not in stops]
    tokens2=[w for w in nltk.word_tokenize(abstract2) if w not in stops]
    fd1=nltk.FreqDist(tokens1)
    fd2=nltk.FreqDist(tokens2)
    keys=list(set(list(fd1.keys())+list(fd2.keys())))
    scoretemp=0
    for key in keys:
      scoretemp += fd1[key]*fd2[key]
    a = numpy.linalg.norm(numpy.asarray(list(fd1.values())))*numpy.linalg.norm(numpy.asarray(list(fd2.values())))
    if a:
      score.append(1-scoretemp/a)
    else:
      score.append(0)

##compare the abstracts and score bigram cosine similarity 
    tokens1 = nltk.word_tokenize(abstract1)
    tokens2 = nltk.word_tokenize(abstract2)
    bgsall1 = nltk.bigrams(tokens1)
    bgsall2 = nltk.bigrams(tokens2)
    bgs1 = [bg for bg in bgsall1 if bg[0] not in stops and bg[1] not in stops]
    bgs2 = [bg for bg in bgsall2 if bg[0] not in stops and bg[1] not in stops]
    fd1=nltk.FreqDist(bgs1)
    fd2=nltk.FreqDist(bgs2)
    keys=list(set(list(fd1.keys())+list(fd2.keys())))
    scoretemp=0
    for key in keys:
      scoretemp += fd1[key]*fd2[key]
#    print(fd1.values())
    a = numpy.linalg.norm(numpy.asarray(list(fd1.values())))*numpy.linalg.norm(numpy.asarray(list(fd2.values())))
    if a:
      score.append(1-scoretemp/a)
    else:
      score.append(0)

##total score is sum of the three scores    
    return sum(score)
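# The similarity() function above repeats the same cosine computation three
# times. A minimal sketch of a helper that factors it out (the name
# cosine_from_freqdists is hypothetical, not part of the original code):
def cosine_from_freqdists(fd1, fd2):
    dot = sum(fd1[k]*fd2[k] for k in set(fd1) | set(fd2))
    norm = numpy.linalg.norm(numpy.asarray(list(fd1.values()))) * \
           numpy.linalg.norm(numpy.asarray(list(fd2.values())))
    return 1 - dot/norm if norm else 0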
Example 4
def test():
	uniDictList = [{} for x in range(6)]
	biDictList = [{} for x in range(6)]
	vocabSize = [0 for x in range(6)]
	totalSize = [0 for x in range(6)]
	biVocabSize = [0 for x in range(6)]
	bitotalSize = [0 for x in range(6)]
	numList = [0 for x in range(6)]
	numCorrect = total = 0

	# randomly split set 
	for entry in entryList:
		if random.random() > 0.10:
			entry.test = 0
		else:
			entry.test = 1

	# compute train dictionaries
	for entry in entryList:
		if entry.test == 0:
			for word in entry.review.split():
				uniDictList[entry.rating][word] = uniDictList[entry.rating].get(word,0)+1 

			for bigram in bigrams(entry.review.split()):
				biDictList[entry.rating][bigram] = biDictList[entry.rating].get(bigram,0)+1
							 
			numList[entry.rating] += 1


	print numList

	totalCount = reduce(lambda x,y: x+y, numList)


	# compute dictionary stats
	for x in xrange(1,6):
		vocabSize[x] = len(uniDictList[x].keys())
		totalSize[x] = reduce(lambda x,y: x+y,uniDictList[x].values())
		biVocabSize[x] = len(biDictList[x].keys())
		bitotalSize[x] = reduce(lambda x,y: x+y,biDictList[x].values())
			
	# testing
	for entry in entryList:
		if entry.test == 1:
			rankProb = [0 for x in range(6)]
			for x in range(1,6):
				for word in entry.review.split():
					rankProb[x] += math.log(uniDictList[x].get(word,1)) - math.log(vocabSize[x]+totalSize[x])
				for bigram in bigrams(entry.review.split()):
					rankProb[x] += math.log(biDictList[x].get(bigram,1)) - math.log(biVocabSize[x]+bitotalSize[x])

			rankProb = [p*numList[entry.rating]/totalCount for p in rankProb]
			entry.pRating = rankProb.index(max(rankProb[1:6]))
			if entry.pRating == entry.rating:
				numCorrect += 1
			total += 1
		print bigrams(entry.review.split())


	return [numCorrect, total]
Example 5
 def estimateLikelihood(self):
   uniqBigrams = set()
   uniqCount = 0
   for tweet in self._focusTweets['aae']:
     tweet = tweet.split('\t')
     for bigram in nltk.bigrams(tweet):
       try:
         dummy = self._biDict[bigram]
         self._likelihood['aae'][bigram] += 1
         self._likelihood['aae']['__BITOTAL__'] += 1
         if bigram not in uniqBigrams:
           uniqBigrams.add(bigram)
           uniqCount += 1
       except:
         continue
   self._likelihood['aae']['__BITOTAL__'] += uniqCount ## Adding vocab to total for add one smoothing!!
   sys.stderr.write("Likelihood Bigram Entries AAE:"+str(len(self._likelihood['aae']))+"\n")
   uniqBigrams = set()
   uniqCount = 0
   for tweet in self._focusTweets['mse']:
     tweet = tweet.split('\t')
     for bigram in nltk.bigrams(tweet):
       try:
         dummy = self._biDict[bigram]
         self._likelihood['mse'][bigram] += 1
         self._likelihood['mse']['__BITOTAL__'] += 1
         if bigram not in uniqBigrams:
           uniqBigrams.add(bigram)
           uniqCount += 1
       except:
         continue
   self._likelihood['mse']['__BITOTAL__'] += uniqCount
   sys.stderr.write("Likelihood Bigram Entries MSE:"+str(len(self._likelihood['mse']))+"\n")
Example 6
def find_colloc(data):  # find most common collocations
    def check(wb, tb):
        if len(wb[0]) <= 1 or len(wb[1]) <= 2:
            return False
        try:
            if detect(wb[0]) != "ar" or detect(wb[1]) != "ar":
                return False
        except:
            return False

        if tb in [("NN", "NN"), ("NN", "DTNN"), ("NNP", "NNP")]:
            return True
        return False

    bigrams = FreqDist()

    for d in data:
        tokens = d["tokens"]
        words_bigrams = nltk.bigrams([t[0] for t in tokens])
        tags_bigrams = nltk.bigrams([t[1] for t in tokens])

        for wb, tb in zip(words_bigrams, tags_bigrams):
            if check(wb, tb):
                bigrams[wb] += 1

    return bigrams
Example 7
def main():
	text = open('holmes.txt').read()
	tokens = nltk.wordpunct_tokenize(text)
	charList = []
	for word in tokens:
		for char in word:
			charList.append(char)
	fDistChars = nltk.FreqDist(charList)
	fDistWords = nltk.FreqDist(tokens)
	
	print("Answer to 1A, there are {} character types in the book, namely: \n{}".format(len(fDistChars),sorted(fDistChars)))
	print("\nAnswer to 1B, there are {} word types in the book, namely: \n{}".format(len(fDistWords),sorted(fDistWords)))
	
	bigramChars = nltk.bigrams(charList)
	trigramChars = nltk.trigrams(charList)

	print("\nAnswer to 1C, the 20 most common characters are: \nUnigrams: \n{}\nBigrams: \n{}\nTrigrams: \n{}".format(most_common(charList), 
		most_common(bigramChars), most_common(trigramChars)))

	bigramWords = nltk.bigrams(tokens)
	trigramWords = nltk.trigrams(tokens)

	print("\nAnswer to 1D, the 20 most common words are: \nUnigrams: \n{}\nBigrams: \n{}\nTrigrams: \n{}".format(most_common(tokens), 
		most_common(bigramWords), most_common(trigramWords)))
	
	bigram_measures = nltk.collocations.BigramAssocMeasures()
	finder = BigramCollocationFinder.from_words(tokens)
	scoredPMI = finder.score_ngrams(bigram_measures.pmi)
	scoredCHI = finder.score_ngrams(bigram_measures.chi_sq)
	
	print("\nAnswer to 2, the 20 most likely collocations are:\nPMI:\n{} \nChi's square\n{}" .format(scoredPMI[:20],scoredCHI[:20]))
	
	print("\nSpearmans correlation = {}".format(nltk.metrics.spearman.spearman_correlation(scoredPMI, scoredCHI)))
Example 8
def textsimilarity(text1,text2):
    score=[]
    stops=nltk.corpus.stopwords.words('english') #stopwords to weed out
    stops = stops + ['we',',','.','(',')','using','new','propose','investigate']
    stops = stops + ['-','show','infer','novel','method']

#get tokens and bigrams from the text, either string or list of keywords
    if type(text1) is not list:
      alltokens = nltk.word_tokenize(text1.lower())
      allpairs = [list(pair) for pair in nltk.bigrams(alltokens)]
      tokens1 = [token for token in alltokens if token not in stops]
      pairs1 = [" ".join(bg) for bg in allpairs if bg[0] not in stops and bg[1] not in stops]
    else:
      alltokens = []
      allpairs = []
      for el in text1:
        atokens = nltk.word_tokenize(el.lower())
        alltokens += atokens
        apairs = [list(pair) for pair in nltk.bigrams(atokens)]
        allpairs += apairs
      tokens1 = [token for token in alltokens if token not in stops]
      pairs1 = [" ".join(bg) for bg in allpairs if bg[0] not in stops and bg[1] not in stops]

    if type(text2) is not list:
      tokens = nltk.word_tokenize(text2.lower())
      allpairs = [list(pair) for pair in nltk.bigrams(tokens)]
      tokens2 = [token for token in tokens if token not in stops]
      pairs2 = [" ".join(bg) for bg in allpairs if bg[0] not in stops and bg[1] not in stops]
    else:
      alltokens = []
      allpairs = []
      for el in text2:
        atokens = nltk.word_tokenize(el.lower())
        alltokens += atokens
        apairs = [list(pair) for pair in nltk.bigrams(atokens)]
        allpairs += apairs
      tokens2 = [token for token in alltokens if token not in stops]
      pairs2 = [" ".join(bg) for bg in allpairs if bg[0] not in stops and bg[1] not in stops]
      
###score single word cosine similarity
##    fd1=nltk.FreqDist(tokens1)
##    fd2=nltk.FreqDist(tokens2)
##    keys=list(set(list(fd1.keys())+list(fd2.keys())))
##    scoretemp=0
##    for key in keys:
##      scoretemp += fd1[key]*fd2[key]
##    score.append(1-scoretemp/(numpy.linalg.norm(numpy.asarray(list(fd1.values())))*numpy.linalg.norm(numpy.asarray(list(fd2.values())))))
##    
####score bigram cosine similarity 
##    fd1=nltk.FreqDist(pairs1)
##    fd2=nltk.FreqDist(pairs2)
##    keys=list(set(list(fd1.keys())+list(fd2.keys())))
##    scoretemp=0
##    for key in keys:
##      scoretemp += fd1[key]*fd2[key]
##    score.append(1-scoretemp/(numpy.linalg.norm(numpy.asarray(list(fd1.values())))*numpy.linalg.norm(numpy.asarray(list(fd2.values())))))
    score.append(sum(1 for token in tokens1 if token in tokens2))
    score.append(sum(1 for pair in pairs1 if pair in pairs2))
    print('done')
##total score is sum of the scores
    return sum(score)
def main():
    
    # Corpus Location
    #for training data
    posTrainCorpus = 'C:/Users/Abhinav/Desktop/Course work/NLP/txt_sentoken/pos_train'
    negTrainCorpus = 'C:/Users/Abhinav/Desktop/Course work/NLP/txt_sentoken/neg_train'

    #for test data
    posTestCorpus = 'C:/Users/Abhinav/Desktop/Course work/NLP/txt_sentoken/pos_test'
    negTestCorpus = 'C:/Users/Abhinav/Desktop/Course work/NLP/txt_sentoken/neg_test'

    # Create Plain Text Corpus for training data
    posCorpus = PlaintextCorpusReader(posTrainCorpus, '.*')
    negCorpus = PlaintextCorpusReader(negTrainCorpus, '.*')


    # Create Plain Text Corpus for test data
    posTstCorpus = PlaintextCorpusReader(posTestCorpus, '.*')
    negTstCorpus = PlaintextCorpusReader(negTestCorpus, '.*')
    
    #GetBigrams
    posBigrams = nltk.bigrams(posCorpus.words())
    negBigrams = nltk.bigrams(negCorpus.words())

    #Get no. of words per corpus
    posWordLen = len(posCorpus.words())
    negWordLen = len(negCorpus.words())
    
    # Creating object of Lang_Model_classifier
    obj1 = Lang_Model_Classifier()
    obj1.freq_dst(posCorpus, negCorpus)
    
    #For negative test data
    for filename in os.listdir(negTestCorpus):
        wordSet =  negTstCorpus.words(filename)
    
        print '**Unigram**'
        unigr = obj1.perp(wordSet)
    
        print unigr
    
        print '**Bigram**'
        bigr = obj1.perpBi(nltk.bigrams(wordSet))
    
        print bigr
        
    #For positive test data    
    for filename in os.listdir(posTestCorpus):
        wordSet2 =  posTstCorpus.words(filename)
    
        print '**Unigram**'
        posunigr = obj1.perp(wordSet2)
    
        print posunigr
    
        print '**Bigram**'
        posbigr = obj1.perpBi(nltk.bigrams(wordSet2))
    
        print posbigr
Example 10
def hybrid_cfdist():
    sherlock_corpus = PlaintextCorpusReader(CORPUS_ROOT_SHERLOCK, '.*', encoding='utf-8')
    sherlock_bigrams = nltk.bigrams(sherlock_corpus.words())

    pokemon_corpus = PlaintextCorpusReader(CORPUS_ROOT_POKEMON, '.*', encoding='utf-8')
    pokemon_bigrams = nltk.bigrams(pokemon_corpus.words())

    return nltk.ConditionalFreqDist(sherlock_bigrams + pokemon_bigrams)
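# In current NLTK releases nltk.bigrams() returns a generator, so the `+`
# concatenation above raises a TypeError. A sketch of the same idea using
# itertools.chain (assuming the same two corpus readers as above):
import itertools

def hybrid_cfdist_chained():
    sherlock_corpus = PlaintextCorpusReader(CORPUS_ROOT_SHERLOCK, '.*', encoding='utf-8')
    pokemon_corpus = PlaintextCorpusReader(CORPUS_ROOT_POKEMON, '.*', encoding='utf-8')
    combined = itertools.chain(nltk.bigrams(sherlock_corpus.words()),
                               nltk.bigrams(pokemon_corpus.words()))
    return nltk.ConditionalFreqDist(combined)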
Example 11
def wordlistfun(filename):
    minlength = 2
    lmtzr = nltk.stem.wordnet.WordNetLemmatizer()
    wordlist = []
    wordfreq = []
    hashlist = []
    hashfreq = []

    with open(filename, "r") as f:
        count_all = Counter()
        count_hash = Counter()
        count_only = Counter()
        count_bi = Counter()
        count_only2 = Counter()
        count_bigramonly = Counter()
        count_bigramstop = Counter()
        for line in f:
            try:
                tweet = json.loads(line)
                # Create a list with all the terms
                terms_stop = [
                    term for term in preprocess(tweet["text"]) if term.lower() not in stop
                ]  # Update the counter
                terms_hash = [term for term in preprocess(tweet["text"]) if term.lower().startswith("#")]
                terms_only = [
                    term
                    for term in preprocess(tweet["text"])
                    if term.lower() not in stop and not term.lower().startswith(("#", "@"))
                ]
                # mind the ((double brackets)):
                # startswith() takes a tuple (not a list) if we pass a list of inputs
                terms_only2 = [
                    term.encode("unicode-escape")
                    for term in preprocess(tweet["text"])
                    if term.lower() not in stop
                    and not term.lower().startswith(("#", "@"))
                    and not term.lower().startswith(("htt", r"\u"))
                    and term.lower() not in [r"(?:(?:\d+,?)+(?:\.?\d+)?)"]
                    and len(term) > minlength
                ]

                terms_bigramstop = bigrams(terms_stop)
                terms_bigramonly = bigrams(terms_only2)

                count_all.update(terms_stop)
                count_hash.update(terms_hash)
                count_only.update(terms_only)
                count_only2.update(terms_only2)

                count_bigramonly.update(terms_bigramonly)
                count_bigramstop.update(terms_bigramstop)
            except:
                pass

        wordlist, wordfreq = zip(*count_only2.most_common())
        hashlist, hashfreq = zip(*count_hash.most_common())
    return wordlist, wordfreq, hashlist, hashfreq
Example 12
File: G8.py Project: lum4chi/IR
def do_ir2(db, param):
    print 'Computing IR2', db, param, '...'

    def words(text):
        stopwords = set(nltk.corpus.stopwords.words('english'))
        return [w for w in nltk.word_tokenize(text.lower()) if w not in string.punctuation and w not in stopwords]

    class BigramsCorpus:
        def __init__(self, db, collection):
            self.client = MongoClient()[db][collection]

        def __iter__(self):
            for doc in self.client.find():
                yield [doc['_id']]

        def __len__(self):
            return self.client.count()

    bigram_corpus = BigramsCorpus('cordis', 'bi_grams')
    bigrams = Dictionary(bigram_corpus)

    project ={'$project': {'_id': 0, 'title': 1, 'reference': 1}}
    a = [project]
    project_corpus = MongoCorpus('cordis', 'projects', aggregate=a)

    n = max(bigrams.keys())
    dataset = []

    for doc in project_corpus:
        temp = bigrams.doc2bow([' '.join(x) for x in nltk.bigrams(words(doc['title']))])
        x = [0]*(n+1)
        for bi, _ in temp:
            x[bi] = 1
        dataset.append(x)

    alg = KMeans(n_clusters=int(param))
    alg.fit(dataset)

    clusters = defaultdict(list)
    for i, doc in enumerate(project_corpus):
        temp = bigrams.doc2bow([' '.join(x) for x in nltk.bigrams(words(doc['title']))])
        x = [0]*(n+1)
        for bi, _ in temp:
            x[bi] = 1
        p = alg.predict([x])
        clusters[p[0]].append(doc['reference'])

    mongo_clusters = []
    for k, v in clusters.items():
        mongo_clusters.append({'cluster': int(k), 'projects': v})  # cast numpy int so BSON can encode it

    # Without the int() cast, Mongo raises: InvalidDocument: Cannot encode object: 0
    print mongo_clusters
    # Save to the Mongo collection
    mongo = MongoClient()['g8']['ir2']
    mongo.insert_many(mongo_clusters)
    print 'Done!'
Example 13
def how_is_often_used_in_text():
    from nltk.corpus import brown

    brown_learned_text = brown.words(categories="learned")
    print sorted(set(b for (a, b) in nltk.bigrams(brown_learned_text) if a == "often"))
    # or use the tagged words for the actual POS tags
    brown_learned_tagged = brown.tagged_words(categories="learned", simplify_tags=True)
    fd = nltk.FreqDist([b[1] for (a, b) in nltk.bigrams(brown_learned_tagged) if a[0] == "often"])
    fd.tabulate()
def extract_bigrams(articleList, commentCount):
    featureMatrix = np.zeros([commentCount,100])

    index = 0
    stemmer = SnowballStemmer("english", ignore_stopwords=True)
    bagOfWords = []
    for art in articleList.items():        
        for comm in art[1]:
            mywords = words(comm.body)
            mywords = known_words(mywords)
            # Remove Stops
            filtered_words = [w for w in mywords if not w in stopwords.words('english')]
            # Stemming
            stemmed_words = [stemmer.stem(w) for w in filtered_words]
            bagOfWords += stemmed_words
            bagOfWords.append("\n")
            
    tempVector = dict()
        
    #Create your bigrams
    bgs = nltk.bigrams(bagOfWords)

    fdist = nltk.FreqDist(bgs)   
    
    for k in fdist.keys()[:100]:
        tempVector[k] = 0
    
    
    theKeys = tempVector.keys()
    
    for art in articleList.items():        
        for comm in art[1]:
            mywords = words(comm.body)
            mywords = known_words(mywords)
            # Remove Stops
            filtered_words = [w for w in mywords if not w in stopwords.words('english')]
            # Stemming
            stemmed_words = [stemmer.stem(w) for w in filtered_words]
            bgs = nltk.bigrams(stemmed_words)
            for word in (w for w in bgs if tempVector.has_key(w)):
                keyInd = theKeys.index(word)      
                featureMatrix[index][keyInd] += 1
                           
            index += 1
            if index % 100 == 0:
                print "extracted", index, "features"
        
            if index >= commentCount:
                break            
            
            
    
    
    print "non-zero",np.count_nonzero(featureMatrix)
    print "Percentage filled:%.2f" %(float(np.count_nonzero(featureMatrix))/(featureMatrix.shape[0]*featureMatrix.shape[1]))
    return featureMatrix
Example 15
    def get_joint_entropy(string1, string2):
        first_bigram = list(nltk.bigrams(string1.lower()))
        second_bigram = list(nltk.bigrams(string2.lower()))
        combo = first_bigram + second_bigram
        bigram_dict = collections.Counter(combo)

        for i in bigram_dict:
            if i in first_bigram and i in second_bigram:
                value = float(bigram_dict[i]) / float(len(combo))
                yield value
Example 16
 def get_joint_entropy(string1, string2):
     bigram1 = list(nltk.bigrams(string1.lower()))
     bigram2 = list(nltk.bigrams(string2.lower()))
     combo = bigram1 + bigram2
     bigram_dict = collections.Counter(combo)
     
     for i in bigram_dict:
         if i in bigram1 and i in bigram2:
             value = float(bigram_dict[i]) / float(len(combo))
             yield value
Example 17
   def featureSets(data): #data accepted as (rating, list of words)
      fs = [] 
      for (r, words) in data:
         nicewords = [word.lower() for word in words if not isStopWord(word) and not isPunctuation(word)]
         for bigram in nltk.bigrams(nicewords):
            fs.append((BigramClassifier.features(bigram),r))

      return fs

Example 18
def bigrami(documents,dg1,gg1,dg2,gg2):
    bigram = []
    stopwords = nltk.corpus.stopwords.words('english')
    for i in range(dg1,gg1):
        bigram.append([w for w in bigrams(documents[i][0])])
    for i in range(dg2,gg2):
        bigram.append([w for w in bigrams(documents[i][0])])
    result = []
    [result.extend(w) for w in bigram]
    result = [w for w in result if w[0] not in stopwords and w[1] not in stopwords and w[0] and nije_interpunkcija(w[0]) and nije_interpunkcija(w[1])]
    result = nltk.FreqDist(result)
    return result.keys()
Example 19
def joint_entropy(string1, string2):
    x = []
    bi1 = list(nltk.bigrams(string1.lower()))
    bi2 = list(nltk.bigrams(string2.lower()))
    combo = bi1 + bi2
    yes = list(set(combo))
    for i in yes:
        if i in bi1 and i in bi2:
            count = (float(bi1.count(i))+float(bi2.count(i)))/float(len(combo))
            x.append(count)
    calc = sum(i*np.log2(i) for i in x)*-1
    return calc
Example 20
def exercise_bigrams():
    sent = ["In", "the", "beginning", "God", "created", "the heaven", "and", "the earth"]
    print list(nltk.bigrams(sent))

    text = nltk.corpus.genesis.words("english-kjv.txt")
    bigrams = nltk.bigrams(text)
    cfd = nltk.ConditionalFreqDist(bigrams)

    word = "living"
    for i in range(15):
        print word
        word = cfd[word].max()
def jacquard_bigram(query):
    final=[]
    for a in file('enwiktionary.a.list'):
        a=a.rstrip()
        bigram=set(nltk.bigrams(a))
        q_bigram=set(nltk.bigrams(query))
        intersect=q_bigram.intersection(bigram)
        union=q_bigram.union(bigram)
        sim=float(len(intersect))/len(union)
        
        final.append([a,sim])
    final_sorted= sorted(final,key=lambda sim:sim[1], reverse=True)
    print final_sorted[:10]
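# NLTK also ships a ready-made Jaccard distance over sets
# (nltk.metrics.distance.jaccard_distance), so the hand-rolled ratio above
# can be written as 1 - distance. A minimal sketch under that assumption:
from nltk.metrics.distance import jaccard_distance

def bigram_jaccard_similarity(word, query):
    return 1 - jaccard_distance(set(nltk.bigrams(word)), set(nltk.bigrams(query)))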
Example 22
def main():

    OUT = open("../output.txt", "w")
    OUT.close()
    INP = open("../data/test.hyp1-hyp2-ref", "r")
    inp = INP.read()
    for sent in inp.split("\n")[:-1]:
        h1 = sent.split(" ||| ")[0].split(" ")
        h2 = sent.split(" ||| ")[1].split(" ")
        ref = sent.split(" ||| ")[2].split(" ")
        h1p = process(h1)
        h2p = process(h2)
        refp = process(ref)
        #print(h1c, h2c, refc)
        #h1_match = word_matches(h1, rset)
        #h2_match = word_matches(h2, rset)
        h1c = Counter(h1)
        h2c = Counter(h2)
        refc = Counter(ref)
        h1_bigrams = nltk.bigrams(h1)
        h2_bigrams = nltk.bigrams(h2)
        ref_bigrams = nltk.bigrams(ref)
        h1_trigrams = nltk.trigrams(h1)
        h2_trigrams = nltk.trigrams(h2)
        ref_trigrams = nltk.trigrams(ref)
        #print(h_bigrams, ref_bigrams)
        h1_bigramsc = Counter(h1_bigrams)
        h2_bigramsc = Counter(h2_bigrams)
        ref_bigramsc = Counter(ref_bigrams)
        h1_trigramsc = Counter(h1_trigrams)
        h2_trigramsc = Counter(h2_trigrams)
        ref_trigramsc = Counter(ref_trigrams)
        h1_allc = h1c + h1_bigramsc + h1_trigramsc
        h2_allc = h2c + h2_bigramsc + h2_trigramsc
        ref_allc = refc + ref_bigramsc + ref_trigramsc
        h1_precision = precision(h1_allc, ref_allc)
        h2_precision = precision(h2_allc, ref_allc)
        h1_recall = recall(h1_allc, ref_allc)
        h2_recall = recall(h2_allc, ref_allc)
        h1_meteor = meteor(h1_precision, h1_recall)
        h2_meteor = meteor(h2_precision, h2_recall)
        OUT = open("../output.txt", "a")

        if h1_meteor > h2_meteor:
            OUT.write("-1\n")
        else:
            if h1_meteor == h2_meteor:
                OUT.write("0\n")
            else:
                OUT.write("1\n")
        OUT.close()
Example 23
  def construct_features(self, sentences, use_smoothing=True):
    print 'creating features...'

    if not use_smoothing:
      self.set_lambda(0)

    data = []
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    for i, sent in enumerate(sentences):
      print i
      term, tpos, posf, bterm, btpos, bposf = (0, 0, 0, 0, 0, 0)
      tokenized_tagged = nltk.pos_tag(nltk.word_tokenize(sent))
      for token, p in tokenized_tagged:
        # unigrams
        try:
          new_token = regex.sub(u'', token).decode('utf-8')
          if not new_token == u'' and not new_token in stopwords.words('english'):
            term += self.Term_Freq[new_token]/self.N_Term
            # I think we need a different normalizer here
            posf += self.POS_Freq[p]/self.N_Term
            tpos += self.Term_Freq[(new_token, p)]/self.N_Term
        except:
          pass

      # normalize with respect to sentence length
      term /= len(sent)
      posf /= len(sent)
      tpos /= len(sent)

      # bigrams
      words = [elem[0] for elem in tokenized_tagged]
      pos_tags = [elem[1] for elem in tokenized_tagged]

      b_words = nltk.bigrams(words)
      b_pos = nltk.bigrams(pos_tags)

      if len(b_words) > 0:
        for b_w, b_p in zip(b_words, b_pos):
          bterm += self.BTerm_Freq[b_w]/self.BN_Term
          bposf += self.BPOS_Freq[b_p]/self.BN_Term
          btpos += self.BTPOS_Freq[(b_w, b_p)]/self.BN_Term

      # normalize
        bterm /= len(b_words)
        bposf /= len(b_pos)
        btpos /= len(b_words)

      data.append([term, posf, tpos, bterm, bposf, btpos])

    return np.asarray(data)
Example 24
def window_bigrams(problem):
    """Get the bigrams of window size 5, realling don't know what size is good."""
    tokenized = nltk.tag.untag(problem.tagged)
    out = {}
    for index in problem.head_indices:
        upperbound = index + 5
        lowerbound = max(0, index - 5)
        words_after = tokenized[index + 1 : upperbound + 1]
        words_before = tokenized[lowerbound:index]
        bigrams_before = nltk.bigrams(words_before)
        bigrams_after = nltk.bigrams(words_after)
        bigrams_before.extend(bigrams_after)
        windowfeatures = dict([("wbigram({}&{})".format(w[0], w[1]), True) for w in bigrams_before])
        out.update(windowfeatures)
    return out
Example 25
def window_bigrams_with_tags(problem):
    """Get the tagged bigrams, just three before and three after"""
    tagged = [nltk.tag.tuple2str(tup) for tup in problem.tagged]
    out = {}
    for index in problem.head_indices:
        ## window of WIDTH before
        lowerbound = max(0, index - 3)
        bigrams_before = nltk.bigrams(tagged[lowerbound:index])
        ## and WIDTH after
        upperbound = index + 3
        bigrams_after = nltk.bigrams(tagged[index + 1 : upperbound + 1])
        bigrams_before.extend(bigrams_after)
        windowfeatures = dict([("wbigram({}&{})".format(w[0], w[1]), True) for w in bigrams_before])
        out.update(windowfeatures)
    return out
Example 26
 def train(self, lFileList):   
    """Trains the Naive Bayes Sentiment Classifier."""
    # lFileList = self.loop_files()
    #flag will be -1 for a negative word
    #flag will be 1 for a positive word
    #iterate through each files
    for sFilename in lFileList:
       #checks positive files
       if sFilename[7] == '5':
          #tokenize file name
          file = self.loadFile('movies_reviews/' + sFilename)
          tokens = self.tokenize(file)
          #extract bigrams 
          bigrams_text = nltk.bigrams(tokens)
          #iterate through bigrams
          for (w,x) in bigrams_text:
             #check to see if the bigrams are in punctuation
             if w.lower() not in punctuation_stopwords and x.lower() not in punctuation_stopwords:
                #if the bigrams are not already in the dict, increment
                if (w.lower(), x.lower()) not in self.poswordsfreq:
                   self.poswordsfreq[(w.lower(), x.lower())] = 1
                   self.total_positive += 1
                else:
                   #if the bigrams are in the dict, increment
                   self.poswordsfreq[(w.lower(), x.lower())]+= 1
                   self.total_positive += 1
       #checks negative files
       elif sFilename[7] == '1':
          #tokenize file name
          file = self.loadFile('movies_reviews/' + sFilename)
          #print file
          tokens = self.tokenize(file)
          #extract bigrams
          bigrams_text = nltk.bigrams(tokens)
          #print bigrams_text
          #iterate through the bigrams
          for (w,x) in bigrams_text:
             #check and make sure the bigrams are not punctuation
             if w.lower() not in punctuation_stopwords and x.lower() not in punctuation_stopwords:
                #print sFilename
                #if the bigram is not already in the dict, increment
                if (w.lower(), x.lower()) not in self.negwordsfreq:
                   self.negwordsfreq[(w.lower(), x.lower())] = 1
                   self.total_negative += 1
                else:
                   #if the bigram is already in the dict, increment
                   self.negwordsfreq[(w.lower(), x.lower())]+= 1
                   self.total_negative += 1
def getRedCarpetInfo(tweets):
    bestDressed = dict()
    worstDressed = dict()
    mostTalkedAbout = dict()
    rivalries = dict()
    for tweet in tweets:
        if regExRivalries.search(tweet):
            filteredSentences = ' '.join(word for word in tweet.split() if word.lower() not in stop and word.lower() not in blacklistWords
                and word.lower() not in wordsToIgnoreRedCarpet)
            unigrams = wordTokenizer.tokenize(filteredSentences)
            for bigram in nltk.bigrams(unigrams):
                    posTags = nltk.pos_tag(bigram)
                    noun = 0
                    for (data, tag) in posTags:
                        if tag == 'NNP':
                            noun += 1
                    if noun == 2:
                        name = "%s %s" % bigram
                        addToDictionary(name, rivalries)                
        if regExBestDress.search(tweet):
            filteredSentences = ' '.join(word for word in tweet.split() if word.lower() not in stop and word.lower() not in blacklistWords
                and word.lower() not in wordsToIgnoreRedCarpet)
            unigrams = wordTokenizer.tokenize(filteredSentences)
            for bigram in nltk.bigrams(unigrams):
                    posTags = nltk.pos_tag(bigram)
                    noun = 0
                    for (data, tag) in posTags:
                        if tag == 'NNP':
                            noun += 1
                    if noun == 2:
                        name = "%s %s" % bigram
                        addToDictionary(name, bestDressed)
        if regExWorstDress.search(tweet):
            filteredSentences = ' '.join(word for word in tweet.split() if word.lower() not in stop and word.lower() not in blacklistWords 
                and word.lower() not in wordsToIgnoreRedCarpet)
            unigrams = wordTokenizer.tokenize(filteredSentences)
            for bigram in nltk.bigrams(unigrams):
                    posTags = nltk.pos_tag(bigram)
                    noun = 0
                    for (data, tag) in posTags:
                        if tag == 'NNP':
                            noun += 1
                    if noun == 2:
                        name = "%s %s" % bigram
                        addToDictionary(name, worstDressed)                
    getTopN(bestDressed, bestDressedList, 5)                    
    getTopN(worstDressed, worstDressedList, 5)
    getTopN(rivalries, rivalriesList, 2)
def worst_dressed(year):
    # returns up to 10 (but usually many fewer) of the worst dressed attendees
    worst_pat = re.compile(r"worst dressed", re.I)
    worst_dressed_tweets = []
    if year == 2013 or year == "2013":
        for text in tweets_2013_texts:
            if worst_pat.search(text):
                worst_dressed_tweets.append(text)
    elif year == 2015 or year == "2015":
        for text in tweets_2015_texts:
            if worst_pat.search(text):
                worst_dressed_tweets.append(text)
    else:
        pass

    possible_worst_bigrams = Counter()

    for tweet in worst_dressed_tweets:
        tweet = tweet.translate(translate_table)
        tokens = nltk.word_tokenize(tweet)
        bigrams = nltk.bigrams(tokens)
        for bigram in bigrams:
            if bigram[0].istitle() and bigram[1].istitle():
                possible_worst_bigrams[bigram] += 1
    p_worst_dressed = possible_worst_bigrams.most_common(10)

    worst_dressed = []
    for i in p_worst_dressed:
        if i[0][0] not in red_stop and i[0][1] not in red_stop:
            worst_dressed.append(i[0][0] + " " + i[0][1])
    return worst_dressed
def best_dressed(year):
    # returns up to 10 of the best dressed attendees
    best_pat = re.compile(r"best dressed", re.I)
    best_dressed_tweets = []
    if year == 2013 or year == "2013":
        for text in tweets_2013_texts:
            if best_pat.search(text):
                best_dressed_tweets.append(text)
    elif year == 2015 or year == "2015":
        for text in tweets_2015_texts:
            if best_pat.search(text):
                best_dressed_tweets.append(text)
    else:
        pass

    possible_best_bigrams = Counter()

    for tweet in best_dressed_tweets:
        tweet = tweet.translate(translate_table)
        tokens = nltk.word_tokenize(tweet)
        bigrams = nltk.bigrams(tokens)
        for bigram in bigrams:
            if bigram[0].istitle() and bigram[1].istitle():
                possible_best_bigrams[bigram] += 1
    p_best_dressed = possible_best_bigrams.most_common(10)

    best_dressed = []
    for i in p_best_dressed:
        if (i[0][0] in female_names or i[0][0] in male_names) and i[0][0] not in red_stop:
            best_dressed.append(i[0][0] + " " + i[0][1])

    return best_dressed
Example 30
    def ngramify(self, word_list, stop):
        # creates an ngram from a word_list based on class settings
        mode = self.mode
        pos = self.inclued_pos
        word = self.include_word
        stopset = set(stopwords.words("english"))
        stopset.remove("not")
        if stop:
            if word and pos:
                selection = [(w.lower(), p) for w, p in word_list if w.lower() not in stopset]
            elif word:
                selection = [w.lower() for w, p in word_list if w.lower() not in stopset]
            elif pos:
                selection = [p for w, p in word_list if w.lower() not in stopset]
        else:
            if word and pos:
                selection = [(w.lower(), p) for w, p in word_list]
            elif word:
                selection = [w.lower() for w, p in word_list]
            elif pos:
                selection = [p for w, p in word_list]

        if mode == "unigrams":
            word_list = selection
        elif mode == "bigrams":
            word_list = nltk.bigrams(selection)
        elif mode == "trigrams":
            word_list = nltk.trigrams(selection)
        return word_list
Example 31
  # Generate plotgrid
  fig, ax = plt.subplots(*grid, figsize=(15,10))
  rows, cols = grid

  # Extract word clouds from clusters
  for index, label in enumerate(np.unique(cluster_data[f"Labels_{c}"])):

    # Get total text to be analyzed
    text = " ".join(cluster_data[cluster_data[f"Labels_{c}"] == label]["Obiettivo / Motivazione"].map(lemmatizer))

    # Extract monogram and bigram frequencies from text as dictionary
    tokenizer = RegexpTokenizer(r'\w+')
    sent_words = tokenizer.tokenize(text)
    freq_monogram = FreqDist(sent_words)
    freq_bigram = FreqDist(bigrams(sent_words))
    dict_monogram = dict(freq_monogram)
    dict_bigram = {" ".join(k):v for k, v in dict(freq_bigram).items()}
    dict_token = {**dict_monogram, **dict_bigram}

    # Get only valid words and remove stopwords, storing the frequencies for analysis
    clean_dict = {k:v for k, v in dict_token.items() if validate_token(k, enriched_stopwords)}
    freq_df = pd.DataFrame([(k, v) for k, v in clean_dict.items()], columns=["ngram", "freq"]).nlargest(100, ['freq'])
    freq_df.to_csv(f"frequencies/frequencies_{c}_{label}.csv")

    # Generate and plot the word cloud
    wordcloud = WordCloud(stopwords=enriched_stopwords, background_color="white").generate_from_frequencies(clean_dict)

    if c == 2:
      ax[index%c].imshow(wordcloud, interpolation='bilinear')
      ax[index%c].axis("off")
Example 32
def go_pos_context(pos_model_path, meter_model_path, corpus_path):
    from nltk import bigrams
    #line = 'Laut zerspringt der Weiherspiegel.'
    #print(line)

    pos_model = joblib.load(pos_model_path)
    meter_model = joblib.load(meter_model_path)
    corpus = json.load(open(corpus_path, 'r'))

    #get_pos_meter_mapping(pos_model, meter_model, line)

    pos_dict = {}

    counter = 0
    for idx, doc in corpus.items():
        counter += 1
        if counter > 20000:
            break
        lines = doc['lines']
        for line in lines:
            mp = get_pos_meter_mapping(pos_model, meter_model, line)
            for tuple1, tuple2 in bigrams(mp):
                pos1 = tuple1[0]
                meter1 = tuple1[1]
                pos2 = tuple2[0]
                meter2 = tuple2[1]
                #print(pos, meter)
                cnt = pos_dict.setdefault("_".join([pos1, pos2]), Counter())
                cnt[meter2] += 1

    #print(pos_dict)
    ranking = []

    for pos, contours in pos_dict.items():
        ps = pos.split('_')
        pos1 = ps[0]
        pos2 = ps[1]
        print(pos1, pos2, contours)

        plus1 = 1
        minus1 = 1
        plus2 = 1
        minus2 = 1
        amphi = 1
        dibrach = 1
        spondee = 1
        for c in contours:
            if len(c) == 1:
                if pos2 == 'VM':
                    print(pos1, pos2, c, contours[c])
                    if c[0] == '+':
                        plus1 += float(contours[c])
                    elif c[0] == '-':
                        minus1 += float(contours[c])
            #if len(c) > 1:
            #	print(pos, c, contours[c])
            #	for prom in c:
            #		if prom == '+':
            #			plus2+= float(contours[c]/len(c))
            #		if prom == '-':
            #			minus2+= float(contours[c]/len(c))

            #if len(c) == 2:
            #	print(pos, c, c[0], contours[c])
            #	if c[0] == '+' and c[1] == '-':
            #		plus2 = float(contours[c])
            #	elif c[0] == '-' and c[1] == '+':
            #		minus2 = float(contours[c])
            #	elif c[0] == '-' and c[1] == '-':
            #		dibrach = float(contours[c])
            #	elif c[0] == '+' and c[1] == '+':
            #		spondee = float(contours[c])
            #if len(c) == 3:
            #	print(pos, c, c[0], contours[c])
            #	if c[0] == '+' and c[1] == '-' and c[2] == '-':
            #		plus = float(contours[c])
            #	elif c[0] == '-' and c[1] == '-' and c[2] == '+':
            #		minus = float(contours[c])
            #	elif c[0] == '-' and c[1] == '+' and c[2] == '-':
            #		amphi = float(contours[c])

        if pos2 == 'VM':
            plus = plus1 + plus2
            minus = minus1 + minus2
            print("_".join([pos1, pos2]), round(plus, 2), round(minus, 2))
            ranking.append((round(plus / minus, 2), "_".join([pos1, pos2])))

        #einsilber = plus1/minus1
        #zweisilber = (plus2+minus2+spondee)/dibrach
        #print(pos, round(einsilber, 2), round(zweisilber, 2))
        #ranking.append((round(einsilber,2), pos))

    s = sorted(ranking)
    s.reverse()
    print(s)
Example 33
import nltk
from nltk.probability import *
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import codecs
import sys
# from urllib import request
# url = 'https://www.gutenberg.org/cache/epub/2707/pg2707.txt'
# raw = response.read().decode('utf8')
f = codecs.open('text.txt', encoding='utf8')
lines = f.readlines()
all_text = ' '.join(lines).lower()
tokenizer = nltk.RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(all_text)
sys.stderr.write('Finding bigrams...' + '\n')
bigrams = nltk.bigrams(tokens)
for b in bigrams:
    print(b)
#tokens = nltk.word_tokenize(all_text)

english_stopwords = stopwords.words('english')
stopwords_set = set(english_stopwords)
filtered_tokens = [w for w in tokens if w not in stopwords_set]
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(w) for w in filtered_tokens]
#print(len(tokens))
fd = nltk.FreqDist(filtered_tokens)
print(fd.most_common(10))
print(len(tokens))
print(len(lemmatized))
Example 34
    args = parser.parse_args()

    FILE_LENGTH = args.flen
    STOPWORDS = set(stopwords.words('english') + list(string.punctuation))
    stemmer = SnowballStemmer("english")

    with open(args.input, 'r') as f, open(f'{args.output}', 'w') as outp:
        for line in tqdm(f,
                         total=FILE_LENGTH,
                         mininterval=10.0,
                         maxinterval=20.0):
            raw = json.loads(line)
            doc = {}
            doc["id"] = raw["_id"]
            doc["contents"] = "".join(raw["text"])
            if args.bigrams:
                tokens = filter(lambda word: word.lower() not in STOPWORDS,
                                word_tokenize(doc["contents"]))
                if args.stem:
                    tokens = map(stemmer.stem, tokens)
                bigram_doc = bigrams(tokens)
                bigram_doc = " ".join(
                    ["".join(bigram) for bigram in bigram_doc])
                doc["contents"] += " " + bigram_doc
            doc["wikipedia_id"] = raw["wikipedia_id"]
            doc["wikipedia_title"] = raw["wikipedia_title"]
            doc["categories"] = raw["categories"]
            _ = outp.write(json.dumps(doc))
            _ = outp.write('\n')
def addNGrams(search_query_performance_df, columns):
    if functions.dfIsEmpty(search_query_performance_df):
        return

    n_gram_dict = {}

    for i, row in search_query_performance_df.fillna(0).iterrows():
        impressions = float(row['impressions'])
        clicks = float(row['clicks'])

        conversions = float(row['conversions'])
        conversion_value = float(row['conversion_value'])
        cost = float(row['cost'])
        text = (row['query'])

        # tidy up
        puncts = [",", ".", "!", "?", ":"]
        for punct in puncts:
            text = text.replace(punct, "")
        text = word_tokenize(text)

        bigram = bigrams(text)
        bigram_vec = []
        for gram in bigram:
            bigram_vec.append(gram)
        trigram = trigrams(text)
        trigram_vec = []
        for gram in trigram:
            trigram_vec.append(gram)
        total_gram_vec = bigram_vec + trigram_vec
        for gram in total_gram_vec:
            if gram not in n_gram_dict.keys():
                n_gram_dict[gram] = {'impressions': impressions, \
                                     #  'avg_pos_mult': impressions * avg_pos, \
                                     'gram_count': 1, 'clicks': clicks, \
                                     'cost': cost, 'conversions': conversions, 'conversion_value': conversion_value}
            else:
                n_gram_dict[gram]['impressions'] += impressions
                # n_gram_dict[gram]['avg_pos_mult'] += impressions * avg_pos
                n_gram_dict[gram]['gram_count'] += 1
                n_gram_dict[gram]['clicks'] += clicks
                n_gram_dict[gram]['cost'] += cost
                n_gram_dict[gram]['conversions'] += conversions
                n_gram_dict[gram]['conversion_value'] += conversion_value

    ### compute average position
    ### and statistic data

    n_gram_df_data = {}

    for gram in n_gram_dict.keys():
        impressions = n_gram_dict[gram]['impressions']
        count = n_gram_dict[gram]['gram_count']
        # avg_pos = n_gram_dict[gram]['avg_pos_mult'] / count
        clicks = n_gram_dict[gram]['clicks']
        conversions = n_gram_dict[gram]['conversions']
        cost = n_gram_dict[gram]['cost']
        conversion_value = n_gram_dict[gram]['conversion_value']

        try:
            cpa = cost / conversions
        except ZeroDivisionError:
            cpa = 0

        try:
            roas = conversion_value / cost
        except ZeroDivisionError:
            roas = 0

        try:
            ctr = clicks / impressions
        except ZeroDivisionError:
            ctr = 0
        try:
            conversion_rate = conversions / clicks
        except ZeroDivisionError:
            conversion_rate = 0
        try:
            average_cpc = cost / clicks
        except ZeroDivisionError:
            average_cpc = 0
        if clicks != 0 and clicks != 1:
            std = np.sqrt(clicks * (1 - ctr) ** 2 + \
                          (impressions - clicks) * ctr ** 2) / (impressions - 1)
            standard_error = std / np.sqrt(impressions)
        else:
            standard_error = 0
        min_result = ctr - standard_error * 2
        max_result = ctr + standard_error * 2
        n_gram_df_data[gram] = {'n_gram_count': count, 'impressions': impressions,
                                'ctr': ctr, 'conversion_rate': conversion_rate, \
                                'average_cpc': average_cpc, 'ctr_significance': standard_error,
                                'conversions': conversions,
                                'cost': cost, 'conversion_value': conversion_value, 'cpa': cpa, 'roas': roas,
                                'clicks': clicks}

    df = pd.DataFrame(n_gram_df_data)
    df = df.T

    df["ctr_significance"] = df["ctr_significance"].replace(
        r'^\s*$', 0, regex=True).astype("float")

    return df
Example 36
import nltk
from nltk.corpus import treebank

treebank_tagged = treebank.tagged_words(tagset='universal')
tagpairs = nltk.bigrams(treebank_tagged)
preceders_noun = [x[1] for (x, y) in tagpairs if y[1] == 'NOUN']
freqdist = nltk.FreqDist(preceders_noun)
print([tag for (tag, _) in freqdist.most_common()])
## Create collocations with intervening words (gapped n-grams); window_size > 2 allows gaps
finder = BigramCollocationFinder.from_words(brown.words(), window_size=4)
finder.apply_word_filter(lambda x: not x.isalpha())
finder.apply_freq_filter(10)
finder.nbest(bigram_measures.pmi, 10)

## Finders
scored = finder.score_ngrams(bigram_measures.raw_freq)
scored[:10]

```{note}
How to get the document frequency of the bigrams??? (one possible approach is sketched after the dispersion function below)
```

unigram_freq = nltk.FreqDist(brown.words())
bigram_freq = nltk.FreqDist('_'.join(x) for x in nltk.bigrams(brown.words()))

unigram_freq_per_file = [nltk.FreqDist(words) 
                         for words in [brown.words(fileids=f) for f in brown.fileids()]]
bigram_freq_per_file = [nltk.FreqDist('_'.join(x) for x in nltk.bigrams(words))
                         for words in [brown.words(fileids=f) for f in brown.fileids()]]

## Function to get unigram dispersion
def createUnigramDipsersionDist(uni_freq, uni_freq_per_file):
    len(uni_freq_per_file)
    unigram_dispersion = {}

    for fid in uni_freq_per_file:
        for w, f in fid.items():
            if w in unigram_dispersion:
                unigram_dispersion[w] += 1
            else:
                unigram_dispersion[w] = 1
    return unigram_dispersion
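# A possible answer to the note above about bigram document frequency: count,
# for each bigram, in how many of the per-file FreqDists it occurs at least
# once (a sketch built on bigram_freq_per_file defined earlier, not part of
# the original notes).
def bigram_document_frequency(bigram_freq_per_file):
    doc_freq = nltk.FreqDist()
    for fd in bigram_freq_per_file:
        for bg in fd:  # each distinct bigram counts once per file
            doc_freq[bg] += 1
    return doc_freq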
Example 38
def getTrainingAndTestData(tweets, K, k, method, feature_set):

    add_ngram_feat = feature_set.get('ngram', 1)
    add_negtn_feat = feature_set.get('negtn', False)

    from functools import wraps

    procTweets = [ (processAll(text, subject=subj, query=quer), sent)    \
                        for (text, sent, subj, quer) in tweets]

    stemmer = nltk.stem.PorterStemmer()

    all_tweets = []  #DATADICT: all_tweets =   [ (words, sentiment), ... ]
    for (text, sentiment) in procTweets:
        words = [word if(word[0:2]=='__') else word.lower() \
                    for word in text.split() \
                    if len(word) >= 3]
        words = [stemmer.stem(w)
                 for w in words]  #DATADICT: words = [ 'word1', 'word2', ... ]
        all_tweets.append((words, sentiment))

    # train_tweets = all_tweets[:int(len(all_tweets)*ratio)]      #DATADICT: train_tweets = [ (words, sentiment), ... ]
    # test_tweets  = all_tweets[int(len(all_tweets)*ratio):]      #DATADICT: test_tweets  = [ (words, sentiment), ... ]
    train_tweets = [x for i, x in enumerate(all_tweets) if i % K != k]
    test_tweets = [x for i, x in enumerate(all_tweets) if i % K == k]

    unigrams_fd = nltk.FreqDist()
    if add_ngram_feat > 1:
        n_grams_fd = nltk.FreqDist()

    for (words, sentiment) in train_tweets:
        words_uni = words
        unigrams_fd.update(words)

        if add_ngram_feat >= 2:
            words_bi = [','.join(map(str, bg)) for bg in nltk.bigrams(words)]
            n_grams_fd.update(words_bi)

        if add_ngram_feat >= 3:
            words_tri = [','.join(map(str, tg)) for tg in nltk.trigrams(words)]
            n_grams_fd.update(words_tri)

    sys.stderr.write('\nlen( unigrams ) = ' + str(len(unigrams_fd.keys())))

    #unigrams_sorted = nltk.FreqDist(unigrams).keys()
    unigrams_sorted = unigrams_fd.keys()
    #bigrams_sorted = nltk.FreqDist(bigrams).keys()
    #trigrams_sorted = nltk.FreqDist(trigrams).keys()
    if add_ngram_feat > 1:
        sys.stderr.write('\nlen( n_grams ) = ' + str(len(n_grams_fd)))
        ngrams_sorted = [k for (k, v) in n_grams_fd.items() if v > 1]
        sys.stderr.write('\nlen( ngrams_sorted ) = ' + str(len(ngrams_sorted)))

    def get_word_features(words):
        bag = {}
        words_uni = ['has(%s)' % ug for ug in words]

        if (add_ngram_feat >= 2):
            words_bi = [
                'has(%s)' % ','.join(map(str, bg))
                for bg in nltk.bigrams(words)
            ]
        else:
            words_bi = []

        if (add_ngram_feat >= 3):
            words_tri = [
                'has(%s)' % ','.join(map(str, tg))
                for tg in nltk.trigrams(words)
            ]
        else:
            words_tri = []

        for f in words_uni + words_bi + words_tri:
            bag[f] = 1

        #bag = collections.Counter(words_uni+words_bi+words_tri)
        return bag

    negtn_regex = re.compile(
        r"""(?:
        ^(?:never|no|nothing|nowhere|noone|none|not|
            havent|hasnt|hadnt|cant|couldnt|shouldnt|
            wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint
        )$
    )
    |
    n't
    """, re.X)

    def get_negation_features(words):
        INF = 0.0
        negtn = [bool(negtn_regex.search(w)) for w in words]

        left = [0.0] * len(words)
        prev = 0.0
        for i in range(0, len(words)):
            if (negtn[i]):
                prev = 1.0
            left[i] = prev
            prev = max(0.0, prev - 0.1)

        right = [0.0] * len(words)
        prev = 0.0
        for i in reversed(range(0, len(words))):
            if (negtn[i]):
                prev = 1.0
            right[i] = prev
            prev = max(0.0, prev - 0.1)

        return dict(
            zip(['neg_l(' + w + ')'
                 for w in words] + ['neg_r(' + w + ')' for w in words],
                left + right))

    def counter(
        func
    ):  #http://stackoverflow.com/questions/13512391/to-count-no-times-a-function-is-called
        @wraps(func)
        def tmp(*args, **kwargs):
            tmp.count += 1
            return func(*args, **kwargs)

        tmp.count = 0
        return tmp

    @counter  #http://stackoverflow.com/questions/13512391/to-count-no-times-a-function-is-called
    def extract_features(words):
        features = {}

        word_features = get_word_features(words)
        features.update(word_features)

        if add_negtn_feat:
            negation_features = get_negation_features(words)
            features.update(negation_features)

        sys.stderr.write('\rfeatures extracted for ' +
                         str(extract_features.count) + ' tweets')
        return features

    extract_features.count = 0

    if ('1step' == method):
        # Apply NLTK's Lazy Map
        v_train = nltk.classify.apply_features(extract_features, train_tweets)
        v_test = nltk.classify.apply_features(extract_features, test_tweets)
        return (v_train, v_test)

    elif ('2step' == method):
        isObj = lambda sent: sent in ['neg', 'pos']
        makeObj = lambda sent: 'obj' if isObj(sent) else sent

        train_tweets_obj = [(words, makeObj(sent))
                            for (words, sent) in train_tweets]
        test_tweets_obj = [(words, makeObj(sent))
                           for (words, sent) in test_tweets]

        train_tweets_sen = [(words, sent) for (words, sent) in train_tweets
                            if isObj(sent)]
        test_tweets_sen = [(words, sent) for (words, sent) in test_tweets
                           if isObj(sent)]

        v_train_obj = nltk.classify.apply_features(extract_features,
                                                   train_tweets_obj)
        v_train_sen = nltk.classify.apply_features(extract_features,
                                                   train_tweets_sen)
        v_test_obj = nltk.classify.apply_features(extract_features,
                                                  test_tweets_obj)
        v_test_sen = nltk.classify.apply_features(extract_features,
                                                  test_tweets_sen)

        test_truth = [sent for (words, sent) in test_tweets]

        return (v_train_obj, v_train_sen, v_test_obj, v_test_sen, test_truth)

    else:
        return nltk.classify.apply_features(extract_features, all_tweets)
Example 39
term_freq_hash = counting_terms(term_type='terms_hash',
                                fname='listener_results.json',
                                nwords=10)

# COUNTING JUST TERMS
term_freq_only = counting_terms(term_type='terms_only',
                                fname='listener_results.json',
                                nwords=10)

###################### BIGRAMS ######################

# CREATE BIGRAMS
tokens = df['text_clean_ngrams'].apply(nltk.word_tokenize)
# Flattening nested list
flat_tokens = [term for sublist in tokens for term in sublist]
bgs = nltk.bigrams(flat_tokens)

# FREQUENCY DISTRIBUTION FOR ALL BIGRAMS
fdist = nltk.FreqDist(bgs)
for k, v in fdist.items():
    print(k, v)

fdist_10 = fdist.most_common(10)
print(fdist_10)

# CONVERT TO DF AND SORT
labels = ['bigram', 'Weight']
df_bigrams = pd.DataFrame([tuple_item for tuple_item in fdist.items()],
                          columns=labels)
df_bigrams[['Source_Name', 'Target_Name'
            ]] = pd.DataFrame([tuple_item for tuple_item in df_bigrams.bigram])
def best_word_features_com(words,best_words):
    d1 = dict([(word, True) for word in words if word in best_words])
    d2 = dict([(word, True) for word in nltk.bigrams(words) if word in best_words])
    d3 = dict(d1)
    d3.update(d2)  # merge unigram and bigram features (bigram keys are tuples)
    return d3
Example 41
    f = open('fdist.pkl', 'rb')
    fdist = pickle.load(f)
    f.close()
else:
    f = open('lyrics.pkl', 'rb')
    data = pickle.load(f)
    f.close()
    lis = []
    cnt = 0
    for _ in data:
        js = [__ for __ in data[_]]
        lis += js
        cnt += 1
        print(cnt)

    bigram = list(nltk.bigrams(lis))
    fdist = nltk.ConditionalFreqDist(bigram)
    f = open('fdist.pkl', 'wb')
    pickle.dump(fdist, f, -1)
    f.close()

f = open('test_data.pkl', 'rb')
test_data = pickle.load(f)
f.close()

Ans = []
cnt = 0

for in_data, out_data in test_data:
    ans = ''.join(in_data)
    bg = in_data[-1][-1]
Esempio n. 42
def find_nltk_bigrams(my_str):
    split_str = my_str.split()
    bigram_str = bigrams(split_str)    
    return [item for item in bigram_str]
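# Example (whitespace split, so punctuation stays attached to words):
#   find_nltk_bigrams("to be or not to be")
#   -> [('to', 'be'), ('be', 'or'), ('or', 'not'), ('not', 'to'), ('to', 'be')]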
Esempio n. 43
        [w for w in word_tokens if not w in stop_words])
    text = "".join(
        [w for w in str(filtered_sentence) if w not in string.punctuation])
    word_tokens = re.split(r'\W+', text)

    for w in word_tokens:
        #print(ps.stem(w))
        #stem_text=stem_text.join([ps.stem(w)])
        stem_text = stem_text + ps.lemmatize(w) + " "

    p["filtered_sentence"].iloc[i] = stem_text
    word2vec_tokenize = word_tokenize(p["filtered_sentence"].iloc[i])
#%%
mystring = p.iloc[i, 4]
mystring_words = mystring.split(" ")
list(nltk.bigrams(mystring_words))
#%%
megastring = ""
for i in range(len(p)):
    megastring = megastring + str(p.iloc[i, 4]) + " "  # separate texts with a space so adjacent words do not merge
#%%
from nltk.collocations import BigramCollocationFinder


def bi(text):
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(word_tokenize(text))
    finder.apply_freq_filter(3)  # keep only bigrams that occur at least 3 times
    finder.nbest(bigram_measures.pmi, 5)  # top-5 collocations by PMI (result not captured here)
    return finder.ngram_fd.items()  # (bigram, count) pairs left after the frequency filter
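# Hypothetical usage (megastring is built a few cells above): turn the filtered
# frequency distribution into a plain dict of {(w1, w2): count} pairs.
#   bigram_counts = dict(bi(megastring))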
Esempio n. 44
    # Group by stemmed word
    stem_word_index.setdefault(stemmed_word, [])
    stem_word_index[stemmed_word].append(word)

# Calculate coefficient ==================================================================================
coef_threshold = 0.0

dice_stemmed_word_data = []
mim_stemmed_word_data = []
emim_stemmed_word_data = []
chi_sqr_stemmed_word_data = []

counter = 0
for stemmed_word, words in stem_word_index.items():
    # create bigrams from words
    bigrams = list(nltk.bigrams(words))
    for word_a, word_b in bigrams:
        # Lookup filename in word_files_index
        files_a = word_files_index[word_a]
        files_b = word_files_index[word_b]
        files_a_sliced_b = list(set(files_b) & set(files_a))

        # Using dice coef
        dice_coef = float(
            len(files_a_sliced_b)) / (len(files_a) + len(files_b))
        if (dice_coef > coef_threshold):
            dice_stemmed_word_data.append(
                (stemmed_word, word_a, word_b, dice_coef))

        # Using MIM coef
        mim_coef = float(len(files_a_sliced_b)) / (len(files_a) * len(files_b))
import nltk
from nltk import bigrams
from nltk.tokenize import word_tokenize
f = open("sample.txt", "r")
dataf = f.read().replace('\n', ' ')
delimiters = ['(', ')', ';', ',', '.', '/']
for i in delimiters:
	dataf = dataf.replace(i, '')
print(dataf)
data = word_tokenize(dataf)
bigram_list = list(bigrams(data))  # avoid shadowing the imported bigrams()
print(bigram_list)
print('\n')
Esempio n. 46
# #By default a FreqDist is not sorted.
# print(list(freq_brown.keys())[:20])
#
# #if we sort it and print it, it will give the top words but without the frequencies
# fdist1 = sorted(freq_brown , key = freq_brown.__getitem__, reverse = True)
# print(fdist1[0:20])
#
# #prints the most common words with frequency; same result as the previous one with frequency
# print(freq_brown.most_common(20))

# an nltk.ConditionalFreqDist() counts frequencies of pairs.
# When given a list of bigrams, it maps each first word of a bigram
# to a FreqDist over the second words of the bigram.

cfreq_brown_2gram = nltk.ConditionalFreqDist(nltk.bigrams(brown.words()))
# print(cfreq_brown_2gram)

# conditions() in a ConditionalFreqDist are like keys()
# in a dictionary
# print(cfreq_brown_2gram.conditions())

# the cfreq_brown_2gram entry for "my" is a FreqDist.
# print(cfreq_brown_2gram["my"])

# here are the words that can follow after "my".
# We first access the FreqDist associated with "my",
# then the keys in that FreqDist
# print(cfreq_brown_2gram["my"].keys())
#
# # here are the 20 most frequent words to come after "my", with their frequencies
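# # (the print for that last comment appears to be cut off in this snippet;
# # presumably something like:)
# print(cfreq_brown_2gram["my"].most_common(20))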
def main(file1, file2):
    file1_input = codecs.open(
        file1, "r",
        "utf-8")  # opens "file1" read-only ("r") with "utf-8" encoding
    file2_input = codecs.open(
        file2, "r",
        "utf-8")  # opens "file2" read-only ("r") with "utf-8" encoding
    nome1 = splitterFileName(file1_input)  # file name without its extension
    nome2 = splitterFileName(file2_input)
    riga1 = file1_input.read()
    riga2 = file2_input.read()
    sent_tokenizer = nltk.data.load(
        'tokenizers/punkt/english.pickle'
    )  # loads the sentence tokenizer used to split the files
    frasi_file1 = sent_tokenizer.tokenize(riga1)  # sentences of file1
    frasi_file2 = sent_tokenizer.tokenize(riga2)  # sentences of file2
    lunghezza_corpus_file1, listaToken_file1, POS_token_tag_file1 = CorpusTokensPOS(
        frasi_file1)  # corpus length, token list and POS tags
    lunghezza_corpus_file2, listaToken_file2, POS_token_tag_file2 = CorpusTokensPOS(
        frasi_file2)
    lista_Solo_POS_file1 = estraiSoloTagPOS(
        POS_token_tag_file1)  # extracts only the PoS tags, without the tokens
    lista_Solo_POS_file2 = estraiSoloTagPOS(POS_token_tag_file2)
    token_20_file1, aggettivi_20_file1, verbi_20_file1, POS_10_file1, trigrammi_10_file1 = analisiFrequenze(
        listaToken_file1, POS_token_tag_file1,
        lista_Solo_POS_file1)  # frequency analysis
    token_20_file2, aggettivi_20_file2, verbi_20_file2, POS_10_file2, trigrammi_10_file2 = analisiFrequenze(
        listaToken_file2, POS_token_tag_file2, lista_Solo_POS_file2)
    bigrammi_POS_file1 = bigrams(
        lista_Solo_POS_file1)  # extracts pairs (<token-PoS, token-PoS>)
    bigrammi_POS_file2 = bigrams(lista_Solo_POS_file2)
    lista_10_bigrammi_PCong_file1, lista_10_bigrammi_PCond_file1 = probabilita(
        lista_Solo_POS_file1, bigrammi_POS_file1)  # computes the probabilities
    lista_10_bigrammi_PCong_file2, lista_10_bigrammi_PCond_file2 = probabilita(
        lista_Solo_POS_file2, bigrammi_POS_file2)
    lista_SOST, lista_SOST_uguali = listaAggSost(POS_token_tag_file1,
                                                 POS_token_tag_file2)
    LMI_AGG_SOST_1 = calcolaLMI(listaToken_file1, POS_token_tag_file1,
                                lista_SOST)
    LMI_AGG_SOST_2 = calcolaLMI(listaToken_file2, POS_token_tag_file2,
                                lista_SOST)
    anlisi_linguistica_file1 = estraiEnitaNominateDiLuoghi(
        POS_token_tag_file1)  # named-entity analysis
    anlisi_linguistica_file2 = estraiEnitaNominateDiLuoghi(POS_token_tag_file2)
    luoghi_10_file1 = nltk.FreqDist(
        anlisi_linguistica_file1["GPE"]).most_common(
            20)  # extracts the 20 most frequent place names
    luoghi_10_file2 = nltk.FreqDist(
        anlisi_linguistica_file2["GPE"]).most_common(20)
    # ANALYSIS RESULTS**********************************
    # 20 TOKENS
    print "\nIl confronto avviene su due corpus (", nome1, ".txt ,", nome2, ".txt) i quali contengono: blog scritti da autori di sesso maschile e blog scritti da autori di sesso femminile.\n"
    # LIST = [frequency, token]
    print "\n\n- I 20 TOKEN PIù FREQUENTI (NO PUNTEGGIATURA) -\n"
    print nome1, "\t\t\t\t\t\t", nome2
    for elemento1, elemento2 in zip(token_20_file1, token_20_file2):
        print " Token --> %-20s Freq --> %-20s" % (
            elemento1[1], elemento1[0]), "Token --> %-20s Freq --> %-20s" % (
                elemento2[1], elemento2[0])
# ADJECTIVES
# LIST = [frequency, token]
    print "\n\n- I 20 AGGETTIVI PIù FREQUENTI -\n"
    print nome1, "\t\t\t\t\t\t", nome2
    for elemento1, elemento2 in zip(aggettivi_20_file1, aggettivi_20_file2):
        print " Token --> %-20s Freq --> %-20s" % (
            elemento1[1][0],
            elemento1[0]), "Token --> %-20s Freq --> %-20s" % (elemento2[1][0],
                                                               elemento2[0])
# VERBS
# LIST = [frequency, token]
    print "\n\n- I 20 VERBI PIù FREQUENTI -\n"
    print nome1, "\t\t\t\t\t\t", nome2
    for elemento1, elemento2 in zip(verbi_20_file1, verbi_20_file2):
        print " Token --> %-20s Freq --> %-20s" % (
            elemento1[1][0],
            elemento1[0]), "Token --> %-20s Freq --> %-20s" % (elemento2[1][0],
                                                               elemento2[0])
# 10 POS
# LIST = [frequency, POS]
    print "\n\n- I 10 POS TAG PIù FREQUENTI -\n"
    print nome1, "\t\t\t\t\t\t", nome2
    for elemento1, elemento2 in zip(POS_10_file1, POS_10_file2):
        print " PoS --> %-20s Freq --> %-20s" % (
            elemento1[1], elemento1[0]), "Token --> %-20s Freq --> %-20s" % (
                elemento2[1], elemento2[0])
# 10 POS-TAG TRIGRAMS
# LIST = [frequency, [POS, POS, POS]]
    print "\n\n- I 10 TRIGRAMMI DI POS TAG PIù FREQUENTI -\n"
    print nome1, "\t\t\t\t\t\t\t\t", nome2
    for elemento1, elemento2 in zip(trigrammi_10_file1, trigrammi_10_file2):
        print " Trigramma --> %-3s - %-3s - %-20s Freq --> %-20s" % (
            elemento1[1][0], elemento1[1][1], elemento1[1][2], elemento1[0]
        ), " Trigramma --> %-3s - %-3s - %-20s Freq --> %-20s" % (
            elemento2[1][0], elemento2[1][1], elemento2[1][2], elemento2[0])
# 10 BIGRAMS + PROBABILITY
# LIST = [probability, [POS, POS]]
# JOINT
    print "\n\n- I 10 BIGRAMMI DI POS TAG CON PROBABILITà CONGIUNTA MASSIMA -\n"
    print nome1, "\t\t\t\t\t\t\t\t\t", nome2
    for elemento1, elemento2 in zip(lista_10_bigrammi_PCong_file1,
                                    lista_10_bigrammi_PCong_file2):
        print " Bigramma --> %-3s - %-20s  P.Congiunta--> %-0s %-20s" % (
            elemento1[1][0], elemento1[1][1], "%1.2f" % elemento1[0],
            "%"), " Bigramma --> %-3s - %-20s  P.Congiunta --> %-0s %-20s" % (
                elemento2[1][0], elemento2[1][1], "%1.2f" % elemento2[0], "%")
    # CONDITIONAL
    print "\n\n- I 10 BIGRAMMI DI POS TAG CON PROBABILITà CONDIZIONATA MASSIMA -\n"
    print nome1, "\t\t\t\t\t\t\t\t\t", nome2
    for elemento1, elemento2 in zip(lista_10_bigrammi_PCond_file1,
                                    lista_10_bigrammi_PCond_file2):
        print " Bigramma --> %-3s - %-20s  P.Condizionata --> %-0s %-20s" % (
            elemento1[1][0], elemento1[1][1], "%1.2f" % elemento1[0], "%"
        ), " Bigramma --> %-3s - %-20s  P.Condizionata  --> %-0s %-20s" % (
            elemento2[1][0], elemento2[1][1], "%1.2f" % elemento2[0], "%")
# 20 NOUNS AND ADJECTIVES
# LIST = [NOUN, [ADJ, LMI]]
    print "\n\n- I 2O SOSTANTIVI PIù FREQUENTI, E PER OGNIUNO I SUOI AGGETTIVI ORDINATI IN BASE ALLA LOCAL MUTUAL INFORMATION -\n"
    print "\nIL NUMERO DEI SOSTANTIVI TOTALI é 20", "MA NE VERRANO STAMPATI SOLO", len(
        lista_SOST
    ), "PERCHè", len(
        lista_SOST_uguali
    ), "SOSTANTIVI SONO PRESENTI NEI 10 SOSTANTIVI PIù FREQUENTI DI ENTRAMBI I CORPUS. QUESTI SOSTANTIVI SONO:"
    for uguale in lista_SOST_uguali:
        print "-", uguale
    print "\n\t\t", nome1, "------------------------------------------------------", nome2
    for lista1, lista2 in zip(LMI_AGG_SOST_1, LMI_AGG_SOST_2):
        print "\n", "\t\t\t---------------------------SOST:", lista1[
            0], "---------------------------"
        for listaValori1, listaValori2 in zip(lista1[1:], lista2[1:]):
            for valore1, valore2 in zip(listaValori1, listaValori2):
                print " \nAGG --> %-10s LMI --> %-10s" % (
                    valore1[0],
                    valore1[1]), "\t\t\t\tAGG --> %-10s LMI --> %-10s" % (
                        valore2[0], valore2[1])


# 20 PROPER PLACE NAMES
    print "\n\n- I 20 NOMI PROPRI DI LUOGO PIù FREQUENTI -\n"
    print nome1, ":", "\t\t\t\t", nome2, ":"
    for elemento1, elemento2 in zip(luoghi_10_file1, luoghi_10_file2):
        print "%-20s Freq --> %-20s" % (
            elemento1[0], elemento1[1]), "%-20s Freq --> %-20s" % (
                elemento2[0], elemento2[1])
Esempio n. 48
import nltk
import nltk.book as book
import string

# Let's do frequency analysis for book 1 (Moby Dick)
text1 = book.text1
concatenated_text = ''.join(text1)  # joins all words into one string, so the FreqDist below counts characters
dis = nltk.FreqDist(concatenated_text)
# dis.plot()

# Now we can get the bigrams
aux = nltk.bigrams(text1)
bigram_frequency = nltk.FreqDist(aux)
print('Most common bigrams')
print(bigram_frequency.most_common(20))  # Now we print the most common bigrams

Esempio n. 49
""" return top 50 common non-content words used in the four columns combined """ 
STOPLIST = set(nltk.corpus.stopwords.words())
def is_content_word(word):
    return word.lower() not in STOPLIST and word[0].isalpha()


dist = nltk.FreqDist([w.lower() for w in vocab if is_content_word(w)])
freq2=dist.most_common(50)
# Oops, the words here are not informative. I will try bigrams instead.

""" bigrams, b_dict returns a dictionary of bigrams each row; b_vocab gives the whole bigrams vocaburary """
b_dict={}
bivocab=[]
for index, row in templist.items():
    filtered_temp =[b for b in list(nltk.bigrams(row)) if is_content_word(b[0]) 
                    and is_content_word(b[1])]
    b_dict.update({index: filtered_temp})
    bivocab+=filtered_temp

dist1 = nltk.FreqDist([b for b in bivocab])
freq0 = dist1.most_common(50)


biig, biigfreq=zip(*freq0)

fig, ax = plt.subplots()
index = np.arange(len(biig))
bar_width = 0.25
opacity = 0.8
Esempio n. 50
def get_bigrams_list(text):
    bigrm = list(nltk.bigrams(text))
    return bigrm
Esempio n. 51
plt.figure(figsize=(30,10))
plt.bar(height, freq)
plt.xticks(height, labels)
plt.ylabel("Occurences")
plt.xlabel("Word/Text")
plt.show()

#Bigrams -- Terms adjacent to each other that occur frequently
with open('collection.json', 'r') as f:
    count_all = Counter()
    for line in f:
        tweet = json.loads(line)
        if "text" in tweet:
            terms = [term for term in tk.tokenize(tweet['text']) if term not in stop and not term.startswith('http') and not term.startswith('@')]
            term_pairs = bigrams(terms)
            count_all.update(term_pairs)
    print (count_all.most_common(20))

#Co-Occurences (Within Tweets)
com = defaultdict(lambda: defaultdict(int))
with open('collection.json', 'r') as f:
    count_all = Counter()
    for line in f:
        tweet = json.loads(line)
        if "text" in tweet:
            terms = [term for term in tk.tokenize(tweet['text']) if term not in stop and not term.startswith('http') and not term.startswith('@')]
            for i in range(len(terms)-1):
                for j in range (i+1, len(terms)):
                    t1, t2 = sorted([terms[i],terms[j]])
                    if t1 != t2:
Esempio n. 52
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 12 19:09:52 2017

@author: chetan
"""

# Count terms only once, equivalent to Document Frequency
terms_single = set(terms_all)
# Count hashtags only
terms_hash = [
    term for term in preprocess(tweet['text']) if term.startswith('#')
]
# Count terms only (no hashtags, no mentions)
terms_only = [
    term for term in preprocess(tweet['text'])
    if term not in stop and not term.startswith(('#', '@'))
]
# mind the ((double brackets))
# startswith() takes a tuple (not a list) if
# we pass a list of inputs

from nltk import bigrams

terms_bigram = bigrams(terms_stop)
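# Note: in recent NLTK versions bigrams() returns a lazy generator, so it has
# to be materialised (e.g. list(terms_bigram)) before it can be inspected or
# counted, and iterating it once consumes it.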
Esempio n. 53
locations = re.sub(r'[^\x00-\x7F]+', "", location)

#apply tokenization, lemmatization, bigrams, and stemmer to look at different sequences of terms; this will determine the best features
tokens = [
    word for sent in nltk.sent_tokenize(str(cleaned_tweets))
    for word in nltk.word_tokenize(sent)
]
for token in sorted(set(tokens))[:30]:
    print 'tokens are: ' + token + ' [' + str(tokens.count(token)) + ']'

lemmatizer = nltk.WordNetLemmatizer()
lemm_tokens = [lemmatizer.lemmatize(t) for t in tokens]
for token in sorted(set(lemm_tokens))[:30]:
    print 'lemm are: ' + token + ', [' + str(lemm_tokens.count(token)) + ']'

bigrams = [" ".join(pair) for pair in nltk.bigrams(tokens)]
# bigramslist = re.sub(',', '', str(bigrams))
print 'bigrams: ', bigrams[:10]

stemmer = SnowballStemmer("english")
stemmed_tokens = [stemmer.stem(t) for t in tokens]
for token in sorted(set(stemmed_tokens))[:30]:
    print 'stems are: ' + token + ' [' + str(stemmed_tokens.count(token)) + ']'

# n = 3
# trigrams = ngrams(str(tokens).split(), n)
# for grams in sorted(set(trigrams))[:20]:
#     print 'tri grams are:', grams

trigrams = [" ".join(pair) for pair in nltk.trigrams(tokens)]
# trigramslist = re.sub(',', '', str(trigrams))
Esempio n. 54
# Finding the transition probabilities
tagged_words = []
all_tags = []
#nltk.corpus.brown.tagged_sents(tagset='universal')[0]

for sent in nltk.corpus.brown.tagged_sents(tagset='universal'):
    tagged_words.append(("START", "START"))
    all_tags.append("START")
    for (word, tag) in sent:
        all_tags.append(tag)
        tagged_words.append((tag, word))
    tagged_words.append(("END", 'END'))
    all_tags.append("END")

cfd_tags = nltk.ConditionalFreqDist(nltk.bigrams(all_tags))
cpd_tags = nltk.ConditionalProbDist(cfd_tags, nltk.MLEProbDist)

print("Count('DET','NOUN') =", cfd_tags['DET']['NOUN'])
print("P('NOUN | 'DET') =", cpd_tags['DET'].prob('NOUN'))

# Finding the emission probabilities
cfd_tagwords = nltk.ConditionalFreqDist(tagged_words)

cpd_tagwords = nltk.ConditionalProbDist(cfd_tagwords, nltk.MLEProbDist)

print("Count('DET','the') =", cfd_tagwords['DET']['the'])
print("P('the'|'DET')=", cpd_tagwords['DET'].prob('the'))

#p56
Esempio n. 55
# # use different tagset on tagged corpora
# print(nltk.corpus.brown.tagged_words(tagset='universal'))
# print(nltk.corpus.treebank.tagged_words(tagset='universal'))

# # tagged corpora for various language in NLTK
## nltk.download('sinica_treebank')
# print(nltk.corpus.sinica_treebank.tagged_words())

# # 2.3 a universal part-of-speech tagset
from nltk.corpus import brown

brown_news_tagged = brown.tagged_words(categories='news', tagset='universal')
tag_fd = nltk.FreqDist(tag for (word, tag) in brown_news_tagged)
print(tag_fd.most_common())

# # 2.4 nouns
word_tag_pairs = nltk.bigrams(brown_news_tagged)
noun_preceders = [a[1] for (a, b) in word_tag_pairs if b[1] == 'NOUN']
fdist = nltk.FreqDist(noun_preceders)
print([tag for (tag, _) in fdist.most_common()])

# # 2.5 verbs
wsj = nltk.corpus.treebank.tagged_words(tagset='universal')
word_tag_fd = nltk.FreqDist(wsj)
print([wt[0] for (wt, _) in word_tag_fd.most_common() if wt[1] == 'VERB'])

cfd1 = nltk.ConditionalFreqDist(wsj)
print(cfd1['yield'].most_common())
print(cfd1['cut'].most_common())
Esempio n. 56
FILE2 = ['doyle-case-27.txt']

for doc in FILE1:
	with open(doc, 'r') as file:
		text = ''.join(file.readlines()).lower().split()
		####Set up dictionary with single word counts
		dict1 = {}
		dict_of_dicts = {}
		totalWords = 0
		for word in text:
			dict1[word] = 0
			dict_of_dicts[word] = {}
		for word in text:
			dict1[word] = dict1[word] + 1
		####Set up dictionary with bigram counts
		bigrams1 = list(nltk.bigrams(text))
		for big in bigrams1:
			dict_of_dicts[big[0]][big[1]] = 0
		for big in bigrams1:
			dict_of_dicts[big[0]][big[1]] += 1
		for prev in dict1:
			for curr in dict_of_dicts[prev]:
				dict_of_dicts[prev][curr] = dict_of_dicts[prev][curr] / dict1[prev]
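		# dict_of_dicts[prev][curr] now holds the MLE estimate of P(curr | prev):
		# the bigram count divided by the unigram count of the preceding word.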
		#for i in range(100):
			#key1 = random.choice(list(dict_of_dicts.keys()))
			#key2 = random.choice(list(dict_of_dicts[key1].keys()))
			#print('p(' + str(key2) + '|' + str(key1) + ') = ' + str(dict_of_dicts[key1][key2]))
#print(bigrams1[2][0])
#print(dict_of_dicts)
for doc in FILE2:
	with open(doc, 'r') as file:
Esempio n. 57
def _update_ngram_database(notes_directory, ngram_db_dir):

    line_tokenizer = LineTokenizer(blanklines='discard')
    word_tokenizer = WhitespaceTokenizer()

    grep_command = 'find {} | grep ".note$"'.format(notes_directory)

    proc = Popen(
        grep_command,
        stdout=PIPE, stderr=PIPE,
        shell=True)
    output, err = proc.communicate()
    all_notes_files = output.decode().split('\n')

    '''
    Create master list of all raw tokens. Will look like:
        tokens = {
            'unigrams': ['all', 'unigrams'],
            'bigrams': [('all', 'bigrams')],
            'trigrams': [('all', 'the', 'trigrams')]
        }
    '''

    tokens = {
        'unigrams': [],
        'bigrams': [],
        'trigrams': []
    }

    for note_file in all_notes_files:

        if not note_file:
            continue

        with codecs.open(note_file, mode="r", encoding="utf-8") \
                as note_file_object:
            note_file_content = note_file_object.read()

        note_file_content = note_file_content.lower()

        lines = line_tokenizer.tokenize(note_file_content)
        for line in lines:

            sentences = sent_tokenize(line)
            for sentence in sentences:

                sentence_safe_split = []

                all_words = word_tokenizer.tokenize(sentence)
                for word in all_words:

                    # Skip any word with a forbidden character
                    if any([char in word for char in FORBIDDEN_CHARS]):
                        continue

                    has_letters = False
                    for char in word:
                        if char.isalpha():
                            has_letters = True
                            break

                    if word and has_letters:
                        sentence_safe_split.append(word)

                tokens['unigrams'].extend(sentence_safe_split)
                tokens['bigrams'].extend(bigrams(sentence_safe_split))
                tokens['trigrams'].extend(trigrams(sentence_safe_split))

    '''
    Squash the list of tokens into a dict that tracks
    the number of occurrences of each token. Will look like:
    tokens = {
        'unigrams': {
            'foo': 17,
            'bar': 42,
            ...
        },
        ...
    }
    '''

    for token_type in tokens.keys():

        all_tokens_of_type = tokens[token_type]
        weighted_tokens = {}

        for single_token in all_tokens_of_type:
            if not isinstance(single_token, str):
                single_token = ' '.join(single_token)
            if not weighted_tokens.get(single_token):
                weighted_tokens[single_token] = 1
            else:
                weighted_tokens[single_token] = weighted_tokens[single_token]+1

        tokens[token_type] = OrderedDict(sorted(
            weighted_tokens.items(),
            key=lambda t: t[1],
            reverse=True))

    # Write Unigrams to Disk
    unigrams_json_file_path = ngram_db_dir + '/unigrams.json'
    unigrams_text_file_path = ngram_db_dir + '/unigrams.txt'
    with open(unigrams_json_file_path, 'w') as unigrams_json_file_object:
        json.dump(tokens['unigrams'], unigrams_json_file_object)
    with codecs.open(unigrams_text_file_path, mode="w", encoding="utf-8") \
            as unigrams_text_file_object:
        for unigram, frequency in tokens['unigrams'].items():
            unigrams_text_file_object.write(unigram + '\n')

    # Write Bigrams to Disk
    bigrams_json_file_path = ngram_db_dir + '/bigrams.json'
    bigrams_text_file_path = ngram_db_dir + '/bigrams.txt'
    with open(bigrams_json_file_path, 'w') as bigrams_json_file_object:
        json.dump(tokens['bigrams'], bigrams_json_file_object)
    with codecs.open(bigrams_text_file_path, mode="w", encoding="utf-8") \
            as bigrams_text_file_object:
        for bigram, frequency in tokens['bigrams'].items():
            bigrams_text_file_object.write(bigram + '\n')

    # Write Trigrams to Disk
    trigrams_json_file_path = ngram_db_dir + '/trigrams.json'
    trigrams_text_file_path = ngram_db_dir + '/trigrams.txt'
    with open(trigrams_json_file_path, 'w') as trigrams_json_file_object:
        json.dump(tokens['trigrams'], trigrams_json_file_object)
    with codecs.open(trigrams_text_file_path, mode="w", encoding="utf-8") \
            as trigrams_text_file_object:
        for trigram, frequency in tokens['trigrams'].items():
            trigrams_text_file_object.write(trigram + '\n')
Esempio n. 58
def extract_bigrams(text):
    tokens = word_tokenize(text)
    return [gram[0] + ' ' + gram[1] for gram in bigrams(tokens)]
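# Example: extract_bigrams("natural language processing")
# -> ['natural language', 'language processing']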
Esempio n. 59
for word in data:
    tokens.append(word[:-1])

print "after read tokens"

for x in xrange(len(tokens)):  # count the frequency of each word
    if (tokens[x] in words):
        words[tokens[x]] += 1
    else:

        words[tokens[x]] = 1

words[("UNK", "UNK")] = 0

print "after words"
#dic=defaultdict(lambda: defaultdict(lambda: 1)) #make the matrix
dic = {}
one = list(bigrams(tokens))
print "after bigram"

n = 1812418
compute()

pre("  At the ")
pre(" it is ")
"""with open('dic', 'wb') as file:
 
  pickle.dump(dic, file)"""
elapsed_time = time.time() - start_time
print "time elapsed  ", elapsed_time
Esempio n. 60
import nltk


def generate_model(cfdist, word, num=15):
    for i in range(num):
        print word,
        word = cfdist[word].max()


text = nltk.corpus.genesis.words('english-kjv.txt')
bigram_pairs = nltk.bigrams(text)
cfd = nltk.ConditionalFreqDist(bigram_pairs)
#cfd.plot()
#print(cfd)
generate_model(cfd, 'living')
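# Note: generate_model() is greedy - it always follows the single most frequent
# successor - so the output soon falls into a repeating loop of high-frequency
# words; sampling from cfd[word] instead would give more varied text.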