def compare_pos(file_name_1, file_name_2):

    tokens_1 = make_tokens(file_name_1)
    tokens_2 = make_tokens(file_name_2)

    tri_tokens_1  = trigrams(tokens_1)
    tri_tokens_2  = trigrams(tokens_2)

    dist_1 = nltk.FreqDist(tri_tokens_1)
    dist_2 = nltk.FreqDist(tri_tokens_2)

    diff_1 = dist_1 - dist_2
    diff_2 = dist_2 - dist_1

    with open("common_pos_mt.txt", "w") as file:
        for word, freq in diff_1.most_common(20):
            line = str(word) + " " + str(freq) + '\n'
            print(line)
            file.write(line)

    with open("common_pos_hmn.txt", "w") as file:
        for word, freq in diff_2.most_common(20):
            line = str(word) + " " + str(freq) + '\n'
            print(line)
            file.write(line)


    """
    def train(self,tweets):
        # 1st step: build the bag-of-words model
        tweet_tokens_list = [tweet_tokens for tweet_tokens,label in tweets]
        tokens = []
        print('Computing the trainset vocabulary of n-grams')
        for tweet_tokens in tweet_tokens_list:
            unigrams = [w.lower() for w,t in tweet_tokens]
            tokens += unigrams
            tokens += ['_'.join(b) for b in bigrams(unigrams)]
            tokens += ['_'.join(t) for t in trigrams(unigrams)]
            tokens += [t1 + '_*_' + t3 for t1,t2,t3 in trigrams(unigrams)]

        # build the bag-of-words list using all the tokens
        self.bag_of_words = set(tokens)

        data = list()
        total_tweets = len(tweets)
        features_list = list()
        for index,(tweet_tokens,label) in enumerate(tweets):
            print('Training for tweet n. {}/{}'.format(index+1,total_tweets))
            features_list.append(self.extract_features(tweet_tokens))

        # Train a SVM classifier
        #data = self.vectorizer.fit_transform([features for features,label in self.train_set_features])
        print('Vectorizing the features')
        data = self.vectorizer.fit_transform(features_list)
        target = self.encoder.fit_transform([label for tweet_tokens,label in tweets])
        print('Building the model')
        self.classifier.fit(data, target)
Example #3
def main():
	text = open('holmes.txt').read()
	tokens = nltk.wordpunct_tokenize(text)
	charList = []
	for word in tokens:
		for char in word:
			charList.append(char)
	fDistChars = nltk.FreqDist(charList)
	fDistWords = nltk.FreqDist(tokens)
	
	print("Answer to 1A, there are {} character types in the book, namely: \n{}".format(len(fDistChars),sorted(fDistChars)))
	print("\nAnswer to 1B, there are {} word types in the book, namely: \n{}".format(len(fDistWords),sorted(fDistWords)))
	
	bigramChars = nltk.bigrams(charList)
	trigramChars = nltk.trigrams(charList)

	print("\nAnswer to 1C, the 20 most common characters are: \nUnigrams: \n{}\nBigrams: \n{}\nTrigrams: \n{}".format(most_common(charList), 
		most_common(bigramChars), most_common(trigramChars)))

	bigramWords = nltk.bigrams(tokens)
	trigramWords = nltk.trigrams(tokens)

	print("\nAnswer to 1D, the 20 most common words are: \nUnigrams: \n{}\nBigrams: \n{}\nTrigrams: \n{}".format(most_common(tokens), 
		most_common(bigramWords), most_common(trigramWords)))
	
	bigram_measures = nltk.collocations.BigramAssocMeasures()
	finder = BigramCollocationFinder.from_words(tokens)
	scoredPMI = finder.score_ngrams(bigram_measures.pmi)
	scoredCHI = finder.score_ngrams(bigram_measures.chi_sq)
	
	print("\nAnswer to 2, the 20 most likely collocations are:\nPMI:\n{} \nChi's square\n{}" .format(scoredPMI[:20],scoredCHI[:20]))
	
	print("\nSpearmans correlation = {}".format(nltk.metrics.spearman.spearman_correlation(scoredPMI, scoredCHI)))
    def extract_features(self, tweet_tokens):

        if len(self.bag_of_words) == 0:
            print('Bag-of-Words empty!')

        unigrams = [w.lower() for w,t in tweet_tokens]
        tokens = unigrams
        tokens += ['_'.join(b) for b in bigrams(unigrams)]
        tokens += ['_'.join(t) for t in trigrams(unigrams)]
        tokens += [t1 + '_*_' + t3 for t1,t2,t3 in trigrams(unigrams)]

        tweet_tags =  [tag for token, tag in tweet_tokens]

        feature_set = {}

        # 1st set of features: bag-of-words
        for token in set(tokens).intersection(self.bag_of_words):
            feature_set['has_'+token] = True

        # 2nd set of features: the count for each tag type present in the message
        # Tweet_nlp tagset. Info:
        # http://www.ark.cs.cmu.edu/TweetNLP/annot_guidelines.pdf
        for tag in ['N','O','^','S','Z','V','A','R','!','D','P','&','T','X','#','@','~','U','E','$',',','G','L','M','Y']:
            feature_set['num_'+tag] = sum([1 for t in tweet_tags if t == tag])

        # 3rd feature: is a negator present?
        negators = set(LexiconClassifier().read_negation_words())
        if len(negators.intersection(set(tokens))) > 0:
            feature_set['has_negator'] = True

        # 4th feature: character n-grams (here: any letter repeated three or more times, i.e. elongation)
        regexp = re.compile(r"([a-z])\1{2,}")
        feature_set['has_char_ngrams'] = False
        for token,tag in tweet_tokens:
            if regexp.search(token):
                feature_set['has_char_ngrams'] = True
                break

        # 5th feature: punctuation n-grams (repeated '!' or '?', e.g. '!!!' or '???')
        regexp = re.compile(r"([!\?])\1{2,}")
        feature_set['has_punct_ngrams'] = False
        for token,tag in tweet_tokens:
            if regexp.search(token):
                feature_set['has_punct_ngrams'] = True
                break

        # 6th feature: the number of all upper cased words
        feature_set['num_all_caps'] = sum([1 for token,tag in tweet_tokens if token.isupper() and len(token)>=3])

        # 7th and 8th feature: the positive and negative score from lexicon
        # classifier (i.e., number of positive and negative words from lexicon)
        positive_score, negative_score = self.lexicon_classifier.classify(tweet_tokens)
        feature_set['pos_lexicon'] = positive_score
        feature_set['neg_lexicon'] = -1 * negative_score

        return feature_set
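# Quick standalone check of the elongation pattern used for the 4th feature above
# (illustrative only, not part of the original class):
import re

elongation = re.compile(r"([a-z])\1{2,}")
print(bool(elongation.search("sooooo good")))   # True: 'o' repeated three or more times
print(bool(elongation.search("soon")))          # False: only two consecutive 'o's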
def jacquard_trigram(query):
    final=[]
    for a in file('enwiktionary.a.list'):
        a=a.rstrip()
        trigram=set(nltk.trigrams(a))
        q_trigram=set(nltk.trigrams(query))
        intersect=q_trigram.intersection(trigram)
        union=q_trigram.union(trigram)
        sim=float(len(intersect))/len(union)
        
        final.append([a,sim])
    final_sorted= sorted(final,key=lambda sim:sim[1], reverse=True)
    print final_sorted[:10]
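# A minimal, self-contained Python 3 illustration of the same Jaccard-over-
# character-trigrams idea (the 'enwiktionary.a.list' file is not needed here):
import nltk

def char_trigram_jaccard(a, b):
    ta, tb = set(nltk.trigrams(a)), set(nltk.trigrams(b))
    return len(ta & tb) / len(ta | tb) if (ta or tb) else 0.0

print(char_trigram_jaccard("apple", "applesauce"))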
Example #6
def main():

    OUT = open("../output.txt", "w")
    OUT.close()
    INP = open("../data/test.hyp1-hyp2-ref", "r")
    inp = INP.read()
    for sent in inp.split("\n")[:-1]:
        h1 = sent.split(" ||| ")[0].split(" ")
        h2 = sent.split(" ||| ")[1].split(" ")
        ref = sent.split(" ||| ")[2].split(" ")
        h1p = process(h1)
        h2p = process(h2)
        refp = process(ref)
        #print(h1c, h2c, refc)
        #h1_match = word_matches(h1, rset)
        #h2_match = word_matches(h2, rset)
        h1c = Counter(h1)
        h2c = Counter(h2)
        refc = Counter(ref)
        h1_bigrams = nltk.bigrams(h1)
        h2_bigrams = nltk.bigrams(h2)
        ref_bigrams = nltk.bigrams(ref)
        h1_trigrams = nltk.trigrams(h1)
        h2_trigrams = nltk.trigrams(h2)
        ref_trigrams = nltk.trigrams(ref)
        #print(h_bigrams, ref_bigrams)
        h1_bigramsc = Counter(h1_bigrams)
        h2_bigramsc = Counter(h2_bigrams)
        ref_bigramsc = Counter(ref_bigrams)
        h1_trigramsc = Counter(h1_trigrams)
        h2_trigramsc = Counter(h2_trigrams)
        ref_trigramsc = Counter(ref_trigrams)
        h1_allc = h1c + h1_bigramsc + h1_trigramsc
        h2_allc = h2c + h2_bigramsc + h2_trigramsc
        ref_allc = refc + ref_bigramsc + ref_trigramsc
        h1_precision = precision(h1_allc, ref_allc)
        h2_precision = precision(h2_allc, ref_allc)
        h1_recall = recall(h1_allc, ref_allc)
        h2_recall = recall(h2_allc, ref_allc)
        h1_meteor = meteor(h1_precision, h1_recall)
        h2_meteor = meteor(h2_precision, h2_recall)
        OUT = open("../output.txt", "a")

        if h1_meteor > h2_meteor:
            OUT.write("-1\n")
        else:
            if h1_meteor == h2_meteor:
                OUT.write("0\n")
            else:
                OUT.write("1\n")
        OUT.close()
Example #7
def calc_probabilities(training_corpus):
    unigram_c = collections.defaultdict(int)
    bigram_c = collections.defaultdict(int)
    trigram_c = collections.defaultdict(int)

    for sentence in training_corpus:
        tokens0 = sentence.strip().split()
        tokens1 = tokens0 + [STOP_SYMBOL]
        tokens2 = [START_SYMBOL] + tokens0 + [STOP_SYMBOL]
        tokens3 = [START_SYMBOL] + [START_SYMBOL] + tokens0 + [STOP_SYMBOL]
        # unigrams
        for unigram in tokens1:
            unigram_c[unigram] += 1

        # bigrams
        for bigram in nltk.bigrams(tokens2):
            bigram_c[bigram] += 1

        # trigrams
        for trigram in nltk.trigrams(tokens3):
            trigram_c[trigram] += 1

    unigrams_len = sum(unigram_c.itervalues())
    unigram_p = {k: math.log(float(v) / unigrams_len, 2) for k, v in unigram_c.iteritems()}

    # calc P(W2|W1) = P(W2,W1) / P(W1) = C(W2,W1) / C(W1)
    unigram_c[START_SYMBOL] = len(training_corpus)
    bigram_p = {k: math.log(float(v) / unigram_c[k[0]], 2) for k, v in bigram_c.iteritems()}

    bigram_c[(START_SYMBOL, START_SYMBOL)] = len(training_corpus)
    trigram_p = {k: math.log(float(v) / bigram_c[k[:2]], 2) for k, v in trigram_c.iteritems()}
    return unigram_p, bigram_p, trigram_p
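# A Python 3 sanity check of the same counting scheme on a toy corpus.
# The START_SYMBOL/STOP_SYMBOL values below are assumptions for illustration only:
import collections
import math

import nltk

START_SYMBOL, STOP_SYMBOL = '*', 'STOP'
toy_corpus = ["the dog barks", "the cat sleeps"]

bigram_c = collections.Counter()
trigram_c = collections.Counter()
for sentence in toy_corpus:
    tokens = [START_SYMBOL, START_SYMBOL] + sentence.split() + [STOP_SYMBOL]
    bigram_c.update(nltk.bigrams(tokens))
    trigram_c.update(nltk.trigrams(tokens))

# log2 P(w3 | w1, w2) = log2( C(w1, w2, w3) / C(w1, w2) )
trigram_p = {t: math.log(c / float(bigram_c[t[:2]]), 2) for t, c in trigram_c.items()}
print(trigram_p[('*', '*', 'the')])   # 0.0, since both toy sentences start with 'the'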
Example #8
    def ngramify(self, word_list):
        """
            Transforms word_list into unigrams, bigrams, or trigrams

            input:
                list of words
        """

        # creates an ngram from a word_list based on class settings
        mode = self.mode
        pos = self.inclued_pos
        word = self.include_word
        if word and pos:
            selection = [(w.lower(), p) for w, p in word_list]
        elif word:
            selection = [w.lower() for w, p in word_list]
        elif pos:
            selection = [p for w, p in word_list]

        if mode == "unigrams":
            word_list = selection
        elif mode == "bigrams":
            word_list = nltk.bigrams(selection)
        elif mode == "trigrams":
            word_list = nltk.trigrams(selection)
        return word_list
Example #9
def calc_trigrams(brown_tags):
    '''
    Calculate the log-probabilities of tag trigrams.
    :param brown_tags: List of 'sentence tags List' [ [], [] .. ]
    :return: tag trigram probability dictionary
    '''
    unigram_p, bigram_p, trigram_p = {}, {}, {}
    unigram_c, bigram_c, trigram_c = Counter(), Counter(), Counter()

    # flatten brown tags since it's list of tag lists.
    brown_tags_flat = [item for sublist in brown_tags for item in sublist]

    unigram_c.update(brown_tags_flat)    # unigram
    bigram_c.update(nltk.bigrams(brown_tags_flat))   # bigram
    trigram_c.update(nltk.trigrams(brown_tags_flat))    # trigram

    unigram_len, bigram_len, trigram_len = sum(unigram_c.values()), sum(bigram_c.values()), sum(trigram_c.values())
    # prepare unigram log probabilities -> P(Wi) = c(Wi) / V
    for unigram, count in unigram_c.iteritems():
        unigram_p[(unigram,)] = math.log(count / float(unigram_len-32491), 2)

    # prepare bigram log probabilities -> P(Wi|Wi-1) = c(Wi-1,Wi)/c(Wi-1)
    for bigram, count in bigram_c.iteritems():
        bigram_p[bigram] = math.log(count / float(unigram_c[bigram[0]]), 2)

    # prepare trigram log probabilities -> P(Wi|Wi-2, Wi-1) = c(Wi-2,Wi-1,Wi)/c(Wi-2,Wi-1)
    for trigram, count in trigram_c.iteritems():
        trigram_p[trigram] = math.log(count / float(bigram_c[trigram[:2]]), 2)

    return trigram_p
Example #10
def linearscore(unigrams, bigrams, trigrams, corpus):
    """Linear interpolate the probabilities.

    See http://web.stanford.edu/~jurafsky/slp3/4.pdf paragraph 4.4.3
    """
    scores = []
    # Use the same lambda for all three n-gram orders so the weights sum to 1.
    lambda_ = 1.0 / 3
    for sentence in corpus:
        interpolated_score = 0
        tokens0 = sentence.strip().split()
        for trigram in nltk.trigrams([START_SYMBOL] + [START_SYMBOL] + tokens0 + [STOP_SYMBOL]):
            try:
                p3 = trigrams[trigram]
            except KeyError:
                p3 = MINUS_INFINITY_SENTENCE_LOG_PROB
            try:
                p2 = bigrams[trigram[1:3]]
            except KeyError:
                p2 = MINUS_INFINITY_SENTENCE_LOG_PROB
            try:
                p1 = unigrams[trigram[2]]
            except KeyError:
                p1 = MINUS_INFINITY_SENTENCE_LOG_PROB
            interpolated_score += math.log(lambda_ * (2 ** p3) + lambda_ * (2 ** p2) + lambda_ * (2 ** p1), 2)
        scores.append(interpolated_score)
    return scores
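# Numeric illustration of the interpolation step with made-up log2 probabilities
# (values chosen arbitrarily for the example):
import math

lambda_ = 1.0 / 3
p1, p2, p3 = -8.0, -5.0, -3.0   # hypothetical unigram, bigram, trigram log2 probs
interpolated = math.log(lambda_ * (2 ** p3) + lambda_ * (2 ** p2) + lambda_ * (2 ** p1), 2)
print(round(interpolated, 3))    # roughly -4.227, dominated by the strongest n-gram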
Example #11
    def _find_names_in_tokens(self, tokens):
        """Returns tuple
        Takes the list of all tokens from a document and returns a tuple
        of found names. The first element is an alphabetised list of unique
        names, the second -- the names in the order of their occurrence in the
        document, the third -- the offsets for each mention of a name in the document.

        Arguments:
        tokens -- list with all tokens from the searched document

        """
        self._index_dict = create_index(tokens)
        token_string = " ".join(tokens)

        if len(tokens) == 2:
            if (self._is_like_binomial(tokens[0], tokens[1])
                    and self._is_a_name(token_string, tokens, 0, 1)):
                self._names_list.append(token_string)
        elif len(tokens) == 1:
            if (len(tokens[0]) > 2
                and tokens[0][0].isupper()
                and tokens[0].isalpha()
                and self._is_not_in_black_list(tokens[0])
                and self._is_a_name(tokens[0], tokens, 0, 0)):
                self._names_list.append(tokens[0])
        else:
            trigrams = nltk.trigrams(tokens)
            self._walk_trigrams(trigrams, tokens)
            self._check_last_bigram_unigram(trigrams[-1], tokens)
        return self._generate_output()
Example #12
    def ngramify(self, word_list, stop):
        # creates an ngram from a word_list based on class settings
        mode = self.mode
        pos = self.inclued_pos
        word = self.include_word
        stopset = set(stopwords.words("english"))
        stopset.remove("not")
        if stop:
            if word and pos:
                selection = [(w.lower(), p) for w, p in word_list if w.lower() not in stopset]
            elif word:
                selection = [w.lower() for w, p in word_list if w.lower() not in stopset]
            elif pos:
                selection = [p for w, p in word_list if w.lower() not in stopset]
        else:
            if word and pos:
                selection = [(w.lower(), p) for w, p in word_list]
            elif word:
                selection = [w.lower() for w, p in word_list]
            elif pos:
                selection = [p for w, p in word_list]

        if mode == "unigrams":
            word_list = selection
        elif mode == "bigrams":
            word_list = nltk.bigrams(selection)
        elif mode == "trigrams":
            word_list = nltk.trigrams(selection)
        return word_list
def getTriGramsFromComments(text):
    # split the texts into tokens
    tokens = nltk.word_tokenize(text)
    tokens = [token.lower() for token in tokens if len(token) > 1] #same as unigrams
    tri_tokens = trigrams(tokens)
    fdist = nltk.FreqDist(tri_tokens)
    return fdist
def exercise2(category):
    print
    print "For Category: " + category
    print "Part 1"
    print "Words with the tag 'JJ':"
    words = bn.tagged_words(categories = category)
    wordlist = bn.words(categories = category)
    words_JJ = set(sorted([(word, tag) for (word, tag) in words if tag == 'JJ']))
    print len(words_JJ)
    print
    print "Part 2"
    print "Words with tags 'VBZ' -> 3rd Person Singular Verbs or ('NNPS' or 'NNS') -> plural nouns:"
    words_VBP_NNPS_NNS = [(word, tag) for (word, tag) in words if tag == 'VBZ' or tag == 'NNPS' or tag == 'NNS']
    print words_VBP_NNPS_NNS[:10]
    print
    sent = ""
    print "Part 3"
    print "The 3 most frequent 3-word prepositional phrases are:"
    words = bn.tagged_words(categories = category)
    for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(words):
        if(t1.startswith('IN') and t2.startswith('AT') and t3.startswith('NN')):
            sent = sent + w1.lower() + " " + w2.lower() + " " + w3.lower() + "."
    sent_part = sent.split(".")
    fd = nltk.FreqDist(sent_part)
    v = fd.most_common(3)
    print v
    print
    print "Part 4"
    print "Ratio of Masculine to Feminine is:"
    male_pattern = r'\bhe\b|\bhis\b|\bhim\b|\bhimself\b'
    female_pattern = r'\bshe\b|\bher\b|\bhers\b|\bherself\b'
    male_pronouns = len([w for w in wordlist if re.search(male_pattern, w.lower())])
    female_pronouns = len([w for w in wordlist if re.search(female_pattern, w.lower())])
    print "Male : Female is -> %d : %d" %(male_pronouns, female_pronouns)
    print
Example #15
    def trigrams(self, unigrams):
        """
        Generate trigrams from unigrams
        @param unigrams: unigram words
        @type unigrams: C{list}
        """
        return nltk.trigrams(unigrams)
def calcSentProb(sent, NGramProbDict, n):
    '''
    Look up each tag-ngram (trigrams here) in the target sentence in the
    ngrams log-prob dictionary; if found, add log-prob to total, else use
    the default prob;
    '''
    prob    = 0.0
    count   = 0
    if len(sent)< 2:
        prob = -12
        count = 1
    elif len(sent) < 3 or n == 2:
        for (w1, t1), (w2, t2) in nltk.bigrams(sent):
            if (t1, t2) in NGramProbDict:
                prob += NGramProbDict[(t1, t2)]
            else:
                prob += bi_default_prob
            count += 1
    elif n == 3:
        for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(sent):
            if (t1, t2, t3) in NGramProbDict:
                prob += NGramProbDict[(t1, t2, t3)]
            else:
                prob += tri_default_prob
            count += 1
    return float(prob) / count
    def filter_input_file(self, input_file):

        with open(input_file) as f:

            # Break apart file into list of single words, bigrams, and trigrams
            full_text = f.read()
            words = nltk.word_tokenize(full_text)

            filtered_trigrams = [' '.join(tgram) for tgram in nltk.trigrams(words) if self.query(' '.join(tgram))]
            filtered_bigrams = [' '.join(bgram) for bgram in nltk.bigrams(words) if self.query(' '.join(bgram))]
            filtered_words = [word for word in words if self.query(word)]

            filtered = filtered_trigrams + filtered_bigrams + filtered_words

            new_text = ""

            for word in words:
                if word in filtered:
                    new_text += "**** "
                else:
                    new_text += word + " "

            # out_file = open('bloom-output-200bits-10hashes', 'w')
            # out_file.write(new_text)
            # out_file.close()

            for line in textwrap.wrap(new_text, 140):
                print(line)

            return filtered
Example #18
def ngrams_freq(tokens):
    trigrams  = nltk.trigrams(tokens)
    fdist = nltk.FreqDist(trigrams)
    dd = {}
    for k,v in fdist.items():
        dd[k] = v
    return dd
Example #19
    def pos_tagger(self):
        tweets = []
        for tw in self.tweet_original:
            try:
                tw = tw.decode('unicode_escape').encode('ascii','ignore')
            except:
                tw = re.sub(r'\\+', '', tw)
                tw = tw.decode('unicode_escape').encode('ascii','ignore')
            tweets.append(tw)

        # tweets = [tw.encode('utf8') for tw in self.tweet_original[:3]]
        sent_tags = CMUTweetTagger.runtagger_parse(tweets)
        # fil_tweet = open('tweet_tags.json','w')
        i = 0
        for sent in sent_tags:
            unigrams = [tag_tuple[1] for tag_tuple in sent]
            bigrams = set(nltk.bigrams(unigrams))
            trigrams = set(nltk.trigrams(unigrams))
            self.tweet_unigram[self.tweet_id[i]] = set(unigrams)
            self.tweet_bigram[self.tweet_id[i]] = bigrams
            self.tweet_trigram[self.tweet_id[i]] = trigrams

            self.tweet_feature_list.extend(unigrams)
            self.tweet_feature_list.extend(bigrams)
            self.tweet_feature_list.extend(trigrams)

            i += 1
        #json.dump(self.tweet_unigram,fil_tweet)
        self.tweet_feature_list = list(set(self.tweet_feature_list))
Example #20
	def get_classification(self, text):
		text = ut.clean(text)
	
		uni = nltk.tokenize.word_tokenize(text)
		
		bi = nltk.bigrams (uni)
		tri = nltk.trigrams (uni)
		
		temp_lambda = self.lambda_pi
		
		# Map to store answer to its divergence pairs
		list_of_ans = dict()
		
		for (ques, ans) in self.training_set:
			
			fin_val = 0.0
		
			for t in uni:
				fin_val += temp_lambda[5] * (float(self.unigram_tot_dict.get(t,0))/self.len)
				fin_val += temp_lambda[4] * (float(self.unigram_dict.get((ques,t),0))/len(ques))
			
			for t in bi:
				fin_val += temp_lambda[3] * (float(self.bigram_tot_dict.get(t,0))/self.unigram_tot_dict.get(t[:1],1))
				fin_val += temp_lambda[2] * (float(self.bigram_dict.get((ques,t),0))/self.unigram_dict.get((ques,t[:1]),1)) 
			
			for t in tri:
				fin_val += temp_lambda[1] * (float(self.trigram_tot_dict.get(t,0))/self.bigram_tot_dict.get(t[:2],1))
				fin_val += temp_lambda[0] * (float(self.trigram_dict.get((ques,t),0))/self.bigram_dict.get((ques,t[:2]),1))		
			
			list_of_ans[self.training_orig.get(ans, ans)] = fin_val
		
		# Return Weighted list of responses
		return list_of_ans
def calc_trigrams(brown_tags):
    #print brown_tags[0]
    #q_values = {}
    #unigram_c = collections.defaultdict(int)
    bigram_c = collections.defaultdict(int)
    trigram_c = collections.defaultdict(int)

    for stags in brown_tags:
        unigram_tuples = stags
        bigram_tuples =  list(nltk.bigrams(stags))
        trigram_tuples = list(nltk.trigrams(stags))


        #print unigram_tuples
        #for g in unigram_tuples:
            #unigram_c[g] += 1

        for g in bigram_tuples:
            bigram_c[g] += 1

        for g in trigram_tuples:
            trigram_c[g] += 1

    bigram_c[(START_SYMBOL, START_SYMBOL)] = len(brown_tags)
    q_values = {k: math.log(float(v) / bigram_c[k[:2]], 2) for k, v in trigram_c.iteritems()}

    return q_values
def calc_probabilities(training_corpus):
    unigram_p = {}
    bigram_p = {}
    trigram_p = {}
    total_unigram=0
    unigram_freq=Counter()
    bigram_freq=Counter()
    trigram_freq=Counter()
    u_freq=Counter()
    for line in training_corpus:
        line=START_SYMBOL+" "+ line+STOP_SYMBOL
        unigram_tokens=line.split()
        unigram_freq.update(unigram_tokens)
        total_unigram=total_unigram+len(unigram_tokens)
    for sent in training_corpus:
        sent=START_SYMBOL+" "+ START_SYMBOL+" "+sent+STOP_SYMBOL
        unigram_tokens=sent.split()
        u_freq.update(unigram_tokens)
        bigram_tuples=list(nltk.bigrams(unigram_tokens))
        bigram_freq.update(bigram_tuples)
        trigram_tuples=list(nltk.trigrams(unigram_tokens))
        trigram_freq.update(trigram_tuples)

    for key in unigram_freq:
        unigram_p[(key,)]= math.log(unigram_freq[key]/float(total_unigram),2)

    for key in bigram_freq:
        bigram_p[key]= math.log(bigram_freq[key]/float(u_freq[key[0]]),2)
    
    for key in trigram_freq:
        trigram_p[key]=math.log(trigram_freq[key]/float(bigram_freq[key[0],key[1]]),2)

    
    return unigram_p, bigram_p, trigram_p
def linearscore(unigrams, bigrams, trigrams, corpus):
    scores = []
    lamb=float(1)/3;
    for line in corpus:
        line = START_SYMBOL + " " + START_SYMBOL + " " + line + " " + STOP_SYMBOL
        tokens = line.split()
        trigram = list(nltk.trigrams(tokens))
        prob = 0
        for t in trigram:
            c = t[2]
            b = t[1]
            a = t[0]
            #tri= pow(2,trigrams[(a,b,c)])
            #bi= pow(2,bigrams[(b,c)])
            #uni= pow(2,unigrams[(c,)])
            try:
                prob = prob + math.log(lamb*(pow(2, trigrams[(a, b, c)]) + pow(2, bigrams[(b, c)]) + pow(2, unigrams[(c,)])), 2)
                #prob= prob + math.log(lamb*(tri+bi+uni),2)
            except KeyError:
                prob = MINUS_INFINITY_SENTENCE_LOG_PROB
                break

        scores.append(prob)

    return scores
Example #24
def calc_trigrams(brown_tags):
    q_values = {}
    unigram_count = {}
    bigram_count = {}
    trigram_count = {}

    for tag_list in brown_tags:
        unigram_tuples = [(word,) for word in tag_list]
        bigram_tuples = list(nltk.bigrams(tag_list))
        trigram_tuples = list(nltk.trigrams(tag_list))

        for word in unigram_tuples:
            if word in unigram_count:
                unigram_count[word] += 1
            else:
                unigram_count[word] = 1

        for word in bigram_tuples:
            if word in bigram_count:
                bigram_count[word] += 1
            else:
                bigram_count[word] = 1

        for word in trigram_tuples:
            if word in trigram_count:
                trigram_count[word] += 1
            else:
                trigram_count[word] = 1

    for word in trigram_count:
        q_values[word] = math.log(float(trigram_count[word])/bigram_count[(word[0], word[1])], 2)
   
    return q_values
def calc_trigrams(brown_tags):
    q_values = {}
    
    bigram_count = {}
    trigram_count = {}

    for item in brown_tags:
	bigram_tmp = nltk.bigrams(item)
	trigram_tmp = nltk.trigrams(item)

	for bigram in bigram_tmp:
	    if bigram in bigram_count:
		bigram_count[bigram] += 1
	    else:
		bigram_count[bigram] = 1

	for trigram in trigram_tmp:
	    if trigram in trigram_count:
		trigram_count[trigram] += 1
	    else:
		trigram_count[trigram] =1

    for trigram in trigram_count:
	q_values[trigram] = math.log(trigram_count[trigram], 2) - math.log(bigram_count[trigram[:2]],2)
    return q_values
def calc_trigrams(brown_tags):

    q_values = {}

    trigram_tags = list(nltk.trigrams(brown_tags))
    bigram_tags = list(nltk.bigrams(brown_tags))

    trigram_tags_count = {}
    bigram_tags_count = {}

    for trigram_tag in trigram_tags:
        if trigram_tag not in trigram_tags_count:
            trigram_tags_count[trigram_tag] = 1
        else:
            trigram_tags_count[trigram_tag] += 1

    for bigram_tag in bigram_tags:
        if bigram_tag not in bigram_tags_count:
            bigram_tags_count[bigram_tag] = 1
        else:
            bigram_tags_count[bigram_tag] += 1

    for trigram_tag in trigram_tags:
        q_values[trigram_tag] = math.log(trigram_tags_count[trigram_tag] / float(bigram_tags_count[trigram_tag[:2]]), 2)

    return q_values
Example #27
def score(ngram_p, n, data):
    scores = []
    if n == 1:
        for sentence in data:
            line_score = 0
            sentence += "STOP "
            unigram_tokens = nltk.word_tokenize(sentence)
            for token in unigram_tokens:
                line_score += ngram_p[(token,)]
            scores.append(line_score)
    elif n == 2:
        for sentence in data:
            line_score = 0
            sentence = "* " + sentence + "STOP "
            bigram_tuples = tuple(nltk.bigrams(nltk.word_tokenize(sentence)))
            for bigram in bigram_tuples:
                line_score += ngram_p[bigram]
            scores.append(line_score)
    elif n == 3:
        for sentence in data:
            line_score = 0
            sentence = "* * " + sentence + "STOP "
            trigra_tuples = tuple(nltk.trigrams(nltk.word_tokenize(sentence)))
            for trigram in trigra_tuples:
                line_score += ngram_p[trigram]
            scores.append(line_score)
    return scores
Example #28
def demo_findPOSpattern(words_tagged, num=20):
  print "List the most {0} ambiguous words ...".format(num)
  i = 0
  data = nltk.ConditionalFreqDist(words_tagged)
  for word in data.conditions(): 
    if len(data[word]) > 3:
      i += 1
      tags = data[word].keys()
      print word.encode('big5'), "=>", ', '.join(tags)
      if i >= num: break
  while True:
    inp = raw_input("Enter a 3-frame pattern (example:'把 N V', 0 to exit): ")
    if inp == '0': break
    inp = inp.decode('big5')
    P = inp.split(' ')
    for (w1,t1), (w2,t2), (w3,t3) in nltk.trigrams(words_tagged):
      W = (w1, w2, w3); T = (t1, t2, t3); 
      flag = 0
      for i in range(len(W)):
        if len(P[i]) == 0: break # if no input pattern then show dialog again
        if ord(P[i]) < 128: # an English tag name 
          if T[i].startswith(P[i]): flag += 1
        elif W[i] == P[i]: flag += 1
      if flag == len(W):
        print ', '.join(W)
Example #29
def get_trigrams(sentence, stopwords, porter):
  words = nltk.word_tokenize(sentence)
  words = [word.lower() for word in words]
  words = [normalize_numeric(word) for word in words]
  words = [normalize_stopword(word, stopwords) for word in words]
  words = [porter.stem(word) for word in words]
  return nltk.trigrams(words)
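# The normalize_numeric and normalize_stopword helpers are not shown in this
# snippet; hypothetical versions consistent with how they are called above:
def normalize_numeric(word):
    # collapse purely numeric tokens into a single placeholder
    return "_NUM_" if word.isdigit() else word

def normalize_stopword(word, stopwords):
    # collapse stopwords into a single placeholder so trigram positions still line up
    return "_STOP_" if word in stopwords else word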
Example #30
def _count_words(path):
    print path

    word_count = defaultdict(int)

    with open(path, 'r') as f:
        tokens = nltk.word_tokenize(f.read().decode('utf-8').lower())

    word_counts = nltk.FreqDist(tokens)

    for word, count in word_counts.items():
        word_count[word] = count 
    
    bigrams = nltk.bigrams(tokens)
    bigram_counts = nltk.FreqDist(bigrams)

    for bigram, count in bigram_counts.items():
        word_count['%s %s' % bigram] = count

    trigrams = nltk.trigrams(tokens)
    trigram_counts = nltk.FreqDist(trigrams)

    for trigram, count in trigram_counts.items():
        word_count['%s %s %s' % trigram] = count

    filename = path.split('/')[2]
    count_date = '%s-%s-%s' % (filename.split('-')[0], filename.split('-')[1], filename.split('-')[2])

    with open('data/text/counts/%s.json' % count_date, 'w') as f:
        json.dump({ 'words': word_count }, f)
Example #31
def getTrainingAndTestData(tweets, K, k, method, feature_set):

    add_ngram_feat = feature_set.get('ngram', 1)
    add_negtn_feat = feature_set.get('negtn', False)

    from functools import wraps
    import preprocessing

    procTweets = [ (preprocessing.processAll(text, subject=subj, query=quer), sent)    \
                        for (text, sent, subj, quer) in tweets]

    stemmer = nltk.stem.PorterStemmer()

    all_tweets = []  #DATADICT: all_tweets =   [ (words, sentiment), ... ]
    for (text, sentiment) in procTweets:
        words = [word if(word[0:2]=='__') else word.lower() \
                    for word in text.split() \
                    if len(word) >= 3]
        words = [stemmer.stem(w)
                 for w in words]  #DATADICT: words = [ 'word1', 'word2', ... ]
        all_tweets.append((words, sentiment))

    # train_tweets = all_tweets[:int(len(all_tweets)*ratio)]      #DATADICT: train_tweets = [ (words, sentiment), ... ]
    # test_tweets  = all_tweets[int(len(all_tweets)*ratio):]      #DATADICT: test_tweets  = [ (words, sentiment), ... ]
    train_tweets = [x for i, x in enumerate(all_tweets) if i % K != k]
    test_tweets = [x for i, x in enumerate(all_tweets) if i % K == k]

    unigrams_fd = nltk.FreqDist()
    if add_ngram_feat > 1:
        n_grams_fd = nltk.FreqDist()

    for (words, sentiment) in train_tweets:
        words_uni = words
        unigrams_fd.update(words)

        if add_ngram_feat >= 2:
            words_bi = [','.join(map(str, bg)) for bg in nltk.bigrams(words)]
            n_grams_fd.update(words_bi)

        if add_ngram_feat >= 3:
            words_tri = [','.join(map(str, tg)) for tg in nltk.trigrams(words)]
            n_grams_fd.update(words_tri)

    sys.stderr.write('\nlen( unigrams ) = ' + str(len(unigrams_fd.keys())))

    #unigrams_sorted = nltk.FreqDist(unigrams).keys()
    unigrams_sorted = unigrams_fd.keys()
    #bigrams_sorted = nltk.FreqDist(bigrams).keys()
    #trigrams_sorted = nltk.FreqDist(trigrams).keys()
    if add_ngram_feat > 1:
        sys.stderr.write('\nlen( n_grams ) = ' + str(len(n_grams_fd)))
        ngrams_sorted = [k for (k, v) in n_grams_fd.items() if v > 1]
        sys.stderr.write('\nlen( ngrams_sorted ) = ' + str(len(ngrams_sorted)))

    def get_word_features(words):
        bag = {}
        words_uni = ['has(%s)' % ug for ug in words]

        if (add_ngram_feat >= 2):
            words_bi = [
                'has(%s)' % ','.join(map(str, bg))
                for bg in nltk.bigrams(words)
            ]
        else:
            words_bi = []

        if (add_ngram_feat >= 3):
            words_tri = [
                'has(%s)' % ','.join(map(str, tg))
                for tg in nltk.trigrams(words)
            ]
        else:
            words_tri = []

        for f in words_uni + words_bi + words_tri:
            bag[f] = 1

        #bag = collections.Counter(words_uni+words_bi+words_tri)
        return bag

    negtn_regex = re.compile(
        r"""(?:
        ^(?:never|no|nothing|nowhere|noone|none|not|
            havent|hasnt|hadnt|cant|couldnt|shouldnt|
            wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint
        )$
    )
    |
    n't
    """, re.X)

    def get_negation_features(words):
        INF = 0.0
        negtn = [bool(negtn_regex.search(w)) for w in words]

        left = [0.0] * len(words)
        prev = 0.0
        for i in range(0, len(words)):
            if (negtn[i]):
                prev = 1.0
            left[i] = prev
            prev = max(0.0, prev - 0.1)

        right = [0.0] * len(words)
        prev = 0.0
        for i in reversed(range(0, len(words))):
            if (negtn[i]):
                prev = 1.0
            right[i] = prev
            prev = max(0.0, prev - 0.1)

        return dict(
            zip(['neg_l(' + w + ')'
                 for w in words] + ['neg_r(' + w + ')' for w in words],
                left + right))

    def counter(
        func
    ):  #http://stackoverflow.com/questions/13512391/to-count-no-times-a-function-is-called
        @wraps(func)
        def tmp(*args, **kwargs):
            tmp.count += 1
            return func(*args, **kwargs)

        tmp.count = 0
        return tmp

    @counter  #http://stackoverflow.com/questions/13512391/to-count-no-times-a-function-is-called
    def extract_features(words):
        features = {}

        word_features = get_word_features(words)
        features.update(word_features)

        if add_negtn_feat:
            negation_features = get_negation_features(words)
            features.update(negation_features)

        sys.stderr.write('\rfeatures extracted for ' +
                         str(extract_features.count) + ' tweets')
        return features

    extract_features.count = 0

    if ('1step' == method):
        # Apply NLTK's Lazy Map
        v_train = nltk.classify.apply_features(extract_features, train_tweets)
        v_test = nltk.classify.apply_features(extract_features, test_tweets)
        return (v_train, v_test)

    elif ('2step' == method):
        isObj = lambda sent: sent in ['neg', 'pos']
        makeObj = lambda sent: 'obj' if isObj(sent) else sent

        train_tweets_obj = [(words, makeObj(sent))
                            for (words, sent) in train_tweets]
        test_tweets_obj = [(words, makeObj(sent))
                           for (words, sent) in test_tweets]

        train_tweets_sen = [(words, sent) for (words, sent) in train_tweets
                            if isObj(sent)]
        test_tweets_sen = [(words, sent) for (words, sent) in test_tweets
                           if isObj(sent)]

        v_train_obj = nltk.classify.apply_features(extract_features,
                                                   train_tweets_obj)
        v_train_sen = nltk.classify.apply_features(extract_features,
                                                   train_tweets_sen)
        v_test_obj = nltk.classify.apply_features(extract_features,
                                                  test_tweets_obj)
        v_test_sen = nltk.classify.apply_features(extract_features,
                                                  test_tweets_sen)

        test_truth = [sent for (words, sent) in test_tweets]

        return (v_train_obj, v_train_sen, v_test_obj, v_test_sen, test_truth)

    else:
        return nltk.classify.apply_features(extract_features, all_tweets)
Example #32
def n_gram_creator(tokens,
                   top_n=20,
                   n=2,
                   freq_filter=None,
                   window_size=None,
                   counts=False,
                   show_freq=True,
                   show_pmi=False,
                   keep=None):
    # Helper function creating [2-4]grams with a variety of options

    import nltk.collocations as colloc
    from nltk import bigrams, trigrams

    ## Check if n-gram is supported
    if n in [2, 3, 4]:

        ## Allowing for non-contiguous ngram creation
        if isinstance(window_size, int):
            window = window_size
        else:
            window = n

        ## Bigram setup
        if n == 2:
            word = 'Bi'

            if counts:
                ngrams = bigrams(tokens)
                return ngrams
            else:
                ngram_measures = colloc.BigramAssocMeasures()
                ngram_finder = colloc.BigramCollocationFinder.from_words(
                    tokens, window_size=window)

        ## Trigram setup
        elif n == 3:
            word = 'Tri'

            if counts:
                ngrams = trigrams(tokens)
                return ngrams
            else:
                ngram_measures = colloc.TrigramAssocMeasures()
                ngram_finder = colloc.TrigramCollocationFinder.from_words(
                    tokens, window_size=window)

        ## Quadgram setup
        elif n == 4:
            word = 'Quad'
            ngram_measures = colloc.QuadgramAssocMeasures()
            ngram_finder = colloc.QuadgramCollocationFinder.from_words(
                tokens, window_size=window)

        ## Applying frequency filter to results if selected for
        if isinstance(freq_filter, int):
            ngram_finder.apply_freq_filter(freq_filter)

        ## Create ngram scores
        ngram_score = ngram_finder.score_ngrams(ngram_measures.raw_freq)
        ngram_pmi_score = ngram_finder.score_ngrams(ngram_measures.pmi)

        ## Optional display
        if show_freq:
            print(f'Top {top_n} {word}-grams by frequency')
            display(ngram_score[:top_n])

        ## Optional display
        if show_pmi:
            print(f'PMI score for {top_n} {word}-grams')
            display(ngram_pmi_score[:top_n])

        ## Optional return
        if keep == 'score':
            return ngram_score
        elif keep == 'pmi':
            return ngram_pmi_score

    ## Messaging for non-supported ngrams
    else:
        return f"{n}-grams are not supported. Try 2, 3, or 4."
Example #33
longest = ''
for word in text:
    if len(word) > len(longest):
        longest = word
print('longest word: {}'.format(longest))

# The equivalent code below, using two list comprehensions,
# finds all of the longest words
maxlen = max(len(word) for word in text)
print([word for word in text if len(word) == maxlen])

# 4.3.3 Legitimate uses for counters
# Use a loop variable to extract consecutive overlapping 3-grams from a list
n = 3
sent = ['The', 'dog', 'gave', 'John', 'the', 'newspaper']
print("3-grams= ", [sent[i:i + n] for i in range(len(sent) - n + 1)])
# Below is the equivalent code
print("3-grams= ", list(nltk.trigrams(sent)))
# Below are the 2-grams
print("2-grams= ", list(nltk.bigrams(sent)))
# Below are the 4-grams
print("4-grams= ", list(nltk.ngrams(sent, 4)))

import pprint

# Use loop variables to build a multidimensional structure
# with nested list comprehensions
m, n = 3, 7
array = [[set() for i in range(n)] for j in range(m)]
array[2][5].add('Alice')
pprint.pprint(array)

# List multiplication, by contrast, is affected by object copying (aliasing)
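# The aliasing pitfall the comment above refers to, in a short sketch:
aliased = [[set()] * 7] * 3           # 3 references to one inner list of 7 references to one set
aliased[2][5].add('Alice')
print(aliased[0][5])                   # {'Alice'} shows up in every row and column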
def all_trigram_withcount(mylist):  # Use Counter to sort all trigram.
    # trigram_withcount = Counter(list(trigrams(mylist))).most_common()
    trigram_withcount = FreqDist(list(trigrams(mylist))).most_common()
    return trigram_withcount
Example #35
def respond(message, slack_words):
    string_trigrams = trigrams(slack_words.lower().split(' '))

    for tri in string_trigrams:
        if ' '.join(tri).lower() in all_trigrams:
            message.reply(markov.generate_markov_text_with_words(tri[0], tri[1]))
Example #36


brown_sents_train = []
for sent3 in brown_sent_train:
    sent3 = list(filter(lambda a: a not in ("``", "''", "--", ".", ",", "!",";","(",")","?",":"), sent3))
    sent3 = [x.lower() for x in sent3]
    sent3 = ['<unk>' if x not in brown_unigram_dict_train.keys() else x for x in sent3]
    brown_sents_train.append(sent3)

# list of sentences (as lists)
elist = []
elist_10 = []
for sent in brown_sents_train:
    elist.append(list(bigrams(sent, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>')))
    elist_10.append(list(trigrams(sent, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>')))

# list of tuples containing bigrams, trigrams
elist_2 = []
elist_12 = []
for l in elist:
    for t in l:
        elist_2.append(t)
for l in elist_10:
    for t in l:
        elist_12.append(t)
elist_2 += [('<s>', '<s>'), ('</s>', '</s>')]*len(brown_sents_train)
brown_bigram_dict_train = FreqDist(elist_2)
brown_trigram_dict_train = FreqDist(elist_12)

from nltk.corpus import reuters
from nltk import bigrams, trigrams
from collections import Counter, defaultdict

model = defaultdict(lambda: defaultdict(lambda: 0))

# Count frequency of co-occurrence
for sentence in reuters.sents():
    for w1, w2, w3 in trigrams(sentence, pad_right=True, pad_left=True):
        model[(w1, w2)][w3] += 1

# Let's transform the counts to probabilities
for w1_w2 in model:
    total_count = float(sum(model[w1_w2].values()))
    for w3 in model[w1_w2]:
        model[w1_w2][w3] /= total_count

print(dict(model['today', 'the']))
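# A common follow-up, shown here as an illustrative sketch (not part of the
# snippet above): greedily generate text from the trigram model.
state = (None, None)                   # trigrams() pads sentence starts with None
generated = []
for _ in range(15):
    candidates = model.get(state, {})
    if not candidates:
        break
    next_word = max(candidates, key=candidates.get)   # most probable continuation
    if next_word is None:                              # None marks the sentence end
        break
    generated.append(next_word)
    state = (state[1], next_word)
print(' '.join(generated))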
Example #38
#%% Test with English sentences

import nltk
import langdetect

# Test package
langdetect.detect('Hello,How are you?')
# copy a test dataset
data = data_valid.copy()
data.sent = data.sent.str.lower()  # lowercase
tokenizor = nltk.tokenize.RegexpTokenizer(
    "[a-zA-Z'`éèî]+")  # this re need to be edited
token = list()
trigram = list()
for i in range(len(data)):
    token.append(tokenizor.tokenize(data['sent'][i + 1]))
    trigram.append(nltk.trigrams(token[i]))
fileloc = " "
fo = open(fileloc,"r+")
inp = fo.read()
print(inp)
inp = clean_text(inp)
#inp = input()
tokens = word_tokenize(inp)
for i in range(len(tokens)):
    tokens[i] = tokens[i].lower()
    tokens[i] = "|" + tokens[i] + "|"

profile= FreqDist()
for t in tokens:
    token_bigrams = bigrams(list(t))
    token_trigrams = trigrams(list(t))

    for cur_bigram in token_bigrams:
        cur_bigram = "".join(cur_bigram)
        if cur_bigram in profile:
            profile[cur_bigram] += 1
        else:
            profile[cur_bigram] = 1

    for cur_trigram in token_trigrams:
        cur_trigram = "".join(cur_trigram)
        if cur_trigram in profile:
            profile[cur_trigram] += 1
        else:
            profile[cur_trigram] = 1
names.reverse()
value.reverse()
val = value    # the bar lengths
pos = arange(15)+.5    # the bar centers on the y axis
pos

plt.figure(figsize=(9,9))
barh(pos,val, align='center',alpha=0.7,color='rgbcmyk')
yticks(pos, names)
xlabel('Mentions')
grid(True)


list(nltk.bigrams(tokens))

list(nltk.trigrams(tokens))


sorted(w for w in set(tokens) if w.endswith('ing'))

[w.upper() for w in tokens]

for token in tokens:
    if token.islower():
        print(token, 'is a lowercase word')
    elif token.istitle():
        print(token, 'is a titlecase word')
    else:
        print(token, 'is punctuation')

########################################################
Example #41
    return math.log(len(list_of_docs) /
            float(num_docs_containing(word, list_of_docs)))


def tf_idf(word, doc, list_of_docs):
    return (tf(word, doc) * idf(word, list_of_docs))

#Compute the frequency for each term.
vocabulary = []
docs = {}
all_tips = []
for tip in (['documment 1', 'documment 2']):
    tokens = tokenizer.tokenize(tip.text)

    bi_tokens = bigrams(tokens)
    tri_tokens = trigrams(tokens)
    tokens = [token.lower() for token in tokens if len(token) > 2]
    tokens = [token for token in tokens if token not in stopwords]

    bi_tokens = [' '.join(token).lower() for token in bi_tokens]
    bi_tokens = [token for token in bi_tokens if token not in stopwords]

    tri_tokens = [' '.join(token).lower() for token in tri_tokens]
    tri_tokens = [token for token in tri_tokens if token not in stopwords]

    final_tokens = []
    final_tokens.extend(tokens)
    final_tokens.extend(bi_tokens)
    final_tokens.extend(tri_tokens)
    docs[tip] = {'freq': {}, 'tf': {}, 'idf': {},
                        'tf-idf': {}, 'tokens': []}
#compute word frequency in text
fdist = FreqDist(words_nltk)
# get the most two common words in the text
fdist.most_common(2)
"""
stop_words = set(stopwords.words("english"))

filtered_words=[]
for w in words_nltk:
    if w not in stop_words:
        filtered_words.append(w)
"""

#generate trigram
trigram_nltk = [t for t in trigrams(words_nltk)]
#trigram_nltk = [t for t in trigrams(filtered_words)]

print(
    "Note: We are running now NLTK Option 2! For text generation we'll use text generating function from Option 1 in the program."
)

print(
    "Please enter an integer as the number of sentences you want to generate, preferably between {} and {}:>>"
    .format(len(trigram_nltk) // 10, 2 * (len(trigram_nltk) // 10)))
number_sentences = input()
print(
    "Please enter an integer as the lenght of the sentence in the generated text, preferably bigger than 2 and smaller than {}:>>"
    .format(len(trigram_nltk) // 10))
sentence_lenght = input()
Example #43
def process(sentence):
    for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(sentence):
        if t1.startswith('V') and t2 == 'TO' and t3.startswith('V'):
            print(w1, w2, w3)
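# Usage sketch (assumes nltk and the Brown corpus are available, e.g. after
# nltk.download('brown')): run process() over some tagged sentences.
from nltk.corpus import brown

for tagged_sent in brown.tagged_sents(categories='news')[:100]:
    process(tagged_sent)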
def summaryGen(fileName,domain,gram=5,debug=False):
	if os.path.exists("../datasets/"+domain.lower()+".pickle"):
		stopwords = nltk.corpus.stopwords.words()
		tokenizer = RegexpTokenizer("[\w']+", flags=re.UNICODE)

		def freq(word, doc):
		    return doc.count(word)


		def word_count(doc):
		    return len(doc)


		def tf(word, doc):
		    return (freq(word, doc) / float(word_count(doc)))


		def num_docs_containing(word, list_of_docs):
		    count = 0
		    for document in list_of_docs:
			if freq(word, document) > 0:
			    count += 1
		    return 1 + count


		def idf(word, list_of_docs):
		    return math.log(len(list_of_docs) /
			    float(num_docs_containing(word, list_of_docs)))


		def tf_idf(word, doc, list_of_docs):
		    return (tf(word, doc) * idf(word, list_of_docs))

		#Compute the frequency for each term.
		vocabulary = []
		docs = {}
		all_tips = []
		text = "" 
		brands_reviews = pickle.load( open( "../datasets/"+domain.lower()+".pickle", "rb" ) )
		review_data = brands_reviews[fileName]
		for i in review_data:
			text+=i["review"]

		tokens = tokenizer.tokenize(text)

		bi_tokens = bigrams(tokens)
		tri_tokens = trigrams(tokens)
		n_tokens = ngrams(tokens, gram)
		tokens = [token.lower() for token in tokens if len(token) > 2]
		tokens = [token for token in tokens if token not in stopwords]

		bi_tokens = [' '.join(token).lower() for token in bi_tokens]
		bi_tokens = [token for token in bi_tokens if token not in stopwords]

		tri_tokens = [' '.join(token).lower() for token in tri_tokens]
		tri_tokens = [token for token in tri_tokens if token not in stopwords]

		n_tokens = [' '.join(token).lower() for token in n_tokens]
		n_tokens = [token for token in n_tokens if token not in stopwords]

		final_tokens = []
		final_tokens.extend(tokens)
		final_tokens.extend(bi_tokens)
		final_tokens.extend(tri_tokens)
		final_tokens.extend(n_tokens)
		docs[0] = {'freq': {}, 'tf': {}, 'idf': {},
				'tf-idf': {}, 'tokens': []}

		for token in final_tokens:
			#The frequency computed for each tip
			docs[0]['freq'][token] = freq(token, final_tokens)
			#The term-frequency (Normalized Frequency)
			docs[0]['tf'][token] = tf(token, final_tokens)
			docs[0]['tokens'] = final_tokens

		vocabulary.append(final_tokens)

		for doc in docs:
		    for token in docs[doc]['tf']:
			#The Inverse-Document-Frequency
			docs[doc]['idf'][token] = idf(token, vocabulary)
			#The tf-idf
			docs[doc]['tf-idf'][token] = tf_idf(token, docs[doc]['tokens'], vocabulary)

		#Now let's find out the most relevant words by tf-idf.
		words = {}
		for doc in docs:
		    for token in docs[doc]['tf-idf']:
			if token not in words:
			    words[token] = docs[doc]['tf-idf'][token]
			else:
			    if docs[doc]['tf-idf'][token] > words[token]:
				words[token] = docs[doc]['tf-idf'][token]

		review_keywords = sorted(words.items(), key=lambda x: x[1], reverse=True)
		if debug:	
			print "After tokenization...."
			sleep(1)
			print final_tokens
			sleep(1)
			print "After frequency computation...."
			sleep(1)
			print docs[0]['freq']
			sleep(1)
			print "After term frequency computation...."
			sleep(1)
			print docs[0]['tf']
			sleep(1)
			print "After Inverse-Document-Frequency computation...."
			sleep(1)
			print docs[0]['tf-idf']
			sleep(1)
			print "After term-frequency Inverse-Document-Frequency computation...."
			sleep(1)
			print docs[0]['tf-idf']
		sleep(1)
		print "Scores....."
		for i in review_keywords:
			print i[0],"-",i[1]
		return [i[0] for i in review_keywords]
	else:
		print "Domain not in dataset"
		return "Domain not in dataset"
Example #45
len(single)
single

# Frequency distribution of the words
tokens.count('gluten')
fd = nltk.FreqDist(tokens)
fd.most_common(50)
fd.plot(50)

# How long are the words?
fd_wlen = nltk.FreqDist([len(w) for w in unique])
fd_wlen

# What about bigrams and trigrams?
bigr = nltk.bigrams(tokens[:10])
trigr = nltk.trigrams(tokens[:10])
tokens[:10]
list(bigr)
list(trigr)

# Back to text preprocessing: remove punctuations
tokens_nop = [t for t in tokens if t not in string.punctuation]
print(tokens[:50])
print(tokens_nop[:50])
len(tokens)
len(tokens_nop)
len(set(tokens_nop))

# Convert all characters to Lower case
tokens_lower = [t.lower() for t in tokens_nop]
print(tokens_lower[:50])
Example #46
def all_trigrams(my_list):
    all_trigrams = list(nltk.trigrams(my_list))
    return all_trigrams
Example #47
def data_cleaning():
    # have all the variables populated which are required below
    new_list_job_description = []
    lemmatized_description = []

    # Getting the all teams job posting file from Amazon S3
    aws_id = '***'
    aws_secret = '***'

    s3 = boto3.client('s3',
                      aws_access_key_id=aws_id,
                      aws_secret_access_key=aws_secret)
    obj = s3.get_object(Bucket='data-science-team1', Key='all_job_posting.csv')
    data = obj['Body'].read()
    all_job_data = pd.read_csv(io.BytesIO(data), encoding='ISO-8859-1')

    print('All job posting file read successful from amazon S3')

    # Copy the read csv into a new data frame
    all_job_data_copy = all_job_data
    all_job_data_copy.reset_index(drop=True, inplace=True)

    # Description column of the data frame is converted into a list
    list_job_description = all_job_data['Description'].tolist()

    # Removing special characters from the list and converting it to lowercase
    for i in list_job_description:
        a = re.sub('[^A-Za-z]+', ' ', str(i))
        a = a.lower()
        new_list_job_description.append(a)

    # Removing stop words
    new_list_job_description = [
        word for word in new_list_job_description
        if word not in stopwords.words('english')
    ]

    # Adding SPLITHEREAFTERLEMMATIZATION at the end of each column
    combined_description_data = " SPLITHEREAFTERLEMMATIZATION ".join(
        new_list_job_description)

    # lemmatizing words
    lmtzr = WordNetLemmatizer()
    a = combined_description_data.split(' ')
    for i in a:
        word_after_lammatize = lmtzr.lemmatize(i)
        lemmatized_description.append(word_after_lammatize)

    print('Job Posting lemmatization successful')

    lemmatized_description_join = " ".join(lemmatized_description)
    description_df = pd.DataFrame({
        "Job Description":
        lemmatized_description_join.split('SPLITHEREAFTERLEMMATIZATION')
    })

    descr_lemmatizes_data_frame = all_job_data_copy.join(description_df)
    descr_lemmatizes_data_frame = descr_lemmatizes_data_frame.drop(
        columns="Description")
    descr_lemmatizes_data_frame = descr_lemmatizes_data_frame.dropna()

    descr_lemmatizes_data_frame.to_csv("descrlemmatizesdataframeclean.csv")
    all_job_data['Description'] = new_list_job_description
    job_description_list_descr_lemmatizes_data_frame = descr_lemmatizes_data_frame[
        'Job Description'].tolist()

    # df = pd.read_excel("Words_for_Clustering - copy.xlsx")
    # df = df.dropna()

    # Converting the 100 words list to  lowercase, removing special characters, lammetizing
    s3 = boto3.client('s3',
                      aws_access_key_id=aws_id,
                      aws_secret_access_key=aws_secret)
    obj = s3.get_object(Bucket='data-science-team1',
                        Key='Final100Keywords.xlsx')
    data = obj['Body'].read()
    word_list = pd.read_excel(io.BytesIO(data), encoding='ISO-8859-1')

    print('100 keywords file read successful from amazon S3')

    # lemmatized_clean_word_list: List of clean word list
    clean_word_list = []
    for key, i in word_list['Keywords'].iteritems():
        a = re.sub('[^A-Za-z]+', ' ', str(i))
        a = a.lower()
        clean_word_list.append(a)
    lemmatizer = WordNetLemmatizer()
    lemmatized_clean_word_list = []
    for word in clean_word_list:
        a = []
        for each in word.split(" "):
            a1 = lemmatizer.lemmatize(each)
            a.append(a1.lower())
        a = " ".join(a)
        lemmatized_clean_word_list.append(a)
    # print(lemmatized_clean_word_list)

    # Counting the numbers of words in each description : one-gram, bi-gram, tri-grams
    list_of_counts = []
    for list_ in job_description_list_descr_lemmatizes_data_frame:
        matched_words = {}
        words = nltk.word_tokenize(list_)
        words_set = set(words)
        bi_grams = nltk.bigrams(words)
        trigr = nltk.trigrams(words)

        bi_grams_pairs = [' '.join(pair) for pair in bi_grams]

        bi_grams_pairs_set = set(bi_grams_pairs)
        trigram_pairs = [' '.join(each) for each in trigr]
        trigram_pairs_set = set(trigram_pairs)
        # count = 0

        matched_words.update({
            word: words.count(word)
            for word in words_set if word in lemmatized_clean_word_list
        })
        matched_words.update({
            bi: bi_grams_pairs.count(bi)
            for bi in bi_grams_pairs_set if bi in lemmatized_clean_word_list
        })
        matched_words.update({
            tri: trigram_pairs.count(tri)
            for tri in trigram_pairs_set if tri in lemmatized_clean_word_list
        })
        list_of_counts.append(matched_words)

    df_with_required_count = pd.DataFrame(list_of_counts)

    # with_count_data_frame: Final data frame with jobs and their count
    with_count_data_frame = descr_lemmatizes_data_frame.join(
        df_with_required_count)
    with_count_data_frame = with_count_data_frame.fillna(0)

    # appending all the remaining rows with the genearted dataframe

    remainingwords = []
    duplicatedf = with_count_data_frame.loc[:, "access":"wealth management"]
    duplicatedf = duplicatedf.columns.values.tolist()
    word_list = word_list["Keywords"].tolist()
    for each in word_list:
        if each not in duplicatedf:
            remainingwords.append(each)
    for i in remainingwords:
        with_count_data_frame[i] = float(0)

    # with_count_data_frame.to_csv(r'A:\2nd Semester\Data Science\Assignment 2\Anurag\FINAL\word_count.csv')

    print('Job posting Word Count file successful')
    # return with_count_data_frame

    with_count_data_frame.to_csv("job_posting_with_count.csv")
    client = boto3.client('s3',
                          aws_access_key_id='***',
                          aws_secret_access_key='***')
    transfer = S3Transfer(client)
    transfer.upload_file('job_posting_with_count.csv', 'data-science-team1',
                         'job_posting_with_count.csv')
    # transfer.upload_file('with_count_data_frame.csv', 'data-science-team1', 'with_count_data_frame' + "/" + 'with_count_data_frame')
    print('Job posting file transferred to S3 successfully')
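    # A minimal toy sketch (not part of the pipeline above; names below are made up)
    # of the matching idea: a keyword can be a unigram, bigram, or trigram, so the
    # text is expanded into all three n-gram lists before counting matches.
    demo_keywords = ["python", "machine learning", "natural language processing"]
    demo_text = "python and machine learning and more machine learning"
    demo_tokens = nltk.word_tokenize(demo_text)
    demo_grams = (demo_tokens
                  + [' '.join(g) for g in nltk.bigrams(demo_tokens)]
                  + [' '.join(g) for g in nltk.trigrams(demo_tokens)])
    demo_counts = {kw: demo_grams.count(kw)
                   for kw in demo_keywords if kw in demo_grams}
    print(demo_counts)  # expected: {'python': 1, 'machine learning': 2}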
Beispiel #48
0
def to_trigrams(words):
    for trigram in nltk.trigrams(words, pad_left=True, pad_right=True):
        if trigram != (None, None, None):
            yield trigram
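# A small usage sketch (assumed) for to_trigrams: with padding enabled, short
# inputs yield None-padded trigrams, while the all-None windows produced by an
# empty input are filtered out.
print(list(to_trigrams(['a', 'b'])))
# -> [(None, None, 'a'), (None, 'a', 'b'), ('a', 'b', None), ('b', None, None)]
print(list(to_trigrams([])))  # -> []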
Beispiel #49
0
def common_strings(start, end):
    CS_THRESHOLD = 6
    sep = "tvlwz"

    tokens = string_range_tokenize(start, end, sep)

    # make a copy since we're going to edit it (list() gives a real copy, not an alias)
    u_tokens = list(tokens)
    c = 0
    while (c < len(u_tokens)):
        if u_tokens[c] == sep:
            del u_tokens[c]
        else:
            c += 1

    print("common_strings tokens:")
    print(tokens)

    if len(u_tokens) < CS_THRESHOLD:
        #print "%08x - %08x : %s" % (start,end,"no string")
        return ("", 0)

    f = nltk.FreqDist(u_tokens)
    u_gram = f.most_common(1)[0][0]
    u_gram_score = f.most_common(1)[0][1]

    #print "Tokens:"
    #print tokens
    #print len(tokens)

    bgs = list(nltk.bigrams(tokens))
    c = 0
    while (c < len(bgs)):
        if sep in bgs[c]:
            del bgs[c]
        else:
            c += 1

    #print "Bigrams:"
    #print bgs
    if (len(bgs) != 0):
        fs = nltk.FreqDist(bgs)
        b_gram = fs.most_common(1)[0][0]
        #print "Most Common:"
        #print b_gram
        b_str = b_gram[0] + "_" + b_gram[1]
        b_gram_score = fs.most_common(1)[0][1]
    else:
        b_str = ""
        b_gram_score = 0

    tgs = list(nltk.trigrams(tokens))
    c = 0
    while (c < len(tgs)):
        if sep in tgs[c]:
            del tgs[c]
        else:
            c += 1
    #print "Trigrams:"
    #print tgs
    if (len(tgs) != 0):
        ft = nltk.FreqDist(tgs)
        t_gram = ft.most_common(1)[0][0]
        t_str = t_gram[0] + "_" + t_gram[1] + "_" + t_gram[2]
        t_gram_score = ft.most_common(1)[0][1]
    else:
        t_str = ""
        t_gram_score = 0

    #print "1: %s - %d 2: %s - %d 3: %s - %d\n" % (u_gram,u_gram_score,b_str,b_gram_score,t_str,t_gram_score)

    if (b_gram_score * 2 >= u_gram_score):
        if (t_gram_score * 2 >= b_gram_score):
            ret = t_str
            ret_s = t_gram_score
        else:
            ret = b_str
            ret_s = b_gram_score
    else:
        ret = u_gram
        ret_s = u_gram_score

    #print "%08x - %08x : %s" % (start,end,ret)

    return (ret, ret_s)
Beispiel #50
0
def calc_trigrams(brown_tags):
    q_values = {}

    #John's edit starts here
    #should be very simple from the other file

    unigram_c = {}
    bigram_c = {}
    trigram_c = {}

    unigram_p = {}
    bigram_p = {}

    sentence_count = 0
    unigram_count = 0
    bigram_count = 0
    trigram_count = 0

    # John's edit starts here
    # First need to find the count of all the tuples, and put them in an outer dictionary
    test_me = 1
    for sentence in brown_tags:
        # passing this to helper function with the stripped version of each sentence
        #unigram_tuples, bigram_tuples, trigram_tuples = sentence_split(sentence.strip())

        trigram_tuples = list(nltk.trigrams(sentence))
        # if test_me:
        #     print trigram_tuples
        #     test_me = 0

        #sentence.pop(0) #remove first start symbol
        new_sent = list(sentence[1:])
        bigram_tuples = list(nltk.bigrams(new_sent))

        newnew_sent = list(new_sent[1:])  #remove other start symbol
        unigram_tuples = list(newnew_sent)

        for phrase in unigram_tuples:
            if (phrase, ) in unigram_c:
                unigram_c[(phrase, )] += 1
            else:
                unigram_c[(phrase, )] = 1

            unigram_count += 1

        for phrase in bigram_tuples:
            if phrase in bigram_c:
                bigram_c[phrase] += 1
            else:
                bigram_c[phrase] = 1

            bigram_count += 1

        for phrase in trigram_tuples:
            if phrase in trigram_c:
                trigram_c[phrase] += 1
            else:
                trigram_c[phrase] = 1

            trigram_count += 1

        sentence_count += 1  # keeps track of how many sentences there are

    # now that we have all the data, we now need to convert counts into probabilities
    for one_word in unigram_c:
        current_count = unigram_c[one_word]
        #print unigram_count
        #if one_word[0] == "captain":
        #print "captain", current_count
        unigram_p[one_word] = math.log(
            float(current_count) / float(unigram_count), 2)

    for two_words in bigram_c:
        count_both_words = bigram_c[two_words]
        if (two_words[0] == '*'):
            count_word_one = sentence_count
        else:
            count_word_one = unigram_c[(two_words[0], )]

        #count_word_two = unigram_c[(two_words[1],)]
        bigram_p[two_words] = math.log(
            float(count_both_words) / float(count_word_one), 2)

    testing_this = 1
    for three_words in trigram_c:
        count_three_words = trigram_c[three_words]

        prev_bigram = tuple([three_words[0], three_words[1]])
        #count_prev_bigram = bigram_c[prev_bigram]
        #above was previous code

        if (prev_bigram[0] == '*' and prev_bigram[1] == '*'):
            count_prev_bigram = sentence_count
        else:
            count_prev_bigram = bigram_c[prev_bigram]

        q_values[three_words] = math.log(
            float(count_three_words) / float(count_prev_bigram), 2)

    #professor provided return value here
    return q_values
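# A toy usage sketch (assumed input format; math and nltk are assumed imported
# as in the snippet above): each item is a sentence's tag sequence starting with
# two '*' start symbols and ending with 'STOP', matching the special handling of
# '*' in the function above.
toy_tags = [['*', '*', 'DET', 'NOUN', 'VERB', 'STOP'],
            ['*', '*', 'DET', 'NOUN', 'STOP']]
q = calc_trigrams(toy_tags)
print(q[('*', '*', 'DET')])        # log2(2/2) = 0.0
print(q[('DET', 'NOUN', 'VERB')])  # log2(1/2) = -1.0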
Beispiel #51
0
def triGrams(words):
    tGrams = []
    for item in nltk.trigrams(words):
        tGrams.append(' '.join(item))
    return tGrams
    POS = scores.get('pos')  # restored: POS is used below together with NEG and NEU
    NEG = scores.get('neg')
    NEU = scores.get('neu')
    RES = str()

    if POS > NEG:
        RES = 'Positive'
    elif NEG > POS:
        RES = 'Negative'
    elif NEU >= 0.5 or POS > NEU:
        RES = 'Positive'
    elif NEU < 0.5:
        RES = 'Negative'

    # -------------------------------------------------------- PATTERN ADVERB, ADVERB, ADJECTIVE (Down)
    tri_pairs = list()
    for (w1, tag1), (w2, tag2), (w3, tag3) in nltk.trigrams(PoS_TAGS):
        if tag1.startswith("RB") and tag2.startswith("RB") and tag3.startswith(
                "JJ"):
            tri_pairs.append((w1, w2, w3))
            if w1 in D or w2 in D or w3 in D:  # test each word; 'x or y or z in D' does not do this
                print("[True]: Tri Pairs are found in Drought Rel. Term")
                # TRIGGER AREA
                for j in range(len(F)):
                    if w1 in F[j] or w2 in F[j] or w3 in F[j]:
                        print(
                            "[True]: Tri Pairs are found in Frequent Wordset")
                        if RES == "Positive":
                            RES = "Highly Positive"
                            FW = F[j]
                            # fuzzy_df['FreqWord'].map(lambda x: next((y for y in x.split() if y in F), 'Not Found'))
                        elif RES == "Negative":
 def get_trigrams(self, doc_num):
     return nltk.trigrams(self.texts[doc_num].split())
 def __init__(self,
              rdgDir,
              general,
              working_dir='.',
              overwrite=False,
              rank_from_previous=False,
              background_cache_file='ranking.pkl',
              full_to_abbr=False):
     # available metrics
     global bck_cache_file
     bck_cache_file = background_cache_file
     self.metrics = {
         'DR': self._calDR,
         'DC': self._calDC,
         'DRDC': self._calDRDC,
         'IDF': self._calIDF,
         'TFIDF': self._calTFIDF,
         'TokenDRDC': self._calTokenDRDC,
         'TokenIDF': self._calTokenIDF,
         'Entropy': self._calEntropy,
         'KLDiv': self._calKLDiv,
         'Weighted': self._calWeighted,
         'TF': self._calTF
     }
     # used for restoring ranking
     # from previous
     self.rankingmap = {}
     # input files
     self.genDocs = Document(overwrite=overwrite)
     #for numBackDocs # updates by Y Gu 11/2018 for pkl file type compatibility
     self.genDocsNum = 0
     #filtfname = os.path.join(rdgDir, 'filter.save')
     #filtfname = os.path.join(working_dir, '.filter.save')
     # General document group is given as files in a directory
     if rank_from_previous:
         pass
     elif type(general) == type(str()):
         logging.debug('Loading general documents from ' + general)
         # gen = [Document(general+genFile) for genFile in os.listdir(general) if genFile[-4:]=='.txt']
         gen = map(
             lambda x: Document(filename=x.strip(), overwrite=overwrite),
             open(general).readlines())
         ## note that the iterator only lets us calculate this once
         ## this is OK because this is the initialization function
         ## other maps should be cast into lists
         # we only need the sum for the general class
         ## python3 compatibility change
         ## TrueTdf updates by Y Gu 6/2018 (next 2 lines + 5 lines in for loop)
         ## Updated again by Y Gu 11/2018 for type compatibility
         for iterator in gen:
             self.genDocsNum += 1
             for w in iterator.counts:
                 ## print(2,w,iterator.counts[w]) ## 57 OK
                 self.genDocs.counts[w] += iterator.counts[w]
                 self.genDocs.token_counts[
                     w] += 1  # updates by Y Gu 11/2018 for pkl file type compatibility
                 ## input('pausing')
         # for i in range(len(list(gen))):
         #     for w in gen[i].counts:
         #         self.genDocs.counts[w] += gen[i].counts[w]
     # General document group is given as a corpus
     else:
         logging.debug('Loading from general corpus...')
         # NGrams in lieu of NPs -- we are storing extra info
         words = general.words()
         logging.debug('Unigrams loading')
         bigrams = nltk.bigrams(words)
         logging.debug('Bigrams loading')
         trigrams = nltk.trigrams(words)
         logging.debug('Trigrams loading')
         #filters = ['abbreviation', 'case', 'stem']
         filters = Settings.getCorpusFilters()
         logging.debug('Filtering unigrams')
         for w in words:
             for filt in filters:
                 # if filt == 'abbreviation':
                 #     w = Filter.criteria[filt](w,full_to_abbr)
                 #     ## Somewhat of a kludge, the more general approach
                 #     ## would be to allow all filters to take multiple arguments.
                 #     ## If these get expanded, that would be the way to go.
                 # else:
                 w = Filter.criteria[filt](w)
             if w:
                 self.genDocs.counts[w] += 1
                 self.genDocs.token_counts[w] += 1
         logging.debug('Filtering bigrams')
         for gram in bigrams:
             w = ' '.join(gram)
             for filt in filters:
                 w = Filter.criteria[filt](w)
             if w:
                 self.genDocs.counts[w] += 1
         logging.debug('Filtering trigrams')
         for gram in trigrams:
             w = ' '.join(gram)
             for filt in filters:
                 w = Filter.criteria[filt](w)
             if w:
                 self.genDocs.counts[w] += 1
         logging.debug('done')
     # Related Document Group -- we need each document separately
     logging.debug('Loading RDG from ' + rdgDir + '...')
     #self.rdgDocs = [Document(rdgDir+rdgFile) for rdgFile in os.listdir(rdgDir) if rdgFile[-4:]=='.txt']
     self.rdgDocs = list(
         map(lambda x: Document(filename=x.strip(), overwrite=overwrite),
             open(rdgDir).readlines()))
     ## Python 3 compatibility -- rdgDocs needs to be a list and Python3 makes it an iterator
     logging.debug('done')
import collections

import nltk
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# `lines` is assumed to be an iterable of text strings defined earlier.
counter = collections.Counter()

for line in lines:
    for word in nltk.word_tokenize(line):
        counter[word.lower()] += 1

word2idx = {w: (i + 1) for i, (w, _) in enumerate(counter.most_common())}
idx2word = {v: k for k, v in word2idx.items()}

xs = []
ys = []

for line in lines:
    embedding = [word2idx[w.lower()] for w in nltk.word_tokenize(line)]
    triples = list(nltk.trigrams(embedding))
    w_lefts = [x[0] for x in triples]
    w_centers = [x[1] for x in triples]
    w_rights = [x[2] for x in triples]
    xs.extend(w_centers)
    ys.extend(w_lefts)
    xs.extend(w_centers)
    ys.extend(w_rights)

print(len(word2idx))

vocab_size = len(word2idx) + 1

ohe = OneHotEncoder(categories=[np.arange(vocab_size)])  # 'n_values' was removed from scikit-learn; 'categories' is the current equivalent
X = ohe.fit_transform(np.array(xs).reshape(-1, 1)).todense()
Y = ohe.fit_transform(np.array(ys).reshape(-1, 1)).todense()
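# Sanity-check sketch (assumes the variables above are in scope): decode a few
# (center, context) index pairs back into words; each trigram contributes one
# (center, left) and one (center, right) pair to xs/ys.
for center, context in list(zip(xs, ys))[:5]:
    print(idx2word[center], '->', idx2word[context])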
Beispiel #56
0
# Remove single-character tokens (mostly punctuation)
words = [word for word in words if len(word) > 1]

# turn words into their roots
words = [textblob.Word(w).lemmatize() for w in words]

# extract freq of single words
fdist = nltk.FreqDist(words)

# print for preview
for word, frequency in fdist.most_common(50):
    print(u'{};{}'.format(word, frequency))

# 2-grams and 3-grams
bgs = nltk.bigrams(words)
tgs = nltk.trigrams(words)
fdist_bgs = nltk.FreqDist(bgs)
fdist_tgs = nltk.FreqDist(tgs)

# preview
for word, frequency in fdist_bgs.most_common(50):
    print(u'{};{}'.format(word, frequency))
    
for word, frequency in fdist_tgs.most_common(50):
    print(u'{};{}'.format(word, frequency))

# sort by occurrence count
sgs_sorted = fdist.most_common()
bgs_sorted = fdist_bgs.most_common()
tgs_sorted = fdist_tgs.most_common()
Beispiel #57
0
sentimentWords = []
sentimentBigrams = []
sentimentTrigrams = []

for tweet in trainSet.keys():
    for tag in taggedTweets[tweet]:
        if tag[0] == 'Group':
            # Split the text and clean it up by removing punctuation and
            # making each word lowercase
            text = [
                w.strip('"\\.,:/!?\'()').lower()
                for w in tag[1]['text'].split()
            ]
            groupWords.extend(text)
            groupBigrams.extend(list(nltk.bigrams(text)))
            groupTrigrams.extend(list(nltk.trigrams(text)))
        elif tag[0] == 'Stereotype':
            text = [
                w.strip('"\\.,:/!?\'()').lower()
                for w in tag[1]['text'].split()
            ]
            stereotypeWords.extend(text)
            stereotypeBigrams.extend(list(nltk.bigrams(text)))
            stereotypeTrigrams.extend(list(nltk.trigrams(text)))
        elif tag[0] == 'Sentiment':
            text = [
                w.strip('"\\.,:/!?\'()').lower()
                for w in tag[1]['text'].split()
            ]
            sentimentWords.extend(text)
            sentimentBigrams.extend(list(nltk.bigrams(text)))
Beispiel #58
0
def extract_bias_features(text):
    features = {}
    txt_lwr = str(text).lower()
    words = nltk.word_tokenize(txt_lwr)
    words = [w for w in words if len(w) > 0 and w not in '.?!,;:\'s"$']
    unigrams = sorted(list(set(words)))
    bigram_tokens = nltk.bigrams(words)
    bigrams = [" ".join([w1, w2]) for w1, w2 in sorted(set(bigram_tokens))]
    trigram_tokens = nltk.trigrams(words)
    trigrams = [
        " ".join([w1, w2, w3]) for w1, w2, w3 in sorted(set(trigram_tokens))
    ]
    # print words
    # print unigrams
    # print bigrams
    # print trigrams
    # print "----------------------"

    # word count
    features['word_cnt'] = len(words)

    # unique word count
    features['unique_word_cnt'] = len(unigrams)

    # coherence marker count
    count = count_feature_list_freq(coherence, words, bigrams, trigrams)
    features['cm_cnt'] = count
    features['cm_rto'] = round(float(count) / float(len(words)), 4)

    # degree modifier count
    count = count_feature_list_freq(modifiers, words, bigrams, trigrams)
    features['dm_cnt'] = count
    features['dm_rto'] = round(float(count) / float(len(words)), 4)

    # hedge word count
    count = count_feature_list_freq(hedges, words, bigrams, trigrams)
    features['hedge_cnt'] = count
    features['hedge_rto'] = round(float(count) / float(len(words)), 4)

    # factive verb count
    count = count_feature_list_freq(factives, words, bigrams, trigrams)
    features['factive_cnt'] = count
    features['factive_rto'] = round(float(count) / float(len(words)), 4)

    # assertive verb count
    count = count_feature_list_freq(assertives, words, bigrams, trigrams)
    features['assertive_cnt'] = count
    features['assertive_rto'] = round(float(count) / float(len(words)), 4)

    # implicative verb count
    count = count_feature_list_freq(implicatives, words, bigrams, trigrams)
    features['implicative_cnt'] = count
    features['implicative_rto'] = round(float(count) / float(len(words)), 4)

    # bias words and phrases count
    count = count_feature_list_freq(biased, words, bigrams, trigrams)
    features['bias_cnt'] = count
    features['bias_rto'] = round(float(count) / float(len(words)), 4)

    # opinion word count
    count = count_feature_list_freq(opinionLaden, words, bigrams, trigrams)
    features['opinion_cnt'] = count
    features['opinion_rto'] = round(float(count) / float(len(words)), 4)

    # weak subjective word count
    count = count_feature_list_freq(subj_weak, words, bigrams, trigrams)
    features['subj_weak_cnt'] = count
    features['subj_weak_rto'] = round(float(count) / float(len(words)), 4)

    # strong subjective word count
    count = count_feature_list_freq(subj_strong, words, bigrams, trigrams)
    features['subj_strong_cnt'] = count
    features['subj_strong_rto'] = round(float(count) / float(len(words)), 4)

    # composite sentiment score using VADER sentiment analysis package
    compound_sentiment = vader_sentiment_analysis.polarity_scores(
        text)['compound']
    features['vader_sentiment'] = compound_sentiment

    # subjectivity score using Pattern.en
    pattern_subjectivity = pattern_sentiment(text)[1]
    features['subjectivity'] = round(pattern_subjectivity, 4)

    # modality (certainty) score and mood using  http://www.clips.ua.ac.be/pages/pattern-en#modality
    sentence = parse(text, lemmata=True)
    sentenceObj = Sentence(sentence)
    features['modality'] = round(modality(sentenceObj), 4)
    features['mood'] = mood(sentenceObj)

    # Flesch-Kincaid Grade Level (reading difficulty) using textstat
    features['fk_gl'] = textstat.flesch_kincaid_grade(text)

    # liwc 3rd person pronoun count (combines S/he and They)
    count = count_liwc_list_freq(liwc_3pp, words)
    features['liwc_3pp_cnt'] = count
    features['liwc_3pp_rto'] = round(float(count) / float(len(words)), 4)

    # liwc auxiliary verb count
    count = count_liwc_list_freq(liwc_aux, words)
    features['liwc_aux_cnt'] = count
    features['liwc_aux_rto'] = round(float(count) / float(len(words)), 4)

    # liwc adverb count
    count = count_liwc_list_freq(liwc_adv, words)
    features['liwc_adv_cnt'] = count
    features['liwc_adv_rto'] = round(float(count) / float(len(words)), 4)

    # liwc preposition count
    count = count_liwc_list_freq(liwc_prep, words)
    features['liwc_prep_cnt'] = count
    features['liwc_prep_rto'] = round(float(count) / float(len(words)), 4)

    # liwc conjunction count
    count = count_liwc_list_freq(liwc_conj, words)
    features['liwc_conj_cnt'] = count
    features['liwc_conj_rto'] = round(float(count) / float(len(words)), 4)

    # liwc discrepency word count
    count = count_liwc_list_freq(liwc_discr, words)
    features['liwc_discr_cnt'] = count
    features['liwc_discr_rto'] = round(float(count) / float(len(words)), 4)

    # liwc tentative word count
    count = count_liwc_list_freq(liwc_tent, words)
    features['liwc_tent_cnt'] = count
    features['liwc_tent_rto'] = round(float(count) / float(len(words)), 4)

    # liwc certainty word count
    count = count_liwc_list_freq(liwc_cert, words)
    features['liwc_cert_cnt'] = count
    features['liwc_cert_rto'] = round(float(count) / float(len(words)), 4)

    # liwc causation word count
    count = count_liwc_list_freq(liwc_causn, words)
    features['liwc_causn_cnt'] = count
    features['liwc_causn_rto'] = round(float(count) / float(len(words)), 4)

    # liwc work word count
    count = count_liwc_list_freq(liwc_work, words)
    features['liwc_work_cnt'] = count
    features['liwc_work_rto'] = round(float(count) / float(len(words)), 4)

    # liwc achievement word count
    count = count_liwc_list_freq(liwc_achiev, words)
    features['liwc_achiev_cnt'] = count
    features['liwc_achiev_rto'] = round(float(count) / float(len(words)), 4)

    return features
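# Hypothetical sketch of the counting pattern assumed above; the real
# count_feature_list_freq is defined elsewhere in the source. A feature list
# may mix unigrams, bigrams, and trigrams, so matches are summed over all three.
def count_feature_list_freq_demo(feature_list, words, bigram_strs, trigram_strs):
    return sum(words.count(f) + bigram_strs.count(f) + trigram_strs.count(f)
               for f in feature_list)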
# -*- coding: utf-8 -*-
"""Unigram, Biagram And Trigram Code .ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1fz3ji7JokGo0IDMD78uL4FBfQWh8-Wzg
"""

import nltk as nlp
#nlp.download()
paragrapha = """
Machine learning is a form of AI that enables a system to learn from data rather than through explicit programming.
However, machine learning is not a simple process.
As the algorithms ingest training data, it is then possible to produce more precise models based on that data.
A machine-learning model is the output generated when you train your machine-learning algorithm with data. After training, when you provide a model with an input, you will be given an output. For example, a predictive algorithm will create a predictive model. Then, when you provide the predictive model with data, you will receive a prediction based on the data that trained the model.
illustration of robot solving puzzle Iterative learning Machine learning enables models to train on data sets before being deployed. Some machine- learning models are online and continuous. This iterative process of online models leads to an improvement in the types of associations made between data elements. Due to their complexity and size, these patterns and associations could have easily been overlooked by human observation. After a model has been trained, it can be used in real time to learn from data. The improvements in accuracy are a result of the training process and automation that are part of machine learning.
"""
sentence = nlp.sent_tokenize(paragrapha)
word = nlp.word_tokenize(paragrapha)


print(list(nlp.bigrams(word)))
print(list(nlp.trigrams(word)))
#print(sentence)
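# A possible next step (sketch), reusing the nlp alias above: rank the most
# frequent bigrams and trigrams instead of printing the full lists.
bigram_freq = nlp.FreqDist(nlp.bigrams(word))
trigram_freq = nlp.FreqDist(nlp.trigrams(word))
print(bigram_freq.most_common(10))
print(trigram_freq.most_common(10))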
Beispiel #60
0
import re
from urllib.request import urlopen

# `soup` is assumed to be a BeautifulSoup object created earlier.
text = soup.p.contents[0]

text_1 = text.lower()

text_2 = re.sub(r'\W', ' ', text_1)

from nltk import word_tokenize
from nltk import bigrams
from nltk import trigrams
from nltk import ngrams

text_3 = word_tokenize(text_2)

text_3_bi = bigrams(text_3)
text_3_tri = trigrams(text_3)
text_3_n = ngrams(text_3, 4)

stop_words = urlopen(
    'http://jmlr.org/papers/volume5/lewis04a/a11-smart-stop-list/english.stop'
).read().decode('utf-8').split('\n')

##we can then identify the stop words and then eliminate them from the list

##this is code that executes a very simple for loop to check the list
text_4 = [x for x in text_3 if x not in stop_words]

##you can check what was removed with:

text_rem = [x for x in text_3 if x not in text_4]
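# A possible continuation (sketch): rebuild the n-grams from the
# stop-word-filtered tokens and inspect the most frequent ones.
from nltk import FreqDist

text_4_bi = list(bigrams(text_4))
text_4_tri = list(trigrams(text_4))
print(FreqDist(text_4_bi).most_common(10))
print(FreqDist(text_4_tri).most_common(10))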