Example #1
def getTestData(corpus, embedsize, ngramsize, m):
	f = open(corpus)
	datap = []
	for line in f:
		data = line.strip().split('\t')
		s1 = data[0]
		s2 = data[1]
		label = data[2]
		s1ng = ngrams(s1.split(' '), ngramsize)
		s2ng = ngrams(s2.split(' '), ngramsize)
		s1ng = set([ng for ng in s1ng])
		s2ng = set([ng for ng in s2ng])
		#diff = s2ng.difference(s1ng)
		all = s1ng.union(s2ng)
		datap.append(list(all))
	Xs = []
	wildcard = np.array([0.0]*embedsize)
	for ngs in datap:
		X = np.zeros((len(ngs), ngramsize, embedsize))
		for i in range(0, len(ngs)):
			ngram = ngs[i]
			vectors = getEmbedVectors(ngramsize, embedsize, ngram, m, wildcard)
			X[i] = vectors
		Xs.append(X)
	return Xs
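
The union-of-n-grams step above can be exercised on its own; a minimal sketch assuming from nltk.util import ngrams (getEmbedVectors and the embedding model m come from elsewhere in the original project):

s1, s2 = "the cat sat", "the cat slept"
s1ng = set(ngrams(s1.split(' '), 2))    # {('the', 'cat'), ('cat', 'sat')}
s2ng = set(ngrams(s2.split(' '), 2))    # {('the', 'cat'), ('cat', 'slept')}
print(s1ng.union(s2ng))                 # the n-grams that get embedded for this line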
def str_common_grams(str1, str2, length=3):
    '''Return how many times the n-grams (of the given length) of str1
    appear in str2.
    '''
    grams1 = list(ngrams(str1, length))
    grams2 = list(ngrams(str2, length))
    return sum(grams2.count(gram) for gram in grams1)
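
A short usage sketch for str_common_grams above; it assumes from nltk.util import ngrams and counts character n-grams:

# every character trigram of "overflow" occurs exactly once in "stack overflow"
print(str_common_grams("overflow", "stack overflow"))   # 6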
Example #3
def getTrainData(corpus, embedsize, ngramsize, m):
	f = open(corpus)
	datap = []
	for line in f:
		data = line.strip().split('\t')
		s1 = data[0]
		s2 = data[1]
		label = data[2]
		s1ng = ngrams(s1.split(' '), ngramsize)
		s2ng = ngrams(s2.split(' '), ngramsize)
		s1ng = set([ng for ng in s1ng])
		s2ng = set([ng for ng in s2ng])
		#diff = s2ng.difference(s1ng)
		all = s1ng.union(s2ng)
		for ng in all:
			datap.append([ng, label])
	X = np.zeros((len(datap), ngramsize, embedsize))
	Y = np.zeros((len(datap), 3))
	wildcard = np.array([0.0]*embedsize)
	for i in range(0, len(datap)):
		item = datap[i]
		ngram = item[0]
		label = item[1]
		vectors = getEmbedVectors(ngramsize, embedsize, ngram, m, wildcard)
		labels = getLabels(label)
		X[i] = vectors
		Y[i] = labels
	return X, Y
    def modified_precision(candidate, references, n):
        candidate_ngrams = list(ngrams(candidate, n))

        if len(candidate_ngrams) == 0:
            return 0

        c_words = set(candidate_ngrams)
        total_clipped = 0
        for word in c_words:
            count_w = candidate_ngrams.count(word)

            count_max = 0
            for reference in references:
                reference_ngrams = list(ngrams(reference, n))
                count = reference_ngrams.count(word)
                if count > count_max:
                    count_max = count

            # clip each candidate n-gram count by its maximum reference count
            total_clipped += min(count_w, count_max)

        return total_clipped / len(candidate_ngrams)
Example #5
def char_ngram_similarity(doc1, doc2, n, top=100):
    """
    Gives a positive dissimilarity score of two documents with respect to their top m character n-grams distribution.
    If the value is 0 the documents are identical (or at least share an identical top m character n-grams distribution.
    :param doc1:
    :param doc2:
    :param n: the n-gram length
    :param top: Only use the N most frequent n-grams from each document.
    :return: A positive dissimilarity score. If the value is 0 the documents are identical (or at least their top m
             character n-grams distribution.)
    """

    ngrams1 = Counter(ngrams(doc1, n))
    ngrams2 = Counter(ngrams(doc2, n))

    profile1 = [n[0] for n in ngrams1.most_common(top)]
    profile2 = [n[0] for n in ngrams2.most_common(top)]

    # normalise the two ngram distributions
    total1 = np.sum(list(ngrams1.values()))
    for key in ngrams1:
        ngrams1[key] /= total1

    total2 = np.sum(list(ngrams2.values()))
    for key in ngrams2:
        ngrams2[key] /= total2

    # calculate global dissimilarity score
    score = 0
    for g in set(profile1 + profile2):
        f1 = ngrams1[g]
        f2 = ngrams2[g]
        score += ((2 * (f1 - f2)) / (f1 + f2)) ** 2
    return score
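
A minimal usage sketch for char_ngram_similarity above, assuming the imports the snippet relies on (Counter, ngrams and numpy as np):

doc_a = "the quick brown fox jumps over the lazy dog"
doc_b = "the quick brown fox jumped over a lazy dog"
print(char_ngram_similarity(doc_a, doc_b, n=3, top=50))   # small positive dissimilarity
print(char_ngram_similarity(doc_a, doc_a, n=3, top=50))   # 0.0 for identical documents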
Example #6
	def scoreScopeOverlap(self,scopeHyp,scopeRef):
		
		totalScore = 0

		for scope_h in scopeHyp:
			bestScore = 0
			for scope_r in scopeRef:

				if scope_r==[] or scope_h==[]:
					partialScore = 0
					if partialScore > bestScore: bestScore = partialScore
				else:
					ngram_range = range(1, len(scope_h) + 1)
					logging.info("ngram_range")
					logging.info(ngram_range)
					# weight each n-gram order n by n / sum(1..N), as a list so it can be indexed below
					score_weights = [round(x / sum(ngram_range), 4) for x in ngram_range]
					logging.info(score_weights)
				
					partialScore=float()
					for i in ngram_range:
						hyp=ngrams(scope_h,i)
						ref=ngrams(scope_r,i)
						partialScore+=(len(set(hyp).intersection(set(ref)))*score_weights[i-1])
					logging.info("partialScore")
					logging.info(partialScore)
					if partialScore > bestScore: bestScore = partialScore

			totalScore+=bestScore
			logging.info("totalScore")
			logging.info(totalScore)
			
		return totalScore
def calc_ngram(htokens,etokens):
    features = []
    for n in range(1,5):
        hgrams = nltk.FreqDist(ngrams(htokens,n))
        egrams = nltk.FreqDist(ngrams(etokens,n))
        prec = 0
        num = 0
        for k in hgrams:
            if k in egrams:
                prec = prec + hgrams[k]
            num = num + hgrams[k]
        if num > 0:
            prec = float(prec) / num
        features.append(prec)
        recall = 0
        num = 0
        for k in egrams:
            if k in hgrams:
                recall = recall + egrams[k]
            num = num + egrams[k]
        if num > 0:
            recall = float(recall) / num
        features.append(recall)
        features.append(calc_f1(prec,recall))
    return features
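
A usage sketch for calc_ngram above; calc_f1 is a project helper that is not shown here, so a hypothetical stand-in is defined:

import nltk
from nltk.util import ngrams

def calc_f1(prec, recall):
    # hypothetical stand-in for the project's F1 helper
    return 2 * prec * recall / (prec + recall) if (prec + recall) > 0 else 0.0

hyp = "the cat sat on the mat".split()
ref = "the cat is on the mat".split()
print(calc_ngram(hyp, ref))   # 12 values: precision, recall and F1 for n = 1..4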
def format_text(entries, LSTM_shape=True):
	THIS_FOLDER = str(os.path.dirname(os.path.abspath(__file__)))
	sentences = []
	tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
	decoded = base64.b64decode(entries).decode("utf-8", errors="ignore")
	decoded = decoded.split(".")
	#print(decoded, "is decoded")
	for entry in decoded:
		token_sentences = tokenizer.tokenize(entry)
		for sentence in token_sentences:
			sentences.append(sentence)

	tokenized_sentences = []
	#remove_tokens = ['%', ']', '[', '.', ',', '?', '!', '\'']
	#remove_tokens = string.punctuation
	remove_tokens = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
	stop_words = set(stopwords.words('english'))
	tweet_tknzr = TweetTokenizer()
	for sentence in sentences:
		tokens = tweet_tknzr.tokenize(sentence)
		tokens = list(filter(lambda a: a not in remove_tokens and a not in stop_words, tokens))
		tokenized_sentences.append(tokens)

	all_ngrams1 = np.load(THIS_FOLDER+'/ngrams1.npy', allow_pickle=True).item()
	all_ngrams2 = np.load(THIS_FOLDER+'/ngrams2.npy', allow_pickle=True).item()
	all_ngrams3 = np.load(THIS_FOLDER+'/ngrams3.npy', allow_pickle=True).item()
	#once the model gets updated with good data, ngrams.py needs to get changed/updated too!

	X = np.zeros((len(sentences), len(all_ngrams1)+len(all_ngrams2)+len(all_ngrams3)))
	for i in range(len(tokenized_sentences)):
		sentence = tokenized_sentences[i]
		my_ngrams = ngrams(sentence, 1)
		for gram in my_ngrams:
			if gram in all_ngrams1:
				index = all_ngrams1[gram]
				X[i][index] = 1
	for i in range(len(tokenized_sentences)):
		sentence = tokenized_sentences[i]
		my_ngrams = ngrams(sentence, 2)
		for gram in my_ngrams:
			if gram in all_ngrams2:
				index = len(all_ngrams1) + all_ngrams2[gram]
				X[i][index] = 1
	for i in range(len(tokenized_sentences)):
		sentence = tokenized_sentences[i]
		my_ngrams = ngrams(sentence, 3)
		for gram in my_ngrams:
			if gram in all_ngrams3:
				index = len(all_ngrams1) + len(all_ngrams2) + all_ngrams3[gram]
				X[i][index] = 1


	if LSTM_shape:
		X = np.reshape(X, (X.shape[0], 1, X.shape[1]))
	else:
		X = np.reshape(X, (X.shape[0], X.shape[1]))
	return X
Example #9
    def extract_terms_features(terms, separateGrams=False):
        vector = dict()
        
        while('' in terms):
            terms.remove('')
#        for term in terms:
#            if vector.has_key(term):
#                vector[term] += 1
#            else:
#                vector[term] = 1
#        for i in range(len(terms) - 2):
#            cb2 = ' '.join(terms[i:i+1])
#            cb3 = ' '.join(terms[i:i+2])
#            if vector.has_key(cb2):
#                vector[cb2] += 1
#            else:
#                vector[cb2] = 1
#            if vector.has_key(cb3):
#                vector[cb3] += 1
#            else:
#                vector[cb3] = 1
#        cb2 = ' '.join(terms[len(terms)-2:len(terms)])
#        if vector.has_key(cb2):
#            vector[cb2] += 1
#        else:
#            vector[cb2] = 1
#        print terms
        g2 = ngrams(terms, 2)
        g3 = ngrams(terms, 3)
        
        
        g2j = [' '.join(gterms) for gterms in g2]
        g3j = [' '.join(gterms) for gterms in g3]
        
        
        vec1 = {}
        vec2 = {}
        vec3 = {}
        
        for t in terms:
            if t not in vec1:
                vec1[t] = 1
            else:
                vec1[t] += 1
        for t in g2j:
            if t not in vec2:
                vec2[t] = 1
            else:
                vec2[t] += 1
        for t in g3j:
            if t not in vec3:
                vec3[t] = 1
            else:
                vec3[t] += 1
        
        vector = dict(list(vec1.items()) + list(vec2.items()) + list(vec3.items()))
        if(separateGrams == True):
            return (vector, vec1, vec2, vec3)
        else:
            return vector
Example #10
def getNgramProbs(file):
	f = open(file, 'r')
	unigramList = []
	for line in f.read().split():
		unigramList.append(line)

	bigramList = ngrams(unigramList, 2)
	trigramList = ngrams(unigramList, 3)

	#dictionary of unigrams, bigrams, trigrams
	unigramDict = dict()
	bigramDict = dict()
	trigramDict = dict()

	#Counts for Unigrams
	countUni = 0
	for item in unigramList:
		countUni += 1
		if item not in unigramDict:
			unigramDict[item] = 1
		else:
			unigramDict[item] += 1

	#Counts for Bigram
	for item in bigramList:
		if item not in bigramDict:
			bigramDict[item] = 1
		else:
			bigramDict[item] += 1

	#Counts for Trigrams
	for item in trigramList:
		if item not in trigramDict:
			trigramDict[item] = 1
		else:
			trigramDict[item] += 1

	#Probabilities for Trigrams
	for key in trigramDict:
		trigramDict[key] /= float(bigramDict[(key[0], key[1])])

	#Probabilities for Bigrams
	for key in bigramDict:
		bigramDict[key] /= float(unigramDict[key[0]])

	#Probabilities for Unigrams
	for key in unigramDict:
		unigramDict[key] /= float(countUni)

	# print "***** Unigrams";
	# for key,item in unigramDict.iteritems():
	#     print str(key) + ' ' + str(item) ;
	# print "***** Bigrams";
	# for key,item in bigramDict.iteritems():
	#     print str(key) + ' ' + str(item) ;
	# print "***** Trigrams";
	# for key,item in trigramDict.iteritems():
	#     print str(key) + ' ' + str(item) ;
	
	return [unigramDict, bigramDict, trigramDict]
Example #11
def create_candidate_list(sentence):
    tokens = nltk.tokenize.word_tokenize(sentence)

    candidates_lists = create_candidates_lists(tokens)

    # Create list of 1-grams.
    candidates = []
    for l in candidates_lists:
        candidates += l

    # Remove irrelevant stop words in 1-grams.
    res = [token for token in candidates
        if token not in ENGLISH_STOPWORDS]

    # Create list of bigrams.
    bigrams = []
    for l in candidates_lists:
        bigrams += ngrams(l, 2)

    # Create list of trigrams.
    trigrams = []
    for l in candidates_lists:
        trigrams += ngrams(l, 3)

    # Create list of 4-grams.
    fourgrams = []
    for l in candidates_lists:
        fourgrams += ngrams(l, 4)

    res += [' '.join(a) for a in bigrams]
    res += [' '.join(a) for a in trigrams]
    res += [' '.join(a) for a in fourgrams]

    return res
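
A sketch of how create_candidate_list might be called; create_candidates_lists and ENGLISH_STOPWORDS are project helpers that are not shown, so hypothetical stand-ins are used:

import nltk
from nltk.util import ngrams

ENGLISH_STOPWORDS = {'the', 'a', 'an', 'of', 'on', 'in', 'over'}   # hypothetical stand-in

def create_candidates_lists(tokens):
    # hypothetical stand-in: treat the whole sentence as a single candidate span
    return [tokens]

print(create_candidate_list("the quick brown fox jumps over the lazy dog"))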
Example #12
    def rouge_s(references, candidate, beta, d_skip=None, averaging=True, smoothing=False):

        rouge_s_list = []
        k_c = len(candidate) if d_skip is None else d_skip
        cand_skip_list = list(skipgrams(tokenizer.tokenize(candidate),
                              n=2, k=k_c))
        for ref in references:
            k_ref = len(ref) if d_skip is None else d_skip
            ref_skip_list = list(skipgrams(tokenizer.tokenize(ref),
                                 n=2, k=k_ref))
            count = 0
            for bigram in cand_skip_list:
                if bigram in ref_skip_list:
                    count = count+1
            if not smoothing:
                r_skip = count/len(ref_skip_list)
                p_skip = count/len(cand_skip_list)
            else:
                cand_ungm = list(ngrams(tokenizer.tokenize(candidate),
                                      n=1))
                ref_ungm = list(ngrams(tokenizer.tokenize(ref),
                                     n=1))
                for ungm in cand_ungm:
                    if ungm in ref_ungm:
                        count += 1
                r_skip = count/(len(ref_skip_list)+len(ref_ungm))
                p_skip = count/(len(cand_skip_list)+len(cand_ungm))
            score = Rouge.get_score(r_skip, p_skip, beta)           
            rouge_s_list.append(score)
        return Rouge.jacknifing(rouge_s_list, averaging=averaging)
Example #13
def jaccardIdx(w1, w2):
    w1ngrams = set(ngrams(w1, 2))
    w2ngrams = set(ngrams(w2, 2))

    union = w1ngrams.union(w2ngrams)
    intersect = w1ngrams.intersection(w2ngrams)

    return 1.0 - float(len(intersect)) / float(len(union))
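
A short usage sketch for jaccardIdx above (a Jaccard distance over character bigrams, assuming from nltk.util import ngrams):

print(jaccardIdx("night", "nacht"))   # ~0.857: only the ('h', 't') bigram is shared
print(jaccardIdx("night", "night"))   # 0.0: identical bigram sets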
Example #14
    def count_word(self,doc,unigram = True,bigram = False,binary = False):
        str = word_tokenize(self.remove_non_ascii(doc))
        doc_voc = {}
        if(unigram):
            uni = ngrams(str,1)
            self.count_word_sub(doc_voc,uni,binary)

        if(bigram):
            bi = ngrams(str,2)
            self.count_word_sub(doc_voc,bi,binary)
	def trainModel(self, listOfFilenames):
		#dictionary of unigrams, bigrams, trigrams
		unigramDict = dict()
		bigramDict = dict()
		trigramDict = dict()

		#total count of unigrams, bigrams, trigrams
		countUni = 0
		countBi = 0
		countTri = 0

		i = 1
		#iterate over list of files
		for fileName in listOfFilenames:
			print "Reading", i
			i += 1
			stag = STagger(fileName)
			stag.find_unigrams(True, False)
			for item in stag.unigrams:
				countUni += 1
				if item not in unigramDict:
					unigramDict[item] = 1
				else:
					unigramDict[item] += 1
			codeBigrams = ngrams(stag.unigrams, 2)
			codeTrigrams = ngrams(stag.unigrams, 3)
			for item in codeBigrams:
				countBi += 1
				if item not in bigramDict:
					bigramDict[item] = 1
				else:
					bigramDict[item] += 1
			for item in codeTrigrams:
				countTri += 1
				if item not in trigramDict:
					trigramDict[item] = 1
				else:
					trigramDict[item] += 1

		
		#write the ngrams to the file
		outputFile = open('corpus.txt', 'w')
		outputFile.write(str(countUni) + "\n")
		for key, x in unigramDict.items():
			outputFile.write(str(key) + " " + str(x) + "\n")

		outputFile.write(str(countBi) + "\n")
		for key, x in bigramDict.items():
			outputFile.write(str(key[0]) + " "  + str(key[1]) + " " + str(x) + "\n")

		outputFile.write(str(countTri) + "\n")
		for key, x in trigramDict.items():
			outputFile.write(str(key[0]) + " " + str(key[1]) + " " + str(key[2]) + " " + str(x) + "\n")

		outputFile.close()
Example #16
 def get_ngrams(self, tokens):
     tokens.insert(0, '<START>')
     unigrams = ngrams(tokens,1)
     # key for unigrams is ('word',), not just 'word' string.
     for item in unigrams: self.fdist1[item] += 1 
     
     bigrams = ngrams(tokens,2)
     for item in bigrams: self.fdist2[item] += 1 
     
     trigrams = ngrams(tokens,3)
     for item in trigrams: self.fdist3[item] += 1 
Example #17
 def uni_bi_gram(self,doc,unigram,bigram):
     ret_list = []
     if(unigram):
         uni = ngrams(doc,1)
         for gram in uni:
             ret_list.append(gram)
     if(bigram):
         bi = ngrams(doc,2)
         for gram in bi:
             ret_list.append(gram)
     return ret_list
Example #18
    def modified_precision(candidate, references, n):
        """ Calculate modified ngram precision.

        >>> BLEU.modified_precision(
        ...    'the the the the the the the'.split(),
        ...    ['the cat is on the mat'.split(), 'there is a cat on the mat'.split()],
        ...    n=1,
        ... )
        0.28...

        >>> BLEU.modified_precision(
        ...    'the the the the the the the'.split(),
        ...    ['the cat is on the mat'.split(), 'there is a cat on the mat'.split()],
        ...    n=2,
        ... )
        0.0

        >>> BLEU.modified_precision(
        ...    'of the'.split(),
        ...    [
        ...        'It is a guide to action that ensures that the military will forever heed Party commands.'.split(),
        ...        'It is the guiding principle which guarantees the military forces always being under the command of the Party.'.split(),
        ...        'It is the practical guide for the army always to heed the directions of the party'.split(),
        ...    ],
        ...    n=1,
        ... )
        1.0

        >>> BLEU.modified_precision(
        ...    'of the'.split(),
        ...    [
        ...        'It is a guide to action that ensures that the military will forever heed Party commands.'.split(),
        ...        'It is the guiding principle which guarantees the military forces always being under the command of the Party.'.split(),
        ...        'It is the practical guide for the army always to heed the directions of the party'.split(),
        ...    ],
        ...    n=2,
        ... )
        1.0

        """
        counts = Counter(ngrams(candidate, n))

        if not counts:
            return 0

        max_counts = {}
        for reference in references:
            reference_counts = Counter(ngrams(reference, n))
            for ngram in counts:
                max_counts[ngram] = max(max_counts.get(ngram, 0), reference_counts[ngram])

        clipped_counts = dict((ngram, min(count, max_counts[ngram])) for ngram, count in counts.items())

        return sum(clipped_counts.values()) / sum(counts.values())
def ngram_similarity(str1, str2, n = 3):
    str1 = str1.split()
    str2 = str2.split()
    ngram1 = []
    ngram2 = []
    for i in range(n):
        ngram1 = ngram1 + list(ngrams(str1,n-i))
    
    for i in range(n):
        ngram2 = ngram2 + list(ngrams(str2,n-i))
    return jaccard_dis(set(ngram1),set(ngram2))
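
The jaccard_dis helper used above is assumed to compute |intersection| / |union| of the two n-gram sets (a full version appears in a later example); a minimal usage sketch with a stand-in:

from nltk.util import ngrams

def jaccard_dis(s1, s2):
    # assumed helper: Jaccard similarity of two n-gram sets
    return float(len(s1.intersection(s2))) / len(s1.union(s2))

print(ngram_similarity("the cat sat on the mat", "the cat is on the mat"))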
Example #20
def create_model(tokenized_data):
    tokens_list = [tokens for ndata in tokenized_data for tokens in ndata]
    cfreq_data_bigram = nltk.ConditionalFreqDist(nltk.bigrams(tokens_list))
    trigrams = ngrams(tokens_list, 3)
    fourgram = ngrams(tokens_list, 4)
    fivegram = ngrams(tokens_list, 5)
    sixgram = ngrams(tokens_list, 6)
    return cfreq_data_bigram, trigrams, fourgram, fivegram, sixgram
 def __call__(self, doc): 
     filtered_words = doc.split(" ")
     tokens = []
         
         
     for word in filtered_words:
         tokens.append(word)
     for bigram in ngrams(filtered_words,2):
         tokens.append('%s %s' %bigram)
     for trigram in ngrams(filtered_words,3):
         tokens.append('%s %s %s' %trigram)
     return tokens
def modified_precision(h,ref,n):
    ng_counts_h = Counter(ngrams(h,n))
    ng_counts_ref = Counter(ngrams(ref,n))
    modified_counts = Counter()   

    
    if not ng_counts_h:
        return 0
    for ng in ng_counts_h.keys():
        modified_counts[ng] = max(modified_counts[ng], ng_counts_ref[ng])
    truncated_cts = Counter((ng, min(ng_counts_h[ng],modified_counts[ng])) for ng in ng_counts_h)
    return sum(truncated_cts.values())/float(sum(ng_counts_h.values()))
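
A usage sketch for the Counter-based modified precision above (assumes from collections import Counter and from nltk.util import ngrams):

hyp = 'the the the the the the the'.split()
ref = 'the cat is on the mat'.split()
print(modified_precision(hyp, ref, 1))   # 2/7 ~ 0.2857: "the" is clipped to its reference count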
def alzahrani_window_similarity(a_string, b_string, window_length, preprocess=1, window_length_normalize=1):
	'''read in two strings a,b and integer window_length and return the maximum alwhazari similarity across subwindows of the specified length'''
	a_windows      = list( ngrams( preprocess_string(a_string), window_length ) )
	b_windows      = list( ngrams( preprocess_string(b_string), window_length ) )
	max_similarity = 0
	
	for a_window in a_windows:
		for b_window in b_windows:						
			sim = alzahrani_similarity( list(a_window), list(b_window), preprocess=0, length_normalize=window_length_normalize )
			if sim > max_similarity:
				max_similarity = sim
				
	return max_similarity
Example #24
def mappings(x):
    if not re.search(r'^\[.*\]$|javascript:void|^nan$', str(x['target'])):
        query_tokens =[i for i in nltk.word_tokenize(queryFromURL(x['url']))]
    #target_tokens =[i for i in nltk.word_tokenize(queryFromURL(x['target'])) if i not in stops]    
        unigram = [ i for i in query_tokens if i not in stops]
        bigrams = ngrams(query_tokens,2)
        trigrams = ngrams(query_tokens,3)
        for i in unigram:        
            query_target_mappings[(i,x['target'])] = query_target_mappings.get((i,x['target']),0) +1
        for i in bigrams:        
            query_target_mappings[(i,x['target'])] = query_target_mappings.get((i,x['target']),0) +1
        for i in trigrams:        
            query_target_mappings[(i,x['target'])] = query_target_mappings.get((i,x['target']),0) +1
Example #25
def vocab(all_rev):
    vocab=[]
    
    for rows in all_rev:                  
        words=[word_tokenize(str(i)) for i in rows]  
        words=[i for i in words[0] if i.isalpha() and len(i)>1]
        word=[i for i in words if i not in stopwords.words('english')]
        bigrams=[i for i in ngrams(word,2) if i not in word]
        trigrams=[i for i in ngrams(word,3) if i not in word]         
        word.extend(bigrams)
        word.extend(trigrams)                                  
        vocab.append(word)
    return vocab
Example #26
    def count_word_per_file(self,doc,class_dict,unigram = True,bigram = False ,binary =False):
        str = word_tokenize(self.remove_non_ascii(doc))
        word_list = []
        word_count = 0
        if(unigram):
            uni = ngrams(str,1)
            word_count += self.count_word_per_file_sub(class_dict,uni,binary)

        if(bigram):
            bi = ngrams(str,2)
            word_count += self.count_word_per_file_sub(class_dict,bi,binary)

        return word_count
def distance_bigrams_same(t1, t2):
    """Bigram distance metric, term frequency is ignored,
       0 if bigrams are identical, 1.0 if no bigrams are common"""
    t1_terms = make_terms_from_string(t1)
    t2_terms = make_terms_from_string(t2)
    terms1 = set(ngrams(t1_terms, 2))  # was using nltk.bigrams
    terms2 = set(ngrams(t2_terms, 2))
    shared_terms = terms1.intersection(terms2)
    all_terms = terms1.union(terms2)
    dist = 1.0
    if len(all_terms) > 0:
        dist = 1.0 - (len(shared_terms) / float(len(all_terms)))
    return dist
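
A usage sketch for distance_bigrams_same above; make_terms_from_string is the project's term extractor, so a simple whitespace stand-in is used here:

from nltk.util import ngrams

def make_terms_from_string(s):
    # hypothetical stand-in for the project's term extractor
    return s.lower().split()

print(distance_bigrams_same("the cat sat on the mat", "the cat sat on a mat"))   # ~0.571, some shared bigrams
print(distance_bigrams_same("completely different", "no overlap here"))          # 1.0, no common bigrams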
Example #28
    def score(self,parallel_corpus):
        
        # containers
        count = [0,0,0,0]
        clip_count = [0,0,0,0]
        r = 0
        c = 0
        weights=[0.25,0.25,0.25,0.25]

        # accumulate ngram statistics
        for hyps,refs in parallel_corpus:
            hyps = [hyp.split() for hyp in hyps]
            refs = [ref.split() for ref in refs]
            for hyp in hyps:
                
                for i in range(4):
                    # accumulate ngram counts
                    hypcnts = Counter(ngrams(hyp, i+1))
                    cnt = sum(hypcnts.values())
                    count[i] += cnt

                    # compute clipped counts 
                    max_counts = {}
                    for ref in refs:
                        refcnts = Counter(ngrams(ref, i+1))
                        for ng in hypcnts:
                            max_counts[ng] = max(max_counts.get(ng, 0),refcnts[ng])
                    clipcnt = dict((ng, min(count, max_counts[ng])) \
                            for ng, count in hypcnts.items())
                    clip_count[i] += sum(clipcnt.values())

                # accumulate r & c
                bestmatch = [1000,1000]
                for ref in refs:
                    if bestmatch[0]==0: break
                    diff = abs(len(ref)-len(hyp))
                    if diff<bestmatch[0]:
                        bestmatch[0] = diff
                        bestmatch[1] = len(ref)
                r += bestmatch[1]
                c += len(hyp)

        # computing bleu score
        p0 = 1e-7
        bp = 1 if c>r else math.exp(1-float(r)/float(c))
        p_ns = [float(clip_count[i])/float(count[i]+p0)+p0 \
                for i in range(4)]
        s = math.fsum(w*math.log(p_n) \
                for w, p_n in zip(weights, p_ns) if p_n)
        bleu = bp*math.exp(s)
        return bleu
    def ngram_similarity(str1, str2, n=3):
        def jaccard_dis(s1, s2):
            return float(len(s1.intersection(s2))) / len(s1.union(s2))

        str1 = str1.split()
        str2 = str2.split()
        ngram1 = []
        ngram2 = []
        for i in range(n):
            ngram1 = ngram1 + list(ngrams(str1, n - i))

        for i in range(n):
            ngram2 = ngram2 + list(ngrams(str2, n - i))
        return jaccard_dis(set(ngram1), set(ngram2))
def tokens(filename):
  """
  Read feature tokens
  """
  with codecs.open(filename, 'rb', encoding="windows-1251") as myfile:
    text = myfile.read().strip().lower()

  token = nltk.word_tokenize(text)
  unigrams = ngrams(token, 1)
  trigrams = ngrams(token, 3)

  # unigrams = re.split(r"\s+", text)

  return chain(unigrams, trigrams)
Example #31
# FIRST ALTERNATIVE FOR BIGRAMS
df_aux = []
def words(text): return re.findall(r'\w+', text.lower())

for text in df_train_total.get('reviewText'):
    df_aux.extend(words(text))

# A SECOND ALTERNATIVE THAT PUT THE BIGRAMS INTO A DATA FRAME
bigrams= list()
bigrams_all = list()
for text in df_train_total.get('reviewText'):
    bigrams.clear()
    for word in text.split():
        bigrams.append(word)

    bigrams = list(ngrams(bigrams, 2))
    bigrams_all.append(tuple(bigrams))

# before
bigrams = ngrams(df_aux, 2)
BigramFreq = Counter(bigrams_all)
get_bigrams_to_list = list(BigramFreq)

# tentative list alternative
get_bigrams_to_list = list(bigrams_all)
df_bigrams = df_train_total
df_bigrams['reviewText'] = get_bigrams_to_list

# MOST FREQUENT BIGRAMS
get_bigrams = BigramFreq.most_common(10)
Example #32
def get_cluster_label_sentences(cluster, dic, knowledge_threshold, debug_file):
    candidate_label = {}
    most_freq_word = {}
    filtered_words = open('msft/filtered_tf.txt', 'r')
    filtered_words = [word.strip() for word in filtered_words.readlines()]
    filtered_words = set(filtered_words)
    if len(cluster) <= 1:
        return 'NOLABEL'
    for value in cluster:
        value = filter(value)
        unigrams = value.split()
        bigrams = ngrams(unigrams, 2)
        trigrams = ngrams(unigrams, 3)
        combined = unigrams + [' '.join(g) for g in bigrams] + [' '.join(g) for g in trigrams]
        for word in combined:
            if word in filtered_words:
                for concept in dic[word]:
                    concept = concept.strip()
                    if concept not in candidate_label:
                        candidate_label[concept] = 0
                    candidate_label[concept] += 1
            else:
                # for word in value.split():
                word = word.strip()
                if word not in most_freq_word:
                    most_freq_word[word] = 0
                most_freq_word[word] += 1
    backup = ''
    if len(candidate_label) > 0:
        candidate_label_sets = sorted(candidate_label.items(),
                                      key=operator.itemgetter(1))
        if (candidate_label_sets[len(candidate_label_sets) - 1][1] >
                knowledge_threshold):
            debug_file.write('Mode 1\n')
            debug_file.write(str(cluster) + '\n')
            debug_file.write(str(candidate_label_sets) + '\n')
            debug_file.write(candidate_label_sets[len(candidate_label_sets) -
                                                  1][0] + '\n\n')
            return candidate_label_sets[len(candidate_label_sets) - 1][0]
        else:
            backup = candidate_label_sets[len(candidate_label_sets) - 1][0]
    most_freq_word_sets = sorted(most_freq_word.items(),
                                 key=operator.itemgetter(1))
    if len(most_freq_word_sets) > 0:
        debug_file.write('Mode 2\n')
        debug_file.write(str(cluster) + '\n')
        debug_file.write(str(most_freq_word_sets) + '\n')
        debug_file.write(most_freq_word_sets[len(most_freq_word_sets) - 1][0] +
                         '\n\n')
        filtered_words = [
            word for word in most_freq_word_sets
            if word[0].lower() not in stopwords.words('english')
            and len(filter(word[0])) > 3
        ]
        if len(filtered_words) > 0:
            return filtered_words[len(filtered_words) - 1][0]
        return 'NOLABEL'
    if len(backup) > 0:
        return backup
    else:
        return 'NOLABEL'


# def get_cluster_label_sentences(cluster, dic, knowledge_threshold, debug_file):
#     candidate_label = {}
#     most_freq_word = {}
#     if len(cluster)<=1:
#         return 'NOLABEL'
#     for value in cluster:
#         for word in value.split():
#             if word in dic:
#                 for concept in dic[word]:
#                     concept = concept.strip()
#                     if concept not in candidate_label:
#                         candidate_label[concept]=0
#                     candidate_label[concept]+=1
#             else:
#                 #for word in value.split():
#                 word = word.strip()
#                 if word not in most_freq_word:
#                     most_freq_word[word]=0
#                 most_freq_word[word]+=1
#     backup = ''
#     if 0 and len(candidate_label)>0:
#         candidate_label_sets = sorted(candidate_label.items(),key=operator.itemgetter(1))
#         if(candidate_label_sets[len(candidate_label_sets)-1][1]>knowledge_threshold):
#             debug_file.write('Mode 1\n')
#             debug_file.write(str(cluster)+'\n')
#             debug_file.write(str(candidate_label_sets)+'\n')
#             debug_file.write(candidate_label_sets[len(candidate_label_sets)-1][0]+'\n\n')
#             return candidate_label_sets[len(candidate_label_sets)-1][0]
#         else:
#             backup = candidate_label_sets[len(candidate_label_sets)-1][0]
#     most_freq_word_sets = sorted(most_freq_word.items(), key=operator.itemgetter(1))
#     if len(most_freq_word_sets)>0:
#         debug_file.write('Mode 2\n')
#         debug_file.write(str(cluster) + '\n')
#         debug_file.write(str(most_freq_word_sets) + '\n')
#         debug_file.write(most_freq_word_sets[len(most_freq_word_sets)-1][0] + '\n\n')
#         filtered_words = [word for word in most_freq_word_sets if word[0].lower() not in stopwords.words('english') and len(filter(word[0]))>3]
#         if len(filtered_words) > 0:
#             return filtered_words[len(filtered_words)-1][0]
#         return 'NOLABEL'
#     if len(backup)>0:
#         return backup
#     else:
#         return 'NOLABEL'
 def char_ngram(self, n, word):
     char_tokens = list(word)
     char_ngrams = ngrams(
         char_tokens, n)  # prefix-suffix is automatically generated here
     return map(lambda x: ''.join(x), char_ngrams)
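
The core of char_ngram above can be shown directly: character n-grams joined back into strings (assumes from nltk.util import ngrams):

print([''.join(g) for g in ngrams(list("sherlock"), 3)])
# ['she', 'her', 'erl', 'rlo', 'loc', 'ock']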
Example #34
            words = nltk.tokenize.word_tokenize(recordNoStop)

        #remove punctuation

        puncString = ".,?!()0123456789"
        # keep only tokens that are not single punctuation characters or digits
        words = [c for c in words if c not in puncString]

        #Graph
        g = nx.DiGraph()
        g.add_nodes_from(words)

        #print('Number of keywords in abstract: ',g.number_of_nodes())
        numberOfKeywords = g.number_of_nodes() / 3

        bg = ngrams(words, 2)
        g.add_edges_from(bg)
        #print("g.edges(data=True)",g.edges(data=True))
        #plt.figure()
        #nx.draw(g,with_labels=True,node_size=3000,font_size=8,font_color="navy",node_color="orange")
        #plt.show()
        #print(g.edges())

        #find all pair in the sentances
        pairEd = ""
        pairEdCountDict = {}
        maxPairCount = 0

        #Begin loop 1
        for item in g.edges():
            pairItem1 = 0
Example #35
from nltk import word_tokenize
from nltk.util import ngrams

text = ['cant railway station', 'citadel hotel', 'police stn']
for line in text:
    token = word_tokenize(line)
    bigram = list(ngrams(token, 2))
    print(bigram)

print([[b for b in zip(l.split(" ")[:-1], l.split(" ")[1:])] for l in text])
# print([[b for b in zip(l.split(" ")[:-(n-1)], l.split(" ")[(n-1):])] for l in text])
Example #36
import nltk, re
from nltk.tokenize import word_tokenize
# importing ngrams module from nltk
from nltk.util import ngrams
from collections import Counter
from looking_glass import looking_glass_full_text

cleaned = re.sub(r'\W+', ' ', looking_glass_full_text).lower()
tokenized = word_tokenize(cleaned)

# Change the n value to 2:
looking_glass_bigrams = ngrams(tokenized, 2)
looking_glass_bigrams_frequency = Counter(looking_glass_bigrams)

# Change the n value to 3:
looking_glass_trigrams = ngrams(tokenized, 3)
looking_glass_trigrams_frequency = Counter(looking_glass_trigrams)

# Change the n value to a number greater than 3:
looking_glass_ngrams = ngrams(tokenized, 10)
looking_glass_ngrams_frequency = Counter(looking_glass_ngrams)

print("Looking Glass Bigrams:")
print(looking_glass_bigrams_frequency.most_common(10))

print("\nLooking Glass Trigrams:")
print(looking_glass_trigrams_frequency.most_common(10))

print("\nLooking Glass n-grams:")
print(looking_glass_ngrams_frequency.most_common(10))
Example #37
def _get_ngrams(text, n):
    punctuation = set(string.punctuation)
    no_punc = "".join(char for char in text.lower() if char not in punctuation)
    words = word_tokenize(no_punc)
    return set(ngrams(words, n))
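
A quick usage sketch for _get_ngrams above (assumes import string, word_tokenize and from nltk.util import ngrams as in the snippet):

print(_get_ngrams("The cat sat on the mat.", 2))
# set of five bigrams: ('the', 'cat'), ('cat', 'sat'), ('sat', 'on'), ('on', 'the'), ('the', 'mat') -- set order may vary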
Example #38
ls = LancasterStemmer()
ss = SnowballStemmer('english')
print("Stemming Output")
for words in wtokens:
    print("Porter stemming Output")
    #print(ps.stem(words))
    print("Lancaster stemming Output")
    #print(ls.stem(words))
    print("Snowball stemming Output")
    #print(ss.stem(words))

# Lemmatization
lemmatizer = WordNetLemmatizer()
print("Lemmatized Output")
#print(lemmatizer.lemmatize(text))

# Parts of speech
for w in wtokens:
    print("POS output")
# print(nltk.pos_tag(w))

# Named Entity Recognition
sentence = "The grapevine has it that disgruntled Congressmen are looking to join hands with BJP to bring down Karnataka government"
print(ne_chunk(pos_tag(word_tokenize(sentence))))

# Trigram
mySentence = "Hi How are you? i am fine and you"
token = nltk.word_tokenize(mySentence)
trigram = ngrams(token, 3)
for t in trigram:
    print(t)
Example #39
import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk import ne_chunk
from collections import Counter
ps = PorterStemmer()

lemmatizer = WordNetLemmatizer()
text = open('input.txt', encoding="utf8").read()

w_tokens = word_tokenize(text)
s_tokens = sent_tokenize(text)
print("Word tokens:", w_tokens)
print("\nSentence tokens:", s_tokens)

trigrams = ngrams(w_tokens, 3)
print("\nTrigrams: ", list(trigrams))

lemmatized_output = ' '.join([lemmatizer.lemmatize(w) for w in w_tokens])
print("\nLemmatization:\n", lemmatized_output)

stemmed_output = ' '.join([ps.stem(w) for w in w_tokens])
print("\nStemming:\n", stemmed_output)

n_pos = nltk.pos_tag(w_tokens)
print("\nParts of Speech :", n_pos)

noe = ne_chunk(n_pos)
print("\nNamed Entity Recognition :", noe)
Example #40
from preprocessing import preprocess_text
from nltk.util import ngrams
from collections import Counter

text = "It's exciting to watch flying fish after a hard day's work. I don't know why some fish prefer flying and other fish would rather swim. It seems like the fish just woke up one day and decided, 'hey, today is the day to fly away.'"
tokens = preprocess_text(text)

# Bigram approach:
bigrams_prepped = ngrams(tokens, 2)
bigrams = Counter(bigrams_prepped)
print("Three most frequent word sequences and the number of occurrences according to Bigrams:")
print(bigrams.most_common(3))

# Bag-of-Words approach:
# Define bag_of_words here:
bag_of_words = Counter(tokens)
print("\nThree most frequent words and number of occurrences according to Bag-of-Words:")
most_common_three = bag_of_words.most_common(3)
print(most_common_three)



def ngram(words,n):
	output = list(ngrams(words, n))
	return output
import nltk
from nltk.util import ngrams
from nltk.corpus import alpino
print(alpino.words())
bigrams_tokens = ngrams(alpino.words(), 2)
for i in bigrams_tokens:
    print(i)
f= (open('input.txt').read())
z = sent_tokenize(f)
word_token = []
for a in z:
    word_token.append(word_tokenize(a))

l = []
Lm=WordNetLemmatizer()
for i in word_token:
    for x in i:
        l.append(Lm.lemmatize(x,'v'))
print('lemmatizing words, we get: ', l)


b = []
bigram_logic = ngrams(l, 2)
for j in bigram_logic:
    b.append(j)
print('\n', 'bigram solution is :', b)

count = nltk.FreqDist(b)
freq= []
for i, j in count.items():
    freq.append((i,j))
print('\n', 'bigrams and frequencies are: ',freq)
common5 = count.most_common(5)
print('\n', 'most common 5 bi-grams are: ', common5)
text = []
for i in common5:
    text.append(i[0])
Example #44
train.loc[train.SentenceId == 2]
print('Average count of phrases per sentence in train is {0:.0f}.'.format(
    train.groupby('SentenceId')['Phrase'].count().mean()))
print('Average count of phrases per sentence in test is {0:.0f}.'.format(
    test.groupby('SentenceId')['Phrase'].count().mean()))
print(
    'Number of phrases in train: {}. Number of sentences in train: {}.'.format(
        train.shape[0], len(train.SentenceId.unique())))
print('Number of phrases in test: {}. Number of sentences in test: {}.'.format(
    test.shape[0], len(test.SentenceId.unique())))
print('Average word length of phrases in train is {0:.0f}.'.format(
    np.mean(train['Phrase'].apply(lambda x: len(x.split())))))
print('Average word length of phrases in test is {0:.0f}.'.format(
    np.mean(test['Phrase'].apply(lambda x: len(x.split())))))
text = ' '.join(train.loc[train.Sentiment == 4, 'Phrase'].values)
text_trigrams = [i for i in ngrams(text.split(), 3)]
Counter(text_trigrams).most_common(30)
text = ' '.join(train.loc[train.Sentiment == 4, 'Phrase'].values)
text = [i for i in text.split() if i not in stopwords.words('english')]
text_trigrams = [i for i in ngrams(text, 3)]
Counter(text_trigrams).most_common(30)
tokenizer = TweetTokenizer()
vectorizer = TfidfVectorizer(ngram_range=(1, 2), tokenizer=tokenizer.tokenize)
full_text = list(train['Phrase'].values) + list(test['Phrase'].values)
vectorizer.fit(full_text)
train_vectorized = vectorizer.transform(train['Phrase'])
test_vectorized = vectorizer.transform(test['Phrase'])
y = train['Sentiment']
logreg = LogisticRegression()
ovr = OneVsRestClassifier(logreg)
### %time
Example #45
from nltk import word_tokenize
from nltk.util import ngrams
from nltk.corpus import inaugural
import numpy as np

file_content = inaugural.raw('2009-Obama.txt')

tokens = word_tokenize(file_content)
print('\nTokens List:\n')
print(tokens)
length = len(list(tokens))
result = list()
gramslist = ngrams(tokens, 1)
dictionary = {}
for gram in gramslist:
    if str(gram) in dictionary:
        dictionary[str(gram)] += 1
    else:
        dictionary[str(gram)] = 1
print(np.mean(list(dictionary.values())))
Example #46
tokenized_words = word_tokenize(contents)

# Lemmatization
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()
lemmatized_words = [wordnet_lemmatizer.lemmatize(w) for w in tokenized_words]
print("Lemmatized words\n")
print(lemmatized_words)
print("\n")

# Bigrams
from nltk.util import ngrams

bigrams = list(ngrams(tokenized_words, 2))
print("Bigrams\n")
print(bigrams)
print("\n")
# Top 5 Bigrams
import nltk

fdist = nltk.FreqDist(bigrams)
top_5 = fdist.most_common(5)
print("Top 5 bigrams \n ")
print(top_5)
print("\n")

# lines with the top 5 bigrams

summary = ''
Example #47
def modified_precision(references, hypothesis, n):
    """
    Calculate modified ngram precision.

    The normal precision method may lead to some wrong translations with
    high-precision, e.g., the translation, in which a word of reference
    repeats several times, has very high precision.

    This function only returns the Fraction object that contains the numerator
    and denominator necessary to calculate the corpus-level precision.
    To calculate the modified precision for a single pair of hypothesis and
    references, cast the Fraction object into a float.

    The famous "the the the ... " example shows that you can get BLEU precision
    by duplicating high frequency words.

        >>> reference1 = 'the cat is on the mat'.split()
        >>> reference2 = 'there is a cat on the mat'.split()
        >>> hypothesis1 = 'the the the the the the the'.split()
        >>> references = [reference1, reference2]
        >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS
        0.2857...

    In the modified n-gram precision, a reference word will be considered
    exhausted after a matching hypothesis word is identified, e.g.

        >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
        ...               'ensures', 'that', 'the', 'military', 'will',
        ...               'forever', 'heed', 'Party', 'commands']
        >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
        ...               'guarantees', 'the', 'military', 'forces', 'always',
        ...               'being', 'under', 'the', 'command', 'of', 'the',
        ...               'Party']
        >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
        ...               'army', 'always', 'to', 'heed', 'the', 'directions',
        ...               'of', 'the', 'party']
        >>> hypothesis = 'of the'.split()
        >>> references = [reference1, reference2, reference3]
        >>> float(modified_precision(references, hypothesis, n=1))
        1.0
        >>> float(modified_precision(references, hypothesis, n=2))
        1.0

    An example of a normal machine translation hypothesis:

        >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
        ...               'ensures', 'that', 'the', 'military', 'always',
        ...               'obeys', 'the', 'commands', 'of', 'the', 'party']

        >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
        ...               'forever', 'hearing', 'the', 'activity', 'guidebook',
        ...               'that', 'party', 'direct']

        >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
        ...               'ensures', 'that', 'the', 'military', 'will',
        ...               'forever', 'heed', 'Party', 'commands']

        >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
        ...               'guarantees', 'the', 'military', 'forces', 'always',
        ...               'being', 'under', 'the', 'command', 'of', 'the',
        ...               'Party']

        >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
        ...               'army', 'always', 'to', 'heed', 'the', 'directions',
        ...               'of', 'the', 'party']
        >>> references = [reference1, reference2, reference3]
        >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS
        0.9444...
        >>> float(modified_precision(references, hypothesis2, n=1)) # doctest: +ELLIPSIS
        0.5714...
        >>> float(modified_precision(references, hypothesis1, n=2)) # doctest: +ELLIPSIS
        0.5882352941176471
        >>> float(modified_precision(references, hypothesis2, n=2)) # doctest: +ELLIPSIS
        0.07692...


    :param references: A list of reference translations.
    :type references: list(list(str))
    :param hypothesis: A hypothesis translation.
    :type hypothesis: list(str)
    :param n: The ngram order.
    :type n: int
    :return: BLEU's modified precision for the nth order ngram.
    :rtype: Fraction
    """
    # Extracts all ngrams in hypothesis
    # Set an empty Counter if hypothesis is empty.
    counts = Counter(ngrams(hypothesis,
                            n)) if len(hypothesis) >= n else Counter()
    # Extract a union of references' counts.
    # max_counts = reduce(or_, [Counter(ngrams(ref, n)) for ref in references])
    max_counts = {}
    for reference in references:
        reference_counts = (Counter(ngrams(reference, n))
                            if len(reference) >= n else Counter())
        for ngram in counts:
            max_counts[ngram] = max(max_counts.get(ngram, 0),
                                    reference_counts[ngram])

    # Assigns the intersection between hypothesis and references' counts.
    clipped_counts = {
        ngram: min(count, max_counts[ngram])
        for ngram, count in counts.items()
    }

    numerator = sum(clipped_counts.values())
    # Ensures that denominator is minimum 1 to avoid ZeroDivisionError.
    # Usually this happens when the ngram order is > len(reference).
    denominator = max(1, sum(counts.values()))

    return Fraction(numerator, denominator, _normalize=False)
Example #48
wordDict = dict(fdist)


dict1 = {}
counter = 0
# Index Dictionary Mapping
for key, value in wordDict.items():
    dict1[counter] = key
    counter += 1
# Converting words1 and tokenizing it for bigram function NLTK package usage
wordsNew = copy.deepcopy(words1)
w3 = ' '.join(wordsNew)
token = nltk.word_tokenize(w3)
print(token)
# Creating bigrams
bigrams = ngrams(token,2)
s = copy.deepcopy(Counter(bigrams))
# Creating bigram dictionaries for probabiltiy calculations
bigramDict = dict(s)
bigramDict1 = copy.deepcopy(bigramDict)
bigramDict2 = copy.deepcopy(bigramDict1)
# Creating probability dictionary
probIndex = {}
for i, g in bigramDict1.items():
    firstWord = i[0]
    totalFreq = 0
    totalsum = 0
    for k, v in bigramDict1.items():
        if(k[0] == firstWord):
            totalFreq+= v
    for k, v in bigramDict1.items():
Example #49
def cross_fold(x):

    # train_data = []
    # train_labels = []

    i = 0
    train_data = []
    test_data = []
    test_labels = []
    train_labels = []

    for line in file1:
        if i >= x * 1106 and i < (x + 1) * 1106:
            test_data.append(line)
            test_labels.append("pos")
        # else:
        #     train_data.append(line)
        #     train_labels.append("pos")
        i += 1

    i = 0
    for line in file2:
        if i >= x * 1106 and i < (x + 1) * 1106:
            test_data.append(line)
            test_labels.append("neg")
        # else:
        #     train_data.append(line)
        #     train_labels.append("neg")
        i += 1

    label = 0
    TP = 0
    TN = 0
    FP = 0
    FN = 0

    for content in test_data:
        lines = content.split("।")
        valence_list = []
        for line in lines:
            line.strip()
            token_list = remove_punc(line)
            token_list = remove_stopwords(token_list)
            #print(token_list)
            token_line = []
            token_stem = []
            #print(token_line)
            for word in token_list:

                if token_list.index(word) == len(token_list) - 1:
                    stem_word, last = stem_verb(word)
                    token_stem.append(stem_word)
                    if last != "":
                        token_stem.append(last)
                else:
                    stem_word = _stem_verb_step_1(word)
                    if stem_word in lex_dic:
                        token_stem.append(stem_word)
                    else:
                        stem_word = _stem_verb_step_2(word)
                        if stem_word in lex_dic:
                            token_stem.append(stem_word)
                        else:
                            token_stem.append(word)

            #print(token_stem)
            for word in token_stem:
                if word != "":
                    token_line.append(word)
            #print(token_line)
            # Bi-gram word
            bigrams_list = ngrams(token_line, 2)
            bigram_token_list = []
            remove_token = []
            for bigram in bigrams_list:
                bigram_words = bigram[0] + "_" + bigram[1]
                #bigram_words = "_".join(list(bigram))
                if bigram_words in lex_dic:
                    bigram_token_list.append(bigram_words)
                    remove_token.append(bigram[0])
                    remove_token.append(bigram[1])

            # Tri-gram word
            trigrams_list = ngrams(token_line, 3)
            trigram_token_list = []
            for trigram in trigrams_list:
                trigram_words = trigram[0] + "_" + trigram[1] + "_" + trigram[2]
                #bigram_words = "_".join(list(bigram))
                if trigram_words in lex_dic:
                    trigram_token_list.append(trigram_words)
                    remove_token.append(trigram[0])
                    remove_token.append(trigram[1])
                    remove_token.append(trigram[2])
            for w in remove_token:
                ind = token_line.index(w)
                token_line.pop(ind)

            token_line = trigram_token_list + token_line
            token_line = bigram_token_list + token_line

            sentiments = []
            for item in token_line:
                valence = 0
                i = token_line.index(item)
                if item in booster_dic:
                    sentiments.append(valence)
                    continue
                sentiments = words_valence(valence, token_line, item, i,
                                           sentiments)
                #print(sentiments)

            valence_list.append(score_valence(sentiments))

        # print(valence_line)
        valence_content = np.mean(valence_list)
        valence_content_pos = sum(i > 0 for i in valence_list)
        valence_content_neg = sum(i < 0 for i in valence_list)

        if test_labels[label] == "pos" and valence_content > 0:
            TP += 1
        elif test_labels[label] == "neg" and valence_content < 0:
            TN += 1
        elif test_labels[label] == "pos" and valence_content < 0:
            if valence_content_pos >= valence_content_neg:
                TP += 1
            else:
                FN += 1
                # print(valence_list,valence_content)
                # print(valence_content_pos,valence_content_neg,label)
                # print(content)

        elif test_labels[label] == "neg" and valence_content > 0:
            if valence_content_pos <= valence_content_neg:
                TN += 1
            else:
                FP += 1
            # print(line)
        # else:
        #     # print(valence_list,valence_content)
        #     # print(valence_content_pos,valence_content_neg,label)
        #     # print(content)
        label += 1
    print("Accuracy:", (TP + TN) / (TP + TN + FP + FN))
    PRECISION = TP / (TP + FP)
    RECALL = TP / (TP + FN)
    print("Precision:", TP / (TP + FP))
    print("Recall:", TP / (TP + FN))
    # print(TP,TN,FP,FN)
    # print(TP+TN+FP+FN)
    Accuracy.append((TP + TN) / (TP + TN + FP + FN))
    Precision.append(PRECISION)
    Recall.append(RECALL)
    f1_score.append((2 * PRECISION * RECALL) / (PRECISION + RECALL))
    values_to_text.sort(reverse=True)

    ##add to an array these values words
    for i in values_to_text:
        words_to_text.append(w[i])

    words_to_text = words_to_text[:l]  ##length of text

    f = open("unigram_output.txt", "a")
    with open('unigram_output.txt', 'r+') as f:
        for i in words_to_text:
            f.write(str(i[0]) + ' ')


##ngrams with using nltk
unigram = ngrams(entokens, 1)
bigrams = nltk.bigrams(entokens)
trigrams = nltk.trigrams(entokens)

##freq for each gram
##more detailed information can be viewed with .items(), .keys(), .values()
unigram_freq = nltk.FreqDist(unigram)
bigrams_freq = nltk.FreqDist(bigrams)
trigrams_freq = nltk.FreqDist(trigrams)

words_unigram = []
values_unigram = []
words_bigrams = []
values_bigrams = []
words_trigrams = []
values_trigrams = []
NgramType = 3
inputFile = "testClean.txt"
outputFile = "feature" + str(NgramType) + "GramPoSTag.txt"

try:
    nGramFeatureSet = {}
    with codecs.open(inputFile, "r", "utf-8") as file:
        for line in file:
            elementList = line.split("@-?@")
            print(elementList[0])
            elementsParser = parse(elementList[2])
            PoSTagList = []
            for PoSTag in elementsParser.split(" "):
                elements = PoSTag.split("/")
                PoSTagList.append(elements[1])
            nGrams = ngrams(PoSTagList, NgramType)
            for nGram in nGrams:
                nGram = ' '.join(e for e in nGram)
                if nGram in nGramFeatureSet:
                    nGramFeatureSet[nGram] = nGramFeatureSet[nGram] + 1
                else:
                    nGramFeatureSet[nGram] = 1
    nGramFeatureSetSort = sorted(nGramFeatureSet.items(),
                                 key=operator.itemgetter(1),
                                 reverse=True)

    with codecs.open(outputFile, "w", "utf-8") as file:
        for a in nGramFeatureSetSort:
            file.write(a[0] + "@-?@" + str(a[1]) + "\n")

except IOError as err:
def word_rank_alignment(reference, hypothesis, character_based=False):
    """    
    This is the word rank alignment algorithm described in the paper to produce
    the *worder* list, i.e. a list of word indices of the hypothesis word orders 
    w.r.t. the list of reference words.
    
    Below is (H0, R0) example from the Isozaki et al. 2010 paper, 
    note the examples are indexed from 1 but the results here are indexed from 0:
    
        >>> ref = str('he was interested in world history because he '
        ... 'read the book').split()
        >>> hyp = str('he read the book because he was interested in world '
        ... 'history').split()
        >>> word_rank_alignment(ref, hyp)
        [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]
        
    The (H1, R1) example from the paper, note the 0th index:
    
        >>> ref = 'John hit Bob yesterday'.split()
        >>> hyp = 'Bob hit John yesterday'.split()
        >>> word_rank_alignment(ref, hyp)
        [2, 1, 0, 3]

    Here is the (H2, R2) example from the paper, note the 0th index here too:
    
        >>> ref = 'the boy read the book'.split()
        >>> hyp = 'the book was read by the boy'.split()
        >>> word_rank_alignment(ref, hyp)
        [3, 4, 2, 0, 1]
        
    :param reference: a reference sentence
    :type reference: list(str)
    :param hypothesis: a hypothesis sentence
    :type hypothesis: list(str)
    """
    worder = []
    hyp_len = len(hypothesis)
    # Stores a list of possible ngrams from the reference sentence.
    # This is used for matching context window later in the algorithm.
    ref_ngrams = []
    hyp_ngrams = []
    for n in range(1, len(reference) + 1):
        for ng in ngrams(reference, n):
            ref_ngrams.append(ng)
        for ng in ngrams(hypothesis, n):
            hyp_ngrams.append(ng)
    for i, h_word in enumerate(hypothesis):
        # If word is not in the reference, continue.
        if h_word not in reference:
            continue
        # If we can determine one-to-one word correspondence for unigrams that
        # only appear once in both the reference and hypothesis.
        elif hypothesis.count(h_word) == reference.count(h_word) == 1:
            worder.append(reference.index(h_word))
        else:
            max_window_size = max(i, hyp_len - i + 1)
            for window in range(1, max_window_size):
                if i + window < hyp_len:  # If searching the right context is possible.
                    # Retrieve the right context window.
                    right_context_ngram = tuple(
                        islice(hypothesis, i, i + window + 1))
                    num_times_in_ref = ref_ngrams.count(right_context_ngram)
                    num_times_in_hyp = hyp_ngrams.count(right_context_ngram)
                    # If ngram appears only once in both ref and hyp.
                    if num_times_in_ref == num_times_in_hyp == 1:
                        # Find the position of ngram that matched the reference.
                        pos = position_of_ngram(right_context_ngram, reference)
                        worder.append(pos)  # Add the positions of the ngram.
                        break
                if window <= i:  # If searching the left context is possible.
                    # Retrieve the left context window.
                    left_context_ngram = tuple(
                        islice(hypothesis, i - window, i + 1))
                    num_times_in_ref = ref_ngrams.count(left_context_ngram)
                    num_times_in_hyp = hyp_ngrams.count(left_context_ngram)
                    if num_times_in_ref == num_times_in_hyp == 1:
                        # Find the position of ngram that matched the reference.
                        pos = position_of_ngram(left_context_ngram, reference)
                        # Add the positions of the ngram.
                        worder.append(pos + len(left_context_ngram) - 1)
                        break
    return worder
Example #53
        word = tags[0]
        wordTag = tags[1]
        if '+' in wordTag:
            position = wordTag.find('+')
            wordTag = wordTag[0:position]
        if '-' in wordTag and wordTag != '--':
            position = wordTag.find('-')
            wordTag = wordTag[0:position]
        tag_list.append(wordTag)
        corpus_with_tag.append((word, wordTag))

print("Done creating tag lists....")

print("Creating tag corpus...")
#Code snippet that works upon the unigrams list
unigrams = ngrams(tag_list, 1)
unigrams_freq = Counter(unigrams)

#Code snippet that works upon the bigrams list
bigrams = ngrams(tag_list, 2)
bigrams_freq = Counter(bigrams)

#Code snippet that works upon the trigrams list
trigrams = ngrams(tag_list, 3)
trigrams_freq = Counter(trigrams)

#Length of the corpus
len_corpus = brown.words().__len__()

word_with_tag = Counter(corpus_with_tag)
print("Corpus tagged!")
import nltk
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import one_hot
sentences = ["To Sherlock Holmes she is always the woman.", "I have seldom heard him mention her under any other name."]
bigrams = []
for sentence in sentences:
    sequence = word_tokenize(sentence) 
    bigrams.extend(list(ngrams(sequence, 2)))
#print(bigrams)
freq_dist = nltk.FreqDist(bigrams)
prob_dist = nltk.MLEProbDist(freq_dist)
number_of_bigrams = freq_dist.N()
#Finding the unigram representation
from sklearn.feature_extraction.text import CountVectorizer
# vectorizer=CountVectorizer()
# unigram_training_words=vectorizer.fit_transform(bigrams)
# print( unigram_training_words.shape)
import pandas as pd
#df = pd.read_csv('Consumer_Complaints.csv')
# df=pd.read_csv('finaltext.csv',delimiter='\t',encoding='utf-8')
# print(df.head())
# print(df[text])
label=[]
text=[]
import csv
from sklearn.model_selection import train_test_split
with open('finaltext.csv') as myFile1:  
	reader = csv.reader(myFile1,delimiter=',')
	for row in reader:
Example #55
    return fixed_keyword

root = r"C:\Users\JLee35\dentsu\iProspect Hub - Documents\Channels\Owned & Earned\Automation\Microsoft\Word\Data\Erica's Project"
input_dir = "Input"
output_dir = "Output"

for filename in os.listdir(os.path.join(root,input_dir)):
    if filename.endswith(".docx"):
        save_name = filename.replace('.docx','.csv')
        doc = getText(os.path.join(root,input_dir,filename))

        all_ngrams = pd.DataFrame(columns=['word','count'])

        unigrams = doc.split()

        bigrams = ngrams(unigrams,2)
        bigrams = dict(collections.Counter(bigrams))

        trigrams = ngrams(unigrams, 3)
        trigrams = dict(collections.Counter(trigrams))

        unigrams = dict(collections.Counter(unigrams))
        unigrams = pd.DataFrame.from_dict(list(unigrams.items()))
        unigrams = unigrams.rename(columns={0:'word',1:'count'})
        all_ngrams = all_ngrams.append(unigrams)

        bigrams = pd.DataFrame.from_dict(list(bigrams.items()))
        bigrams = bigrams.rename(columns={0:'word',1:'count'})
        bigrams['word'] = bigrams.word.apply(lambda x: fix_those_brackets(x))
        all_ngrams = all_ngrams.append(bigrams)

if __name__ == '__main__':
    xmldoc = sys.argv[1]
    knownJava = sys.argv[2]
    knownCpp = sys.argv[3]
    ###################################################################
    # Section 1: Gather known data to create frequencies for known information
    ###################################################################
    knownJavaFile = open(knownJava)
    knownJavaString = ""
    for line in knownJavaFile:
        knownJavaString += line

    # knownJavaGram = ngramsFunction(knownJavaString, 3)
    knownJavaGram = ngrams(knownJavaString.split(' '),
                           3)  #ngramsFunction(knownJavaString, 3)
    knownJavaHashFreq = nltk.FreqDist(knownJavaGram)

    # javaMaxGram = max(knownJavaHashFreq, key=knownJavaHashFreq.get)
    # print(javaMaxGram, knownJavaHashFreq[javaMaxGram])

    knownCPPFile = open(knownCpp)
    knownCPPString = ""
    for line in knownCPPFile:
        knownCPPString += line

    # print(knownCPPString)
    knownCPPGram = ngrams(knownCPPString.split(' '), 3)
    knownCPPHashFreq = nltk.FreqDist(knownCPPGram)

    # cppMaxGram = max(knownCPPHashFreq, key=knownCPPHashFreq.get)
    for tok in tok_arr:
        if tok.endswith('*'):
            tok = tok[:-1]
        if tok.endswith('.') or tok.endswith(','):
            tok = tok[:-1]
        if tok not in pron_dict:
            oov.add(tok)
        token_dict[tok] = token_dict[tok] + 1 if tok in token_dict else 1

    sum_utt_len += len(tok_arr)
    no_of_lines += 1

    # n-gram analysis (character based)
    chrs = [c for c in utt_lower]

    unigrams = ngrams(chrs, 1)
    for c in unigrams:
        unigram_dict[c] = unigram_dict[c] + 1 if c in unigram_dict else 1

    bigrams = ngrams(chrs, 2)
    for bigram in bigrams:
        bigram_dict[
            bigram] = bigram_dict[bigram] + 1 if bigram in bigram_dict else 1

    quingrams = ngrams(chrs, 5)
    for quingram in quingrams:
        quingram_dict[quingram] = quingram_dict[
            quingram] + 1 if quingram in quingram_dict else 1

# print the results