def getTestData(corpus, embedsize, ngramsize, m):
    f = open(corpus)
    datap = []
    for line in f:
        data = line.strip().split('\t')
        s1 = data[0]
        s2 = data[1]
        label = data[2]
        s1ng = set(ngrams(s1.split(' '), ngramsize))
        s2ng = set(ngrams(s2.split(' '), ngramsize))
        all_ngrams = s1ng.union(s2ng)
        datap.append(list(all_ngrams))
    Xs = []
    wildcard = np.array([0.0] * embedsize)
    for ngs in datap:
        X = np.zeros((len(ngs), ngramsize, embedsize))
        for i in range(len(ngs)):
            vectors = getEmbedVectors(ngramsize, embedsize, ngs[i], m, wildcard)
            X[i] = vectors
        Xs.append(X)
    return Xs
def str_common_grams(str1, str2, length=3):
    '''Return how many times the n-grams (of the given length) of str1 appear in str2.'''
    grams1 = list(ngrams(str1, length))
    grams2 = list(ngrams(str2, length))
    return sum(grams2.count(gram) for gram in grams1)
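# Hedged usage sketch (not part of the original snippet): with plain strings the
# n-grams are character n-grams, so length=3 counts shared character trigrams.
# Assumes `from nltk.util import ngrams`, as used in the function above.
print(str_common_grams('overflow', 'flowchart'))  # 2 ('flo' and 'low')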
def getTrainData(corpus, embedsize, ngramsize, m):
    f = open(corpus)
    datap = []
    for line in f:
        data = line.strip().split('\t')
        s1 = data[0]
        s2 = data[1]
        label = data[2]
        s1ng = set(ngrams(s1.split(' '), ngramsize))
        s2ng = set(ngrams(s2.split(' '), ngramsize))
        all_ngrams = s1ng.union(s2ng)
        for ng in all_ngrams:
            datap.append([ng, label])
    X = np.zeros((len(datap), ngramsize, embedsize))
    Y = np.zeros((len(datap), 3))
    wildcard = np.array([0.0] * embedsize)
    for i in range(len(datap)):
        ngram, label = datap[i]
        vectors = getEmbedVectors(ngramsize, embedsize, ngram, m, wildcard)
        labels = getLabels(label)
        X[i] = vectors
        Y[i] = labels
    return X, Y
def modified_precision(candidate, references, n):
    candidate_ngrams = list(ngrams(candidate, n))
    if len(candidate_ngrams) == 0:
        return 0
    c_words = set(candidate_ngrams)
    clipped_total = 0
    for word in c_words:
        # add-one smoothed count of this n-gram in the candidate
        count_w = candidate_ngrams.count(word) + 1
        # maximum (add-one smoothed) count of this n-gram over all references
        count_max = 0
        for reference in references:
            reference_ngrams = list(ngrams(reference, n))
            count = reference_ngrams.count(word) + 1
            if count > count_max:
                count_max = count
        # clip the candidate count by the reference maximum and accumulate
        # over all distinct n-grams
        clipped_total += min(count_w, count_max)
    return clipped_total / (len(candidate) + len(c_words))
def char_ngram_similarity(doc1, doc2, n, top=100):
    """
    Gives a positive dissimilarity score for two documents with respect to the
    distribution of their `top` most frequent character n-grams. A value of 0
    means the documents are identical (or at least share an identical top
    character n-gram distribution).

    :param doc1: first document
    :param doc2: second document
    :param n: the n-gram length
    :param top: only use the `top` most frequent n-grams from each document
    :return: a positive dissimilarity score; 0 means identical top n-gram distributions
    """
    ngrams1 = Counter(ngrams(doc1, n))
    ngrams2 = Counter(ngrams(doc2, n))
    profile1 = [gram for gram, _ in ngrams1.most_common(top)]
    profile2 = [gram for gram, _ in ngrams2.most_common(top)]

    # normalise the two n-gram distributions
    total1 = np.sum(list(ngrams1.values()))
    for key in ngrams1:
        ngrams1[key] /= total1
    total2 = np.sum(list(ngrams2.values()))
    for key in ngrams2:
        ngrams2[key] /= total2

    # calculate global dissimilarity score
    score = 0
    for gram in set(profile1 + profile2):
        f1 = ngrams1[gram]
        f2 = ngrams2[gram]
        score += ((2 * (f1 - f2)) / (f1 + f2)) ** 2
    return score
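# Hedged usage sketch (not part of the original snippet): compares two short
# strings by their character 4-gram profiles; identical inputs give 0.0.
# Assumes numpy as np, collections.Counter and nltk.util.ngrams are imported,
# as used in the function above.
d1 = "the quick brown fox jumps over the lazy dog"
d2 = "the quick brown fox jumped over a lazy dog"
print(char_ngram_similarity(d1, d1, n=4))          # 0.0, identical documents
print(char_ngram_similarity(d1, d2, n=4, top=50))  # small positive score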
def scoreScopeOverlap(self, scopeHyp, scopeRef):
    totalScore = 0
    for scope_h in scopeHyp:
        bestScore = 0
        for scope_r in scopeRef:
            if scope_r == [] or scope_h == []:
                partialScore = 0
                if partialScore > bestScore:
                    bestScore = partialScore
            else:
                ngram_range = range(1, len(scope_h) + 1)
                logging.info("ngram_range")
                logging.info(ngram_range)
                # weight longer n-gram matches more heavily; the weights sum to 1
                score_weights = [round(x / sum(ngram_range), 4) for x in ngram_range]
                logging.info(score_weights)
                partialScore = 0.0
                for i in ngram_range:
                    hyp = ngrams(scope_h, i)
                    ref = ngrams(scope_r, i)
                    partialScore += len(set(hyp).intersection(set(ref))) * score_weights[i - 1]
                logging.info("partialScore")
                logging.info(partialScore)
                if partialScore > bestScore:
                    bestScore = partialScore
        totalScore += bestScore
        logging.info("totalScore")
        logging.info(totalScore)
    return totalScore
def calc_ngram(htokens, etokens):
    features = []
    for n in range(1, 5):
        hgrams = nltk.FreqDist(ngrams(htokens, n))
        egrams = nltk.FreqDist(ngrams(etokens, n))
        # fraction of htokens n-grams that also occur in etokens
        prec = 0
        num = 0
        for k in hgrams:
            if k in egrams:
                prec = prec + hgrams[k]
            num = num + hgrams[k]
        if num > 0:
            prec = float(prec) / num
        features.append(prec)
        # fraction of etokens n-grams that also occur in htokens
        recall = 0
        num = 0
        for k in egrams:
            if k in hgrams:
                recall = recall + egrams[k]
            num = num + egrams[k]
        if num > 0:
            recall = float(recall) / num
        features.append(recall)
        features.append(calc_f1(prec, recall))
    return features
def format_text(entries, LSTM_shape=True): THIS_FOLDER = str(os.path.dirname(os.path.abspath(__file__))) sentences = [] tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') decoded = base64.b64decode(entries) decoded = str(decoded) decoded = decoded[2:] decoded = decoded[:-1] decoded = decoded.split(".") #print(decoded, "is decoded") for entry in decoded: token_sentences = tokenizer.tokenize(entry) for sentence in token_sentences: sentences.append(sentence) tokenized_sentences = [] #remove_tokens = ['%', ']', '[', '.', ',', '?', '!', '\''] #remove_tokens = string.punctuation remove_tokens = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~' stop_words = set(stopwords.words('english')) tweet_tknzr = TweetTokenizer() for sentence in sentences: tokens = tweet_tknzr.tokenize(sentence) tokens = list(filter(lambda a: a not in remove_tokens and a not in stop_words, tokens)) tokenized_sentences.append(tokens) all_ngrams1 = np.load(THIS_FOLDER+'/ngrams1.npy').item() all_ngrams2 = np.load(THIS_FOLDER+'/ngrams2.npy').item() all_ngrams3 = np.load(THIS_FOLDER+'/ngrams3.npy').item() #once the model gets updated with good data, ngrams.py needs to get changed/updated too! X = np.zeros((len(sentences), len(all_ngrams1)+len(all_ngrams2)+len(all_ngrams3))) for i in range(len(tokenized_sentences)): sentence = tokenized_sentences[i] my_ngrams = ngrams(sentence, 1) for gram in my_ngrams: if gram in all_ngrams1: index = all_ngrams1[gram] X[i][index] = 1 for i in range(len(tokenized_sentences)): sentence = tokenized_sentences[i] my_ngrams = ngrams(sentence, 2) for gram in my_ngrams: if gram in all_ngrams2: index = len(all_ngrams1) + all_ngrams2[gram] X[i][index] = 1 for i in range(len(tokenized_sentences)): sentence = tokenized_sentences[i] my_ngrams = ngrams(sentence, 3) for gram in my_ngrams: if gram in all_ngrams3: index = len(all_ngrams1) + len(all_ngrams2) + all_ngrams3[gram] X[i][index] = 1 if LSTM_shape: X = np.reshape(X, (X.shape[0], 1, X.shape[1])) else: X = np.reshape(X, (X.shape[0], X.shape[1])) return X
def extract_terms_features(terms, separateGrams=False):
    while '' in terms:
        terms.remove('')

    g2 = ngrams(terms, 2)
    g3 = ngrams(terms, 3)
    g2j = [' '.join(gterms) for gterms in g2]
    g3j = [' '.join(gterms) for gterms in g3]

    # count unigrams, bigrams and trigrams separately
    vec1 = {}
    vec2 = {}
    vec3 = {}
    for t in terms:
        if t not in vec1:
            vec1[t] = 1
        else:
            vec1[t] += 1
    for t in g2j:
        if t not in vec2:
            vec2[t] = 1
        else:
            vec2[t] += 1
    for t in g3j:
        if t not in vec3:
            vec3[t] = 1
        else:
            vec3[t] += 1

    # merged term -> count vector over all n-gram orders
    vector = dict(list(vec1.items()) + list(vec2.items()) + list(vec3.items()))

    if separateGrams:
        return (vector, vec1, vec2, vec3)
    else:
        return vector
def getNgramProbs(file):
    f = open(file, 'r')
    unigramList = []
    for line in f.read().split():
        unigramList.append(line)
    bigramList = ngrams(unigramList, 2)
    trigramList = ngrams(unigramList, 3)

    # dictionaries of unigram, bigram, trigram counts
    unigramDict = dict()
    bigramDict = dict()
    trigramDict = dict()

    # counts for unigrams
    countUni = 0
    for item in unigramList:
        countUni += 1
        if item not in unigramDict:
            unigramDict[item] = 1
        else:
            unigramDict[item] += 1

    # counts for bigrams
    for item in bigramList:
        if item not in bigramDict:
            bigramDict[item] = 1
        else:
            bigramDict[item] += 1

    # counts for trigrams
    for item in trigramList:
        if item not in trigramDict:
            trigramDict[item] = 1
        else:
            trigramDict[item] += 1

    # P(w3 | w1, w2): trigram count divided by its bigram history count
    for key in trigramDict:
        trigramDict[key] /= float(bigramDict[(key[0], key[1])])

    # P(w2 | w1): bigram count divided by its unigram history count
    for key in bigramDict:
        bigramDict[key] /= float(unigramDict[key[0]])

    # P(w): unigram count divided by the total number of tokens
    for key in unigramDict:
        unigramDict[key] /= float(countUni)

    return [unigramDict, bigramDict, trigramDict]
def create_candidate_list(sentence): tokens = nltk.tokenize.word_tokenize(sentence) candidates_lists = create_candidates_lists(tokens) # Create list of 1-grams. candidates = [] for l in candidates_lists: candidates += l # Remove irrelevant stop words in 1-grams. res = [token for token in candidates if token not in ENGLISH_STOPWORDS] # Create list of bigrams. bigrams = [] for l in candidates_lists: bigrams += ngrams(l, 2) # Create list of trigrams. trigrams = [] for l in candidates_lists: trigrams += ngrams(l, 3) # Create list of 4-grams. fourgrams = [] for l in candidates_lists: fourgrams += ngrams(l, 4) res += [' '.join(a) for a in bigrams] res += [' '.join(a) for a in trigrams] res += [' '.join(a) for a in fourgrams] return res
def rouge_s(references, candidate, beta, d_skip=None, averaging=True, smoothing=False): rouge_s_list = [] k_c = len(candidate) if d_skip is None else d_skip cand_skip_list = list(skipgrams(tokenizer.tokenize(candidate), n=2, k=k_c)) for ref in references: k_ref = len(ref) if d_skip is None else d_skip ref_skip_list = list(skipgrams(tokenizer.tokenize(ref), n=2, k=k_ref)) count = 0 for bigram in cand_skip_list: if bigram in ref_skip_list: count = count+1 if not smoothing: r_skip = count/len(ref_skip_list) p_skip = count/len(cand_skip_list) else: cand_ungm = list(ngrams(tokenizer.tokenize(candidate), n=1)) ref_ungm = list(ngrams(tokenizer.tokenize(ref), n=1)) for ungm in cand_ungm: if ungm in ref_ungm: count += 1 r_skip = count/(len(ref_skip_list)+len(ref_ungm)) p_skip = count/(len(cand_skip_list)+len(cand_ungm)) score = Rouge.get_score(r_skip, p_skip, beta) rouge_s_list.append(score) return Rouge.jacknifing(rouge_s_list, averaging=averaging)
def jaccardIdx(w1, w2):
    # Despite the name, this returns the Jaccard *distance* over character
    # bigrams: 0.0 for identical bigram sets, 1.0 for disjoint sets.
    w1ngrams = set(ngrams(w1, 2))
    w2ngrams = set(ngrams(w2, 2))

    union = w1ngrams.union(w2ngrams)
    intersect = w1ngrams.intersection(w2ngrams)

    return 1.0 - float(len(intersect)) / float(len(union))
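# Hedged usage sketch (not from the original source): with string inputs the
# 2-grams are character bigrams, so similar spellings give small distances.
print(jaccardIdx("night", "nite"))   # shares ('n','i') out of 6 distinct bigrams -> ~0.833
print(jaccardIdx("night", "night"))  # identical bigram sets -> 0.0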
def count_word(self, doc, unigram=True, bigram=False, binary=False):
    tokens = word_tokenize(self.remove_non_ascii(doc))
    doc_voc = {}
    if unigram:
        uni = ngrams(tokens, 1)
        self.count_word_sub(doc_voc, uni, binary)
    if bigram:
        bi = ngrams(tokens, 2)
        self.count_word_sub(doc_voc, bi, binary)
    return doc_voc  # presumably the intended result; the original never returned it
def trainModel(self, listOfFilenames):
    # dictionaries of unigram, bigram, trigram counts
    unigramDict = dict()
    bigramDict = dict()
    trigramDict = dict()
    # total count of unigrams, bigrams, trigrams
    countUni = 0
    countBi = 0
    countTri = 0
    i = 1
    # iterate over the list of files
    for fileName in listOfFilenames:
        print("Reading", i)
        i += 1
        stag = STagger(fileName)
        stag.find_unigrams(True, False)
        for item in stag.unigrams:
            countUni += 1
            if item not in unigramDict:
                unigramDict[item] = 1
            else:
                unigramDict[item] += 1
        codeBigrams = ngrams(stag.unigrams, 2)
        codeTrigrams = ngrams(stag.unigrams, 3)
        for item in codeBigrams:
            countBi += 1
            if item not in bigramDict:
                bigramDict[item] = 1
            else:
                bigramDict[item] += 1
        for item in codeTrigrams:
            countTri += 1
            if item not in trigramDict:
                trigramDict[item] = 1
            else:
                trigramDict[item] += 1
    # write the n-grams to the corpus file
    outputFile = open('corpus.txt', 'w')
    outputFile.write(str(countUni) + "\n")
    for key, x in unigramDict.items():
        outputFile.write(str(key) + " " + str(x) + "\n")
    outputFile.write(str(countBi) + "\n")
    for key, x in bigramDict.items():
        outputFile.write(str(key[0]) + " " + str(key[1]) + " " + str(x) + "\n")
    outputFile.write(str(countTri) + "\n")
    for key, x in trigramDict.items():
        outputFile.write(str(key[0]) + " " + str(key[1]) + " " + str(key[2]) + " " + str(x) + "\n")
    outputFile.close()
def get_ngrams(self, tokens):
    tokens.insert(0, '<START>')
    unigrams = ngrams(tokens, 1)
    # keys for unigrams are 1-tuples such as ('word',), not plain 'word' strings
    for item in unigrams:
        self.fdist1[item] += 1
    bigrams = ngrams(tokens, 2)
    for item in bigrams:
        self.fdist2[item] += 1
    trigrams = ngrams(tokens, 3)
    for item in trigrams:
        self.fdist3[item] += 1
def uni_bi_gram(self,doc,unigram,bigram): ret_list = [] if(unigram): uni = ngrams(doc,1) for gram in uni: ret_list.append(gram) if(bigram): bi = ngrams(doc,2) for gram in bi: ret_list.append(gram) return ret_list
def modified_precision(candidate, references, n): """ Calculate modified ngram precision. >>> BLEU.modified_precision( ... 'the the the the the the the'.split(), ... ['the cat is on the mat'.split(), 'there is a cat on the mat'.split()], ... n=1, ... ) 0.28... >>> BLEU.modified_precision( ... 'the the the the the the the'.split(), ... ['the cat is on the mat'.split(), 'there is a cat on the mat'.split()], ... n=2, ... ) 0.0 >>> BLEU.modified_precision( ... 'of the'.split(), ... [ ... 'It is a guide to action that ensures that the military will forever heed Party commands.'.split(), ... 'It is the guiding principle which guarantees the military forces always being under the command of the Party.'.split(), ... 'It is the practical guide for the army always to heed the directions of the party'.split(), ... ], ... n=1, ... ) 1.0 >>> BLEU.modified_precision( ... 'of the'.split(), ... [ ... 'It is a guide to action that ensures that the military will forever heed Party commands.'.split(), ... 'It is the guiding principle which guarantees the military forces always being under the command of the Party.'.split(), ... 'It is the practical guide for the army always to heed the directions of the party'.split(), ... ], ... n=2, ... ) 1.0 """ counts = Counter(ngrams(candidate, n)) if not counts: return 0 max_counts = {} for reference in references: reference_counts = Counter(ngrams(reference, n)) for ngram in counts: max_counts[ngram] = max(max_counts.get(ngram, 0), reference_counts[ngram]) clipped_counts = dict((ngram, min(count, max_counts[ngram])) for ngram, count in counts.items()) return sum(clipped_counts.values()) / sum(counts.values())
def ngram_similarity(str1, str2, n=3):
    # jaccard_dis is assumed to be defined elsewhere (a later snippet defines it inline)
    str1 = str1.split()
    str2 = str2.split()
    ngram1 = []
    ngram2 = []
    # collect all n-grams of orders n, n-1, ..., 1 for each string
    for i in range(n):
        ngram1 = ngram1 + list(ngrams(str1, n - i))
    for i in range(n):
        ngram2 = ngram2 + list(ngrams(str2, n - i))
    return jaccard_dis(set(ngram1), set(ngram2))
def create_model(tokenized_data):
    tokens_list = [tokens for ndata in tokenized_data for tokens in ndata]
    cfreq_data_bigram = nltk.ConditionalFreqDist(nltk.bigrams(tokens_list))
    # higher-order n-gram generators over the flattened token list
    trigrams = ngrams(tokens_list, 3)
    fourgram = ngrams(tokens_list, 4)
    fivegram = ngrams(tokens_list, 5)
    sixgram = ngrams(tokens_list, 6)
    return cfreq_data_bigram, trigrams, fourgram, fivegram, sixgram
def __call__(self, doc): filtered_words = doc.split(" ") tokens = [] for word in filtered_words: tokens.append(word) for bigram in ngrams(filtered_words,2): tokens.append('%s %s' %bigram) for trigram in ngrams(filtered_words,3): tokens.append('%s %s %s' %trigram) return tokens
def modified_precision(h, ref, n):
    ng_counts_h = Counter(ngrams(h, n))
    ng_counts_ref = Counter(ngrams(ref, n))
    modified_counts = Counter()
    if not ng_counts_h:
        return 0
    for ng in ng_counts_h.keys():
        modified_counts[ng] = max(modified_counts[ng], ng_counts_ref[ng])
    # clip each hypothesis n-gram count by its reference count; build a mapping
    # here (passing (key, value) pairs straight to Counter would just count the pairs)
    truncated_cts = {ng: min(ng_counts_h[ng], modified_counts[ng]) for ng in ng_counts_h}
    return sum(truncated_cts.values()) / float(sum(ng_counts_h.values()))
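# Hedged usage sketch (not part of the original): the classic clipped-count
# example, where the hypothesis repeats "the" seven times but the reference
# licenses it at most twice, so the unigram precision is 2/7.
hyp = 'the the the the the the the'.split()
ref = 'the cat is on the mat'.split()
print(modified_precision(hyp, ref, 1))  # 0.2857...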
def alzahrani_window_similarity(a_string, b_string, window_length, preprocess=1, window_length_normalize=1): '''read in two strings a,b and integer window_length and return the maximum alwhazari similarity across subwindows of the specified length''' a_windows = list( ngrams( preprocess_string(a_string), window_length ) ) b_windows = list( ngrams( preprocess_string(b_string), window_length ) ) max_similarity = 0 for a_window in a_windows: for b_window in b_windows: sim = alzahrani_similarity( list(a_window), list(b_window), preprocess=0, length_normalize=window_length_normalize ) if sim > max_similarity: max_similarity = sim return max_similarity
def mappings(x): if not re.search('^\[.*\]$|javascript:void|^nan$',str(x['target'])): query_tokens =[i for i in nltk.word_tokenize(queryFromURL(x['url']))] #target_tokens =[i for i in nltk.word_tokenize(queryFromURL(x['target'])) if i not in stops] unigram = [ i for i in query_tokens if i not in stops] bigrams = ngrams(query_tokens,2) trigrams = ngrams(query_tokens,3) for i in unigram: query_target_mappings[(i,x['target'])] = query_target_mappings.get((i,x['target']),0) +1 for i in bigrams: query_target_mappings[(i,x['target'])] = query_target_mappings.get((i,x['target']),0) +1 for i in trigrams: query_target_mappings[(i,x['target'])] = query_target_mappings.get((i,x['target']),0) +1
def vocab(all_rev): vocab=[] for rows in all_rev: words=[word_tokenize(str(i)) for i in rows] words=[i for i in words[0] if i.isalpha() and len(i)>1] word=[i for i in words if i not in stopwords.words('english')] bigrams=[i for i in ngrams(word,2) if i not in word] trigrams=[i for i in ngrams(word,3) if i not in word] word.extend(bigrams) word.extend(trigrams) vocab.append(word) return vocab
def count_word_per_file(self,doc,class_dict,unigram = True,bigram = False ,binary =False): str = word_tokenize(self.remove_non_ascii(doc)) word_list = [] word_count = 0 if(unigram): uni = ngrams(str,1) word_count += self.count_word_per_file_sub(class_dict,uni,binary) if(bigram): bi = ngrams(str,2) word_count += self.count_word_per_file_sub(class_dict,bi,binary) return word_count
def distance_bigrams_same(t1, t2):
    """Bigram distance metric; term frequency is ignored.
    Returns 0.0 if the bigram sets are identical, 1.0 if no bigrams are shared."""
    t1_terms = make_terms_from_string(t1)
    t2_terms = make_terms_from_string(t2)
    terms1 = set(ngrams(t1_terms, 2))  # was using nltk.bigrams
    terms2 = set(ngrams(t2_terms, 2))
    shared_terms = terms1.intersection(terms2)
    all_terms = terms1.union(terms2)
    dist = 1.0
    if len(all_terms) > 0:
        dist = 1.0 - (len(shared_terms) / float(len(all_terms)))
    return dist
def score(self,parallel_corpus): # containers count = [0,0,0,0] clip_count = [0,0,0,0] r = 0 c = 0 weights=[0.25,0.25,0.25,0.25] # accumulate ngram statistics for hyps,refs in parallel_corpus: hyps = [hyp.split() for hyp in hyps] refs = [ref.split() for ref in refs] for hyp in hyps: for i in range(4): # accumulate ngram counts hypcnts = Counter(ngrams(hyp, i+1)) cnt = sum(hypcnts.values()) count[i] += cnt # compute clipped counts max_counts = {} for ref in refs: refcnts = Counter(ngrams(ref, i+1)) for ng in hypcnts: max_counts[ng] = max(max_counts.get(ng, 0),refcnts[ng]) clipcnt = dict((ng, min(count, max_counts[ng])) \ for ng, count in hypcnts.items()) clip_count[i] += sum(clipcnt.values()) # accumulate r & c bestmatch = [1000,1000] for ref in refs: if bestmatch[0]==0: break diff = abs(len(ref)-len(hyp)) if diff<bestmatch[0]: bestmatch[0] = diff bestmatch[1] = len(ref) r += bestmatch[1] c += len(hyp) # computing bleu score p0 = 1e-7 bp = 1 if c>r else math.exp(1-float(r)/float(c)) p_ns = [float(clip_count[i])/float(count[i]+p0)+p0 \ for i in range(4)] s = math.fsum(w*math.log(p_n) \ for w, p_n in zip(weights, p_ns) if p_n) bleu = bp*math.exp(s) return bleu
def ngram_similarity(str1, str2, n=3): def jaccard_dis(s1, s2): return float(len(s1.intersection(s2))) / len(s1.union(s2)) str1 = str1.split() str2 = str2.split() ngram1 = [] ngram2 = [] for i in range(n): ngram1 = ngram1 + list(ngrams(str1, n - i)) for i in range(n): ngram2 = ngram2 + list(ngrams(str2, n - i)) return jaccard_dis(set(ngram1), set(ngram2))
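# Hedged usage sketch (not part of the original): the score is the Jaccard
# overlap of all 1- to 3-gram sets, so identical strings score 1.0 and
# completely unrelated strings score 0.0.
print(ngram_similarity("new york city", "new york city"))   # 1.0
print(ngram_similarity("new york city", "san francisco"))   # 0.0
print(ngram_similarity("new york city", "new york state"))  # 0.333..., 3 of 9 distinct n-grams shared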
def tokens(filename):
    """Read feature tokens."""
    with codecs.open(filename, 'rb', encoding="windows-1251") as myfile:
        text = myfile.read().strip().lower()
    token = nltk.word_tokenize(text)  # codecs already decodes to str, so no unicode() call is needed
    unigrams = ngrams(token, 1)
    trigrams = ngrams(token, 3)
    return chain(unigrams, trigrams)
# FIRST ALTERNATIVE FOR BIGRAMS
df_aux = []

def words(text):
    return re.findall(r'\w+', text.lower())

for text in df_train_total.get('reviewText'):
    df_aux.extend(words(text))

# A SECOND ALTERNATIVE THAT PUTS THE BIGRAMS INTO A DATA FRAME
bigrams_all = list()
for text in df_train_total.get('reviewText'):
    tokens = text.split()
    # bigrams of this review's tokens, stored as one tuple per review
    bigrams = list(ngrams(tokens, 2))
    bigrams_all.append(tuple(bigrams))
# previously: bigrams = ngrams(df_aux, 2)

# note: this counts identical per-review bigram sequences, not individual bigrams
BigramFreq = Counter(bigrams_all)
get_bigrams_to_list = list(BigramFreq)
# tentative list-based alternative
get_bigrams_to_list = list(bigrams_all)

df_bigrams = df_train_total
df_bigrams['reviewText'] = get_bigrams_to_list

# MOST FREQUENT BIGRAMS
get_bigrams = BigramFreq.most_common(10)
def get_cluster_label_sentences(cluster, dic, knowledge_threshold, debug_file):
    candidate_label = {}
    most_freq_word = {}
    filtered_words = open('msft/filtered_tf.txt', 'r')
    filtered_words = [word.strip() for word in filtered_words.readlines()]
    filtered_words = set(filtered_words)
    if len(cluster) <= 1:
        return 'NOLABEL'
    for value in cluster:
        value = filter(value)  # `filter` is this project's own text-cleaning helper
        unigrams = value.split()
        bigrams = ngrams(unigrams, 2)
        trigrams = ngrams(unigrams, 3)
        # list.extend() returns None, so the original chained extend() calls could
        # not work; join the higher-order n-grams into strings and concatenate
        # the three lists instead (assumed intent)
        combined = (unigrams
                    + [' '.join(g) for g in bigrams]
                    + [' '.join(g) for g in trigrams])
        for word in combined:
            if word in filtered_words:
                for concept in dic[word]:
                    concept = concept.strip()
                    if concept not in candidate_label:
                        candidate_label[concept] = 0
                    candidate_label[concept] += 1
            else:
                word = word.strip()
                if word not in most_freq_word:
                    most_freq_word[word] = 0
                most_freq_word[word] += 1
    backup = ''
    if len(candidate_label) > 0:
        candidate_label_sets = sorted(candidate_label.items(), key=operator.itemgetter(1))
        if candidate_label_sets[-1][1] > knowledge_threshold:
            debug_file.write('Mode 1\n')
            debug_file.write(str(cluster) + '\n')
            debug_file.write(str(candidate_label_sets) + '\n')
            debug_file.write(candidate_label_sets[-1][0] + '\n\n')
            return candidate_label_sets[-1][0]
        else:
            backup = candidate_label_sets[-1][0]
    most_freq_word_sets = sorted(most_freq_word.items(), key=operator.itemgetter(1))
    if len(most_freq_word_sets) > 0:
        debug_file.write('Mode 2\n')
        debug_file.write(str(cluster) + '\n')
        debug_file.write(str(most_freq_word_sets) + '\n')
        debug_file.write(most_freq_word_sets[-1][0] + '\n\n')
        filtered_words = [
            word for word in most_freq_word_sets
            if word[0].lower() not in stopwords.words('english') and len(filter(word[0])) > 3
        ]
        if len(filtered_words) > 0:
            return filtered_words[-1][0]
        return 'NOLABEL'
    if len(backup) > 0:
        return backup
    else:
        return 'NOLABEL'
def char_ngram(self, n, word):
    char_tokens = list(word)
    # prefix/suffix character n-grams are generated automatically here
    char_ngrams = ngrams(char_tokens, n)
    return map(lambda x: ''.join(x), char_ngrams)
words = nltk.tokenize.word_tokenize(recordNoStop)

# remove punctuation and digit tokens; filtering into a new list avoids the
# skipped-element bug of calling words.remove() while iterating over words
puncString = ".,?!()0123456789"
words = [c for c in words if c not in puncString]

# build a directed graph whose nodes are the remaining tokens
g = nx.DiGraph()
g.add_nodes_from(words)
numberOfKeywords = g.number_of_nodes() / 3

# connect consecutive tokens (bigrams) with directed edges
bg = ngrams(words, 2)
g.add_edges_from(bg)

# find all pairs in the sentences
pairEd = ""
pairEdCountDict = {}
maxPairCount = 0

# Begin loop 1
for item in g.edges():
    pairItem1 = 0
from nltk import word_tokenize from nltk.util import ngrams text = ['cant railway station', 'citadel hotel', 'police stn'] for line in text: token = word_tokenize(line) bigram = list(ngrams(token, 2)) print(bigram) print([[b for b in zip(l.split(" ")[:-1], l.split(" ")[1:])] for l in text]) # print([[b for b in zip(l.split(" ")[:-(n-1)], l.split(" ")[(n-1):])] for l in text])
import nltk, re from nltk.tokenize import word_tokenize # importing ngrams module from nltk from nltk.util import ngrams from collections import Counter from looking_glass import looking_glass_full_text cleaned = re.sub('\W+', ' ', looking_glass_full_text).lower() tokenized = word_tokenize(cleaned) # Change the n value to 2: looking_glass_bigrams = ngrams(tokenized, 2) looking_glass_bigrams_frequency = Counter(looking_glass_bigrams) # Change the n value to 3: looking_glass_trigrams = ngrams(tokenized, 3) looking_glass_trigrams_frequency = Counter(looking_glass_trigrams) # Change the n value to a number greater than 3: looking_glass_ngrams = ngrams(tokenized, 10) looking_glass_ngrams_frequency = Counter(looking_glass_ngrams) print("Looking Glass Bigrams:") print(looking_glass_bigrams_frequency.most_common(10)) print("\nLooking Glass Trigrams:") print(looking_glass_trigrams_frequency.most_common(10)) print("\nLooking Glass n-grams:") print(looking_glass_ngrams_frequency.most_common(10))
def _get_ngrams(text, n): punctuation = set(string.punctuation) no_punc = "".join(char for char in text.lower() if char not in punctuation) words = word_tokenize(no_punc) return set(ngrams(words, n))
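# Hedged usage sketch (not part of the original): punctuation is stripped and
# the text lower-cased before the n-gram set is built, so the two bigram sets
# below can be compared directly. Assumes `string`, `word_tokenize` and
# `ngrams` are imported as used in the function above.
a = _get_ngrams("The quick brown fox.", 2)
b = _get_ngrams("A quick brown dog!", 2)
print(a & b)  # {('quick', 'brown')}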
ls = LancasterStemmer() ss = SnowballStemmer('english') print("Stemming Output") for words in wtokens: print("Porter stemming Output") #print(ps.stem(words)) print("Lancaster stemming Output") #print(ls.stem(words)) print("Snowball stemming Output") #print(ss.stem(words)) # Lemmatization lemmatizer = WordNetLemmatizer() print("Lemmatized Output") #print(lemmatizer.lemmatize(text)) # Parts of speech for w in wtokens: print("POS output") # print(nltk.pos_tag(w)) # Named Entity Recognition sentence = "The grapevine has it that disgruntled Congressmen are looking to join hands with BJP to bring down Karnataka government" print(ne_chunk(pos_tag(word_tokenize(sentence)))) # Trigram mySentence = "Hi How are you? i am fine and you" token = nltk.word_tokenize(mySentence) trigram = ngrams(token, 3) for t in trigram: print(t)
import nltk  # needed below for nltk.pos_tag
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk import ne_chunk
from collections import Counter

ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()

text = open('input.txt', encoding="utf8").read()
w_tokens = word_tokenize(text)
s_tokens = sent_tokenize(text)
print("Word tokens:", w_tokens)
print("\nSentence tokens:", s_tokens)

trigrams = ngrams(w_tokens, 3)
print("\nTrigrams: ", list(trigrams))

lemmatized_output = ' '.join([lemmatizer.lemmatize(w) for w in w_tokens])
print("\nLemmatization:\n", lemmatized_output)

stemmed_output = ' '.join([ps.stem(w) for w in w_tokens])
print("\nStemming:\n", stemmed_output)

n_pos = nltk.pos_tag(w_tokens)
print("\nParts of Speech :", n_pos)

noe = ne_chunk(n_pos)
print("\nNamed Entity Recognition :", noe)
from preprocessing import preprocess_text from nltk.util import ngrams from collections import Counter text = "It's exciting to watch flying fish after a hard day's work. I don't know why some fish prefer flying and other fish would rather swim. It seems like the fish just woke up one day and decided, 'hey, today is the day to fly away.'" tokens = preprocess_text(text) # Bigram approach: bigrams_prepped = ngrams(tokens, 2) bigrams = Counter(bigrams_prepped) print("Three most frequent word sequences and the number of occurrences according to Bigrams:") print(bigrams.most_common(3)) # Bag-of-Words approach: # Define bag_of_words here: bag_of_words = Counter(tokens) print("\nThree most frequent words and number of occurrences according to Bag-of-Words:") most_common_three = bag_of_words.most_common(3) print(most_common_three)
def ngram(words, n):
    output = list(ngrams(words, n))
    return output
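# Hedged usage sketch (not from the original source): the thin wrapper above
# simply materialises nltk.util.ngrams into a list.
print(ngram(['a', 'b', 'c', 'd'], 2))  # [('a', 'b'), ('b', 'c'), ('c', 'd')]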
import nltk from nltk.util import ngrams from nltk.corpus import alpino print(alpino.words()) bigrams_tokens = ngrams(alpino.words(), 2) for i in bigrams_tokens: print(i)
f = open('input.txt').read()
z = sent_tokenize(f)

word_token = []
for a in z:
    # tokenize each sentence, not the whole file again
    word_token.append(word_tokenize(a))

l = []
Lm = WordNetLemmatizer()
for i in word_token:
    for x in i:
        l.append(Lm.lemmatize(x, 'v'))
print('lemmatizing words, we get: ', l)

b = []
bigram_logic = ngrams(l, 2)
for j in bigram_logic:
    b.append(j)
print('\n', 'bigram solution is :', b)

count = nltk.FreqDist(b)
freq = []
for i, j in count.items():
    freq.append((i, j))
print('\n', 'bigrams and frequencies are: ', freq)

common5 = count.most_common(5)
print('\n', 'repeated bi-grams are: ', common5)

text = []
for i in common5:
    text.append(i[0])
train.loc[train.SentenceId == 2] print('Average count of phrases per sentence in train is {0:.0f}.'.format( train.groupby('SentenceId')['Phrase'].count().mean())) print('Average count of phrases per sentence in test is {0:.0f}.'.format( test.groupby('SentenceId')['Phrase'].count().mean())) print( 'Number of phrases in train: {}. Number of sentences in train: {}.'.format( train.shape[0], len(train.SentenceId.unique()))) print('Number of phrases in test: {}. Number of sentences in test: {}.'.format( test.shape[0], len(test.SentenceId.unique()))) print('Average word length of phrases in train is {0:.0f}.'.format( np.mean(train['Phrase'].apply(lambda x: len(x.split()))))) print('Average word length of phrases in test is {0:.0f}.'.format( np.mean(test['Phrase'].apply(lambda x: len(x.split()))))) text = ' '.join(train.loc[train.Sentiment == 4, 'Phrase'].values) text_trigrams = [i for i in ngrams(text.split(), 3)] Counter(text_trigrams).most_common(30) text = ' '.join(train.loc[train.Sentiment == 4, 'Phrase'].values) text = [i for i in text.split() if i not in stopwords.words('english')] text_trigrams = [i for i in ngrams(text, 3)] Counter(text_trigrams).most_common(30) tokenizer = TweetTokenizer() vectorizer = TfidfVectorizer(ngram_range=(1, 2), tokenizer=tokenizer.tokenize) full_text = list(train['Phrase'].values) + list(test['Phrase'].values) vectorizer.fit(full_text) train_vectorized = vectorizer.transform(train['Phrase']) test_vectorized = vectorizer.transform(test['Phrase']) y = train['Sentiment'] logreg = LogisticRegression() ovr = OneVsRestClassifier(logreg) ### %time
from nltk import word_tokenize from nltk.util import ngrams from nltk.corpus import inaugural import numpy as np file_content = inaugural.raw('2009-Obama.txt') tokens = word_tokenize(file_content) print('\nTokens List:\n') print(tokens) length = len(list(tokens)) result = list() gramslist = ngrams(tokens, 1) dictionary = {} for gram in gramslist: if str(gram) in dictionary: dictionary[str(gram)] += 1 else: dictionary[str(gram)] = 1 print(np.mean(list(dictionary.values())))
tokenized_words = word_tokenize(contents) # Lemmatization from nltk.stem import WordNetLemmatizer wordnet_lemmatizer = WordNetLemmatizer() lemmatized_words = [wordnet_lemmatizer.lemmatize(w) for w in tokenized_words] print("Lemmatized words\n") print(lemmatized_words) print("\n") # Bigrams from nltk.util import ngrams bigrams = list(ngrams(tokenized_words, 2)) print("Bigrams\n") print(bigrams) print("\n") # Top 5 Bigrams import nltk fdist = nltk.FreqDist(bigrams) top_5 = fdist.most_common(5) print("Top 5 bigrams \n ") print(top_5) print("\n") # lines with the top 5 bigrams summary = ''
def modified_precision(references, hypothesis, n): """ Calculate modified ngram precision. The normal precision method may lead to some wrong translations with high-precision, e.g., the translation, in which a word of reference repeats several times, has very high precision. This function only returns the Fraction object that contains the numerator and denominator necessary to calculate the corpus-level precision. To calculate the modified precision for a single pair of hypothesis and references, cast the Fraction object into a float. The famous "the the the ... " example shows that you can get BLEU precision by duplicating high frequency words. >>> reference1 = 'the cat is on the mat'.split() >>> reference2 = 'there is a cat on the mat'.split() >>> hypothesis1 = 'the the the the the the the'.split() >>> references = [reference1, reference2] >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS 0.2857... In the modified n-gram precision, a reference word will be considered exhausted after a matching hypothesis word is identified, e.g. >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', ... 'ensures', 'that', 'the', 'military', 'will', ... 'forever', 'heed', 'Party', 'commands'] >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', ... 'guarantees', 'the', 'military', 'forces', 'always', ... 'being', 'under', 'the', 'command', 'of', 'the', ... 'Party'] >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', ... 'army', 'always', 'to', 'heed', 'the', 'directions', ... 'of', 'the', 'party'] >>> hypothesis = 'of the'.split() >>> references = [reference1, reference2, reference3] >>> float(modified_precision(references, hypothesis, n=1)) 1.0 >>> float(modified_precision(references, hypothesis, n=2)) 1.0 An example of a normal machine translation hypothesis: >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', ... 'ensures', 'that', 'the', 'military', 'always', ... 'obeys', 'the', 'commands', 'of', 'the', 'party'] >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops', ... 'forever', 'hearing', 'the', 'activity', 'guidebook', ... 'that', 'party', 'direct'] >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', ... 'ensures', 'that', 'the', 'military', 'will', ... 'forever', 'heed', 'Party', 'commands'] >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', ... 'guarantees', 'the', 'military', 'forces', 'always', ... 'being', 'under', 'the', 'command', 'of', 'the', ... 'Party'] >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', ... 'army', 'always', 'to', 'heed', 'the', 'directions', ... 'of', 'the', 'party'] >>> references = [reference1, reference2, reference3] >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS 0.9444... >>> float(modified_precision(references, hypothesis2, n=1)) # doctest: +ELLIPSIS 0.5714... >>> float(modified_precision(references, hypothesis1, n=2)) # doctest: +ELLIPSIS 0.5882352941176471 >>> float(modified_precision(references, hypothesis2, n=2)) # doctest: +ELLIPSIS 0.07692... :param references: A list of reference translations. :type references: list(list(str)) :param hypothesis: A hypothesis translation. :type hypothesis: list(str) :param n: The ngram order. :type n: int :return: BLEU's modified precision for the nth order ngram. :rtype: Fraction """ # Extracts all ngrams in hypothesis # Set an empty Counter if hypothesis is empty. 
counts = Counter(ngrams(hypothesis, n)) if len(hypothesis) >= n else Counter() # Extract a union of references' counts. # max_counts = reduce(or_, [Counter(ngrams(ref, n)) for ref in references]) max_counts = {} for reference in references: reference_counts = (Counter(ngrams(reference, n)) if len(reference) >= n else Counter()) for ngram in counts: max_counts[ngram] = max(max_counts.get(ngram, 0), reference_counts[ngram]) # Assigns the intersection between hypothesis and references' counts. clipped_counts = { ngram: min(count, max_counts[ngram]) for ngram, count in counts.items() } numerator = sum(clipped_counts.values()) # Ensures that denominator is minimum 1 to avoid ZeroDivisionError. # Usually this happens when the ngram order is > len(reference). denominator = max(1, sum(counts.values())) return Fraction(numerator, denominator, _normalize=False)
wordDict = dict(fdist) dict1 = {} counter = 0 # Index Dictionary Mapping for key, value in wordDict.items(): dict1[counter] = key counter += 1 # Converting words1 and tokenizing it for bigram function NLTK package usage wordsNew = copy.deepcopy(words1) w3 = ' '.join(wordsNew) token = nltk.word_tokenize(w3) print(token) # Creating bigrams bigrams = ngrams(token,2) s = copy.deepcopy(Counter(bigrams)) # Creating bigram dictionaries for probabiltiy calculations bigramDict = dict(s) bigramDict1 = copy.deepcopy(bigramDict) bigramDict2 = copy.deepcopy(bigramDict1) # Creating probability dictionary probIndex = {} for i, g in bigramDict1.items(): firstWord = i[0] totalFreq = 0 totalsum = 0 for k, v in bigramDict1.items(): if(k[0] == firstWord): totalFreq+= v for k, v in bigramDict1.items():
def cross_fold(x): # train_data = [] # train_labels = [] i = 0 train_data = [] test_data = [] test_labels = [] train_labels = [] for line in file1: if i >= x * 1106 and i < (x + 1) * 1106: test_data.append(line) test_labels.append("pos") # else: # train_data.append(line) # train_labels.append("pos") i += 1 i = 0 for line in file2: if i >= x * 1106 and i < (x + 1) * 1106: test_data.append(line) test_labels.append("neg") # else: # train_data.append(line) # train_labels.append("neg") i += 1 label = 0 TP = 0 TN = 0 FP = 0 FN = 0 for content in test_data: lines = content.split("।") valence_list = [] for line in lines: line.strip() token_list = remove_punc(line) token_list = remove_stopwords(token_list) #print(token_list) token_line = [] token_stem = [] #print(token_line) for word in token_list: if token_list.index(word) == len(token_list) - 1: stem_word, last = stem_verb(word) token_stem.append(stem_word) if last != "": token_stem.append(last) else: stem_word = _stem_verb_step_1(word) if stem_word in lex_dic: token_stem.append(stem_word) else: stem_word = _stem_verb_step_2(word) if stem_word in lex_dic: token_stem.append(stem_word) else: token_stem.append(word) #print(token_stem) for word in token_stem: if word != "": token_line.append(word) #print(token_line) # Bi-gram word bigrams_list = ngrams(token_line, 2) bigram_token_list = [] remove_token = [] for bigram in bigrams_list: bigram_words = bigram[0] + "_" + bigram[1] #bigram_words = "_".join(list(bigram)) if bigram_words in lex_dic: bigram_token_list.append(bigram_words) remove_token.append(bigram[0]) remove_token.append(bigram[1]) # Tri-gram word trigrams_list = ngrams(token_line, 3) trigram_token_list = [] for trigram in trigrams_list: trigram_words = trigram[0] + "_" + trigram[1] + "_" + trigram[2] #bigram_words = "_".join(list(bigram)) if trigram_words in lex_dic: trigram_token_list.append(trigram_words) remove_token.append(trigram[0]) remove_token.append(trigram[1]) remove_token.append(trigram[2]) for w in remove_token: ind = token_line.index(w) token_line.pop(ind) token_line = trigram_token_list + token_line token_line = bigram_token_list + token_line sentiments = [] for item in token_line: valence = 0 i = token_line.index(item) if item in booster_dic: sentiments.append(valence) continue sentiments = words_valence(valence, token_line, item, i, sentiments) #print(sentiments) valence_list.append(score_valence(sentiments)) # print(valence_line) valence_content = np.mean(valence_list) valence_content_pos = sum(i > 0 for i in valence_list) valence_content_neg = sum(i < 0 for i in valence_list) if test_labels[label] == "pos" and valence_content > 0: TP += 1 elif test_labels[label] == "neg" and valence_content < 0: TN += 1 elif test_labels[label] == "pos" and valence_content < 0: if valence_content_pos >= valence_content_neg: TP += 1 else: FN += 1 # print(valence_list,valence_content) # print(valence_content_pos,valence_content_neg,label) # print(content) elif test_labels[label] == "neg" and valence_content > 0: if valence_content_pos <= valence_content_neg: TN += 1 else: FP += 1 # print(line) # else: # # print(valence_list,valence_content) # # print(valence_content_pos,valence_content_neg,label) # # print(content) label += 1 print("Accuracy:", (TP + TN) / (TP + TN + FP + FN)) PRECISION = TP / (TP + FP) RECALL = TP / (TP + FN) print("Precision:", TP / (TP + FP)) print("Recall:", TP / (TP + FN)) # print(TP,TN,FP,FN) # print(TP+TN+FP+FN) Accuracy.append((TP + TN) / (TP + TN + FP + FN)) Precision.append(PRECISION) Recall.append(RECALL) 
f1_score.append((2 * PRECISION * RECALL) / (PRECISION + RECALL))
values_to_text.sort(reverse=True) ##add to an array these values words for i in values_to_text: words_to_text.append(w[i]) words_to_text = words_to_text[:l] ##length of text f = open("unigram_output.txt", "a") with open('unigram_output.txt', 'r+') as f: for i in words_to_text: f.write(str(i[0]) + ' ') ##ngrams with using nltk unigram = ngrams(entokens, 1) bigrams = nltk.bigrams(entokens) trigrams = nltk.trigrams(entokens) ##freq for each gram ##ore detailed information can be viewed with .items() .keys() .values() unigram_freq = nltk.FreqDist(unigram) bigrams_freq = nltk.FreqDist(bigrams) trigrams_freq = nltk.FreqDist(trigrams) words_unigram = [] values_unigram = [] words_bigrams = [] values_bigrams = [] words_trigrams = [] values_trigrams = []
NgramType = 3
inputFile = "testClean.txt"
outputFile = "feature" + str(NgramType) + "GramPoSTag.txt"
try:
    nGramFeatureSet = {}
    with codecs.open(inputFile, "r", "utf-8") as file:
        for line in file:
            elementList = line.split("@-?@")
            print(elementList[0])
            elementsParser = parse(elementList[2])
            PoSTagList = []
            for PoSTag in elementsParser.split(" "):
                elements = PoSTag.split("/")
                PoSTagList.append(elements[1])
            nGrams = ngrams(PoSTagList, NgramType)
            for nGram in nGrams:
                nGram = ' '.join(e for e in nGram)
                if nGram in nGramFeatureSet:
                    nGramFeatureSet[nGram] = nGramFeatureSet[nGram] + 1
                else:
                    nGramFeatureSet[nGram] = 1
    nGramFeatureSetSort = sorted(nGramFeatureSet.items(), key=operator.itemgetter(1), reverse=True)
    with codecs.open(outputFile, "w", "utf-8") as file:
        for a in nGramFeatureSetSort:
            file.write(a[0] + "@-?@" + str(a[1]) + "\n")
except IOError as err:
def word_rank_alignment(reference, hypothesis, character_based=False): """ This is the word rank alignment algorithm described in the paper to produce the *worder* list, i.e. a list of word indices of the hypothesis word orders w.r.t. the list of reference words. Below is (H0, R0) example from the Isozaki et al. 2010 paper, note the examples are indexed from 1 but the results here are indexed from 0: >>> ref = str('he was interested in world history because he ' ... 'read the book').split() >>> hyp = str('he read the book because he was interested in world ' ... 'history').split() >>> word_rank_alignment(ref, hyp) [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5] The (H1, R1) example from the paper, note the 0th index: >>> ref = 'John hit Bob yesterday'.split() >>> hyp = 'Bob hit John yesterday'.split() >>> word_rank_alignment(ref, hyp) [2, 1, 0, 3] Here is the (H2, R2) example from the paper, note the 0th index here too: >>> ref = 'the boy read the book'.split() >>> hyp = 'the book was read by the boy'.split() >>> word_rank_alignment(ref, hyp) [3, 4, 2, 0, 1] :param reference: a reference sentence :type reference: list(str) :param hypothesis: a hypothesis sentence :type hypothesis: list(str) """ worder = [] hyp_len = len(hypothesis) # Stores a list of possible ngrams from the reference sentence. # This is used for matching context window later in the algorithm. ref_ngrams = [] hyp_ngrams = [] for n in range(1, len(reference) + 1): for ng in ngrams(reference, n): ref_ngrams.append(ng) for ng in ngrams(hypothesis, n): hyp_ngrams.append(ng) for i, h_word in enumerate(hypothesis): # If word is not in the reference, continue. if h_word not in reference: continue # If we can determine one-to-one word correspondence for unigrams that # only appear once in both the reference and hypothesis. elif hypothesis.count(h_word) == reference.count(h_word) == 1: worder.append(reference.index(h_word)) else: max_window_size = max(i, hyp_len - i + 1) for window in range(1, max_window_size): if i + window < hyp_len: # If searching the right context is possible. # Retrieve the right context window. right_context_ngram = tuple( islice(hypothesis, i, i + window + 1)) num_times_in_ref = ref_ngrams.count(right_context_ngram) num_times_in_hyp = hyp_ngrams.count(right_context_ngram) # If ngram appears only once in both ref and hyp. if num_times_in_ref == num_times_in_hyp == 1: # Find the position of ngram that matched the reference. pos = position_of_ngram(right_context_ngram, reference) worder.append(pos) # Add the positions of the ngram. break if window <= i: # If searching the left context is possible. # Retrieve the left context window. left_context_ngram = tuple( islice(hypothesis, i - window, i + 1)) num_times_in_ref = ref_ngrams.count(left_context_ngram) num_times_in_hyp = hyp_ngrams.count(left_context_ngram) if num_times_in_ref == num_times_in_hyp == 1: # Find the position of ngram that matched the reference. pos = position_of_ngram(left_context_ngram, reference) # Add the positions of the ngram. worder.append(pos + len(left_context_ngram) - 1) break return worder
word = tags[0] wordTag = tags[1] if '+' in wordTag: position = wordTag.find('+') wordTag = wordTag[0:position] if '-' in wordTag and wordTag != '--': position = wordTag.find('-') wordTag = wordTag[0:position] tag_list.append(wordTag) corpus_with_tag.append((word, wordTag)) print("Done creating tag lists....") print("Creating tag corpus...") #Code snippet that works upon the unigrams list unigrams = ngrams(tag_list, 1) unigrams_freq = Counter(unigrams) #Code snippet that works upon the bigrams list bigrams = ngrams(tag_list, 2) bigrams_freq = Counter(bigrams) #Code snippet that works upon the trigrams list trigrams = ngrams(tag_list, 3) trigrams_freq = Counter(trigrams) #Length of the corpus len_corpus = brown.words().__len__() word_with_tag = Counter(corpus_with_tag) print("Corpus tagged!")
import nltk from nltk.tokenize import word_tokenize from nltk.util import ngrams from sklearn.preprocessing import LabelEncoder from keras.preprocessing.text import one_hot sentences = ["To Sherlock Holmes she is always the woman.", "I have seldom heard him mention her under any other name."] bigrams = [] for sentence in sentences: sequence = word_tokenize(sentence) bigrams.extend(list(ngrams(sequence, 2))) #print(bigrams) freq_dist = nltk.FreqDist(bigrams) prob_dist = nltk.MLEProbDist(freq_dist) number_of_bigrams = freq_dist.N() #Finding the unigram representation from sklearn.feature_extraction.text import CountVectorizer # vectorizer=CountVectorizer() # unigram_training_words=vectorizer.fit_transform(bigrams) # print( unigram_training_words.shape) import pandas as pd #df = pd.read_csv('Consumer_Complaints.csv') # df=pd.read_csv('finaltext.csv',delimiter='\t',encoding='utf-8') # print(df.head()) # print(df[text]) label=[] text=[] import csv from sklearn.model_selection import train_test_split with open('finaltext.csv') as myFile1: reader = csv.reader(myFile1,delimiter=',') for row in reader:
return fixed_keyword root = r"C:\Users\JLee35\dentsu\iProspect Hub - Documents\Channels\Owned & Earned\Automation\Microsoft\Word\Data\Erica's Project" input_dir = "Input" output_dir = "Output" for filename in os.listdir(os.path.join(root,input_dir)): if filename.endswith(".docx"): save_name = filename.replace('.docx','.csv') doc = getText(os.path.join(root,input_dir,filename)) all_ngrams = pd.DataFrame(columns=['word','count']) unigrams = doc.split() bigrams = ngrams(unigrams,2) bigrams = dict(collections.Counter(bigrams)) trigrams = ngrams(unigrams, 3) trigrams = dict(collections.Counter(trigrams)) unigrams = dict(collections.Counter(unigrams)) unigrams = pd.DataFrame.from_dict(list(unigrams.items())) unigrams = unigrams.rename(columns={0:'word',1:'count'}) all_ngrams = all_ngrams.append(unigrams) bigrams = pd.DataFrame.from_dict(list(bigrams.items())) bigrams = bigrams.rename(columns={0:'word',1:'count'}) bigrams['word'] = bigrams.word.apply(lambda x: fix_those_brackets(x)) all_ngrams = all_ngrams.append(bigrams)
if __name__ == '__main__': xmldoc = sys.argv[1] knownJava = sys.argv[2] knownCpp = sys.argv[3] ################################################################### # Section 1: Gather known data to create frequencies for known information ################################################################### knownJavaFile = open(knownJava) knownJavaString = "" for line in knownJavaFile: knownJavaString += line # knownJavaGram = ngramsFunction(knownJavaString, 3) knownJavaGram = ngrams(knownJavaString.split(' '), 3) #ngramsFunction(knownJavaString, 3) knownJavaHashFreq = nltk.FreqDist(knownJavaGram) # javaMaxGram = max(knownJavaHashFreq, key=knownJavaHashFreq.get) # print(javaMaxGram, knownJavaHashFreq[javaMaxGram]) knownCPPFile = open(knownCpp) knownCPPString = "" for line in knownCPPFile: knownCPPString += line # print(knownCPPString) knownCPPGram = ngrams(knownCPPString.split(' '), 3) knownCPPHashFreq = nltk.FreqDist(knownCPPGram) # cppMaxGram = max(knownCPPHashFreq, key=knownCPPHashFreq.get)
for tok in tok_arr: if tok.endswith('*'): tok = tok[:-1] if tok.endswith('.') or tok.endswith(','): tok = tok[:-1] if tok not in pron_dict: oov.add(tok) token_dict[tok] = token_dict[tok] + 1 if tok in token_dict else 1 sum_utt_len += len(tok_arr) no_of_lines += 1 # n-gram analysis (character based) chrs = [c for c in utt_lower] unigrams = ngrams(chrs, 1) for c in unigrams: unigram_dict[c] = unigram_dict[c] + 1 if c in unigram_dict else 1 bigrams = ngrams(chrs, 2) for bigram in bigrams: bigram_dict[ bigram] = bigram_dict[bigram] + 1 if bigram in bigram_dict else 1 quingrams = ngrams(chrs, 5) for quingram in quingrams: quingram_dict[quingram] = quingram_dict[ quingram] + 1 if quingram in quingram_dict else 1 # print the results