def create_bigram_model(span_eng_dict):
    bigram_span_dict = collections.defaultdict(int)
    bigram_eng_dict = collections.defaultdict(int)
    bigram_span_eng_dict = {}
    span_text = codecs.open('SpanishText.txt', encoding='utf-8')
    for sentence in span_text.readlines():
        line = [re.sub('[.?!",]', '', word) for word in sentence.split()]
        for word1, word2 in bigrams(line):
            bigram_span_dict[(word1.lower(), word2.lower())] += 1
        for word1, word2 in bigrams(line):
            try:
                bigram_span_eng_dict[(word1.lower(), word2.lower())] = (
                    span_eng_dict[word1.lower()], span_eng_dict[word2.lower()])
            except KeyError:
                pass
    eng_text = open('DMT_translate.txt')
    for sentence in eng_text.readlines():
        line = [re.sub('[.?!",]', '', word) for word in sentence.split()]
        for word1, word2 in bigrams(line):
            bigram_eng_dict[(word1.lower(), word2.lower())] += 1
    text = ''
    for k, v in bigram_span_eng_dict.items():
        # .get() defaults avoid comparing None when a bigram is missing
        if bigram_span_dict.get(k, 0) >= 1 and \
                bigram_span_dict.get(k) == bigram_eng_dict.get(v, 0):
            text += k[0] + ' ' + k[1] + ' '
    return text
def wiki_to_feature(wiki):
    """
    Specifically handles a single wiki document

    :param wiki: dict for wiki fields
    :type wiki: dict

    :return: tuple with wiki id and list of feature strings
    :rtype: tuple
    """
    try:
        features = []
        bow = []
        features += [u'ORIGINAL_HUB:%s' % wiki.get(u'hub_s', u'')]
        features += [u'TOP_CAT:%s' % u'_'.join(normalize(c))
                     for c in wiki.get(u'top_categories_mv_en', [])]
        bow += [u"_".join(normalize(c))
                for c in wiki.get(u'top_categories_mv_en', [])]
        features += [u'TOP_ART:%s' % u"_".join(normalize(a))
                     for a in wiki.get(u'top_articles_mv_en', [])]
        bow += [u"_".join(normalize(a))
                for a in wiki.get(u'top_articles_mv_en', [])]
        desc_ngrams = [u"_".join(n)
                       for grouping in [bigrams(normalize(np)) for np in
                                        TextBlob(wiki.get(u'description_txt', [u''])[0]).noun_phrases]
                       for n in grouping]
        bow += desc_ngrams
        features += [u'DESC:%s' % d for d in desc_ngrams]
        bow += [u"_".join(b) for b in bigrams(normalize(wiki[u'sitename_txt'][0]))]
        mp_nps = TextBlob(wiki.get(u'main_page_text', u'')).noun_phrases
        bow += [u"_".join(bg)
                for grouping in [bigrams(normalize(n)) for n in mp_nps]
                for bg in grouping]
        bow += [u''.join(normalize(w))
                for words in [np.split(u" ") for np in mp_nps]
                for w in words]
        return wiki[u'id'], bow + features
    except Exception as e:
        print e, format_exc()
        raise e
def getFeatures(tokens, typefeat='unigrams'):
    if typefeat == 'unigrams':
        _features = FreqDist(tokens)
    elif typefeat == 'bigrams':
        _features = FreqDist(bigrams(tokens))
    elif typefeat == 'uni+bigrams':
        # bigrams() returns a generator, so materialize it before
        # concatenating with the token list
        _features = FreqDist(list(bigrams(tokens)) + tokens)
    else:
        raise ValueError('unknown feature type: %s' % typefeat)
    return _features
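# A minimal usage sketch for getFeatures above, assuming FreqDist and bigrams
# are imported as in the snippet (from nltk import FreqDist; from nltk.util
# import bigrams). FreqDist accepts any iterable, so the generator works too.
tokens = ['the', 'cat', 'sat', 'on', 'the', 'mat']
print(getFeatures(tokens, 'unigrams').most_common(2))    # e.g. [('the', 2), ('cat', 1)]
print(getFeatures(tokens, 'bigrams').most_common(1))     # e.g. [(('the', 'cat'), 1)]
print(len(getFeatures(tokens, 'uni+bigrams')))           # unigram + bigram types combined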
def Merge_for_summary(lis):
    tweets = lis
    all_bigrams = [list(bigrams(tweet)) for tweet in tweets]
    starting_nodes = [single_bigram[0] for single_bigram in all_bigrams]
    end_nodes = [single_bigram[-1] for single_bigram in all_bigrams]
    all_bigrams = [node for single_bigram in all_bigrams for node in single_bigram]
    all_bigrams = list(set(all_bigrams))
    # Collect every path between a start node and an end node in the bigram graph
    bigram_paths = []
    for single_start_node in tqdm(starting_nodes):
        bigram_graph = make_bigram_graph(all_bigrams, single_start_node)
        for single_end_node in end_nodes:
            possible_paths = breadth_first_search(bigram_graph, single_start_node, single_end_node)
            for path in possible_paths:
                bigram_paths.append(path)
    for tweet in tweets:
        bigram_paths.append(list(bigrams(tweet)))
    word_paths = []
    for path in tqdm(bigram_paths):
        word_paths.append(make_list(path))
    # ILP from the COWABS summarization model (pymprog-style API);
    # L is the global summary length budget
    begin('COWABS')
    x = var('x', len(word_paths), bool)
    y = var('y', len(content_vocab), bool)
    maximize(sum([linguistic_quality(word_paths[i]) * informativeness(word_paths[i]) * x[i]
                  for i in range(len(x))]) + sum(y))
    sum([x[i] * len(word_paths[i]) for i in range(len(x))]) <= L
    for j in range(len(y)):
        sum([x[i] for i in paths_with_content_words(j)]) >= y[j]
    for i in range(len(x)):
        sum(y[j] for j in content_words(i)) >= len(content_words(i)) * x[i]
    solve()
    result_x = [value.primal for value in x]
    result_y = [value.primal for value in y]
    end()
    chosen_paths = np.nonzero(result_x)
    st = ''
    for i in chosen_paths[0]:
        st += str(" ").join([token.encode('ascii', 'ignore') for token in word_paths[i]])
        st += '. '
    return st
def buildBiIndex(inverIndex):
    biIndex = {}
    print("Building Bigram Index ...")
    for token in inverIndex:
        bigramList = list(bigrams(token))
        bigramListLen = len(bigramList)
        for index, (firstChar, lastChar) in enumerate(bigramList):
            bigram = firstChar + lastChar
            # nltk's bigram generator does not create $m
            if index == 0:
                biIndex.setdefault("$" + firstChar, []).append(token)
            # nltk's bigram generator does not create m$; a separate if (not
            # elif), so two-character tokens get both boundary bigrams
            if index == bigramListLen - 1:
                biIndex.setdefault(lastChar + "$", []).append(token)
            biIndex.setdefault(bigram, []).append(token)
    print("Finished building Bigram Index")
    return biIndex
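# A hypothetical usage sketch for buildBiIndex: given an inverted index whose
# keys are vocabulary terms, candidates for the wildcard query "mo*" come from
# intersecting the posting lists of its k-grams ("$m" and "mo"). A real
# wildcard lookup would still post-filter the candidates against the pattern.
inverIndex = {'moon': [...], 'mood': [...], 'lemon': [...]}
biIndex = buildBiIndex(inverIndex)
candidates = set(biIndex.get('$m', [])) & set(biIndex.get('mo', []))
print(candidates)  # {'moon', 'mood'}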
def pos_tags(vocab_hash, sentence):
    sentence = sentence.split()
    unigram_hash = get_pos(vocab_hash, sentence)
    bigram_hash = get_pos(vocab_hash, bigrams(sentence))
    trigram_hash = get_pos(vocab_hash, trigrams(sentence))
    pos_tags = []
    ngram_to_tag = {}
    ngram_ordering = []
    # A while loop is needed here: rebinding the index inside a for loop
    # does not skip the words already consumed by a bigram or trigram match.
    i = 0
    while i < len(sentence):
        word = sentence[i]
        if word in unigram_hash:
            tag = unigram_hash[word]
            pos_tags.append(tag)
            ngram_to_tag[word] = tag
            ngram_ordering.append(word)
        elif i < len(sentence) - 1 and sentence[i] + " " + sentence[i + 1] in bigram_hash:
            bigram = sentence[i] + " " + sentence[i + 1]
            tag = bigram_hash[bigram]
            pos_tags.append(tag)
            ngram_to_tag[bigram] = tag
            ngram_ordering.append(bigram)
            i += 1
        elif i < len(sentence) - 2 and " ".join(sentence[i:i + 3]) in trigram_hash:
            # the slice must cover three words, not two
            trigram = " ".join(sentence[i:i + 3])
            tag = trigram_hash[trigram]
            pos_tags.append(tag)
            ngram_to_tag[trigram] = tag
            ngram_ordering.append(trigram)
            i += 2
        i += 1
    return pos_tags, ngram_to_tag, ngram_ordering
def gender_feature(text, feature_vect):
    """
    Extract the gender features
    :param text: raw input text
    :param feature_vect: contains a bag of words and a list of bigrams
    :return: a dictionary which contains the feature and its computed value
    """
    # sentence length and vocab features
    tokens = word_tokenize(text.lower())
    sentences = sent_tokenize(text.lower())
    words_per_sent = np.asarray([len(word_tokenize(s)) for s in sentences])

    # bag_of_word features
    bag_dict = {}
    for bag in feature_vect[:29]:
        bag_dict[bag] = bag in tokens

    # bigram features (materialize the bigram generator once instead of
    # recomputing it for every candidate bigram)
    token_bigrams = set(bigrams(tokens))
    bigram_dict = {}
    for big in feature_vect[29:]:
        bigram_dict[big] = big in token_bigrams

    # POS tagging features
    POS_tag = ['ADJ', 'ADV', 'DET', 'NOUN', 'PRT', 'VERB', '.']
    tagged_word = parse(text, chunks=False, tagset='UNIVERSAL').split()
    simplified_tagged_word = [(tag[0], map_tag('en-ptb', 'universal', tag[1]))
                              for s in tagged_word for tag in s]
    freq_POS = nltk.FreqDist(tag[1] for tag in simplified_tagged_word if tag[1] in POS_tag)

    d = dict({'sentence_length_variation': words_per_sent.std()}, **bag_dict)
    return dict(dict(d, **bigram_dict), **freq_POS)
def get_bigram_tags(self) -> List[Tuple[str, str]]:  # List imported from typing
    """Concatenate the POS tags of all sentences in the Bijankhan corpus
    and return the list of tag bigrams."""
    seq_tags = []
    gen = self.sent_tag_gen(100)
    for _, tags in gen:
        # plain loops instead of a side-effecting list comprehension
        for tag_seq in tags:
            seq_tags.extend(tag_seq)
    return list(bigrams(seq_tags))
def generate_unibitrigrams(key_score_file):
    # open in text mode; the lines are split on ',' as strings below
    with open(key_score_file, 'r') as infile:
        infile.readline()  # skip the header row
        key_list = []
        for line in infile:
            row = line.split(',')
            key_list.append(row[0])
    uni_bi_trigrams = []
    for phrase in key_list:
        words = []
        for word in nltk.word_tokenize(phrase):
            word = re.sub('[!"#$%&\'\(\)*+,-./:;<=>?@[\]\^_`{|}~]', '', word)
            words.append(word)
        unigrams_ls = words
        bigrams_ls = [x[0] + ' ' + x[1] for x in bigrams(words)]
        trigrams_ls = [x[0] + ' ' + x[1] + ' ' + x[2] for x in trigrams(words)]
        uni_bi_trigrams = uni_bi_trigrams + unigrams_ls + bigrams_ls + trigrams_ls
    return uni_bi_trigrams
def act(self):
    """
    Add words in the last observation to the dictionary.

    This checks any fields in the message present in the --dict-textfields
    argument (e.g. "text,labels").
    """
    for textfield in self.textfields:
        source = self.observation.get(textfield)
        if source is None:
            continue
        # fields may be singleton strings or lists of strings.
        # wrap the singleton strings in a list to iterate over them
        if isinstance(source, str):
            source = [source]
        for text in source:
            if text:
                tokens = self.tokenize(text)
                self.add_to_dict(tokens)
                unigram_ = nltk.ngrams(tokens, 1)
                bigrams_ = bigrams(tokens)
                trigrams_ = trigrams(tokens)
                self.unigram_freq.update(unigram_)
                self.bigram_freq.update(bigrams_)
                self.trigram_freq.update(trigrams_)
    return {'id': 'Dictionary'}
def autocorrect_query(query, df, cutoff=0.8, warning_on=True):
    """
    autocorrect a query based on the training set
    """
    train_data = df.values[df['search_term'].values == query, :]
    s = ""
    for r in train_data:
        # the explicit parser avoids bs4's "no parser specified" warning
        s = "%s %s %s" % (s,
                          BeautifulSoup(r[1], "html.parser").get_text(" ", strip=True),
                          BeautifulSoup(r[2], "html.parser").get_text(" ", strip=True))
    s = re.findall(r'[\'\"\w]+', s.lower())
    s_bigram = [' '.join(i) for i in bigrams(s)]
    s.extend(s_bigram)
    corrected_query = []
    for q in query.lower().split():
        if len(q) <= 2:
            corrected_query.append(q)
            continue
        if bool(re.search(r'\d', q)):
            # skip if it is a word with a number, like 4.5in_
            corrected_query.append(q)
            continue
        corrected_word = difflib.get_close_matches(q, s, n=1, cutoff=cutoff)
        if len(corrected_word) > 0:
            corrected_query.append(corrected_word[0])
        else:
            if warning_on:
                print("WARNING: cannot find matched word for '%s' -> used the original word" % (q))
            corrected_query.append(q)
    return ' '.join(corrected_query)
def list_followers(self, word):
    # `flattened` is assumed to be the flattened token list built elsewhere
    followers = set()
    for tup in bigrams(flattened):
        if tup[0] == word:
            followers.add(tup[1])
    return followers
def aggregate_topics_of_segmented_reports(self, cut_of_segmented_reports, topics):
    aggregated_topics = []
    # materialize the bigram generator so it can be indexed below
    bigrams_of_topics = list(bigrams([[t.decode('utf-8')] for t in topics]))
    for i in range(len(bigrams_of_topics)):
        for j in range(len(cut_of_segmented_reports)):
            start = cut_of_segmented_reports[j].index(bigrams_of_topics[i][0])
            end = cut_of_segmented_reports[j].index(bigrams_of_topics[i][1])
            aggregated_topics.extend(cut_of_segmented_reports[j][start:end])
    return aggregated_topics
def n_grams(word):
    bigrams_text = []
    with open(r"C:\PyCharmGrammarly\Grammarly\lyrics.txt", 'r') as tok:
        raw_text = tok.read()
    lines = []
    for i in raw_text.split("\n"):
        lines.append(word_tokenize(i))
    for l in lines:
        bigrams_text.extend(list(bigrams(l)))
    song = []
    for _ in range(96):
        # restart the candidate list for each generated word; otherwise
        # earlier candidates leak into later choices
        selection = []
        for first in bigrams_text:
            if word == first[0]:
                selection.append(first)
        if not selection:
            break
        freq_bi = FreqDist(selection)
        best = freq_bi.most_common()
        word = random.choice(best)[0][1]
        song.append(word)
    words_in_line = 0
    for lyric in song:
        words_in_line += 1
        if words_in_line % 5 == 0:
            print(lyric, "\n")
        else:
            print(lyric, end=" ")
def count_bigrams(input_fp, frequencies, buffer_size=1024):
    '''Read the text content of a file and keep a running count of how often
    each bigram (sequence of two characters) appears.

    Arguments:
        input_fp -- file pointer with input text
        frequencies -- mapping from each bigram to its counted frequency
        buffer_size -- incremental quantity of text to be read at a time,
            in bytes (1024 if not otherwise specified)

    Returns:
        nothing
    '''
    # Read the first chunk of text, and set all letters to lowercase
    text = input_fp.read(buffer_size).lower()
    # Loop over the file while there is text to read
    while text:
        spans = TOKENIZER.span_tokenize(text)
        tokens = (text[begin:end] for (begin, end) in spans)
        for bigram in bigrams(tokens):
            # Increment the count for the bigram. Automatically handles any
            # bigram not seen before. The join expression turns 2 separate
            # single-character strings into one 2-character string
            frequencies[''.join(bigram)] += 1
        # Read the next chunk of text, and set all letters to lowercase
        text = input_fp.read(buffer_size).lower()
    return
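# A minimal driver sketch for count_bigrams above. TOKENIZER is assumed to be
# any NLTK tokenizer that exposes span_tokenize (WhitespaceTokenizer is used
# here for illustration); the file name is a placeholder. frequencies must
# auto-initialize missing keys, hence the defaultdict.
from collections import defaultdict
from nltk.tokenize import WhitespaceTokenizer

TOKENIZER = WhitespaceTokenizer()
frequencies = defaultdict(int)
with open('input.txt') as input_fp:
    count_bigrams(input_fp, frequencies)
print(sorted(frequencies.items(), key=lambda kv: -kv[1])[:10])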
def sentence_preprocessing(sentence):
    # Remove HTML entities
    sentence = BeautifulSoup(sentence, "lxml").get_text()
    # Tokenize
    tokens = preprocess(sentence, lowercase=False)

    # Step 1: Remove all useless things
    tokens = [tk for tk in tokens if tk not in punc]  # Punctuation
    #tokens = [tk for tk in tokens if tk not in stop]  # Stopwords
    tokens = [tk for tk in tokens if re.match(link_pattern, tk) is None]  # Links
    tokens = [tk for tk in tokens if re.match(html_entities_pattern, tk) is None]  # HTML entities
    tokens = [tk for tk in tokens if tk.lower() not in remove_terms]  # Special terms to remove

    # Step 2: Add some lower/upper case variants
    tokens_lower = [tk.lower() for tk in tokens]  # Lowercase
    tokens_upper = [tk.upper() for tk in tokens]  # Uppercase
    tokens_title = [tk.title() for tk in tokens]  # Title
    tokens = tokens + tokens_lower + tokens_upper + tokens_title

    # Step 3: Add bigrams
    bigrams_words = ['_'.join(w) for w in bigrams(tokens)]
    sentence_clean = ' '.join(tokens) + ' ' + ' '.join(bigrams_words)
    return sentence_clean
def count_bigrams(input_fp, frequencies, buffer_size=1024):
    '''Read the text content of a file and keep a running count of how often
    each bigram (sequence of two characters) appears.

    Arguments:
        input_fp -- file pointer with input text
        frequencies -- mapping from each bigram to its counted frequency
        buffer_size -- incremental quantity of text to be read at a time,
            in bytes (1024 if not otherwise specified)

    Returns:
        nothing
    '''
    # Read the first chunk of text, and set all letters to lowercase
    text = input_fp.read(buffer_size).lower()
    # Loop over the file while there is text to read
    while text:
        spans = TOKENIZER.span_tokenize(text)
        tokens = (text[begin:end] for (begin, end) in spans)
        for bigram in bigrams(tokens):
            # Accommodate the bigram if seen for the first time
            frequencies.setdefault(bigram, 0)
            # Increment the count for the bigram
            frequencies[bigram] += 1
        # Read the next chunk of text, and set all letters to lowercase
        text = input_fp.read(buffer_size).lower()
    return
def count_bigrams(frequencies):
    '''Read the text content of a file and keep a running count of how often
    each bigram (sequence of two characters) appears.

    Arguments:
        frequencies -- mapping from each bigram to its counted frequency

    Returns:
        nothing
    '''
    # Read in all the text from the file and set all letters to lowercase
    with open(sys.argv[1], "r") as f:
        text = f.read().lower()
    # This step is needed to collapse runs of space characters into one
    text = ' '.join(text.split())
    tokens = TOKENIZER.tokenize(text)
    for bigram in bigrams(tokens):
        # Increment the count for the bigram. Automatically handles any
        # bigram not seen before. The join expression turns 2 separate
        # single-character strings into one 2-character string
        frequencies[''.join(bigram)] += 1
    return
def ngrams(self, gram_size=3):
    """Gives ngrams.

    Returns a list of ngrams, each ngram represented as a tuple.

    Args:
        gram_size (:obj:`int`, optional) Size of the ngrams to generate

    Returns:
        :obj:`list` of :obj:`tuple` Words of each ngram

    Example:
        >>> text = EnglishText('They hated to think of sample sentences.')
        >>> basic_ngrams = text.ngrams()
        >>> print(basic_ngrams)
        [('They', 'hated', 'to'), ('hated', 'to', 'think'), ('to', 'think', 'of'), ('think', 'of', 'sample'), ('of', 'sample', 'sentences'), ('sample', 'sentences', '.')]
    """  # noqa
    tokens = self.tokenize()
    if gram_size < 2:   # pragma: no cover
        gram_size = 2
    if gram_size == 2:  # pragma: no cover
        return list(bigrams(tokens))
    if gram_size == 3:
        return list(trigrams(tokens))
    else:               # pragma: no cover
        return list(ngrams(tokens, gram_size))
def generate_ds(self, words):
    learning_info_dict = {lang: {w: float(t) for w, t in self._language_model_cfd[lang].most_common()}
                          for lang in self._language_model_cfd.keys()}
    # character bigrams pooled across all test words
    testing_info_dict = {w: float(t) for w, t in
                         FreqDist([tpl for word in words for tpl in bigrams(word)]).most_common()}
    return learning_info_dict, testing_info_dict
def generate_ds(self, words):
    learning_info_dict = {lang: {w: float(t) for w, t in self._language_model_cfd[lang].most_common()}
                          for lang in self._language_model_cfd.keys()}
    # word bigrams over the lowercased test words
    testing_info_dict = {w: float(t) for w, t in
                         FreqDist(bigrams([w.lower() for w in words])).most_common()}
    return learning_info_dict, testing_info_dict
def organize_aggregated_topics_by_dict(self, aggregated_topics, topics):
    aggregated_topics.append([None])
    topics.pop()
    topics.append(None)
    # list comprehensions and list(bigrams(...)) keep this working on
    # Python 3 / NLTK 3, where map and bigrams return lazy, unindexable
    # iterators
    modified_topics = [[x.decode('utf-8')] for x in topics[0:-1]]
    modified_topics.append([None])
    modified_bigrams_of_topics = list(bigrams(modified_topics))
    aggregated_list_of_tuple = []
    for i in range(len(modified_bigrams_of_topics)):
        start = aggregated_topics.index(modified_bigrams_of_topics[i][0])
        end = aggregated_topics.index(modified_bigrams_of_topics[i][1])
        aggregated_list_of_tuple.append(
            (aggregated_topics[start][0].encode('utf-8'),
             aggregated_topics[start:end]))
    dict_of_sentences_by_topic = dict(aggregated_list_of_tuple)
    for k, v in dict_of_sentences_by_topic.items():
        dict_of_sentences_by_topic[k] = [
            s for s in dict_of_sentences_by_topic[k] if s != [k.decode('utf-8')]
        ]
    for k, v in dict_of_sentences_by_topic.items():
        for i in range(len(dict_of_sentences_by_topic[k])):
            dict_of_sentences_by_topic[k][i] = [w.lower() for w in dict_of_sentences_by_topic[k][i]]
    return dict_of_sentences_by_topic
def get_bigram(text_list):
    # text_list is a list of strings
    new_list = []
    for text in text_list:
        new_list.append(list(bigrams(text)))
    return new_list
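# A quick usage sketch: nltk's bigrams iterates any sequence, so strings
# yield character bigrams while token lists yield word bigrams.
print(get_bigram(['abc']))               # [[('a', 'b'), ('b', 'c')]]
print(get_bigram([['to', 'be', 'or']]))  # [[('to', 'be'), ('be', 'or')]]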
def _get_filtered_bigrams(self, words):
    filtered_bigrams = []
    for bi in bigrams(words):
        # keep a bigram only if neither word is a stopword and the two differ
        if not any(w in stopwords for w in bi) and bi[0] != bi[1]:
            filtered_bigrams.append(bi)
    return filtered_bigrams
def sentence_preprocessing(sentence):
    # Remove HTML entities
    sentence = BeautifulSoup(sentence, "lxml").get_text()
    # Tokenize
    tokens = preprocess(sentence, lowercase=False)

    # Step 1: Remove all useless things
    tokens = [tk for tk in tokens if tk not in punc]  # Punctuation
    tokens = [tk for tk in tokens if tk.lower() not in stop]  # Stopwords
    tokens = [tk for tk in tokens if re.match(link_pattern, tk) is None]  # Links
    tokens = [tk for tk in tokens if emoticon_re.search(tk) is None]  # Emoticons
    #tokens = [tk for tk in tokens if re.match(html_entities_pattern, tk) is None]  # HTML entities
    tokens = [tk for tk in tokens if tk.lower() not in remove_terms]  # Special terms to remove

    # Step 2: Remove short words and numbers
    tokens = [tk for tk in tokens if len(tk) >= 3]  # Remove short words
    tokens = [tk for tk in tokens if re.match(number_pattern, tk) is None]  # Remove numbers

    # Step 3: Add bigrams
    tokens = [tk.lower() for tk in tokens]  # Lowercase
    bigrams_words = ['_'.join(w) for w in bigrams(tokens)]
    sentence_clean = ' '.join(tokens) + ' ' + ' '.join(bigrams_words)
    return sentence_clean
def BigramAll():
    to_save_folder = "./#Bigram[.]/"
    folder_list = os.listdir("./")
    for folder in folder_list:
        if "." in folder:
            continue
        folder_name = "./" + folder + "/"
        data_path = folder_name + "data.doc"
        fw = open(data_path, "r", encoding="utf8")
        text = fw.read()
        words = word_tokenize(text)
        big = list(bigrams(w for w in words if len(w) > 1 and w != "``"))
        myBig = []
        for bi in big:
            myBig.append(bi[0] + " " + bi[1])
        fdist = FreqDist(str(w) for w in myBig)
        keys = fdist.most_common()
        dataFreq = ""
        for key in keys:
            dataFreq += str(key[0]).strip() + "," + str(key[1]).strip() + "\n"
        make_sure_path_exists(to_save_folder + folder)
        writer = open(to_save_folder + folder + "/" + folder + "[bigram_Freq].csv",
                      "w+", encoding="utf8")
        writer.write(dataFreq)
        fw.close()
        writer.close()
def score_by_topic(pkg, scores):
    '''Examines the pkg and adds scores according to topics in it.'''
    themes = Themes.instance()
    for level in range(3):
        pkg_text = package_text(pkg, level)
        words, words_without_stopwords = normalize_text(pkg_text)
        for num_words in (1, 2, 3):
            if num_words == 1:
                ngrams = words_without_stopwords
                topic_ngrams = themes.topic_words
                topic_ngrams_set = themes.topic_words_set
            elif num_words == 2:
                # materialize the generator - it is consumed twice below
                # (set intersection and .count())
                ngrams = list(bigrams(words))
                topic_ngrams = themes.topic_bigrams
                topic_ngrams_set = themes.topic_bigrams_set
            elif num_words == 3:
                ngrams = list(trigrams(words))
                topic_ngrams = themes.topic_trigrams
                topic_ngrams_set = themes.topic_trigrams_set
            matching_ngrams = set(ngrams) & topic_ngrams_set
            if matching_ngrams:
                for ngram in matching_ngrams:
                    occurrences = ngrams.count(ngram)
                    score = (3 - level) * occurrences * num_words
                    theme = topic_ngrams[ngram]
                    ngram_printable = ' '.join(ngram) if isinstance(ngram, tuple) else ngram
                    reason = '"%s" matched %s' % (ngram_printable, LEVELS[level])
                    if occurrences > 1:
                        reason += ' (%s times)' % occurrences
                    scores[theme].append((score, reason))
                    log.debug(' %s %s %s', theme, score, reason)
def markov_model_classify(info, sentence):
    # log prior for each of the 5 sentiment classes
    prob = [math.log(x / info.total_examples, math.e) for x in info.sentiment_counts]
    tokens = tokenize(sentence)
    my_bigrams = bigrams(tokens)
    # chain rule: P(w1..wn) = P(w1) * prod_i P(w_{i+1} | w_i),
    # so only the first token contributes a unigram term
    token = tokens[0]
    for i in range(5):
        pToken = info.word_counts[i].get(token, 0) / info.total_words[i]
        if pToken == 0:
            prob[i] = prob[i] + math.log(OUT_OF_VOCAB_PROB, math.e)
        else:
            prob[i] = prob[i] + math.log(pToken, math.e)
    for bigram in my_bigrams:
        for i in range(5):
            counts = info.bigram_counts[i].get(bigram, 0)
            denoms = info.bigram_denoms[i].get(bigram[0], 0)
            if counts != 0:
                pBigram = counts / denoms
            else:
                pBigram = OUT_OF_VOCAB_PROB
            prob[i] = prob[i] + math.log(pBigram, math.e)
    return prob.index(max(prob)), max(prob)
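# A self-contained sketch of the bigram Markov chain scoring used above:
# log P(w1..wn) = log P(w1) + sum_i log P(w_{i+1} | w_i), with an
# out-of-vocabulary floor. The toy counts below are illustrative only.
import math
from nltk.util import bigrams

OUT_OF_VOCAB_PROB = 1e-8
word_counts = {'good': 3, 'movie': 2}
bigram_counts = {('good', 'movie'): 2}
bigram_denoms = {'good': 3}
total_words = 5

tokens = ['good', 'movie']
log_p = math.log(word_counts.get(tokens[0], 0) / total_words or OUT_OF_VOCAB_PROB)
for w1, w2 in bigrams(tokens):
    count = bigram_counts.get((w1, w2), 0)
    denom = bigram_denoms.get(w1, 0)
    log_p += math.log(count / denom if count else OUT_OF_VOCAB_PROB)
print(log_p)  # log(3/5) + log(2/3)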
def read_file(fileList):
    global G
    G = nx.Graph()
    # Internal mapping from region id to spreadsheet column
    region2column = {1: 1, 2: 4, 3: 7, 4: 10}
    for fn in fileList:
        wb = open_workbook(filename=fn)
        sheet = wb.sheet_by_name(args['sheetName'])
        columnNumber = region2column[args['partOfCountry']]
        # integer division keeps range() happy on Python 3
        last_row = min(sheet.nrows,
                       args['maxNrNames'] + (args['startNumber'] * len(fileList))) // len(fileList)
        for i in range(args['startNumber'], last_row):
            name = sheet.cell_value(i, columnNumber)
            freq = int(sheet.cell_value(i, columnNumber + 1))
            rank = int(sheet.cell_value(i, columnNumber - 1))
            # Prefix an underscore so the first letter gets its own bigram
            charBigrams = bigrams('_%s' % name)
            if not G.has_node(name):
                G.add_node(name, {'type': 'firstname', 'freq': freq,
                                  'rank': rank, 'size': int(log(freq)) * 2})
            for cb in charBigrams:
                if not G.has_node(cb):
                    G.add_node(cb, {'type': 'charbigram'})
                if not G.has_edge(name, cb):
                    G.add_edge(name, cb)
def markov_model_classify(info, sentence):
    tokens = tokenize(sentence)
    total_bigrams = list(bigrams(tokens))
    prob_log_mm = -10000
    class_chosen_mm = 0
    for sen in range(CLASSES):
        class_prob = float(info.sentiment_counts[sen] / info.total_examples)
        sen_word_counts = info.word_counts[sen]
        sen_word_total = info.total_words[sen]
        sen_word_bigram_counts = info.bigram_counts[sen]
        sen_word_denom = info.bigram_denoms[sen]
        con_prob = 0
        # Under the Markov chain only the first token contributes a unigram
        # term; the original looped over all tokens but broke out after the
        # first one in both branches.
        if tokens:
            first = tokens[0]
            if sen_word_counts.get(first) is None:
                con_prob += math.log(OUT_OF_VOCAB_PROB)
            else:
                con_prob += math.log(float(sen_word_counts[first] / sen_word_total))
        for bigram in total_bigrams:
            if sen_word_bigram_counts.get(bigram) is None:
                con_prob += math.log(OUT_OF_VOCAB_PROB)
            else:
                bigram_con_prob = float(
                    sen_word_bigram_counts[bigram] / sen_word_denom[bigram[0]])
                con_prob += math.log(bigram_con_prob)
        temp_mm = math.log(class_prob) + con_prob
        if temp_mm > prob_log_mm:
            prob_log_mm = temp_mm
            class_chosen_mm = sen
    return class_chosen_mm, prob_log_mm
def perplexity(self, sentence, method):
    """
    Compute the perplexity of a sentence given an estimation method

    You do not need to modify this code.
    """
    return 2.0 ** (-1.0 * mean([method(context, word) for context, word in
                                bigrams(self.tokenize_and_censor(sentence))]))
def get_ngram_tokens(self, line):
    tokens = nltk.wordpunct_tokenize(line)
    message = [self.stemmer.stem(x) for x in tokens if len(x) > 2 and x not in self.stops]
    # materialize the bigrams before appending: iterating the lazy generator
    # while growing `message` would never terminate
    for pair in list(bigrams(message)):
        message.append(" ".join(pair))
    return list(set(message))
def bigram_format(test_corpus):
    """
    >>> bigram_format(["the dog runs STOP", "the cat walks STOP", "the dog runs STOP"])
    [[('the', 'dog'), ('dog', 'runs'), ('runs', 'STOP')], [('the', 'cat'), ('cat', 'walks'), ('walks', 'STOP')], [('the', 'dog'), ('dog', 'runs'), ('runs', 'STOP')]]
    """
    wl = [sentence.split() for sentence in test_corpus]
    return [util.bigrams(l) for l in wl]
def bigramsPhi(comment):
    """The basis for a bigrams feature function: stems each token and
    counts bigrams over the padded token sequence."""
    sent = [stemmer.stem(tok) for tok in comment.split()]
    counts = Counter()
    sent = ["<<START>>"] + sent + ["<<END>>"]
    counts.update(bigrams(sent))
    return counts
def sentProbaility(self, sent, smooth_const):
    V = 217847  # vocabulary size used by the smoothing estimate
    tool = MyToolKit()
    # accumulate in log space and exponentiate once; the original
    # exp(log(p) + log(q)) round trip on every step was redundant
    log_p = 0.0
    for w1, w2 in bigrams(tool.words(sent)):
        log_p += math.log(self.LaplaceSmoothing(w2, w1, smooth_const, V))
    return math.exp(log_p)
def getAllBigramInfo(data_df, data_coln, target_coln):
    all_bigrams = []
    for doc in data_df[data_coln]:
        all_bigrams.append(list(bigrams(doc)))
    data_df['Bigrams'] = all_bigrams
    all_bigrams_flattened = flatten(all_bigrams)
    bigram_vocab = list(set(all_bigrams_flattened))
    vocab_df = pd.DataFrame()
    vocab_df['Bigrams'] = bigram_vocab
    data_df_1 = data_df[data_coln][data_df[target_coln] == 1]
    count_matrix_columns = data_df.index
    count_matrix = pd.DataFrame(0, columns=count_matrix_columns, index=bigram_vocab)
    data_df_columns = list(data_df.columns)
    del_columns = [x for x in data_df_columns if x != 'Bigrams']
    data_df.drop(del_columns, axis=1, inplace=True)
    for doc_id in count_matrix_columns:
        bigram_counter = dict(Counter(data_df['Bigrams'][doc_id]))
        for word in bigram_counter:
            # .at avoids chained assignment on the DataFrame
            count_matrix.at[word, doc_id] = bigram_counter[word]
    total_1_docs = len(data_df_1)
    all_1_docs = set(data_df_1.index)
    precision = []
    recall = []
    f1_score = []
    for word in bigram_vocab:
        # select the row by label and compare index labels (not positions)
        # against the label set all_1_docs
        t1 = count_matrix.loc[[word]].iloc[0]
        phrase_nonzero = set(t1[t1 > 0].index)
        r1_with_phrase = phrase_nonzero.intersection(all_1_docs)
        precision_of_doc = len(r1_with_phrase) / len(phrase_nonzero)
        recall_of_doc = len(r1_with_phrase) / total_1_docs
        try:
            # F1 is 2PR / (P + R); the original divided by 2(P + R)
            f1_score_of_doc = (2 * precision_of_doc * recall_of_doc) / (
                precision_of_doc + recall_of_doc)
        except ZeroDivisionError:
            f1_score_of_doc = -1
        precision.append(precision_of_doc)
        recall.append(recall_of_doc)
        f1_score.append(f1_score_of_doc)
    vocab_df['Precision'] = precision
    vocab_df['Recall'] = recall
    vocab_df['F1_score'] = f1_score
    return vocab_df
def crear_bigramas(texto):
    # `texto` is an already-tokenized text
    bigramas = list(bigrams(texto))
    # Filter bigrams by minimum token length; with threshold 1 this only
    # drops empty tokens (raise it to drop punctuation and special characters)
    threshold = 1
    bigramas_filtrados = [bigram for bigram in bigramas
                          if len(bigram[0]) >= threshold and len(bigram[1]) >= threshold]
    return bigramas_filtrados
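# A quick usage sketch; the function expects an already-tokenized text.
print(crear_bigramas(['el', 'gato', 'negro']))
# [('el', 'gato'), ('gato', 'negro')]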
def handleGrams(self, tokenList):
    res = []
    if self.unigrams:
        res.extend(tokenList)
    if self.bigrams:
        res.extend(bigrams(tokenList))
    if self.gappyBigrams:
        res.extend(self.gappy_bigrams(tokenList))
    return res
def process(self, filename):
    """Read a file and add its word bigrams to the index."""
    with open(filename) as in_file:
        self.content[filename] = in_file.read()
    words = self.content[filename].split(' ')
    grams = bigrams(words)
    self.add_grams(filename, grams)
def add_train(self, sentence):
    """
    Add the counts associated with a sentence.
    """
    # You'll need to complete this function, but here's a line of code that
    # will hopefully get you started.
    for context, word in bigrams(self.tokenize_and_censor(sentence)):
        pass
def brown_bigrams(category):
    """Takes as input the name of a brown category, and returns a list of
    all of the bigrams in the category."""
    words = ["<s>"]
    words += [word.lower() for word in brown.words(categories=category) if word.isalnum()]
    words.append("</s>")
    return list(bigrams(words))
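# A usage sketch, assuming the NLTK Brown corpus is available
# (run nltk.download('brown') on first use).
from nltk.corpus import brown
from nltk.util import bigrams

news_bigrams = brown_bigrams('news')
print(news_bigrams[:3])  # starts with ('<s>', ...)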
def check_intent(message):
    # character bigrams of the message; "검색" means "search"
    bigram = bigrams(message)
    bigram_tokens = []
    for item in bigram:
        bigram_tokens.append(''.join(item))
    if "검색" in bigram_tokens:
        app.INTENT_STATUS = 1
def get_feature_by_all_bigrams(self, bgs):
    bg_counts = list()
    for statuses in self._author_statuses:
        count = 0
        for status in statuses:
            for bg in bigrams(status):
                if bg in bgs:
                    count += 1
        bg_counts.append(count)
    return bg_counts
def classify_paras(paras, classifier):
    d = collections.defaultdict(list)
    for para in paras:
        words = [w.lower() for w in itertools.chain(*para)]
        # list() is required: a generator cannot be concatenated to a list
        feats = dict([(w, True) for w in words + list(bigrams(words))])
        label = classifier.classify(feats)
        d[label].append(" ".join(words))
    return d
def get_bigrams_frequency_dist(tokens):
    bigram_freq_dist = {}
    list_for_ngrams = get_list_for_ngrams(tokens)
    for bigram_tuple in bigrams(list_for_ngrams):
        # dict.get replaces the Python 2-only has_key check
        bigram_freq_dist[bigram_tuple] = bigram_freq_dist.get(bigram_tuple, 0) + 1
    return bigram_freq_dist
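# The hand-rolled counting above is equivalent to collections.Counter over
# the bigram stream; a minimal sketch, assuming get_list_for_ngrams from the
# snippet above is in scope.
from collections import Counter
from nltk.util import bigrams

def get_bigrams_frequency_dist_counter(tokens):
    return Counter(bigrams(get_list_for_ngrams(tokens)))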
def get_ngrams_frequency_dist(tokens):
    ngram_freq_dist = {}
    list_for_ngrams = get_list_for_ngrams(tokens)
    ngram_list = (list(bigrams(list_for_ngrams))
                  + list(ngrams(list_for_ngrams, 3))
                  + list(ngrams(list_for_ngrams, 4)))
    for ngram in ngram_list:
        # dict.get replaces the Python 2-only has_key check
        ngram_freq_dist[ngram] = ngram_freq_dist.get(ngram, 0) + 1
    return ngram_freq_dist
def _order_tags_by_sent(self):
    self.tokenized_content = self._tokenize_content()
    tags_into_tokenized_content = []
    ordered_tags_by_sent = []
    for sent in self.tokenized_content:
        tags_into_tokenized_content.append([tag for tag in sent if tag in TAGS])
    for tags_by_sent in tags_into_tokenized_content:
        ordered_tags_by_sent.append(list(OrderedSet(tags_by_sent)))
    return ordered_tags_by_sent
def tokenize(self, sentence, do_stopwords, do_stemming, use_bigrams):
    words = word_tokenize(sentence)
    words = [w.lower() for w in words if len(w) > 2]
    if do_stopwords:
        words = [w for w in words if w not in stop_set]
    if do_stemming:
        stemmer = PorterStemmer()
        words = [stemmer.stem(w) for w in words]
    if use_bigrams:
        # return a list, not the lazy generator, so callers can reuse it
        words = list(bigrams(words))
    return words
def is_gift_card_page(self, tokenList):
    # TODO This is duplicative - you need to remove this function
    lower_case_tokens = [self.make_lower_case_without_punctuation(w) for w in tokenList]
    bigramList = util.bigrams(lower_case_tokens)
    # this needs to be refined to pull out any punctuation
    return ("gift", "card") in bigramList
def cal_bigram_probability(fileName, sentence1, words):
    sentence_tokens = word_tokenize(sentence1.lower())
    bigram_probability = 1.0
    unigram_counter = FreqDist(words)
    bigram_counter = FreqDist(bigrams(words))
    for index, items in enumerate(sentence_tokens):
        n = index + 1
        if n < len(sentence_tokens):
            # P(w_n | w_{n-1}) = count(w_{n-1}, w_n) / count(w_{n-1})
            bigram_probability_pair = (float(bigram_counter[items, sentence_tokens[n]])
                                       / float(unigram_counter[items]))
            print("P(" + sentence_tokens[n] + "|" + items + ") = " + str(bigram_probability_pair))
            bigram_probability *= bigram_probability_pair
    print(str(bigram_probability))
    return bigram_probability
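# A toy usage sketch for cal_bigram_probability. `words` is the training token
# list; the fileName argument is unused by the function, so any placeholder works.
words = ['i', 'like', 'nlp', 'i', 'like', 'ml']
p = cal_bigram_probability('corpus.txt', 'I like NLP', words)
# P(like|i) = 2/2, P(nlp|like) = 1/2, so p == 0.5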
def testFunc():
    from nltk.probability import FreqDist
    from nltk.util import bigrams

    fw = open("./MZI/data.doc", "r", encoding="utf8")
    text = fw.read()
    tokens = getWordList(text)
    print(len(set(tokens)))
    fdist = FreqDist(w for w in tokens if len(w) > 1)
    fdist.tabulate(50)
    big = list(bigrams(w for w in tokens if len(w) > 1))
    print(big[:100])
    fdist = FreqDist(str(w) for w in big)
    fdist.tabulate(10)
    fdist.plot(50)
def __init__(self, corpra, *args, **kwargs):
    # `basestring` makes this Python 2 code; use str on Python 3
    if isinstance(corpra, basestring):
        self.words = nltk.word_tokenize(corpra)
    else:
        self.words = corpra.words(*args, **kwargs)
    self.pos_words = nltk.pos_tag(self.words)
    # keep a list: ConditionalFreqDist would otherwise exhaust the bigram
    # generator, leaving self.bgrams empty for later use
    self.bgrams = list(bigrams(self.pos_words))
    self.freqdist = nltk.ConditionalFreqDist(self.bgrams)
    self.word_dict = {}
    self.pos_dict = {}
    self.punct = [',', '.', '!', ':', ';', '?', '--', '-', '"', "'", 's', '$', ',"']
    self.word_dict_builder()
def norm_words(words):
    if not args.no_lowercase:
        words = [w.lower() for w in words]
    if not args.punctuation:
        words = [w.strip(string.punctuation) for w in words]
        words = [w for w in words if w]
    if stopset:
        words = [w for w in words if w.lower() not in stopset]
    if args.bigrams:
        # list() is required: a generator cannot be added to a list
        return words + list(bigrams(words))
    else:
        return words
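# A hypothetical harness for norm_words: the function reads the module-level
# argparse namespace `args` and the set `stopset` (and uses string.punctuation),
# so a sketch has to provide all three.
import argparse
import string
from nltk.util import bigrams

args = argparse.Namespace(no_lowercase=False, punctuation=False, bigrams=True)
stopset = {'the'}
print(norm_words(['The', 'cat', 'sat.']))
# ['cat', 'sat', ('cat', 'sat')]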