def create_tri_model(span_eng_dict):
    trigram_span_dict = collections.defaultdict(lambda: 0)
    trigram_eng_dict = collections.defaultdict(lambda: 0)
    trigram_span_eng_dict = collections.defaultdict(lambda: tuple)
    text = codecs.open('SpanishText.txt', encoding='utf-8')
    for sentence in text.readlines():
        line = [re.sub('[.?!",]', '', word) for word in sentence.split()]
        for word1, word2, word3 in trigrams(line):
            trigram_span_dict[(word1.lower(), word2.lower(), word3.lower())] += 1
        for word1, word2, word3 in trigrams(line):
            #print(word1, span_eng_dict[word1])
            trigram_span_eng_dict[(word1.lower(), word2.lower(), word3.lower())] = (
                span_eng_dict[word1.lower()],
                span_eng_dict[word2.lower()],
                span_eng_dict[word3.lower()])
    eng_text = open('DMT_output.txt')
    for sentence in eng_text.readlines():
        line = [re.sub('[.?!",]', '', word) for word in sentence.split()]
        for word1, word2, word3 in trigrams(line):
            trigram_eng_dict[(word1.lower(), word2.lower(), word3.lower())] += 1
    text = ''
    for k, v in trigram_span_eng_dict.items():
        try:
            if (trigram_span_dict.get(k) == trigram_eng_dict.get(v)) and (
                    trigram_span_dict.get(k) >= 1):
                #print(k, v)
                text += k[0] + k[1]
        except:
            pass
    # return the accumulated text (the original bare `return` discarded the result)
    return text
def score_by_topic(pkg, scores): '''Examines the pkg and adds scores according to topics in it.''' themes = Themes.instance() for level in range(3): pkg_text = package_text(pkg, level) words, words_without_stopwords = normalize_text(pkg_text) for num_words in (1, 2, 3): if num_words == 1: ngrams = words_without_stopwords topic_ngrams = themes.topic_words topic_ngrams_set = themes.topic_words_set elif num_words == 2: ngrams = bigrams(words) topic_ngrams = themes.topic_bigrams topic_ngrams_set = themes.topic_bigrams_set elif num_words == 3: ngrams = trigrams(words) topic_ngrams = themes.topic_trigrams topic_ngrams_set = themes.topic_trigrams_set matching_ngrams = set(ngrams) & topic_ngrams_set if matching_ngrams: for ngram in matching_ngrams: occurrences = ngrams.count(ngram) score = (3-level) * occurrences * num_words theme = topic_ngrams[ngram] ngram_printable = ' '.join(ngram) if isinstance(ngram, tuple) else ngram reason = '"%s" matched %s' % (ngram_printable, LEVELS[level]) if occurrences > 1: reason += ' (%s times)' % occurrences scores[theme].append((score, reason)) log.debug(' %s %s %s', theme, score, reason)
def act(self):
    """
    Add words in the last observation to the dictionary.

    This checks any fields in the message present in the --dict-textfields
    argument (e.g. "text,labels").
    """
    for textfield in self.textfields:
        source = self.observation.get(textfield)
        if source is None:
            continue
        # fields may be singleton strings or lists of strings.
        # wrap the singleton strings in a list to iterate over them
        if type(source) is str:
            source = [source]
        for text in source:
            if text:
                tokens = self.tokenize(text)
                self.add_to_dict(tokens)
                unigram_ = nltk.ngrams(tokens, 1)
                bigrams_ = bigrams(tokens)
                trigrams_ = trigrams(tokens)
                self.unigram_freq.update(unigram_)
                self.bigram_freq.update(bigrams_)
                self.trigram_freq.update(trigrams_)
    return {'id': 'Dictionary'}
def trigram_format(test_corpus):
    """
    >>> trigram_format(["the dog runs STOP", "the cat walks STOP", "the dog runs STOP"])
    [[('the', 'dog', 'runs'), ('dog', 'runs', 'STOP')], [('the', 'cat', 'walks'), ('cat', 'walks', 'STOP')], [('the', 'dog', 'runs'), ('dog', 'runs', 'STOP')]]
    """
    wl = [[word for word in sentence.split()] for sentence in test_corpus]
    return [util.trigrams(l) for l in wl]
def generate_unibitrigrams(key_score_file):
    with open(key_score_file, 'rb') as infile:
        infile.readline()
        key_list = list()
        for line in infile:
            row = list(line.split(','))
            key_list.append(row[0])
    uni_bi_trigrams = []
    for phrase in key_list:
        words = []
        unigrams_ls = []
        bigrams_ls = []
        trigrams_ls = []
        for word in nltk.word_tokenize(phrase):
            word = re.sub('[!"#$%&\'\(\)*+,-./:;<=>?@[\]\^_`{|}~]', '', word)
            words.append(word)
        unigrams_ls = words
        #bigrams_ls=list(bigrams(words))
        for x in list(bigrams(words)):
            bigrams_ls.append(x[0] + ' ' + x[1])
        for x in list(trigrams(words)):
            trigrams_ls.append(x[0] + ' ' + x[1] + ' ' + x[2])
        #trigrams_ls=list(trigrams(words))
        uni_bi_trigrams = uni_bi_trigrams + unigrams_ls + bigrams_ls + trigrams_ls
    return uni_bi_trigrams
def ngrams(self, gram_size=3): """Gives ngrams. Returns a list of ngrams, each ngram represented as a tuple. Args: gram_size (:obj:`int`, optional) Size of the ngrams to generate Returns: :obj:`list` of :obj:`tuple` Words of each ngram Example: >>> text = EnglishText('They hated to think of sample sentences.') >>> basic_ngrams = text.ngrams() >>> print(basic_ngrams) [('They', 'hated', 'to'), ('hated', 'to', 'think'), ('to', 'think', 'of'), ('think', 'of', 'sample'), ('of', 'sample', 'sentences'), ('sample', 'sentences', '.')] """ # noqa tokens = self.tokenize() if gram_size < 2: # pragma: no cover gram_size = 2 if gram_size == 2: # pragma: no cover return list(bigrams(tokens)) if gram_size == 3: return list(trigrams(tokens)) else: # pragma: no cover return list(ngrams(tokens, gram_size))
def pos_tags(vocab_hash, sentence):
    sentence = sentence.split()
    unigram_hash = get_pos(vocab_hash, sentence)
    bigram_hash = get_pos(vocab_hash, bigrams(sentence))
    trigram_hash = get_pos(vocab_hash, trigrams(sentence))
    pos_tags = []
    ngram_to_tag = {}
    ngram_ordering = []
    # use a while loop so that skipping over the extra words of a matched
    # bigram/trigram actually takes effect (reassigning the index inside a
    # "for i in range(...)" loop has no effect)
    i = 0
    while i < len(sentence):
        word = sentence[i]
        if word in unigram_hash:
            tag = unigram_hash[word]
            pos_tags.append(tag)
            ngram_to_tag[word] = tag
            ngram_ordering.append(word)
        elif i < len(sentence) - 1:
            bigram = sentence[i] + " " + sentence[i + 1]
            if bigram in bigram_hash:
                tag = bigram_hash[bigram]
                pos_tags.append(tag)
                ngram_to_tag[bigram] = tag
                ngram_ordering.append(bigram)
                i += 1
        elif i < len(sentence) - 2:
            # a trigram spans three words, not two
            trigram = " ".join(sentence[i:i + 3])
            if trigram in trigram_hash:
                tag = trigram_hash[trigram]
                pos_tags.append(tag)
                ngram_to_tag[trigram] = tag
                ngram_ordering.append(trigram)
                i += 2
        i += 1
    return pos_tags, ngram_to_tag, ngram_ordering
def __init__(self, index: int, sent: str, start: int, end: int):
    self.index = index
    self.sent = sent
    self.words = self.sentToWords()
    self.nGrams = list(trigrams(self.words))
    self.start = start
    self.end = end
def text():
    testo = [x for x in request.form.values()]
    input_text = testo[0]
    input_text = re.sub(r'\n', '', input_text)
    input_text = re.sub(r'\t', '', input_text)
    tok_test = word_tokenize(input_text.lower())
    trig = list(trigrams(tok_test))
    vocab_file = pd.read_csv('DIZIONARIO.csv', sep=';')
    vocab = list(vocab_file['TOKEN'])
    model = pickle.load(open('langmod.pickle', 'rb'))
    err = 0
    for t in trig:
        if t[2] not in vocab:
            err += 1
            continue
        if model.score(t[2], [t[0], t[1]]) == 0:
            err += 1
    # The user-facing messages below are Italian; roughly:
    # <=2 errors: "there seem to be no spelling errors typical of dysorthography";
    # 3-5 errors: "there may be some spelling errors suggestive of dysorthography;
    #              check doubled letters, swapped/inserted/shifted letters or
    #              syllables, the H in verbs, and accents";
    # >5 errors:  "quite a few dysorthography-typical errors; after checking other
    #              texts, consult a specialist".
    if err <= 2:
        punteggio = 'Sembra che non ci siano errori ortografici tipici della disortografia :-)'
    elif err > 2 and err <= 5:
        punteggio = 'Forse c\'è qualche errore ortografico che potrebbe far pensare alla presenza di disortografia. Controlla le doppie, lo scambio, l\'inserimento o la traslazione di lettere/sillabe, l\'H nei verbi e gli accenti!'
    elif err > 5:
        punteggio = 'Gli errori ortografici sembrano essere un po\' tantini e tipici della disortografia! Non preoccuparti, respira e - dopo aver controllato altri testi - consulta uno specialista :-)'
    return render_template('elements.html', score=punteggio)
def __init__(self, index: int, sent: str, start: int, end: int, lang: int):
    self.lang = LangDiff(lang)
    self.index = index
    self.sent = sent
    self.words = self.remove_puncts(self.lang.word_tokenize(sent))
    self.nGrams = Counter(trigrams(self.sent_to_words()))
    self.start = start
    self.end = end
def pre_trigram(texto):
    lista = []
    for x in texto:
        if len(x) < 4:
            lista.append(x)
        else:
            # character trigrams of the item, as a tuple of 3-character tuples
            lista.append(tuple(trigrams(x)))
    return lista
def wordsToTrigramsWithIndices(self, dictionary):
    def getIndexedTuple(word: str):
        index = -1
        if word in dictionary.wordsToIndices:
            index = dictionary.wordsToIndices[word]
        return (index, word)

    return list(trigrams(list(map(getIndexedTuple, self.words))))
def brown_trigrams(category):
    """Takes as input the name of a brown category, and returns a list of
    all of the trigrams in the category."""
    words = ["<s>"]
    words += [
        word.lower() for word in brown.words(categories=category)
        if word.isalnum()
    ]
    words.append("</s>")
    return list(trigrams(words))
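# Usage sketch (not from the original source): assuming brown_trigrams above and the
# NLTK Brown corpus are available, the trigram list can feed a ConditionalFreqDist
# keyed on the first two words; "news" is one of the standard Brown categories.
import nltk

def most_likely_continuations(category="news", context=("of", "the"), n=3):
    cfd = nltk.ConditionalFreqDist(
        ((w1, w2), w3) for w1, w2, w3 in brown_trigrams(category))
    # the n most frequent words following the two-word context in this category
    return cfd[context].most_common(n)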
def clean_up_sentence(sentence):
    # represent the input as its trigrams, each trigram joined into one string
    sentence_words = []
    w = list(trigrams(sentence))
    for x in w:
        sentence_words.append(x[0] + x[1] + x[2])
    return sentence_words
def getNgrams(self):
    """Get ngrams from the question.
    Right now only bigrams and trigrams are supported"""
    bigram_str = [
        bigram[0] + ' ' + bigram[1] for bigram in bigrams(self.tokens)
    ]
    trigram_str = [
        trigram[0] + ' ' + trigram[1] + ' ' + trigram[2]
        for trigram in trigrams(self.tokens)
    ]
    return (bigram_str, trigram_str)
def main():
    save_data_from_webpage()
    text = get_data_from_file()
    # creates a list of the tokenized words
    tt = word_tokenize(text)
    pprint(tt)
    # creates a new list of stemmed words for each of the stemmers
    psteam = PorterStemmer()
    psteam_list = []
    for word in tt:
        psteam_list.append(psteam.stem(word))
    pprint(psteam_list)
    lsteam = LancasterStemmer()
    lsteam_list = []
    for word in tt:
        lsteam_list.append(lsteam.stem(word))
    pprint(lsteam_list)
    ssteam = SnowballStemmer('english')  # SnowballStemmer requires a language
    ssteam_list = []
    for word in tt:
        ssteam_list.append(ssteam.stem(word))
    pprint(ssteam_list)
    p = set(psteam_list)
    l = set(lsteam_list)
    s = set(ssteam_list)
    # displays the different stems
    pprint(s.difference(l.difference(p)))
    # POS tagging (pos_tag expects a token list, not the raw string)
    pos_list = pos_tag(tt)
    pprint(pos_list)
    # creates a new list for the lemmatized words
    lemmatizer = WordNetLemmatizer()
    lem = []
    for word in tt:
        lem.append(lemmatizer.lemmatize(word))
    #pprint(lem)
    # returns a generator of trigrams using the tokenized list tt
    trig = trigrams(tt)
    # displays the results
    print(list(trig))
    # ne_chunk finds non-overlapping groups
    # pos_tag identifies how the text is used in speech
    NamedEntity = ne_chunk(pos_tag(wordpunct_tokenize(text)))
    print(NamedEntity)
def _get_filtered_trigrams(self, words):
    # Allow stopword in the middle of trigram
    filtered_trigrams = []
    for tri in trigrams(words):
        leave = True
        for i, w in enumerate(tri):
            if w in stopwords and i != 1:
                leave = False
                break
        if leave and tri[0] != tri[1] and tri[1] != tri[2]:
            filtered_trigrams.append(tri)
    return filtered_trigrams
def __init__(self, geo_locations):
    '''Initializes the language model by creating the ConditionalFreqDist
    and ConditionalProbDist'''
    words_count = 0
    # will contain all names in a list which preserves their frequencies as
    # they appear in the gazetteer. The frequencies are going to be used in
    # the language model.
    gaz_n_grams = list()
    self.unigrams = defaultdict(int)
    for ln in geo_locations:
        number_of_mentions = len(geo_locations[ln])
        n_gram = ln.split()
        new_list = [n_gram] * number_of_mentions
        gaz_n_grams.extend(new_list)
        for token in n_gram:
            words_count += 1
            self.unigrams[token] += 1
    self.unigrams = {"words": self.unigrams, "words_count": words_count}

    # bigrams +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    train_bigrams = list(chain(*[bigrams(i) for i in gaz_n_grams]))
    cfd_bigrams = ConditionalFreqDist()
    for bg in train_bigrams:
        cfd_bigrams[bg[0]][bg[1]] += 1
    # bigrams MLE probabilities
    self.cpd_bigrams = ConditionalProbDist(cfd_bigrams, nltk.MLEProbDist)

    # trigrams ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    train_trigrams = list(chain(*[trigrams(i) for i in gaz_n_grams]))
    cfd_trigrams = ConditionalFreqDist()
    for bg in train_trigrams:
        bi_gr = " ".join(bg[:-1])
        cfd_trigrams[bi_gr][bg[2]] += 1
    # trigrams MLE probabilities
    self.cpd_trigrams = ConditionalProbDist(cfd_trigrams, nltk.MLEProbDist)
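# Usage sketch (an assumption about how the model above is queried, not code from the
# original class): ConditionalProbDist entries expose .prob(), so a multi-word place
# name can be scored under the bigram MLE model. `lm` stands for an instance of the
# (unnamed here) language-model class.
def bigram_name_score(lm, name):
    tokens = name.split()
    prob = 1.0
    for w1, w2 in zip(tokens, tokens[1:]):
        prob *= lm.cpd_bigrams[w1].prob(w2)  # P(w2 | w1); 0.0 for unseen pairs
    return prob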
def _eval_sent_entropy(self, sents, human=''): for i in range(len(sents)): sent = sents[i] sent_tokens = sent.split() unigrams_ = nltk.ngrams(sent_tokens, 1) bigrams_ = bigrams(sent_tokens) trigrams_ = trigrams(sent_tokens) prob_unigrams = [ self.dict.unigram_freq[uni_tok] / self.total_unigrams for uni_tok in unigrams_ ] prob_bigrams = [ self.dict.bigram_freq[bi_tok] / self.total_bigrams for bi_tok in bigrams_ ] prob_trigrams = [ self.dict.trigram_freq[tri_tok] / self.total_trigrams for tri_tok in trigrams_ ] # smoothing zero values prob_unigrams = np.asarray( [p if p > 0 else 1 for p in prob_unigrams]) prob_bigrams = np.asarray( [p if p > 0 else 1 for p in prob_bigrams]) prob_trigrams = np.asarray( [p if p > 0 else 1 for p in prob_trigrams]) sent_entropy_uni = -np.sum(np.log2(prob_unigrams)) sent_entropy_bi = -np.sum(np.log2(prob_bigrams)) sent_entropy_tri = -np.sum(np.log2(prob_trigrams)) word_entropy_uni = sent_entropy_uni / (len(prob_unigrams) + sys.float_info.epsilon) word_entropy_bi = sent_entropy_bi / (len(prob_bigrams) + sys.float_info.epsilon) word_entropy_tri = sent_entropy_tri / (len(prob_trigrams) + sys.float_info.epsilon) self.metrics[human + 'sent_entropy_uni_cnt'] += 1 self.metrics[human + 'sent_entropy_uni'] += sent_entropy_uni self.metrics[human + 'sent_entropy_bi_cnt'] += 1 self.metrics[human + 'sent_entropy_bi'] += sent_entropy_bi self.metrics[human + 'sent_entropy_tri_cnt'] += 1 self.metrics[human + 'sent_entropy_tri'] += sent_entropy_tri self.metrics[human + 'word_entropy_uni_cnt'] += 1 self.metrics[human + 'word_entropy_uni'] += word_entropy_uni self.metrics[human + 'word_entropy_bi_cnt'] += 1 self.metrics[human + 'word_entropy_bi'] += word_entropy_bi self.metrics[human + 'word_entropy_tri_cnt'] += 1 self.metrics[human + 'word_entropy_tri'] += word_entropy_tri
def _create_objects_interface(self):
    formated_objects_interface = []
    specification = []
    for sent in self._exchange_states():
        for chunk in sent:
            if chunk[0] == 'specification':
                specification.append(trigrams([
                    chunk[1], chunk[-2],
                    self._convert_to_yakindu_type(type(chunk[-1]).__name__)]))
    default_specification = list(OrderedSet(chain(*specification)))
    objects_specification = modified_groupby(default_specification,
                                             key=lambda obj: obj[0])
    for obj, specification_chunks in objects_specification.items():
        formated_objects_interface.append('\n\ninterface ' + obj + ':')
        for chunk in specification_chunks:
            formated_objects_interface.append('\nvar ' + chunk[-2] + ':' + chunk[-1])
    return ''.join(formated_objects_interface)
def __call__(self, t):
    t = self.reduce_lengthening(t)
    tokens = t.split(' ')
    cleaned_tokens = []
    for token in tokens:
        token = self.replace_username(token)
        token = self.replace_link(token)
        cleaned_tokens.append(token)
    rebuild_str = ' '.join(cleaned_tokens)
    negated_tokens = mark_negation(list(self.tknzr.tokenize(rebuild_str)))
    list_of_trigrams = list([' '.join(s) for s in trigrams(negated_tokens)])
    return list_of_trigrams
def trigram_plot(l):
    list_trigrams = list(trigrams(l))
    dictionary_trigram = {}
    for i in range(len(list_trigrams)):
        dictionary_trigram[list_trigrams[i]] = 0
    for i in range(len(list_trigrams)):
        dictionary_trigram[list_trigrams[i]] += 1
    plus = 0
    for i in dictionary_trigram.values():
        plus += i
    # `plus` is the total number of trigram tokens, not the number of unique trigrams
    print("total trigrams are:", end="")
    print(plus)
    dictionary_trigram = dict(
        sorted(dictionary_trigram.items(), key=lambda x: x[1], reverse=True))
    count = 0
    pdf = 0.0
    for key, i in dictionary_trigram.items():
        pdf += i / plus
        count += 1
        if pdf > 0.7:
            break
    print("total trigrams required to cover 70% of the complete corpus:", end="")
    print(count)
    print(pdf)
    threshold = 45
    for key in dictionary_trigram.copy():
        if dictionary_trigram[key] < threshold:
            dictionary_trigram.pop(key)
    keys_trigram, values_trigram = dictionary_trigram.keys(), dictionary_trigram.values()
    keys_trigram = list(keys_trigram)
    ls = []
    for i in keys_trigram:
        t = ' '.join(i)
        ls.append(t)
    print("total trigrams taken for plotting purpose:", end="")
    print(len(ls))
    plt.loglog(tuple(ls), tuple(values_trigram), color='g')
    plt.xticks(range(len(ls)), ls, rotation=90)
    plt.xlabel('trigram')
    plt.ylabel('trigram count')
    plt.xscale('log')
    # save before show(): show() clears the current figure in many backends
    plt.savefig('trigram')
    plt.show()
def getTrigramsDistributionFromText(txt):
    trigrm = list(trigrams(txt.split()))
    # print(trigrm)
    trigramWords = ', '.join(' '.join((a, b, c)) for a, b, c in trigrm)
    dictResTri = {}
    for tri in trigramWords.split(","):
        tri = tri.lstrip()
        tri = tri.rstrip()
        if tri in dictTrigrams:
            # print(tri, dictTrigrams[tri])
            dictResTri[tri] = dictTrigrams[tri]
        # else:
        #     # print("NA")
        #     dictResTri[tri] = 0
    return (sorted(dictResTri.items(), key=lambda x: x[1], reverse=True))
def write_trigrams(words, name, minlength, count):
    print("Finding " + name)
    stopfilter = lambda w: len(w) < minlength or w in stop_numbers + stop_common + ['e', 'i']
    collocs = Counter(trigrams(words))
    collocs = Counter({
        key: val
        for key, val in collocs.items()
        if not stopfilter(key[0]) and not stopfilter(key[1]) and not stopfilter(key[2])
    })
    collocs = collocs.most_common(count)
    f = open(name + '.csv', 'w', encoding="utf-8")
    for word, val in collocs:
        f.write(u'{},{},{},{}\n'.format(word[0], word[1], word[2], val))
    f.close()  # actually call close(); the bare `f.close` did nothing
def tuple_to_ngrams(tuple_with_words: tuple[str], n=2) -> tuple[tuple[str]]:
    """
    Take a tuple with words and convert this tuple to a tuple with n grams (bi-/trigrams)
    :param tuple_with_words:
    :param n:
    :return:
    """
    # make bigrams and trigrams and store them in a dictionary
    nGrams: dict[int, tuple[tuple]] = {
        2: tuple(bigrams(tuple_with_words)),
        3: tuple(trigrams(tuple_with_words))
    }
    # return a tuple that contains tuples of size n
    return nGrams[n]
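# Usage sketch for the helper above: only n=2 and n=3 are covered by the lookup dict.
example_words = ("new", "york", "city", "hall")
print(tuple_to_ngrams(example_words, n=2))
# (('new', 'york'), ('york', 'city'), ('city', 'hall'))
print(tuple_to_ngrams(example_words, n=3))
# (('new', 'york', 'city'), ('york', 'city', 'hall'))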
def processReview_trigram(self, review):
    review_text = self.stage2.removePunctuations(review["review"])
    if self.const.GENERATE_TRIGRAMS_WITH_STOP_WORDS:
        pass
    else:
        review_text = self.removeStopWordsFromReview(review_text)
    review_text = review_text.lower()
    tokens = review_text.split(" ")
    trigram_list = trigrams(tokens)
    lst = []
    for trigram in trigram_list:  # if self.string_found(bigram)]
        first_word = trigram[0].strip()
        second_word = trigram[1].strip()
        third_word = trigram[2].strip()
        if "" == first_word or "" == second_word or "" == third_word:
            pass
        else:
            lst.append({"word": first_word + " " + second_word + " " + third_word})
    return lst
def profile(self, text):
    ''' Create FreqDist of trigrams within text '''
    from nltk import word_tokenize, FreqDist
    clean_text = self.remove_punctuation(text)
    tokens = word_tokenize(clean_text)

    fingerprint = FreqDist()
    for t in tokens:
        token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
        token_trigrams = [''.join(tri) for tri in token_trigram_tuples]
        for cur_trigram in token_trigrams:
            if cur_trigram in fingerprint:
                fingerprint[cur_trigram] += 1
            else:
                fingerprint[cur_trigram] = 1
    return fingerprint
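# Sketch of one way the character-trigram fingerprints above could be compared (an
# assumption about intended use, in the spirit of a Cavnar & Trenkle out-of-place
# distance); fp_a and fp_b are FreqDist objects returned by profile().
def fingerprint_distance(fp_a, fp_b, top_n=300):
    ranks_b = {tri: rank for rank, (tri, _) in enumerate(fp_b.most_common(top_n))}
    distance = 0
    for rank_a, (tri, _) in enumerate(fp_a.most_common(top_n)):
        # trigrams missing from the other fingerprint get the maximum penalty
        distance += abs(rank_a - ranks_b[tri]) if tri in ranks_b else top_n
    return distance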
def get_trigram_word_dict(emotion_line):
    words = nltk.word_tokenize(emotion_line)
    word_bigrams = list(bigrams(words))
    word_trigrams = list(trigrams(words))
    word_feats = {}
    for w in words:
        if w not in word_feats:
            word_feats[w] = "feature_word"
    for w in word_bigrams:
        if w not in word_feats:
            word_feats[w] = "feature_word"
    for w in word_trigrams:
        if w not in word_feats:
            word_feats[w] = "feature_word"
    return word_feats
def buildGraphSentence(self, sentence): nodes = list() #unograms possible_unograms = sentence possible_unograms = [ uno for uno in possible_unograms if uno.lower() not in self.stoplist ] nodes = nodes + possible_unograms #bigrams possible_bigrams = list(bigrams(sentence)) possible_bigrams = [ bi for bi in possible_bigrams if (bi[0].lower() not in self.stoplist and bi[1].lower() not in self.stoplist) ] possible_bigrams = [' '.join(bi) for bi in possible_bigrams] nodes = nodes + possible_bigrams #trigrams possible_trigrams = list(trigrams(sentence)) possible_trigrams = [ tri for tri in possible_trigrams if (tri[0].lower() not in self.stoplist and tri[1].lower() not in self.stoplist and tri[2].lower() not in self.stoplist) ] possible_trigrams = [' '.join(tri) for tri in possible_trigrams] nodes = nodes + possible_trigrams #print(nodes) #add nodes for node in nodes: self.graph.add_node(node) #print(self.graph.nodes) #add edges for node in nodes: for node2 in nodes: if node != node2: if self.graph.has_edge(node, node2): if self.NofCooc: self.graph[node][node2]['weight'] += 1 else: self.graph.add_edge(node, node2) self.graph[node][node2]['weight'] = 1
def __init__(self, work_dir): tokens = [] sentences = [] paragraphs = [] paragraph_sentence_length = [] first_word = [] for path_to_text, _, text_file_names in os.walk(work_dir): for text_file_name in text_file_names: text_file = file(os.path.join(path_to_text, text_file_name)) if not os.path.isfile(os.path.join(path_to_text, text_file_name)): continue file_content = text_file.read().decode("utf8") print text_file_name text_paragraphs = nltk.blankline_tokenize(file_content) paragraphs += text_paragraphs self._sent_tokenizer = nltk.tokenize.PunktSentenceTokenizer(file_content) for paragraph in text_paragraphs: paragraph_sentence = self._sent_tokenizer.tokenize(paragraph) paragraph_sentence_length.append(len(paragraph_sentence)) sentences += paragraph_sentence for sentence in paragraph_sentence: sentence_tokens = nltk.word_tokenize(sentence) tokens += sentence_tokens first_word.append(sentence_tokens[0]) self._trigram_pd = nltk.ConditionalProbDist( nltk.ConditionalFreqDist([(t[:2], t[2]) for t in trigrams(tokens)]), nltk.probability.ELEProbDist ) self._bigram_pd = nltk.ConditionalProbDist( nltk.ConditionalFreqDist([(t[:1], t[1]) for t in bigrams(tokens)]), nltk.probability.ELEProbDist ) self._sent_begin_pd = nltk.ELEProbDist(nltk.FreqDist(first_word)) self._paragraph_length_pd = nltk.ELEProbDist(nltk.FreqDist(paragraph_sentence_length))
def get_letters():
    data_rinat = open('letters_rinat.json')
    data_vladimir = open('letters_vladimir.json')
    root_data = json.loads(data_rinat.readlines()[0]) + json.loads(
        data_vladimir.readlines()[0])
    # print(data)
    root_letters = list()
    root_trigrams = dict()
    for i in root_data:
        # current_index = i.index(root_data)
        get_clean_text = (re.split('\W+', ((i['coverLetter']))))  # clean up the text
        # temp.append(list(trigrams(get_clean_text)))  # build trigrams and add them to the list
        # print(temp)
        for grams in list(trigrams(get_clean_text)):
            root_letters.append(' '.join(grams))
    return collections.Counter(root_letters)
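# Usage sketch: get_letters() returns a collections.Counter of space-joined word
# trigrams, so the most common cover-letter phrases can be listed directly
# (assuming the two JSON files referenced above are present).
letter_trigrams = get_letters()
for phrase, count in letter_trigrams.most_common(10):
    print(count, phrase)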
def _create_states_specification(self): states = [] specification = [] states_specification_content = [] for sent in self._exchange_states(): specification.append([list(chain(*trigrams([chunk[1] + '{0}', chunk[-2] + \ ' {1} ', str(chunk[-1]).lower() + '{2}{3}']))) \ for chunk in sent if chunk[0] == 'specification']) while [] in specification: specification.remove([]) for spec in specification: spec[0].insert(0, 'entry/\n') spec[-1][-1] = spec[-1][-1].rstrip('{2}{3}') states_specification = dict(izip(self._get_states_content(), specification)) for state, specification in states_specification.items(): states_specification[state] = list(chain(*specification)) states_specification[state].insert(0, '"') states_specification[state].append('"') for state, specification in states_specification.items(): states_specification[state] = ''.join(specification).format('.', '=', ';', '\n') return states_specification
def findBM25Terms(self): allterms = set() for document in self.dswa: for sentence in document: nodes = list() #unograms possible_unograms = sentence possible_unograms = [ uno for uno in possible_unograms if uno.lower() not in self.stoplist ] nodes = nodes + possible_unograms #bigrams possible_bigrams = list(bigrams(sentence)) possible_bigrams = [ bi for bi in possible_bigrams if (bi[0].lower() not in self.stoplist and bi[1].lower() not in self.stoplist) ] possible_bigrams = [' '.join(bi) for bi in possible_bigrams] nodes = nodes + possible_bigrams #trigrams possible_trigrams = list(trigrams(sentence)) possible_trigrams = [ tri for tri in possible_trigrams if (tri[0].lower() not in self.stoplist and tri[1].lower() not in self.stoplist and tri[2].lower() not in self.stoplist) ] possible_trigrams = [ ' '.join(tri) for tri in possible_trigrams ] nodes = nodes + possible_trigrams #print(nodes) #add nodes for node in nodes: allterms.add(node) return BM25Calculator(self.dswa, allterms)
def get_keywords(sentence, allowed_tags):
    sentence = _remove_by_regex(_replace_punct(sentence))
    tokens = nltk.word_tokenize(sentence)
    tokens = [token.strip("'") for token in tokens]
    tagged_tokens = nltk.pos_tag(tokens)
    stop_words = get_stop_words('en')
    stop_words = {word.decode('utf-8') for word in stop_words}
    stop_words |= {'read'}
    keywords = []
    for word, tag in tagged_tokens:
        word = word.lower()
        if is_proper_keyword(word, tag, allowed_tags, stop_words):
            keywords.append(word)
    bigrams_keywords = list(bigrams(keywords))
    trigrams_keywords = list(trigrams(keywords))
    for k in bigrams_keywords:
        keywords.append(' '.join(k))
    for k in trigrams_keywords:
        keywords.append(' '.join(k))
    return keywords
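# Usage sketch (the argument values are illustrative assumptions): allowed_tags is
# passed through to is_proper_keyword together with the NLTK POS tag of each token,
# so a set of noun/adjective tags is a plausible choice here.
sample_keywords = get_keywords(
    "Researchers released a large annotated corpus of product reviews.",
    allowed_tags={"NN", "NNS", "JJ"})
print(sample_keywords)  # unigram keywords followed by their joined bigrams/trigrams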
def value_for_text(self, t, rp=default_rp): syntax_trees = rp.parse_trees(t) sentence_indices = [] for tree in syntax_trees: if tree.label() == 'ROOT': tree = tree[0] leaves = tree.leaves() word_indices = [0] * len(leaves) for i in range(len(leaves)): ref_vector = tree.leaf_treeposition(i) j = -2 while j >= -len(ref_vector) and ref_vector[j] == 0: parent_index = len(ref_vector) + j parent_node = tree[ref_vector[:parent_index]] if rp.parser().tagset.is_sentence_node(parent_node): word_indices[i] += 1.5 else: word_indices[i] += 1 j -= 1 if len(leaves) < 3: sentence_index = sum(word_indices) else: max_trigrams = 0 for trigram in trigrams(word_indices): if sum(trigram) > max_trigrams: max_trigrams = sum(trigram) sentence_index = max_trigrams sentence_indices.append(sentence_index) return sum(sentence_indices) / len(sentence_indices) \ if sentence_indices else 0
def filterPhrases(self): self.filteredPhrases = [] self.sentences = sent_tokenize(self.text) for sentence in self.sentences: bigramList = list(set(bigrams(wordpunct_tokenize(sentence.lower())))) for bigram in bigramList: (word1, word2) = bigram if word1 == "'": term = word1 + word2 elif re.match(r'\W+', word1) == None and re.match(r'\W+', word2) == None: term = word1 + ' ' + word2 else: continue if self.dbc.execute("SELECT 1 FROM vocabulary WHERE term = %s LIMIT 1;", term) == 0L: if self.udq.recorded(term): if len(term) <= 140: self.dbc.execute("INSERT INTO vocabulary (term) VALUE (%s)", term) else: self.filteredPhrases.append(term) trigramList = list(set(trigrams(wordpunct_tokenize(sentence.lower())))) for trigram in trigramList: (word1, word2, word3) = trigram if word3 == "'": continue elif word2 == "'": term = word1 + word2 + word3 elif re.match(r'\W+', word1) == None and re.match(r'\W+', word2) == None and re.match(r'\W+', word3) == None: term = word1 + ' ' + word2 + ' ' + word3 else: continue if self.dbc.execute("SELECT 1 FROM vocabulary WHERE term = %s LIMIT 1;", term) == 0L: if self.udq.recorded(term): if len(term) <= 140: self.dbc.execute("INSERT INTO vocabulary (term) VALUE (%s)", term) else: self.filteredPhrases.append(term) return self.filteredPhrases
def ngrams(self, value):
    for trigram in trigrams(self.tokenize(value)):
        yield trigram
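# Usage sketch: the method above is a generator, so its output is typically consumed
# lazily; `analyzer` stands for an instance of the enclosing (unnamed) class, which
# supplies the tokenize() call used above.
def print_trigrams(analyzer, value):
    for tri in analyzer.ngrams(value):
        print(tri)  # each item is a (w1, w2, w3) tuple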
def getRecipeInfo(myURL): ### Here the webpage with the recipe is opened ### driver = webdriver.Chrome('./chromedriver') # driver = webdriver.Firefox() # myURL = sys.argv[1] #'http://allrecipes.com/Recipe/Beef-Brisket-My-Way/' #print myURL try: driver.get(myURL) ### Here the recipe name is extracted ### recipeNameXPath = '//div[@class="detail-right fl-right"]/h1[@id="itemTitle"]' recipeNameObject = driver.find_elements_by_xpath(recipeNameXPath) for value in recipeNameObject: recipeName = value.get_attribute("innerHTML") #print recipeName ingredients = [] singleIngredient = {} ingredientSet1NamesXPath = '//div[@class="ingred-left"]/ul[@class="ingredient-wrap"]/li[@id="liIngredient"]/label/p[@class="fl-ing"]/span[@id="lblIngName"]' ingredientSet1NamesObjects = driver.find_elements_by_xpath(ingredientSet1NamesXPath) ingredientSet1AmountsXPath = '//div[@class="ingred-left"]/ul[@class="ingredient-wrap"]/li[@id="liIngredient"]/label/p[@class="fl-ing"]/span[@id="lblIngAmount"]' ingredientSet1AmountsObjects = driver.find_elements_by_xpath(ingredientSet1AmountsXPath) except: driver.quit() raise for value in ingredientSet1NamesObjects: fullSingleIngredient = str(value.get_attribute("innerHTML")) if string.find(fullSingleIngredient, ', ') > -1: singleIngredientParts = string.split(fullSingleIngredient, ', ') singleIngredient['name'] = singleIngredientParts[0] singleIngredient['descriptor'] = singleIngredientParts[1] else: singleIngredient['name'] = fullSingleIngredient singleIngredient['descriptor'] = '' singleIngredient['preparation'] = '' ingredients.append(singleIngredient) singleIngredient = {} i = 0 for value in ingredientSet1AmountsObjects: amount = str(value.get_attribute("innerHTML")) if string.find(amount, '(') > -1: actualAmount = string.split(amount, '(') amount = string.split(actualAmount[1], ')') amount = amount[0] #print actualAmount qty = re.search(r"[a-z]+", amount) if qty != None: #print qty.group(0) ingredients[i]['measurement'] = qty.group(0) myQty = string.replace(amount, str(qty.group(0)), '') myQty = myQty.strip() if string.find(myQty, '/') > -1: qtyNum = string.split(myQty, '/') if string.find(qtyNum[0], ' ') > -1: numerator = string.split(qtyNum[0], ' ') ingredients[i]['quantity'] = (float(numerator[0])*float(qtyNum[1])+float(numerator[1]))/float(qtyNum[1]) else: ingredients[i]['quantity'] = float(qtyNum[0])/float(qtyNum[1]) else: ingredients[i]['quantity'] = myQty else: ingredients[i]['measurement'] = 'unit' ingredients[i]['quantity'] = str(value.get_attribute("innerHTML")) i += 1 ingredientSet2NamesXPath = '//div[@class="ingred-left"]/ul[@class="ingredient-wrap secondColumn"]/li[@id="liIngredient"]/label/p[@class="fl-ing"]/span[@id="lblIngName"]' ingredientSet2NamesObjects = driver.find_elements_by_xpath(ingredientSet2NamesXPath) ingredientSet2AmountsXPath = '//div[@class="ingred-left"]/ul[@class="ingredient-wrap secondColumn"]/li[@id="liIngredient"]/label/p[@class="fl-ing"]/span[@id="lblIngAmount"]' ingredientSet2AmountsObjects = driver.find_elements_by_xpath(ingredientSet2AmountsXPath) for value in ingredientSet2NamesObjects: fullSingleIngredient = str(value.get_attribute("innerHTML")) if string.find(fullSingleIngredient, ', ') > -1: singleIngredientParts = string.split(fullSingleIngredient, ', ') singleIngredient['name'] = singleIngredientParts[0] singleIngredient['descriptor'] = singleIngredientParts[1] else: singleIngredient['name'] = fullSingleIngredient singleIngredient['descriptor'] = '' singleIngredient['preparation'] = '' ingredients.append(singleIngredient) 
singleIngredient = {} for value in ingredientSet2AmountsObjects: amount = str(value.get_attribute("innerHTML")) if string.find(amount, '(') > -1: actualAmount = string.split(amount, '(') amount = string.split(actualAmount[1], ')') amount = amount[0] #print amount qty = re.search(r"[a-z]+", amount) if qty != None: #print qty.group(0) ingredients[i]['measurement'] = qty.group(0) myQty = string.replace(amount, str(qty.group(0)), '') myQty = myQty.strip() if string.find(myQty, '/') > -1: qtyNum = string.split(myQty, '/') if string.find(qtyNum[0], ' ') > -1: numerator = string.split(qtyNum[0], ' ') ingredients[i]['quantity'] = (float(numerator[0])*float(qtyNum[1])+float(numerator[1]))/float(qtyNum[1]) else: ingredients[i]['quantity'] = float(qtyNum[0])/float(qtyNum[1]) else: ingredients[i]['quantity'] = myQty else: ingredients[i]['measurement'] = 'unit' ingredients[i]['quantity'] = str(value.get_attribute("innerHTML")) i += 1 #pprint(ingredients) directions = [] i = 0 directionsXPath = '//div[@class="directLeft"]/ol/li/span' directionsObjects = driver.find_elements_by_xpath(directionsXPath) for value in directionsObjects: directions.append(str(value.get_attribute("innerHTML"))) i += 1 #print directions driver.quit() cookingMethods = {} with open('./text_files/cookingMethods.txt', 'r') as f: for line in f: cookingMethods[string.replace(line, '\n', '').strip()] = True #pprint(cookingMethods) cookingUtensils = {} with open('./text_files/cookingUtensils.txt', 'r') as f: for line in f: cookingUtensils[string.replace(line, '\n', '').strip()] = True #pprint(cookingUtensils) recipeCookingMethods = [] recipeCookingUtensils = [] localPhrase = '' for step in directions: for phrase in ngrams(string.split(step), 4): for word in phrase: localPhrase += word localPhrase += ' ' localPhrase = localPhrase.strip() localPhrase = localPhrase.replace(',', '') localPhrase = localPhrase.replace('.', '') for tool in cookingUtensils.keys(): if tool.lower() == localPhrase.lower(): #print localToolPhrase, 'utensil ->', tool recipeCookingUtensils.append(tool) localPhrase = '' #print '4-grams done' for phrase in trigrams(string.split(step)): for word in phrase: localPhrase += word localPhrase += ' ' localPhrase = localPhrase.strip() localPhrase = localPhrase.replace(',', '') localPhrase = localPhrase.replace('.', '') for tool in cookingUtensils.keys(): if tool.lower() == localPhrase.lower(): #print localToolPhrase, 'utensil ->', tool #flag = 1 #for myTool in recipeCookingUtensils: # if string.find(myTool, localPhrase) > -1: # flag = 0 #if flag == 1: recipeCookingUtensils.append(tool) for method in cookingMethods.keys(): if method.lower() == localPhrase.lower(): recipeCookingMethods.append(method) localPhrase = '' #print '3-grams done' for phrase in bigrams(string.split(step)): for word in phrase: localPhrase += word localPhrase += ' ' localPhrase = localPhrase.strip() localPhrase = localPhrase.replace(',', '') localPhrase = localPhrase.replace('.', '') for tool in cookingUtensils.keys(): if tool.lower() == localPhrase.lower(): #print localPhrase, 'utensil ->', tool #flag = 1 #for myTool in recipeCookingUtensils: # if string.find(myTool, localPhrase) > -1: # flag = 0 #if flag == 1: recipeCookingUtensils.append(tool) for method in cookingMethods.keys(): if method.lower() == localPhrase.lower(): #flag = 1 #for myMethod in recipeCookingMethods: # if string.find(myMethod, localPhrase) > -1: # flag = 0 #if flag == 1: recipeCookingMethods.append(method) localPhrase = '' #print '2-grams done' for word in string.split(step, ' '): 
#print word if len(word) > 2: word = word.replace(',', '') word = word.replace('.', '') for method in cookingMethods.keys(): #if string.find(method, word) > -1: if method.lower() == word.lower(): #flag = 1 #for myMethod in recipeCookingMethods: # if string.find(myMethod, localPhrase) > -1: # flag = 0 #if flag == 1: recipeCookingMethods.append(method) for tool in cookingUtensils.keys(): #if string.find(tool, word) > -1: if tool.lower() == word.lower(): #print localPhrase, 'utensil ->', tool #flag = 1 #for myTool in recipeCookingUtensils: # if string.find(myTool, localPhrase) > -1: # flag = 0 #if flag == 1: recipeCookingUtensils.append(tool) #print '1-grams done' utensilsSet = set(recipeCookingUtensils) recipeCookingUtensils = list(utensilsSet) cookingMethodsSet = set(recipeCookingMethods) recipeCookingMethods = list(cookingMethodsSet) recipe = {} recipe['ingredients'] = ingredients recipe['cooking method'] = random.choice(recipeCookingMethods) recipe['cooking tools'] = recipeCookingUtensils #pprint(recipe) myInternalRecipe = {} myInternalRecipe['name'] = str(recipeName) myInternalRecipe['ingredients'] = [] for item in ingredients: myInternalRecipe['ingredients'].append(item['name']) #print myInternalRecipe f = open('recipeJson.json', 'w') jobj = json.dumps(recipe) f.write(jobj) f.close() with open('recipeJson.json', 'r') as f: myJobj = map(json.loads, f) return myInternalRecipe, recipe
text = gutenberg.raw('austen-emma.txt'); nltk_sents = sent_tokenize(text) # contains the list of sentences detected from the tool nltk_words = word_tokenize(text) #print len(nltk_words) #fnltk_words = [wordnet_lemmatizer.lemmatize(nltk_word) for nltk_word in nltk_words] #print len(fnltk_words) dictn=list(set(nltk_words)) tokens = nltk_words tokens = [token.lower() for token in tokens if len(token) > 1] # same as unigrams lemma_tokens = [wordnet_lemmatizer.lemmatize(token, wordnet.VERB) for token in tokens] #print len(lemma_tokens) tokens = lemma_tokens bi_tokens = list(bigrams(tokens)) # getting the bigrams tri_tokens = list(trigrams(tokens)) uni_fdist = nltk.FreqDist(tokens) bi_fdist = nltk.FreqDist(bi_tokens) #print len(bi_fdist) tri_fdist = nltk.FreqDist(tri_tokens) tri_freq = 0 bi_freq = 0 uni_freq = 0 print "top 15 unigrams with lemma\n\n"
cw=unig[w] except: cw=0 cw+=1 unig[w]=cw bg=bigrams(words) for b in bg: try: cb=big[b] except: cb=0 cb+=1 big[b]=cb tg=trigrams(words) for t in tg: try: ct=trig[t] except: ct=0 ct+=1 trig[t]=ct f.close() msim=[] slens={} for k in models.keys(): m=models[k] uv=compare_histogram(m["unigrams"], unig)
def getStats2(training_set, test_set, data_type): #List of ngrams for training list training_unigram = [unigram for sent in training_set for unigram in sent] training_bigram = [bigram for sent in training_set for bigram in list(bigrams(sent))] training_trigram = [trigram for sent in training_set for trigram in list(trigrams(sent))] #List of ngrams for test list test_unigram = [unigram for sent in test_set for unigram in sent] test_bigram = [bigram for sent in test_set for bigram in list(bigrams(sent))] test_trigram = [trigram for sent in test_set for trigram in list(trigrams(sent))] #FreqDist for each ngram for training list fdist_training_unigram = FreqDist(training_unigram) fdist_training_bigram =FreqDist(training_bigram) fdist_training_trigram = FreqDist(training_trigram) #freqDistfor each ngram for test list fdist_test_unigram = FreqDist(test_unigram) fdist_test_bigram =FreqDist(test_bigram) fdist_test_trigram = FreqDist(test_trigram) #Type freq for ngrams in training list training_unigram_freq = fdist_training_unigram.N() training_bigram_freq = fdist_training_bigram.N() training_trigram_freq = fdist_training_trigram.N() #Type freq for ngrams in test list test_unigram_freq = fdist_test_unigram.N() test_bigram_freq = fdist_test_bigram.N() test_trigram_freq = fdist_test_trigram.N() #Types for ngrams in training list types_training_unigram = set(training_unigram) types_training_bigram = set(training_bigram) types_training_trigram = set(training_trigram) #Types for ngrams in test list types_test_unigram = set(test_unigram) types_test_bigram = set(test_bigram) types_test_trigram = set(test_trigram) #types of ngrams not in test set that not in training set unigrams_not_in_tr = types_test_unigram - types_training_unigram bigrams_not_in_tr = types_test_bigram - types_training_bigram trigrams_not_in_tr = types_test_trigram - types_training_trigram perecent_unigrams_not_in_tr = 100 * len(unigrams_not_in_tr)/float(len(types_test_unigram)) perecent_bigrams_not_in_tr = 100 *len(bigrams_not_in_tr)/float(len(types_test_bigram)) perecent_trigrams_not_in_tr = 100 *len(trigrams_not_in_tr)/float(len(types_test_trigram)) overall_percent_unigrams = 100 *len(unigrams_not_in_tr)/ float(test_unigram_freq) overall_percent_bigrams = 100 * len(bigrams_not_in_tr)/ float(test_bigram_freq ) overall_percent_trigrams = 100* len(trigrams_not_in_tr)/ float(test_trigram_freq) print("""%s\nnum types in unigram training set:%s\n num types in bigram training set:%s\nnum types in trigram training set: %s """ % (data_type,len(types_training_unigram), \ len(types_training_bigram), len(types_training_trigram) )) print ("""\n%s\nnum types in unigram test set: %s\n num types in bigram test set:%s\nnum types in trigram test set: %s """ % (data_type,len(types_test_unigram),\ len(types_test_bigram), len(types_test_trigram) )) print ("""\n%s\nunigrams not in training set: %s\n bigram not in tr set%s\ntrigrams not in training set: %s """ % (data_type,len(unigrams_not_in_tr),\ len(bigrams_not_in_tr), len(trigrams_not_in_tr) )) print ("""\n%s\percent unigrams: %s\n percent bigrams:%s\n percent trigrams: %s """ % (data_type, perecent_unigrams_not_in_tr ,\ perecent_bigrams_not_in_tr , perecent_trigrams_not_in_tr)) print ("""\n%s\noverall percent unigrams: %s\n overall percent bigrams:%s\noverall percent trigrams: %s """ % (data_type,overall_percent_unigrams,\ overall_percent_bigrams, overall_percent_trigrams))
def extract(featureList, dir, fileout, n): tokenizer = RegexpTokenizer(r'\w+') docPos = {} docNeg = {} docFeatures = {} sentiment = "pos" for file in os.listdir(dir+sentiment): if file.endswith(".txt"): features = {} sentiment = "pos" fp = open(dir+sentiment+"/"+file, 'rb') doc = fp.read() tokens = [t for t in trigrams(tokenizer.tokenize(doc))] for word in featureList: if word in tokens: features[word] = 1.0 else: features[word] = 0.0 docPos[file] = "" docFeatures[file] = features sentiment = "neg" for file in os.listdir(dir+sentiment): if file.endswith(".txt"): features = {} sentiment = "neg" fp = open(dir+sentiment+"/"+file, 'rb') doc = fp.read() tokens = [t for t in trigrams(tokenizer.tokenize(doc))] for word in featureList: if word in tokens: features[word] = 1.0 else: features[word] = 0.0 docNeg[file] = "" docFeatures[file] = features f = FreqDist(featureList) featureList = [x for (x,f) in f.items()[:n]] allData = [] for doc in docFeatures.keys(): data = [] count = 1 if doc in docNeg.keys(): val =['-1'] if doc in docPos.keys(): val =['1'] for key in featureList: data.append("%s:%s" %(count, docFeatures[doc][key])) count +=1 val.extend(data) allData.append(" ".join(val)) # for doc in docFeaturesPos.keys(): # data =['+1'] # for key in featureList: # data.append("%s:%s" %(count, docFeaturesPos[doc][key])) # count +=1 # count = 1 # allData.append(" ".join(data)) fVectorWriter = csv.writer(open(dir+fileout+".txt", 'wb')) for d in allData: print d fVectorWriter.writerow([d])
fp.close() sentiment = "neg" for file in os.listdir(dir+sentiment): if file.endswith(".txt"): fp = open(dir+sentiment+"/"+file, 'r') doc = fp.read() #tokens.extend(word_tokenize(doc)) tokens.extend(tokenizer.tokenize(doc)) fp.close() return tokens dir = "/home/jch550/dev/JJboost/data/txt_sentoken/" print "extracting features..." featuresRaw = extractFeatures(dir) # print "cleaning features..." featuresClean = removeStopwords(featuresRaw) featuresTrigrams = trigrams(featuresClean) # print "writing to file..." # fListWriter = csv.writer(open(dir+"featureTrigramsList.txt", 'w')) # for f in featuresTrigrams: # fListWriter.writerow([f]) # features = open(dir+"featureTrigramsList.txt", 'rb') # featuresList = features.read().split('\r\n') featuresList = [t for t in featuresTrigrams] print "extracting features from documents..." extract(featuresList, dir, "docs_train_trigrams", 500) print "DONE."
domain_words['HCF LCM']=['The','and','The','a' .......]
'''
domain_words[i] = temp
for s in file_dumps[i].splitlines():
    question_collection.append((s, i))

print("++++++++++++++++++++++++++++++++++++++++++++++")
#for random_question in question_collection:
#    print(random_question[0])
#    print("-----")

tokensPerQuestion = [nltk.word_tokenize(random_question[0])
                     for random_question in question_collection]
tokensPerQuestion = [[token.lower() for token in t
                      if token.lower() not in symbols and token.lower() not in stoplist]
                     for t in tokensPerQuestion]
print(tokensPerQuestion[:5])

b = list(chain(*[(list(bigrams(tokens))) for tokens in tokensPerQuestion]))
t = list(chain(*[(list(trigrams(tokens))) for tokens in tokensPerQuestion]))
print(b)

fdist = nltk.FreqDist(b)
plt.figure(figsize=(20, 8))
# plot the top 30 bigrams
fdist.plot(30)

fdist = nltk.FreqDist(t)
plt.figure(figsize=(20, 8))
# plot the top 30 trigrams
fdist.plot(30)
total_word_sents = word_training_set + word_test_set total_pos_sents = pos_training_set + pos_test_set total_words =[] total_pos = [] #This flatens the total word and total sentance lists for (w, p) in zip(total_word_sents, total_pos_sents): for (word, pos) in zip(w, p): total_words.append(word) total_pos.append(pos) # Calculates total ngram freq for words total_word_unigrams = len(total_words) total_word_bigrams = sum([len(list(bigrams(ngrams))) for ngrams in total_word_sents]) total_word_trigrams = sum([len(list(trigrams(ngrams))) for ngrams in total_word_sents]) # Calculates total ngram freq for POS total_pos_unigrams = len(total_pos) total_pos_bigrams = sum([len(list(bigrams(ngrams))) for ngrams in total_pos_sents]) total_pos_trigrams = sum([len(list(trigrams(ngrams))) for ngrams in total_pos_sents]) #Prints the info calculated above print("Words Total:\nUnigrams:%s\nBigrams:%s\nTrigrams:%s\n" % \ (total_word_unigrams, total_word_bigrams, total_word_trigrams)) print("POS Total:\nUnigrams:%s\nBigrams:%s\nTrigrams:%s\n" % \ (total_pos_unigrams, total_pos_bigrams, total_pos_trigrams)) # Calculates all the relevant stats given the training set, testing set, # and data type def getStats2(training_set, test_set, data_type): #List of ngrams for training list training_unigram = [unigram for sent in training_set for unigram in sent]