def train_language(language, training_path):
    """Build the n-gram frequency model for one language and save it to disk.

    The training words are concatenated into one string (prefixed with a
    single space) and that string is handed to the bigram, trigram and
    quadgram collocation finders, so the finders iterate over its items one
    by one. N-grams rarer than ``FREQ_FILTER`` are discarded, each group is
    ordered by descending frequency, and the three groups are concatenated
    (bigrams, then trigrams, then quadgrams) before being written with
    ``np.save``.

    Args:
        language: Name used to build the output file name.
        training_path: Location of the training data, forwarded to
            ``filter_words``.
    """
    words = []
    # The return value is ignored and `words` is read right after, so
    # filter_words is presumably filling the list in place - confirm.
    filter_words(training_path, words)
    seq = ' ' + ''.join(words)

    finders = (
        BigramCollocationFinder.from_words(seq),
        TrigramCollocationFinder.from_words(seq),
        QuadgramCollocationFinder.from_words(seq),
    )

    final_model = []
    for finder in finders:
        # The filter mutates finder.ngram_fd, so sorting must happen after it.
        finder.apply_freq_filter(FREQ_FILTER)
        final_model += sorted(
            finder.ngram_fd.items(), key=lambda item: item[1], reverse=True)

    model_path = MODELS_PATH + language + '.npy'
    np.save(model_path, final_model)
    print("Language model for {} stored at {}".format(language, model_path))
def get_quadgrams(self, size):
    """Write the ``size`` most frequent quadgrams of ``self.word_set`` to CSV.

    The output file is named ``<disease_type>-quadgram-freq-<size>.csv``.
    Quadgrams are ordered by descending count, ties broken by the quadgram
    itself in ascending order. Each row holds the quadgram (tokens encoded as
    UTF-8) and its count.

    Args:
        size: Number of top quadgrams to keep.

    Returns:
        The tuple ``(self.full_training_quadgram_filename,
        self.full_test_quadgram_filename)``.
    """
    # The training and test cases both resolve to the same path expression.
    csv_path = self.disease_type + '-quadgram-freq-' + str(size) + '.csv'

    finder = QuadgramCollocationFinder.from_words(self.word_set)
    top_quadgrams = sorted(
        finder.ngram_fd.items(), key=lambda t: (-t[1], t[0]))[:size]

    # The context manager guarantees the file is flushed and closed even if a
    # row fails to serialise.
    with open(csv_path, 'w') as handle:
        writer = csv.writer(handle)
        for quadgram_tuple, count in top_quadgrams:
            # NOTE(review): on Python 3 the encoded tokens are bytes objects,
            # so the cell contains the repr of a tuple of bytes - confirm
            # this is the wanted format.
            writer.writerow([
                type(quadgram_tuple)(
                    x.encode('utf-8') for x in quadgram_tuple),
                count,
            ])

    # NOTE(review): this method never assigns these two attributes; they must
    # be set elsewhere on the instance or this line raises AttributeError.
    return (self.full_training_quadgram_filename,
            self.full_test_quadgram_filename)
def _get_quadgrams(words, top_n, min_freq):
    """Compile a regex matching the best-scoring quadgram collocations.

    Args:
        words: Iterable of tokens to mine for quadgrams.
        top_n: Maximum number of quadgrams to keep, ranked by chi-square.
        min_freq: Quadgrams seen fewer times than this are ignored.

    Returns:
        A compiled pattern with one capturing group whose alternatives are
        the selected quadgrams, each written as its four tokens joined by a
        single space. When no quadgram qualifies the group is empty, so the
        pattern matches the empty string.
    """
    qcf = QuadgramCollocationFinder.from_words(iter(words))
    qcf.apply_freq_filter(min_freq)
    best = qcf.nbest(QuadgramAssocMeasures.chi_sq, top_n)

    # Tokens are literal text, not patterns: escape them so characters such
    # as '.', '(' or '+' cannot change the meaning of the regex or make it
    # fail to compile.
    alternatives = [re.escape(' '.join(quad)) for quad in best]
    return re.compile('(%s)' % '|'.join(alternatives), re.UNICODE)
def predict(test_string, models):
    """Score a string against every language model and rank the models.

    Args:
        test_string: Raw input; it is run through ``pre_processing`` first.
        models: Sequence of entries read by index as ``entry[0]`` (model
            name), ``entry[1]`` (container tested with ``ngram in ...``) and
            ``entry[2]`` (total n-gram count used as the divisor).

    Returns:
        ``(0, message)`` when the highest accumulated score is zero,
        otherwise ``(1, ranking)`` where ``ranking`` is a list of
        ``(normalized_score, model_name)`` pairs sorted in descending order.
    """
    test_string = pre_processing(test_string)

    # Collect the 2-, 3- and 4-gram counts of the input, in that order.
    final_test = []
    for finder_cls in (BigramCollocationFinder, TrigramCollocationFinder,
                       QuadgramCollocationFinder):
        final_test.extend(finder_cls.from_words(test_string).ngram_fd.items())

    model_name = [entry[0] for entry in models]
    freq_sum = np.zeros(len(models))

    for ngram, freq in final_test:
        exists = 0
        for i, lang_model in enumerate(models):
            lang = lang_model[0]
            model = lang_model[1]
            total_ngram = lang_model[2]
            if ngram in model:
                if DEBUG:
                    print("Found", ngram, model[ngram], lang, total_ngram)
                # Scaled by 10000 so freq / total does not collapse to zero.
                freq_sum[i] += (freq * 10000) / total_ngram
                # NOTE(review): this assigns `exist`, not `exists`, so the
                # flag tested below is never set and the +1 is always added.
                # Kept as-is to preserve the current scores.
                exist = 1
            # NOTE(review): nesting reconstructed from a flattened listing;
            # confirm this check belongs inside the per-model loop.
            if not exists:
                freq_sum[i] += 1

    if not max(freq_sum):
        if DEBUG:
            print("[ERROR] Invalid string. String: {}".format(test_string))
        return 0, "Hmm, I do not know this word. Please try other words."

    # Normalize every score against the full score list, then rank.
    freq_to_model = list(zip(freq_sum, model_name))
    scores = [score for score, _ in freq_to_model]
    normalized_scores_name = [(normalize_score(score, scores), name)
                              for score, name in freq_to_model]
    sorted_score_model = sorted(normalized_scores_name, reverse=True)

    if DEBUG:
        print("[DEBUG] Frequency to model: {}".format(freq_to_model))
        print("[DEBUG] Scores: {}".format(scores))
        print("[DEBUG] Normalized scores name: {}".format(
            normalized_scores_name))
        print("[DEBUG] Reverse sorted score model: {}".format(
            sorted_score_model))
    return 1, sorted_score_model
def rank_quadgrams(corpus, metric):
    """Find the quadgrams of a corpus and score them with the given metric.

    Args:
        corpus: Object whose ``words()`` method yields the corpus tokens.
        metric: Association measure handed to ``score_ngrams``.

    Returns:
        The result of ``score_ngrams(metric)`` on a quadgram collocation
        finder built from the corpus words.
    """
    finder = QuadgramCollocationFinder.from_words(corpus.words())
    return finder.score_ngrams(metric)
def compute_ngrams_count(text_corpus, out_p, n=20):
    """Plot the ``n`` most common 1-, 2-, 3- and 4-grams of a corpus.

    Args:
        text_corpus: Iterable of documents; each document is an iterable of
            sentences, and each sentence is passed to ``word_tokenize``.
        out_p: Output location forwarded unchanged to
            ``util.plot_bar_chart_grid``.
        n: Number of most common entries kept for each n-gram order.
    """
    print("Compute ngrams count...")
    list_of_tokens = [
        word_tokenize(sentence)
        for document in text_corpus
        for sentence in document
    ]

    # Unigrams: stop words and a few punctuation tokens are removed here
    # only; the collocation finders below receive the unfiltered sentences.
    # A set keeps the per-token membership test constant-time.
    stop_tokens = set(stopwords.words("english")) | {".", "[", "]", ","}
    tokens = [w for w in util.flatten_one_level(list_of_tokens)
              if w not in stop_tokens]
    uni_mc = FreqDist(tokens).most_common(n)

    bi_mc = BigramCollocationFinder.from_documents(
        list_of_tokens).ngram_fd.most_common(n)
    tri_mc = TrigramCollocationFinder.from_documents(
        list_of_tokens).ngram_fd.most_common(n)
    quad_mc = QuadgramCollocationFinder.from_documents(
        list_of_tokens).ngram_fd.most_common(n)

    # Build one (labels, counts) series per n-gram order for plotting.
    data = [uni_mc, bi_mc, tri_mc, quad_mc]
    x = []
    y = []
    for position, most_common in enumerate(data):
        if position == 0:
            # Unigram keys are plain strings.
            labels = [entry[0] for entry in most_common]
        else:
            # Higher-order keys are token tuples; join them for display.
            labels = [" ".join(entry[0]) for entry in most_common]
        counts = [entry[1] for entry in most_common]
        # Both series are stored in reverse of the most_common order.
        x.append(labels[::-1])
        y.append(counts[::-1])

    title = ["Unigram", "Bigram", "Trigram", "Quadgram"]
    sup_title = "ngrams count"
    util.plot_bar_chart_grid(x, y, 1, len(data), title, sup_title, out_p,
                             sup_title_font_size=16, tick_font_size=14,
                             title_font_size=14, h_size=5, w_size=5,
                             rotate=True)
def rank_grams(self, docs):
    """Build the collocation finder for ``self.n`` and score its n-grams.

    A bigram, trigram or quadgram finder is created from ``docs`` when
    ``self.n`` is 2, 3 or 4 and stored in ``self.ngrams``. The finder's
    n-grams are then scored with ``self.metric`` and the result is stored in
    ``self.scored``. Nothing is returned.

    Args:
        docs: Token sequence handed to the finder's ``from_words``.
    """
    finder_by_order = {
        2: BigramCollocationFinder,
        3: TrigramCollocationFinder,
        4: QuadgramCollocationFinder,
    }
    finder_cls = finder_by_order.get(self.n)
    if finder_cls is not None:
        self.ngrams = finder_cls.from_words(docs)
    # NOTE(review): for any other self.n no finder is built here, so this
    # line uses whatever self.ngrams already holds (or fails if it is unset).
    self.scored = self.ngrams.score_ngrams(self.metric)
def top_words_quadcounter(job_type_list):
    """Return the 300 most frequent word quadgrams across the given texts.

    The texts are joined with spaces, separator sequences are turned into
    spaces, remaining punctuation is removed, and the text is lower-cased
    before being tokenised with ``word_tokenize``.

    Args:
        job_type_list: Iterable of strings to analyse.

    Returns:
        Up to 300 ``(quadgram, count)`` pairs sorted by descending count.
    """
    special_chars = ['--', '...', '\n', '•', '®', '·']
    text = ' '.join(job_type_list)

    # Replace the separators BEFORE stripping punctuation. '--' and '...'
    # consist of punctuation characters, so stripping first deleted them
    # outright and glued the neighbouring words together ("a--b" -> "ab").
    for char in special_chars:
        text = text.replace(char, ' ')
    text = text.translate(str.maketrans('', '', string.punctuation)).lower()

    finder = QuadgramCollocationFinder.from_words(word_tokenize(text))
    ranked = sorted(finder.ngram_fd.items(), key=itemgetter(1), reverse=True)
    return ranked[:300]
def generating_ngrams(words, n):
    """Create the collocation finder used as an individual language model.

    Args:
        words: List of words present in a particular language
            (list of strings).
        n: Order of the n-grams to generate; must be 2, 3 or 4.

    Returns:
        A bigram, trigram or quadgram collocation finder built from
        ``words``.

    Raises:
        ValueError: If ``n`` is not 2, 3 or 4.
    """
    if n == 2:
        finder = BigramCollocationFinder.from_words(words)
    elif n == 3:
        finder = TrigramCollocationFinder.from_words(words)
    elif n == 4:
        finder = QuadgramCollocationFinder.from_words(words)
    else:
        # Previously this branch only printed a message and then failed on
        # the unbound `finder` below; fail explicitly instead.
        raise ValueError(
            "Incorrect value of n: {!r} (expected 2, 3 or 4)".format(n))
    return finder
def rank_quadgrams(corpus, metric, path=None):
    """Score the quadgrams of a corpus and either return or save them.

    Args:
        corpus: Object whose ``words()`` method yields the corpus tokens.
        metric: Association measure handed to ``score_ngrams``; its
            ``__name__`` is written into the file header.
        path: Optional output file. When given, a tab-separated table is
            written there and nothing is returned.

    Returns:
        The scored quadgrams when ``path`` is falsy, otherwise ``None``.
    """
    finder = QuadgramCollocationFinder.from_words(corpus.words())
    scored = finder.score_ngrams(metric)

    if not path:
        return scored

    with open(path, 'w') as out:
        out.write("Collocation\tScore ({})\n".format(metric.__name__))
        for ngram, score in scored:
            out.write("{}\t{}\n".format(repr(ngram), score))
def Collocation(contents, n):
    """Score the n-grams of ``contents`` by raw frequency.

    Args:
        contents: Token sequence handed to the finder's ``from_words``.
        n: N-gram order; must be 2, 3 or 4.

    Returns:
        The result of ``score_ngrams`` using the ``raw_freq`` measure of the
        matching n-gram order.

    Raises:
        ValueError: If ``n`` is not 2, 3 or 4.
    """
    # Validate first: the old code printed a message for an unsupported n
    # and then failed on the unbound `scored` variable.
    if n not in (2, 3, 4):
        raise ValueError("Collocation is only available for n=2, 3, or 4.")

    from nltk.collocations import (
        BigramAssocMeasures,
        BigramCollocationFinder,
        QuadgramAssocMeasures,
        QuadgramCollocationFinder,
        TrigramAssocMeasures,
        TrigramCollocationFinder,
    )

    if n == 2:
        finder_cls, measures = BigramCollocationFinder, BigramAssocMeasures()
    elif n == 3:
        finder_cls, measures = TrigramCollocationFinder, TrigramAssocMeasures()
    else:
        finder_cls, measures = (QuadgramCollocationFinder,
                                QuadgramAssocMeasures())

    finder = finder_cls.from_words(contents)
    return finder.score_ngrams(measures.raw_freq)
def quadgram_feats(text, score_fn=NgramAssocMeasures.pmi, n_best=200):
    """Return a presence-feature dict for the best quadgrams of ``text``.

    Args:
        text: Token sequence handed to the quadgram finder.
        score_fn: Association measure used to rank the quadgrams.
            NOTE(review): the default is taken from the generic
            ``NgramAssocMeasures`` class rather than a quadgram-specific
            one - confirm it scores 4-grams as intended.
        n_best: Number of top-ranked quadgrams to keep.

    Returns:
        A dict mapping each selected quadgram to ``True``.
    """
    finder = QuadgramCollocationFinder.from_words(text)
    return {n_gram: True for n_gram in finder.nbest(score_fn, n_best)}
bfreq.append(v) print(k, v) plot_bar_x(bigram, bfreq) from nltk.collocations import TrigramCollocationFinder trigram = [] tfreq = [] finder = TrigramCollocationFinder.from_words(word_tokenize(text)) for k, v in finder.ngram_fd.items(): trigram.append(k[0] + " " + k[1] + " " + k[2]) tfreq.append(v) print(k, v) plot_bar_x(trigram, tfreq) from nltk.collocations import QuadgramCollocationFinder finder = QuadgramCollocationFinder.from_words(word_tokenize(text)) quadgram = [] qfreq = [] for k, v in finder.ngram_fd.items(): quadgram.append(k[0] + " " + k[1] + " " + k[2] + " " + k[3]) qfreq.append(v) print(k, v) plot_bar_x(quadgram, qfreq) #Análise de Tópicos tokenizer = RegexpTokenizer(r'\w+') # create English stop words list en_stop = set(stopwords.words('english')) # Create p_stemmer of class PorterStemmer p_stemmer = PorterStemmer()
def _get_quadgrams(words, top_n, min_freq):
    """Build a regex that recognises the top quadgram collocations.

    Args:
        words: Iterable of tokens to search for quadgrams.
        top_n: How many quadgrams to keep, ranked by the chi-square measure.
        min_freq: Minimum number of occurrences a quadgram needs to be
            considered.

    Returns:
        A compiled pattern consisting of a single capturing group that
        alternates over the chosen quadgrams (tokens joined by one space).
        With no qualifying quadgram the group is empty and therefore matches
        the empty string.
    """
    qcf = QuadgramCollocationFinder.from_words(iter(words))
    qcf.apply_freq_filter(min_freq)
    top_quadgrams = qcf.nbest(QuadgramAssocMeasures.chi_sq, top_n)

    # Each phrase must match literally; without escaping, a token holding a
    # regex metacharacter ('.', '*', '(' ...) would alter or break the
    # compiled pattern.
    phrases = [re.escape(' '.join(tokens)) for tokens in top_quadgrams]
    return re.compile('(%s)' % '|'.join(phrases), re.UNICODE)
def process(text: str,
            num_1_grams: int = 100,
            num_2_grams: int = 100,
            num_3_grams: int = 100,
            num_4_grams: int = 100,
            min_chars: int = 3,
            max_chars: int = 30):
    """Extract keyword candidates (1- to 4-grams) from a text.

    Sentences that occur more than once verbatim are dropped, the remaining
    ones are tokenised, non-alphabetic tokens are removed and everything is
    lower-cased. Single words are counted on a filtered copy of the
    sentences (``filter_token`` applied, ``RE_POSS`` matches removed);
    longer n-grams are ranked by raw frequency on the unfiltered sentences.

    Args:
        text: Source text.
        num_1_grams, num_2_grams, num_3_grams, num_4_grams: Number of entries
            requested for each n-gram order; a falsy value skips that order.
        min_chars, max_chars: Length bounds forwarded to ``df_top``.

    Returns:
        A list of four lists (orders 1 to 4) holding the ``'entry'`` column
        of the corresponding ``df_top`` result, or an empty list when the
        order was skipped or ``df_top`` returned ``None``.
    """

    def top_entries(scored, num, token_filter):
        # Rank the scored n-grams and pull out the entry column, if any.
        frame = df_top(tuples=scored,
                       num=num,
                       token_filter=token_filter,
                       min_char=min_chars,
                       max_char=max_chars)
        return [] if frame is None else frame['entry'].tolist()

    # Keep only sentences that appear exactly once in the text.
    sentences = get_sentences(text)
    occurrences = collections.Counter(sentences)
    sentences = [s for s in sentences if occurrences[s] == 1]

    # Tokenize, drop non-alphabetic tokens and lower-case the rest.
    sents = [[tok.lower() for tok in RE_TOKEN.split(s) if is_alpha(tok)]
             for s in sentences]

    # Two views of the sentences:
    #   a. for 1-grams: stop words, short tokens and possessives removed;
    #   b. for longer n-grams: untouched, since short/stop words can carry
    #      information in relation to their neighbours.
    sents_a = [[RE_POSS.sub('', t) for t in sent if filter_token(t)]
               for sent in sents]
    sents_b = sents
    assert len(sents_a) > 0 and len(sents_b), 'Not enough words'

    counter = collections.Counter(tok for sent in sents_a for tok in sent)

    res = [[], [], [], []]

    if num_1_grams:
        # One-element tuples keep the 1-gram format aligned with n > 1.
        singles = [((token, ), count) for token, count in counter.items()]
        res[0] = top_entries(singles, num_1_grams, filter_1_grams)

    collocation_specs = (
        (1, num_2_grams, BigramCollocationFinder, BigramAssocMeasures,
         filter_2_grams),
        (2, num_3_grams, TrigramCollocationFinder, TrigramAssocMeasures,
         filter_3_grams),
        (3, num_4_grams, QuadgramCollocationFinder, QuadgramAssocMeasures,
         filter_4_grams),
    )
    for slot, num, finder_cls, measures, token_filter in collocation_specs:
        if not num:
            continue
        finder = finder_cls.from_documents(sents_b)
        scored = finder.score_ngrams(measures.raw_freq)
        res[slot] = top_entries(scored, num, token_filter)

    return res