def findBestWords(wordsInCategories, scoreFunction=BigramAssocMeasures.chi_sq, max_words=1000):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for category, words in wordsInCategories:
        word_fd.update(words)
        label_word_fd[category].update(words)

    word_counts = {}
    for condition in label_word_fd.conditions():
        word_counts[condition] = label_word_fd[condition].N()

    total_word_count = 0
    for condition, count in word_counts.items():
        total_word_count += count

    word_scores = {}
    for word, freq in word_fd.items():
        score = 0
        for condition, count in word_counts.items():
            score += scoreFunction(label_word_fd[condition][word],
                                   (freq, word_counts[condition]),
                                   total_word_count)
        word_scores[word] = score

    best = sorted(word_scores.items(), key=lambda t: t[1], reverse=True)[:max_words]
    return set([w for w, s in best])
def extract_ngrams(all_sentences: List[List[str]]) -> Tuple[Any, Any]:
    unigram_freqs = FreqDist()
    bigram_freqs = FreqDist()
    for sentence in all_sentences:
        unigram_freqs.update(ngrams(sentence, 1))
        bigram_freqs.update(ngrams(sentence, 2))
    return unigram_freqs, bigram_freqs
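# Illustrative usage sketch (not part of the original source): assumes extract_ngrams
# above is in scope together with its imports (typing.List/Tuple/Any, nltk.FreqDist,
# nltk.util.ngrams); the sample sentences are made up.
sample_sentences = [["the", "cat", "sat"], ["the", "dog", "sat"]]
uni, bi = extract_ngrams(sample_sentences)
print(uni.most_common(2))    # n-gram keys are tuples, e.g. ('the',) with count 2
print(bi[("the", "cat")])    # 1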
def load_english_frequencies():
    nltk.download(['brown', 'gutenberg', 'reuters'])
    global english_frequencies
    english_frequencies = FreqDist(w.lower() for w in brown.words())
    english_frequencies.update(w.lower() for w in gutenberg.words())
    english_frequencies.update(w.lower() for w in reuters.words())
def _prepare(self):
    if self._is_prepared:
        return

    freq_dist_a = FreqDist()
    for a in self._pair.chunks_a:
        freq_dist_a.update(self._tokenize(a))

    freq_dist_b = FreqDist()
    for b in self._pair.chunks_b:
        freq_dist_b.update(self._tokenize(b))

    self._avg_freq_dist = FreqDist()
    n_a = freq_dist_a.N()
    n_b = freq_dist_b.N()
    for a in freq_dist_a:
        self._avg_freq_dist[a] = (freq_dist_a[a] / n_a + freq_dist_b[a] / n_b) / 2.0
    for b in freq_dist_b:
        if self._avg_freq_dist[b] != 0.0:
            continue
        self._avg_freq_dist[b] = (freq_dist_a[b] / n_a + freq_dist_b[b] / n_b) / 2.0

    self._chunks = self._sampler.generate_chunk_pairs(self._pair)
    self.__freq_a = None
    self.__freq_b = None
    self._is_prepared = True
def pos_ngrams(t1, t2, order=3):
    """
    Generate POS n-gram distributions.

    :param t1: text1
    :param t2: text2
    :param order: n-gram order
    :return: tuple containing FreqDists
    """
    t1_freq = FreqDist()
    t2_freq = FreqDist()

    t1 = nltk.sent_tokenize(t1)
    for s in t1:
        pos_tags = nltk.pos_tag(nltk.word_tokenize(s))
        t1_freq.update(
            tuple(map(lambda x: x[1], pos_tags[i:i + order]))
            for i in range(len(pos_tags) - order + 1))

    t2 = nltk.sent_tokenize(t2)
    for s in t2:
        pos_tags = nltk.pos_tag(nltk.word_tokenize(s))
        t2_freq.update(
            tuple(map(lambda x: x[1], pos_tags[i:i + order]))
            for i in range(len(pos_tags) - order + 1))

    return t1_freq, t2_freq
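# Illustrative usage sketch (not in the original): assumes the pos_ngrams function
# above plus NLTK with the 'punkt' and 'averaged_perceptron_tagger' resources installed.
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

d1, d2 = pos_ngrams("The cat sat on the mat. It purred.",
                    "A dog barked at the mailman.", order=2)
print(d1.most_common(3))   # most frequent POS-tag bigrams in text 1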
def word_distr(category_tweets):
    word_dist = FreqDist()
    for tweet in category_tweets:
        tokens = tokenizer.tokenize(tweet.text)
        tokens = [x for x in tokens if x not in stop_words and not is_punctuation(x)]
        dist = FreqDist(tokens)
        word_dist.update(dist)
    return word_dist
def ngram_freq(self, speaker, token_count=1):
    """Return a FreqDist of ngrams of length token_count for speaker."""
    freq = FreqDist()
    for line in self.all_lines(speaker):
        for sent in line.sentences:
            freq.update(" ".join(ngram) for ngram in ngrams(sent.tokenize(), token_count))
    return freq
def compute_freq(text, N_gram):
    bigramfdist = FreqDist()
    tokens = text.strip().split(' ')
    bigrams = ngrams(tokens, N_gram)
    bigramfdist.update(bigrams)
    return bigramfdist
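# Illustrative usage sketch (not from the original source): assumes compute_freq above
# with FreqDist and ngrams imported from nltk / nltk.util; the text is made up.
freqs = compute_freq("the cat sat on the mat the cat slept", 2)
print(freqs.most_common(3))    # e.g. (('the', 'cat'), 2) comes out on top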
def __init__(self, params, picklefile, modelfile=None):
    texts = []
    with open(params['PATH_TRAIN']) as inf:
        for line in inf:
            temp = line.replace("\n", "")
            texts.append(temp[temp.index('\t') + 1:].lower())

    word_dist = FreqDist()
    for s in texts:
        word_dist.update(s.split())
    word_freq = dict(word_dist)

    word_index = {}
    c = 1
    for t in word_freq:
        word_index[t] = c
        c = c + 1
    pickle.dump(word_index, open("word_pickle.p", "wb"))

    word_prob = {}
    for t in word_freq:
        word_prob[t] = 1 / word_freq[t]
    pickle.dump(word_prob, open("word_freq.p", "wb"))

    self.word_freq = word_freq
    self.word_index = word_index

    embeddings_index = {}
    f = open(os.path.join(params['GLOVE_DIR'], 'glove.6B.50d.txt'))
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    # Obtained all the word embeddings. Fetch for word 'w', as embeddings_index[w]

    embedding_matrix = np.zeros(
        (params['VOCAB_SIZE'], params['EMBEDDING_DIM']))

    with open(picklefile, 'rb') as f:
        self.encoding = pickle.load(f, encoding='latin1')
        # self.encoding = pickle.load(open(picklefile, 'rb'))

    self.word_index = word_index
    self.embedding_matrix = embedding_matrix
    self.params = params

    if (modelfile == None):
        self.model = Sequential()
        embedding_layer = Embedding(
            params['VOCAB_SIZE'],
            params['EMBEDDING_DIM'],
            weights=[embedding_matrix],
            input_length=params['MAX_SEQUENCE_LENGTH'],
            trainable=False)
        self.model.add(embedding_layer)
    else:
        self.model = load_model(modelfile)
def get_stats(self, output_fname):
    fd = FreqDist()
    for text in self.texts:
        fd.update(set(text))
    fh = open(output_fname, 'w')
    text = Text(self.paragraph_tokens)
    fdist = FreqDist(text)
    for (w, f) in fdist.iteritems():
        print >> fh, "%s\t%i" % (w, f)
    fh.close()
def frecuencias_terminos(tokens):
    term_freq = FreqDist()
    for i in xrange(len(tokens)):
        for j in tokens[i]:
            term_freq.update(FreqDist(j))

    y = [count for tag, count in term_freq.most_common(30)]
    x = range(1, len(y) + 1)
    print term_freq.most_common(30)

    plt.bar(x, y)
    plt.title("Frecuencias de los terminos")
    plt.ylabel("Frecuencia")
    plt.show()
def compute_freq(text_body, ngram_n=6):
    stop_words = set(stopwords.words('english'))
    n_gramfdist = FreqDist()
    for line in text_body:
        if len(line) > 1:
            tokens = line.strip().split(' ')
            # tokens_without_stops = [x.lower() for x in tokens if x.lower() not in stop_words]
            # n_grams = ngrams(tokens_without_stops, 3)
            n_grams = ngrams(tokens, ngram_n)
            n_gramfdist.update(n_grams)
    return n_gramfdist
class BiWordExtractor:

    def __init__(self, pickle_file):
        self._statuses = pickle.load(open(pickle_file, 'rb'))
        self._averages = dict()
        self._gender_stats = dict()
        self.fdistneuro = FreqDist()
        self.fdistnonneuro = FreqDist()
        self.highneuro = defaultdict()
        self.highnonneuro = defaultdict()

    """
    Processes statuses.
    (For information on how the different data structures are set up,
    look at the comments for the getters.)
    """
    def wordprocess(self):
        lengths = dict()
        row = 0
        for status in self._statuses[1:]:
            row += 1
            print row
            user = status[0]
            filtered_status = status[1].translate(string.maketrans("", ""), string.punctuation)
            tokens = pattern_split.split(filtered_status.lower())
            filtered_tokens = [
                w for w in tokens
                if w not in stopwordslist and w not in filterlist
            ]
            bitokens = nltk.bigrams(filtered_tokens)

            if status[5] == '+':
                self.fdistneuro.update(bitokens)
            elif status[5] == '-':
                self.fdistnonneuro.update(bitokens)

    def neuro_word_frequency(self):
        vocneuro = self.fdistneuro.keys()
        highvocneuro = vocneuro[:300]
        return highvocneuro

    def highneuro_word_frequency(self):
        for w in self.neuro_word_frequency():
            if self.fdistneuro[w] >= 5:
                self.highneuro[w] = self.fdistneuro[w]
        print self.highneuro.items()
        print self.highneuro.keys()
        return self.highneuro.keys()
def get_uni(first, second, uni):
    bigramfdist = FreqDist()
    for line in first:
        token = nltk.word_tokenize(line)
        token = [
            x for x in token
            if not re.fullmatch('[' + string.punctuation + ']+', x)
        ]
        bigrams = ngrams(token, 1)
        bigramfdist.update(bigrams)
    print(bigramfdist.most_common(50))
    # unigram keys are 1-tuples, so look up ("but",) rather than the bare string
    print(bigramfdist.get(("but",)))
def get_ngrams(fileLines, n, pos_tag_dict):
    # Get n gram counts for corpus
    tokens = []
    ngram_counts = FreqDist()
    for excerpt in fileLines:
        ngram_counts_exp = get_ngram_counts_per_excerpt(excerpt, n, pos_tag_dict)
        # FreqDist.update adds the per-excerpt counts into the running totals
        ngram_counts.update(ngram_counts_exp)
    return ngram_counts
def train_finder(self, all_listings):
    """
    Train the product identification algorithm with example data.
    """
    logging.info("Start training of recognizer for product: {0}"
                 .format(self.product_id))
    self.classifier = None

    #select example listings for the finder's product
    listings, n_pos, n_neg = self.filter_trainig_samples(all_listings)
    logging.info("Number listings: {l}, positive: {p}, negative: {n}; "
                 "features: {f}"
                 .format(l=len(listings), p=n_pos, n=n_neg, f=self.n_features))
    if len(listings) < 30:
        logging.warn("Product {0}. Can't compute classifier. "
                     "Too few listings."
                     .format(self.product_id))
        return
    elif n_pos < 10:
        logging.warn("Product {0}. Can't compute classifier. "
                     "Too few positive listings."
                     .format(self.product_id))
        return
    elif n_neg < 10:
        logging.warn("Product {0}. Can't compute classifier. "
                     "Too few negative listings."
                     .format(self.product_id))
        return

    #Create list of most common words, and put it into feature extractor
    #TODO: remove stop-words
    self.feature_extractor = FeatureExtractor()
    word_freqs = FreqDist()
    for _, listing in listings.iterrows():
        words = self.feature_extractor.extract_words(listing)
        word_freqs.update(words)
    common_words = word_freqs.keys()[:self.n_features]
    self.feature_extractor = FeatureExtractor(common_words)
    logging.debug("Number individual words: {0}; hapaxes: {1}"
                  .format(len(word_freqs), len(word_freqs.hapaxes())))
    logging.debug("Most common words: {}".format(word_freqs.keys()[:100]))

    #Train the classifier
    train_set = self.create_labeled_features(listings)
    self.classifier = nltk.NaiveBayesClassifier.train(train_set)
    self.classifier.show_most_informative_features(20)
def create_vocabulary(_text, rmv_stop_wrds):

    # create an empty network of model's vocabulary
    def init_vocab_network(n_inputs):
        network = list()
        for i in range(0, n_inputs):
            layer = {'value': 0, 'token': ''}
            network.append(layer)
        return network

    # given a list of words, return a dictionary of word-frequency pairs.
    def wordlist_to_freq_dict(wrdlist):
        wordfreq = [wrdlist.count(p) for p in wrdlist]
        return dict(zip(wrdlist, wordfreq))

    # sort the dictionary of word-frequency pairs in descending order
    def sort_freq_dict(freqdict):
        aux = [(freqdict[key], key) for key in freqdict]
        aux.sort()
        aux.reverse()
        return aux

    if rmv_stop_wrds:
        print('removing stop words...')
        tokenized_text = nltk.word_tokenize(_text)
        stopwords = nltk.corpus.stopwords.words('english')
        word_freq = nltk.FreqDist(tokenized_text)
        dict_filter = lambda word_freq, stopwords: dict(
            (word, word_freq[word]) for word in word_freq if word not in stopwords)
        wordlist = dict_filter(word_freq, stopwords)
    else:
        wordlist = FreqDist()
        wordlist.update(_text.split())

    sort_freq_list = sort_freq_dict(wordlist)

    # initiate model's vocabulary
    _voc = init_vocab_network(len(sort_freq_list))

    # update vocabulary values
    j = 0
    for index in sort_freq_list:
        # plus one to avoid the zero padding
        _voc[j]['value'] = j + 1
        _voc[j]['token'] = index[1]
        j += 1

    return _voc, len(_voc)
def ngram_probs(filename='D:/raw_sentences.txt'):
    textfile = open(filename)
    bigram_fdist = FreqDist()
    threegram_fdist = FreqDist()
    for line in textfile:
        if len(line) > 1:
            tokens = line.lower().strip().split(' ')
            bigrams = ngrams(tokens, 2)
            bigram_fdist.update(bigrams)
            threegrams = ngrams(tokens, 3)
            threegram_fdist.update(threegrams)
    return bigram_fdist, threegram_fdist
def check_svc_bef_aft(list_line, command):
    # check the freq of words before and after bus service
    # check the freq of words before and after of word (number) which is non bus svc
    text = ''
    for i in range(0, len(list_line), 3):
        split_first = 0
        split_second = 0
        if i % 3 == 0:
            split_first = list_line[i].strip().split('\t')
        j = i + 1
        if j % 3 == 1:
            split_second = list_line[j].strip().split('\t')

        for k in range(0, len(split_second)):
            if command == 'before_svc':
                if int(split_second[k]) == 1:  # mean bus svc
                    if command == 'before_svc':
                        if k > 0:  # bus svc doesn't appear at the first position of sentences
                            text = text + split_first[k - 1].lower() + ' '  # take the word before
                            print i, k, split_first[k]
            if command == 'after_svc':
                if int(split_second[k]) == 1:  # mean bus svc
                    if command == 'after_svc':
                        if k < len(split_second) - 1:
                            text = text + split_first[k + 1].lower() + ' '  # take the word after
            if command == 'before_notsvc':
                if RepresentsInt(split_first[k]) is True and int(split_second[k]) != 1:
                    # text is a number and not a bus svc
                    if k > 0:  # bus svc doesn't appear at the last position of sentences
                        text = text + split_first[k - 1].lower() + ' '
            if command == 'after_notsvc':
                if RepresentsInt(split_first[k]) is True and int(split_second[k]) != 1:
                    # text is a number and not a bus svc
                    if k < len(split_second) - 1:  # bus svc doesn't appear at the last position of sentences
                        text = text + split_first[k + 1].lower() + ' '

    fdist = FreqDist()
    tokens = word_tokenize(str(text))
    fdist.update(tokens)
    for value in fdist.most_common(len(fdist)):
        print value[0], '\t', value[1]
    print text
def get_vocab(series, addtional_tokens=[], top=None):
    """
    Extract the vocabulary from an array of sentences, optionally adding extra
    tokens and keeping only the top n most frequent words.
    :param series: array of sentences
    :param addtional_tokens: additional tokens we want to include in the vocabulary
    :param top: top n frequent words we want to include in the vocabulary
    :return: map from a word to its numeric representation and the opposite map
    """
    # copy so the (mutable) default argument is not modified in place
    rev_vocab = list(addtional_tokens)
    freq_vocab = FreqDist()
    for s in tqdm(series):
        freq_vocab.update(word_tokenize(decontracted(s)))
    print("Original vocab size %s" % len(freq_vocab))

    all_words_sorted = sorted(freq_vocab, key=freq_vocab.get, reverse=True)
    top_words = all_words_sorted[:top]
    rev_vocab += top_words

    vocab = {word: index for index, word in enumerate(rev_vocab)}
    return vocab, rev_vocab
def get_frequency_distribution(docs, n=1):
    """
    Get the n-gram terms frequency distribution from a list of strings

    Parameters:
        docs: list of strings
        n: an integer

    Returns:
        nltk.FreqDist
    """
    ngram_freq_dist = FreqDist()
    for doc in docs:
        if isinstance(doc, str):
            tokens = word_tokenize(doc)
            ngram_tokens = ngrams(tokens, n)
            ngram_freq_dist.update(ngram_tokens)
    return ngram_freq_dist
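# Illustrative usage sketch (not part of the original source): assumes the
# get_frequency_distribution function above and NLTK's 'punkt' tokenizer data.
docs = ["the cat sat on the mat", "the dog sat on the rug"]
bigram_dist = get_frequency_distribution(docs, n=2)
print(bigram_dist.most_common(3))   # e.g. (('sat', 'on'), 2) and (('on', 'the'), 2)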
def __prepare_vocabulary(self, train_captions):
    vocab = FreqDist()
    for caption in train_captions:
        vocab.update([
            token.text.lower()
            for token in english_tokenizer.tokenizer(caption)
        ])

    # Histogram
    # hist, bin_edges = np.histogram(list(vocab.values()), bins=20000, density=False)

    most_common_words = list(
        map(lambda token: token[0],
            vocab.most_common(NUMBER_OF_WORDS_FOR_VOCABULARY)))

    idx = 4
    for word in most_common_words:
        if word not in self.stoi.keys():
            self.stoi[word] = idx
            self.itos[idx] = word
            idx += 1
def fetch_if(self, cond, term, pos_is_target=True, include_pair=False):

    tmp_freq_dist = FreqDist()

    conditions = {
        ng_prefix: ["pos[:-1] == term", "token[:-1] == term"],
        ng_suffix: ["pos[-len(term):] == term", "token[-len(term):] == term"],
        ng_contain: [
            "self._is_subcontent(term, pos)",
            "self._is_subcontent(term, token)"
        ],
        ng_equal: ["pos == term", "token == term"]
    }

    if cond not in conditions:
        cond = ng_prefix

    # Fetching Choice Configuration
    p_key, t_key = "", ""
    if include_pair:
        p_key = "(pos, token)"
        t_key = "(token, pos)"
    else:
        p_key = "pos"
        t_key = "token"

    cmp_p = compile(p_key, '<string>', 'eval')
    cmp_t = compile(t_key, '<string>', 'eval')

    if pos_is_target:
        cmp_cond = compile(conditions[cond][0], '<string>', 'eval')
        for (token, pos), freq in self.train_data.items():
            if eval(cmp_cond):
                tmp_freq_dist.update({eval(cmp_p): freq})
    else:
        cmp_cond = compile(conditions[cond][1], '<string>', 'eval')
        for (token, pos), freq in self.train_data.items():
            if eval(cmp_cond):
                tmp_freq_dist.update({eval(cmp_t): freq})

    return tmp_freq_dist
def token_aft(list_line, command):
    # check the token after label, note that belongs to the command ('svc', 'road', 'busstop')
    text = ''
    list_length = []
    for i in range(0, len(list_line), 3):
        split_first = 0
        split_second = 0
        if i % 3 == 0:
            split_first = list_line[i].strip().split('\t')  # list of sentences
        j = i + 1
        if j % 3 == 1:
            split_second = list_line[j].strip().split('\t')  # list of label for each word
        list_length.append(len(split_first))

        if command == 'svc':
            for k in range(0, len(split_second)):
                # check the frequency of token before bus service
                if int(split_second[k]) == 1:  # mean bus svc
                    if k < len(split_second) - 1:  # bus svc doesn't appear at the first position of sentences
                        # try:  # don't use stemming here
                        #     stem_word = port.stem(connect_token(split_first[k - 1].lower()))  # take the token before
                        # except UnicodeDecodeError:
                        #     stem_word = connect_token(split_first[k - 1].lower())
                        stem_word = connect_token(split_first[k + 1].lower())  # take the token after label
                        if is_int(stem_word) is False:
                            text = text + stem_word + ' '
                        # if stem_word == 'sd' or stem_word == 'dd':
                        #     print list_line[i]

    fdist = FreqDist()
    tokens = word_tokenize(str(text))
    fdist.update(tokens)
    for value in fdist.most_common(len(fdist)):
        # print value[0], '\t', value[1]
        print value[0]
    print text
def term_freq_all(path, name):
    file = path + '/' + name
    fdist = FreqDist()
    list_line = []
    with open(file) as f:
        for line in f:
            split_line = line.split('\t')
            words = nltk.word_tokenize(split_line[1].decode('utf-8').lower().strip())
            fdist.update(words)
            print split_line[0]
            # list_stem = []
            # for token in words:
            #     # st = LancasterStemmer()
            #     # try:
            #     #     list_stem.append(st.stem(token).decode('utf-8'))
            #     # except:
            #     #     print (split_line[0])
            #     st = PorterStemmer()
            #     try:
            #         list_stem.append(st.stem(token).decode('utf-8'))
            #     except:
            #         print (split_line[0])
            # fdist.update(list_stem)
            # print (line)

    print ('==========================================')
    print ('==========================================')
    print (len(fdist))
    stop = stopwords.words('english')
    for value in fdist.most_common(15000):
        # if (value[0] not in stop and (len(value[0]) >= 4)):
        if (value[0] not in stop):
            print (str(value[0].encode('utf-8')) + '\t' + str(value[1]))
def get_desc_graph(part, creator, resource, comments):
    unigramdist = FreqDist()
    bigramfdist = FreqDist()
    fc = comments
    if part != 'all':
        fc = fc[fc['primary_category'] == part]
    if creator != 'all':
        fc = fc[fc['creator_department'] == creator]
    if resource != 'all':
        fc = fc[fc['resource_type'] == resource]
    for index, sentence in fc.iterrows():
        unigrams = ngrams(sentence['tokens'], 1)
        bigrams = ngrams(sentence['tokens'], 2)
        unigramdist.update(unigrams)
        bigramfdist.update(bigrams)
    return unigram_freq_graph(unigramdist), bigram_freq_graph(bigramfdist), \
        unigram_word_cloud(unigramdist), bigram_word_cloud(bigramfdist)
def term_freq_time(first, last):
    ## get the time convert in sgforum
    db = MySQLdb.connect(host="localhost",     # your host, usually localhost
                         user="******",        # your username
                         passwd="ducthong",    # your password
                         db="sgforums_singaporebuses")  # name of the data base

    # you must create a Cursor object. It will let
    # you execute all the queries you need
    cur = db.cursor()

    # Use all the SQL you like
    sql = "select p.post_id, s.createdAtSecond, p.summary from posts_filter p, posts_createatsecond s where p.post_id = s.post_id and s.createdAtSecond >= " \
          + str(first) + " and s.createdAtSecond <= " + str(last) + " order by s.createdAtSecond;"
    cur.execute(sql)

    # call the database which name 'posts'
    fdist = FreqDist()
    for row in cur.fetchall():
        post_id = str(row[0])
        createdAtSecond = str(row[1])
        summary = unicode(str(row[2]), errors='ignore')
        # print (post_id + '\t' + createdAtSecond + '\t' + summary)
        words = nltk.word_tokenize(summary.lower().strip().decode('utf-8'))
        # try:
        #     words = nltk.word_tokenize(summary.lower().strip().decode('utf-8'))
        # except:
        #     print (post_id + '\t' + summary)
        fdist.update(words)
    cur.close()

    print ('==========================================')
    print ('==========================================')
    print (len(fdist))
    stop = stopwords.words('english')
    for value in fdist.most_common(200):
        if (value[0] not in stop and len(value[0]) >= 3):
            print (str(value[0]).encode('utf-8') + '\t' + str(value[1]))
def count_corpus_frequency2(sentences):
    sentences_tokens = [line.strip().split(" ") for line in sentences]
    freq1 = FreqDist()
    freq12 = FreqDist()
    freq2 = FreqDist()
    for sentence in sentences_tokens:
        sentence = [x for x in sentence if x != '']
        bigrams = ["{0} {1}".format(t[0], t[1]) for t in ngrams(sentence, 2)]
        freq1.update(sentence)
        freq12.update(sentence)
        freq12.update(bigrams)
        freq2.update(bigrams)
    return freq1, freq12, freq2
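# Illustrative usage sketch (not in the original): assumes count_corpus_frequency2
# above with FreqDist and ngrams in scope; the two sentences are invented.
uni, uni_plus_bi, bi = count_corpus_frequency2(["the cat sat", "the cat slept"])
print(uni["the"])          # 2 (unigrams are stored as plain strings)
print(bi["the cat"])       # 2 (bigrams are stored as space-joined strings)
print(uni_plus_bi.N())     # combined unigram + bigram token count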
def check_bef_aft_roadBusStop(list_line, command): text = '' for i in range(0, len(list_line), 3): split_first = 0 split_second = 0 if i % 3 == 0: split_first = list_line[i].strip().split('\t') j = i + 1 if j % 3 == 1: split_second = list_line[j].strip().split('\t') k = 0 while True: if k >= len(split_second): break if command == 'bef_road': try: if int(split_second[k]) == 2: # take road if k > 0: text = text + connect_token(split_first[k - 1].lower()) + ' ' # take the word before while True: k += 1 if k == len(split_second): break else: if int(split_second[k]) != 2: break else: k += 1 except ValueError: k += 1 if command == 'aft_road': try: if int(split_second[k]) == 2: # take road while True: k += 1 if k == len(split_second): break else: if int(split_second[k]) != 2: break if k < len(split_second) - 1: if is_int(split_first[k]) is False: text = text + connect_token(split_first[k].lower()) + ' ' # take the token after the label else: k += 1 except ValueError: k += 1 if command == 'bef_busstop': try: if int(split_second[k]) == 3: # take busstop if k > 0: text = text + connect_token(split_first[k - 1].lower()) + ' ' # take the word before while True: k += 1 if k == len(split_second): break else: if int(split_second[k]) != 3: break else: k += 1 except ValueError: k += 1 if command == 'aft_busstop': try: if int(split_second[k]) == 3: # take road while True: k += 1 if k == len(split_second): break else: if int(split_second[k]) != 3: break if k < len(split_second) - 1: if is_int(split_first[k]) is False: text = text + connect_token(split_first[k].lower()) + ' ' # take the token after the label else: k += 1 except ValueError: k += 1 fdist = FreqDist() tokens = word_tokenize(str(text)) fdist.update(tokens) for value in fdist.most_common(len(fdist)): print value[0], '\t', value[1] print text
YOURSELVES YOU'VE"""

set_function_words = list(set(function_words.lower().split(" ")))
set_function_words += " "

files = []
i = 1
path = 'output/'
for filename in glob('Mini-CORE/*.txt'):
    with open(filename, 'r', encoding='utf8') as f:
        w_file = open(path + str(i) + ".md", 'w', encoding='utf8')
        clear = re.compile('<.*?>')
        n_clear = re.compile('\n')
        preClearText = re.sub(clear, '', f.read())
        clearText = re.sub(n_clear, '', preClearText).lower()
        # w_file.write(clearText + '\n\n')
        tokens = sorted(list(clearText.split(" ")))
        fd = FreqDist()
        for word in tokens:
            if word not in set_function_words:
                fd.update([word])
        fd_sorted = sorted(fd, key=fd.get, reverse=True)
        w_file.write('Sorted by values :\n')
        for word in fd_sorted:
            w_file.write(str(word) + ' ')
        w_file.write('\n\nList :\n')
        for word in fd:
            w_file.write(str(word) + ' : ' + str(fd[word]) + '\n')
        w_file.close()
        i += 1
def main(download_settings_filename, parse_settings_filename):
    with open(download_settings_filename, 'r') as f:
        download_config = json.load(f)
    with open(parse_settings_filename, 'r') as f:
        parse_config = json.load(f)

    topic = download_config.get('topic', 'Medicine')
    data_dir = os.path.join(
        download_config.get('save_dir', os.path.join('data', 'wiki')), topic)
    save_dir = os.path.join(
        parse_config.get('save_dir', os.path.join('artifacts', 'wiki')), topic,
        'vocab')
    exclude_vocab = parse_config.get('exclude_vocab', [])
    min_page_vocab = parse_config.get('min_page_vocab', 5)
    plot_top_k = parse_config.get('plot_top_k', 40)
    plot_cumulative = parse_config.get('plot_cumulative', True)
    plot_title = 'top {} frequency'.format(
        plot_top_k) if not plot_cumulative else 'top {} cumulative'.format(
            plot_top_k)
    make_plots = plot_top_k > 0

    wiki_url = 'https://en.wikipedia.org/wiki/Category:{}'.format(topic)
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle').tokenize
    word_tokenizer = NISTTokenizer().tokenize
    lem = nltk.WordNetLemmatizer()
    S = requests.Session()

    pages = glob(os.path.join(data_dir, '*.html'))
    total_vocab = FreqDist()
    document_vocabs = {}

    print('reading {} files and generating vocabulary'.format(len(pages)))
    os.makedirs(save_dir, exist_ok=True)
    for page in tqdm(pages):
        l = process_page(S, page, exclude_vocab, word_tokenizer, lem,
                         sent_tokenizer)
        # ignore pages with very small vocabulary
        if len(l) < min_page_vocab:
            continue
        document_vocabs[page] = FreqDist(l)
        total_vocab.update(l)
        save_filename = os.path.join(
            save_dir, os.path.basename(page[:page.rfind('.')]) + '.json')
        with open(save_filename, 'w') as f:
            json.dump(dict(document_vocabs[page]), f, indent=4)
        if make_plots:
            save_filename = save_filename[:save_filename.rfind('.')] + '.pdf'
            save_freq_plot(save_filename, document_vocabs[page],
                           max_num=plot_top_k, cumulative=plot_cumulative,
                           title=plot_title)

    with open(os.path.join(save_dir, 'total_count.json'), 'w') as f:
        json.dump(dict(total_vocab), f, indent=4)
    if make_plots:
        save_filename = os.path.join(save_dir, 'total_count.pdf')
        save_freq_plot(save_filename, total_vocab, max_num=plot_top_k,
                       cumulative=plot_cumulative, title=plot_title)
class Corpus: def __init__(self, documents=None): """ Corpus constructor documents is a list of documents """ self.docs = {} if documents: for doc in documents: self.docs[doc.doc_id] = doc self.nltk_text_collection = TextCollection([x.to_nltk_text() for x in self.docs.values()]) self.term_index = {} self._vocabulary = None self.clear_indexes() self.pp = pprint.PrettyPrinter(indent=4) #def neighbors(self, document, window_size=9) def __len__(self): return len(self.docs.keys()) def __contains__(self, a): return a in self.docs def __getitem__(self, x): """Return the document with Document ID x""" return self.docs[x] def categories(self): """ Returns list of categories in this corpus For combined corpora, categories are equivalent to document ids """ return self.docs.keys() def _old_neighbors(self, document, window_size=9): """ neighbors based on moving window window_size is the diameter from the index element that should be included in the results """ if not self.sorted_by_len: self.generate_neighbor_list() if document.doc_id not in self.inverse_len_index: return [] index = self.inverse_len_index[document.doc_id] l = len(self.sorted_by_len) r = window_size / 2 start = max(0, index - r) end = min(l, (index + 1) + r) n = self.sorted_by_len[start:index] + self.sorted_by_len[(index+1):end] return [self.__getitem__(x) for x in n] def neighbors(self, document, max_distance): """ neighbors based on moving window distance is the maximum distance from the index element that should be included in the results """ sorted_dist_vector = self.generate_neighbor_list(document) filtered = filter(lambda((x, y)): y <= max_distance, sorted_dist_vector) return [self.__getitem__(x[0]) for x in filtered] def _sort_dict_by_value(self, d): return sorted(d.iteritems(), key=operator.itemgetter(1)) def _sorted_dict_index(self, pairs): return [i for i, j in pairs] def add(self, document): """ Add a document to this collection If there is any current iterator using this collection, it is not modified. You need to re-initialize the iterator if you want to include the new items. 
""" #print "adding " + str(document.doc_id) self.docs[document.doc_id] = document self.clear_indexes() def clear_indexes(self): self.doc_lens = None self.dist_matrix = None self.sorted_by_len = None self.inverse_len_index = None self.inverse_dist_index = None def generate_doc_lens(self): self.doc_lens = {} for document in self.docs.values(): doc_id = document.doc_id shn = len(document) self.doc_lens[document.doc_id] = shn def char_dist(self, doc1, doc2): "distance function by difference between document lengths" return abs(self.doc_lens[doc1] - self.doc_lens[doc2]) def generate_dist_vector(self, document, dist_func=char_dist): if (isinstance(document, Document)): doc_id = document.doc_id elif type(document) == str: doc_id = document if self.doc_lens == None: self.generate_doc_lens() v = {} for target in self.docs.keys(): v[target] = dist_func(self, doc_id, target) return v def generate_dist_matrix(self): if self.doc_lens == None: self.generate_doc_lens() if self.dist_matrix == None: self.dist_matrix = {} for doc1 in self.docs.keys(): self.dist_matrix[doc1] = generate_dist_vector(doc1) def _generate_neighbor_list(self): if self.doc_lens == None: self.generate_doc_lens() self.sorted_by_len = self._sorted_dict_index(self._sort_dict_by_value(self.doc_lens)) self.inverse_len_index = {} for idx, val in enumerate(self.sorted_by_len): self.inverse_len_index[val] = idx def generate_neighbor_list(self, document): dist_vector = self.generate_dist_vector(document) return self._sort_dict_by_value(dist_vector) def next(self): if self.cursor_position >= len(self.found_docs): raise StopIteration else: self.cursor_position += 1 doc = self.docs[self.found_docs[self.cursor_position - 1]] return doc def __iter__(self): self.cursor_position = 0 self.found_docs = self.docs.keys() return self def to_nltk_text_collection(self): if self.nltk_text_collection: return self.nltk_text_collection else: self.nltk_text_collection = TextCollection([x.to_nltk_text() for x in self.docs.values()]) return self.nltk_text_collection return None # wtf nltk def index(self): for k in self.docs.keys(): for word in self.docs[k].words(): if word in self.term_index: self.term_index[word].add(k) else: self.term_index[word] = set() self.term_index[word].add(k) def df(self, term): if not self.term_index: self.index() #self.pp.pprint(self.term_index) if term in self.term_index: return len(self.term_index[term]) else: return 0 def idf(self, term): df = self.df(term) if df == 0.0: return 0.0 else: return math.log(float(len(self)) / float(self.df(term))) # Use non-augmented tf for now, can experiment later def tf(self, doc_id, term): return self.docs[doc_id].tf(term) def tf_idf(self, doc_id, term): return self.tf(doc_id, term) * float(self.idf(term)) def vocabulary(self): if self._vocabulary == None: self._vocabulary = FreqDist() for doc in self.docs.values(): self._vocabulary.update(dict(doc.freq_dist())) return self._vocabulary def tf_idf_vector(self, doc_id): """return the TF-IDF term vector for a document the length of the vector is equal to the vocabulary size, not the number of terms in the document""" v = [0.0] * len(self.vocabulary()) d = self.docs[doc_id] if d: fd = d.freq_dist() for idx, word in self.vocabulary(): if word in fd: v[idx] = self.tf_idf(doc_id, word) return v def ranked_terms(self, doc_id, n=None): """ returns a list of the top terms by TF-IDF in a document if n is none, return all terms. Otherwise return the top n terms. 
""" d = self.docs[doc_id] if d: v = {} fd = d.freq_dist() for word in fd.keys(): v[word] = self.tf_idf(doc_id, word) sorted_v = sorted(v.iteritems(), key=operator.itemgetter(1)) sorted_v.reverse() if n != None: return sorted_v[0:n] else: return sorted_v def top_terms(self, n=5): r = [] for document in self.docs.values(): r.append(self.ranked_terms(document.doc_id, n)) return r def to_scikit_learn_dataset(self): dataset = {} dataset["data"] = [] dataset["ids"] = [] #dataset["filenames"] for doc_id in self.docs.keys(): dataset["ids"].append(doc_id) dataset["data"].append(unicode(self.docs[doc_id])) b = Bunch(DESCR=None, ids=dataset["ids"], data=dataset["data"]) return b def keys_sorted_by_attribute(self, attribute="created_time"): """ Return the list of document ids sorted by a document attribute """ d = [] for doc_id in self.docs.keys(): if attribute in self.docs[doc_id].document: d.append((doc_id, self.docs[doc_id].document[attribute])) # sort the list of doc_id, attribute tuples by the attribute return [x[0] for x in sorted(d, key=itemgetter(1))] def process_pipeline(self, pipeline): for doc in self.docs.values(): res = pipeline.process(doc)
def __init__(self, treebank, rootsymbol='S', wrap=False, cnf=True, cleanup=True, normalize=False, extratags=(), parser=InsideChartParser, **parseroptions): """ initialize a DOP model given a treebank. uses the Goodman reduction of a STSG to a PCFG. after initialization, self.parser will contain an InsideChartParser. >>> tree = Tree("(S (NP mary) (VP walks))") >>> d = GoodmanDOP([tree]) >>> print d.grammar Grammar with 8 productions (start state = S) NP -> 'mary' [1.0] NP@1 -> 'mary' [1.0] S -> NP VP [0.25] S -> NP VP@2 [0.25] S -> NP@1 VP [0.25] S -> NP@1 VP@2 [0.25] VP -> 'walks' [1.0] VP@2 -> 'walks' [1.0] >>> print d.parser.parse("mary walks".split()) (S (NP mary) (VP@2 walks)) (p=0.25) @param treebank: a list of Tree objects. Caveat lector: terminals may not have (non-terminals as) siblings. @param wrap: boolean specifying whether to add the start symbol to each tree @param normalize: whether to normalize frequencies @param parser: a class which will be instantiated with the DOP model as its grammar. Supports BitParChartParser. instance variables: - self.grammar a WeightedGrammar containing the PCFG reduction - self.fcfg a list of strings containing the PCFG reduction with frequencies instead of probabilities - self.parser an InsideChartParser object - self.exemplars dictionary of known parse trees (memoization)""" from bitpar import BitParChartParser nonterminalfd, subtreefd, cfg = FreqDist(), FreqDist(), FreqDist() ids = count(1) self.exemplars = {} if wrap: # wrap trees in a common root symbol (eg. for morphology) treebank = [Tree(rootsymbol, [a]) for a in treebank] if cnf: #CNF conversion is destructive treebank = list(treebank) for a in treebank: a.chomsky_normal_form() #todo: sibling annotation necessary? # add unique IDs to nodes utreebank = [(tree, decorate_with_ids(tree, ids)) for tree in treebank] # count node frequencies for tree, utree in utreebank: nodefreq(tree, utree, subtreefd, nonterminalfd) if isinstance(parser, BitParChartParser): lexicon = set(x for a, b in utreebank for x in a.pos() + b.pos()) # this takes the most time, produce CFG rules: cfg = FreqDist(chain(*(self.goodman(tree, utree) for tree, utree in utreebank))) cfg.update("%s\t%s" % (t, w) for w, t in extratags if w not in lexicon) lexicon.update(a for a in extratags if a not in lexicon) # annotate rules with frequencies self.fcfg = frequencies(cfg, subtreefd, nonterminalfd, normalize) self.parser = BitParChartParser(self.fcfg, lexicon, rootsymbol, cleanup=cleanup, **parseroptions) else: cfg = FreqDist(chain(*(self.goodman(tree, utree, False) for tree, utree in utreebank))) probs = probabilities(cfg, subtreefd, nonterminalfd) #for a in probs: print a self.grammar = WeightedGrammar(Nonterminal(rootsymbol), probs) self.parser = InsideChartParser(self.grammar) #stuff for self.mccparse #the highest id #self.addresses = ids.next() #a list of interior + exterior nodes, #ie., non-terminals with and without ids #self.nonterminals = nonterminalfd.keys() #a mapping of ids to nonterminals without their IDs #self.nonterminal = dict(a.split("@")[::-1] for a in # nonterminalfd.keys() if "@" in a) #clean up del cfg, nonterminalfd
all_words = FreqDist(w.lower() for w in train_set_words).keys()

def tweet_features(tweet):
    tweet_words = word_tokenize(tweet)
    features = {}
    for word in all_words:
        features['contains({})'.format(word)] = (word in tweet_words)
    return features

word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()

for item in train_set:
    tweet = item[0].lower()
    words = word_tokenize(item[0])
    word_fd.update(words)
    label_word_fd[item[1]].update(words)

pos_word_count = label_word_fd['pos'].N()
neg_word_count = label_word_fd['neg'].N()
total_word_count = pos_word_count + neg_word_count

word_scores = {}
for word, freq in word_fd.iteritems():
    pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                           (freq, pos_word_count),
                                           total_word_count)
    neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                           (freq, neg_word_count),
                                           total_word_count)
    word_scores[word] = pos_score + neg_score
if word1.lower() in tokens_tags:
    tokens_tags[word1].append(tag)
else:
    tokens_tags[word1] = [tag]
tags.append(tag)
if (tag, word1) in tag_word:
    tag_word[(tag, word1)] = tag_word[(tag, word1)] + 1
else:
    tag_word[(tag, word1)] = 1

bigrams = ngrams(tags, 2)
tag_tag.update(bigrams)

for key, value in tag_word.items():
    prob_tag_word[key] = tag_word[key] / tag_count[key[0]]

# normalise each tag-to-tag transition count by the count of its first tag
for (t1, t2) in tag_tag.keys():
    prob_tag_tag[(t1, t2)] = tag_tag[(t1, t2)] / tag_count[t1]
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk import FreqDist

sentence = 'this is a foo bar sentences and i want to ngramize it this this'

# n = 3
# list_grams = ngrams(sentence.split(), n)
#
# for grams in list_grams:
#     string = ''
#     for value in grams:
#         string = string + ' ' + value
#     print (string.strip())

fdist = FreqDist()
tokens = word_tokenize(str(sentence))
fdist.update(tokens)
for value in fdist.most_common():
    print value

i = 11
for i in range(0, 10):
    i = i + 2
print 'testing'

text = 'Mount Batten Rd Haig Rd Sims Ave'
split_text = text.split('Rd')
for value in split_text:
    print value
def token_bef(list_line, command): # check the token before label, note that belongs to the command ('svc', 'road', 'busstop') port = PorterStemmer() text = '' list_length = [] for i in range(0, len(list_line), 3): split_first = 0 split_second = 0 if i % 3 == 0: split_first = list_line[i].strip().split('\t') # list of sentences j = i + 1 if j % 3 == 1: split_second = list_line[j].strip().split('\t') # list of label for each word list_length.append(len(split_first)) if command == 'svc': for k in range(0, len(split_second)): # check the frequency of token before bus service if int(split_second[k]) == 1: # mean bus svc if k > 0: # bus svc doesn't appear at the first position of sentences # try: # don't use stemming here # stem_word = port.stem(connect_token(split_first[k - 1].lower())) # take the token before # except UnicodeDecodeError: # stem_word = connect_token(split_first[k - 1].lower()) stem_word = connect_token(split_first[k - 1].lower()) if is_int(stem_word) is False: text = text + stem_word + ' ' elif command == 'road': k = 0 while True: if k >= len(split_second): break else: try: if int(split_second[k]) == 2: # mean road if k > 0: stem_word = connect_token(split_first[k - 1].lower()) if is_int(stem_word) is False: text = text + stem_word + ' ' # take the word before while True: k += 1 if k == len(split_second): break else: if int(split_second[k]) != 2: break else: k += 1 except ValueError: k += 1 elif command == 'busstop': k = 0 while True: if k >= len(split_second): break else: try: if int(split_second[k]) == 3: # mean bus stop if k > 0: stem_word = connect_token(split_first[k - 1].lower()) if is_int(stem_word) is False: text = text + stem_word + ' ' # take the word before while True: k += 1 if k == len(split_second): break else: if int(split_second[k]) != 3: break else: k += 1 except ValueError: k += 1 fdist = FreqDist() tokens = word_tokenize(str(text)) fdist.update(tokens) for value in fdist.most_common(len(fdist)): print value[0], '\t', value[1] # print value[0] print text
class Vocab(object): def __init__(self, tokenizer=None, max_size=None, min_freq=1): """Basic Vocabulary object""" self.vocab_size = 0 self.freqdist = FreqDist() self.tokenizer = tokenizer def update(self, glove_dir, max_size=None, min_freq=1): """ Initialize id2word & word2id based on self.freqdist max_size include 4 special tokens """ # {0: '<pad>', 1: '<unk>', 2: '<sos>', 3: '<eos>'} self.id2word = { PAD_ID: PAD_TOKEN, UNK_ID: UNK_TOKEN, SOS_ID: SOS_TOKEN, EOS_ID: EOS_TOKEN } # {'<pad>': 0, '<unk>': 1, '<sos>': 2, '<eos>': 3} self.word2id = defaultdict(lambda: UNK_ID) # Not in vocab => return UNK self.word2id.update({ PAD_TOKEN: PAD_ID, UNK_TOKEN: UNK_ID, SOS_TOKEN: SOS_ID, EOS_TOKEN: EOS_ID }) # self.word2id = { # PAD_TOKEN: PAD_ID, UNK_TOKEN: UNK_ID, # SOS_TOKEN: SOS_ID, EOS_TOKEN: EOS_ID # } vocab_size = 4 min_freq = max(min_freq, 1) # Reset frequencies of special tokens # [...('<eos>', 0), ('<pad>', 0), ('<sos>', 0), ('<unk>', 0)] freqdist = self.freqdist.copy() special_freqdist = {token: freqdist[token] for token in [PAD_TOKEN, UNK_TOKEN, SOS_TOKEN, EOS_TOKEN]} freqdist.subtract(special_freqdist) # Sort: by frequency, then alphabetically # Ex) freqdist = { 'a': 4, 'b': 5, 'c': 3 } # => sorted = [('b', 5), ('a', 4), ('c', 3)] sorted_frequency_counter = sorted(freqdist.items(), key=lambda k_v: k_v[0]) sorted_frequency_counter.sort(key=lambda k_v: k_v[1], reverse=True) # Load glove vector word_emb_dict = self.get_glove_emb(glove_dir) for word, freq in sorted_frequency_counter: if freq < min_freq or vocab_size == max_size: break self.id2word[vocab_size] = word self.word2id[word] = vocab_size vocab_size += 1 self.vocab_size = vocab_size # Create embedding matrix self.embedding_matrix = embedding_matrix = np.zeros((self.vocab_size, 300)) for word, ind in self.word2id.items(): if word.lower() in word_emb_dict: embedding_matrix[self.word2id[word]] = word_emb_dict[word.lower()] else: embedding_matrix[self.word2id[word]] = np.random.uniform(-0.25, 0.25, 300) def get_glove_emb(self, GLOVE_DIR): embeddings_index = {} f = open(os.path.join(GLOVE_DIR, 'glove.840B.300d.txt'), 'rb') for line in f: values = line.split() word = values[0] coefs = np.asarray(values[1:], dtype='float32') embeddings_index[word.decode().lower()] = coefs f.close() return embeddings_index def __len__(self): return len(self.id2word) def load(self, word2id_path=None, id2word_path=None, word_emb_path=None): if word2id_path: with open(word2id_path, 'rb') as f: word2id = pickle.load(f) # Can't pickle lambda function self.word2id = defaultdict(lambda: UNK_ID) self.word2id.update(word2id) self.vocab_size = len(self.word2id) if id2word_path: with open(id2word_path, 'rb') as f: id2word = pickle.load(f) self.id2word = id2word if word_emb_path: with open(word_emb_path, 'rb') as f: embedding_matrix = pickle.load(f) self.embedding_matrix = embedding_matrix def add_word(self, word): assert isinstance(word, str), 'Input should be str' self.freqdist.update([word]) def add_sentence(self, sentence, tokenized=False): if not tokenized: sentence = self.tokenizer(sentence) for word in sentence: self.add_word(word) def add_dataframe(self, conversation_df, tokenized=True): for conversation in conversation_df: for sentence in conversation: self.add_sentence(sentence, tokenized=tokenized) def pickle(self, word2id_path, id2word_path, word_emb_path): with open(word2id_path, 'wb') as f: pickle.dump(dict(self.word2id), f) with open(id2word_path, 'wb') as f: pickle.dump(self.id2word, f) with open(word_emb_path, 'wb') as f: 
pickle.dump(self.embedding_matrix, f) def to_list(self, list_like): """Convert list-like containers to list""" if isinstance(list_like, list): return list_like if isinstance(list_like, Variable): return list(to_tensor(list_like).numpy()) elif isinstance(list_like, Tensor): return list(list_like.numpy()) def id2sent(self, id_list): """list of id => list of tokens (Single sentence)""" id_list = self.to_list(id_list) sentence = [] for id in id_list: word = self.id2word[id] if word not in [EOS_TOKEN, SOS_TOKEN, PAD_TOKEN]: sentence.append(word) if word == EOS_TOKEN: break return sentence def sent2id(self, sentence, var=False): """list of tokens => list of id (Single sentence)""" id_list = [self.word2id[word] for word in sentence] if var: id_list = to_var(torch.LongTensor(id_list), eval=True) return id_list def decode(self, id_list): sentence = self.id2sent(id_list) return ' '.join(sentence)
class Vocab(object): def __init__(self, tokenizer=None, max_size=None, min_freq=1): self.vocab_size = 0 self.freqdist = FreqDist() self.tokenizer = tokenizer self.pad_id = PAD_ID def update(self, max_size=None, min_freq=1): self.id2word = { PAD_ID: PAD_TOKEN, UNK_ID: UNK_TOKEN, SOS_ID: SOS_TOKEN, EOS_ID: EOS_TOKEN, SEP_ID: SEP_TOKEN, } self.word2id = defaultdict( lambda: UNK_ID) # Not in vocab => return UNK self.word2id.update({ PAD_TOKEN: PAD_ID, UNK_TOKEN: UNK_ID, SOS_TOKEN: SOS_ID, EOS_TOKEN: EOS_ID, SEP_TOKEN: SEP_ID, }) vocab_size = 5 min_freq = max(min_freq, 1) freqdist = self.freqdist.copy() special_freqdist = { token: freqdist[token] for token in [PAD_TOKEN, UNK_TOKEN, SOS_TOKEN, EOS_TOKEN, SEP_TOKEN] } freqdist.subtract(special_freqdist) sorted_frequency_counter = sorted(freqdist.items(), key=lambda k_v: k_v[0]) sorted_frequency_counter.sort(key=lambda k_v: k_v[1], reverse=True) for word, freq in sorted_frequency_counter: if freq < min_freq or vocab_size == max_size: break self.id2word[vocab_size] = word self.word2id[word] = vocab_size vocab_size += 1 self.vocab_size = vocab_size def __len__(self): return len(self.id2word) def load(self, word2id_path=None, id2word_path=None, ptb=False): if word2id_path: with open(word2id_path, 'rb') as f: word2id = pickle.load(f) self.word2id = defaultdict(lambda: UNK_ID) self.word2id.update(word2id) self.vocab_size = len(self.word2id) if id2word_path: with open(id2word_path, 'rb') as f: id2word = pickle.load(f) self.id2word = id2word if ptb: self.word2id['<sep>'] = self.vocab_size self.id2word[self.vocab_size] = '<sep>' self.vocab_size += 1 def add_word(self, word): assert isinstance(word, str), 'Input should be str' self.freqdist.update([word]) def add_sentence(self, sentence, tokenized=False): if not tokenized: sentence = self.tokenizer(sentence) for word in sentence: self.add_word(word) def add_dataframe(self, conversation_df, tokenized=True): for conversation in conversation_df: for sentence in conversation: self.add_sentence(sentence, tokenized=tokenized) def pickle(self, word2id_path, id2word_path): with open(word2id_path, 'wb') as f: pickle.dump(dict(self.word2id), f) with open(id2word_path, 'wb') as f: pickle.dump(self.id2word, f) def to_list(self, list_like): if isinstance(list_like, list): return list_like if isinstance(list_like, Variable): return list(to_tensor(list_like).numpy()) elif isinstance(list_like, Tensor): return list(list_like.numpy()) def id2sent(self, id_list): id_list = self.to_list(id_list) sentence = [] for id in id_list: word = self.id2word[id] if word not in [EOS_TOKEN, SOS_TOKEN, PAD_TOKEN]: sentence.append(word) if word == EOS_TOKEN: break return sentence def sent2id(self, sentence, var=False): id_list = [self.word2id[word] for word in sentence] if var: id_list = to_var(torch.LongTensor(id_list), eval=True) return id_list def decode(self, id_list): sentence = self.id2sent(id_list) return ' '.join(sentence)
in_str = sys.stdin.read(BUF_SIZE)
rest = ''
read_count = 0

while (rest + in_str).strip() != '':
    read_count += 1
    if read_count % 100 == 0:
        sys.stderr.write('.')
        sys.stderr.flush()

    tokens = (rest + in_str).split()
    rest = tokens.pop()

    if not tokens:
        vocab.update(rest)
        break
    else:
        vocab.update(tokens)

    in_str = sys.stdin.read(BUF_SIZE)

print

for i in [1000, 2000, 5000, 10000, 20000, 50000, 100000, 200000, 500000, 1000000]:
    if i > len(vocab.values()):
        break
    print "vocab size %7d - cutoff = %d" % (i, vocab.values()[i])
class PosNgram: def __init__(self, deg=1): self.order = deg self.__sentence = "" # storing tokens and frequency self.train_data = FreqDist() self.test_sents = None # to prevent from illegral argument if deg < 1: self.order = 1 def poses2tokens(self, pos_terms, include_freq=False, default_dict=None): """ # The token_terms must be the element of ngram_model # whose order is 1 smaller than that of the current one. """ if default_dict is None: default_dict = self.train_data for (tokens, poses), freq in default_dict.items(): if pos_terms == poses: yield tokens if\ not include_freq\ else (tokens, freq) def tokens2poses(self, token_terms, include_freq=False, default_dict=None): """ # The token_terms must be the element of ngram_model # whose order is 1 smaller than that of the current one. """ if default_dict is None: default_dict = self.train_data for (tokens, poses), freq in default_dict.items(): if token_terms == tokens: yield poses if\ not include_freq\ else (poses, freq) def pre_process(self, file_id, training_size=90): start_processing = time.time() self.train_data = FreqDist() sents = gutenberg.sents(file_id) t_size = floor((training_size / 100) * len(sents)) train_sents = sents[:t_size] self.test_sents = sents[t_size:] p_title = "file_id = <{}>, ngram's order = {}, split_ratio = {}-{}" print( p_title.format(file_id, self.order, training_size, 100 - training_size)) with ICB('Processing...', max=len(train_sents), suffix='%(percent)d%%') as bar: for sent in train_sents: bar.next() self.__sentence = " ".join(sent) self.train_data.update(self._token_pos_pairs) print('dict_size = {}'.format(self.train_data.B())) print("loading time = {}".format(time.time() - start_processing)) def _is_subcontent(self, w1, w2): assert len(w1) <= len(w2) w1 = list(w1) w2 = list(w2) for w in w1: if w not in w2: return False w2.remove(w) return True def fetch_if(self, cond, term, pos_is_target=True, include_pair=False): tmp_freq_dist = FreqDist() conditions = { ng_prefix: ["pos[:-1] == term", "token[:-1] == term"], ng_suffix: ["pos[-len(term):] == term", "token[-len(term):] == term"], ng_contain: [ "self._is_subcontent(term, pos)", "self._is_subcontent(term , token)" ], ng_equal: ["pos == term", "token == term"] } if cond not in conditions: cond = prefix # Fetching Choice Configuration p_key, t_key = "", "" if include_pair: p_key = "(pos, token)" t_key = "(token, pos)" else: p_key = "pos" t_key = "token" cmp_p = compile(p_key, '<string>', 'eval') cmp_t = compile(t_key, '<string>', 'eval') if pos_is_target: cmp_cond = compile(conditions[cond][0], '<string>', 'eval') for (token, pos), freq in self.train_data.items(): if eval(cmp_cond): tmp_freq_dist.update({eval(cmp_p): freq}) else: cmp_cond = compile(conditions[cond][1], '<string>', 'eval') for (token, pos), freq in self.train_data.items(): if eval(cmp_cond): tmp_freq_dist.update({eval(cmp_t): freq}) return tmp_freq_dist @property def _token_pos_pairs(self): """ This function maps terms to POS (The previous version's name was phi1) """ for elems in self._ngram_tokens_pos: poses = [elem[1] for elem in elems] tokens = [elem[0] for elem in elems] yield (tuple(tokens), tuple(poses)) @property def _sent2pos_tag(self): sent = self.__sentence tokens = word_tokenize(sent) return pos_tag(tokens) @property def _ngram_tokens_pos(self): # this returns the tuples of token pos pair return ngrams(self._sent2pos_tag, self.order)
def load_all_dic_token_bef_road_busstop(list_line, command):
    # load all the word of token before and after labeling, note that we do not consider if this token is a
    # number. In fact, we only consider if token contain all characters
    # Using only for "road" and "busstop"
    text = ''
    for i in range(0, len(list_line), 3):
        split_first = 0
        split_second = 0
        if i % 3 == 0:
            split_first = list_line[i].strip().split('\t')
        j = i + 1
        if j % 3 == 1:
            split_second = list_line[j].strip().split('\t')

        k = 0
        while True:
            if k >= len(split_second):
                break
            if command == 'road':  # get the token before labeling for road
                try:
                    if int(split_second[k]) == 2:  # detect this is a road => get the token before it
                        if k > 0:
                            token_bef = split_first[k - 1].lower()
                            if token_isAllCharacter(token_bef) is True:
                                text = text + connect_token(token_bef) + ' '  # take the word before
                        while True:
                            k += 1
                            if k == len(split_second):
                                break
                            else:
                                if int(split_second[k]) != 2:
                                    break
                    else:
                        k += 1
                except ValueError:
                    k += 1
            if command == 'busstop':  # get the token before labeling for busstop
                try:
                    if int(split_second[k]) == 3:  # detect this is a bus stop => get the token before it
                        if k > 0:
                            token_bef = split_first[k - 1].lower()
                            if token_isAllCharacter(token_bef) is True:
                                text = text + connect_token(token_bef) + ' '  # take the word before
                        while True:
                            k += 1
                            if k == len(split_second):
                                break
                            else:
                                if int(split_second[k]) != 3:
                                    break
                    else:
                        k += 1
                except ValueError:
                    k += 1

    fdist = FreqDist()
    tokens = word_tokenize(str(text))
    fdist.update(tokens)
    for value in fdist.most_common(len(fdist)):
        print value[0], '\t', value[1]

    list_return = list()
    for value in fdist.most_common(len(fdist)):
        list_return.append(value[0])
        print value[0]

    print len(fdist)
    return list_return
class WordExtractor:

    def __init__(self, pickle_file):
        self._statuses = pickle.load(open(pickle_file, 'rb'))
        self._averages = dict()
        self._gender_stats = dict()
        self.fdistneuro = FreqDist()
        self.fdistnonneuro = FreqDist()
        self.highneuro = defaultdict()
        self.highnonneuro = defaultdict()
        self.f = defaultdict(float)
        self.g = defaultdict(float)
        self.wordlist = []

    """
    Processes statuses.
    (For information on how the different data structures are set up,
    look at the comments for the getters.)
    """
    def wordprocess(self):
        lengths = dict()
        line = 0
        for status in self._statuses[1:]:
            line += 1
            print line
            user = status[0]
            filtered_status = status[1].translate(string.maketrans("", ""), string.punctuation)
            tokens = pattern_split.split(filtered_status.lower())
            # filter out stopwords and emoticons
            filtered_tokens = [w for w in tokens if w not in stopwordslist and w not in filterlist]

            if status[5] == '+':
                self.fdistneuro.update(filtered_tokens)
            elif status[5] == '-':
                self.fdistnonneuro.update(filtered_tokens)

    # returns most frequently used words by neurotic person
    def neuro_word_frequency(self):
        vocneuro = self.fdistneuro.keys()
        highvocneuro = vocneuro[:500]
        return highvocneuro

    # returns most frequently used words by non-neurotic person
    def nonneuro_word_frequency(self):
        vocnonneuro = self.fdistnonneuro.keys()
        highvocnonneuro = vocnonneuro[:500]
        return highvocnonneuro

    def highneuro_word_frequency(self):
        for w in self.neuro_word_frequency():
            self.highneuro[w] = self.fdistneuro[w]
        return self.highneuro.items()

    def highnonneuro_word_frequency(self):
        for w in self.nonneuro_word_frequency():
            self.highnonneuro[w] = self.fdistnonneuro[w]
        return self.highnonneuro.items()

    def select_word(self):
        # use float division; 184563/1780098 would truncate to 0 under Python 2
        nntn = float(184563) / float(1780098)
        ntnn = float(1780098) / float(184563)
        for w in self.highneuro.keys():
            if w in self.highnonneuro.keys():
                self.f[w] = int(self.highneuro[w] - self.highnonneuro[w] * nntn)
        print self.f.items()

        print "Start calculating non-neurotic words"
        for w in self.highnonneuro.keys():
            if w in self.highneuro.keys():
                self.g[w] = int(self.highnonneuro[w] - self.highneuro[w] * ntnn)
            else:
                print "False for %s" % (w)
        print self.g.items()

        for w in self.f.keys():
            if w in self.g.keys():
                if self.f[w] >= 2000 and self.g[w] <= 500:
                    self.wordlist.append(w)
        print "Here is the wordlist"
        print self.wordlist
        # return a list of words used relatively heavily by neurotic persons
        return self.wordlist
    tokens = [re.sub(r'[^A-Za-z]+', '', token) for token in tokens]
    tokens = [wn.lemmatize(token) for token in tokens]
    return tokens

text_tokens = []
for item in sentences[0:1000]:
    tokens = preprocess_text(item)
    temp = " ".join(tokens)
    text_tokens.append(temp)

from nltk import FreqDist

word_dist = FreqDist()
for s in text_tokens:
    word_dist.update(s.split())

########################################################################################

from nltk.util import ngrams
from collections import Counter

text = ''
for sent in text_tokens:
    text = text + sent

tokens = word_tokenize(text)
bigrams = ngrams(tokens, 2)
bigram_dict = dict(Counter(bigrams))
def load_all_dic_token_bef_aft_svc(list_line, command):
    # loading all token before and after for bus service
    # Using only for bus service, because for bus service we not only focus on the token before, but also the token
    # after labeling
    text = ''
    for i in range(0, len(list_line), 3):
        split_first = 0
        split_second = 0
        if i % 3 == 0:
            split_first = list_line[i].strip().split('\t')
        j = i + 1
        if j % 3 == 1:
            split_second = list_line[j].strip().split('\t')

        k = 0
        while True:
            if k >= len(split_second):
                break
            if command == 'bef_svc':  # get the token before labeling for bus svc
                try:
                    if int(split_second[k]) == 1:  # detect this is a svc => get the token before it
                        if k > 0:
                            token_bef = split_first[k - 1].lower()
                            if token_isAllCharacter(token_bef) is True:
                                text = text + connect_token(token_bef) + ' '  # take the word before
                        while True:
                            k += 1
                            if k == len(split_second):
                                break
                            else:
                                if int(split_second[k]) != 1:
                                    break
                    else:
                        k += 1
                except ValueError:
                    k += 1
            if command == 'aft_svc':
                try:
                    if int(split_second[k]) == 1:  # take bus svc
                        while True:
                            k += 1
                            if k == len(split_second):
                                break
                            else:
                                if int(split_second[k]) != 1:
                                    break
                        if k < len(split_second) - 1:
                            # take the token after the label
                            token_aft = split_first[k].lower()
                            if token_isAllCharacter(token_aft) is True:
                                text = text + connect_token(token_aft) + ' '
                    else:
                        k += 1
                except ValueError:
                    k += 1

    fdist = FreqDist()
    tokens = word_tokenize(str(text))
    fdist.update(tokens)
    for value in fdist.most_common(len(fdist)):
        print value[0], '\t', value[1]

    list_return = list()
    for value in fdist.most_common(len(fdist)):
        list_return.append(value[0])
        print value[0]

    print len(fdist)
    return list_return
class Vocab(object): def __init__(self, tokenizer=None, max_size=None, min_freq=1): """Basic Vocabulary object""" self.vocab_size = 0 self.freqdist = FreqDist() self.tokenizer = tokenizer def update(self, max_size=None, min_freq=1): """ Initialize id2word & word2id based on self.freqdist max_size include 4 special tokens """ # {0: '<pad>', 1: '<unk>', 2: '<sos>', 3: '<eos>'} self.id2word = { PAD_ID: PAD_TOKEN, UNK_ID: UNK_TOKEN, SOS_ID: SOS_TOKEN, EOS_ID: EOS_TOKEN } # {'<pad>': 0, '<unk>': 1, '<sos>': 2, '<eos>': 3} self.word2id = defaultdict( lambda: UNK_ID) # Not in vocab => return UNK self.word2id.update({ PAD_TOKEN: PAD_ID, UNK_TOKEN: UNK_ID, SOS_TOKEN: SOS_ID, EOS_TOKEN: EOS_ID }) # self.word2id = { # PAD_TOKEN: PAD_ID, UNK_TOKEN: UNK_ID, # SOS_TOKEN: SOS_ID, EOS_TOKEN: EOS_ID # } vocab_size = 4 min_freq = max(min_freq, 1) # Reset frequencies of special tokens # [...('<eos>', 0), ('<pad>', 0), ('<sos>', 0), ('<unk>', 0)] freqdist = self.freqdist.copy() special_freqdist = { token: freqdist[token] for token in [PAD_TOKEN, UNK_TOKEN, SOS_TOKEN, EOS_TOKEN] } freqdist.subtract(special_freqdist) # Sort: by frequency, then alphabetically # Ex) freqdist = { 'a': 4, 'b': 5, 'c': 3 } # => sorted = [('b', 5), ('a', 4), ('c', 3)] sorted_frequency_counter = sorted(freqdist.items(), key=lambda k_v: k_v[0]) sorted_frequency_counter.sort(key=lambda k_v: k_v[1], reverse=True) for word, freq in sorted_frequency_counter: if freq < min_freq or vocab_size == max_size: break self.id2word[vocab_size] = word self.word2id[word] = vocab_size vocab_size += 1 self.vocab_size = vocab_size def __len__(self): return len(self.id2word) def load(self, word2id_path=None, id2word_path=None): if word2id_path: with open(str(word2id_path), 'rb') as f: word2id = pickle.load(f) # Can't pickle lambda function self.word2id = defaultdict(lambda: UNK_ID) self.word2id.update(word2id) self.vocab_size = len(self.word2id) if id2word_path: with open(str(id2word_path), 'rb') as f: id2word = pickle.load(f) self.id2word = id2word def add_word(self, word): assert isinstance(word, str), 'Input should be str' self.freqdist.update([word]) def add_sentence(self, sentence, tokenized=False): if not tokenized: sentence = self.tokenizer(sentence) for word in sentence: self.add_word(word) def add_dataframe(self, conversation_df, tokenized=True): for conversation in conversation_df: for sentence in conversation: self.add_sentence(sentence, tokenized=tokenized) def pickle(self, word2id_path, id2word_path): with open(str(word2id_path), 'wb') as f: pickle.dump(dict(self.word2id), f) with open(str(id2word_path), 'wb') as f: pickle.dump(self.id2word, f) def to_list(self, list_like): """Convert list-like containers to list""" if isinstance(list_like, list): return list_like if isinstance(list_like, Variable): return list(to_tensor(list_like).numpy()) elif isinstance(list_like, Tensor): return list(list_like.numpy()) def id2sent(self, id_list): """list of id => list of tokens (Single sentence)""" id_list = self.to_list(id_list) sentence = [] for id in id_list: word = self.id2word[id] if word not in [EOS_TOKEN, SOS_TOKEN, PAD_TOKEN]: sentence.append(word) if word == EOS_TOKEN: break return sentence def sent2id(self, sentence, var=False): """list of tokens => list of id (Single sentence)""" id_list = [self.word2id[word] for word in sentence] if var: id_list = to_var(torch.LongTensor(id_list), eval=True) return id_list def decode(self, id_list): sentence = self.id2sent(id_list) return ' '.join(sentence)
if word_limit:
    logging.info('Word limit %d' % word_limit)

order = parse_ngram_order(opts.ngram_order)
logging.info('Char n-gram order (%d, %d)' % order)

cutoff = opts.min_count

corpus = SublexicalizedCorpus(WikiCorpus(dump_fn, dictionary=Dictionary()),
                              order=order, word_limit=word_limit)

tf = FreqDist()
df = FreqDist()
n_docs = 0

for text in corpus:
    n_docs += 1
    tf.update(text)
    df.update(set(text))

print "###TOTAL###\t%d\t%d" % (tf.N(), n_docs)

for token, freq in tf.items():
    if freq < cutoff:
        break
    print "%s\t%d\t%d\t%.6f" % (token, freq, df[token],
                                math.log(float(n_docs) / df[token]))