import collections

import spacy
from tqdm import tqdm


def grammar_dependencies_count(headlines, bodies):
    # Note: spacy.load('en') and the n_threads argument assume the older spaCy 2.x API.
    parser = spacy.load('en')
    grammar_counts = {}
    print("starting parser")
    # tagsDict = {k: v for v, k in enumerate(parser.pipe_labels['parser'])}
    tagsDict = parser.pipe_labels['parser']
    for i, doc in enumerate(parser.pipe(bodies, batch_size=1000, n_threads=4)):
        counts = collections.Counter()
        for w in doc:
            counts[w.dep_] += 1
        ssum = sum(counts.values())
        for k, v in counts.items():
            counts[k] = counts[k] / ssum
        grammar_counts[i] = counts
    rv = list(range(len(bodies)))
    print("starting lists")
    for i, b in tqdm(enumerate(bodies)):
        try:
            rv[i] = []
            for k in tagsDict:
                if k in grammar_counts[i]:
                    rv[i].append(grammar_counts[i][k])
                else:
                    rv[i].append(0)
        except Exception as e:
            # Occasionally the way spaCy processes unusual characters (bullet points,
            # em dashes) causes the lookup based on the original characters to fail.
            # In that case, just fall back to an empty value.
            print("Error in GrammarTransformer, setting to None")
            # print(text)
            rv[i] = {}
            continue
    return rv

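# Hedged illustration (not part of the original): the normalisation step above in
# isolation, turning raw dependency-label counts into per-document proportions with
# collections.Counter. Toy labels only, no spaCy model required.
import collections

deps = ['nsubj', 'ROOT', 'det', 'dobj', 'dobj']
counts = collections.Counter(deps)
total = sum(counts.values())
proportions = {label: n / total for label, n in counts.items()}
print(proportions)  # {'nsubj': 0.2, 'ROOT': 0.2, 'det': 0.2, 'dobj': 0.4}
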
import collections

import requests
from bs4 import BeautifulSoup
from django.http import JsonResponse
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


def wiki_test(request, word1, word2):
    page = requests.get("https://en.wikipedia.org/wiki/" + word1)
    soup = BeautifulSoup(page.content, features="html.parser")
    word_tokenized1 = []
    content = soup.find_all('p')
    size = len(content)
    for i in range(size):
        word_tokenized1.append(word_tokenize(content[i].get_text()))
    filtered_words1 = []
    for tokens in word_tokenized1:
        for word in tokens:
            if word not in stopwords.words('english'):
                if word.isalpha():
                    filtered_words1.append(word)
    most_common_words1 = collections.Counter(filtered_words1).most_common(100)
    # ---------------------------------------------------------
    page = requests.get("https://en.wikipedia.org/wiki/" + word2)
    soup = BeautifulSoup(page.content, features="html.parser")
    word_tokenized2 = []
    content = soup.find_all('p')
    size = len(content)
    for i in range(size):
        word_tokenized2.append(word_tokenize(content[i].get_text()))
    filtered_words2 = []
    for tokens in word_tokenized2:
        for word in tokens:
            if word not in stopwords.words('english'):
                if word.isalpha():
                    filtered_words2.append(word)
    most_common_words2 = collections.Counter(filtered_words2).most_common(100)
    len2 = 0
    i = 0
    x = 0
    for w, c in most_common_words2:
        len2 += c
        if w == word1:
            x = i
        i += 1
    return JsonResponse({'s': (most_common_words2[x][1] / len2) * 100})

import collections
import sys

from nltk.tokenize import sent_tokenize


def contexts(input, mode, keyword):
    sys.stdout = open(
        './01_out/' + input.split('_')[0] + '_cont_' + keyword + '_' + mode + '.txt', 'w')
    f = open('./00_data/' + input, encoding=mode)
    token = sent_tokenize(f.read())
    f.close()
    data = [i.split(' ') for i in token]
    pairs = []
    for i in data:
        for j in range(1, len(i) - 1):
            if i[j].lower() == keyword.lower():
                pairs.append(i[j - 1] + '_' + i[j + 1])
    print(collections.Counter(pairs))

import collections


def get_word_counts(config, from_timestamp, to_timestamp, count):
    posts = get_posts_from_range(config, from_timestamp, to_timestamp)
    wordcount = {}
    for post in posts:
        words = get_cleaned_text(post['text']).split()
        for word in words:
            if word not in wordcount:
                wordcount[word] = 1
            else:
                wordcount[word] += 1
    word_counter = collections.Counter(wordcount)
    words = []
    for word in word_counter.most_common(count):
        words.append({'word': word[0], 'count': word[1]})
    return words

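# Side-by-side sketch (assumption, not the project's code): collections.Counter can
# build the same word counts directly from the token stream, without the manual
# dict bookkeeping used above.
import collections


def get_word_counts_sketch(texts, count):
    word_counter = collections.Counter(
        word for text in texts for word in text.split())
    return [{'word': w, 'count': c} for w, c in word_counter.most_common(count)]


print(get_word_counts_sketch(["a b a", "b c"], 2))
# [{'word': 'a', 'count': 2}, {'word': 'b', 'count': 2}]
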
import collections


def build_dictionary(sentences, vocabulary_size):
    # Split each sentence into words and flatten into a single word list.
    split_sentences = [s.split() for s in sentences]
    words = [x for sublist in split_sentences for x in sublist]
    # count holds (word, word_count) pairs; the first entry is the unknown-word
    # placeholder ('RARE', -1).
    count = [('RARE', -1)]
    # Count word frequencies with collections.Counter.
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    # Map each word to an index.
    # Note: the original author built the index by accumulating the dictionary
    # length, which is slower and harder to follow; replaced with direct enumeration.
    word_dict = {x: i for i, (x, _) in enumerate(count)}
    # word_dict = {}
    # for word, word_count in count:
    #     word_dict[word] = len(word_dict)
    return word_dict

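# Hypothetical usage sketch for build_dictionary (example data made up here):
# index 0 is always the 'RARE' placeholder, then words in descending frequency.
sentences = ["the cat sat on the mat", "the dog sat"]
print(build_dictionary(sentences, vocabulary_size=4))
# {'RARE': 0, 'the': 1, 'sat': 2, 'cat': 3}
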
def calculate_and_store_types(self):
    for tweet in self.tweets:
        text = nltk.word_tokenize(tweet)
        liste = nltk.pos_tag(text)
        for word_and_type in liste:
            self.types.append(word_and_type[1])
    self.types_and_numbers_list = list(collections.Counter(self.types).items())
    feature_list = ['VBZ', ',', 'CD', 'JJS', 'WDT', 'VBP', '#', 'PRP$', 'JJR']
    # feature_list = [',', 'CD', 'VBP', '#', 'PRP$']
    for j in feature_list:
        self.func(j)
    self.valid_types_and_numbers.sort(key=lambda x: x[0])
    for i in self.valid_types_and_numbers:
        self.features_array.append(i[1])
    ############## ADDING MY OWN EXTRACTED WORDS AS FEATURES ##############
    # avg = int(sum(self.features_array) / len(self.features_array))
    for word in commonly_used_words_by_males:
        if word in self.words:
            self.features_array.append(Counter(self.words).get(word))
        else:
            self.features_array.append(0)
    for word in words_of_technology:
        if word in self.words:
            self.features_array.append(Counter(self.words).get(word))
        else:
            self.features_array.append(0)
    for word in commonly_used_words_by_females:
        if word in self.words:
            self.features_array.append(Counter(self.words).get(word))  # okay
        else:
            self.features_array.append(0)

def sentence_vector(self, sentence: str, sample=None) -> ndarray:
    tokens = [
        token for token in tokenizer.tokenize(sentence)
        if token.isprintable() and token not in _stop_words
    ]
    if sample:
        tokens = list(everygrams(tokens, 1, 1))
        # .sort(key=lambda x: x[1])[:10]
        tokens = [
            word[0] for word, count in collections.Counter(tokens).most_common(sample)
        ]
    key = hashlib.md5(sentence.encode('utf-8')).hexdigest()
    if self.redis is not None and self.redis.exists(key):
        vector = Embeddings.load_vector_from_cache(key, self.redis)
    else:
        vector = self.arithmetic_mean_bow_embedding(tokens)
        if self.redis is not None:
            Embeddings.cache_vector(key, vector, self.redis)
    return vector

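# Hedged sketch of the sampling branch above in isolation: keep only the `sample`
# most frequent unigrams using nltk.everygrams plus collections.Counter (toy tokens,
# no tokenizer, stop words, or Redis cache involved).
import collections

from nltk.util import everygrams

tokens = ['to', 'be', 'or', 'not', 'to', 'be']
unigrams = list(everygrams(tokens, 1, 1))  # [('to',), ('be',), ('or',), ...]
top = [gram[0] for gram, _ in collections.Counter(unigrams).most_common(2)]
print(top)  # ['to', 'be']
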
total = vocabsize['aggregations']['counts']['value']
print("%s\t%s\t%s\t%s" % (genre['key'], unq / total, total, unq))

for genre in res['aggregations']['genres']['buckets']:
    curr_gnere = genre["key"]
    genre_songs_list = []
    songs_by_genre = {
        "query": {
            "match": {
                "album.genre": curr_gnere
            }
        },
        "size": NUM_OF_SONGS
    }
    res = es.search(index=ES_INDEX, doc_type=ES_TYPE, body=songs_by_genre)
    for song in res["hits"]["hits"]:
        lyrics = song["_source"]["lyrics"]
        tokens = nltk.word_tokenize(lyrics)
        filtered_tokens = [
            token for token in tokens if token not in stopwords.words('english')
        ]
        tokens_counter = collections.Counter(filtered_tokens)
        tuple_list = list(tokens_counter.items())
        genre_songs_list.append(tuple_list)
    with open(
            '/home/omri/Dev/Python/IntroToDS/Data/pickle_list/' + curr_gnere + '.pickle',
            'wb') as handle:
        pickle.dump(genre_songs_list, handle, protocol=pickle.HIGHEST_PROTOCOL)

def generator(self, spell, sentence, call):
    if call == 5:
        return sentence
    if sentence == "" and self.n > 1:
        index = random.randrange(len(self.smootedTable) - 1)
        heceler = list(list(self.smootedTable.items())[index])[0]
        sentence = ''.join([str(elem) for elem in heceler])
        newSpell = []
        if self.n > 4:
            newSpell.append(heceler[len(heceler) - 4])
        if self.n > 3:
            newSpell.append(heceler[len(heceler) - 3])
        if self.n > 2:
            newSpell.append(heceler[len(heceler) - 2])
        newSpell.append(heceler[len(heceler) - 1])
        call += 1
        return self.generator(newSpell, sentence, call)
    elif self.n == 1:
        founded = collections.Counter(self.smootedTable).most_common(5)
        for i in founded:
            sentence += i[0]
        return sentence
    else:
        newSpell = []
        founded = dict()
        if self.n == 2:
            for key in self.smootedTable:
                if key[0] == spell[0]:
                    founded[key] = self.smootedTable[key]
        elif self.n == 3:
            for key in self.smootedTable:
                if key[0] == spell[0] and key[1] == spell[1]:
                    founded[key] = self.smootedTable[key]
        elif self.n == 4:
            for key in self.smootedTable:
                if key[0] == spell[0] and key[1] == spell[1] and key[2] == spell[2]:
                    founded[key] = self.smootedTable[key]
        else:
            for key in self.smootedTable:
                if (key[0] == spell[0] and key[1] == spell[1]
                        and key[2] == spell[2] and key[3] == spell[3]):
                    founded[key] = self.smootedTable[key]
        founded = collections.Counter(founded).most_common(1)
        if len(founded) == 0:
            return sentence
        found = founded[0]
        start = self.n - 1
        sentence += str(found[0][len(found[0]) - 1])
        if self.n > 4:
            newSpell.append(found[0][len(found[0]) - 4])
        if self.n > 3:
            newSpell.append(found[0][len(found[0]) - 3])
        if self.n > 2:
            newSpell.append(found[0][len(found[0]) - 2])
        newSpell.append(found[0][len(found[0]) - 1])
        call += 1
        return self.generator(newSpell, sentence, call)

def setNGramTable(self):
    self.nGramTable = collections.Counter(self.nGram)

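# Hedged sketch of what setNGramTable produces, assuming self.nGram is an iterable
# of n-gram tuples (e.g. built with nltk.ngrams); not taken from the original class.
import collections

from nltk import ngrams

n_gram = list(ngrams(['a', 'b', 'a', 'b', 'c'], 2))
table = collections.Counter(n_gram)
print(table.most_common(1))  # [(('a', 'b'), 2)]
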
def topKFrequent2(self, nums: List[int], k: int) -> List[int]:
    from nltk import collections
    counts = collections.Counter(nums)
    print(counts.keys())
    return heapq.nlargest(k, counts.keys(), key=counts.get)

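# Standalone sketch of the same top-k pattern using only the standard library
# (no class wrapper, no nltk re-export); example input made up for illustration.
import collections
import heapq


def top_k_frequent(nums, k):
    counts = collections.Counter(nums)
    return heapq.nlargest(k, counts.keys(), key=counts.get)


print(top_k_frequent([1, 1, 1, 2, 2, 3], 2))  # [1, 2]
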
df['token'] = df['token'].apply(
    lambda x: [item for item in x if item not in sw])
df['tokenstring'] = [' '.join(map(str, l)) for l in df['token']]
lists = df['token']
row_list = []
no_of_lists_per_name = Counter(chain.from_iterable(map(set, lists)))
for name, no_of_lists in no_of_lists_per_name.most_common():
    if no_of_lists == 1:
        break  # since it is ordered by count, once we get this low we are done
    row_list.append([name, no_of_lists])
df_cat_1 = pd.DataFrame(row_list, columns=['cat', 'cat_count'])
# print(Counter(list(ngrams(df['token'], 2))))
counts = collections.Counter()  # or nltk.FreqDist()
# for sent in df['token']:
#     counts.update(nltk.ngrams(sent, 2))
# print(counts)
for sent in df['token']:
    counts.update(" ".join(n) for n in nltk.ngrams(sent, 2))
df_cat_2 = pd.DataFrame.from_records(counts.most_common(),
                                     columns=['cat', 'cat_count'])
df_cat = df_cat_1.append(df_cat_2)
df = pd.merge(df, df_cat, left_on=["tokenstring"], right_on="cat")
df.to_csv('temp.csv', encoding='utf-8-sig', index=False)
df = df.groupby(['category']).agg({

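# Minimal sketch of the bigram-counting step above on a toy token list (assumption:
# df['token'] holds lists of word tokens like this one); no DataFrame required.
import collections

import nltk

sent = ['new', 'york', 'city', 'new', 'york']
bigram_counts = collections.Counter(" ".join(gram) for gram in nltk.ngrams(sent, 2))
print(bigram_counts.most_common(1))  # [('new york', 2)]
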
import collections


def find_tf_string(token_list):
    weight_dict = collections.Counter(token_list)
    tf_dict = {}
    for word, weight in weight_dict.items():
        tf_dict[word] = weight / len(token_list)
    return tf_dict

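# Usage sketch for find_tf_string on a toy token list (illustrative input only):
# each value is the token's count divided by the total number of tokens.
print(find_tf_string(['to', 'be', 'or', 'not', 'to', 'be']))
# {'to': 0.333..., 'be': 0.333..., 'or': 0.166..., 'not': 0.166...}
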
print("Topic Stats") print(get_data_frame_stats(df)) train, test = train_test_split(df, random_state=42, test_size=0.33, shuffle=True) X_train = train.text.apply(lambda text: get_cleaned_text(text)) X_test = test.text.apply(lambda text: get_cleaned_text(text)) wordcount = {} for text in X_train.append(X_test): for word in text.split(): if word not in wordcount: wordcount[word] = 1 else: wordcount[word] += 1 word_counter = collections.Counter(wordcount) n_print = int(input("How many most common words to print: ")) for word, count in word_counter.most_common(n_print): print(count, word) # MultiLabelBinarizer().fit_transform(train) start = time.time() train, test = train_test_split(df, random_state=42, test_size=0.33, shuffle=True) X_train = train.text.apply(lambda text: get_cleaned_text(text)) X_test = test.text.apply(lambda text: get_cleaned_text(text)) NB_pipeline = Pipeline([ ('tfidf', TfidfVectorizer()), # ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1)), # ('clf', LinearSVC()),