def create_logistic_model(df, _story_ids, data):
    # Assumes module-level names: labels, C_VALUE, LOGISTIC_MODEL_NAME,
    # make_time_filename, and dplython's X / sift.
    results = []
    lr_models = {}
    for label in set(labels):
        positive_story_ids = set(df >> sift(X["labels"] == label) >> X.story_id.values)
        y_ = np.array([s in positive_story_ids for s in _story_ids])
        X_ = data
        lr = linear_model.LogisticRegression(C=C_VALUE)
        logging.info("%s %s", label, Counter(y_))
        cv_score = cross_validation.cross_val_score(
            lr, X_, y_, cv=10, scoring="roc_auc").mean()
        lr = lr.fit(X_, y_)
        lr_models[label] = lr
        probs = lr.predict_proba(X_)[:, 1]
        results.append({"alg": "log reg", "label": label, "auc": cv_score})
        logging.info("%s %s %s %s %s", C_VALUE, label, cv_score,
                     len(probs[probs > 0.19]), Counter(labels == label))
    logging.info("")
    results_df = pd.DataFrame(results)
    lr_fname = make_time_filename(LOGISTIC_MODEL_NAME, ".pkl")
    logging.info("writing file %s", lr_fname)
    with open(lr_fname, "wb") as f:
        pickle.dump(lr_models, f, protocol=2)
    return results_df
def get_features(self, question, context):
    stop = set() if self.stop_words is None else self.stop_words.words
    context_features = np.zeros((len(context), 3))
    if not self.require_unique_match:
        question_words = set(x for x in question if x.lower() not in stop)
        question_words_lower = set(x.lower() for x in question)
        question_words_stem = set(
            self.lemmatize_word(x) for x in question_words_lower)
    else:
        # Only keep words that occur exactly once in the question
        question_words = set(k for k, v in Counter(question).items() if v == 1)
        question_words_lower = set(k for k, v in Counter(
            x.lower() for x in question_words).items() if v == 1)
        question_words_stem = set(k for k, v in Counter(
            self.lemmatize_word(x) for x in question_words_lower).items() if v == 1)
    for i, word in enumerate(context):
        if word in question_words:
            context_features[i][:3] = 1   # exact match sets all three features
        elif word.lower() in question_words_lower:
            context_features[i][:2] = 1   # lowercase match sets the first two
        elif self._lemmatizer.lemmatize(word) in question_words_stem:
            context_features[i][2] = 1    # lemma match sets only the last one
    if self.empty_question_features:
        return np.zeros((len(question), 3)), context_features
    else:
        return np.zeros((len(question), 0)), context_features
def llt(query):
    print(colored(
        "*****************************************************************************************"
        "********************************************************************************************",
        color='magenta'))
    public_tweets = get_tweets(query)
    location = {}
    language = {}
    time_zone = {}
    for tweet in public_tweets['statuses']:
        loc = tweet['user']['location']
        lang = tweet['user']['lang']
        tz = tweet['user']['time_zone']
        location[loc] = location.get(loc, 0) + 1
        language[lang] = language.get(lang, 0) + 1
        time_zone[tz] = time_zone.get(tz, 0) + 1
    # Drop missing (None / empty-string) keys before ranking
    for counts in (location, language, time_zone):
        counts.pop(None, None)
        counts.pop('', None)
    language_count = dict(Counter(language).most_common(4))
    print(colored("language: ", color='green', attrs=['bold']))
    print(language_count)
    location_count = dict(Counter(location).most_common(4))
    print(colored("locations: ", color='green', attrs=['bold']))
    print(location_count)
    time_zone_count = dict(Counter(time_zone).most_common(4))
    print(colored("Time Zone: ", color='green', attrs=['bold']))
    print(time_zone_count)
    print(colored(
        "*****************************************************************************************"
        "********************************************************************************************",
        color='magenta'))
def llt(query):
    public_tweets = get_tweets(query)
    location = {}
    language = {}
    time_zone = {}
    for tweet in public_tweets['statuses']:
        loca = tweet['user']['location']
        lang = tweet['user']['lang']
        time_zone1 = tweet['user']['created_at']  # note: this counts created_at values, not time zones
        location[loca] = location.get(loca, 0) + 1
        language[lang] = language.get(lang, 0) + 1
        time_zone[time_zone1] = time_zone.get(time_zone1, 0) + 1
    # limiting the display of the values: drop missing (None / empty-string) keys
    for counts in (location, language, time_zone):
        counts.pop(None, None)
        counts.pop('', None)
    language_count = dict(Counter(language).most_common(5))
    print(colored("Language: ", color='green', attrs=['bold']))
    print(language_count)
    location_count = dict(Counter(location).most_common(5))
    print(colored("Location: ", color='green', attrs=['bold']))
    print(location_count)
    time_zone_count = dict(Counter(time_zone).most_common(5))
    print(colored("Time Zone: ", color='green', attrs=['bold']))
    print(time_zone_count)
def process_document(self, documents):
    tokenizer = data.load('tokenizers/punkt/english.pickle')  # note: punkt is a sentence tokenizer
    lemmatizer = WordNetLemmatizer()
    stopwords = corpus.stopwords.words('english')
    tf = []   # term frequency (one Counter per document)
    idf = {}  # token -> inverse document frequency
    tokens_list_doc_wise = []
    all_tokens = set()
    for document in documents:
        tokens = tokenizer.tokenize(document)
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
        tokens = [token for token in tokens if token not in stopwords]
        tokens_list_doc_wise.append(tokens)
        tf.append(Counter(tokens))
        all_tokens.update(tokens)  # union() returns a new set; update() modifies in place
    # calculating idf
    for token in all_tokens:
        present_in_documents = 0
        for x in range(0, len(documents)):
            present_in_documents += 1 if tf[x][token] > 0 else 0
        idf[token] = math.log(len(documents) / present_in_documents)
    # calculating tf_idf for tokens document wise
    tf_idf = []
    for x in range(0, len(tokens_list_doc_wise)):
        doc_tf_idf = {}
        for token in tokens_list_doc_wise[x]:
            doc_tf_idf[token] = tf[x][token] * idf[token]
        tf_idf.append(doc_tf_idf)
    return tf_idf
def train(self, trainset):
    X = []
    y = []
    self.tfidf = TfidfVectorizer()
    for sent in trainset:
        self.tfidfWordList.append(sent['target_word'])
        for item in sent['target_word'].split(" "):
            self.wordList.append(item)
    # tf model
    self.tfList = Counter(self.wordList)
    counts = np.array([item for item in self.tfList.values()])  # avoid shadowing the built-in max()
    self.maxNumber = np.max(counts)
    # tfidf model
    weightTfidf = self.tfidf.fit_transform(self.tfidfWordList).toarray()
    zeroVector = np.zeros(len(weightTfidf[0]))
    for item in weightTfidf:
        itemVector = np.array(item)
        zeroVector += itemVector
    self.tfidfResult = dict(zip(self.tfidf.get_feature_names(), zeroVector))
    # self.tfidfResult = {key: value for key, value in self.tfidf.vocabulary_.items()}
    # self.normal = np.max(np.array([item for item in self.tfidfResult.values()]))
    # print(self.tfList)
    for sent in trainset:
        X.append(self.extract_features(sent['target_word']))
        y.append(sent['gold_label'])
    self.model.fit(X, y)
    title = "TF+TFIDF " + self.language.capitalize()
    self.plot_learning_curve(self.model, title, X, y)
def modified_precision(candidate, references, n):
    candidate_counter = Counter(get_ngrams(candidate, n))
    if not candidate_counter:
        return 0
    # For each candidate n-gram, the maximum number of times it appears in any single reference
    max_reference_counter = {}
    for reference in references:
        reference_counter = Counter(get_ngrams(reference, n))
        for ngram in candidate_counter:
            max_reference_counter[ngram] = max(
                max_reference_counter.get(ngram, 0), reference_counter[ngram])
    # Clip each candidate count by its maximum reference count
    clipped_counter = dict(
        (ngram, min(candidate_count, max_reference_counter[ngram]))
        for ngram, candidate_count in candidate_counter.items())
    return sum(clipped_counter.values()) / sum(candidate_counter.values())
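# A quick, hedged sanity check for modified_precision above, using the classic
# clipping example from the BLEU paper. get_ngrams is assumed to behave like
# nltk.util.ngrams, so it is stubbed with that here.
from nltk.util import ngrams as get_ngrams

candidate = 'the the the the the the the'.split()
references = ['the cat is on the mat'.split(),
              'there is a cat on the mat'.split()]
print(modified_precision(candidate, references, 1))  # clipped count 2 over 7 candidate unigrams ≈ 0.286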
def td_idf(text_array, neg_word_occ, pos_word_occ):
    for i in text_array:
        word_map = Counter(i)
        for wd in word_map:
            word_map[wd] = log(len(text_array) /
                               (neg_word_occ[wd] + pos_word_occ[wd]), 10) * word_map[wd] / len(i)
        print(word_map)
        break  # only the first document is printed
def countWords(self):
    filteredWords = cleanWords(self.contents)
    counts = Counter(filteredWords)
    wordFreq = dict()
    # Convert to relative frequency
    for c in counts:
        wordFreq[c] = counts[c] / len(filteredWords)
    return wordFreq
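# Minimal, hedged illustration of the relative-frequency idea used in countWords,
# assuming cleanWords() would return a flat list of tokens like this one.
from collections import Counter

tokens = ["spam", "ham", "spam", "eggs"]
freqs = {word: count / len(tokens) for word, count in Counter(tokens).items()}
print(freqs)  # {'spam': 0.5, 'ham': 0.25, 'eggs': 0.25}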
def td_idf_to_vec(dataset, dim, neg_word_occ, pos_word_occ):
    # Assumes each element of `val` is an integer feature index < dim,
    # since it is used directly to index the row vector.
    result = zeros((len(dataset), dim))
    for i, val in enumerate(dataset):
        word_map = Counter(val)
        for wd in val:
            print(word_map)
            result[i][wd] = log(len(dataset) /
                                (neg_word_occ[wd] + pos_word_occ[wd]), 10) * word_map[wd] / len(val)
    return result
def ngrams_over_file(sourcefile, n, targetfile):
    data_set = read_file(sourcefile)
    split_it = data_set.split()
    split_it = list(filter(lambda x: x not in stopwords, split_it))
    split_it = list(filter(lambda x: len(x) > 2, split_it))
    counter = Counter(split_it)
    if n > 1:
        grams = ngrams(split_it, n)
        counter = Counter(grams)
    most_occur = counter.most_common(20)
    with open(targetfile, 'w') as f:
        for occur in most_occur:
            if n == 1:
                f.write(occur[0] + "|" + str(occur[1]) + "\n")
            else:
                f.write(' '.join(map(str, occur[0])) + "|" + str(occur[1]) + "\n")
    print(most_occur)
def getsimiliar(self, word, text):
    T = Text(text)
    word_context_index = ContextIndex(T.tokens,
                                      filter=lambda x: x.isalpha(),
                                      key=lambda s: s.lower())
    word = word.lower()
    wci = word_context_index._word_to_contexts
    words = []
    if word in wci.conditions():
        contexts = set(wci[word])
        fd = Counter(w for w in wci.conditions()
                     for c in wci[w]
                     if c in contexts and not w == word)
        words = [w for w, _ in fd.most_common(20)]
    return words
def cleaning(sentence: str, method: str, frequency: int = 0, percentage: int = 0):
    """
    :param sentence: definition to clean
    :param method: name of the post-processing function to call (looked up via globals())
    :param frequency: if greater than 0, minimum number of occurrences a word must have
    :param percentage: percentage of the most frequent words to keep
    :return: Counter(word -> frequency) of the cleaned sentence
    """
    tokenized: Counter = rm_stopwords_punctuation(sentence)
    tokenized = utility.remove_number_key(tokenized, minimum=1950, maximum=2030)
    if len(tokenized) <= 0:
        return Counter()
    elif frequency > 0:
        # Keep only words with at least `frequency` occurrences,
        # relaxing the threshold until something survives
        filtered = dict(filter(lambda x: x[1] >= frequency, tokenized.items()))
        i = 1
        while len(filtered) <= 0:
            filtered = dict(
                filter(lambda x: x[1] >= frequency - i, tokenized.items()))
            i += 1
        return globals()[method](Counter(filtered))
    # If a percentage is defined take the first elements (based on percentage), otherwise take everything
    elif percentage > 0:
        percentage = int((percentage / 100) * len(tokenized))
        most_common = tokenized.most_common(percentage)
        tokenized = Counter(
            dict(
                filter(lambda elem: elem[0] in dict(most_common).keys(),
                       tokenized.items())))
    return globals()[method](tokenized)
def rm_stopwords_punctuation(sentence: str, language="english", stamp=False) -> Counter:
    tokens = word_tokenize(sentence)
    if len(tokens) > 0:
        tokens[0] = tokens[0].lower()
    sentence = Counter(tokens)
    stopwords_list = set(stopwords.words(language))
    stop_punctuation = stopwords_list.union(resources.punctuation).union(
        resources.ambiguous)
    filtered = utility.filter_by_set(sentence, stop_punctuation)
    if stamp:
        print("---Removing Stopwords---")
        print("Stopwords in", language, ":", stopwords_list)
        print("Sentence with stopwords and punctuation removed:\n", filtered)
    return filtered
def Topusage():
    import re
    new_tweets = api.user_timeline(screen_name='@narendramodi',
                                   count=200,
                                   tweet_mode='extended')
    for tweet in new_tweets:
        # print(tweet.full_text)
        words = re.sub(r"http\S+", " ", tweet.full_text)  # strip URLs
        word = words.split()
        word1 = [w for w in word if w not in stop_words]  # drop stopwords before counting
        num = Counter(word1).most_common(10)
        print(num)
namedEnt.draw()

# 6. Bag of Words model - most common words (DataCamp)
from nltk import Counter
from nltk.corpus import stopwords

# so far we already have `text`, let's do a quick BoW model:
alpha_tokens_lower = [w.lower() for w in nltk.word_tokenize(text) if w.isalpha()]
no_stops = [w for w in alpha_tokens_lower if w not in stopwords.words('english')]
len(alpha_tokens_lower) - len(no_stops)  # how many stopwords were removed
words_count = Counter(no_stops)
words_count.most_common(17)

# 7. Bag of Words model (Udemy):
# we already have the text:
import re
dataset = nltk.sent_tokenize(text)
# lowercase, collapse non-alphanumeric characters to a single space
for i in range(len(dataset)):
    dataset[i] = dataset[i].lower()
    dataset[i] = re.sub(r'\W', ' ', dataset[i])
    dataset[i] = re.sub(r'\s+', ' ', dataset[i])
def count_words(cls, items: list):
    counter = Counter(items)
    return counter.most_common(5)
def _token_counter(self):
    """Return the counts of all tokens."""
    return Counter([word for doc in self.tokens for word in doc])
def lemmer(tokens) -> Counter:
    lemmed = Counter()
    for k in tokens.keys():
        lemmed.update({lemmatizer.lemmatize(k): tokens[k]})
    return lemmed
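# Hedged usage sketch for lemmer above, assuming the module-level `lemmatizer`
# it relies on is an NLTK WordNetLemmatizer as shown here.
from collections import Counter
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
print(lemmer(Counter({"cats": 2, "geese": 1, "cat": 1})))
# Counter({'cat': 3, 'goose': 1})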
            semantic_values = stringate_value(hypernom_subj, hypernom_dobj)
            semantic_type.append(semantic_values)
    return semantic_type, sentences_analyzed


if __name__ == '__main__':
    verbs_bf = ['build', 'love', 'eat']
    for verb_base_form in verbs_bf:
        sentences = get_sentences_with_verb(verb_base_form)
        print('*' * 50)
        print('\nCurrent verb base form : {}\n'.format(verb_base_form))
        semantic_cluster, sentences_analyzed = get_semantic_cluster(sentences, verb_base_form)
        print('------ End extraction-----------')
        # Print stats
        sts_semantic_cluster = Counter(semantic_cluster)
        common_semantic_cluster = sts_semantic_cluster.most_common(5)
        plot_result(common_semantic_cluster, verb_base_form)
        print('\nAnalyzed {} sentences\nFor the verb in base form : {} the pairs of semantic types are:\n'
              .format(sentences_analyzed, verb_base_form))
        for s in sts_semantic_cluster:
            print('\t< {} > Count {} '.format(s, sts_semantic_cluster[s]))
        print('*' * 50)
        print('\n\n\n')
def statistics(self, trainset):
    for sent in trainset:
        self.wordNumber += len(
            re.sub(r"[^\w']", " ", sent['sentence']).split())
        self.wordbackup += re.sub(r"[^\w']", " ", sent['sentence']).split()
    self.wordCounter = Counter(self.wordbackup)
def _modified_precision(candidate, references, n):
    """Calculate modified ngram precision.

    The normal precision method may lead to some wrong translations with
    high-precision, e.g., the translation, in which a word of reference
    repeats several times, has very high precision. So in the modified
    n-gram precision, a reference word will be considered exhausted after
    a matching candidate word is identified.

    Paper examples:

    >>> _modified_precision(
    ...     'the the the the the the the'.split(),
    ...     ['the cat is on the mat'.split(), 'there is a cat on the mat'.split()],
    ...     n=1,
    ... )
    0.28...

    >>> _modified_precision(
    ...     'the the the the the the the'.split(),
    ...     ['the cat is on the mat'.split(), 'there is a cat on the mat'.split()],
    ...     n=2,
    ... )
    0.0

    >>> _modified_precision(
    ...     'of the'.split(),
    ...     [
    ...         'It is a guide to action that ensures that the military will forever heed Party commands.'.split(),
    ...         'It is the guiding principle which guarantees the military forces always being under the command of the Party.'.split(),
    ...         'It is the practical guide for the army always to heed the directions of the party'.split(),
    ...     ],
    ...     n=1,
    ... )
    1.0

    >>> _modified_precision(
    ...     'of the'.split(),
    ...     [
    ...         'It is a guide to action that ensures that the military will forever heed Party commands.'.split(),
    ...         'It is the guiding principle which guarantees the military forces always being under the command of the Party.'.split(),
    ...         'It is the practical guide for the army always to heed the directions of the party'.split(),
    ...     ],
    ...     n=2,
    ... )
    1.0

    More examples:

    >>> weights = [0.25, 0.25, 0.25, 0.25]
    >>> candidate1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
    ...               'ensures', 'that', 'the', 'military', 'always',
    ...               'obeys', 'the', 'commands', 'of', 'the', 'party']
    >>> candidate2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
    ...               'forever', 'hearing', 'the', 'activity', 'guidebook',
    ...               'that', 'party', 'direct']
    >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...               'ensures', 'that', 'the', 'military', 'will', 'forever',
    ...               'heed', 'Party', 'commands']
    >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...               'guarantees', 'the', 'military', 'forces', 'always',
    ...               'being', 'under', 'the', 'command', 'of', 'the',
    ...               'Party']
    >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...               'army', 'always', 'to', 'heed', 'the', 'directions',
    ...               'of', 'the', 'party']

    Unigrams:

    >>> _modified_precision(
    ...     candidate1,
    ...     [reference1, reference2, reference3],
    ...     n=1,
    ... )
    0.94...

    >>> _modified_precision(
    ...     candidate2,
    ...     [reference1, reference2, reference3],
    ...     n=1,
    ... )
    0.57...

    Bigrams:

    >>> _modified_precision(
    ...     candidate1,
    ...     [reference1, reference2, reference3],
    ...     n=2,
    ... )
    0.58...

    >>> _modified_precision(
    ...     candidate2,
    ...     [reference1, reference2, reference3],
    ...     n=2,
    ... )
    0.07...

    """
    counts = Counter(ngrams(candidate, n))
    if not counts:
        return 0
    max_counts = {}
    for reference in references:
        reference_counts = Counter(ngrams(reference, n))
        for ngram in counts:
            max_counts[ngram] = max(max_counts.get(ngram, 0),
                                    reference_counts[ngram])
    clipped_counts = dict((ngram, min(count, max_counts[ngram]))
                          for ngram, count in counts.items())
    return sum(clipped_counts.values()) / sum(counts.values())
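# Hedged sketch (not part of the snippet above) of how these modified precisions
# are typically combined into a BLEU-style score with the `weights` from the
# docstring; `_bleu_like` is a hypothetical helper and the brevity penalty is omitted.
import math

def _bleu_like(candidate, references, weights=(0.25, 0.25, 0.25, 0.25)):
    precisions = [_modified_precision(candidate, references, n)
                  for n in range(1, len(weights) + 1)]
    if min(precisions) == 0:
        return 0.0
    # Weighted geometric mean of the n-gram precisions
    return math.exp(sum(w * math.log(p) for w, p in zip(weights, precisions)))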
import pandas
import vincent
from nltk import Counter

from DB import db
from Preprocessing import preprocess, stop

count = 0
db = db()
allTweets = db.getAll()
count_all_hashtags = Counter()
count_all_terms = Counter()
dates_hashtag = []
for tweet in allTweets:
    tweetText = tweet['text'].lower()
    # Bigrams list
    termsWithoutStopwords = [
        term for term in preprocess(tweetText) if term not in stop
    ]
    # termsBigrams = bigrams(termsWithoutStopwords)
    # Hashtags list
    terms_hash = [
        term for term in preprocess(tweetText) if term.startswith('#')
    ]
    if '#marchfortruth' in terms_hash:
        dates_hashtag.append(tweet['created_at'])
    # Update the counter(s)
def stemmer(tokens) -> Counter:
    stemmed = Counter()
    for k in tokens.keys():
        stemmed.update({stemmatizer.stem(k): tokens[k]})
    return stemmed
def get_features(text, setting):
    if setting == 'bow':
        return {word: count for word, count in Counter(preprocess(text)).items()
                if word not in stoplist}
    else:
        return {word: True for word in preprocess(text) if word not in stoplist}
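# Hedged usage sketch for get_features above. preprocess() and stoplist are
# hypothetical stand-ins here; the real helpers are defined elsewhere.
from collections import Counter

stoplist = {"the", "on", "a"}

def preprocess(text):
    return text.lower().split()

print(get_features("The cat sat on the mat", 'bow'))     # {'cat': 1, 'sat': 1, 'mat': 1}
print(get_features("The cat sat on the mat", 'binary'))  # {'cat': True, 'sat': True, 'mat': True}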
# Count words in data_split: if the word is already in the wordcount dictionary
# increment its count, otherwise add it with a count of 1 (frequency counting).
for item in data_split:
    if item in wordcount:
        wordcount[item] += 1
    else:
        wordcount[item] = 1

qstring = "I think I will get the best score in the class"
qstring_split = qstring.split()
qstring_dict = {}
for word in qstring_split:
    if word in qstring_dict:
        qstring_dict[word] += 1
    else:
        qstring_dict[word] = 1

# count bigrams in the text file
from nltk import Counter
data_bi = Counter(nltk.bigrams(data_split))
q_bi = Counter(nltk.bigrams(qstring_split))

# estimate the probability of each query bigram as its query count divided by its corpus count
biprob_list = []
for item in q_bi:
    if item in data_bi:
        biprob_list.append(q_bi[item] / data_bi[item])
    else:
        bi_prob = 0  # query bigram not seen in the corpus

# multiply the individual bigram probabilities together
total_prob = 1
for prob in biprob_list:
    total_prob = total_prob * prob