def get_wikifrequencies(candidate_keywords):
    """Return the normalized word frequency for each keyword in Wikipedia."""
    max_frequency = wikiwords.freq('the')
    return [wikiwords.freq(w) / float(max_frequency) for w in candidate_keywords]
def compute_features(d_dict, q_dict, c_dict):
    # in_q, in_c, lemma_in_q, lemma_in_c, tf
    q_words_set = set([w.lower() for w in q_dict['words']])
    in_q = [int(w.lower() in q_words_set and not is_stopword(w) and not is_punc(w))
            for w in d_dict['words']]
    c_words_set = set([w.lower() for w in c_dict['words']])
    in_c = [int(w.lower() in c_words_set and not is_stopword(w) and not is_punc(w))
            for w in d_dict['words']]
    q_words_set = set([w.lower() for w in q_dict['lemma']])
    lemma_in_q = [int(w.lower() in q_words_set and not is_stopword(w) and not is_punc(w))
                  for w in d_dict['lemma']]
    c_words_set = set([w.lower() for w in c_dict['lemma']])
    lemma_in_c = [int(w.lower() in c_words_set and not is_stopword(w) and not is_punc(w))
                  for w in d_dict['lemma']]
    tf = [0.1 * math.log(wikiwords.N * wikiwords.freq(w.lower()) + 10) for w in d_dict['words']]
    tf = [float('%.2f' % v) for v in tf]
    d_words = Counter(filter(lambda w: not is_stopword(w) and not is_punc(w), d_dict['words']))
    from conceptnet import concept_net
    p_q_relation = concept_net.p_q_relation(d_dict['words'], q_dict['words'])
    p_c_relation = concept_net.p_q_relation(d_dict['words'], c_dict['words'])
    assert len(in_q) == len(in_c) and len(lemma_in_q) == len(in_q) and \
        len(lemma_in_c) == len(in_q) and len(tf) == len(in_q)
    assert len(tf) == len(p_q_relation) and len(tf) == len(p_c_relation)
    return {
        'in_q': in_q,
        'in_c': in_c,
        'lemma_in_q': lemma_in_q,
        'lemma_in_c': lemma_in_c,
        'tf': tf,
        'p_q_relation': p_q_relation,
        'p_c_relation': p_c_relation,
    }
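# A hedged call sketch for compute_features above. The token dicts are
# illustrative (made up for this example); the call assumes the same
# is_stopword/is_punc helpers plus the wikiwords and conceptnet modules used
# inside the function are importable.
example_d = {'words': ['The', 'sun', 'heats', 'the', 'ocean'],
             'lemma': ['the', 'sun', 'heat', 'the', 'ocean']}
example_q = {'words': ['What', 'heats', 'the', 'ocean', '?'],
             'lemma': ['what', 'heat', 'the', 'ocean', '?']}
example_c = {'words': ['the', 'sun'], 'lemma': ['the', 'sun']}
example_feats = compute_features(example_d, example_q, example_c)
# Each feature list is aligned with d_dict['words'], e.g.:
# example_feats['in_q'] -> one 0/1 flag per passage token
# example_feats['tf']   -> one smoothed log-frequency value per passage token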
def wikifrequency(text):
    """Map each whitespace-separated word in `text` to its Wikipedia frequency."""
    freqs = {}
    for word in text.split():
        if word not in freqs:
            freqs[word] = wikiwords.freq(word)
    return freqs
def get_term_frequency(word: Union[str, Token]) -> float:
    """
    Return the term frequency of `word` in the Wikipedia corpus.

    Calculated as:  tf_w = log(1 + f_w)
    where f_w is the number of occurrences of the word in the corpus.
    """
    if isinstance(word, Token):
        word = word.text
    # I'd like to use wikiwords.occ instead of this, but it's broken,
    # so I recover the occurrence count as N * freq (since freq = occ / N).
    occurrences = wikiwords.N * wikiwords.freq(word)
    return math.log(1 + occurrences)
def get_tfidf(sentence):
    """
    Calculate the weight of each word in the sentence from pretrained tf-idf statistics.

    sentence -- a list of strings
    Returns a list of scalars.
    """
    tfidf_ = []
    for w in sentence:
        try:
            tfidf = 0.1 * math.log(wikiwords.N * wikiwords.freq(w.lower()) + 10)
            tfidf = float('%.2f' % tfidf)
        except Exception:
            logger.warning('{} - failed to get tfidf'.format(w.lower()))
            tfidf = 0.0
        tfidf_.append(tfidf)
    return tfidf_
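# Note on the two transforms above: get_term_frequency uses log(1 + N*freq),
# while get_tfidf (and the 'tf' feature in the compute_features snippets) uses
# the scaled, smoothed variant 0.1 * log(N*freq + 10). A minimal side-by-side
# sketch, assuming wikiwords.N and wikiwords.freq behave as used throughout;
# the sample words are illustrative.
import math
import wikiwords

def _raw_tf(word):
    return math.log(1 + wikiwords.N * wikiwords.freq(word))

def _scaled_tf(word):
    return round(0.1 * math.log(wikiwords.N * wikiwords.freq(word) + 10), 2)

for _w in ('the', 'frequency', 'zyzzyva'):
    print(_w, _raw_tf(_w), _scaled_tf(_w))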
def compute_features(q_dict, c_dict):
    # in_c, lemma_in_c, tf
    c_words_set = set([w.lower() for w in c_dict['words']])
    in_c = [
        int(w.lower() in c_words_set and not is_stopword(w) and not is_punc(w))
        for w in q_dict['words']
    ]
    c_words_set = set([w.lower() for w in c_dict['lemma']])
    lemma_in_c = [
        int(w.lower() in c_words_set and not is_stopword(w) and not is_punc(w))
        for w in q_dict['lemma']
    ]
    # tf = [0.1 * math.log(wikiwords.N * wikiwords.freq(w.lower()) + 10) for w in q_dict['words']]
    tf = [wikiwords.freq(w.lower()) for w in q_dict['words']]
    # tf = [float('%.2f' % v) for v in tf]
    q_words = Counter(
        filter(lambda w: not is_stopword(w) and not is_punc(w), q_dict['words']))
    from conceptnet import concept_net
    q_c_relation = concept_net.p_q_relation(q_dict['words'], c_dict['words'])
    assert len(lemma_in_c) == len(in_c) and len(tf) == len(in_c)
    assert len(tf) == len(q_c_relation)
    q_is_science_term = [is_science_term(w) for w in q_dict['words']]
    q_is_cand = [
        1 if not is_punc(w) and not is_stopword(w) else 0
        for w in q_dict['words']
    ]
    return {
        'in_c': in_c,
        'lemma_in_c': lemma_in_c,
        'tf': tf,
        'q_c_relation': q_c_relation,
        'q_is_science_term': q_is_science_term,
        'q_is_cand': q_is_cand
    }
def get_likelihood_of_string(string, avg_frequency=False):
    """
    Score a candidate segmentation of text (e.g. a string that was originally
    missing spaces between words) by summing the Wikipedia frequency of each
    space-separated word; higher scores mean more plausible English.

    In:
        string (str): string of words separated by spaces
        avg_frequency (bool): whether to return the average frequency per word
            instead of the sum
    Out:
        sum_frequency (float): likelihood of the string
    """
    sum_frequency = 0
    list_of_words = string.split(" ")
    for word in list_of_words:
        sum_frequency += wikiwords.freq(word.lower())
    if avg_frequency:
        sum_frequency = sum_frequency / len(list_of_words)
    return sum_frequency
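# Illustrative usage of get_likelihood_of_string (the candidate strings below
# are made up): the average-frequency flag makes scores comparable across
# candidates with different numbers of words.
for _candidate in ("the cat sat", "thec at sat", "thecatsat"):
    print(_candidate,
          get_likelihood_of_string(_candidate),
          get_likelihood_of_string(_candidate, avg_frequency=True))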
def compute_features(p_dict, q_dict, c_dict):
    # p_in_q, p_in_c, lemma_p_in_q, lemma_p_in_c, tf
    p_words_set = set([w.lower() for w in p_dict['words']])
    q_words_set = set([w.lower() for w in q_dict['words']])
    c_words_set = set([w.lower() for w in c_dict['words']])
    p_in_q = [int(w.lower() in q_words_set and not is_stopword(w) and not is_punc(w)) for w in p_dict['words']]
    p_in_c = [int(w.lower() in c_words_set and not is_stopword(w) and not is_punc(w)) for w in p_dict['words']]
    q_in_p = [int(w.lower() in p_words_set and not is_stopword(w) and not is_punc(w)) for w in q_dict['words']]
    q_in_c = [int(w.lower() in c_words_set and not is_stopword(w) and not is_punc(w)) for w in q_dict['words']]
    c_in_p = [int(w.lower() in p_words_set and not is_stopword(w) and not is_punc(w)) for w in c_dict['words']]
    c_in_q = [int(w.lower() in q_words_set and not is_stopword(w) and not is_punc(w)) for w in c_dict['words']]
    p_words_set = set([w.lower() for w in p_dict['lemma']])
    q_words_set = set([w.lower() for w in q_dict['lemma']])
    c_words_set = set([w.lower() for w in c_dict['lemma']])
    p_lemma_in_q = [int(w.lower() in q_words_set and not is_stopword(w) and not is_punc(w)) for w in p_dict['lemma']]
    p_lemma_in_c = [int(w.lower() in c_words_set and not is_stopword(w) and not is_punc(w)) for w in p_dict['lemma']]
    q_lemma_in_p = [int(w.lower() in p_words_set and not is_stopword(w) and not is_punc(w)) for w in q_dict['lemma']]
    q_lemma_in_c = [int(w.lower() in c_words_set and not is_stopword(w) and not is_punc(w)) for w in q_dict['lemma']]
    c_lemma_in_p = [int(w.lower() in p_words_set and not is_stopword(w) and not is_punc(w)) for w in c_dict['lemma']]
    c_lemma_in_q = [int(w.lower() in q_words_set and not is_stopword(w) and not is_punc(w)) for w in c_dict['lemma']]
    p_tf = [0.1 * math.log(wikiwords.N * wikiwords.freq(w.lower()) + 10) for w in p_dict['words']]
    p_tf = [float('%.2f' % v) for v in p_tf]
    q_tf = [0.1 * math.log(wikiwords.N * wikiwords.freq(w.lower()) + 10) for w in q_dict['words']]
    q_tf = [float('%.2f' % v) for v in q_tf]
    c_tf = [0.1 * math.log(wikiwords.N * wikiwords.freq(w.lower()) + 10) for w in c_dict['words']]
    c_tf = [float('%.2f' % v) for v in c_tf]
    d_words = Counter(filter(lambda w: not is_stopword(w) and not is_punc(w), p_dict['words']))
    from conceptnet import concept_net
    p_q_relation = concept_net.p_q_relation(p_dict['words'], q_dict['words'])
    p_c_relation = concept_net.p_q_relation(p_dict['words'], c_dict['words'])
    q_p_relation = concept_net.p_q_relation(q_dict['words'], p_dict['words'])
    q_c_relation = concept_net.p_q_relation(q_dict['words'], c_dict['words'])
    c_p_relation = concept_net.p_q_relation(c_dict['words'], p_dict['words'])
    c_q_relation = concept_net.p_q_relation(c_dict['words'], q_dict['words'])
    assert len(p_tf) == len(p_q_relation) and len(p_tf) == len(p_c_relation)
    assert len(q_tf) == len(q_p_relation) and len(q_tf) == len(q_c_relation)
    assert len(c_tf) == len(c_p_relation) and len(c_tf) == len(c_q_relation)
    return {
        'p_in_q': p_in_q, 'p_in_c': p_in_c,
        'p_lemma_in_q': p_lemma_in_q, 'p_lemma_in_c': p_lemma_in_c,
        'p_tf': p_tf, 'p_q_relation': p_q_relation, 'p_c_relation': p_c_relation,
        'q_in_p': q_in_p, 'q_in_c': q_in_c,
        'q_lemma_in_p': q_lemma_in_p, 'q_lemma_in_c': q_lemma_in_c,
        'q_tf': q_tf, 'q_p_relation': q_p_relation, 'q_c_relation': q_c_relation,
        'c_in_p': c_in_p, 'c_in_q': c_in_q,
        'c_lemma_in_p': c_lemma_in_p, 'c_lemma_in_q': c_lemma_in_q,
        'c_tf': c_tf, 'c_p_relation': c_p_relation, 'c_q_relation': c_q_relation,
    }
def screen_show(self, num, answer):
    if self.mode == "1":
        os.system("cls")
        print("%s\n\n" % self.vacab["Word"][num])
        print("1.%s\n" % self.vacab["Chinese"][answer["1"]])
        print("2.%s\n" % self.vacab["Chinese"][answer["2"]])
        print("3.%s\n" % self.vacab["Chinese"][answer["3"]])
        print("4.%s\n" % self.vacab["Chinese"][answer["4"]])
    if self.mode == "2":
        os.system("cls")
        print("Question No.%s\n\n" % (self._counter - self._start))
        print("%s\n\n" % self.vacab["Word"][num])
        print("%s\n\n" % self.vacab["Chinese"][answer])
        print("1.Easy 2.Hard 3. Hell q.Quit\n")
        if self._counter >= self._end:
            print("You can finish the study now!\n")
    if self.mode == "3":
        os.system("cls")
        print("Question No.%s\n\n" % (self._counter - self._start))
        print("%s\n\n" % self.vacab["Word"][num])
        try:
            statistic = (wikiwords.freq(self.vacab["Word"][num]),
                         wikiwords.occ(self.vacab["Word"][num]))
        except Exception as e:
            statistic = (0, 0)
        print("Freq:%-10.2eOcc:%-10.2e\n" % statistic)
        #print("%s\n\n" % self.vacab["Chinese"][answer])
        print("1.Easy 2.Hard 3. Hell q.Quit\n")
        if self._counter >= self._end:
            print("You can finish the study now!\n")
    if self.mode == "4":
        os.system("cls")
        print("Question No.%s\n\n" % (self._counter - self._start))
        print("%s\n\n" % self.vacab["Word"][num])
        try:
            statistic = (wikiwords.freq(self.vacab["Word"][num]),
                         wikiwords.occ(self.vacab["Word"][num]))
        except Exception as e:
            statistic = (0, 0)
        print("Freq:%-10.2eOcc:%-10.2e\n" % statistic)
        #print("%s\n\n" % self.vacab["Chinese"][answer])
        print("1.Easy 2.Hard 3. Hell q.Quit\n")
def compute_features(d_dicts, q_dict, c_dicts, q_terms):
    # compute features for each d_dict and c_dict
    in_qs, in_cs, lemma_in_qs, lemma_in_cs = [], [], [], []
    p_q_relations, p_c_relations = [], []
    tfs = []
    for d_dict, c_dict in zip(d_dicts, c_dicts):
        # in_q, in_c, lemma_in_q, lemma_in_c, tf
        q_words_set = set([w.lower() for w in q_dict['words']])
        in_q = [
            int(w.lower() in q_words_set and not is_stopword(w) and not is_punc(w))
            for w in d_dict['words']
        ]
        in_qs.append(in_q)
        q_words_set = set([w.lower() for w in q_dict['lemma']])
        lemma_in_q = [
            int(w.lower() in q_words_set and not is_stopword(w) and not is_punc(w))
            for w in d_dict['lemma']
        ]
        lemma_in_qs.append(lemma_in_q)
        c_words_set = set([w.lower() for w in c_dict['words']])
        in_c = [
            int(w.lower() in c_words_set and not is_stopword(w) and not is_punc(w))
            for w in d_dict['words']
        ]
        in_cs.append(in_c)
        c_words_set = set([w.lower() for w in c_dict['lemma']])
        lemma_in_c = [
            int(w.lower() in c_words_set and not is_stopword(w) and not is_punc(w))
            for w in d_dict['lemma']
        ]
        lemma_in_cs.append(lemma_in_c)
        tf = [
            0.1 * math.log(wikiwords.N * wikiwords.freq(w.lower()) + 10)
            for w in d_dict['words']
        ]
        tf = [float('%.2f' % v) for v in tf]
        tfs.append(tf)
        #d_words = Counter(filter(lambda w: not is_stopword(w) and not is_punc(w), d_dict['words']))
        from conceptnet import concept_net
        p_q_relation = concept_net.p_q_relation(d_dict['words'], q_dict['words'])
        p_q_relations.append(p_q_relation)
        p_c_relation = concept_net.p_q_relation(d_dict['words'], c_dict['words'])
        p_c_relations.append(p_c_relation)
        assert len(in_q) == len(in_c) and len(lemma_in_q) == len(in_q) and \
            len(lemma_in_c) == len(in_q) and len(tf) == len(in_q)
        assert len(tf) == len(p_q_relation) and len(tf) == len(p_c_relation)
    if q_terms is not None:
        q_es = [True if w in q_terms else False for w in q_dict['words']]
    else:
        q_es = None
    # update in_c, lemma_in_c and p_c_relation
    return {
        'in_qs': in_qs,
        'in_cs': in_cs,
        'lemma_in_qs': lemma_in_qs,
        'lemma_in_cs': lemma_in_cs,
        'tfs': tfs,
        'p_q_relations': p_q_relations,
        'p_c_relations': p_c_relations,
        'q_es': q_es
    }
def idf_wiki(token):
    """Compute an IDF score from a lookup table of word frequencies in the Wikipedia corpus."""
    if wikiwords.freq(token) == 0:
        # unseen token: treat it as occurring once, i.e. idf = log(N / 1)
        return math.log(wikiwords.N)
    else:
        # idf = log(N / occurrences) = -log(relative frequency),
        # so rarer tokens receive higher scores
        return -math.log(wikiwords.freq(token))
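# Quick behavioural check (the word list is illustrative): idf_wiki, like the
# normalized get_idf defined further down, decreases as corpus frequency
# increases; the two just use different reference points (corpus size N versus
# the frequency of 'the').
for _w in ('the', 'entropy', 'zyzzyva'):
    print(_w, idf_wiki(_w))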
def add_post(self, text, metainfo):
    """
    Extract the word frequencies of `text` and create a Post.
    TODO: extract related words

    Args:
        text (str): main text of the post
        metainfo (dictionary): title, number of likes and shares, etc.
    """
    text = text.encode('ascii', 'ignore')
    # sentiment analysis
    t = TextBlob(text)
    metainfo["polarity"] = t.sentiment.polarity
    metainfo["subjectivity"] = t.sentiment.subjectivity
    vader = vaderSentiment(text)
    metainfo["vader"] = vader
    text = text.translate(string.maketrans("", ""), string.punctuation)
    # removing stop words
    stop = stopwords.words('english')
    # frequency in the English language
    english_freq = {}
    reverse_stem = {}
    list_words = []
    for i in text.split():
        i = i.lower()
        if i not in stop:
            freq = wikiwords.freq(i, lambda x: 0.000001)
            st = self.stemmer.stem(i)
            if st not in reverse_stem:
                reverse_stem[st] = i
            if st in english_freq:
                english_freq[st] += freq
            else:
                english_freq[st] = freq
            list_words.append(st)
    # get frequencies of words
    frequencies = FreqDist(list_words)
    main_words = []
    for word, count in frequencies.items():
        english_freq[word] /= count
        #print reverse_stem[word] + ": " + str(english_freq[word])
        tf_idf = (-1.0) * log(count + 0.1) / log(english_freq[word])
        main_words.append([tf_idf, count, reverse_stem[word], word])
    # select just the most important words
    main_words.sort(reverse=True)
    NUM_MAX = 100
    if len(main_words) > NUM_MAX:
        main_words = main_words[0:NUM_MAX]
    # create dict
    final_main_words = {}
    for w in main_words:
        final_main_words[w[3]] = w[0:3]
    self.last_id_added += 1
    id = self.last_id_added
    post = Post(id, self.name, metainfo, frequencies, final_main_words, reverse_stem)
    post.save()
    self.update_index(list_words, frequencies)
    self.save()
    return post
def Pw(word):
    return wikiwords.freq(word, avoid_long_words)
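# Pw looks like the unigram probability used in Norvig-style word segmentation,
# with avoid_long_words as the smoothing fallback for out-of-vocabulary strings
# (wikiwords.freq accepts a fallback callable, as also seen in add_post above).
# The definition below is an assumption modeled on Norvig's recipe, not taken
# from this snippet's source:
def avoid_long_words(key):
    # penalize long unseen strings so a segmenter prefers splitting them
    # into shorter, known words
    return 10.0 / (wikiwords.N * 10 ** len(key))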
import json
import string

import wikiwords
import unicodedata
import numpy as np
from collections import Counter
from nltk.corpus import stopwords

words = frozenset(stopwords.words('english'))
punc = frozenset(string.punctuation)

def is_stopword(w):
    return w.lower() in words

def is_punc(c):
    return c in punc

baseline = wikiwords.freq('the')

def get_idf(w):
    return np.log(baseline / (wikiwords.freq(w.lower()) + 1e-10))

def load_data(path, scriptKnowledge, use_script_knowledge, use_char_emb):
    from doc import Example
    data = []
    for line in open(path, 'r', encoding='utf-8'):
        if path.find('race') < 0 or np.random.random() < 0.6:
            data.append(Example(json.loads(line), scriptKnowledge,
                                use_script_knowledge, use_char_emb))
    print('Load %d examples from %s...' % (len(data), path))
    return data

class Dictionary(object):
    NULL = '<NULL>'
    UNK = '<UNK>'
def get_idf(w):
    return np.log(baseline / (wikiwords.freq(w.lower()) + 1e-10))
def compute_features(d_dict, q_dict, c_dict, d_id, q_id, c_id, graphs, sentence_graphs):
    # in_q, in_c, lemma_in_q, lemma_in_c, tf
    q_words_set = set([w.lower() for w in q_dict['words']])
    in_q = [
        int(w.lower() in q_words_set and not is_stopword(w) and not is_punc(w))
        for w in d_dict['words']
    ]
    c_words_set = set([w.lower() for w in c_dict['words']])
    in_c = [
        int(w.lower() in c_words_set and not is_stopword(w) and not is_punc(w))
        for w in d_dict['words']
    ]
    q_words_set = set([w.lower() for w in q_dict['lemma']])
    lemma_in_q = [
        int(w.lower() in q_words_set and not is_stopword(w) and not is_punc(w))
        for w in d_dict['lemma']
    ]
    c_words_set = set([w.lower() for w in c_dict['lemma']])
    lemma_in_c = [
        int(w.lower() in c_words_set and not is_stopword(w) and not is_punc(w))
        for w in d_dict['lemma']
    ]
    tf = [
        0.1 * math.log(wikiwords.N * wikiwords.freq(w.lower()) + 10)
        for w in d_dict['words']
    ]
    tf = [float('%.2f' % v) for v in tf]
    d_words = Counter(
        filter(lambda w: not is_stopword(w) and not is_punc(w), d_dict['words']))
    four_lang_utils = Utils()
    p_q_four_lang_relation = compute_4lang_relation(graphs, four_lang_utils, d_dict, q_dict)
    p_c_four_lang_relation = compute_4lang_relation(graphs, four_lang_utils, d_dict, c_dict)
    q_c_four_lang_relation = compute_4lang_relation(graphs, four_lang_utils, q_dict, c_dict)
    p_q_four_lang_sentence_relation = \
        compute_4lang_sentence_relation(sentence_graphs[d_id],
                                        sentence_graphs[d_id]["questions"][q_id],
                                        four_lang_utils)
    p_c_four_lang_sentence_relation = \
        compute_4lang_sentence_relation(sentence_graphs[d_id],
                                        sentence_graphs[d_id]["questions"][q_id]["choice"][c_id],
                                        four_lang_utils)
    q_c_four_lang_sentence_relation = \
        compute_4lang_sentence_relation(sentence_graphs[d_id]["questions"][q_id],
                                        sentence_graphs[d_id]["questions"][q_id]["choice"][c_id],
                                        four_lang_utils)
    from conceptnet import concept_net
    p_q_relation = concept_net.p_q_relation(d_dict['words'], q_dict['words'])
    p_c_relation = concept_net.p_q_relation(d_dict['words'], c_dict['words'])
    assert len(in_q) == len(in_c) and len(lemma_in_q) == len(in_q) and \
        len(lemma_in_c) == len(in_q) and len(tf) == len(in_q)
    assert len(tf) == len(p_q_relation) and len(tf) == len(p_c_relation)
    return {
        'in_q': in_q,
        'in_c': in_c,
        'lemma_in_q': lemma_in_q,
        'lemma_in_c': lemma_in_c,
        'tf': tf,
        'p_q_relation': p_q_relation,
        'p_c_relation': p_c_relation,
        'p_q_four_lang_relation': p_q_four_lang_relation,
        'p_c_four_lang_relation': p_c_four_lang_relation,
        'q_c_four_lang_relation': q_c_four_lang_relation,
        'p_q_four_lang_sentence_relation': p_q_four_lang_sentence_relation,
        'p_c_four_lang_sentence_relation': p_c_four_lang_sentence_relation,
        'q_c_four_lang_sentence_relation': q_c_four_lang_sentence_relation
    }