def __init__(self, freq_fpath, min_freq=1, preprocess=True, sep='\t', strip_pos=True, use_pickle=True):
    """Read a word frequency list in CSV format "word<TAB>freq".

    The result is stored in ``self._freq`` as a ``word -> freq`` dictionary.
    If ``freq_fpath`` does not exist, ``self._freq`` is an empty dictionary.

    Args:
        freq_fpath: path to the CSV file; it must have "word" and "freq" columns.
        min_freq: rows whose "freq" is below this value are dropped.
        preprocess: if true, the file is first passed through
            ``preprocess_pandas_csv`` into a temporary "<freq_fpath>-cln" file,
            which is read and then removed.
        sep: field separator handed to ``read_csv``.
        strip_pos: if true, everything after the first "#" of a word is cut
            off and the highest frequency among the collapsed words is kept;
            otherwise the "word" and "freq" cells are stored as they are.
        use_pickle: if true, "<freq_fpath>.pkl" is used as a cache: it is
            loaded when present and (re)written after parsing the CSV.
    """
    if not exists(freq_fpath):
        self._freq = {}
        return

    pkl_fpath = freq_fpath + ".pkl"
    if use_pickle and exists(pkl_fpath):
        # NOTE: unpickling executes arbitrary code; the cache file must only
        # ever be one written by this class, never an externally supplied file.
        with open(pkl_fpath, "rb") as pkl_file:
            voc = pickle.load(pkl_file)
        loaded_fpath = pkl_fpath
    else:
        # load words to dataframe
        if preprocess:
            freq_cln_fpath = freq_fpath + "-cln"
            preprocess_pandas_csv(freq_fpath, freq_cln_fpath)
            try:
                word_df = read_csv(freq_cln_fpath, sep, encoding='utf8', error_bad_lines=False)
            finally:
                # the cleaned copy is only a temporary file: drop it even if parsing fails
                try_remove(freq_cln_fpath)
        else:
            word_df = read_csv(freq_fpath, sep, encoding='utf8', error_bad_lines=False)

        # load from dataframe to dictionary
        word_df = word_df.drop(word_df[word_df["freq"] < min_freq].index)
        if strip_pos:
            voc = {}
            for _, row in word_df.iterrows():
                try:
                    word = unicode(row["word"]).split("#")[0]
                    freq = int(row["freq"])
                    # several "word#POS" rows may collapse to one word: keep the largest count
                    if word not in voc or voc[word] < freq:
                        voc[word] = freq
                except Exception:
                    # a malformed row is reported and skipped, it does not abort loading
                    print("Bad row: %s" % (row,))
                    print(format_exc())
        else:
            voc = {row["word"]: row["freq"] for _, row in word_df.iterrows()}

        print("dictionary is loaded: %d" % len(voc))

        if use_pickle:
            with open(pkl_fpath, "wb") as pkl_file:
                pickle.dump(voc, pkl_file)
            print("Pickled voc: %s" % pkl_fpath)
        loaded_fpath = freq_fpath

    # report the file the data was actually read from
    print("Loaded %d words from: %s" % (len(voc), loaded_fpath))

    self._freq = voc
def fill_direct_isas(self, subphrases=False):
    """Add direct-hypernymy feature columns to ``self._relations`` and save them.

    For every resource in ``self._isas`` and every term in ``self.voc`` the
    hypernyms returned by ``all_hyper`` are restricted to the vocabulary.
    Three weights are kept per (hyponym, hypernym) pair: the raw frequency,
    the frequency divided by the term's highest in-vocabulary frequency, and
    the frequency divided by the frequency of the first ``all_hyper`` entry.

    Columns written, where ``<s>`` is "_s" if ``subphrases`` else "":
        hypo2hyper_<isa><s>, hyper2hypo_<isa><s>: raw frequency per resource;
        hypo2hyper<s>, hyper2hypo<s>: ``self._average`` of the in-voc norm;
        hypo2hyper2<s>, hyper2hypo2<s>: sum over resources of the raw column
            divided by its maximum;
        hypo2hyper3<s>, hyper2hypo3<s>: ``self._average`` of the absolute norm.

    Afterwards ``self._save_relations()`` is called and a debug CSV is written
    to "<self._relations_fpath>-direct-hypo2hyper<s>.csv".

    Args:
        subphrases: if true, a term without in-vocabulary hypernyms falls back
            to the first entry of ``self._subphrases(term)`` that has some.
    """
    # build the vocabulary set once instead of once per looked-up term
    voc_set = set(self.voc)

    def find_hypers(isa, term):
        """Return (all hypernyms, hypernym -> freq, in-voc hypernyms sorted by freq desc)."""
        hypers_list = isa.all_hyper(term)
        hypers_dict = {hyper: freq for hyper, freq in hypers_list}
        invoc_hypers_dict = {w: hypers_dict[w] for w in voc_set.intersection(hypers_dict)}
        # a term is not counted as its own hypernym
        invoc_hypers_dict.pop(term, None)
        invoc_hypers_list = sorted(invoc_hypers_dict.items(), key=operator.itemgetter(1), reverse=True)
        return hypers_list, hypers_dict, invoc_hypers_list

    # get direct hypernyms of different isas: model_name -> (hypo, hyper) -> weight
    hypo2hyper_freq = defaultdict(dict)   # raw frequency
    hypo2hyper_inorm = defaultdict(dict)  # in-voc norm: divide by max invoc frequency
    hypo2hyper_anorm = defaultdict(dict)  # absolute norm: divide by max frequency per word

    for isa_name in self._isas:
        isa = self._isas[isa_name]
        print("%s %d" % (isa_name, len(isa.data)))

        for hypo in self.voc:
            # find hypernyms
            hypers_list, hypers_dict, invoc_hypers_list = find_hypers(isa, hypo)
            if VERBOSE and len(invoc_hypers_list) > 0:
                print("%s %d %d %s" % (
                    hypo, len(hypers_dict), len(invoc_hypers_list),
                    ", ".join(w + ":" + unicode(freq) for w, freq in invoc_hypers_list)))

            # find hypernyms of subphrases
            if len(invoc_hypers_list) == 0 and subphrases:
                for hypo_subphrase in self._subphrases(hypo):
                    hypers_list, hypers_dict, invoc_hypers_list = find_hypers(isa, hypo_subphrase)
                    # stop at the first subphrase that has in-voc hypernyms; the
                    # former "list > 0" test was always true on Python 2, so the
                    # search never went past the first subphrase
                    if len(invoc_hypers_list) > 0:
                        break

                if len(invoc_hypers_list) == 0:
                    continue
                elif VERBOSE:
                    print("%s --> %s : %s" % (hypo, hypo_subphrase, invoc_hypers_list))
            elif len(invoc_hypers_list) == 0:
                continue

            # normalize
            # NOTE(review): takes the first all_hyper entry as the most frequent
            # one, i.e. assumes all_hyper returns pairs sorted by frequency -- confirm
            max_freq = float(hypers_list[0][1])
            invoc_max_freq = float(invoc_hypers_list[0][1])
            for hyper, freq in invoc_hypers_list:
                hypo2hyper_freq[isa_name][(hypo, hyper)] = freq
                hypo2hyper_anorm[isa_name][(hypo, hyper)] = freq / max_freq
                hypo2hyper_inorm[isa_name][(hypo, hyper)] = freq / invoc_max_freq

    # average: (hypo, hyper) -> weight
    hypo2hyper_iavg = self._average(hypo2hyper_inorm)
    hypo2hyper_aavg = self._average(hypo2hyper_anorm)

    # initialize arrays
    rows_num = len(self._relations)
    hyper2hypo_iavg_arr = np.zeros(rows_num)
    hypo2hyper_iavg_arr = np.zeros(rows_num)
    hypo2hyper_iavg2_arr = np.zeros(rows_num)
    hyper2hypo_iavg2_arr = np.zeros(rows_num)
    hypo2hyper_aavg_arr = np.zeros(rows_num)
    hyper2hypo_aavg_arr = np.zeros(rows_num)
    hypo2hyper_arr = {}
    hyper2hypo_arr = {}
    for isa_name in hypo2hyper_inorm:
        hypo2hyper_arr[isa_name] = np.zeros(rows_num)
        hyper2hypo_arr[isa_name] = np.zeros(rows_num)

    # fill the arrays
    # NOTE(review): the index label "i" is used as an array position, which
    # presumes self._relations has a 0..n-1 index -- confirm.
    # NOTE(review): pop() removes every matched pair, so the debug dump below
    # only lists pairs that matched no relation row -- confirm this is intended.
    for i, row in self._relations.iterrows():
        if i != 0 and i % 100000 == 0:
            print(i)
        direct = (row.hyponym, row.hypernym)
        inverse = (row.hypernym, row.hyponym)
        hypo2hyper_iavg_arr[i] = hypo2hyper_iavg.pop(direct, 0)
        hyper2hypo_iavg_arr[i] = hypo2hyper_iavg.pop(inverse, 0)
        hypo2hyper_aavg_arr[i] = hypo2hyper_aavg.pop(direct, 0)
        hyper2hypo_aavg_arr[i] = hypo2hyper_aavg.pop(inverse, 0)
        for isa_name in hypo2hyper_inorm:
            hypo2hyper_arr[isa_name][i] = hypo2hyper_freq[isa_name].pop(direct, 0)
            hyper2hypo_arr[isa_name][i] = hypo2hyper_freq[isa_name].pop(inverse, 0)

    # insert arrays as columns
    s = "_s" if subphrases else ""
    for isa_name in hypo2hyper_inorm:
        col = "hypo2hyper_" + isa_name + s
        self._relations[col] = Series(hypo2hyper_arr[isa_name], index=self._relations.index)
        hypo2hyper_iavg2_arr += self._relations[col] / self._relations[col].max()
        col = "hyper2hypo_" + isa_name + s
        self._relations[col] = Series(hyper2hypo_arr[isa_name], index=self._relations.index)
        hyper2hypo_iavg2_arr += self._relations[col] / self._relations[col].max()

    self._relations["hypo2hyper" + s] = Series(hypo2hyper_iavg_arr, index=self._relations.index)
    self._relations["hyper2hypo" + s] = Series(hyper2hypo_iavg_arr, index=self._relations.index)
    self._relations["hypo2hyper2" + s] = Series(hypo2hyper_iavg2_arr, index=self._relations.index)
    self._relations["hyper2hypo2" + s] = Series(hyper2hypo_iavg2_arr, index=self._relations.index)
    self._relations["hypo2hyper3" + s] = Series(hypo2hyper_aavg_arr, index=self._relations.index)
    self._relations["hyper2hypo3" + s] = Series(hyper2hypo_aavg_arr, index=self._relations.index)

    self._save_relations()

    # debug info: dump through a temporary file, then let pandas sort it
    debug_fpath = self._relations_fpath + "-direct-hypo2hyper" + s + ".csv"
    tmp_fpath = debug_fpath + ".tmp"
    with codecs.open(tmp_fpath, "w", "utf-8") as out:
        out.write("hyponym\thypernym\tfreq\n")
        for hypo, hyper in hypo2hyper_iavg:
            out.write("%s\t%s\t%.3f\n" % (hypo, hyper, hypo2hyper_iavg[(hypo, hyper)]))
    df = read_csv(tmp_fpath, encoding='utf-8', delimiter="\t", error_bad_lines=False)
    df = df.sort(["hyponym", "freq"], ascending=[1, 0])
    df.to_csv(debug_fpath, sep="\t", encoding="utf-8", float_format='%.3f', index=False)
    try_remove(tmp_fpath)
    print("Direct hypernyms: %s" % debug_fpath)
def fill_direct_isas(self, subphrases=False):
    """Add direct-hypernymy feature columns to ``self._relations`` and save them.

    For every resource in ``self._isas`` and every term in ``self.voc`` the
    hypernyms returned by ``all_hyper`` are restricted to the vocabulary.
    Three weights are kept per (hyponym, hypernym) pair: the raw frequency,
    the frequency divided by the term's highest in-vocabulary frequency, and
    the frequency divided by the frequency of the first ``all_hyper`` entry.

    Columns written, where ``<s>`` is "_s" if ``subphrases`` else "":
        hypo2hyper_<isa><s>, hyper2hypo_<isa><s>: raw frequency per resource;
        hypo2hyper<s>, hyper2hypo<s>: ``self._average`` of the in-voc norm;
        hypo2hyper2<s>, hyper2hypo2<s>: sum over resources of the raw column
            divided by its maximum;
        hypo2hyper3<s>, hyper2hypo3<s>: ``self._average`` of the absolute norm.

    Afterwards ``self._save_relations()`` is called and a debug CSV is written
    to "<self._relations_fpath>-direct-hypo2hyper<s>.csv".

    Args:
        subphrases: if true, a term without in-vocabulary hypernyms falls back
            to the first entry of ``self._subphrases(term)`` that has some.
    """
    # build the vocabulary set once instead of once per looked-up term
    voc_set = set(self.voc)

    def find_hypers(isa, term):
        """Return (all hypernyms, hypernym -> freq, in-voc hypernyms sorted by freq desc)."""
        hypers_list = isa.all_hyper(term)
        hypers_dict = {hyper: freq for hyper, freq in hypers_list}
        invoc_hypers_dict = {w: hypers_dict[w] for w in voc_set.intersection(hypers_dict)}
        # a term is not counted as its own hypernym
        invoc_hypers_dict.pop(term, None)
        invoc_hypers_list = sorted(invoc_hypers_dict.items(), key=operator.itemgetter(1), reverse=True)
        return hypers_list, hypers_dict, invoc_hypers_list

    # get direct hypernyms of different isas: model_name -> (hypo, hyper) -> weight
    hypo2hyper_freq = defaultdict(dict)   # raw frequency
    hypo2hyper_inorm = defaultdict(dict)  # in-voc norm: divide by max invoc frequency
    hypo2hyper_anorm = defaultdict(dict)  # absolute norm: divide by max frequency per word

    for isa_name in self._isas:
        isa = self._isas[isa_name]
        print("%s %d" % (isa_name, len(isa.data)))

        for hypo in self.voc:
            # find hypernyms
            hypers_list, hypers_dict, invoc_hypers_list = find_hypers(isa, hypo)
            if VERBOSE and len(invoc_hypers_list) > 0:
                print("%s %d %d %s" % (
                    hypo, len(hypers_dict), len(invoc_hypers_list),
                    ", ".join(w + ":" + unicode(freq) for w, freq in invoc_hypers_list)))

            # find hypernyms of subphrases
            if len(invoc_hypers_list) == 0 and subphrases:
                for hypo_subphrase in self._subphrases(hypo):
                    hypers_list, hypers_dict, invoc_hypers_list = find_hypers(isa, hypo_subphrase)
                    # stop at the first subphrase that has in-voc hypernyms; the
                    # former "list > 0" test was always true on Python 2, so the
                    # search never went past the first subphrase
                    if len(invoc_hypers_list) > 0:
                        break

                if len(invoc_hypers_list) == 0:
                    continue
                elif VERBOSE:
                    print("%s --> %s : %s" % (hypo, hypo_subphrase, invoc_hypers_list))
            elif len(invoc_hypers_list) == 0:
                continue

            # normalize
            # NOTE(review): takes the first all_hyper entry as the most frequent
            # one, i.e. assumes all_hyper returns pairs sorted by frequency -- confirm
            max_freq = float(hypers_list[0][1])
            invoc_max_freq = float(invoc_hypers_list[0][1])
            for hyper, freq in invoc_hypers_list:
                hypo2hyper_freq[isa_name][(hypo, hyper)] = freq
                hypo2hyper_anorm[isa_name][(hypo, hyper)] = freq / max_freq
                hypo2hyper_inorm[isa_name][(hypo, hyper)] = freq / invoc_max_freq

    # average: (hypo, hyper) -> weight
    hypo2hyper_iavg = self._average(hypo2hyper_inorm)
    hypo2hyper_aavg = self._average(hypo2hyper_anorm)

    # initialize arrays
    rows_num = len(self._relations)
    hyper2hypo_iavg_arr = np.zeros(rows_num)
    hypo2hyper_iavg_arr = np.zeros(rows_num)
    hypo2hyper_iavg2_arr = np.zeros(rows_num)
    hyper2hypo_iavg2_arr = np.zeros(rows_num)
    hypo2hyper_aavg_arr = np.zeros(rows_num)
    hyper2hypo_aavg_arr = np.zeros(rows_num)
    hypo2hyper_arr = {}
    hyper2hypo_arr = {}
    for isa_name in hypo2hyper_inorm:
        hypo2hyper_arr[isa_name] = np.zeros(rows_num)
        hyper2hypo_arr[isa_name] = np.zeros(rows_num)

    # fill the arrays
    # NOTE(review): the index label "i" is used as an array position, which
    # presumes self._relations has a 0..n-1 index -- confirm.
    # NOTE(review): pop() removes every matched pair, so the debug dump below
    # only lists pairs that matched no relation row -- confirm this is intended.
    for i, row in self._relations.iterrows():
        if i != 0 and i % 100000 == 0:
            print(i)
        direct = (row.hyponym, row.hypernym)
        inverse = (row.hypernym, row.hyponym)
        hypo2hyper_iavg_arr[i] = hypo2hyper_iavg.pop(direct, 0)
        hyper2hypo_iavg_arr[i] = hypo2hyper_iavg.pop(inverse, 0)
        hypo2hyper_aavg_arr[i] = hypo2hyper_aavg.pop(direct, 0)
        hyper2hypo_aavg_arr[i] = hypo2hyper_aavg.pop(inverse, 0)
        for isa_name in hypo2hyper_inorm:
            hypo2hyper_arr[isa_name][i] = hypo2hyper_freq[isa_name].pop(direct, 0)
            hyper2hypo_arr[isa_name][i] = hypo2hyper_freq[isa_name].pop(inverse, 0)

    # insert arrays as columns
    s = "_s" if subphrases else ""
    for isa_name in hypo2hyper_inorm:
        col = "hypo2hyper_" + isa_name + s
        self._relations[col] = Series(hypo2hyper_arr[isa_name], index=self._relations.index)
        hypo2hyper_iavg2_arr += self._relations[col] / self._relations[col].max()
        col = "hyper2hypo_" + isa_name + s
        self._relations[col] = Series(hyper2hypo_arr[isa_name], index=self._relations.index)
        hyper2hypo_iavg2_arr += self._relations[col] / self._relations[col].max()

    self._relations["hypo2hyper" + s] = Series(hypo2hyper_iavg_arr, index=self._relations.index)
    self._relations["hyper2hypo" + s] = Series(hyper2hypo_iavg_arr, index=self._relations.index)
    self._relations["hypo2hyper2" + s] = Series(hypo2hyper_iavg2_arr, index=self._relations.index)
    self._relations["hyper2hypo2" + s] = Series(hyper2hypo_iavg2_arr, index=self._relations.index)
    self._relations["hypo2hyper3" + s] = Series(hypo2hyper_aavg_arr, index=self._relations.index)
    self._relations["hyper2hypo3" + s] = Series(hyper2hypo_aavg_arr, index=self._relations.index)

    self._save_relations()

    # debug info: dump through a temporary file, then let pandas sort it
    debug_fpath = self._relations_fpath + "-direct-hypo2hyper" + s + ".csv"
    tmp_fpath = debug_fpath + ".tmp"
    with codecs.open(tmp_fpath, "w", "utf-8") as out:
        out.write("hyponym\thypernym\tfreq\n")
        for hypo, hyper in hypo2hyper_iavg:
            out.write("%s\t%s\t%.3f\n" % (hypo, hyper, hypo2hyper_iavg[(hypo, hyper)]))
    df = read_csv(tmp_fpath, encoding='utf-8', delimiter="\t", error_bad_lines=False)
    df = df.sort(["hyponym", "freq"], ascending=[1, 0])
    df.to_csv(debug_fpath, sep="\t", encoding="utf-8", float_format='%.3f', index=False)
    try_remove(tmp_fpath)
    print("Direct hypernyms: %s" % debug_fpath)
def __init__(self, isas_fpath, min_freq=1, preprocess=True, sep='\t', strip_pos=True, use_pickle=True, lowercase=True):
    """Provide access to ISA relations from a CSV file "hyponym<TAB>hypernym<TAB>freq".

    Two nested dictionaries are built: ``self._hypo2hyper`` maps
    hyponym -> hypernym -> freq and ``self._hyper2hypo`` maps
    hypernym -> hyponym -> freq. Every row is counted for the surface pair
    and for the lemmatized, lowercased pair. If ``isas_fpath`` does not exist
    both dictionaries are empty.

    Args:
        isas_fpath: path to the CSV file; it must have "hyponym", "hypernym"
            and "freq" columns.
        min_freq: rows whose "freq" is below this value are dropped.
        preprocess: if true, the file is first passed through
            ``preprocess_pandas_csv`` into a temporary "<isas_fpath>.cleaned"
            file, which is read and then removed.
        sep: field separator handed to ``read_csv``.
        strip_pos: accepted but not consulted here; the part after the first
            "#" of a term is always cut off.
        use_pickle: if true, "<isas_fpath>.pkl" is used as a cache: it is
            loaded when present and (re)written after parsing the CSV.
        lowercase: if true, the surface forms are lowercased.
    """
    if not exists(isas_fpath):
        # set both indexes so later lookups never hit a missing attribute
        self._hypo2hyper = {}
        self._hyper2hypo = {}
        return

    isas_pkl_fpath = isas_fpath + ".pkl"
    if use_pickle and exists(isas_pkl_fpath):
        # NOTE: unpickling executes arbitrary code; the cache file must only
        # ever be one written by this class, never an externally supplied file.
        with open(isas_pkl_fpath, "rb") as pkl_file:
            pkl = pickle.load(pkl_file)

        loaded = {}
        for key in ("hypo2hyper", "hyper2hypo"):
            if key in pkl:
                loaded[key] = pkl[key]
            else:
                print("Error: cannot find %s in  %s" % (key, isas_pkl_fpath))
                loaded[key] = {}
        hypo2hyper = loaded["hypo2hyper"]
        hyper2hypo = loaded["hyper2hypo"]
        loaded_fpath = isas_pkl_fpath
    else:
        if preprocess:
            isas_cln_fpath = isas_fpath + ".cleaned"
            preprocess_pandas_csv(isas_fpath, isas_cln_fpath)
            try:
                isas_df = read_csv(isas_cln_fpath, sep, encoding='utf8', error_bad_lines=False)
            finally:
                # the cleaned copy is only a temporary file: drop it even if parsing fails
                try_remove(isas_cln_fpath)
        else:
            isas_df = read_csv(isas_fpath, sep, encoding='utf8', error_bad_lines=False)

        isas_df = isas_df.drop(isas_df[isas_df["freq"] < min_freq].index)

        hypo2hyper = defaultdict(dict)
        hyper2hypo = defaultdict(dict)

        def add(index, key, value, freq):
            """Accumulate freq in index[key][value]."""
            index[key][value] = index[key].get(value, 0) + freq

        for _, row in isas_df.iterrows():
            try:
                hypo = unicode(row["hyponym"]).split("#")[0]
                hyper = unicode(row["hypernym"]).split("#")[0]
                if lowercase:
                    hypo = hypo.lower()
                    hyper = hyper.lower()
                freq = int(row["freq"])
                hypo_lemma = lemmatize(hypo).lower()
                hyper_lemma = lemmatize(hyper).lower()

                # NOTE(review): when a term equals its lemma, the surface and
                # the lemma update hit the same cell, so the row is counted
                # twice -- confirm that this weighting is intended.
                add(hypo2hyper, hypo, hyper, freq)
                add(hypo2hyper, hypo_lemma, hyper_lemma, freq)
                add(hyper2hypo, hyper, hypo, freq)
                add(hyper2hypo, hyper_lemma, hypo_lemma, freq)
            except Exception:
                # a malformed row is reported and skipped, it does not abort loading
                print("Bad row: %s" % (row,))
                print(format_exc())

        print("dictionary is loaded: %d" % len(hypo2hyper))

        if use_pickle:
            pkl = {"hypo2hyper": hypo2hyper, "hyper2hypo": hyper2hypo}
            with open(isas_pkl_fpath, "wb") as pkl_file:
                pickle.dump(pkl, pkl_file)
            print("Pickled voc: %s" % isas_pkl_fpath)
        loaded_fpath = isas_fpath

    # report the file the data was actually read from
    print("Loaded %d words from: %s" % (len(hypo2hyper), loaded_fpath))

    self._hypo2hyper = hypo2hyper
    self._hyper2hypo = hyper2hypo
def __init__(self, isas_fpath, min_freq=1, preprocess=True, sep="\t", strip_pos=True, use_pickle=True, lowercase=True):
    """Provide access to ISA relations from a CSV file "hyponym<TAB>hypernym<TAB>freq".

    Two nested dictionaries are built: ``self._hypo2hyper`` maps
    hyponym -> hypernym -> freq and ``self._hyper2hypo`` maps
    hypernym -> hyponym -> freq. Every row is counted for the surface pair
    and for the lemmatized, lowercased pair. If ``isas_fpath`` does not exist
    both dictionaries are empty.

    Args:
        isas_fpath: path to the CSV file; it must have "hyponym", "hypernym"
            and "freq" columns.
        min_freq: rows whose "freq" is below this value are dropped.
        preprocess: if true, the file is first passed through
            ``preprocess_pandas_csv`` into a temporary "<isas_fpath>.cleaned"
            file, which is read and then removed.
        sep: field separator handed to ``read_csv``.
        strip_pos: accepted but not consulted here; the part after the first
            "#" of a term is always cut off.
        use_pickle: if true, "<isas_fpath>.pkl" is used as a cache: it is
            loaded when present and (re)written after parsing the CSV.
        lowercase: if true, the surface forms are lowercased.
    """
    if not exists(isas_fpath):
        # set both indexes so later lookups never hit a missing attribute
        self._hypo2hyper = {}
        self._hyper2hypo = {}
        return

    isas_pkl_fpath = isas_fpath + ".pkl"
    if use_pickle and exists(isas_pkl_fpath):
        # NOTE: unpickling executes arbitrary code; the cache file must only
        # ever be one written by this class, never an externally supplied file.
        with open(isas_pkl_fpath, "rb") as pkl_file:
            pkl = pickle.load(pkl_file)

        loaded = {}
        for key in ("hypo2hyper", "hyper2hypo"):
            if key in pkl:
                loaded[key] = pkl[key]
            else:
                print("Error: cannot find %s in  %s" % (key, isas_pkl_fpath))
                loaded[key] = {}
        hypo2hyper = loaded["hypo2hyper"]
        hyper2hypo = loaded["hyper2hypo"]
        loaded_fpath = isas_pkl_fpath
    else:
        if preprocess:
            isas_cln_fpath = isas_fpath + ".cleaned"
            preprocess_pandas_csv(isas_fpath, isas_cln_fpath)
            try:
                isas_df = read_csv(isas_cln_fpath, sep, encoding="utf8", error_bad_lines=False)
            finally:
                # the cleaned copy is only a temporary file: drop it even if parsing fails
                try_remove(isas_cln_fpath)
        else:
            isas_df = read_csv(isas_fpath, sep, encoding="utf8", error_bad_lines=False)

        isas_df = isas_df.drop(isas_df[isas_df["freq"] < min_freq].index)

        hypo2hyper = defaultdict(dict)
        hyper2hypo = defaultdict(dict)

        def add(index, key, value, freq):
            """Accumulate freq in index[key][value]."""
            index[key][value] = index[key].get(value, 0) + freq

        for _, row in isas_df.iterrows():
            try:
                hypo = unicode(row["hyponym"]).split("#")[0]
                hyper = unicode(row["hypernym"]).split("#")[0]
                if lowercase:
                    hypo = hypo.lower()
                    hyper = hyper.lower()
                freq = int(row["freq"])
                hypo_lemma = lemmatize(hypo).lower()
                hyper_lemma = lemmatize(hyper).lower()

                # NOTE(review): when a term equals its lemma, the surface and
                # the lemma update hit the same cell, so the row is counted
                # twice -- confirm that this weighting is intended.
                add(hypo2hyper, hypo, hyper, freq)
                add(hypo2hyper, hypo_lemma, hyper_lemma, freq)
                add(hyper2hypo, hyper, hypo, freq)
                add(hyper2hypo, hyper_lemma, hypo_lemma, freq)
            except Exception:
                # a malformed row is reported and skipped, it does not abort loading
                print("Bad row: %s" % (row,))
                print(format_exc())

        print("dictionary is loaded: %d" % len(hypo2hyper))

        if use_pickle:
            pkl = {"hypo2hyper": hypo2hyper, "hyper2hypo": hyper2hypo}
            with open(isas_pkl_fpath, "wb") as pkl_file:
                pickle.dump(pkl, pkl_file)
            print("Pickled voc: %s" % isas_pkl_fpath)
        loaded_fpath = isas_fpath

    # report the file the data was actually read from
    print("Loaded %d words from: %s" % (len(hypo2hyper), loaded_fpath))

    self._hypo2hyper = hypo2hyper
    self._hyper2hypo = hyper2hypo