def __init__(self, taxonomy_resources, voc_fpath="", relations_fpath="", lang="en"):
    self._isas = taxonomy_resources.isas
    self._freqs = taxonomy_resources.freqs
    self.voc_name = fpath2filename(voc_fpath)
    self._voc_fpath = voc_fpath
    self._stopwords = load_stoplist(lang=lang)
    self._lang = lang

    if exists(voc_fpath) and not exists(relations_fpath):
        # generate candidate relations from the vocabulary
        self.voc = self._load_voc(voc_fpath)
        relations_fpath = voc_fpath + "-relations.csv"
        print("Generating new relations file:", relations_fpath)
        self._relations_fpath = relations_fpath
        self._relations = self._generate_relations(self.voc, self._relations_fpath)
    elif exists(relations_fpath):
        # load a previously generated relations file
        print("Loading relations file:", relations_fpath)
        self._relations_fpath = relations_fpath
        self._relations = read_csv(relations_fpath, encoding='utf-8', delimiter="\t", error_bad_lines=False)
        print("Loaded %d relations from: %s" % (len(self._relations), relations_fpath))

        # the vocabulary is the union of all hyponyms and hypernyms
        hypo_voc = set(self._relations["hyponym"])
        hyper_voc = set(self._relations["hypernym"])
        self.voc = hypo_voc.union(hyper_voc)
        print("Loaded a vocabulary of %d words from the relations" % len(self.voc))
    else:
        raise ValueError("Cannot load or generate relations: specify either voc_fpath or relations_fpath.")

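# A minimal usage sketch. The enclosing class is not shown in this snippet, so
# "TaxonomyFeatures" and the file paths below are illustrative stand-ins. The
# constructor supports two modes: build a relations file from a vocabulary, or
# load an existing one.
#
#   features = TaxonomyFeatures(taxonomy_resources, voc_fpath="science_en.csv")
#   features = TaxonomyFeatures(taxonomy_resources,
#                               relations_fpath="science_en.csv-relations.csv")
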
def load_relations(relations_fpath, taxo_en_plants_fpath="", taxo_en_vehicles_fpath="",
                   taxo_en_ai_fpath="", taxo_eval_en_ai_fpath=""):
    if exists(relations_fpath):
        relations = read_csv(relations_fpath, encoding='utf-8', delimiter="\t", error_bad_lines=False)
        print("Relations loaded from:", relations_fpath)
    elif (exists(taxo_en_plants_fpath) and exists(taxo_en_vehicles_fpath) and
          exists(taxo_en_ai_fpath) and exists(taxo_eval_en_ai_fpath)):
        tic = time()

        # the plants and vehicles taxonomies contain only correct relations
        plants = taxo2csv_all_correct(taxo_en_plants_fpath)
        plants = insert_source(taxo_en_plants_fpath, plants)
        print("plants:", len(plants))

        vehicles = taxo2csv_all_correct(taxo_en_vehicles_fpath)
        vehicles = insert_source(taxo_en_vehicles_fpath, vehicles)
        print("vehicles:", len(vehicles))

        # the AI taxonomy mixes correct and incorrect relations
        ai = taxo2csv_mixed(taxo_en_ai_fpath, taxo_eval_en_ai_fpath)
        ai = insert_source(taxo_en_ai_fpath, ai)
        print("ai:", len(ai))

        relations = concat([plants, vehicles, ai], ignore_index=True)
        print("all:", len(relations))

        relations = remove_underscores(relations)
        relations = add_inverse_relations(relations)
        relations = relations.sort_values(["hyponym", "correct"], ascending=[True, False])
        relations.to_csv(relations_fpath, sep="\t", encoding="utf-8", float_format='%.0f', index=False)

        # re-read the saved file so the returned dataframe matches what is on disk
        relations = read_csv(relations_fpath, encoding='utf-8', delimiter="\t", error_bad_lines=False)
        print("Dataset:", relations_fpath)
        print("Relations generated and loaded in %.1f sec." % (time() - tic))
    else:
        print("Error: cannot load relations. No input files found.")
        relations = None

    return relations

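# A minimal usage sketch with hypothetical file paths: on the first call the
# three source taxonomies (plants, vehicles, AI) are converted, merged, and
# cached to relations_fpath; subsequent calls simply re-read the cached TSV.
#
#   relations = load_relations("relations.csv",
#                              taxo_en_plants_fpath="plants_en.taxo",
#                              taxo_en_vehicles_fpath="vehicles_en.taxo",
#                              taxo_en_ai_fpath="ai_en.taxo",
#                              taxo_eval_en_ai_fpath="ai_eval_en.taxo")
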
def __init__(self, model_dir, method="LogisticRegressionL2", features=FEATURES, k=100, overwrite=False):
    self.CLASSIFIER_FILE = "classifier"
    self.KBEST_VOC_FILE = "kbest-voc.csv"
    self.KBEST_FILE = "kbest.pkl"
    self.META_FILE = "meta.json"

    clf_fpath = join(model_dir, self.CLASSIFIER_FILE)
    self._model_dir = model_dir
    self._meta_fpath = join(model_dir, self.META_FILE)
    self._meta = {"method": method, "k": k, "features": features}

    if exists(model_dir) and exists(clf_fpath) and not overwrite:
        # load the existing model and its metadata
        self._clf = joblib.load(clf_fpath)
        with open(self._meta_fpath, "r") as meta_file:
            self._meta = json.load(meta_file)
        print("Metadata were loaded from:", self._meta_fpath)
    else:
        # the model does not exist or must be overwritten: create a new one
        ensure_dir(model_dir)
        self.save_meta()

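# A minimal usage sketch; the class name "HypernymClassifier" is an
# illustrative stand-in for the class this constructor belongs to.
#
#   clf = HypernymClassifier("model/")                  # load if present
#   clf = HypernymClassifier("model/", overwrite=True)  # discard and re-create
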
def _load(self, babelnet_fpath, divide_by_freq=False, sanity_check=True):
    if not exists(babelnet_fpath):
        return defaultdict(dict)

    with open(babelnet_fpath, 'rb') as babelnet_file:
        bn = pickle.load(babelnet_file)

    if sanity_check:
        err_num = 0
        for word in bn:
            if len(bn[word]) == 0:
                err_num += 1
                print("Warning: local word with no senses:", word)
        if err_num > 0:
            print("Warning: %d local words with no senses" % err_num)

    print("Loaded BabelNet with %d words from: %s" % (len(bn), babelnet_fpath))

    self._block_save = False
    if self._normalized:
        for word in bn:
            for sense_id in bn[word]:
                if divide_by_freq:
                    # down-weight each context word by its corpus frequency
                    bow = Counter({w: bn[word][sense_id]["bow"][w] / self._freq.freq(w)
                                   for w in bn[word][sense_id]["bow"] if good_token(w)})
                    self._block_save = True
                else:
                    bow = bn[word][sense_id]["bow"]

                # scale the bag-of-words weights into the [0, 1] range
                max_freq_norm = float(max(bow.values())) if len(bow) > 0 else 1.0
                if max_freq_norm == 0.0:
                    max_freq_norm = 1.0
                bow_range_norm = Counter({w: bow[w] / max_freq_norm
                                          for w in bow if good_token(w)})
                bn[word][sense_id]["bow"] = bow_range_norm

    return bn

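# A self-contained illustration of the range normalization applied above:
# each bag-of-words weight is divided by the maximum weight, so the largest
# entry becomes 1.0. The values are made up for the example.
from collections import Counter

bow = Counter({"plant": 8.0, "tree": 4.0, "leaf": 2.0})
max_freq_norm = float(max(bow.values())) if len(bow) > 0 else 1.0
if max_freq_norm == 0.0:
    max_freq_norm = 1.0
bow_range_norm = Counter({w: bow[w] / max_freq_norm for w in bow})
print(bow_range_norm)  # Counter({'plant': 1.0, 'tree': 0.5, 'leaf': 0.25})
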
def _save_synset(self, word, sid, synset):
    try:
        if not exists(self._babelnet_dir):
            return
        output_fpath = join(self._babelnet_dir, word + "#" + sid + ".json")
        with codecs.open(output_fpath, 'w', "utf-8") as outfile:
            outfile.write(json.dumps(synset, ensure_ascii=False))
    except Exception:
        print("Error saving file:")
        print(format_exc())

def __init__(self, freq_fpath, min_freq=1, preprocess=True, sep='\t', strip_pos=True, use_pickle=True):
    """ Reads a word frequency list in CSV format "word<TAB>freq". """
    if not exists(freq_fpath):
        self._freq = {}
        return

    pkl_fpath = freq_fpath + ".pkl"
    if use_pickle and exists(pkl_fpath):
        with open(pkl_fpath, "rb") as pkl_file:
            voc = pickle.load(pkl_file)
    else:
        # load the words into a dataframe
        if preprocess:
            freq_cln_fpath = freq_fpath + "-cln"
            preprocess_pandas_csv(freq_fpath, freq_cln_fpath)
            word_df = read_csv(freq_cln_fpath, sep=sep, encoding='utf8', error_bad_lines=False)
            try_remove(freq_cln_fpath)
        else:
            word_df = read_csv(freq_fpath, sep=sep, encoding='utf8', error_bad_lines=False)

        # move the dataframe into a dictionary
        word_df = word_df.drop(word_df[word_df["freq"] < min_freq].index)
        if strip_pos:
            # strip POS tags like "#NN"; on key clashes keep the highest frequency
            voc = {}
            for i, row in word_df.iterrows():
                try:
                    word = str(row["word"]).split("#")[0]
                    freq = int(row["freq"])
                    if word not in voc or voc[word] < freq:
                        voc[word] = freq
                except Exception:
                    print("Bad row:", row)
                    print(format_exc())
        else:
            voc = {row["word"]: row["freq"] for i, row in word_df.iterrows()}

        print("Dictionary is loaded: %d words" % len(voc))

        if use_pickle:
            with open(pkl_fpath, "wb") as pkl_file:
                pickle.dump(voc, pkl_file)
            print("Pickled voc:", pkl_fpath)

    print("Loaded %d words from: %s" % (len(voc), pkl_fpath if use_pickle else freq_fpath))
    self._freq = voc

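# Expected input, per the docstring: a TSV file with a header and one
# "word<TAB>freq" pair per line, optionally with a POS tag after "#", e.g.
#
#   word        freq
#   ruby#NN     120
#   ruby#NP     30
#
# With strip_pos=True both rows map to the key "ruby" and the highest
# frequency (120) is kept. A usage sketch ("FreqDictionary" is an
# illustrative stand-in for the enclosing class):
#
#   freqs = FreqDictionary("word-freq.csv", min_freq=5)
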
def _load_voc(self, voc_fpath):
    if not exists(voc_fpath):
        print("Warning: vocabulary is not loaded. This means hypo2hyper features cannot be extracted.")
        return set()

    voc_df = read_csv(voc_fpath, encoding='utf-8', delimiter="\t", error_bad_lines=False)
    voc = set()
    for i, row in voc_df.iterrows():
        # accept either a "term" or a "word" column
        if "term" in row:
            voc.add(row.term)
        elif "word" in row:
            voc.add(row.word)
    print("Loaded a vocabulary of %d words" % len(voc))
    return voc

def adagram_disambiguate(contexts_fpath, model_fpath, output_fpath, nearest_neighbors="false"):
    env = dict(os.environ)
    env["DYLD_LIBRARY_PATH"] = DYLD_LIBRARY

    # run the Julia matching script in a subprocess
    p = Popen(["julia", join(ADAGRAM_SCRIPTS_DIR, "matching.jl"),
               contexts_fpath, model_fpath, output_fpath, nearest_neighbors],
              stdin=PIPE, stdout=PIPE, stderr=PIPE, env=env)
    stdout, err = p.communicate(b"")
    print(stdout)
    print(err)
    print("Output:", output_fpath)
    print("Output exists:", exists(output_fpath))

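# A minimal usage sketch with hypothetical file paths: this shells out to the
# Julia script ADAGRAM_SCRIPTS_DIR/matching.jl, which reads the contexts file,
# matches each context against the AdaGram model, and writes the result.
#
#   adagram_disambiguate("contexts.csv", "en.adagram", "contexts-senses.csv",
#                        nearest_neighbors="true")
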
def __init__(self, isas_fpath, min_freq=1, preprocess=True, sep='\t',
             strip_pos=True, use_pickle=True, lowercase=True):
    """ Provides access to IS-A relations stored in a CSV file "hyponym<TAB>hypernym<TAB>freq". """
    if not exists(isas_fpath):
        self._hypo2hyper = {}
        self._hyper2hypo = {}
        return

    isas_pkl_fpath = isas_fpath + ".pkl"
    if use_pickle and exists(isas_pkl_fpath):
        with open(isas_pkl_fpath, "rb") as pkl_file:
            pkl = pickle.load(pkl_file)
        if "hypo2hyper" in pkl:
            hypo2hyper = pkl["hypo2hyper"]
        else:
            print("Error: cannot find hypo2hyper in", isas_pkl_fpath)
            hypo2hyper = {}
        if "hyper2hypo" in pkl:
            hyper2hypo = pkl["hyper2hypo"]
        else:
            print("Error: cannot find hyper2hypo in", isas_pkl_fpath)
            hyper2hypo = {}
    else:
        if preprocess:
            isas_cln_fpath = isas_fpath + ".cleaned"
            preprocess_pandas_csv(isas_fpath, isas_cln_fpath)
            isas_df = read_csv(isas_cln_fpath, sep=sep, encoding='utf8', error_bad_lines=False)
            try_remove(isas_cln_fpath)
        else:
            isas_df = read_csv(isas_fpath, sep=sep, encoding='utf8', error_bad_lines=False)

        isas_df = isas_df.drop(isas_df[isas_df["freq"] < min_freq].index)
        hypo2hyper = defaultdict(dict)
        hyper2hypo = defaultdict(dict)
        for i, row in isas_df.iterrows():
            try:
                hypo = str(row["hyponym"]).split("#")[0]
                hyper = str(row["hypernym"]).split("#")[0]
                if lowercase:
                    hypo = hypo.lower()
                    hyper = hyper.lower()
                freq = int(row["freq"])
                hypo_lemma = lemmatize(hypo).lower()
                hyper_lemma = lemmatize(hyper).lower()

                # index both surface forms and lemmas, in both directions,
                # accumulating the frequencies of repeated pairs
                if hypo not in hypo2hyper or hyper not in hypo2hyper[hypo]:
                    hypo2hyper[hypo][hyper] = freq
                else:
                    hypo2hyper[hypo][hyper] += freq
                if hypo_lemma not in hypo2hyper or hyper_lemma not in hypo2hyper[hypo_lemma]:
                    hypo2hyper[hypo_lemma][hyper_lemma] = freq
                else:
                    hypo2hyper[hypo_lemma][hyper_lemma] += freq
                if hyper not in hyper2hypo or hypo not in hyper2hypo[hyper]:
                    hyper2hypo[hyper][hypo] = freq
                else:
                    hyper2hypo[hyper][hypo] += freq
                if hyper_lemma not in hyper2hypo or hypo_lemma not in hyper2hypo[hyper_lemma]:
                    hyper2hypo[hyper_lemma][hypo_lemma] = freq
                else:
                    hyper2hypo[hyper_lemma][hypo_lemma] += freq
            except Exception:
                print("Bad row:", row)
                print(format_exc())

        print("Dictionary is loaded: %d words" % len(hypo2hyper))

        if use_pickle:
            pkl = {"hypo2hyper": hypo2hyper, "hyper2hypo": hyper2hypo}
            with open(isas_pkl_fpath, "wb") as pkl_file:
                pickle.dump(pkl, pkl_file)
            print("Pickled voc:", isas_pkl_fpath)

    print("Loaded %d words from: %s" % (len(hypo2hyper), isas_pkl_fpath if use_pickle else isas_fpath))
    self._hypo2hyper = hypo2hyper
    self._hyper2hypo = hyper2hypo

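# Expected input, per the docstring: a TSV file with a header and one
# "hyponym<TAB>hypernym<TAB>freq" triple per line, e.g.
#
#   hyponym     hypernym    freq
#   apple       fruit       250
#   apple       company     120
#
# Both surface forms and lemmas are indexed in the hypo2hyper and hyper2hypo
# directions. A usage sketch ("ISAs" is an illustrative stand-in for the
# enclosing class):
#
#   isas = ISAs("isas-en.csv", min_freq=10, lowercase=True)
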