def process(ddt_fpath): preprocess_pandas_csv(ddt_fpath) df = read_csv(ddt_fpath, encoding='utf-8', delimiter="\t", error_bad_lines=False) df = df.fillna("") print len(df), "senses loaded" closure_fpath = ddt_fpath + ".closure" with codecs.open(closure_fpath, "w", "utf-8") as closure: print >> closure, "word\tcid\tcluster\tisas" for i, row in df.iterrows(): cluster = remove_unknown(row.cluster) isas = remove_unknown(row.isas) print >> closure, "%s\t%s\t%s\t%s" % (row.word, row.cid, cluster, isas) print "Output:", closure_fpath
def __init__(self, freq_fpath, min_freq=1, preprocess=True, sep='\t', strip_pos=True, use_pickle=True): """ Reads a word frequency list in CSV format "word<TAB>freq" """ if not exists(freq_fpath): self._freq = {} return pkl_fpath = freq_fpath + ".pkl" if use_pickle and exists(pkl_fpath): voc = pickle.load(open(pkl_fpath, "rb")) else: # load words to datafame if preprocess: freq_cln_fpath = freq_fpath + "-cln" preprocess_pandas_csv(freq_fpath, freq_cln_fpath) word_df = read_csv(freq_cln_fpath, sep, encoding='utf8', error_bad_lines=False) try_remove(freq_cln_fpath) else: word_df = read_csv(freq_fpath, sep, encoding='utf8', error_bad_lines=False) # load from dataframe to dictionary word_df = word_df.drop(word_df[word_df["freq"] < min_freq].index) if strip_pos: voc = {} for i, row in word_df.iterrows(): try: word = unicode(row["word"]).split("#")[0] freq = int(row["freq"]) if word not in voc or voc[word] < freq: voc[word] = freq except: print "Bad row:", row print format_exc() else: voc = { row["word"]: row["freq"] for i, row in word_df.iterrows() } print "dictionary is loaded:", len(voc) if use_pickle: pickle.dump(voc, open(pkl_fpath, "wb")) print "Pickled voc:", pkl_fpath print "Loaded %d words from: %s" % (len(voc), pkl_fpath if pkl_fpath else freq_fpath) self._freq = voc
def __init__(self, isas_fpath, min_freq=1, preprocess=True, sep='\t', strip_pos=True, use_pickle=True, lowercase=True): """ Provides access to a ISAs relations from a CSV file "hyponym<TAB>hypernym<TAB>freq" """ if not exists(isas_fpath): self._hypo2hyper = {} return isas_pkl_fpath = isas_fpath + ".pkl" if use_pickle and exists(isas_pkl_fpath): pkl = pickle.load(open(isas_pkl_fpath, "rb")) if "hypo2hyper" in pkl: hypo2hyper = pkl["hypo2hyper"] else: print "Error: cannot find hypo2hyper in ", isas_pkl_fpath hypo2hyper = {} if "hyper2hypo" in pkl: hyper2hypo = pkl["hyper2hypo"] else: print "Error: cannot find hyper2hypo in ", isas_pkl_fpath hyper2hypo = {} else: if preprocess: isas_cln_fpath = isas_fpath + ".cleaned" preprocess_pandas_csv(isas_fpath, isas_cln_fpath) isas_df = read_csv(isas_cln_fpath, sep, encoding='utf8', error_bad_lines=False) try_remove(isas_cln_fpath) else: isas_df = read_csv(isas_fpath, sep, encoding='utf8', error_bad_lines=False) isas_df = isas_df.drop(isas_df[isas_df["freq"] < min_freq].index) hypo2hyper = defaultdict(dict) hyper2hypo = defaultdict(dict) for i, row in isas_df.iterrows(): try: hypo = unicode(row["hyponym"]).split("#")[0].lower( ) if lowercase else unicode(row["hyponym"]).split("#")[0] hyper = unicode(row["hypernym"]).split("#")[0].lower( ) if lowercase else unicode(row["hypernym"]).split("#")[0] freq = int(row["freq"]) hypo_lemma = lemmatize(hypo).lower() hyper_lemma = lemmatize(hyper).lower() if hypo not in hypo2hyper or hyper not in hypo2hyper[hypo]: hypo2hyper[hypo][hyper] = freq else: hypo2hyper[hypo][hyper] += freq if hypo_lemma not in hypo2hyper or hyper_lemma not in hypo2hyper[ hypo_lemma]: hypo2hyper[hypo_lemma][hyper_lemma] = freq else: hypo2hyper[hypo_lemma][hyper_lemma] += freq if hyper not in hyper2hypo or hypo not in hyper2hypo[hyper]: hyper2hypo[hyper][hypo] = freq else: hyper2hypo[hyper][hypo] += freq if hyper_lemma not in hyper2hypo or hypo_lemma not in hyper2hypo[ hyper_lemma]: 
hyper2hypo[hyper_lemma][hypo_lemma] = freq else: hyper2hypo[hyper_lemma][hypo_lemma] += freq except: print "Bad row:", row print format_exc() print "dictionary is loaded:", len(hypo2hyper) if use_pickle: pkl = {"hypo2hyper": hypo2hyper, "hyper2hypo": hyper2hypo} pickle.dump(pkl, open(isas_pkl_fpath, "wb")) print "Pickled voc:", isas_pkl_fpath print "Loaded %d words from: %s" % (len(hypo2hyper), isas_pkl_fpath if isas_pkl_fpath else isas_fpath) self._hypo2hyper = hypo2hyper self._hyper2hypo = hyper2hypo
def get_topic_stoplist(preprocess=False):
    """ Returns the words from the bundled "topic-stoplist-489.csv" file as a list.
    If preprocess is set, the CSV is cleaned in place before parsing. """

    stoplist_fpath = join(get_data_dir(), "topic-stoplist-489.csv")
    if preprocess:
        preprocess_pandas_csv(stoplist_fpath)

    stoplist_df = read_csv(stoplist_fpath, "\t", encoding='utf8', error_bad_lines=False)
    stop_words = []
    for _, entry in stoplist_df.iterrows():
        stop_words.append(entry.word)
    return stop_words
def __init__( self, isas_fpath, min_freq=1, preprocess=True, sep="\t", strip_pos=True, use_pickle=True, lowercase=True ): """ Provides access to a ISAs relations from a CSV file "hyponym<TAB>hypernym<TAB>freq" """ if not exists(isas_fpath): self._hypo2hyper = {} return isas_pkl_fpath = isas_fpath + ".pkl" if use_pickle and exists(isas_pkl_fpath): pkl = pickle.load(open(isas_pkl_fpath, "rb")) if "hypo2hyper" in pkl: hypo2hyper = pkl["hypo2hyper"] else: print "Error: cannot find hypo2hyper in ", isas_pkl_fpath hypo2hyper = {} if "hyper2hypo" in pkl: hyper2hypo = pkl["hyper2hypo"] else: print "Error: cannot find hyper2hypo in ", isas_pkl_fpath hyper2hypo = {} else: if preprocess: isas_cln_fpath = isas_fpath + ".cleaned" preprocess_pandas_csv(isas_fpath, isas_cln_fpath) isas_df = read_csv(isas_cln_fpath, sep, encoding="utf8", error_bad_lines=False) try_remove(isas_cln_fpath) else: isas_df = read_csv(isas_fpath, sep, encoding="utf8", error_bad_lines=False) isas_df = isas_df.drop(isas_df[isas_df["freq"] < min_freq].index) hypo2hyper = defaultdict(dict) hyper2hypo = defaultdict(dict) for i, row in isas_df.iterrows(): try: hypo = ( unicode(row["hyponym"]).split("#")[0].lower() if lowercase else unicode(row["hyponym"]).split("#")[0] ) hyper = ( unicode(row["hypernym"]).split("#")[0].lower() if lowercase else unicode(row["hypernym"]).split("#")[0] ) freq = int(row["freq"]) hypo_lemma = lemmatize(hypo).lower() hyper_lemma = lemmatize(hyper).lower() if hypo not in hypo2hyper or hyper not in hypo2hyper[hypo]: hypo2hyper[hypo][hyper] = freq else: hypo2hyper[hypo][hyper] += freq if hypo_lemma not in hypo2hyper or hyper_lemma not in hypo2hyper[hypo_lemma]: hypo2hyper[hypo_lemma][hyper_lemma] = freq else: hypo2hyper[hypo_lemma][hyper_lemma] += freq if hyper not in hyper2hypo or hypo not in hyper2hypo[hyper]: hyper2hypo[hyper][hypo] = freq else: hyper2hypo[hyper][hypo] += freq if hyper_lemma not in hyper2hypo or hypo_lemma not in hyper2hypo[hyper_lemma]: 
hyper2hypo[hyper_lemma][hypo_lemma] = freq else: hyper2hypo[hyper_lemma][hypo_lemma] += freq except: print "Bad row:", row print format_exc() print "dictionary is loaded:", len(hypo2hyper) if use_pickle: pkl = {"hypo2hyper": hypo2hyper, "hyper2hypo": hyper2hypo} pickle.dump(pkl, open(isas_pkl_fpath, "wb")) print "Pickled voc:", isas_pkl_fpath print "Loaded %d words from: %s" % (len(hypo2hyper), isas_pkl_fpath if isas_pkl_fpath else isas_fpath) self._hypo2hyper = hypo2hyper self._hyper2hypo = hyper2hypo