def process(ddt_fpath):

    preprocess_pandas_csv(ddt_fpath)
    df = read_csv(ddt_fpath, encoding='utf-8', delimiter="\t", error_bad_lines=False)
    df = df.fillna("")
    print len(df), "senses loaded"

    closure_fpath = ddt_fpath + ".closure"
    with codecs.open(closure_fpath, "w", "utf-8") as closure:
        print >> closure, "word\tcid\tcluster\tisas"
        for i, row in df.iterrows():
            cluster = remove_unknown(row.cluster)
            isas = remove_unknown(row.isas)
            print >> closure, "%s\t%s\t%s\t%s" % (row.word, row.cid, cluster, isas)

    print "Output:", closure_fpath
Example #2
0
    def __init__(self, freq_fpath, min_freq=1, preprocess=True, sep='\t', strip_pos=True, use_pickle=True):
        """ Reads a word frequency list in CSV format "word<TAB>freq" """

        if not exists(freq_fpath):
            self._freq = {}
            return

        pkl_fpath = freq_fpath + ".pkl"
        if use_pickle and exists(pkl_fpath):
            voc = pickle.load(open(pkl_fpath, "rb"))
        else:
            # load words to datafame
            if preprocess:
                freq_cln_fpath = freq_fpath + "-cln"
                preprocess_pandas_csv(freq_fpath, freq_cln_fpath)
                word_df = read_csv(freq_cln_fpath, sep, encoding='utf8', error_bad_lines=False)
                try_remove(freq_cln_fpath)
            else:
                word_df = read_csv(freq_fpath, sep, encoding='utf8', error_bad_lines=False)

            # load from dataframe to dictionary
            word_df = word_df.drop(word_df[word_df["freq"] < min_freq].index)
            if strip_pos:
                voc = {}
                for i, row in word_df.iterrows():
                    try:
                        word = unicode(row["word"]).split("#")[0]
                        freq = int(row["freq"])
                        if word not in voc or voc[word] < freq: voc[word] = freq
                    except:
                        print "Bad row:", row
                        print format_exc()
            else:
                voc = { row["word"]: row["freq"] for i, row in word_df.iterrows() }

            print "dictionary is loaded:", len(voc)

            if use_pickle:
                pickle.dump(voc, open(pkl_fpath, "wb"))
                print "Pickled voc:", pkl_fpath

        print "Loaded %d words from: %s" % (len(voc), pkl_fpath if pkl_fpath else freq_fpath)

        self._freq = voc
def process(ddt_fpath):

    preprocess_pandas_csv(ddt_fpath)
    df = read_csv(ddt_fpath,
                  encoding='utf-8',
                  delimiter="\t",
                  error_bad_lines=False)
    df = df.fillna("")
    print len(df), "senses loaded"

    closure_fpath = ddt_fpath + ".closure"
    with codecs.open(closure_fpath, "w", "utf-8") as closure:
        print >> closure, "word\tcid\tcluster\tisas"
        for i, row in df.iterrows():
            cluster = remove_unknown(row.cluster)
            isas = remove_unknown(row.isas)
            print >> closure, "%s\t%s\t%s\t%s" % (row.word, row.cid, cluster,
                                                  isas)

    print "Output:", closure_fpath
Example #4
0
    def __init__(self,
                 freq_fpath,
                 min_freq=1,
                 preprocess=True,
                 sep='\t',
                 strip_pos=True,
                 use_pickle=True):
        """ Reads a word frequency list in CSV format "word<TAB>freq" """

        if not exists(freq_fpath):
            self._freq = {}
            return

        pkl_fpath = freq_fpath + ".pkl"
        if use_pickle and exists(pkl_fpath):
            voc = pickle.load(open(pkl_fpath, "rb"))
        else:
            # load words to datafame
            if preprocess:
                freq_cln_fpath = freq_fpath + "-cln"
                preprocess_pandas_csv(freq_fpath, freq_cln_fpath)
                word_df = read_csv(freq_cln_fpath,
                                   sep,
                                   encoding='utf8',
                                   error_bad_lines=False)
                try_remove(freq_cln_fpath)
            else:
                word_df = read_csv(freq_fpath,
                                   sep,
                                   encoding='utf8',
                                   error_bad_lines=False)

            # load from dataframe to dictionary
            word_df = word_df.drop(word_df[word_df["freq"] < min_freq].index)
            if strip_pos:
                voc = {}
                for i, row in word_df.iterrows():
                    try:
                        word = unicode(row["word"]).split("#")[0]
                        freq = int(row["freq"])
                        if word not in voc or voc[word] < freq:
                            voc[word] = freq
                    except:
                        print "Bad row:", row
                        print format_exc()
            else:
                voc = {
                    row["word"]: row["freq"]
                    for i, row in word_df.iterrows()
                }

            print "dictionary is loaded:", len(voc)

            if use_pickle:
                pickle.dump(voc, open(pkl_fpath, "wb"))
                print "Pickled voc:", pkl_fpath

        print "Loaded %d words from: %s" % (len(voc), pkl_fpath
                                            if pkl_fpath else freq_fpath)

        self._freq = voc
Example #5
0
    def __init__(self,
                 isas_fpath,
                 min_freq=1,
                 preprocess=True,
                 sep='\t',
                 strip_pos=True,
                 use_pickle=True,
                 lowercase=True):
        """ Provides access to a ISAs relations from a CSV file "hyponym<TAB>hypernym<TAB>freq"

        isas_fpath -- path to the relations file; if it does not exist both mappings stay empty
        min_freq -- rows whose "freq" is below this value are dropped
        preprocess -- if True, the file is first copied to "<isas_fpath>.cleaned" by
                      preprocess_pandas_csv and that copy is parsed (and removed afterwards)
        sep -- column separator passed to read_csv
        strip_pos -- currently not used: the part after the first "#" is always stripped
        use_pickle -- if True, load from "<isas_fpath>.pkl" when it exists, otherwise
                      parse the CSV and write that pickle
        lowercase -- if True, hyponyms and hypernyms are lowercased before being stored
        """

        if not exists(isas_fpath):
            # Set both mappings so that later attribute access never fails.
            self._hypo2hyper = {}
            self._hyper2hypo = {}
            return

        def _add(table, key, value, freq):
            """ Adds freq to table[key][value], starting from zero for a new cell """
            table[key][value] = table[key].get(value, 0) + freq

        isas_pkl_fpath = isas_fpath + ".pkl"
        if use_pickle and exists(isas_pkl_fpath):
            # NOTE: unpickling can execute arbitrary code; the cache file must be trusted.
            with open(isas_pkl_fpath, "rb") as pkl_file:
                pkl = pickle.load(pkl_file)
            source_fpath = isas_pkl_fpath

            if "hypo2hyper" in pkl:
                hypo2hyper = pkl["hypo2hyper"]
            else:
                print "Error: cannot find hypo2hyper in ", isas_pkl_fpath
                hypo2hyper = {}

            if "hyper2hypo" in pkl:
                hyper2hypo = pkl["hyper2hypo"]
            else:
                print "Error: cannot find hyper2hypo in ", isas_pkl_fpath
                hyper2hypo = {}

        else:
            source_fpath = isas_fpath

            if preprocess:
                isas_cln_fpath = isas_fpath + ".cleaned"
                preprocess_pandas_csv(isas_fpath, isas_cln_fpath)
                try:
                    isas_df = read_csv(isas_cln_fpath,
                                       sep,
                                       encoding='utf8',
                                       error_bad_lines=False)
                finally:
                    # remove the temporary copy even if parsing fails
                    try_remove(isas_cln_fpath)
            else:
                isas_df = read_csv(isas_fpath,
                                   sep,
                                   encoding='utf8',
                                   error_bad_lines=False)

            isas_df = isas_df.drop(isas_df[isas_df["freq"] < min_freq].index)
            hypo2hyper = defaultdict(dict)
            hyper2hypo = defaultdict(dict)
            for _, row in isas_df.iterrows():
                try:
                    hypo = unicode(row["hyponym"]).split("#")[0]
                    hyper = unicode(row["hypernym"]).split("#")[0]
                    if lowercase:
                        hypo = hypo.lower()
                        hyper = hyper.lower()
                    freq = int(row["freq"])
                    hypo_lemma = lemmatize(hypo).lower()
                    hyper_lemma = lemmatize(hyper).lower()

                    # Each relation is stored under the surface forms and under the lemmas.
                    # NOTE(review): when both words equal their lemmas the same cell is
                    # incremented twice for one row -- confirm this is intended.
                    _add(hypo2hyper, hypo, hyper, freq)
                    _add(hypo2hyper, hypo_lemma, hyper_lemma, freq)
                    _add(hyper2hypo, hyper, hypo, freq)
                    _add(hyper2hypo, hyper_lemma, hypo_lemma, freq)

                except Exception:
                    # best effort: report the malformed row and continue with the next one
                    print "Bad row:", row
                    print format_exc()

            print "dictionary is loaded:", len(hypo2hyper)

            if use_pickle:
                pkl = {"hypo2hyper": hypo2hyper, "hyper2hypo": hyper2hypo}
                with open(isas_pkl_fpath, "wb") as pkl_file:
                    pickle.dump(pkl, pkl_file)
                print "Pickled voc:", isas_pkl_fpath

        # report the file the data was actually read from
        print "Loaded %d words from: %s" % (len(hypo2hyper), source_fpath)

        self._hypo2hyper = hypo2hyper
        self._hyper2hypo = hyper2hypo
Example #6
0
def get_topic_stoplist(preprocess=False):
    """ Returns the values of the "word" column of "topic-stoplist-489.csv"
    (looked up under get_data_dir()) as a list; if preprocess is True the
    file is first passed to preprocess_pandas_csv """
    stoplist_fpath = join(get_data_dir(), "topic-stoplist-489.csv")
    if preprocess:
        preprocess_pandas_csv(stoplist_fpath)

    stoplist_df = read_csv(stoplist_fpath, "\t", encoding='utf8', error_bad_lines=False)
    words = []
    for _, entry in stoplist_df.iterrows():
        words.append(entry.word)
    return words
Example #7
0
def get_topic_stoplist(preprocess=False):
    """ Loads "topic-stoplist-489.csv" from the directory given by get_data_dir()
    and returns its "word" column as a list (optionally running
    preprocess_pandas_csv on the file first) """
    fpath = join(get_data_dir(), "topic-stoplist-489.csv")
    if preprocess:
        preprocess_pandas_csv(fpath)

    table = read_csv(fpath, "\t", encoding='utf8', error_bad_lines=False)
    stoplist = []
    for _, record in table.iterrows():
        stoplist += [record.word]
    return stoplist
Example #8
0
    def __init__(
        self, isas_fpath, min_freq=1, preprocess=True, sep="\t", strip_pos=True, use_pickle=True, lowercase=True
    ):
        """ Provides access to a ISAs relations from a CSV file "hyponym<TAB>hypernym<TAB>freq"

        isas_fpath -- path to the relations file; if it does not exist both mappings stay empty
        min_freq -- rows whose "freq" is below this value are dropped
        preprocess -- if True, the file is first copied to "<isas_fpath>.cleaned" by
                      preprocess_pandas_csv and that copy is parsed (and removed afterwards)
        sep -- column separator passed to read_csv
        strip_pos -- currently not used: the part after the first "#" is always stripped
        use_pickle -- if True, load from "<isas_fpath>.pkl" when it exists, otherwise
                      parse the CSV and write that pickle
        lowercase -- if True, hyponyms and hypernyms are lowercased before being stored
        """

        if not exists(isas_fpath):
            # Set both mappings so that later attribute access never fails.
            self._hypo2hyper = {}
            self._hyper2hypo = {}
            return

        def _add(table, key, value, freq):
            """ Adds freq to table[key][value], starting from zero for a new cell """
            table[key][value] = table[key].get(value, 0) + freq

        isas_pkl_fpath = isas_fpath + ".pkl"
        if use_pickle and exists(isas_pkl_fpath):
            # NOTE: unpickling can execute arbitrary code; the cache file must be trusted.
            with open(isas_pkl_fpath, "rb") as pkl_file:
                pkl = pickle.load(pkl_file)
            source_fpath = isas_pkl_fpath

            if "hypo2hyper" in pkl:
                hypo2hyper = pkl["hypo2hyper"]
            else:
                print "Error: cannot find hypo2hyper in ", isas_pkl_fpath
                hypo2hyper = {}

            if "hyper2hypo" in pkl:
                hyper2hypo = pkl["hyper2hypo"]
            else:
                print "Error: cannot find hyper2hypo in ", isas_pkl_fpath
                hyper2hypo = {}

        else:
            source_fpath = isas_fpath

            if preprocess:
                isas_cln_fpath = isas_fpath + ".cleaned"
                preprocess_pandas_csv(isas_fpath, isas_cln_fpath)
                try:
                    isas_df = read_csv(isas_cln_fpath, sep, encoding="utf8", error_bad_lines=False)
                finally:
                    # remove the temporary copy even if parsing fails
                    try_remove(isas_cln_fpath)
            else:
                isas_df = read_csv(isas_fpath, sep, encoding="utf8", error_bad_lines=False)

            isas_df = isas_df.drop(isas_df[isas_df["freq"] < min_freq].index)
            hypo2hyper = defaultdict(dict)
            hyper2hypo = defaultdict(dict)
            for _, row in isas_df.iterrows():
                try:
                    hypo = unicode(row["hyponym"]).split("#")[0]
                    hyper = unicode(row["hypernym"]).split("#")[0]
                    if lowercase:
                        hypo = hypo.lower()
                        hyper = hyper.lower()
                    freq = int(row["freq"])
                    hypo_lemma = lemmatize(hypo).lower()
                    hyper_lemma = lemmatize(hyper).lower()

                    # Each relation is stored under the surface forms and under the lemmas.
                    # NOTE(review): when both words equal their lemmas the same cell is
                    # incremented twice for one row -- confirm this is intended.
                    _add(hypo2hyper, hypo, hyper, freq)
                    _add(hypo2hyper, hypo_lemma, hyper_lemma, freq)
                    _add(hyper2hypo, hyper, hypo, freq)
                    _add(hyper2hypo, hyper_lemma, hypo_lemma, freq)

                except Exception:
                    # best effort: report the malformed row and continue with the next one
                    print "Bad row:", row
                    print format_exc()

            print "dictionary is loaded:", len(hypo2hyper)

            if use_pickle:
                pkl = {"hypo2hyper": hypo2hyper, "hyper2hypo": hyper2hypo}
                with open(isas_pkl_fpath, "wb") as pkl_file:
                    pickle.dump(pkl, pkl_file)
                print "Pickled voc:", isas_pkl_fpath

        # report the file the data was actually read from
        print "Loaded %d words from: %s" % (len(hypo2hyper), source_fpath)

        self._hypo2hyper = hypo2hyper
        self._hyper2hypo = hyper2hypo