Code Example #1
File: sense_clusters.py  Project: zhangxt/sensegram
    def __init__(self,
                 sense_clusters_fpath,
                 strip_dst_senses=False,
                 load_sim=True,
                 verbose=False,
                 normalized_bow=False,
                 use_pickle=True,
                 voc_fpath="",
                 voc=[],
                 normalize_sim=False):
        """ Loads and operates sense clusters in the format 'word<TAB>cid<TAB>prob<TAB>cluster<TAB>isas' """

        self._verbose = verbose
        self._normalized_bow = normalized_bow
        self._stoplist = get_stoplist()
        self._normalize_sim = normalize_sim

        if len(voc) > 0:
            self._voc = voc
        elif exists(voc_fpath):
            self._voc = load_voc(voc_fpath)
        else:
            self._voc = {}

        sense_clusters_pkl_fpath = sense_clusters_fpath + ".pkl"
        if use_pickle and exists(sense_clusters_pkl_fpath):
            pkl = pickle.load(open(sense_clusters_pkl_fpath, "rb"))
            if "sense_clusters" in pkl:
                self._sc = pkl["sense_clusters"]
            else:
                print("Error: cannot find sense_clusters in ",
                      sense_clusters_pkl_fpath)
                self._sc = {}

            if "normword2word" in pkl:
                self._normword2word = pkl["normword2word"]
            else:
                print("Error: cannot find normword2word in ",
                      sense_clusters_pkl_fpath)
                self._normword2word = {}
            print("Loaded %d words from: %s" %
                  (len(self._sc), sense_clusters_pkl_fpath))

        else:
            self._sc, self._normword2word = self._load(sense_clusters_fpath,
                                                       strip_dst_senses,
                                                       load_sim)
            if use_pickle:
                pkl = {
                    "sense_clusters": self._sc,
                    "normword2word": self._normword2word
                }
                pickle.dump(pkl, open(sense_clusters_pkl_fpath, "wb"))
                print("Pickled sense clusters:", sense_clusters_pkl_fpath)
Code Example #2
    def __init__(self, sense_clusters_fpath, skip_voc_fpath=""):
        self._skip_voc = load_voc(
            skip_voc_fpath, preprocess=True, sep='\t',
            use_pickle=True) if exists(skip_voc_fpath) else set()
        print("Skip voc:", len(self._skip_voc))
        self._sense_clusters = SenseClusters(sense_clusters_fpath,
                                             strip_dst_senses=False)
        self._sc = self._sense_clusters.data
Code Example #3
File: sense_clusters.py  Project: zhangxt/sensegram
    def _load(self, pcz_fpath, strip_dst_senses, load_sim):
        """ Loads a dict[word][sense] --> {"cluster": Counter(), "cluster_norm": Counter(), "isas": Counter()} """

        senses = defaultdict(dict)
        normword2word = defaultdict(set)
        if not exists(pcz_fpath): return senses, normword2word

        df = read_csv(pcz_fpath,
                      encoding='utf-8',
                      delimiter=SEP,
                      error_bad_lines=True,
                      quotechar='\0')
        df = df.fillna("")
        err_clusters = 0
        num_senses = 0

        # foreach sense cluster
        for i, row in df.iterrows():
            try:
                if i % 25000 == 0:
                    print("%d (%d) senses loaded of %d" %
                          (i, num_senses, len(df)))
                if len(self._voc) > 0 and row.word not in self._voc:
                    continue

                r = {}
                r["prob"] = row.prob if "prob" in row else 1.0
                r["cluster"] = self._get_words(
                    row.cluster, strip_dst_senses,
                    load_sim) if "cluster" in row else Counter()
                r["cluster_norm"] = self._get_normalized_words(
                    r["cluster"]) if self._normalized_bow else r["cluster"]
                r["isas"] = self._get_words(
                    row.isas, strip_dst_senses,
                    load_sim) if "isas" in row else Counter()
                r["isas_norm"] = self._get_normalized_words(
                    r["isas"]) if self._normalized_bow else r["isas"]
                senses[row.word][row.cid] = r
                normword2word[self.norm(row.word)].add(row.word)
                num_senses += 1
            except:
                print(".", end=' ')
                if self._verbose:
                    print("Warning: bad cluster")
                    print(row)
                    print(format_exc())
                err_clusters += 1

        print(err_clusters, "cluster errors")
        print(num_senses, "senses loaded out of", i + 1)
        print(len(senses), "words loaded")

        return senses, normword2word
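A minimal sketch (not from the project) of an input file that _load above could parse. The header names match the columns accessed in the loop (word, cid, prob, cluster, isas) and SEP is assumed to be a tab; the 'related_word:score' layout inside the cluster and isas fields is an assumption about what _get_words expects.

# Hypothetical sample in the 'word<TAB>cid<TAB>prob<TAB>cluster<TAB>isas' format.
sample = (
    "word\tcid\tprob\tcluster\tisas\n"
    "python\t0\t0.7\tperl:0.9, ruby:0.8, java:0.7\tlanguage:0.9\n"
    "python\t1\t0.3\tcobra:0.8, boa:0.7\tsnake:0.9, reptile:0.7\n"
)
with open("senses.sample.csv", "w", encoding="utf-8") as f:
    f.write(sample)
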
Code Example #4
File: train.py  Project: yellowwoods12/sensegram
def main():
    parser = argparse.ArgumentParser(description='Performs training of a word sense embeddings model from a raw text '
                                                 'corpus using the SkipGram approach based on word2vec and graph '
                                                 'clustering of ego networks of semantically related terms.')
    parser.add_argument('train_corpus', help="Path to a training corpus in text form (can be .gz).")
    parser.add_argument('-phrases', help="Path to a file with extra vocabulary words, e.g. multiword expressions,"
                                     "which should be included into the vocabulary of the model. Each "
                                     "line of this text file should contain one word or phrase with no header.",
                        default="")
    parser.add_argument('-cbow', help="Use the continuous bag of words model (default is 1, use 0 for the "
                                      "skip-gram model).", default=1, type=int)
    parser.add_argument('-size', help="Set size of word vectors (default is 300).", default=300, type=int)
    parser.add_argument('-window', help="Set max skip length between words (default is 5).", default=5, type=int)
    parser.add_argument('-threads', help="Use <int> threads (default {}).".format(cpu_count()),
                        default=cpu_count(), type=int)
    parser.add_argument('-iter', help="Run <int> training iterations (default 5).", default=5, type=int)
    parser.add_argument('-min_count', help="This will discard words that appear less than <int> times"
                                           " (default is 10).", default=10, type=int)
    parser.add_argument('-N', help="Number of nodes in each ego-network (default is 200).", default=200, type=int)
    parser.add_argument('-n', help="Maximum number of edges a node can have in the network"
                                   " (default is 200).", default=200, type=int)
    parser.add_argument('-bigrams', help="Detect bigrams in the input corpus.", action="store_true")
    parser.add_argument('-min_size', help="Minimum size of the cluster (default is 5).", default=5, type=int)
    parser.add_argument('-make-pcz', help="Perform two extra steps to label the original sense inventory with"
                                          " hypernymy labels and disambiguate the list of related words."
                                          "The obtained resource is called proto-concepualization or PCZ.",
                        action="store_true")
    args = parser.parse_args()

    corpus_name = basename(args.train_corpus)
    model_dir = "model/"
    ensure_dir(model_dir)
    vectors_fpath = join(model_dir, corpus_name + ".cbow{}-size{}-window{}-iter{}-mincount{}-bigrams{}.word_vectors".format(
        args.cbow, args.size, args.window, args.iter, args.min_count, args.bigrams))
    vectors_short_fpath = join(model_dir, corpus_name + ".word_vectors")
    neighbours_fpath = join(model_dir, corpus_name + ".N{}.graph".format(args.N))
    clusters_fpath = join(model_dir, corpus_name + ".n{}.clusters".format(args.n))
    clusters_minsize_fpath = clusters_fpath + ".minsize" + str(args.min_size)  # clusters that satisfy min_size
    clusters_removed_fpath = clusters_minsize_fpath + ".removed"  # clusters that are smaller than min_size

    
    if exists(vectors_fpath):
        print("Using existing vectors:", vectors_fpath)
    elif exists(vectors_short_fpath):
        print("Using existing vectors:", vectors_short_fpath)
        vectors_fpath = vectors_short_fpath
    else:
        learn_word_embeddings(args.train_corpus, vectors_fpath, args.cbow, args.window,
                              args.iter, args.size, args.threads, args.min_count,
                              detect_bigrams=args.bigrams, phrases_fpath=args.phrases)

    if not exists(neighbours_fpath):
        compute_graph_of_related_words(vectors_fpath, neighbours_fpath, neighbors=args.N)
    else:
        print("Using existing neighbors:", neighbours_fpath)
        
    if not exists(clusters_fpath):
        word_sense_induction(neighbours_fpath, clusters_fpath, args.n, args.threads)
    else:
       print("Using existing clusters:", clusters_fpath)
   
    if not exists(clusters_minsize_fpath): 
        filter_clusters.run(clusters_fpath, clusters_minsize_fpath, args.min_size)
    else:
        print("Using existing filtered clusters:", clusters_minsize_fpath)
    
    building_sense_embeddings(clusters_minsize_fpath, vectors_fpath)

    if (args.make_pcz):
        # add isas
        isas_fpath = ""
        # in: clusters_minsize_fpath
        clusters_with_isas_fpath = clusters_minsize_fpath + ".isas"

        # disambiguate the original sense clusters
        clusters_disambiguated_fpath = clusters_with_isas_fpath + ".disambiguated"
        pcz.disamgiguate_sense_clusters.run(clusters_with_isas_fpath, clusters_disambiguated_fpath)

        # make the closure
        clusters_closure_fpath = clusters_disambiguated_fpath + ".closure"
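A minimal sketch (not from the project) of driving the pipeline above programmatically, assuming the script is saved as train.py on the import path; the corpus path and parameter values are hypothetical.

import sys
import train  # assumption: the script above is importable as a module named train

# Equivalent to: python train.py corpus/wiki.txt.gz -size 100 -N 200 -n 200 -min_size 5
sys.argv = ["train.py", "corpus/wiki.txt.gz",
            "-size", "100", "-N", "200", "-n", "200", "-min_size", "5"]
train.main()
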
Code Example #5
File: train.py  Project: sounak98/sensegram
def main():
    parser = argparse.ArgumentParser(description='Performs training of a word sense embeddings model from a raw text '
                                                 'corpus using the SkipGram approach based on word2vec and graph '
                                                 'clustering of ego networks of semantically related terms.')
    parser.add_argument('train_corpus', help="Path to a training corpus in text form (can be .gz).")
    parser.add_argument('-cbow', help="Use the continuous bag of words model (default is 1, use 0 for the "
                                      "skip-gram model).", default=1, type=int)
    parser.add_argument('-size', help="Set size of word vectors (default is 300).", default=300, type=int)
    parser.add_argument('-window', help="Set max skip length between words (default is 5).", default=5, type=int)
    parser.add_argument('-threads', help="Use <int> threads (default {}).".format(cpu_count()), default=cpu_count(), type=int)
    parser.add_argument('-iter', help="Run <int> training iterations (default 5).", default=5, type=int)
    parser.add_argument('-min_count', help="This will discard words that appear less than <int> times"
                                           " (default is 10).", default=10, type=int)
    #parser.add_argument('-only_letters', help="Use only words built from letters/dash/point for DT.", action="store_true")
    #parser.add_argument('-vocab_limit', help="Use only <int> most frequent words from word vector model"
    #                                         " for DT. By default use all words (default is none).", default=None, type=int)
    parser.add_argument('-N', help="Number of nodes in each ego-network (default is 200).", default=200, type=int)
    parser.add_argument('-n', help="Maximum number of edges a node can have in the network"
                                   " (default is 200).", default=200, type=int)
    parser.add_argument('-min_size', help="Minimum size of the cluster (default is 5).", default=5, type=int)
    parser.add_argument('-make-pcz', help="Perform two extra steps to label the original sense inventory with"
                                          " hypernymy labels and disambiguate the list of related words."
                                          "The obtained resource is called proto-concepualization or PCZ.", action="store_true")
    args = parser.parse_args()

    vectors_fpath, neighbours_fpath, clusters_fpath, clusters_minsize_fpath, clusters_removed_fpath = get_paths(
        args.train_corpus, args.min_size)
    
    if not exists(vectors_fpath):
        print(vectors_fpath)
        learn_word_embeddings(args.train_corpus, vectors_fpath, args.cbow, args.window,
                              args.iter, args.size, args.threads, args.min_count, detect_phrases=True)
    else:
        print("Using existing vectors:", vectors_fpath)
 
    if not exists(neighbours_fpath):
        compute_graph_of_related_words(vectors_fpath, neighbours_fpath)
    else:
        print("Using existing neighbors:", neighbours_fpath)
        
    if not exists(clusters_fpath):
        word_sense_induction(neighbours_fpath, clusters_fpath, args.n, args.threads)
    else:
       print("Using existing clusters:", clusters_fpath)
   
    if not exists(clusters_minsize_fpath): 
        filter_clusters.run(clusters_fpath, clusters_minsize_fpath, args.min_size)
    else:
        print("Using existing filtered clusters:", clusters_minsize_fpath)
    
    building_sense_embeddings(clusters_minsize_fpath, vectors_fpath)

    if (args.make_pcz):
        # add isas
        isas_fpath = ""
        # in: clusters_minsize_fpath
        clusters_with_isas_fpath = clusters_minsize_fpath + ".isas"


        # disambiguate the original sense clusters
        clusters_disambiguated_fpath = clusters_with_isas_fpath + ".disambiguated"
        pcz.disamgiguate_sense_clusters.run(clusters_with_isas_fpath, clusters_disambiguated_fpath)

        # make the closure
        clusters_closure_fpath = clusters_disambiguated_fpath + ".closure"
Code Example #6
def main():
    parser = argparse.ArgumentParser(
        description=
        'Performs training of a word sense embeddings model from a raw text '
        'corpus using the SkipGram approach based on word2vec and graph '
        'clustering of ego networks of semantically related terms.')

    parser.add_argument('-vectors',
                        help="Existing embeddings to make sense vectors from")
    parser.add_argument('-threads',
                        help="Use <int> threads (default {}).".format(
                            cpu_count()),
                        default=cpu_count(),
                        type=int)
    parser.add_argument('-iter',
                        help="Run <int> training iterations (default 5).",
                        default=5,
                        type=int)
    parser.add_argument(
        '-min_count',
        help="This will discard words that appear less than <int> times"
        " (default is 10).",
        default=10,
        type=int)
    parser.add_argument(
        '-N',
        help="Number of nodes in each ego-network (default is 200).",
        default=200,
        type=int)
    parser.add_argument(
        '-n',
        help="Maximum number of edges a node can have in the network"
        " (default is 200).",
        default=200,
        type=int)
    parser.add_argument('-min_size',
                        help="Minimum size of the cluster (default is 5).",
                        default=5,
                        type=int)

    args = parser.parse_args()

    model_dir = "model/"
    ensure_dir(model_dir)
    vectors_fpath = args.vectors
    neighbours_fpath = join(model_dir,
                            args.vectors + ".N{}.graph".format(args.N))
    clusters_fpath = join(model_dir,
                          args.vectors + ".n{}.clusters".format(args.n))
    clusters_minsize_fpath = clusters_fpath + ".minsize" + str(
        args.min_size)  # clusters that satisfy min_size
    clusters_removed_fpath = clusters_minsize_fpath + ".removed"  # clusters that are smaller than min_size

    if exists(vectors_fpath):
        print("Using existing vectors:", vectors_fpath)
    else:
        raise FileNotFoundError(vectors_fpath)

    if not exists(neighbours_fpath):
        compute_graph_of_related_words(vectors_fpath,
                                       neighbours_fpath,
                                       neighbors=args.N)
    else:
        print("Using existing neighbors:", neighbours_fpath)

    word_sense_induction(neighbours_fpath, clusters_fpath, args.n,
                         args.threads)
    filter_clusters.run(clusters_fpath, clusters_minsize_fpath, args.min_size)
    building_sense_embeddings(clusters_minsize_fpath, vectors_fpath)
Code Example #7
File: isas.py  Project: zhangxt/sensegram
    def __init__(self,
                 isas_fpath,
                 min_freq=0.0,
                 preprocess=True,
                 sep='\t',
                 strip_pos=True,
                 use_pickle=True,
                 lowercase=True):
        """ Provides access to a ISAs relations from a CSV file "hyponym<TAB>hypernym<TAB>freq" """

        if not exists(isas_fpath):
            self._hypo2hyper = {}
            self._hyper2hypo = {}
            return

        isas_pkl_fpath = isas_fpath + ".pkl"
        if use_pickle and exists(isas_pkl_fpath):
            pkl = pickle.load(open(isas_pkl_fpath, "rb"))
            if "hypo2hyper" in pkl:
                hypo2hyper = pkl["hypo2hyper"]
            else:
                print(("Error: cannot find hypo2hyper in ", isas_pkl_fpath))
                hypo2hyper = {}

            if "hyper2hypo" in pkl:
                hyper2hypo = pkl["hyper2hypo"]
            else:
                print(("Error: cannot find hyper2hypo in ", isas_pkl_fpath))
                hyper2hypo = {}

        else:
            if preprocess:
                isas_cln_fpath = isas_fpath + ".cleaned"
                preprocess_pandas_csv(isas_fpath, isas_cln_fpath)
                isas_df = read_csv(isas_cln_fpath,
                                   sep,
                                   encoding='utf8',
                                   error_bad_lines=False)
                try_remove(isas_cln_fpath)
            else:
                isas_df = read_csv(isas_fpath,
                                   sep,
                                   encoding='utf8',
                                   error_bad_lines=False)

            isas_df = isas_df.drop(isas_df[isas_df["freq"] < min_freq].index)
            hypo2hyper = defaultdict(dict)
            hyper2hypo = defaultdict(dict)
            for i, row in isas_df.iterrows():
                try:
                    hypo = str(row["hyponym"]).split("#")[0].lower(
                    ) if lowercase else str(row["hyponym"]).split("#")[0]
                    hyper = str(row["hypernym"]).split("#")[0].lower(
                    ) if lowercase else str(row["hypernym"]).split("#")[0]
                    freq = float(row["freq"])
                    hypo_lemma = lemmatize(hypo).lower()
                    hyper_lemma = lemmatize(hyper).lower()

                    # hypo2hyper
                    if hypo not in hypo2hyper or hyper not in hypo2hyper[hypo]:
                        hypo2hyper[hypo][hyper] = freq
                    else:
                        hypo2hyper[hypo][hyper] += freq
                    if (hypo_lemma, hyper_lemma) != (hypo, hyper):
                        if hypo_lemma not in hypo2hyper or hyper_lemma not in hypo2hyper[
                                hypo_lemma]:
                            hypo2hyper[hypo_lemma][hyper_lemma] = freq
                        else:
                            hypo2hyper[hypo_lemma][hyper_lemma] += freq

                    # hyper2hypo
                    if hyper not in hyper2hypo or hypo not in hyper2hypo[hyper]:
                        hyper2hypo[hyper][hypo] = freq
                    else:
                        hyper2hypo[hyper][hypo] += freq
                    if (hypo_lemma, hyper_lemma) != (hypo, hyper):
                        if hyper_lemma not in hyper2hypo or hypo_lemma not in hyper2hypo[
                                hyper_lemma]:
                            hyper2hypo[hyper_lemma][hypo_lemma] = freq
                        else:
                            hyper2hypo[hyper_lemma][hypo_lemma] += freq

                except:
                    print(("Bad row:", row))
                    print((format_exc()))

            print(("dictionary is loaded:", len(hypo2hyper)))

            if use_pickle:
                pkl = {"hypo2hyper": hypo2hyper, "hyper2hypo": hyper2hypo}
                pickle.dump(pkl, open(isas_pkl_fpath, "wb"))
                print(("Pickled voc:", isas_pkl_fpath))

        print(("Loaded %d words from: %s" %
               (len(hypo2hyper),
                isas_pkl_fpath if isas_pkl_fpath else isas_fpath)))

        self._hypo2hyper = hypo2hyper
        self._hyper2hypo = hyper2hypo
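A minimal usage sketch (not part of the project sources), assuming the constructor above belongs to a class importable as ISAs from isas and that a 'hyponym<TAB>hypernym<TAB>freq' file exists at the hypothetical path below; only the private dictionaries set in __init__ are used, since no public accessors appear in this excerpt.

from isas import ISAs  # assumption: the class defining the __init__ above

isas = ISAs("model/isas.csv", min_freq=2.0, use_pickle=False)  # hypothetical path
# hypo2hyper / hyper2hypo map a word to a dict of related words with accumulated frequencies.
print(isas._hypo2hyper.get("python", {}))
print(isas._hyper2hypo.get("language", {}))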