Code example #1
File: taxo.py Project: anukat2015/taxi
    def __init__(self, taxonomy_resources, voc_fpath="", relations_fpath="", lang="en"):
        self._isas = taxonomy_resources.isas
        self._freqs = taxonomy_resources.freqs
        self.voc_name = fpath2filename(voc_fpath)
        self._voc_fpath = voc_fpath
        self._stopwords = load_stoplist(lang=lang)
        self._lang = lang

        if exists(voc_fpath) and not exists(relations_fpath):
            self.voc = self._load_voc(voc_fpath) 
            relations_fpath = voc_fpath + "-relations.csv"
            print "Generating new relations file:", relations_fpath
            self._relations_fpath = relations_fpath
            self._relations = self._generate_relations(self.voc, self._relations_fpath)
        elif exists(relations_fpath):
            print "Loading relations file:", relations_fpath
            self._relations_fpath = relations_fpath
            self._relations = read_csv(relations_fpath, encoding='utf-8', delimiter="\t", error_bad_lines=False)
            print "Loaded %d relations from: %s" % (len(self._relations), relations_fpath)
            hypos_voc = set(self._relations.hyponym.to_dict().values())
            hyper_voc = set(self._relations.hypernym.to_dict().values())
            self.voc = hypos_voc.union(hyper_voc)
            print "Loaded %d voc from relations" % len(self.voc)
        else:
            raise Exception("Error: cannot load relations or generate them. Specify either voc_fpath or relations_fpath.")
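For context, a minimal usage sketch of this constructor; the class name `TaxonomyFeatures` and the `load_taxonomy_resources` helper are hypothetical, since the excerpt shows only `__init__`:

# Hypothetical usage sketch; TaxonomyFeatures and load_taxonomy_resources are assumed names.
resources = load_taxonomy_resources()  # anything exposing .isas and .freqs
# With only a vocabulary, a relations file is generated next to it:
taxo = TaxonomyFeatures(resources, voc_fpath="vocabulary.csv")
# With an existing relations file, the vocabulary is derived from its hyponym/hypernym columns:
taxo = TaxonomyFeatures(resources, relations_fpath="vocabulary.csv-relations.csv")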
Code example #2
File: direction.py Project: anukat2015/taxi
def load_relations(relations_fpath, taxo_en_plants_fpath="", taxo_en_vehicles_fpath="", taxo_en_ai_fpath="", taxo_eval_en_ai_fpath=""):
    if exists(relations_fpath):
        relations = read_csv(relations_fpath, encoding='utf-8', delimiter="\t", error_bad_lines=False)
        print "Relations loaded from:", relations_fpath

    elif exists(taxo_en_plants_fpath) and exists(taxo_en_vehicles_fpath) and exists(taxo_en_ai_fpath) and exists(taxo_eval_en_ai_fpath):
        tic = time()
        plants = taxo2csv_all_correct(taxo_en_plants_fpath)
        plants = insert_source(taxo_en_plants_fpath, plants)
        print "plants:", len(plants)

        vehicles = taxo2csv_all_correct(taxo_en_vehicles_fpath)
        vehicles = insert_source(taxo_en_vehicles_fpath, vehicles)
        print "vehicles:", len(vehicles)

        ai = taxo2csv_mixed(taxo_en_ai_fpath, taxo_eval_en_ai_fpath)
        ai = insert_source(taxo_en_ai_fpath, ai)
        print "ai:", len(ai)

        relations = concat([plants, vehicles, ai], ignore_index=True)
        print "all:", len(relations)

        relations = remove_underscores(relations)
        relations = add_inverse_relations(relations)
        relations = relations.sort_values(["hyponym", "correct"], ascending=[1, 0])
        relations.to_csv(relations_fpath, sep="\t", encoding="utf-8", float_format='%.0f', index=False)
        relations = read_csv(relations_fpath, encoding='utf-8', delimiter="\t", error_bad_lines=False)
        print "Dataset:", relations_fpath
        print "Relations generated and loaded in %.1f sec." % (time()-tic)
        
    else:
        print "Error: cannot load relations. No input files found." 
        relations = None

    return relations
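`load_relations` thus either reads a cached relations TSV or rebuilds it from the four taxonomy files and caches the result. A usage sketch (all file paths are hypothetical):

# Hypothetical paths; the first call builds relations.csv, later calls just read it.
relations = load_relations("relations.csv",
                           taxo_en_plants_fpath="plants.taxo",
                           taxo_en_vehicles_fpath="vehicles.taxo",
                           taxo_en_ai_fpath="ai.taxo",
                           taxo_eval_en_ai_fpath="ai-eval.taxo")
if relations is not None:
    print("%d relations" % len(relations))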
Code example #3
File: supervised.py Project: anukat2015/taxi
    def __init__(self, model_dir, method="LogisticRegressionL2", features=FEATURES, k=100, overwrite=False):

        self.CLASSIFIER_FILE = "classifier"
        self.KBEST_VOC_FILE = "kbest-voc.csv"
        self.KBEST_FILE = "kbest.pkl"
        self.META_FILE = "meta.json"
        clf_fpath = join(model_dir, self.CLASSIFIER_FILE)
        kbest_fpath = join(model_dir, self.KBEST_FILE)
        self._model_dir = model_dir
        self._meta_fpath = join(model_dir, self.META_FILE)
        
        self._meta = {}
        self._meta["method"] = method
        self._meta["k"] = k
        self._meta["features"] = features

        if exists(model_dir) and exists(clf_fpath) and not overwrite:
            # load the model
            self._clf = joblib.load(clf_fpath)
            self._meta = json.load(open(self._meta_fpath, "r"))
            print "Metadata were loaded from:", self._meta_fpath
        else:
            # model doesn't exist or must be overwritten: create a new one
            ensure_dir(model_dir)
            self.save_meta()
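A usage sketch for this constructor, assuming the enclosing class is named `SupervisedModel` (a hypothetical name; the excerpt shows only `__init__`):

# Hypothetical usage; the class name is an assumption.
model = SupervisedModel("model/", method="LogisticRegressionL2", k=100)
# If model/classifier exists and overwrite=False, the pickled classifier and
# meta.json are loaded; otherwise the directory is created and fresh metadata saved.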
Code example #4
File: supervised.py Project: mjj203/taxi-1
    def __init__(self,
                 model_dir,
                 method="LogisticRegressionL2",
                 features=FEATURES,
                 k=100,
                 overwrite=False):

        self.CLASSIFIER_FILE = "classifier"
        self.KBEST_VOC_FILE = "kbest-voc.csv"
        self.KBEST_FILE = "kbest.pkl"
        self.META_FILE = "meta.json"
        clf_fpath = join(model_dir, self.CLASSIFIER_FILE)
        kbest_fpath = join(model_dir, self.KBEST_FILE)
        self._model_dir = model_dir
        self._meta_fpath = join(model_dir, self.META_FILE)

        self._meta = {}
        self._meta["method"] = method
        self._meta["k"] = k
        self._meta["features"] = features

        if exists(model_dir) and exists(clf_fpath) and not overwrite:
            # load the model
            self._clf = joblib.load(clf_fpath)
            self._meta = json.load(open(self._meta_fpath, "r"))
            print("Metadata were loaded from:", self._meta_fpath)
        else:
            # model doesn't exist or must be overwritten: create a new one
            ensure_dir(model_dir)
            self.save_meta()
Code example #5
def load_relations(relations_fpath,
                   taxo_en_plants_fpath="",
                   taxo_en_vehicles_fpath="",
                   taxo_en_ai_fpath="",
                   taxo_eval_en_ai_fpath=""):
    if exists(relations_fpath):
        relations = read_csv(relations_fpath,
                             encoding='utf-8',
                             delimiter="\t",
                             error_bad_lines=False)
        print("Relations loaded from:", relations_fpath)

    elif exists(taxo_en_plants_fpath) and exists(
            taxo_en_vehicles_fpath) and exists(taxo_en_ai_fpath) and exists(
                taxo_eval_en_ai_fpath):
        tic = time()
        plants = taxo2csv_all_correct(taxo_en_plants_fpath)
        plants = insert_source(taxo_en_plants_fpath, plants)
        print("plants:", len(plants))

        vehicles = taxo2csv_all_correct(taxo_en_vehicles_fpath)
        vehicles = insert_source(taxo_en_vehicles_fpath, vehicles)
        print("vehicles:", len(vehicles))

        ai = taxo2csv_mixed(taxo_en_ai_fpath, taxo_eval_en_ai_fpath)
        ai = insert_source(taxo_en_ai_fpath, ai)
        print("ai:", len(ai))

        relations = concat([plants, vehicles, ai], ignore_index=True)
        print("all:", len(relations))

        relations = remove_underscores(relations)
        relations = add_inverse_relations(relations)
        relations = relations.sort_values(["hyponym", "correct"],
                                          ascending=[1, 0])
        relations.to_csv(relations_fpath,
                         sep="\t",
                         encoding="utf-8",
                         float_format='%.0f',
                         index=False)
        relations = read_csv(relations_fpath,
                             encoding='utf-8',
                             delimiter="\t",
                             error_bad_lines=False)
        print("Dataset:", relations_fpath)
        print("Relations generated and loaded in %.1f sec." % (time() - tic))

    else:
        print("Error: cannot load relations. No input files found.")
        relations = None

    return relations
Code example #6
    def _load(self, babelnet_fpath, divide_by_freq=False, sanity_check=True):
        if not exists(babelnet_fpath): return defaultdict(dict)

        with open(babelnet_fpath, 'rb') as babelnet_file:
            bn = pickle.load(babelnet_file)

        if sanity_check:
            err_num = 0
            for word in bn:
                if len(bn[word]) <= 0:
                    err_num += 1
                    print "Warning: local word with no senses", word
            if err_num > 0:
                print "Warning:", err_num, "local words with no senses"

            print "Loaded BabelNet with %d words from: %s" % (len(bn), babelnet_fpath)

        self._block_save = False
        if self._normalized:
            for word in bn:
                for sense_id in bn[word]:
                    if divide_by_freq:
                        bow = Counter({w: bn[word][sense_id]["bow"][w] / self._freq.freq(w) for w in bn[word][sense_id]["bow"] if good_token(w)})
                        self._block_save = True
                    else:
                        bow = bn[word][sense_id]["bow"]

                    max_freq_norm = float(max(bow.values())) if len(bow) > 0 else 1.0
                    if max_freq_norm == 0.0: max_freq_norm = 1.0
                    bow_range_norm = Counter({w: bow[w] / max_freq_norm for w in bow if good_token(w)})

                    bn[word][sense_id]["bow"] = bow_range_norm

        return bn
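The normalization branch rescales each sense's bag of words so that its largest weight becomes 1.0, guarding against empty or all-zero bags. A standalone sketch of that step, with `good_token` stubbed purely for illustration:

from collections import Counter

def good_token(w):
    return len(w) > 2  # stub filter, for illustration only

bow = Counter({"plant": 10.0, "tree": 5.0, "of": 7.0})
max_freq_norm = float(max(bow.values())) if len(bow) > 0 else 1.0
if max_freq_norm == 0.0: max_freq_norm = 1.0
bow_range_norm = Counter({w: bow[w] / max_freq_norm for w in bow if good_token(w)})
print(bow_range_norm)  # Counter({'plant': 1.0, 'tree': 0.5})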
Code example #7
    def _save_synset(self, word, sid, synset):
        try:
            if not exists(self._babelnet_dir): return
            output_fpath = join(self._babelnet_dir, word + "#" + sid + ".json")
            with codecs.open(output_fpath, 'w', "utf-8") as outfile:
                print >> outfile, json.dumps(synset, ensure_ascii=False).decode("utf-8")
        except:
            print "Error saving file"
            print format_exc()
Code example #8
File: freq.py Project: luisfgutierrez/taxi
    def __init__(self, freq_fpath, min_freq=1, preprocess=True, sep='\t', strip_pos=True, use_pickle=True):
        """ Reads a word frequency list in CSV format "word<TAB>freq" """

        if not exists(freq_fpath):
            self._freq = {}
            return

        pkl_fpath = freq_fpath + ".pkl"
        if use_pickle and exists(pkl_fpath):
            voc = pickle.load(open(pkl_fpath, "rb"))
        else:
            # load words into a dataframe
            if preprocess:
                freq_cln_fpath = freq_fpath + "-cln"
                preprocess_pandas_csv(freq_fpath, freq_cln_fpath)
                word_df = read_csv(freq_cln_fpath, sep, encoding='utf8', error_bad_lines=False)
                try_remove(freq_cln_fpath)
            else:
                word_df = read_csv(freq_fpath, sep, encoding='utf8', error_bad_lines=False)

            # load from dataframe to dictionary
            word_df = word_df.drop(word_df[word_df["freq"] < min_freq].index)
            if strip_pos:
                voc = {}
                for i, row in word_df.iterrows():
                    try:
                        word = unicode(row["word"]).split("#")[0]
                        freq = int(row["freq"])
                        if word not in voc or voc[word] < freq: voc[word] = freq
                    except:
                        print "Bad row:", row
                        print format_exc()
            else:
                voc = { row["word"]: row["freq"] for i, row in word_df.iterrows() }

            print "dictionary is loaded:", len(voc)

            if use_pickle:
                pickle.dump(voc, open(pkl_fpath, "wb"))
                print "Pickled voc:", pkl_fpath

        print "Loaded %d words from: %s" % (len(voc), pkl_fpath if pkl_fpath else freq_fpath)

        self._freq = voc
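A usage sketch, assuming the enclosing class is named `FreqDictionary` (hypothetical) and a tab-separated freq.csv with word and freq columns:

# Hypothetical usage; class name and file path are assumptions.
freqs = FreqDictionary("freq.csv", min_freq=5)
# A second instantiation is fast: it loads freq.csv.pkl instead of re-parsing the CSV.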
Code example #9
    def _save_synset(self, word, sid, synset):
        try:
            if not exists(self._babelnet_dir): return
            output_fpath = join(self._babelnet_dir, word + "#" + sid + ".json")
            with codecs.open(output_fpath, 'w', "utf-8") as outfile:
                print >> outfile, json.dumps(
                    synset, ensure_ascii=False).decode("utf-8")
        except:
            print "Error saving file"
            print format_exc()
Code example #10
File: taxo.py Project: anukat2015/taxi
    def _load_voc(self, voc_fpath):
        if exists(voc_fpath):
            voc_df = read_csv(voc_fpath, encoding='utf-8', delimiter="\t", error_bad_lines=False)
            voc_name = fpath2filename(voc_fpath)

            voc = set()
            for i, row in voc_df.iterrows():
                if "term" in row: voc.add(row.term)
                elif "word" in row: voc.add(row.word)
            print "Loaded %d words vocabulary"  % len(voc) 
            return voc
        else:
            print "Warning: vocabulary is not loaded. This means hypo2hyper features cannot be extracted."
            return set()
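`_load_voc` expects a tab-separated file with a `term` (or `word`) column; a minimal illustrative vocabulary file would look like:

term
apple
fruit
vehicle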
Code example #11
File: taxo.py Project: binarymax/taxi
    def __init__(self,
                 taxonomy_resources,
                 voc_fpath="",
                 relations_fpath="",
                 lang="en"):
        self._isas = taxonomy_resources.isas
        self._freqs = taxonomy_resources.freqs
        self.voc_name = fpath2filename(voc_fpath)
        self._voc_fpath = voc_fpath
        self._stopwords = load_stoplist(lang=lang)
        self._lang = lang

        if exists(voc_fpath) and not exists(relations_fpath):
            self.voc = self._load_voc(voc_fpath)
            relations_fpath = voc_fpath + "-relations.csv"
            print "Generating new relations file:", relations_fpath
            self._relations_fpath = relations_fpath
            self._relations = self._generate_relations(self.voc,
                                                       self._relations_fpath)
        elif exists(relations_fpath):
            print "Loading relations file:", relations_fpath
            self._relations_fpath = relations_fpath
            self._relations = read_csv(relations_fpath,
                                       encoding='utf-8',
                                       delimiter="\t",
                                       error_bad_lines=False)
            print "Loaded %d relations from: %s" % (len(
                self._relations), relations_fpath)
            hypos_voc = set(self._relations.hyponym.to_dict().values())
            hyper_voc = set(self._relations.hypernym.to_dict().values())
            self.voc = hypos_voc.union(hyper_voc)
            print "Loaded %d voc from relations" % len(self.voc)
        else:
            raise Exception(
                "Error: cannot load relations or generate them. Specify either voc_fpath or relations_fpath."
            )
Code example #12
File: taxo.py Project: binarymax/taxi
    def _load_voc(self, voc_fpath):
        if exists(voc_fpath):
            voc_df = read_csv(voc_fpath,
                              encoding='utf-8',
                              delimiter="\t",
                              error_bad_lines=False)
            voc_name = fpath2filename(voc_fpath)

            voc = set()
            for i, row in voc_df.iterrows():
                if "term" in row: voc.add(row.term)
                elif "word" in row: voc.add(row.word)
            print "Loaded %d words vocabulary" % len(voc)
            return voc
        else:
            print "Warning: vocabulary is not loaded. This means hypo2hyper features cannot be extracted."
            return set()
Code example #13
    def _load(self, babelnet_fpath, divide_by_freq=False, sanity_check=True):
        if not exists(babelnet_fpath): return defaultdict(dict)

        with open(babelnet_fpath, 'rb') as babelnet_file:
            bn = pickle.load(babelnet_file)

        if sanity_check:
            err_num = 0
            for word in bn:
                if len(bn[word]) <= 0:
                    err_num += 1
                    print "Warning: local word with no senses", word
            if err_num > 0:
                print "Warning:", err_num, "local words with no senses"

            print "Loaded BabelNet with %d words from: %s" % (len(bn),
                                                              babelnet_fpath)

        self._block_save = False
        if self._normalized:
            for word in bn:
                for sense_id in bn[word]:
                    if divide_by_freq:
                        bow = Counter({
                            w:
                            bn[word][sense_id]["bow"][w] / self._freq.freq(w)
                            for w in bn[word][sense_id]["bow"] if good_token(w)
                        })
                        self._block_save = True
                    else:
                        bow = bn[word][sense_id]["bow"]

                    max_freq_norm = float(max(
                        bow.values())) if len(bow) > 0 else 1.0
                    if max_freq_norm == 0.0: max_freq_norm = 1.0
                    bow_range_norm = Counter({
                        w: bow[w] / max_freq_norm
                        for w in bow if good_token(w)
                    })

                    bn[word][sense_id]["bow"] = bow_range_norm

        return bn
Code example #14
def adagram_disambiguate(contexts_fpath, model_fpath, output_fpath, nearest_neighbors="false"):
    env = dict(os.environ)
    env["DYLD_LIBRARY_PATH"] = DYLD_LIBRARY
    p = Popen(["julia",
               join(ADAGRAM_SCRIPTS_DIR, "matching.jl"),
               contexts_fpath,
               model_fpath,
               output_fpath,
               nearest_neighbors],
               stdin=PIPE,
               stdout=PIPE,
               stderr=PIPE,
               env=env)
    stdout, err = p.communicate(b"")
    rc = p.returncode

    print stdout
    print err
    print "Output:", output_fpath
    print "Output exits:", exists(output_fpath)
Code example #15
def adagram_disambiguate(contexts_fpath,
                         model_fpath,
                         output_fpath,
                         nearest_neighbors="false"):
    env = dict(os.environ)
    env["DYLD_LIBRARY_PATH"] = DYLD_LIBRARY
    p = Popen([
        "julia",
        join(ADAGRAM_SCRIPTS_DIR, "matching.jl"), contexts_fpath, model_fpath,
        output_fpath, nearest_neighbors
    ],
              stdin=PIPE,
              stdout=PIPE,
              stderr=PIPE,
              env=env)
    stdout, err = p.communicate(b"")
    rc = p.returncode

    print stdout
    print err
    print "Output:", output_fpath
    print "Output exits:", exists(output_fpath)
Code example #16
File: isas.py Project: shannonyu/taxi
    def __init__(self,
                 isas_fpath,
                 min_freq=1,
                 preprocess=True,
                 sep='\t',
                 strip_pos=True,
                 use_pickle=True,
                 lowercase=True):
        """ Provides access to a ISAs relations from a CSV file "hyponym<TAB>hypernym<TAB>freq" """

        if not exists(isas_fpath):
            self._hypo2hyper = {}
            return

        isas_pkl_fpath = isas_fpath + ".pkl"
        if use_pickle and exists(isas_pkl_fpath):
            pkl = pickle.load(open(isas_pkl_fpath, "rb"))
            if "hypo2hyper" in pkl:
                hypo2hyper = pkl["hypo2hyper"]
            else:
                print "Error: cannot find hypo2hyper in ", isas_pkl_fpath
                hypo2hyper = {}

            if "hyper2hypo" in pkl:
                hyper2hypo = pkl["hyper2hypo"]
            else:
                print "Error: cannot find hyper2hypo in ", isas_pkl_fpath
                hyper2hypo = {}

        else:
            if preprocess:
                isas_cln_fpath = isas_fpath + ".cleaned"
                preprocess_pandas_csv(isas_fpath, isas_cln_fpath)
                isas_df = read_csv(isas_cln_fpath,
                                   sep,
                                   encoding='utf8',
                                   error_bad_lines=False)
                try_remove(isas_cln_fpath)
            else:
                isas_df = read_csv(isas_fpath,
                                   sep,
                                   encoding='utf8',
                                   error_bad_lines=False)

            isas_df = isas_df.drop(isas_df[isas_df["freq"] < min_freq].index)
            hypo2hyper = defaultdict(dict)
            hyper2hypo = defaultdict(dict)
            for i, row in isas_df.iterrows():
                try:
                    hypo = unicode(row["hyponym"]).split("#")[0].lower(
                    ) if lowercase else unicode(row["hyponym"]).split("#")[0]
                    hyper = unicode(row["hypernym"]).split("#")[0].lower(
                    ) if lowercase else unicode(row["hypernym"]).split("#")[0]
                    freq = int(row["freq"])
                    hypo_lemma = lemmatize(hypo).lower()
                    hyper_lemma = lemmatize(hyper).lower()

                    if hypo not in hypo2hyper or hyper not in hypo2hyper[hypo]:
                        hypo2hyper[hypo][hyper] = freq
                    else:
                        hypo2hyper[hypo][hyper] += freq
                    if hypo_lemma not in hypo2hyper or hyper_lemma not in hypo2hyper[
                            hypo_lemma]:
                        hypo2hyper[hypo_lemma][hyper_lemma] = freq
                    else:
                        hypo2hyper[hypo_lemma][hyper_lemma] += freq

                    if hyper not in hyper2hypo or hypo not in hyper2hypo[hyper]:
                        hyper2hypo[hyper][hypo] = freq
                    else:
                        hyper2hypo[hyper][hypo] += freq
                    if hyper_lemma not in hyper2hypo or hypo_lemma not in hyper2hypo[
                            hyper_lemma]:
                        hyper2hypo[hyper_lemma][hypo_lemma] = freq
                    else:
                        hyper2hypo[hyper_lemma][hypo_lemma] += freq

                except:
                    print "Bad row:", row
                    print format_exc()

            print "dictionary is loaded:", len(hypo2hyper)

            if use_pickle:
                pkl = {"hypo2hyper": hypo2hyper, "hyper2hypo": hyper2hypo}
                pickle.dump(pkl, open(isas_pkl_fpath, "wb"))
                print "Pickled voc:", isas_pkl_fpath

        print "Loaded %d words from: %s" % (len(hypo2hyper), isas_pkl_fpath
                                            if use_pickle else isas_fpath)

        self._hypo2hyper = hypo2hyper
        self._hyper2hypo = hyper2hypo
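A usage sketch, assuming the enclosing class is named `ISAs` (hypothetical) and a tab-separated isas.csv with hyponym, hypernym and freq columns:

# Hypothetical usage; class name and file path are assumptions.
isas = ISAs("isas.csv", min_freq=2)
# Relations are indexed in both directions, for surface forms and lemmas alike,
# e.g. hypo2hyper["apple"]["fruit"] and hyper2hypo["fruit"]["apple"] hold the frequency.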
Code example #17
File: isas.py Project: tudarmstadt-lt/taxi
    def __init__(
        self, isas_fpath, min_freq=1, preprocess=True, sep="\t", strip_pos=True, use_pickle=True, lowercase=True
    ):
        """ Provides access to a ISAs relations from a CSV file "hyponym<TAB>hypernym<TAB>freq" """

        if not exists(isas_fpath):
            self._hypo2hyper = {}
            return

        isas_pkl_fpath = isas_fpath + ".pkl"
        if use_pickle and exists(isas_pkl_fpath):
            pkl = pickle.load(open(isas_pkl_fpath, "rb"))
            if "hypo2hyper" in pkl:
                hypo2hyper = pkl["hypo2hyper"]
            else:
                print "Error: cannot find hypo2hyper in ", isas_pkl_fpath
                hypo2hyper = {}

            if "hyper2hypo" in pkl:
                hyper2hypo = pkl["hyper2hypo"]
            else:
                print "Error: cannot find hyper2hypo in ", isas_pkl_fpath
                hyper2hypo = {}

        else:
            if preprocess:
                isas_cln_fpath = isas_fpath + ".cleaned"
                preprocess_pandas_csv(isas_fpath, isas_cln_fpath)
                isas_df = read_csv(isas_cln_fpath, sep, encoding="utf8", error_bad_lines=False)
                try_remove(isas_cln_fpath)
            else:
                isas_df = read_csv(isas_fpath, sep, encoding="utf8", error_bad_lines=False)

            isas_df = isas_df.drop(isas_df[isas_df["freq"] < min_freq].index)
            hypo2hyper = defaultdict(dict)
            hyper2hypo = defaultdict(dict)
            for i, row in isas_df.iterrows():
                try:
                    hypo = (
                        unicode(row["hyponym"]).split("#")[0].lower()
                        if lowercase
                        else unicode(row["hyponym"]).split("#")[0]
                    )
                    hyper = (
                        unicode(row["hypernym"]).split("#")[0].lower()
                        if lowercase
                        else unicode(row["hypernym"]).split("#")[0]
                    )
                    freq = int(row["freq"])
                    hypo_lemma = lemmatize(hypo).lower()
                    hyper_lemma = lemmatize(hyper).lower()

                    if hypo not in hypo2hyper or hyper not in hypo2hyper[hypo]:
                        hypo2hyper[hypo][hyper] = freq
                    else:
                        hypo2hyper[hypo][hyper] += freq
                    if hypo_lemma not in hypo2hyper or hyper_lemma not in hypo2hyper[hypo_lemma]:
                        hypo2hyper[hypo_lemma][hyper_lemma] = freq
                    else:
                        hypo2hyper[hypo_lemma][hyper_lemma] += freq

                    if hyper not in hyper2hypo or hypo not in hyper2hypo[hyper]:
                        hyper2hypo[hyper][hypo] = freq
                    else:
                        hyper2hypo[hyper][hypo] += freq
                    if hyper_lemma not in hyper2hypo or hypo_lemma not in hyper2hypo[hyper_lemma]:
                        hyper2hypo[hyper_lemma][hypo_lemma] = freq
                    else:
                        hyper2hypo[hyper_lemma][hypo_lemma] += freq

                except:
                    print "Bad row:", row
                    print format_exc()

            print "dictionary is loaded:", len(hypo2hyper)

            if use_pickle:
                pkl = {"hypo2hyper": hypo2hyper, "hyper2hypo": hyper2hypo}
                pickle.dump(pkl, open(isas_pkl_fpath, "wb"))
                print "Pickled voc:", isas_pkl_fpath

        print "Loaded %d words from: %s" % (len(hypo2hyper), isas_pkl_fpath if isas_pkl_fpath else isas_fpath)

        self._hypo2hyper = hypo2hyper
        self._hyper2hypo = hyper2hypo
Code example #18
File: freq.py Project: shannonyu/taxi
    def __init__(self,
                 freq_fpath,
                 min_freq=1,
                 preprocess=True,
                 sep='\t',
                 strip_pos=True,
                 use_pickle=True):
        """ Reads a word frequency list in CSV format "word<TAB>freq" """

        if not exists(freq_fpath):
            self._freq = {}
            return

        pkl_fpath = freq_fpath + ".pkl"
        if use_pickle and exists(pkl_fpath):
            voc = pickle.load(open(pkl_fpath, "rb"))
        else:
            # load words into a dataframe
            if preprocess:
                freq_cln_fpath = freq_fpath + "-cln"
                preprocess_pandas_csv(freq_fpath, freq_cln_fpath)
                word_df = read_csv(freq_cln_fpath,
                                   sep,
                                   encoding='utf8',
                                   error_bad_lines=False)
                try_remove(freq_cln_fpath)
            else:
                word_df = read_csv(freq_fpath,
                                   sep,
                                   encoding='utf8',
                                   error_bad_lines=False)

            # load from dataframe to dictionary
            word_df = word_df.drop(word_df[word_df["freq"] < min_freq].index)
            if strip_pos:
                voc = {}
                for i, row in word_df.iterrows():
                    try:
                        word = unicode(row["word"]).split("#")[0]
                        freq = int(row["freq"])
                        if word not in voc or voc[word] < freq:
                            voc[word] = freq
                    except:
                        print "Bad row:", row
                        print format_exc()
            else:
                voc = {
                    row["word"]: row["freq"]
                    for i, row in word_df.iterrows()
                }

            print "dictionary is loaded:", len(voc)

            if use_pickle:
                pickle.dump(voc, open(pkl_fpath, "wb"))
                print "Pickled voc:", pkl_fpath

        print "Loaded %d words from: %s" % (len(voc), pkl_fpath
                                            if use_pickle else freq_fpath)

        self._freq = voc