Ejemplo n.º 1
0
    def __init__(self,
                 input_wvectors,
                 input_word2id,
                 input_id2word,
                 input_vocabulary,
                 pair_file_path,
                 kn_file_name,
                 output_file_name,
                 topn = 20):
        """Load the vocabulary files, merge all pair pickles and save scores.

        The constructor runs the whole pipeline:

        1. read the word->id map, the id->word map and the vocabulary list;
        2. build a ``KNeighbor`` object and pickle it to ``kn_file_name``;
        3. merge every pickle found directly in ``pair_file_path`` into one
           mapping, adding the values of keys that occur in several files;
        4. pass the merged mapping to ``self.select_new`` and pickle the
           result to ``output_file_name``.

        Args:
            input_wvectors: forwarded unchanged to ``KNeighbor``.
            input_word2id: UTF-8 text file, one ``<word> <id>`` per line
                (whitespace separated).
            input_id2word: UTF-8 text file, one ``<id> <word>`` per line
                (whitespace separated).
            input_vocabulary: UTF-8 text file with one integer per line.
            pair_file_path: directory containing the pair pickles.
            kn_file_name: output path for the pickled ``KNeighbor`` object.
            output_file_name: output path for the pickled scores.
            topn: stored as ``self.topn`` and forwarded to ``select_new``.
        """
        word2id = dict()
        with codecs.open(input_word2id, 'r', encoding='utf-8') as f:
            for line in f:
                # split() without arguments already ignores surrounding
                # whitespace, so one split per line is enough.
                fields = line.split()
                word2id[fields[0]] = int(fields[1])
        id2word = dict()
        with codecs.open(input_id2word, 'r', encoding='utf-8') as f:
            for line in f:
                fields = line.split()
                id2word[int(fields[0])] = fields[1]
        vocabulary = []
        with codecs.open(input_vocabulary, 'r', encoding='utf-8') as f:
            for line in f:
                vocabulary.append(int(line.strip()))

        self.topn = topn
        kneighbor = KNeighbor(input_wvectors, vocabulary, word2id, id2word)
        dump_to_pkl(kneighbor, kn_file_name)

        logging_set('NSselect.log')
        pairs = dict()
        for name in tqdm(os.listdir(pair_file_path)):
            path = os.path.join(pair_file_path, name)
            # Test the joined path: os.listdir() returns bare names, and
            # checking those would resolve them against the current working
            # directory instead of pair_file_path.
            if os.path.isdir(path):
                continue
            # NOTE(review): load_from_pkl presumably unpickles the file;
            # only point this at trusted data.
            pair = load_from_pkl(path)
            logging.info("pair size: %d", len(pair))
            if not pairs:
                # Nothing merged yet: adopt the loaded mapping as-is rather
                # than copying it entry by entry.
                pairs = pair
            else:
                for key, count in pair.items():
                    if key in pairs:
                        pairs[key] += count
                    else:
                        pairs[key] = count
            logging.info("current total pair size: %d", len(pairs))
        logging.info("start calculate score")
        score = self.select_new(pairs, kneighbor, self.topn)
        logging.info("start saving")
        dump_to_pkl(score, output_file_name)
Ejemplo n.º 2
0
if __name__ == "__main__":
    # Command-line entry point: parse the options, set up logging and run
    # the evaluation on the given embedding file.
    parser = argparse.ArgumentParser(
        description="Philly arguments parser",
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument('emb_file_name', type=str)
    # Optional string flags and their defaults, registered in this order.
    for flag, default in (
            ('--similarity_test_paths', 'data/240.txt|data/297.txt'),
            ('--synset_paths', 'data/nsem3-adjusted.txt'),
            ('--analogy_test_paths', 'data/analogy.txt'),
            ('--log_path', 'evaluation.log'),
    ):
        parser.add_argument(flag, type=str, default=default)

    # Unknown extra arguments are tolerated and discarded.
    args, _ = parser.parse_known_args()
    logging_set(args.log_path)

    # The literal string 'None' on the command line means "no path": turn it
    # into a real None before handing the value to evaluation().
    for dest in ('similarity_test_paths', 'synset_paths',
                 'analogy_test_paths'):
        if getattr(args, dest) == 'None':
            setattr(args, dest, None)

    best_scores, save_flag = evaluation(
        args.emb_file_name,
        args.similarity_test_paths,
        args.synset_paths,
        args.analogy_test_paths,
    )
Ejemplo n.º 3
0
    def select(self, pairs, kneighbor):
        """Score every neighbour of every key by context substitution.

        For a key ``keyn`` the "contexts" are all keys of ``pairs`` whose
        first element equals ``keyn``. For each neighbour ``value`` of
        ``keyn``, the first element of every context is replaced by
        ``value``; the context contributes
        ``pairs[replaced] / pairs[context]`` if the replaced key exists in
        ``pairs`` and 0 otherwise. The neighbour's score is the mean
        contribution over all contexts.

        Args:
            pairs: mapping whose keys are sequences (looked up as tuples)
                and whose values are numbers that can be divided.
            kneighbor: mapping from a key to an iterable of neighbours.

        Returns:
            dict mapping each key of ``kneighbor`` to a list of scores, one
            per neighbour and in the same order. A key with no context in
            ``pairs`` gets ``0.0`` for every neighbour instead of raising
            ``ZeroDivisionError``.
        """
        score = dict()
        for keyn in tqdm(kneighbor.keys()):
            # The contexts depend only on keyn, so gather them once instead
            # of rescanning all of pairs for every neighbour.
            contexts = [(tuple(keyp[1:]), pairs[keyp])
                        for keyp in pairs.keys() if keyp[0] == keyn]
            score[keyn] = []
            for value in kneighbor[keyn]:
                if not contexts:
                    # No context to average over: define the score as 0.
                    score[keyn].append(0.0)
                    continue
                total = 0
                for tail, count in contexts:
                    replace = (value,) + tail
                    if replace in pairs:
                        total += pairs[replace] / count
                score[keyn].append(total / len(contexts))
        return score


if __name__ == '__main__':
    # Command-line entry point: positional arguments 1..7 are mapped, in
    # order, onto the NSselect keyword arguments listed below.
    logging_set('NSselect.log')
    ns = NSselect(**{
        keyword: sys.argv[position]
        for position, keyword in enumerate((
            'input_wvectors',
            'input_word2id',
            'input_id2word',
            'input_vocabulary',
            'pair_file_path',
            'kn_file_name',
            'output_file_name',
        ), start=1)
    })
Ejemplo n.º 4
0
import os
from utils import load_from_pkl, dump_to_pkl, logging_set
from tqdm import tqdm
import logging
import gc

# Merge every pair pickle found in data/pair into a single mapping (values of
# keys present in several files are added together) and write the result to
# data/pairs.pkl.
logging_set('merge_pair.log')

path = 'data/pair'
# NOTE(review): the first directory entry is dropped, but os.listdir() returns
# entries in arbitrary order, so which entry is skipped is not deterministic.
# Presumably this was meant to skip one specific file; confirm and filter by
# name instead.
files = os.listdir(path)[1:]
pairs = dict()
for idx, file in enumerate(tqdm(files)):
    if idx % 20 == 0:
        gc.collect()  # manually trigger garbage collection

    pair_file_path = os.path.join(path, file)
    # Test the joined path: os.listdir() returns bare names, and checking
    # those would resolve them against the current working directory
    # instead of `path`.
    if os.path.isdir(pair_file_path):
        continue
    pair = load_from_pkl(pair_file_path)
    logging.info("pair size: %d", len(pair))
    if not pairs:
        # Nothing merged yet: adopt the loaded mapping as-is rather than
        # copying it entry by entry.
        pairs = pair
    else:
        for key, count in pair.items():
            if key in pairs:
                pairs[key] += count
            else:
                pairs[key] = count
    logging.info("current total pair size: %d", len(pairs))

output_file_name = 'data/pairs.pkl'
dump_to_pkl(pairs, output_file_name)