Code example #1
# NOTE: this excerpt assumes that args (the parsed command-line options) and
# input_files (the list of input paths) are defined earlier in the script, and
# that Word2VecModel, ScriptCorpus, RichScript, read_vocab_list, read_counter
# and consts are imported from the project's own modules.
from bz2 import BZ2File
from os.path import dirname, join, realpath

# Load the pre-trained word2vec embeddings (binary format) together with the
# accompanying vocabulary file.
model = Word2VecModel.load_model(args.word2vec,
                                 fvocab=args.word2vec_vocab,
                                 binary=True)

cur_dir_path = dirname(realpath(__file__))

# Use the preposition vocabulary supplied on the command line, or fall back to
# the default list shipped alongside this script.
if args.prep_vocab:
    prep_vocab_list = read_vocab_list(args.prep_vocab)
else:
    prep_vocab_list = read_vocab_list(
        join(cur_dir_path, consts.PREP_VOCAB_LIST_FILE))

# When subsampling is enabled, load the predicate counts used to downsample
# frequent predicates.
pred_count_dict = None
if args.subsampling:
    with open(join(cur_dir_path, consts.PRED_VOCAB_COUNT_FILE)) as fin:
        pred_count_dict = read_counter(fin)

# Each input file is a bzip2-compressed ScriptCorpus; convert every script in
# it to a RichScript, map it to word2vec indices, and build the pair-tuning
# training inputs.
for input_f in input_files:
    with BZ2File(input_f, 'r') as fin:
        script_corpus = ScriptCorpus.from_text(fin.read())
        for script in script_corpus.scripts:
            rich_script = RichScript.build(script,
                                           prep_vocab_list=prep_vocab_list,
                                           use_lemma=args.use_lemma,
                                           filter_stop_events=False)
            # Map every component of the script to its index in the word2vec
            # vocabulary, backing off to UNK for out-of-vocabulary items.
            rich_script.get_index(model,
                                  include_type=True,
                                  use_unk=True,
                                  pred_count_dict=pred_count_dict)
            pair_tuning_inputs = rich_script.get_pair_tuning_input_list(
                neg_sample_type=args.neg_sample_type)
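Neither excerpt shows read_vocab_list. A minimal sketch, assuming the
vocabulary file simply lists one token per line (an assumption, not the
project's actual implementation):

def read_vocab_list(vocab_file):
    # Assumed helper: collect one token per line, skipping blank lines.
    with open(vocab_file, 'r') as fin:
        return [line.strip() for line in fin if line.strip()]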
Code example #2
# NOTE: the beginning of the argparse setup is not shown in this excerpt; the
# option below presumably corresponds to args.min_count used further down, and
# the parser object itself is created earlier in the script. read_counter and
# prune_counter come from the project's own modules.
from bz2 import BZ2File
from collections import Counter, defaultdict
from os import listdir
from os.path import isdir, join

parser.add_argument('--min_count', type=int,
                    default=5,
                    help='minimum count to keep the word')

args = parser.parse_args()

# Collect every immediate subdirectory of the input path, in sorted order.
input_dirs = sorted([
    join(args.input_path, f) for f in listdir(args.input_path)
    if isdir(join(args.input_path, f))
])

# One Counter per vocabulary type (argument, name_entity, predicate, ...).
all_vocab = defaultdict(Counter)

# Merge the per-directory vocabulary counts, pruning rare entries after each
# merge so the combined counters stay small.
for input_dir in input_dirs:
    print('Reading vocabulary count from {}'.format(input_dir))
    with BZ2File(join(input_dir, 'argument.bz2'), 'r') as fin:
        all_vocab['argument'] += read_counter(fin)
        prune_counter(all_vocab['argument'], args.min_count)
    with BZ2File(join(input_dir, 'name_entity.bz2'), 'r') as fin:
        all_vocab['name_entity'] += read_counter(fin)
        prune_counter(all_vocab['name_entity'], args.min_count)
    with BZ2File(join(input_dir, 'name_entity_tag.bz2'), 'r') as fin:
        all_vocab['name_entity_tag'] += read_counter(fin)
        # prune_counter(all_vocab['name_entity_tag'], args.min_count)
    with BZ2File(join(input_dir, 'predicate.bz2'), 'r') as fin:
        all_vocab['predicate'] += read_counter(fin)
        prune_counter(all_vocab['predicate'], args.min_count)
    with BZ2File(join(input_dir, 'preposition.bz2'), 'r') as fin:
        all_vocab['preposition'] += read_counter(fin)
        prune_counter(all_vocab['preposition'], args.min_count)

for key in all_vocab:  # loop body not included in this excerpt
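Both examples call read_counter, and the second also calls prune_counter;
neither helper appears in the excerpts. A minimal sketch, assuming each
counter file stores one tab-separated "word<TAB>count" pair per line (the
format is an assumption, not the project's actual code):

from collections import Counter

def read_counter(fin):
    # Assumed helper: parse "word<TAB>count" lines into a Counter.
    counter = Counter()
    for line in fin:
        line = line.strip()
        if not line:
            continue
        word, count = line.rsplit('\t', 1)
        counter[word] += int(count)
    return counter

def prune_counter(counter, min_count):
    # Assumed helper: drop entries whose count falls below min_count,
    # mutating the counter in place (matching how it is called above).
    for word in [w for w, c in counter.items() if c < min_count]:
        del counter[word]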