# Load the binary word2vec model together with its vocabulary file.
model = Word2VecModel.load_model(
    args.word2vec,
    fvocab=args.word2vec_vocab,
    binary=True,
)

# Directory containing this script; bundled resource files are resolved
# relative to it.
cur_dir_path = dirname(realpath(__file__))

# A preposition vocabulary given on the command line takes precedence over
# the bundled list.
prep_vocab_path = (
    args.prep_vocab
    if args.prep_vocab
    else join(cur_dir_path, consts.PREP_VOCAB_LIST_FILE)
)
prep_vocab_list = read_vocab_list(prep_vocab_path)

# Predicate counts are only loaded when subsampling is requested; otherwise
# None is handed to get_index() below.
if not args.subsampling:
    pred_count_dict = None
else:
    with open(join(cur_dir_path, consts.PRED_VOCAB_COUNT_FILE)) as fin:
        pred_count_dict = read_counter(fin)

# Keyword arguments shared by every script processed below.
build_options = dict(
    prep_vocab_list=prep_vocab_list,
    use_lemma=args.use_lemma,
    filter_stop_events=False,
)
index_options = dict(
    include_type=True,
    use_unk=True,
    pred_count_dict=pred_count_dict,
)

for input_f in input_files:
    # Each input file is a bz2-compressed text dump of a script corpus.
    with BZ2File(input_f, 'r') as fin:
        corpus_text = fin.read()
        script_corpus = ScriptCorpus.from_text(corpus_text)

        for script in script_corpus.scripts:
            rich_script = RichScript.build(script, **build_options)
            # Index the rich script against the word2vec model before
            # requesting the pair-tuning inputs.
            rich_script.get_index(model, **index_options)
            pair_tuning_inputs = rich_script.get_pair_tuning_input_list(
                neg_sample_type=args.neg_sample_type)
default=5, help='minimum count to keep the word') args = parser.parse_args() input_dirs = sorted([ join(args.input_path, f) for f in listdir(args.input_path) if isdir(join(args.input_path, f)) ]) all_vocab = defaultdict(Counter) for input_dir in input_dirs: print 'Reading vocabulary count from {}'.format(input_dir) with BZ2File(join(input_dir, 'argument.bz2'), 'r') as fin: all_vocab['argument'] += read_counter(fin) prune_counter(all_vocab['argument'], args.min_count) with BZ2File(join(input_dir, 'name_entity.bz2'), 'r') as fin: all_vocab['name_entity'] += read_counter(fin) prune_counter(all_vocab['name_entity'], args.min_count) with BZ2File(join(input_dir, 'name_entity_tag.bz2'), 'r') as fin: all_vocab['name_entity_tag'] += read_counter(fin) # prune_counter(all_vocab['name_entity_tag'], args.min_count) with BZ2File(join(input_dir, 'predicate.bz2'), 'r') as fin: all_vocab['predicate'] += read_counter(fin) prune_counter(all_vocab['predicate'], args.min_count) with BZ2File(join(input_dir, 'preposition.bz2'), 'r') as fin: all_vocab['preposition'] += read_counter(fin) prune_counter(all_vocab['preposition'], args.min_count) for key in all_vocab: