# Reader helper (graph-data variant with BERT support). Relies on names from the
# enclosing script: logger, the alphabets, use_gpu, args, and conllx_data.
def _read_one(path, is_train=False, max_size=None):
    lang_id = guess_language_id(path)
    logger.info("Reading %s: guessed language id is %s." % (path, lang_id))
    one_data = conllx_data.read_data_to_variable(
        path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet,
        use_gpu=use_gpu, volatile=(not is_train), use_bert=args.use_bert,
        symbolic_root=True, lang_id=lang_id, max_size=max_size)
    return one_data
# Reader helper (stacked-data variant for the stack-pointer parser).
def _read_one(path, is_train):
    lang_id = guess_language_id(path)
    logger.info("Reading %s: guessed language id is %s." % (path, lang_id))
    one_data = conllx_stacked_data.read_stacked_data_to_variable(
        path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet,
        use_gpu=use_gpu, volatile=(not is_train), prior_order=prior_order,
        lang_id=lang_id)
    return one_data
# Reader helper (graph-data variant with a length threshold: training sentences
# longer than args.train_len_thresh are filtered out).
def _read_one(path, is_train):
    lang_id = guess_language_id(path)
    logger.info("Reading %s: guessed language id is %s." % (path, lang_id))
    one_data = conllx_data.read_data_to_variable(
        path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet,
        use_gpu=use_gpu, volatile=(not is_train), symbolic_root=True,
        lang_id=lang_id,
        len_thresh=(args.train_len_thresh if is_train else 100000))
    return one_data
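# Illustrative call sites (not from the original scripts; names such as
# args.train_path are assumptions): each _read_one variant above is defined
# inside its train/parse script and is typically invoked as
#   data_train = _read_one(args.train_path, is_train=True)
#   data_dev = _read_one(args.dev_path, is_train=False)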
# Extend a word alphabet with test-set words that have vectors in an extra
# embedding file; returns the collected vectors so the caller can grow the
# model's embedding matrix accordingly.
def augment_with_extra_embedding(the_alphabet, extra_embed_file, extra_embed_src_file, test_file, logger):
    extra_embeds_arr = []
    if extra_embed_file is not None:
        # reopen the vocab
        the_alphabet.open()
        # read the embed
        extra_word_dict, _ = load_embedding_dict('word2vec', extra_embed_file)
        if extra_embed_src_file is not None:
            src_extra_word_dict, _ = load_embedding_dict('word2vec', extra_embed_src_file)
        lang_id = guess_language_id(test_file)
        for one_sent in iter_file(test_file):
            for w in one_sent["word"]:
                # words already marked "!en_..." are source-language (English) tokens
                already_spec = w.startswith("!en_")
                if already_spec:
                    normed_word = w
                else:
                    normed_word = DIGIT_RE.sub(b"0", w)
                    normed_word = lang_specific_word(normed_word, lang_id=lang_id)
                # skip words already in the vocab
                if normed_word in the_alphabet.instance2index:
                    continue
                # TODO: assume english is the source for run-translate
                if already_spec:
                    w = w[4:]  # strip the "!en_" prefix before the lookup
                    check_dict = src_extra_word_dict
                else:
                    check_dict = extra_word_dict
                # look up the word, falling back to its lowercased form
                if w in check_dict:
                    new_embed_arr = check_dict[w]
                elif w.lower() in check_dict:
                    new_embed_arr = check_dict[w.lower()]
                else:
                    new_embed_arr = None
                if new_embed_arr is not None:
                    extra_embeds_arr.append(new_embed_arr)
                    the_alphabet.add(normed_word)
        # close the vocab
        the_alphabet.close()
    logger.info("Augmented the vocab with %s new words; vocab size is now %s."
                % (len(extra_embeds_arr), the_alphabet.size()))
    return extra_embeds_arr
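# Hedged usage sketch (not part of the original code): the vectors returned by
# augment_with_extra_embedding are typically appended as new rows of the
# network's word-embedding matrix so the freshly added vocab entries resolve at
# test time. The attribute name `word_embed` is an assumption for illustration.
import numpy as np
import torch
import torch.nn as nn

def _augment_network_embed_sketch(network, extra_embeds_arr):
    # Nothing to do if no new words were collected.
    if len(extra_embeds_arr) == 0:
        return
    extra = torch.from_numpy(np.stack(extra_embeds_arr)).float()
    old_weight = network.word_embed.weight.data  # assumed attribute name
    new_weight = torch.cat([old_weight, extra.to(old_weight.device)], dim=0)
    # Rebuild the embedding layer with the enlarged vocabulary.
    network.word_embed = nn.Embedding(new_weight.size(0), new_weight.size(1))
    network.word_embed.weight.data.copy_(new_weight)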