import codecs
import os

# NOTE: the import path for DataUtility is an assumption; adjust it to match
# where the class actually lives in this project.
from data_utility import DataUtility


def train_in_ids_lm_raw(train_data, vocab_path, out_dir):
    """Convert raw text (one sentence per line) into word-id LM training data.

    Named distinctly from train_in_ids_lm below, which expects pre-tokenized
    tab-separated input; otherwise the second definition would shadow this one.
    """
    if not os.path.exists(vocab_path):
        os.mkdir(vocab_path)
    vocab_file_in_words = os.path.join(vocab_path, "vocab_in_words")
    vocab_file_out = os.path.join(vocab_path, "vocab_out")
    data_ut = DataUtility(vocab_file_in_words=vocab_file_in_words,
                          vocab_file_out=vocab_file_out)
    with codecs.open(train_data, "r") as f:
        with codecs.open(os.path.join(vocab_path, out_dir), "w") as f1:
            for line in f:
                # Detach punctuation from the adjoining word, then tokenize
                # on whitespace.
                words = line.strip()
                words = words.replace('.', ' .')
                words = words.replace(',', ' ,')
                words = words.replace("'", "' ")
                words = words.replace('"', '" ')
                words = words.split()
                words_ids = data_ut.words2ids(words)
                words_ids = ' '.join(str(word_id) for word_id in words_ids)
                # For LM training the input and target sequences are identical.
                f1.write(words_ids + '#' + words_ids + '\n')
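# Illustration of the transformation above (a sketch; the ids are hypothetical
# values that words2ids might return for these tokens):
#   "hello, don't."  ->  ["hello", ",", "don'", "t", "."]
#                    ->  "12 3 87 45 4#12 3 87 45 4"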
def train_in_ids_lm(train_data, vocab_path):
    """Convert pre-tokenized, tab-separated lines into word-id LM training data."""
    if not os.path.exists(vocab_path):
        os.mkdir(vocab_path)
    vocab_file_in_words = os.path.join(vocab_path, "vocab_in_words")
    vocab_file_in_letters = os.path.join(vocab_path, "vocab_in_letters")
    vocab_file_out = os.path.join(vocab_path, "vocab_out")
    data_ut = DataUtility(vocab_file_in_words=vocab_file_in_words,
                          vocab_file_in_letters=vocab_file_in_letters,
                          vocab_file_out=vocab_file_out)
    with codecs.open(train_data, "r") as f:
        with codecs.open(os.path.join(vocab_path, "train_in_ids_lm"), "w") as f1:
            for line in f:
                # Each line is "<prefix>\tword1#word2#..."; keep only the
                # '#'-separated word field.
                _, words = line.strip().split('\t')
                words = words.split('#')
                words_ids = data_ut.words2ids(words)
                words_ids = ' '.join(str(word_id) for word_id in words_ids)
                # For LM training the input and target sequences are identical.
                f1.write(words_ids + '#' + words_ids + '\n')
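if __name__ == "__main__":
    # A minimal usage sketch; the file and directory names are hypothetical
    # placeholders, and the vocab files under "vocab" must already exist for
    # DataUtility to load.
    train_in_ids_lm_raw("raw_corpus.txt", "vocab", "train_in_ids_lm_raw")
    train_in_ids_lm("train_data.txt", "vocab")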