def __init__(self, args):
    """Build tag/word/char vocabularies from UD .conllu treebank files.

    Walks the top level of each language's ``UD_<LanguageName>`` directory,
    collects ``(path, lang)`` pairs for every ``.conllu`` file, reads them
    via ``self.read_files``, appends special tokens, and builds the
    id<->symbol reverse mappings used elsewhere.

    Args:
        args: parsed CLI namespace; must provide ``langs`` (slash-separated
            language codes) and ``treebank_path`` (UD treebank root).
    """
    self.args = args
    self.lang_to_code, self.code_to_lang = utils.get_lang_code_dicts()

    # Collect (path, lang) pairs for the .conllu files of each language.
    paths_to_read = []
    langs = args.langs.split("/")
    for lang in langs:
        # os.path.join instead of manual "+"-concatenation, which produced
        # a double separator ("...//") in the original.
        input_folder = os.path.join(
            args.treebank_path, "UD_" + self.code_to_lang[lang])
        # Renamed loop targets: the original shadowed the builtin `dir`
        # and reused `path` for both the walk root and the file path.
        for _root, _subdirs, filenames in os.walk(input_folder):
            filenames.sort()  # deterministic file order across platforms
            for filename in filenames:
                if filename.endswith(".conllu"):
                    conllu_path = os.path.join(input_folder, filename)
                    print("Reading vocab from ", conllu_path)
                    paths_to_read.append((conllu_path, lang))
            break  # top-level directory only; do not descend into subdirs

    self.tag_to_ids, self.word_to_id, self.char_to_id = self.read_files(
        paths_to_read)
    print("Size of vocab before: %d" % len(self.word_to_id))

    # Reserve ids for special tokens after the corpus vocabulary.
    # '<\\s>' keeps the original runtime value "<\s>" while avoiding the
    # invalid escape sequence warning raised by '<\s>'.
    self.word_to_id['<unk>'] = len(self.word_to_id)
    self.char_to_id['<unk>'] = len(self.char_to_id)
    self.word_to_id['<\\s>'] = len(self.word_to_id)
    self.char_to_id['<pad>'] = len(self.char_to_id)
    print("Size of vocab after: %d" % len(self.word_to_id))

    # NOTE(review): id 0 is used as padding even though ids 0..N are also
    # assigned to real symbols by read_files — confirm against the caller.
    self.word_padding_token = 0
    self.char_padding_token = 0

    # Per-feature reverse maps (id -> tag) and vocabulary sizes.
    self.id2tags = {}
    self.tag_vocab_sizes = {}
    self.word_freq = {}
    for key, tag2id in self.tag_to_ids.items():
        self.id2tags[key] = {v: k for k, v in tag2id.items()}
        self.tag_vocab_sizes[key] = len(tag2id)
        print("Feat: {0} Size: {1}".format(key, len(tag2id)))
        print(self.tag_to_ids[key])

    # Reverse maps for words and characters.
    self.id_to_word = {v: k for k, v in self.word_to_id.items()}
    self.id_to_char = {v: k for k, v in self.char_to_id.items()}
    self.word_vocab_size = len(self.id_to_word)
    self.char_vocab_size = len(self.id_to_char)
    print("Size of vocab after: %d" % len(self.word_to_id))
    print("Word vocab size=%d, Char Vocab size=%d" % (self.word_vocab_size,
                                                      self.char_vocab_size))
def main():
    """Read training sentences for a fixed set of source languages."""
    # Only code_to_lang is consumed below; lang_to_code is kept to mirror
    # the helper's two-value return signature.
    lang_to_code, code_to_lang = utils.get_lang_code_dicts()
    source_langs = ['ru', 'bg', 'da', 'sv', 'es', 'pt', 'uk']
    annot_sents = read_conll(source_langs, code_to_lang,
                             train_or_dev="train")
print(args) # Set seed torch.manual_seed(args.seed) # Create dictionaries for language codes, morph tags and pos tags langs = args.langs.split("/") args.model_name = args.model_type + "".join(["_" + l for l in langs]) if args.sum_word_char: args.model_name += "_wc-sum" if args.sent_attn: args.model_name += "_sent-attn" if args.tgt_size: args.model_name += "-" + str(args.tgt_size) lang_to_code, code_to_lang = utils.get_lang_code_dicts() print("Reading training data...") training_data_langwise, train_tgt_labels = utils.read_conll( args.treebank_path, langs, code_to_lang, tgt_size=args.tgt_size, train_or_dev="train") training_data = [] if args.tgt_size == 100 and args.model_type != "mono": training_data_langwise[langs[-1]] = training_data_langwise[langs[-1]] * 10 elif args.tgt_size == 1000 and args.model_type != "mono": training_data_langwise[langs[-1]] = training_data_langwise[langs[-1]]