def preprocess(self):
    """Turn raw parsed trees into labelled DGL graphs and pickle them per split.

    For each split (train/validation/test) the trees of every configured
    tree type are loaded, zipped together per sentence, transformed, and
    kept only when the root carries a sentiment label.
    """
    cfg = self.config
    in_dir = cfg.input_dir
    out_dir = cfg.output_dir
    tree_types = cfg.preprocessor_config.tree_type
    output_type = self.__get_output_type()

    # one pickle per tree type for each dataset split
    file_names = {tag: ['{}_{}.pkl'.format(tag, t) for t in tree_types]
                  for tag in ('train', 'validation', 'test')}
    sentiment_map_file = 'sentiment_map.pkl'

    # load sentiment map
    eprint('Loading sentiment map.')
    sentiment_map = from_pkl_file(os.path.join(in_dir, sentiment_map_file))

    # preprocessing trees
    for tag_name, f_list in file_names.items():
        loaded = [from_pkl_file(os.path.join(in_dir, f)) for f in f_list]
        # group the parallel per-type lists into one tuple per sentence
        grouped = list(zip(*loaded))

        self.__init_stats__(tag_name)
        tree_list = []
        for group in tqdm(grouped, desc='Preprocessing {}'.format(tag_name)):
            t = self.tree_transformer.transform(*group)
            # assign only if there is a label on the root (a missing label means neutral)
            if self.__assign_node_features__(t, sentiment_map, output_type):
                self.__update_stats__(tag_name, t)
                tree_list.append(self.__nx_to_dgl__(t))
        self.__print_stats__(tag_name)
        to_pkl_file(tree_list, os.path.join(out_dir, '{}.pkl'.format(tag_name)))

    # save all stats
    self.__save_stats__()
    # compute and save word embeddings
    self.__save_word_embeddings__()
def preprocess(self):
    """Turn raw parsed trees into (dgl_graph, coarse_label, fine_label) triples per split.

    The per-tree-type pickles are merged sentence-wise; labels are taken
    from the first tree type (they are identical across types).
    """
    cfg = self.config
    in_dir = cfg.input_dir
    out_dir = cfg.output_dir
    tree_types = cfg.preprocessor_config.tree_type

    # one pickle per tree type for each dataset split
    file_names = {tag: ['{}_{}.pkl'.format(tag, t) for t in tree_types]
                  for tag in ('train', 'validation', 'test')}

    # preprocessing trees
    for tag_name, f_list in file_names.items():
        loaded = [from_pkl_file(os.path.join(in_dir, f)) for f in f_list]
        # merge the parallel lists: one record per sentence, bundling all tree types
        parsed_trees = [{'tree': tuple(d['tree'] for d in entry),
                         'coarse_label': entry[0]['coarse_label'],
                         'fine_label': entry[0]['fine_label']}
                        for entry in zip(*loaded)]

        self.__init_stats__(tag_name)
        data_list = []
        for x in tqdm(parsed_trees, desc='Preprocessing {}'.format(tag_name)):
            t = self.tree_transformer.transform(*x['tree'])
            self.__assign_node_features__(t)
            self.__update_stats__(tag_name, t)
            data_list.append((self.__nx_to_dgl__(t),
                              x['coarse_label'],
                              x['fine_label']))
        self.__print_stats__(tag_name)
        to_pkl_file(data_list, os.path.join(out_dir, '{}.pkl'.format(tag_name)))

    # save all stats
    self.__save_stats__()
    self.__save_word_embeddings__()
def __new__(cls, *args, **kwargs):
    """Factory returning an ``nn.Embedding`` configured by ``kwargs['embedding_type']``.

    Supported types:
        - ``'pretrained'``: loads a numpy array pickled at
          ``kwargs['pretrained_embs']``; ``kwargs['freeze']`` controls
          whether the weights stay fixed during training.
        - ``'one_hot'``: identity matrix of size ``kwargs['num_embs']``,
          always frozen.
        - ``'random'``: trainable ``nn.Embedding(num_embs, emb_size)``.

    Raises:
        ValueError: if ``embedding_type`` is none of the above.
    """
    embedding_type = kwargs['embedding_type']
    if embedding_type == 'pretrained':
        np_array = from_pkl_file(kwargs['pretrained_embs'])
        return nn.Embedding.from_pretrained(
            th.tensor(np_array, dtype=th.float),
            freeze=kwargs['freeze'])
    elif embedding_type == 'one_hot':
        num_embs = kwargs['num_embs']
        return nn.Embedding.from_pretrained(th.eye(num_embs, num_embs),
                                            freeze=True)
    elif embedding_type == 'random':
        num_embs = kwargs['num_embs']
        emb_size = kwargs['emb_size']
        return nn.Embedding(num_embs, emb_size)
    else:
        # fixed typo in the original message ('unkown' -> 'unknown')
        raise ValueError('Embedding type is unknown!')
def __init__(self, config):
    """Create the tree transformer and load the word vocabulary.

    Also normalises ``config.preprocessor_config.tree_type`` in place to a
    sorted list (a lone tree type is wrapped into a one-element list).
    """
    tree_transformer = create_object_from_config(
        config.preprocessor_config.tree_transformer)
    super(NlpParsedTreesPreprocessor, self).__init__(
        config, tree_transformer.CREATE_TYPES)
    self.tree_transformer = tree_transformer

    tree_type = config.preprocessor_config.tree_type
    if isinstance(tree_type, list):
        config.preprocessor_config.tree_type = sorted(tree_type)
    else:
        config.preprocessor_config.tree_type = [tree_type]

    # load vocabulary
    eprint('Loading word vocabulary.')
    self.words_vocab = from_pkl_file(
        os.path.join(config.input_dir, 'words_vocab.pkl'))
}) to_pkl_file(parsed_trees, out_file) # save words vocab file eprint('Store word vocabulary.') words_vocab_file = os.path.join(output_dir, 'words_vocab.pkl') to_pkl_file(parser.words_vocab, words_vocab_file) # strore label vocabs eprint('Store label vocabulary.') to_json_file(coarse_label_vocab, os.path.join(output_dir, 'coarse_vocab.json')) to_json_file(fine_label_vocab, os.path.join(output_dir, 'fine_vocab.json')) else: parsed_trees = from_pkl_file(out_file) all_parsed_trees[k_t] = parsed_trees # compute validation split # rand_perm_idx = np.random.permutation(len(parsed_trees['dep'])) # idx_val = rand_perm_idx[:n_val] # idx_train = rand_perm_idx[n_val:] n_trees = len(all_parsed_trees['train']['dep']) n_val = 500 labels = [d['coarse_label'] for d in all_parsed_trees['train']['dep']] idx_train, idx_val = train_test_split(np.arange(n_trees), test_size=n_val / n_trees, stratify=labels) trees_to_write = {