コード例 #1
0
    def preprocess(self):
        """Turn the pickled nx trees of every dataset split into DGL trees.

        For each split (train/validation/test), one pickle per tree type is
        loaded, the i-th trees of all types are grouped, transformed, labelled
        from the sentiment map, and written to '<split>.pkl' in the output
        directory. Trees whose root has no label are dropped (a missing label
        means neutral). Finally, stats and word embeddings are saved.
        """
        cfg = self.config
        in_dir = cfg.input_dir
        out_dir = cfg.output_dir
        tree_types = cfg.preprocessor_config.tree_type
        output_type = self.__get_output_type()

        # one pickle file per tree type for each dataset split
        split_files = {
            split: ['{}_{}.pkl'.format(split, tt) for tt in tree_types]
            for split in ('train', 'validation', 'test')
        }

        # load sentiment map
        eprint('Loading sentiment map.')
        sentiment_map = from_pkl_file(
            os.path.join(in_dir, 'sentiment_map.pkl'))

        # preprocess the trees of each split
        for split, files in split_files.items():
            loaded = [from_pkl_file(os.path.join(in_dir, f)) for f in files]
            # group the i-th tree of every tree type into one tuple
            grouped = list(zip(*loaded))

            self.__init_stats__(split)

            dgl_trees = []
            for tree_group in tqdm(grouped,
                                   desc='Preprocessing {}'.format(split)):
                t = self.tree_transformer.transform(*tree_group)
                # keep the tree only if the root carries a label
                # (a missing label means neutral)
                if self.__assign_node_features__(t, sentiment_map,
                                                 output_type):
                    self.__update_stats__(split, t)
                    dgl_trees.append(self.__nx_to_dgl__(t))

            self.__print_stats__(split)
            to_pkl_file(dgl_trees,
                        os.path.join(out_dir, '{}.pkl'.format(split)))

        # save all stats
        self.__save_stats__()

        # compute and save word embeddings
        self.__save_word_embeddings__()
コード例 #2
0
    def preprocess(self):
        """Turn the pickled parsed trees of every dataset split into DGL data.

        For each split (train/validation/test), one pickle per tree type is
        loaded; the i-th entries of all types are merged into a single record
        whose labels are taken from the first type (labels are shared across
        types). Each record is transformed, featurised, converted to DGL and
        stored as (tree, coarse_label, fine_label) tuples in '<split>.pkl'.
        Finally, stats and word embeddings are saved.
        """
        cfg = self.config
        in_dir = cfg.input_dir
        out_dir = cfg.output_dir
        tree_types = cfg.preprocessor_config.tree_type

        # one pickle file per tree type for each dataset split
        split_files = {
            split: ['{}_{}.pkl'.format(split, tt) for tt in tree_types]
            for split in ('train', 'validation', 'test')
        }

        # preprocess the trees of each split
        for split, files in split_files.items():
            per_type = [from_pkl_file(os.path.join(in_dir, f))
                        for f in files]

            # merge the i-th tree of every type; labels are identical across
            # types, so read them from the first one
            merged = [{
                'tree': tuple(v[i]['tree'] for v in per_type),
                'coarse_label': per_type[0][i]['coarse_label'],
                'fine_label': per_type[0][i]['fine_label'],
            } for i in range(len(per_type[0]))]

            self.__init_stats__(split)

            out_data = []
            for record in tqdm(merged,
                               desc='Preprocessing {}'.format(split)):
                t = self.tree_transformer.transform(*record['tree'])

                self.__assign_node_features__(t)

                self.__update_stats__(split, t)

                out_data.append((self.__nx_to_dgl__(t),
                                 record['coarse_label'],
                                 record['fine_label']))

            self.__print_stats__(split)
            to_pkl_file(out_data,
                        os.path.join(out_dir, '{}.pkl'.format(split)))

        # save all stats
        self.__save_stats__()
        self.__save_word_embeddings__()
コード例 #3
0
 def __new__(cls, *args, **kwargs):
     """Factory: build an ``nn.Embedding`` according to ``embedding_type``.

     Expected keyword arguments:
         embedding_type: one of 'pretrained', 'one_hot', 'random'.
         pretrained_embs: (pretrained) path of a pickled numpy array.
         freeze: (pretrained) whether the weights stay fixed.
         num_embs: (one_hot, random) vocabulary size.
         emb_size: (random) embedding dimension.

     Returns an ``nn.Embedding`` instance (never an instance of ``cls``).

     Raises:
         ValueError: if ``embedding_type`` is not one of the values above.
     """
     embedding_type = kwargs['embedding_type']
     if embedding_type == 'pretrained':
         # load the pickled numpy matrix and wrap it as fixed/trainable weights
         np_array = from_pkl_file(kwargs['pretrained_embs'])
         return nn.Embedding.from_pretrained(th.tensor(np_array,
                                                       dtype=th.float),
                                             freeze=kwargs['freeze'])
     elif embedding_type == 'one_hot':
         # identity matrix: each id maps to its one-hot vector, never updated
         num_embs = kwargs['num_embs']
         return nn.Embedding.from_pretrained(th.eye(num_embs, num_embs),
                                             freeze=True)
     elif embedding_type == 'random':
         # randomly initialised, trainable embedding table
         num_embs = kwargs['num_embs']
         emb_size = kwargs['emb_size']
         return nn.Embedding(num_embs, emb_size)
     else:
         # fixed typo in the user-facing message ('unkown' -> 'unknown')
         raise ValueError('Embedding type is unknown!')
コード例 #4
0
    def __init__(self, config):
        """Set up the preprocessor: tree transformer, tree types, vocabulary.

        Normalises ``config.preprocessor_config.tree_type`` in place to a
        sorted list (a single value is wrapped in a one-element list) and
        loads the word vocabulary from ``config.input_dir``.
        """
        transformer = create_object_from_config(
            config.preprocessor_config.tree_transformer)
        super(NlpParsedTreesPreprocessor,
              self).__init__(config, transformer.CREATE_TYPES)

        self.tree_transformer = transformer

        # normalise tree_type to a sorted list, in place on the config
        tt = config.preprocessor_config.tree_type
        config.preprocessor_config.tree_type = (
            sorted(tt) if isinstance(tt, list) else [tt])

        # load vocabulary
        eprint('Loading word vocabulary.')
        self.words_vocab = from_pkl_file(
            os.path.join(config.input_dir, 'words_vocab.pkl'))
コード例 #5
0
                        })
            to_pkl_file(parsed_trees, out_file)

            # save words vocab file
            eprint('Store word vocabulary.')
            words_vocab_file = os.path.join(output_dir, 'words_vocab.pkl')
            to_pkl_file(parser.words_vocab, words_vocab_file)

            # store label vocabs
            eprint('Store label vocabulary.')
            to_json_file(coarse_label_vocab,
                         os.path.join(output_dir, 'coarse_vocab.json'))
            to_json_file(fine_label_vocab,
                         os.path.join(output_dir, 'fine_vocab.json'))
        else:
            parsed_trees = from_pkl_file(out_file)

        all_parsed_trees[k_t] = parsed_trees

    # compute validation split
    # rand_perm_idx = np.random.permutation(len(parsed_trees['dep']))
    # idx_val = rand_perm_idx[:n_val]
    # idx_train = rand_perm_idx[n_val:]
    n_trees = len(all_parsed_trees['train']['dep'])
    n_val = 500
    labels = [d['coarse_label'] for d in all_parsed_trees['train']['dep']]
    idx_train, idx_val = train_test_split(np.arange(n_trees),
                                          test_size=n_val / n_trees,
                                          stratify=labels)

    trees_to_write = {