def build(self, training_data_path, file_columns, input_types, file_with_col_header,
          answer_column_name, word2vec_path=None, word_emb_dim=None, format=None,
          file_type=None, involve_all_words=None, file_format="tsv", show_progress=True,
          cpu_num_workers=-1, max_vocabulary=800000, word_frequency=3):
    """

    Args:
        training_data_path: path to the training data file.
        file_columns: {
                "word1": 0,
                "word2": 1,
                "label": 2,
                "postag_feature1": 3,
                "postag_feature2": 4
            },
        input_types: e.g.
            {
                "word": {
                    "cols": ["word1", "word2"],
                    "dim": 300
                },
                "postag": {
                    "cols": ["postag_feature1", "postag_feature2"],
                    "dim": 20
                }
            }
            or
            {
                "bpe": {
                    "cols": ["word1", "word2"],
                    "dim": 100,
                    "bpe_path": "xxx.bpe"
                }
            }
        word2vec_path: path to the pretrained word embedding file.
        word_emb_dim: dimension of the pretrained word embedding.
        involve_all_words: involve all words that show up in the pretrained embedding.
        file_format: "tsv", or "json". Note "json" means each sample is represented by a json string.

    Returns:
        word_emb_matrix if word2vec_path is given, otherwise None.

    """
    if 'bpe' in input_types:
        try:
            bpe_encoder = BPEEncoder(input_types['bpe']['bpe_path'])
        except KeyError:
            raise Exception('Please define a bpe path at the embedding layer.')
    else:
        bpe_encoder = None

    self.file_column_num = len(file_columns)
    with open(training_data_path, "r", encoding='utf-8') as f:
        progress = self.get_data_list_from_file(f, file_with_col_header)
        docs, target_docs, cnt_legal, cnt_illegal = self.build_training_multi_processor(
            progress, cpu_num_workers, file_columns, input_types, answer_column_name,
            bpe_encoder=bpe_encoder)
    logging.info("Corpus imported: %d legal lines, %d illegal lines." % (cnt_legal, cnt_illegal))

    if word2vec_path and involve_all_words is True:
        logging.info("Getting pre-trained embeddings...")
        word_emb_dict = load_embedding(word2vec_path, word_emb_dim, format, file_type,
                                       with_head=False, word_set=None)
        self.input_dicts['word'].build([list(word_emb_dict.keys())],
                                       max_vocabulary_num=len(word_emb_dict), threshold=0)

    for input_type in input_types:
        if input_type == 'word' and word2vec_path and involve_all_words is True:
            # the word dict was already built from the pretrained vocabulary above
            pass
        else:
            self.input_dicts[input_type].build(docs[input_type],
                                               max_vocabulary_num=max_vocabulary,
                                               threshold=word_frequency)
        logging.info("%d types in %s" % (self.input_dicts[input_type].cell_num(), input_type))

    if ProblemTypes[self.problem_type] == ProblemTypes.classification or \
            ProblemTypes[self.problem_type] == ProblemTypes.sequence_tagging:
        self.output_dict.build(list(target_docs.values())[0], threshold=0)
    elif ProblemTypes[self.problem_type] == ProblemTypes.regression or \
            ProblemTypes[self.problem_type] == ProblemTypes.mrc:
        pass

    if self.output_dict:
        logging.info("%d types in target" % (self.output_dict.cell_num()))
    logging.debug("Cell dict built")

    if word2vec_path:
        if not involve_all_words:
            logging.info("Getting pre-trained embeddings...")
            word_emb_dict = load_embedding(word2vec_path, word_emb_dim, format, file_type,
                                           with_head=False,
                                           word_set=self.input_dicts['word'].cell_id_map.keys())

        # peek at one vector to check the dimension of the loaded embeddings
        for word in word_emb_dict:
            loaded_emb_dim = len(word_emb_dict[word])
            break
        assert loaded_emb_dim == word_emb_dim, \
            "The dimension of defined word embedding is inconsistent with the pretrained embedding provided!"
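        # e.g. pairing a 100-d embedding file with word_emb_dim=300 fails here at the
        # assertion above, instead of surfacing later as a shape mismatch in the model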
        if self.input_dicts['word'].with_unk:
            word_emb_dict['<unk>'] = np.random.random(size=word_emb_dim)
        if self.input_dicts['word'].with_pad:
            word_emb_dict['<pad>'] = np.random.random(size=word_emb_dim)

        word_emb_matrix = []
        unknown_word_count = 0
        for i in range(self.input_dicts['word'].cell_num()):
            if self.input_dicts['word'].id_cell_map[i] in word_emb_dict:
                word_emb_matrix.append(word_emb_dict[self.input_dicts['word'].id_cell_map[i]])
            else:
                word_emb_matrix.append(word_emb_dict['<unk>'])
                unknown_word_count += 1
        word_emb_matrix = np.array(word_emb_matrix)
        logging.info("word embedding matrix shape:(%d, %d); unknown word count: %d;" %
                     (len(word_emb_matrix), len(word_emb_matrix[0]), unknown_word_count))
        logging.info("Word embedding loaded")
    else:
        word_emb_matrix = None

    return word_emb_matrix
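# A minimal usage sketch of build() above. The file name, column layout, embedding
# settings, and the surrounding `problem` instance are hypothetical; format and
# file_type are left at their defaults here and may need to be set explicitly
# depending on what load_embedding() expects for the embedding file, and
# answer_column_name is assumed to take the target column name(s).
#
#   file_columns = {"query": 0, "label": 1}
#   input_types = {"word": {"cols": ["query"], "dim": 300}}
#   word_emb_matrix = problem.build(
#       "train.tsv", file_columns, input_types, file_with_col_header=True,
#       answer_column_name=["label"], word2vec_path="glove.840B.300d.txt",
#       word_emb_dim=300, involve_all_words=False, cpu_num_workers=4)
#   # word_emb_matrix has one row per entry in problem.input_dicts['word'];
#   # words missing from the pretrained file all share the random <unk> row.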
def encode(self, data_path, file_columns, input_types, file_with_col_header, object_inputs,
           answer_column_name, min_sentence_len, extra_feature, max_lengths=None,
           fixed_lengths=None, file_format="tsv", show_progress=True, cpu_num_workers=-1):
    """

    Args:
        data_path: path to the data file to encode.
        file_columns: {
                "word1": 0,
                "word2": 1,
                "label": 2,
                "postag_feature1": 3,
                "postag_feature2": 4
            },
        input_types: {
                "word": {
                    "cols": ["word1", "word2"],
                    "dim": 300
                },
                "postag": {
                    "cols": ["postag_feature1", "postag_feature2"],
                    "dim": 20
                }
            }
            or
            {
                "bpe": {
                    "cols": ["word1", "word2"],
                    "dim": 100,
                    "bpe_path": "xxx.bpe"
                }
            }
        object_inputs: {
                "string1": ["word1", "postag_feature1"],
                "string2": ["word2", "postag_feature2"]
            },
        answer_column_name: 'label' or None. None means there is no target and the data is used for prediction only.
        max_lengths: if given as a dict, sequences longer than the specified maximum are cut first,
            and then all sequences are padded to the length of the longest remaining string, e.g.
            {
                "string1": 25,
                "string2": 100
            }
        fixed_lengths: if given as a dict, sequences are cut or padded to the fixed lengths, e.g.
            {
                "string1": 25,
                "string2": 100
            }
        file_format:

    Returns:
        data: indices, padded
            {
                'string1': {
                    'word1': [...],
                    'postag_feature1': [...]
                },
                'string2': {
                    'word2': [...],
                    'postag_feature2': [...]
                }
            }
        lengths: real length of data
            {
                'string1': [...],
                'string2': [...]
            }
        target: [...]

    """
    if 'bpe' in input_types:
        try:
            bpe_encoder = BPEEncoder(input_types['bpe']['bpe_path'])
        except KeyError:
            raise Exception('Please define a bpe path at the embedding layer.')
    else:
        bpe_encoder = None

    with open(data_path, 'r', encoding='utf-8') as fin:
        progress = self.get_data_list_from_file(fin, file_with_col_header)
        data, lengths, target, cnt_legal, cnt_illegal = self.encode_data_multi_processor(
            progress, cpu_num_workers, file_columns, input_types, object_inputs,
            answer_column_name, min_sentence_len, extra_feature, max_lengths,
            fixed_lengths, file_format, bpe_encoder=bpe_encoder)

    logging.info("%s: %d legal samples, %d illegal samples" % (data_path, cnt_legal, cnt_illegal))
    return data, lengths, target
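# A minimal usage sketch of encode() above, continuing the hypothetical setup from
# the build() sketch (column names, paths and the `problem` instance are assumptions;
# the shapes follow the docstring).
#
#   object_inputs = {"query_text": ["query"]}
#   data, lengths, target = problem.encode(
#       "dev.tsv", file_columns, input_types, file_with_col_header=True,
#       object_inputs=object_inputs, answer_column_name=["label"],
#       min_sentence_len=1, extra_feature=False,
#       max_lengths={"query_text": 50}, cpu_num_workers=4)
#   # data["query_text"]["query"]  -> padded word-id sequences
#   # lengths["query_text"]        -> true (unpadded) sequence lengths
#   # target                       -> encoded labels (when answer_column_name is given)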
def build(self, data_path_list, file_columns, input_types, file_with_col_header,
          answer_column_name, word2vec_path=None, word_emb_dim=None, format=None,
          file_type=None, involve_all_words=None, file_format="tsv", show_progress=True,
          cpu_num_workers=-1, max_vocabulary=800000, word_frequency=3):
    """

    Args:
        data_path_list: list of training data file paths.
        file_columns: {
                "word1": 0,
                "word2": 1,
                "label": 2,
                "postag_feature1": 3,
                "postag_feature2": 4
            },
        input_types: e.g.
            {
                "word": {
                    "cols": ["word1", "word2"],
                    "dim": 300
                },
                "postag": {
                    "cols": ["postag_feature1", "postag_feature2"],
                    "dim": 20
                }
            }
            or
            {
                "bpe": {
                    "cols": ["word1", "word2"],
                    "dim": 100,
                    "bpe_path": "xxx.bpe"
                }
            }
        word2vec_path: path to the pretrained word embedding file.
        word_emb_dim: dimension of the pretrained word embedding.
        involve_all_words: involve all words that show up in the pretrained embedding.
        file_format: "tsv", or "json". Note "json" means each sample is represented by a json string.

    Returns:
        word_emb_matrix if word2vec_path is given, otherwise None.

    """
    # parameter check
    if not word2vec_path:
        word_emb_dim, format, file_type, involve_all_words = None, None, None, None

    if 'bpe' in input_types:
        try:
            bpe_encoder = BPEEncoder(input_types['bpe']['bpe_path'])
        except KeyError:
            raise Exception('Please define a bpe path at the embedding layer.')
    else:
        bpe_encoder = None

    self.file_column_num = len(file_columns)
    progress = self.get_data_generator_from_file(data_path_list, file_with_col_header)
    preprocessed_data_generator = self.build_training_multi_processor(
        progress, cpu_num_workers, file_columns, input_types, answer_column_name,
        bpe_encoder=bpe_encoder)

    # update symbol universe
    total_cnt_legal, total_cnt_illegal = 0, 0
    for docs, target_docs, cnt_legal, cnt_illegal in tqdm(preprocessed_data_generator):
        total_cnt_legal += cnt_legal
        total_cnt_illegal += cnt_illegal

        # input_type
        for input_type in input_types:
            self.input_dicts[input_type].update(docs[input_type])

        # problem_type
        if ProblemTypes[self.problem_type] == ProblemTypes.classification or \
                ProblemTypes[self.problem_type] == ProblemTypes.sequence_tagging:
            self.output_dict.update(list(target_docs.values())[0])
        elif ProblemTypes[self.problem_type] == ProblemTypes.regression or \
                ProblemTypes[self.problem_type] == ProblemTypes.mrc:
            pass
    logging.info("Corpus imported: %d legal lines, %d illegal lines." %
                 (total_cnt_legal, total_cnt_illegal))
    # build dictionary
    for input_type in input_types:
        self.input_dicts[input_type].build(threshold=word_frequency,
                                           max_vocabulary_num=max_vocabulary)
        logging.info("%d types in %s column" %
                     (self.input_dicts[input_type].cell_num(), input_type))
    if self.output_dict:
        self.output_dict.build(threshold=0)
        logging.info("%d types in target column" % (self.output_dict.cell_num()))
    logging.debug("training data dict built")

    # embedding
    word_emb_matrix = None
    if word2vec_path:
        logging.info("Getting pre-trained embeddings...")
        word_emb_dict = None
        if involve_all_words is True:
            word_emb_dict = load_embedding(word2vec_path, word_emb_dim, format, file_type,
                                           with_head=False, word_set=None)
            self.input_dicts['word'].update([list(word_emb_dict.keys())])
            self.input_dicts['word'].build(threshold=0, max_vocabulary_num=len(word_emb_dict))
        else:
            extend_vocabulary = set()
            for single_word in self.input_dicts['word'].cell_id_map.keys():
                extend_vocabulary.add(single_word)
                if single_word.lower() != single_word:
                    extend_vocabulary.add(single_word.lower())
            word_emb_dict = load_embedding(word2vec_path, word_emb_dim, format, file_type,
                                           with_head=False, word_set=extend_vocabulary)

        for word in word_emb_dict:
            loaded_emb_dim = len(word_emb_dict[word])
            break
        assert loaded_emb_dim == word_emb_dim, \
            "The dimension of defined word embedding is inconsistent with the pretrained embedding provided!"

        logging.info("constructing embedding table")
        if self.input_dicts['word'].with_unk:
            word_emb_dict['<unk>'] = np.random.random(size=word_emb_dim)
        if self.input_dicts['word'].with_pad:
            word_emb_dict['<pad>'] = np.random.random(size=word_emb_dim)

        word_emb_matrix = []
        unknown_word_count = 0
        scale = np.sqrt(3.0 / word_emb_dim)
        for i in range(self.input_dicts['word'].cell_num()):
            single_word = self.input_dicts['word'].id_cell_map[i]
            if single_word in word_emb_dict:
                word_emb_matrix.append(word_emb_dict[single_word])
            elif single_word.lower() in word_emb_dict:
                word_emb_matrix.append(word_emb_dict[single_word.lower()])
            else:
                word_emb_matrix.append(np.random.uniform(-scale, scale, word_emb_dim))
                unknown_word_count += 1
        word_emb_matrix = np.array(word_emb_matrix)
        logging.info("word embedding matrix shape:(%d, %d); unknown word count: %d;" %
                     (len(word_emb_matrix), len(word_emb_matrix[0]), unknown_word_count))
        logging.info("Word embedding loaded")

    return word_emb_matrix
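# A usage sketch of the generator-based build() above (paths and settings are
# hypothetical). Unlike the earlier build(), it accepts a list of files, streams
# them through a generator, and initializes words missing from the pretrained file
# with uniform noise in [-sqrt(3/word_emb_dim), +sqrt(3/word_emb_dim)] after trying
# a lower-cased lookup, instead of reusing the <unk> vector.
#
#   word_emb_matrix = problem.build(
#       ["train_part1.tsv", "train_part2.tsv"], file_columns, input_types,
#       file_with_col_header=True, answer_column_name=["label"],
#       word2vec_path="glove.840B.300d.txt", word_emb_dim=300,
#       involve_all_words=False, max_vocabulary=800000, word_frequency=3)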