def main():
    """Execute the job.

    Load and pre-process the lyrics, build the song arrays and labels, and
    store them in .npy files.

    Command-line options:
        -n / --num_common_words: remove this many most-common words first.
        -w / --words_to_count:   list of words to be counted in each song.
        -v / --vectorize:        also vectorize each song.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        '-n', '--num_common_words',
        help='Number of most common words to be removed.',
        type=int)
    arg_parser.add_argument(
        '-w', '--words_to_count',
        help='The list of words to be counted in each song.',
        nargs='*')
    arg_parser.add_argument(
        '-v', '--vectorize',
        help='Perform the vectorization of each song.',
        action='store_true')
    options = arg_parser.parse_args()

    data_processor = DataProcessor(words_to_count=options.words_to_count)
    data_processor.dump_to_npy(
        num_common_words=options.num_common_words,
        vectorize=options.vectorize)
def bilstm_crf_train_eval(config, import_model, train_examples, dev_examples=None, test_examples=None):
    """Train a BiLSTM-CRF model and evaluate it on the test split.

    Builds DataLoaders for the train / dev / test example sets (dev and
    test are optional), logs the effective configuration, trains the model,
    and runs the final test pass on the best checkpoint returned by
    ``model_train``.
    """
    processor = DataProcessor(config.data_dir, config.do_lower_case)
    word2id = processor.get_vocab()
    config.vocab_size = len(word2id)

    def _build_loader(examples, shuffle):
        # Convert raw examples to CRF features and wrap them in a DataLoader.
        features = convert_examples_to_features_crf(
            examples=examples,
            word2id=word2id,
            label_list=config.label_list,
        )
        return DataLoader(BuildDataSet(features),
                          batch_size=config.batch_size,
                          shuffle=shuffle)

    train_loader = _build_loader(train_examples, shuffle=True)
    # Dev is shuffled (as in the original flow); test keeps input order.
    dev_loader = _build_loader(dev_examples, shuffle=True) if dev_examples else None
    test_loader = _build_loader(test_examples, shuffle=False) if test_examples else None

    logger.info("self config:\n {}".format(config_to_json_string(config)))

    model = import_model.Model(config).to(config.device)
    best_model = model_train(config, model, train_loader, dev_loader, to_crf=True)
    model_test(config, best_model, test_loader, to_crf=True)
def input_fn(filename, mode=tf.estimator.ModeKeys.EVAL, num_epochs=1, batch_size=32):
    """Build a ``tf.data.Dataset`` of ({"instances": features}, label_ids).

    Reads (labels, lines) from ``filename`` via ``DataProcessor``, maps
    characters/words to ids with the module-level ``label_dict`` /
    ``word_dict``, clips or right-pads every sequence to
    ``config.max_seq_length``, and packs word ids, segmentation ids and the
    raw sequence length into a single feature matrix.

    Shuffling and infinite repetition are enabled only in TRAIN mode.
    """
    labels, lines = DataProcessor().read_data(filename)
    shuffle = mode == tf.estimator.ModeKeys.TRAIN
    num_threads = multiprocessing.cpu_count()
    buffer_size = 2 * batch_size + 1

    print("")
    print("* data input_fn:")
    print("================")
    print("Batch size: {}".format(batch_size))
    print("Epoch Count: {}".format(num_epochs))
    print("Mode: {}".format(mode))
    print("Thread Count: {}".format(num_threads))
    print("Shuffle: {}".format(shuffle))
    print("================")
    print("")

    max_seq_length = config.max_seq_length

    def _fit_length(seq, pad_value):
        # Clip to max_seq_length, or right-pad with pad_value up to it.
        if len(seq) >= max_seq_length:
            return seq[:max_seq_length]
        return seq + [pad_value] * (max_seq_length - len(seq))

    # Character-level tag ids; unknown tags fall back to the 'O' label.
    labels_id = []
    for label in labels:
        tag_ids = [label_dict[ch] if ch in label_dict else label_dict['O']
                   for ch in label]
        labels_id.append(np.array(_fit_length(tag_ids, label_dict['O']),
                                  dtype=np.int32))

    words_id, segs_id, lengths = [], [], []
    for line in lines:
        seg_id = get_seg_features(line)
        word_id = [word_dict[word] if word in word_dict else word_dict[UNK]
                   for word in line]
        # NOTE(review): the length is recorded BEFORE clipping, so it may
        # exceed max_seq_length for long lines — confirm downstream use.
        lengths.append(len(word_id))
        seg_id = _fit_length(seg_id, 2)
        word_id = _fit_length(word_id, word_dict[PAD_WORD])
        segs_id.append(np.array(seg_id, dtype=np.int32))
        words_id.append(np.array(word_id, dtype=np.int32))
        assert len(seg_id) == len(word_id)
    assert len(words_id) == len(labels_id)

    # words_id: (None, max_seq_length)  segs_id: (None, max_seq_length)  lengths: (None,)
    res = np.concatenate(
        [np.array(words_id),
         np.array(segs_id),
         np.reshape(np.array(lengths), (-1, 1))],
        axis=-1)
    dataset = tf.data.Dataset.from_tensor_slices(
        ({"instances": res}, np.array(labels_id, dtype=np.int32)))

    if shuffle:
        dataset = dataset.shuffle(buffer_size)
    dataset = dataset.batch(batch_size)
    # TRAIN repeats forever (None); EVAL/PREDICT makes a single pass.
    dataset = dataset.repeat(None if mode == tf.estimator.ModeKeys.TRAIN else 1)
    dataset = dataset.prefetch(buffer_size)
    return dataset
type=str, required=True, help='choose a model: bilstm, bilstm_crf, hmm,cnn') args = parser.parse_args() model_name = args.model import_model = import_module('model.' + model_name) config = import_model.Config() random_seed(config.seed) set_logger(config.logging_dir) # load data if args.model == 'hmm': processor = HMMDataProcessor(config.data_dir, config.do_lower_case) else: processor = DataProcessor(config.data_dir, config.do_lower_case) train_examples = processor.get_train_examples() config.train_num_examples = len(train_examples) dev_examples = processor.get_dev_examples() config.dev_num_examples = len(dev_examples) test_examples = processor.get_test_examples() config.test_num_examples = len(test_examples) config.label_list = processor.get_tagging() config.num_label = len(config.label_list) if args.model == 'bilstm': bilstm_train_eval(config, import_model=import_model, train_examples=train_examples, dev_examples=dev_examples,