Example #1
import argparse


def main():
    '''Execute the job.
    Load and pre-process the lyrics,
    get the song arrays and labels and
    store them in npy files.

    If the num_common_words option is specified,
    the num_common_words most common words
    are removed from the lyrics.
    '''

    parser = argparse.ArgumentParser()
    parser.add_argument('-n',
                        '--num_common_words',
                        help='Number of most common words to be removed.',
                        type=int)
    parser.add_argument('-w',
                        '--words_to_count',
                        help='The list of words to be counted in each song.',
                        nargs='*')
    parser.add_argument('-v',
                        '--vectorize',
                        help='Perform the vectorization of each song.',
                        action='store_true')
    args = parser.parse_args()

    processor = DataProcessor(words_to_count=args.words_to_count)
    processor.dump_to_npy(num_common_words=args.num_common_words,
                          vectorize=args.vectorize)
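This is a standard argparse entry point. Assuming the module is saved as preprocess.py (the file name is an assumption, not part of the snippet), removing the 50 most common words, counting two specific words per song, and vectorizing would be invoked like this; note that nargs='*' collects everything after -w/--words_to_count into a list:

python preprocess.py --num_common_words 50 --words_to_count love baby --vectorize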
Example #2
from torch.utils.data import DataLoader


def bilstm_crf_train_eval(config,
                          import_model,
                          train_examples,
                          dev_examples=None,
                          test_examples=None):
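    """Train a BiLSTM-CRF tagger and evaluate it on the optional dev/test splits."""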
    processor = DataProcessor(config.data_dir, config.do_lower_case)
    word2id = processor.get_vocab()
    config.vocab_size = len(word2id)
    train_features = convert_examples_to_features_crf(
        examples=train_examples,
        word2id=word2id,
        label_list=config.label_list,
    )
    train_dataset = BuildDataSet(train_features)
    train_loader = DataLoader(train_dataset,
                              batch_size=config.batch_size,
                              shuffle=True)
    if dev_examples:
        dev_features = convert_examples_to_features_crf(
            examples=dev_examples,
            word2id=word2id,
            label_list=config.label_list,
        )
        dev_dataset = BuildDataSet(dev_features)
        dev_loader = DataLoader(dev_dataset,
                                batch_size=config.batch_size,
                                shuffle=True)
    else:
        dev_loader = None

    if test_examples:
        test_features = convert_examples_to_features_crf(
            examples=test_examples,
            word2id=word2id,
            label_list=config.label_list,
        )
        test_dataset = BuildDataSet(test_features)
        test_loader = DataLoader(test_dataset,
                                 batch_size=config.batch_size,
                                 shuffle=False)
    else:
        test_loader = None

    logger.info("self config:\n {}".format(config_to_json_string(config)))

    model = import_model.Model(config).to(config.device)
    best_model = model_train(config,
                             model,
                             train_loader,
                             dev_loader,
                             to_crf=True)
    model_test(config, best_model, test_loader, to_crf=True)
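BuildDataSet and convert_examples_to_features_crf are project helpers that are not shown in these snippets. As a rough sketch of the shape such a dataset wrapper usually takes (the attribute names token_ids and label_ids are assumptions, not taken from the code above), a map-style PyTorch dataset over the feature list could look like this:

import torch
from torch.utils.data import Dataset

class BuildDataSet(Dataset):
    """Minimal map-style dataset over pre-computed CRF features (sketch)."""

    def __init__(self, features):
        self.features = features

    def __len__(self):
        return len(self.features)

    def __getitem__(self, index):
        feature = self.features[index]
        # Attribute names are assumed; the real feature objects may differ.
        token_ids = torch.tensor(feature.token_ids, dtype=torch.long)
        label_ids = torch.tensor(feature.label_ids, dtype=torch.long)
        return token_ids, label_ids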
Example #3
import multiprocessing

import numpy as np
import tensorflow as tf


def input_fn(filename, mode=tf.estimator.ModeKeys.EVAL,
             num_epochs=1,
             batch_size=32):
    """Build a batched (and, for training, shuffled) tf.data.Dataset from filename."""
    # config, label_dict, word_dict, UNK, PAD_WORD, get_seg_features and
    # DataProcessor are module-level definitions from the surrounding project.
    labels, lines = DataProcessor().read_data(filename)
    shuffle = mode == tf.estimator.ModeKeys.TRAIN
    num_threads = multiprocessing.cpu_count()
    buffer_size = 2 * batch_size + 1
    print("")
    print("* data input_fn:")
    print("================")
    print("Batch size: {}".format(batch_size))
    print("Epoch Count: {}".format(num_epochs))
    print("Mode: {}".format(mode))
    print("Thread Count: {}".format(num_threads))
    print("Shuffle: {}".format(shuffle))
    print("================")
    print("")
    max_seq_length = config.max_seq_length
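    # Map each character-level tag to its id, then pad or truncate every
    # label sequence to exactly max_seq_length (unknown tags fall back to 'O').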
    labels_id = []
    for label in labels:
        label_list = []
        for ch in label:
            if ch in label_dict:
                label_list.append(label_dict[ch])
            else:
                label_list.append(label_dict['O'])
        if len(label_list) >= max_seq_length:
            label_list = label_list[:max_seq_length]
        else:
            label_list = label_list + [label_dict['O']] * (max_seq_length - len(label_list))
        labels_id.append(np.array(label_list, dtype=np.int32))
    words_id = []
    segs_id = []
    lengths = []
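    # Convert each line to word ids and segmentation features, both padded or
    # truncated to max_seq_length; out-of-vocabulary words map to the UNK id.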
    for line in lines:
        seg_id = get_seg_features(line)
        word_id = []
        for word in line:
            if word in word_dict:
                word_id.append(word_dict[word])
            else:
                word_id.append(word_dict[UNK])
        lengths.append(len(word_id))
        if len(seg_id) >= max_seq_length:
            seg_id = seg_id[:max_seq_length]
        else:
            seg_id = seg_id + [2] * (max_seq_length - len(seg_id))
        if len(word_id) >= max_seq_length:
            word_id = word_id[:max_seq_length]
        else:
            word_id = word_id + [word_dict[PAD_WORD]] * (max_seq_length - len(word_id))
        segs_id.append(np.array(seg_id, dtype=np.int32))
        words_id.append(np.array(word_id, dtype=np.int32))
        assert len(seg_id) == len(word_id)
    assert len(words_id) == len(labels_id)
    # words_id: (None, max_seq_length), segs_id: (None, max_seq_length), lengths: (None,)
    res = np.concatenate([np.array(words_id), np.array(segs_id), np.reshape(np.array(lengths), (-1, 1))], axis=-1)
    dataset = tf.data.Dataset.from_tensor_slices(({"instances":res}, np.array(labels_id, dtype=np.int32)))
    if shuffle:
        dataset = dataset.shuffle(buffer_size)
    dataset = dataset.batch(batch_size)
    if mode == tf.estimator.ModeKeys.TRAIN:
        dataset = dataset.repeat(None)
    else:
        dataset = dataset.repeat(1)
    dataset = dataset.prefetch(buffer_size)
    return dataset
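Because input_fn takes arguments, it cannot be handed to a tf.estimator.Estimator directly; the usual pattern is to close over them with a lambda. A minimal sketch, assuming an estimator object and the file paths already exist (both are assumptions):

# estimator, 'train.txt' and 'dev.txt' are assumed to exist in the project.
estimator.train(
    input_fn=lambda: input_fn('train.txt',
                              mode=tf.estimator.ModeKeys.TRAIN,
                              batch_size=32),
    steps=1000)
estimator.evaluate(
    input_fn=lambda: input_fn('dev.txt',
                              mode=tf.estimator.ModeKeys.EVAL))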
Example #4
    parser = argparse.ArgumentParser()
    parser.add_argument('--model',
                        type=str,
                        required=True,
                        help='choose a model: bilstm, bilstm_crf, hmm, cnn')
    args = parser.parse_args()

    model_name = args.model
    import_model = import_module('model.' + model_name)
    config = import_model.Config()
    random_seed(config.seed)
    set_logger(config.logging_dir)

    # load data
    if args.model == 'hmm':
        processor = HMMDataProcessor(config.data_dir, config.do_lower_case)
    else:
        processor = DataProcessor(config.data_dir, config.do_lower_case)

    train_examples = processor.get_train_examples()
    config.train_num_examples = len(train_examples)
    dev_examples = processor.get_dev_examples()
    config.dev_num_examples = len(dev_examples)
    test_examples = processor.get_test_examples()
    config.test_num_examples = len(test_examples)
    config.label_list = processor.get_tagging()
    config.num_label = len(config.label_list)

    if args.model == 'bilstm':
        bilstm_train_eval(config,
                          import_model=import_model,
                          train_examples=train_examples,
                          dev_examples=dev_examples,
                          test_examples=test_examples)
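The import_module('model.' + model_name) dispatch relies on every module under model/ exposing the same minimal interface. A sketch of that contract (all field values are illustrative assumptions, not taken from the snippets):

# model/bilstm.py (sketch): Example #4 only requires that each model module
# define a Config class and, for the neural models, a Model class.
import torch.nn as nn

class Config:
    def __init__(self):
        self.seed = 42               # illustrative values only
        self.data_dir = 'data/'
        self.do_lower_case = True
        self.logging_dir = 'logs/'

class Model(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        # A real implementation would define embeddings, a BiLSTM, and
        # an output projection here.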