# Example #1
import sys
import ioutils
import cPickle

print "Reading corpus from", sys.argv[1]

useful_data = ioutils.read_corpus(sys.argv[1], False)

print "Writing..."

with open(sys.argv[2], "wb") as f:
    cPickle.dump(useful_data, f)


    # Set up the output directory: either create a fresh timestamped run
    # directory under args.save (and dump all CLI arguments to argument.txt
    # for reproducibility), or reuse an existing directory when args.load
    # points at a previous run.
    if not args.load:
        timestamp = (datetime.datetime.fromtimestamp(time.time()).strftime('%Y_%m_%d_%H_%M_%S'))
        args.save = args.save + "/" + timestamp
        os.mkdir(args.save)
        f = open(args.save + "/argument.txt", "w")
        for i in vars(args):
            f.write(str(i) + "\t" + str(vars(args)[i]) + "\n")
        f.close()
    else:
        args.save = args.load

    utils.config_logger(args.verbose)
    logger = utils.get_logger('train')
    logger.info('Reading training data')
    # NOTE(review): only the training split gets the extra args.ratio
    # argument -- presumably a subsampling ratio; confirm in ioutils.
    train_pairs, train_max = ioutils.read_corpus(args.train, args.lower, args.lang, args.ratio)
    logger.info('Reading validation data')
    valid_pairs, valid_max = ioutils.read_corpus(args.validation, args.lower, args.lang)
    logger.info('Reading test data')
    test_pairs, test_max = ioutils.read_corpus(args.test, args.lower, args.lang)
    logger.info('Reading word embeddings')
    word_dict, embeddings = ioutils.load_embeddings(args.embeddings, args.vocab)
    max_len = None
    #print(train_pairs)
    #embeddings = utils.normalize_embeddings(embeddings)
    logger.debug('Embeddings have shape {} (including unknown, padding and null)'
                 .format(embeddings.shape))

    logger.info('Converting words to indices')
    # find out which labels are there in the data (more flexible to different datasets)
    label_dict = utils.create_label_dict(train_pairs)
# Example #3
    # Build a TF session that grows GPU memory on demand instead of
    # reserving it all up front.
    session_config = tf.ConfigProto(allow_soft_placement=True)
    session_config.gpu_options.allow_growth = True
    # session_config.gpu_options.per_process_gpu_memory_fraction = 0.5
    sess = tf.InteractiveSession(config=session_config)

    # Restore the trained model plus the embeddings and label mapping it
    # was trained with (load_extra_from pulls model-specific extra vectors).
    model_class = utils.get_model_class(params)
    model, _ = model_class.load(args.model, sess)
    word_dict, embeddings = ioutils.load_embeddings(args.embeddings,
                                                    args.vocabulary,
                                                    generate=False,
                                                    load_extra_from=args.model,
                                                    normalize=True)
    model.initialize_embeddings(sess, embeddings)
    label_dict = ioutils.load_label_dict(args.model)

    pairs = ioutils.read_corpus(args.dataset, params['lowercase'],
                                params['language'])
    dataset = utils.create_dataset(pairs, word_dict, label_dict)

    genres = None

    # PEP 8: compare against None with `is not`, never `!=`.
    if args.genres is not None:
        genres = utils.read_genres(args.genres)

    loss, acc, answers, logits = model.evaluate(sess, dataset, True)
    print('# problems: %s' % dataset.num_items)
    print('Loss: %f' % loss)
    print('Accuracy: %f' % acc)

    # Guard on the value actually populated above; the original checked
    # truthiness of args.genres here, which disagreed with the `!= None`
    # check used to load it.
    if genres is not None:
        print_acc_per_genre(pairs, answers, logits, label_dict, genres)
# Example #4
    # Load embeddings exactly as stored (no generated extra vectors, no
    # normalization) so they match the checkpointed model.
    word_dict, embeddings = ioutils.load_embeddings(args.embeddings,
                                                    args.vocabulary,
                                                    generate=False,
                                                    load_extra_from=None,
                                                    normalize=False)

    model.initialize_embeddings(sess, embeddings)
    label_dict = ioutils.load_label_dict(args.model)

    # Read the evaluation corpus (lowercased) and convert it to index
    # tensors padded to the lengths the model was trained with.
    pairs, wordpairs = ioutils.read_corpus(args.dataset, True,
                                           params['language'])
    dataset, _, _ = utils.create_dataset(pairs,
                                         wordpairs,
                                         word_dict,
                                         label_dict,
                                         max_len1=model.maxlen1,
                                         max_len2=model.maxlen2)

    # Bug fix: the original passed the count as a second positional print
    # argument ('...%d', n), which prints the literal format string; the
    # value must be %-interpolated instead.
    print("Test Dataset Size :%d" % dataset.num_items)
    loss, acc, answers, logits = model.evaluate(sess,
                                                dataset,
                                                True,
                                                batch_size=64)
    #print(answers)
    print(np.array(logits).shape)
    label_dict_inverse = {}
# Example #5
                        default=0.0)
    # Training-loop reporting and optimizer options.
    parser.add_argument('--report', help='Number of batches between '
                                         'performance reports',
                        default=100, type=int)
    parser.add_argument('-v', help='Verbose', action='store_true',
                        dest='verbose')
    parser.add_argument('--optim', help='Optimizer algorithm',
                        default='adagrad',
                        choices=['adagrad', 'adadelta', 'adam'])

    args = parser.parse_args()

    # Record the exact command line so a run can be reproduced from the log.
    utils.config_logger(args.verbose)
    logger = utils.get_logger('train')
    logger.debug('Training with following options: %s' % ' '.join(sys.argv))
    train_pairs = ioutils.read_corpus(args.train, args.lower, args.lang)
    valid_pairs = ioutils.read_corpus(args.validation, args.lower, args.lang)

    # whether to generate embeddings for unknown, padding, null
    word_dict, embeddings = ioutils.load_embeddings(args.embeddings, args.vocab,
                                                    True, normalize=True)

    logger.info('Converting words to indices')
    # find out which labels are there in the data
    # (more flexible to different datasets)
    label_dict = utils.create_label_dict(train_pairs)
    train_data = utils.create_dataset(train_pairs, word_dict, label_dict)
    valid_data = utils.create_dataset(valid_pairs, word_dict, label_dict)

    # Persist preprocessing choices next to the model so evaluation can
    # reload them later (see ioutils.load_label_dict usage elsewhere).
    ioutils.write_params(args.save, lowercase=args.lower, language=args.lang,
                         model=args.model)
# Example #6
                        default=0,
                        type=int)
    parser.add_argument('--continue',
                        help='Continue training.',
                        action='store_true',
                        dest='cont')
    parser.add_argument('--warm-start',
                        help='Use pre-trained model.',
                        dest='warm')

    args = parser.parse_args()

    utils.config_logger(args.verbose)
    logger = utils.get_logger('train')
    logger.debug('Training with following options: %s' % ' '.join(sys.argv))
    train_pairs = ioutils.read_corpus(args.train, args.lower, args.lang)
    valid_pairs = ioutils.read_corpus(args.validation, args.lower, args.lang)

    if args.additional_training != None:
        train_pairs += ioutils.read_corpus(args.additional_training,
                                           args.lower, args.lang)

    assert (not args.cont)  # Not implemented yet.

    # whether to generate embeddings for unknown, padding, null
    is_really_cont = args.warm != None or (args.cont and os.path.exists(
        os.path.join(args.save, "model.meta")))
    warmup_model = args.warm

    if is_really_cont:
        logger.info('Found a model. Fine-tuning...')