Example #1
0
def do_train(args):
    """
    Fit a model on the labelled data in ``args.input`` and save it.

    Attributes read from ``args``: ``input``, ``dev_split``, ``wvecs``,
    ``n_words``, ``output_type``, ``n_epochs``, ``batch_size``, ``model``
    and ``weights``. ``args`` itself is also handed to ``build_model``.

    Every epoch makes one pass over the training split with
    ``model.train_on_batch`` and one pass over the validation split with
    ``model.test_on_batch``, logging the accumulated ``Scorer`` after each.
    The model is saved once, after the last epoch.
    """
    # The raw text is kept in memory in full; the original author assumed
    # this is cheap (about 144mb).
    _, X, y = load_data_raw(args.input)
    X_train, y_train, X_val, y_val = split_data(X, y, args.dev_split)

    # The word vector model is likewise assumed to fit in memory.
    wvecs = WordVectorModel.from_file(args.wvecs, False, '*UNKNOWN*')

    # Typical values for n_words and wvecs.dim are 50, 50.
    model = build_model(
        args,
        input_shape=(1, args.n_words, wvecs.dim),
        output_shape=len(LABELS),
        output_type=args.output_type,
    )

    def run_pass(sentences, targets, step):
        """Feed one split through ``step`` batch by batch; return the Scorer."""
        scorer = Scorer(model)
        batches = grouper(args.batch_size, zip(sentences, targets))
        for chunk in tqdm(batches):
            raw_X, raw_y = zip(*chunk)
            # Embedding happens per batch: the original author estimated
            # ~20kb per embedded instance (50x50 values, 8 bytes each), so
            # the embedded data set is never materialised as a whole.
            X_batch = wvecs.embed_sentences(raw_X)
            y_batch = array(make_one_hot(raw_y, len(LABELS)))
            # The batch size reported is the length of the embedded batch.
            scorer.update(step(X_batch, y_batch), len(X_batch))
        return scorer

    for epoch in range(args.n_epochs):
        log("== Training model, epoch {}", epoch)
        log("=== train error: {}", run_pass(X_train, y_train, model.train_on_batch))
        log("=== val error: {}", run_pass(X_val, y_val, model.test_on_batch))

    ## Save the model
    save_model(model, args.model, args.weights)
Example #2
0
def do_run(args):
    """
    Predict label scores for new data and write them as tab-separated rows.

    Attributes read from ``args``: ``model``, ``weights``, ``wvecs``,
    ``input``, ``output`` and ``batch_size``.

    The output starts with a header row, ``id`` followed by ``LABELS``.
    Each later row holds an input row's id followed by the model's
    predictions for it, converted to ``float``.
    """
    # Restore the trained model and the word vectors used to embed text.
    model = load_model(args.model, args.weights)
    wvecs = WordVectorModel.from_file(args.wvecs, False, '*UNKNOWN*')

    # Lazily turn each tab-separated input row into (id, tokens).
    rows = RowObjectFactory.from_stream(csv.reader(args.input, delimiter="\t"))
    data = (
        (tweet.id, tokenize(to_ascii(tweet.text)))
        for tweet in rows
    )

    writer = csv.writer(args.output, delimiter='\t')
    header = ['id'] + LABELS
    writer.writerow(header)

    for chunk in tqdm(grouper(args.batch_size, data)):
        ids_batch, sentences = zip(*chunk)
        embedded = wvecs.embed_sentences(sentences)
        predictions = model.predict_on_batch(embedded)
        for tweet_id, scores in zip(ids_batch, predictions):
            writer.writerow([tweet_id] + list(map(float, scores)))