def do_train(args):
    """
    Train the model using the provided arguments.

    Loads the raw data from `args.input`, splits it into a training and a
    validation part according to `args.dev_split`, builds the model and
    then, for `args.n_epochs` epochs, trains on the training part and
    scores the validation part in minibatches of `args.batch_size`.
    Finally the model is saved via `save_model(model, args.model,
    args.weights)`.

    Also reads `args.wvecs`, `args.n_words` and `args.output_type`; `args`
    itself is forwarded to `build_model`.
    """
    # Assumption: it is cheap to store all the data in text form in
    # memory (it's only about 144mb)
    _, X, y = load_data_raw(args.input)
    X_train, y_train, X_val, y_val = split_data(X, y, args.dev_split)

    # Assumption: word vector model will also easily fit in memory.
    wvecs = WordVectorModel.from_file(args.wvecs, False, '*UNKNOWN*')

    # Typical values are 50, 50
    input_shape = (1, args.n_words, wvecs.dim)
    # Computed once: it is both the model's output size and the width of
    # the one-hot label encoding used for every batch.
    n_labels = len(LABELS)

    # Build model
    model = build_model(
        args,
        input_shape=input_shape,
        output_shape=n_labels,
        output_type=args.output_type,
    )

    # Training data on the other hand will not fit in memory once
    # embedded. Each input instance is a 50x50 matrix with 8 bytes per
    # value: that's about 20kb. Assuming we want to store only about
    # 500mb in memory at a time, that means we want at most 25k items in
    # a batch. Typically minibatches of 32-128 are probably ok, so the
    # sentences are embedded one minibatch at a time.
    for epoch in range(args.n_epochs):
        log("== Training model, epoch {}", epoch)

        scorer = _run_batches(
            model.train_on_batch, model, wvecs,
            X_train, y_train, args.batch_size, n_labels)
        log("=== train error: {}", scorer)

        scorer = _run_batches(
            model.test_on_batch, model, wvecs,
            X_val, y_val, args.batch_size, n_labels)
        log("=== val error: {}", scorer)

    ## Save the model (once, after the last epoch)
    save_model(model, args.model, args.weights)


def _run_batches(step, model, wvecs, X, y, batch_size, n_labels):
    """
    Run `step` over (X, y) one minibatch at a time.

    `step` is called with the embedded sentences and the one-hot encoded
    labels of each batch (do_train passes `model.train_on_batch` or
    `model.test_on_batch`). Its result is fed to a fresh `Scorer(model)`
    together with the batch length; that scorer is returned.
    """
    scorer = Scorer(model)
    for xy in tqdm(grouper(batch_size, zip(X, y))):
        X_batch, y_batch = zip(*xy)
        # Only the current batch is ever held in embedded form.
        X_batch = wvecs.embed_sentences(X_batch)
        y_batch = array(make_one_hot(y_batch, n_labels))
        score = step(X_batch, y_batch)
        scorer.update(score, len(X_batch))
    return scorer
def do_run(args):
    """
    Run the neural net to predict on new data.

    Reads tab-separated rows from `args.input`, and for every row pairs
    its `id` with the tokenized, ASCII-converted `text`. The rows are
    pushed through the model in batches of `args.batch_size`, and the
    predictions are written to `args.output` as tab-separated values: a
    header of 'id' followed by LABELS, then one line per input row with
    its id and the predicted values converted to float.
    """
    # Load the model and weights
    model = load_model(args.model, args.weights)
    wvecs = WordVectorModel.from_file(args.wvecs, False, '*UNKNOWN*')

    # Lazily stream (id, tokens) pairs so the input is never held in
    # memory as a whole.
    rows = RowObjectFactory.from_stream(csv.reader(args.input, delimiter="\t"))
    data = ((tweet.id, tokenize(to_ascii(tweet.text))) for tweet in rows)

    writer = csv.writer(args.output, delimiter='\t')
    writer.writerow(['id'] + LABELS)

    for batch in tqdm(grouper(args.batch_size, data)):
        ids_batch, X_batch = zip(*batch)
        embedded = wvecs.embed_sentences(X_batch)
        predictions = model.predict_on_batch(embedded)
        for tweet_id, prediction in zip(ids_batch, predictions):
            # float() turns each predicted value into a plain Python
            # float before it is handed to the csv writer.
            writer.writerow([tweet_id] + list(map(float, prediction)))