def main(pred_path, gold_path):
    """Trains a Naive Bayes classifier on the gold corpus and predicts
    the emotion of every tweet in the prediction corpus.

    Args:
        pred_path: Path to a file with tweets to classify.
        gold_path: Path to a file with the training gold standard.
    """
    pred_corpus = Corpus(pred_path)
    train_corpus = Corpus(gold_path)
    nb = NaiveBayes(train_corpus, pred_corpus)

    train_start_timestamp = time.time()
    nb.train(train_corpus)
    train_elapsed = time.time() - train_start_timestamp

    for i, tweet in enumerate(pred_corpus):
        if i % 1000 == 0 and i != 0:
            print('\r%i' % i, file=sys.stderr, end='')
        tweet.prediction = nb.predict(tweet)

    eval_results = pred_corpus.evaluate()
    print('\r' + pformat(eval_results) + '\n')
    print(' == Macro Avg F_Score: %.2f ==\n'
          % eval_results['macro_avg_f_score'])
    print(' == Eval Accuracy: %.2f ==\n' % eval_results['accuracy'])
    print(' Time elapsed: Train: %.2f\n' % train_elapsed)
    print(' %s\n' % ('-' * 78))
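# NOTE (illustration): `Corpus` and `NaiveBayes` are project classes not
# shown in this section. Below is a minimal multinomial Naive Bayes
# sketch that would satisfy the train/predict interface used above.
# Assumptions: add-one smoothing, and `tweet.content` as the token list
# (as used elsewhere in this section). Not the project's actual
# implementation.
import math
from collections import Counter, defaultdict


class NaiveBayesSketch:
    """Illustrative multinomial Naive Bayes with the interface above."""

    def __init__(self, *corpora):
        self.priors = {}       # log P(class)
        self.likelihoods = {}  # log P(word | class), per class
        self.unseen = {}       # smoothed log-probability for unseen words

    def train(self, corpus, alpha=1.0):
        class_counts = Counter()
        word_counts = defaultdict(Counter)
        vocabulary = set()
        for tweet in corpus:
            class_counts[tweet.emotion] += 1
            for word in tweet.content:
                word_counts[tweet.emotion][word] += 1
                vocabulary.add(word)
        total = sum(class_counts.values())
        for cls, count in class_counts.items():
            self.priors[cls] = math.log(count / total)
            denom = sum(word_counts[cls].values()) + alpha * len(vocabulary)
            self.likelihoods[cls] = {
                word: math.log((word_counts[cls][word] + alpha) / denom)
                for word in vocabulary}
            self.unseen[cls] = math.log(alpha / denom)

    def predict(self, tweet):
        # Argmax over classes of log prior + summed word log likelihoods.
        def score(cls):
            return self.priors[cls] + sum(
                self.likelihoods[cls].get(word, self.unseen[cls])
                for word in tweet.content)
        return max(self.priors, key=score)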
def main(train_path: str, eval_path: str, epochs: int) -> None:
    """Trains a perceptron on textual features and evaluates it after
    every epoch.

    Args:
        train_path: Path to the training corpus.
        eval_path: Path to the evaluation corpus.
        epochs: Number of training epochs.
    """
    train_corpus = Corpus(train_path)
    eval_corpus = Corpus(eval_path)
    feature_extractor = TextualFeatureExtractor(
        [train_corpus, eval_corpus], caching=True)
    classes = feature_extractor.classes
    p = Perceptron(classes)

    for epoch in range(1, epochs + 1):
        # Training pass: update the perceptron on every (shuffled) tweet.
        train_start_timestamp = time.time()
        train_corpus.shuffle()
        for i, tweet in enumerate(train_corpus):
            feature_vector = feature_extractor.get_feature_vector(tweet)
            prediction = p.train(feature_vector, tweet.emotion)
            tweet.prediction = max(prediction.items(), key=itemgetter(1))[0]
            if i % 1000 == 0 and i != 0:
                print('\r%i' % i, file=sys.stderr, end='')
        train_elapsed = time.time() - train_start_timestamp
        train_results = train_corpus.evaluate()

        # Evaluation pass: predict without updating the weights.
        eval_start_timestamp = time.time()
        eval_corpus.shuffle()
        for tweet in eval_corpus:
            feature_vector = feature_extractor.get_feature_vector(tweet)
            prediction = p.predict(feature_vector)
            tweet.prediction = max(prediction.items(), key=itemgetter(1))[0]
        eval_elapsed = time.time() - eval_start_timestamp
        eval_results = eval_corpus.evaluate()

        print('\rEpoch %i:' % epoch)
        print(pformat(eval_results) + '\n')
        print(' == Train Macro Avg F_Score: %.2f =='
              % train_results['macro_avg_f_score'])
        print(' == Eval Macro Avg F_Score: %.2f =='
              % eval_results['macro_avg_f_score'])
        print(' == Train Accuracy: %.2f ==' % train_results['accuracy'])
        print(' == Eval Accuracy: %.2f ==\n' % eval_results['accuracy'])
        print(' Time elapsed: Train: %.2f Eval %.2f'
              % (train_elapsed, eval_elapsed))
        print(' %s\n' % ('-' * 78))
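# NOTE (illustration): `Perceptron.train` above returns a dict of
# per-class scores and is expected to update the weights on mistakes.
# Below is a standard multiclass perceptron sketch matching that
# interface. Assumption: `feature_vector` is a dict mapping feature
# names to counts. Not the project's actual implementation.
from collections import defaultdict


class PerceptronSketch:
    """Illustrative multiclass perceptron with the interface above."""

    def __init__(self, classes):
        self.classes = list(classes)
        # One sparse weight vector per class.
        self.weights = {cls: defaultdict(float) for cls in self.classes}

    def predict(self, feature_vector):
        # Score every class; callers take the argmax themselves.
        return {cls: sum(self.weights[cls][f] * v
                         for f, v in feature_vector.items())
                for cls in self.classes}

    def train(self, feature_vector, gold_class):
        scores = self.predict(feature_vector)
        predicted = max(scores, key=scores.get)
        if predicted != gold_class:
            # On a mistake, move weights toward the gold class and away
            # from the wrongly predicted one.
            for f, v in feature_vector.items():
                self.weights[gold_class][f] += v
                self.weights[predicted][f] -= v
        return scores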
def main(pred_path, gold_path): """Evaluates prediction results against a gold standard and prints the resulting precision, recall and F-Score to stdout. Args: pred_path: Path to a file with prediction results. gold_path: Path to a file with the corresponding gold standard to compare against. """ corpus = Corpus(gold_path, pred_path) eval_results = corpus.evaluate() print(pformat(eval_results) + '\n') print(' == Accuracy: %.2f ==' % (eval_results['accuracy']))
def main(image_path: str, image_index_path: str, train_path: str,
         eval_path: str, epochs: int) -> None:
    """Two-stage classifier: a perceptron on textual features whose
    per-class scores are then fed, together with image features, into a
    second perceptron.

    Args:
        image_path: Path to the directory containing the tweet images.
        image_index_path: Path to the index mapping tweets to images.
        train_path: Path to the training corpus.
        eval_path: Path to the evaluation corpus.
        epochs: Number of training epochs per stage.
    """
    # Stage 1: text-only perceptron; its per-class scores are collected
    # in `predictions` for use as features in stage 2.
    text_train_corpus = Corpus(train_path)
    text_eval_corpus = Corpus(eval_path)
    text_feature_extractor = TextualFeatureExtractor(
        [text_train_corpus, text_eval_corpus], caching=True)
    classes = text_feature_extractor.classes
    text_classifier = Perceptron(classes)
    predictions = {}

    for epoch in range(1, epochs + 1):
        train_start_timestamp = time.time()
        text_train_corpus.shuffle()
        for i, tweet in enumerate(text_train_corpus):
            feature_vector = text_feature_extractor.get_feature_vector(tweet)
            prediction = text_classifier.train(feature_vector, tweet.emotion)
            tweet.prediction = max(prediction.items(), key=itemgetter(1))[0]
            predictions[tweet.tweet_id] = prediction
            if i % 1000 == 0 and i != 0:
                print('\r%i' % i, file=sys.stderr, end='')
        train_elapsed = time.time() - train_start_timestamp
        train_results = text_train_corpus.evaluate()

        eval_start_timestamp = time.time()
        text_eval_corpus.shuffle()
        for tweet in text_eval_corpus:
            feature_vector = text_feature_extractor.get_feature_vector(tweet)
            prediction = text_classifier.predict(feature_vector)
            tweet.prediction = max(prediction.items(), key=itemgetter(1))[0]
            predictions[tweet.tweet_id] = prediction
        eval_elapsed = time.time() - eval_start_timestamp
        eval_results = text_eval_corpus.evaluate()

        print('\rEpoch %i:' % epoch)
        print(pformat(eval_results) + '\n')
        print(' == Train Macro Avg F_Score: %.2f =='
              % train_results['macro_avg_f_score'])
        print(' == Train Accuracy: %.2f ==' % train_results['accuracy'])
        print(' == Eval Accuracy: %.2f ==\n' % eval_results['accuracy'])
        print(' Time elapsed: Train: %.2f Eval %.2f'
              % (train_elapsed, eval_elapsed))
        print(' %s\n' % ('-' * 78))

    # Stage 2: perceptron on image features plus the text-stage scores.
    image_train_corpus = Corpus(train_path, image_path, image_index_path,
                                image_res=(12, 12))
    image_eval_corpus = Corpus(eval_path, image_path, image_index_path,
                               image_res=(12, 12))
    image_feature_extractor = ImageFeatureExtractor(
        [image_train_corpus, image_eval_corpus], [predictions], caching=True)
    classes = image_feature_extractor.classes
    image_classifier = Perceptron(classes)

    for epoch in range(1, epochs + 1):
        train_start_timestamp = time.time()
        image_train_corpus.shuffle()
        for i, tweet in enumerate(image_train_corpus):
            feature_vector = image_feature_extractor.get_feature_vector(tweet)
            prediction = image_classifier.train(feature_vector, tweet.emotion)
            tweet.prediction = max(prediction.items(), key=itemgetter(1))[0]
            if i % 1000 == 0 and i != 0:
                print('\r%i' % i, file=sys.stderr, end='')
        train_elapsed = time.time() - train_start_timestamp
        train_results = image_train_corpus.evaluate()

        eval_start_timestamp = time.time()
        image_eval_corpus.shuffle()
        for tweet in image_eval_corpus:
            feature_vector = image_feature_extractor.get_feature_vector(tweet)
            prediction = image_classifier.predict(feature_vector)
            tweet.prediction = max(prediction.items(), key=itemgetter(1))[0]
        eval_elapsed = time.time() - eval_start_timestamp
        eval_results = image_eval_corpus.evaluate()

        print('\rEpoch %i:' % epoch)
        print(pformat(eval_results) + '\n')
        print(' == Train Macro Avg F_Score: %.2f =='
              % train_results['macro_avg_f_score'])
        print(' == Eval Macro Avg F_Score: %.2f =='
              % eval_results['macro_avg_f_score'])
        print(' == Train Accuracy: %.2f ==' % train_results['accuracy'])
        print(' == Eval Accuracy: %.2f ==\n' % eval_results['accuracy'])
        print(' Time elapsed: Train: %.2f Eval %.2f'
              % (train_elapsed, eval_elapsed))
        print(' %s\n' % ('-' * 78))
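# NOTE (illustration): the second stage stacks the text classifier's
# class scores on top of image features. Below is one way an
# ImageFeatureExtractor could combine the two sources. The feature
# scheme (score features plus flattened 12x12 grey-scale pixels, with
# `tweet.image` as a 2D intensity array) is an assumption, not the
# project's actual implementation.
def stacked_feature_vector(tweet, text_predictions, image_res=(12, 12)):
    """Illustrative stacked feature vector for the second-stage
    perceptron."""
    features = {}
    # Text-stage scores, one feature per emotion class.
    for emotion, score in text_predictions.get(tweet.tweet_id, {}).items():
        features['text_score_%s' % emotion] = score
    # Flattened low-resolution image intensities.
    for row in range(image_res[0]):
        for col in range(image_res[1]):
            features['pixel_%i_%i' % (row, col)] = tweet.image[row][col]
    return features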
def main(train_path: str, eval_path: str, epochs: int) -> None:
    """Trains a neural network (Lasagne/Theano) on the training corpus
    and evaluates it on the evaluation corpus after every epoch.

    Args:
        train_path: Path to the training corpus.
        eval_path: Path to the evaluation corpus.
        epochs: Maximum number of training epochs; training can also be
            stopped early with Ctrl-C.
    """
    train_corpus = Corpus(train_path)
    eval_corpus = Corpus(eval_path)

    # Build integer lookups for the vocabulary and the emotion labels,
    # with dedicated padding and unknown-word entries.
    word_lookup = Lookup()
    emotion_lookup = Lookup()
    i = 1
    j = 0
    for tweet in train_corpus:
        for word in tweet.content:
            if word not in word_lookup:
                word_lookup[word] = i
                i += 1
        tag = tweet.emotion
        if tag not in emotion_lookup:
            emotion_lookup[tag] = j
            j += 1
    word_lookup['__pad__'] = 0  # index 0 was reserved by starting i at 1
    word_lookup['__unk__'] = len(word_lookup)
    emotion_lookup['__pad__'] = len(emotion_lookup)
    emotion_lookup['__unk__'] = len(emotion_lookup)
    reverse_emotion_lookup = {i: x for x, i in emotion_lookup.items()}

    # Symbolic inputs: a batch of word-index sequences and the target
    # emotion indices.
    x1 = T.imatrix('x1')
    y = T.ivector('y')

    network = build_network(text_input_var=x1,
                            number_of_words=len(word_lookup),
                            word_embedding_length=30,
                            num_target=len(emotion_lookup))
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction, y)
    loss = T.mean(loss)
    # Alternative masked loss (requires an additional mask input m):
    # loss = lasagne.objectives.categorical_crossentropy(prediction, y) \
    #     * T.reshape(T.cast(m, 'float32'), [-1])
    # loss = T.sum(loss) / T.sum(m)
    params = lasagne.layers.get_all_params(network)
    updates = lasagne.updates.adam(loss, params)

    train = theano.function(inputs=[x1, y],
                            outputs=[prediction, loss],
                            updates=updates,
                            allow_input_downcast=True,
                            on_unused_input='warn')
    predict = theano.function(inputs=[x1],
                              outputs=[prediction],
                              allow_input_downcast=True,
                              on_unused_input='warn')

    epoch = 0
    prev_error = -1
    xts, tis, es = prepare_data(train_corpus, word_lookup, emotion_lookup)
    test_xts, test_tis, test_es = prepare_data(eval_corpus, word_lookup,
                                               emotion_lookup)
    try:
        while epoch < epochs:
            epoch += 1

            # Training pass over mini-batches.
            errors = []
            for texts, _, emotions in generate_batches(xts, tis, es,
                                                       word_lookup):
                ys, err = train(texts, emotions)
                errors.append(err)
            error = sum(errors) / len(errors)

            # Predict on the training corpus to measure training fit.
            for texts, tweet_ids, _ in generate_batches(xts, tis, es,
                                                        word_lookup):
                ys = predict(texts)[0]
                for y_, ti in zip(ys, tweet_ids):
                    pred = reverse_emotion_lookup[int(np.argmax(y_))]
                    train_corpus.get_tweet(ti).prediction = pred

            # Predict on the held-out evaluation corpus.
            for texts, tweet_ids, _ in generate_batches(test_xts, test_tis,
                                                        test_es, word_lookup):
                ys = predict(texts)[0]
                for y_, ti in zip(ys, tweet_ids):
                    pred = reverse_emotion_lookup[int(np.argmax(y_))]
                    eval_corpus.get_tweet(ti).prediction = pred

            train_results = train_corpus.evaluate()
            eval_results = eval_corpus.evaluate()

            if prev_error == -1:
                print('\rEpoch %i error: %.6f' % (epoch, error))
            else:
                err_delta = error - prev_error
                print('\rEpoch %i error: %.6f error_delta: %.6f'
                      % (epoch, error, err_delta))
            prev_error = error
            print(pformat(eval_results) + '\n')
            print(' == Train Macro Avg F_Score: %.2f =='
                  % train_results['macro_avg_f_score'])
            print(' == Eval Macro Avg F_Score: %.2f =='
                  % eval_results['macro_avg_f_score'])
            print(' == Train Accuracy: %.2f ==' % train_results['accuracy'])
            print(' == Eval Accuracy: %.2f ==\n' % eval_results['accuracy'])
            print(' %s\n' % ('-' * 78))
    except KeyboardInterrupt:
        pass
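# NOTE (illustration): `prepare_data` and `generate_batches` are not
# shown in this section. Below is a sketch consistent with their use
# above: sequences are right-padded to the batch maximum with the
# __pad__ index and unseen words map to __unk__. The batch size is an
# assumption; `Lookup` is assumed to support `in` and `[]` as above.
import numpy as np


def prepare_data_sketch(corpus, word_lookup, emotion_lookup):
    """Map every tweet to word indices, its tweet id and emotion index."""
    texts, tweet_ids, emotions = [], [], []
    for tweet in corpus:
        texts.append([word_lookup[w] if w in word_lookup
                      else word_lookup['__unk__'] for w in tweet.content])
        tweet_ids.append(tweet.tweet_id)
        emotions.append(emotion_lookup[tweet.emotion]
                        if tweet.emotion in emotion_lookup
                        else emotion_lookup['__unk__'])
    return texts, tweet_ids, emotions


def generate_batches_sketch(texts, tweet_ids, emotions, word_lookup,
                            batch_size=32):
    """Yield (texts, tweet_ids, emotions) mini-batches, right-padding
    every sequence to the longest one in its batch."""
    pad = word_lookup['__pad__']
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        max_len = max(len(seq) for seq in batch)
        padded = np.full((len(batch), max_len), pad, dtype='int32')
        for row, seq in enumerate(batch):
            padded[row, :len(seq)] = seq
        yield (padded,
               tweet_ids[start:start + batch_size],
               np.asarray(emotions[start:start + batch_size], dtype='int32'))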