from __future__ import division  # keep accuracy ratios real-valued under Python 2

from argparse import ArgumentParser
from timeit import default_timer as timer
import logging
import os
import sys
import pickle as pkl

import boto3
import gensim
import numpy as np
import torch
import torch.multiprocessing
import xlsxwriter as excel  # Workbook/worksheet API used below matches xlsxwriter (assumed module)
from pathlib2 import Path
from tensorboard_logger import configure, log_value
from termcolor import colored
from torch.autograd import Variable
from torch.utils.data import DataLoader
from tqdm import tqdm

import accuracy
import utils
from utils import maybe_cuda, getSegmentsFolders  # getSegmentsFolders location is assumed
from wiki_loader import WikipediaDataSet
# Project-local helpers; the import paths below are assumed, adjust to the repo layout.
from choiloader import ChoiDataset, collate_fn
from models import naive

torch.multiprocessing.set_sharing_strategy('file_system')

logger = logging.getLogger(__name__)  # the project may configure this logger via its own utils helpers

preds_stats = utils.predictions_analysis()


def softmax(x):
    # row-wise softmax, shifted by the row max for numerical stability
    max_each_row = np.max(x, axis=1, keepdims=True)
    exps = np.exp(x - max_each_row)
    sums = np.sum(exps, axis=1, keepdims=True)
    return exps / sums


def import_model(model_name):
    # load models/<model_name>.py and return the model built by its create() factory
    module = __import__('models.' + model_name, fromlist=['models'])
    return module.create()


class Accuracies(object):
    pass  # threshold-sweep helper; body not shown in this file
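# Hedged usage sketch for import_model(): it loads models/<model_name>.py from the
# local ``models`` package and returns whatever its create() factory builds, so every
# model module is expected to expose create(). The model name below is hypothetical:
#
#     model = maybe_cuda(import_model('max_sentence_embedding'))
#     model.eval()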
def main(args):
    start = timer()

    sys.path.append(str(Path(__file__).parent))

    utils.read_config_file(args.config)
    utils.config.update(args.__dict__)
    logger.debug('Running with config %s', utils.config)
    print('Running with threshold: ' + str(args.seg_threshold))
    preds_stats = utils.predictions_analysis()

    # word2vec is only needed when sentences actually have to be embedded
    if not args.test:
        word2vec = gensim.models.KeyedVectors.load_word2vec_format(utils.config['word2vecfile'], binary=True)
    else:
        word2vec = None

    word2vec_done = timer()
    print('Loading word2vec elapsed: ' + str(word2vec_done - start) + ' seconds')
    dirname = 'test'

    # choose the evaluation corpora: Wikipedia test split, an explicit folder, or the Choi dataset
    if args.wiki:
        dataset_folders = [Path(utils.config['wikidataset']) / dirname]
        if args.wiki_folder:
            dataset_folders = []
            dataset_folders.append(args.wiki_folder)
        print('running on Wikipedia')
    else:
        if args.bySegLength:
            dataset_folders = getSegmentsFolders(utils.config['choidataset'])
            print('running on Choi by segment length')
        else:
            dataset_folders = [utils.config['choidataset']]
            print('running on Choi')

    with open(args.model, 'rb') as f:
        model = torch.load(f)

    model = maybe_cuda(model)
    model.eval()

    if args.naive:
        model = naive.create()

    for dataset_path in dataset_folders:

        if args.bySegLength:
            print('Segment is ', os.path.basename(dataset_path), ' :')

        if args.wiki:
            if args.wiki_folder:
                dataset = WikipediaDataSet(dataset_path, word2vec, folder=True, high_granularity=False)
            else:
                dataset = WikipediaDataSet(dataset_path, word2vec, high_granularity=False)
        else:
            dataset = ChoiDataset(dataset_path, word2vec)

        dl = DataLoader(dataset, batch_size=args.bs, collate_fn=collate_fn, shuffle=False)

        with tqdm(desc='Testing', total=len(dl)) as pbar:
            total_accurate = 0
            total_count = 0
            total_loss = 0
            acc = accuracy.Accuracy()

            for i, (data, targets, paths) in enumerate(dl):
                if i == args.stop_after:
                    break

                pbar.update()
                output = model(data)
                targets_var = Variable(maybe_cuda(torch.cat(targets, 0), args.cuda), requires_grad=False)
                batch_loss = 0
                output_prob = softmax(output.data.cpu().numpy())
                output_seg = output_prob[:, 1] > args.seg_threshold
                target_seg = targets_var.data.cpu().numpy()
                batch_accurate = (output_seg == target_seg).sum()
                total_accurate += batch_accurate
                total_count += len(target_seg)
                total_loss += batch_loss
                preds_stats.add(output_seg, target_seg)

                # split the flat batch output back into documents and update the Pk accumulator
                current_target_idx = 0
                for k, t in enumerate(targets):
                    document_sentence_count = len(t)
                    sentences_length = [s.size()[0] for s in data[k]] if args.calc_word else None
                    to_idx = int(current_target_idx + document_sentence_count)
                    h = output_seg[current_target_idx:to_idx]

                    # hypothesis and targets are missing the classification of the last sentence,
                    # therefore we add a boundary (1) to both
                    h = np.append(h, [1])
                    t = np.append(t.cpu().numpy(), [1])

                    acc.update(h, t, sentences_length=sentences_length)

                    current_target_idx = to_idx

                logger.debug('Batch %s - error %7.4f, Accuracy: %7.4f', i, batch_loss,
                             batch_accurate / len(target_seg))
                pbar.set_description('Testing, Accuracy={:.4}'.format(batch_accurate / len(target_seg)))

        average_loss = total_loss / len(dl)
        average_accuracy = total_accurate / total_count
        calculated_pk, _ = acc.calc_accuracy()

        logger.info('Finished testing.')
        logger.info('Average loss: %s', average_loss)
        logger.info('Average accuracy: %s', average_accuracy)
        logger.info('Pk: {:.4}.'.format(calculated_pk))
        logger.info('F1: {:.4}.'.format(preds_stats.get_f1()))

    end = timer()
    print('Seconds to execute the whole flow: ' + str(end - start))
def main(args):
    start = timer()

    sys.path.append(str(Path(__file__).parent))

    utils.read_config_file(args.config)
    utils.config.update(args.__dict__)
    logger.debug('Running with config %s', utils.config)
    print('Running with threshold: ' + str(args.seg_threshold))
    preds_stats = utils.predictions_analysis()

    probs_stats = [[], []]  # [0]: per-batch softmax outputs, [1]: per-batch targets
    article_stats = []      # per-batch list of document sentence counts
    export = []             # per-article boundary probabilities for the pickle export
    # samples = []

    # Amazon S3 bucket that holds the trained model and receives the outputs
    s3 = boto3.resource('s3')
    # s3 = boto3.client('s3', profile_name='signal-rnd')
    mybucket = s3.Bucket('data.data-science.signal')
    myfolder = 'summaries-segmentation'
    # pullBucketSamples(mybucket, myfolder + '/samples')
    print('Samples pulled successfully into container')

    workbook = excel.Workbook('output.xlsx')
    # workbook = excel.Workbook('/output/output.xlsx')  # when running from a container
    worksheet = workbook.add_worksheet()

    if not args.test:
        # key = myfolder + utils.config['word2vecfile']
        # word2vec = gensim.models.KeyedVectors.load_word2vec_format(mybucket.Object(key).get()['Body'].read(), binary=True)
        # word2vec = gensim.models.KeyedVectors.load_word2vec_format(io.BytesIO(mybucket.Object(key).get()['Body'].read()), binary=True)
        word2vec = gensim.models.KeyedVectors.load_word2vec_format(utils.config['word2vecfile'], binary=True)
        # response = urllib2.urlopen('https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing')
        # word2vec = gensim.models.KeyedVectors.load_word2vec_format(response.read(), binary=True)
        # mybucket.Object(key).download_file('GoogleNews_vectors')
        # word2vec = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews_vectors', binary=True)
    else:
        word2vec = None

    word2vec_done = timer()
    print('Loading word2vec elapsed: ' + str(word2vec_done - start) + ' seconds')
    dirname = 'test'

    if args.wiki:
        dataset_folders = [Path(utils.config['wikidataset']) / dirname]
        if args.wiki_folder:
            dataset_folders = []
            dataset_folders.append(args.wiki_folder)
        print('running on Wikipedia')
    else:
        if args.bySegLength:
            dataset_folders = getSegmentsFolders(utils.config['choidataset'])
            print('running on Choi by segment length')
        else:
            dataset_folders = [utils.config['choidataset']]
            print('running on Choi')

    # fetch the trained model from S3 and load it
    key = myfolder + args.model
    # model = torch.load(mybucket.Object(key).get()['Body'].read())
    # fileobj = io.BytesIO()
    # mybucket.Object(key).download_fileobj(fileobj)
    mybucket.Object(key).download_file('trained_model')
    # with open(args.model, 'rb') as f:
    with open('trained_model', 'rb') as f:
        model = torch.load(f)

    model = maybe_cuda(model)
    model.eval()

    if args.naive:
        model = naive.create()

    for dataset_path in dataset_folders:

        if args.bySegLength:
            print('Segment is ', os.path.basename(dataset_path), ' :')

        if args.wiki:
            if args.wiki_folder:
                dataset = WikipediaDataSet(dataset_path, word2vec, folder=True, high_granularity=False)
            else:
                dataset = WikipediaDataSet(dataset_path, word2vec, high_granularity=False)
        else:
            dataset = ChoiDataset(dataset_path, word2vec)

        dl = DataLoader(dataset, batch_size=args.bs, collate_fn=collate_fn, shuffle=False)

        with tqdm(desc='Testing', total=len(dl)) as pbar:
            total_accurate = 0
            total_count = 0
            total_loss = 0
            acc = accuracy.Accuracy()

            for i, (data, targets, paths) in enumerate(dl):
                if i == args.stop_after:
                    break

                pbar.update()
                output = model(data)
                targets_var = Variable(maybe_cuda(torch.cat(targets, 0), args.cuda), requires_grad=False)
                batch_loss = 0
                output_prob = softmax(output.data.cpu().numpy())
                # if i < 5:
                #     print output_prob.shape
                probs_stats[0].append(output_prob.tolist())
                # samples.append(data)
                output_seg = output_prob[:, 1] > args.seg_threshold
                target_seg = targets_var.data.cpu().numpy()
                probs_stats[1].append(target_seg.tolist())
                batch_accurate = (output_seg == target_seg).sum()
                total_accurate += batch_accurate
                total_count += len(target_seg)
                total_loss += batch_loss
                preds_stats.add(output_seg, target_seg)

                current_target_idx = 0
                article_stats.append([])
                for k, t in enumerate(targets):
                    document_sentence_count = len(t)
                    article_stats[i].append(document_sentence_count)
                    sentences_length = [s.size()[0] for s in data[k]] if args.calc_word else None
                    to_idx = int(current_target_idx + document_sentence_count)
                    h = output_seg[current_target_idx:to_idx]

                    # hypothesis and targets are missing the classification of the last sentence,
                    # therefore we add a boundary (1) to both
                    h = np.append(h, [1])
                    t = np.append(t.cpu().numpy(), [1])

                    acc.update(h, t, sentences_length=sentences_length)

                    current_target_idx = to_idx

                logger.debug('Batch %s - error %7.4f, Accuracy: %7.4f', i, batch_loss,
                             batch_accurate / len(target_seg))
                pbar.set_description('Testing, Accuracy={:.4}'.format(batch_accurate / len(target_seg)))

        average_loss = total_loss / len(dl)
        average_accuracy = total_accurate / total_count
        calculated_pk, _ = acc.calc_accuracy()

        # write per-sentence boundary probability and target to Excel, one column pair per article
        article = 0
        for batch, probs in enumerate(probs_stats[0]):
            boundary = 0
            for sentences in article_stats[batch]:
                export.append([])
                for sentence in range(0, sentences):
                    export[article].append(probs[boundary][1])
                    worksheet.write(sentence, 2 * article, probs[boundary][1])
                    worksheet.write(sentence, 2 * article + 1, probs_stats[1][batch][boundary])
                    # worksheet.write(sentence, 3 * article + 2, " ".join(samples[batch][boundary][:5]))
                    boundary += 1
                article += 1

        # save the exported probabilities as a pickle
        # data_out = np.asarray(export)
        with open('LSTM_probs.pkl', 'wb') as f:
            # with open('/output/LSTM_probs.pkl', 'wb') as f:  # when running from a container
            pkl.dump({'probs': export}, f, pkl.HIGHEST_PROTOCOL)  # , 'labels': y_train}, f, pkl.HIGHEST_PROTOCOL)

        workbook.close()

        # upload the results back to S3
        key = myfolder + '/testing/softmax_probs.jsonl'
        mybucket.Object(key).upload_file('LSTM_probs.pkl')
        key = myfolder + '/testing/output.xlsx'
        mybucket.Object(key).upload_file('output.xlsx')

        logger.info('Finished testing.')
        logger.info('Average loss: %s', average_loss)
        logger.info('Average accuracy: %s', average_accuracy)
        logger.info('Pk: {:.4}.'.format(calculated_pk))
        logger.info('F1: {:.4}.'.format(preds_stats.get_f1()))

    end = timer()
    print('Seconds to execute the whole flow: ' + str(end - start))
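# Minimal entry-point sketch (an assumption: the original argument-parsing block is not
# part of this file as shown). The flags below are inferred from the args.* attributes
# used above; their defaults and help strings are guesses, not the original values.
# Note that this invokes the later main() definition, i.e. the S3-exporting variant,
# since it shadows the earlier one.
if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--config', help='Path to the json config file', default='config.json')
    parser.add_argument('--model', help='Path (or bucket key suffix) of the trained model', required=True)
    parser.add_argument('--cuda', help='Run on GPU', action='store_true')
    parser.add_argument('--test', help='Skip loading word2vec (debug/test mode)', action='store_true')
    parser.add_argument('--bs', help='Batch size', type=int, default=8)
    parser.add_argument('--stop_after', help='Stop after this many batches', type=int, default=None)
    parser.add_argument('--seg_threshold', help='Probability threshold for a segment boundary', type=float, default=0.4)
    parser.add_argument('--wiki', help='Evaluate on the Wikipedia dataset', action='store_true')
    parser.add_argument('--wiki_folder', help='Evaluate on an explicit folder of wiki documents', default=None)
    parser.add_argument('--bySegLength', help='Evaluate Choi splits grouped by segment length', action='store_true')
    parser.add_argument('--naive', help='Use the naive baseline model instead of the loaded one', action='store_true')
    parser.add_argument('--calc_word', help='Pass per-sentence word counts to the accuracy calculator', action='store_true')
    main(parser.parse_args())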