# Ejemplo n.º 1 (score: 0) — scrape-page separator, kept as a comment
from argparse import ArgumentParser
from utils import maybe_cuda
import gensim
import utils
from tensorboard_logger import configure, log_value
import os
import sys
from pathlib2 import Path
from wiki_loader import WikipediaDataSet
import accuracy
import numpy as np
from termcolor import colored

# Share tensors between DataLoader workers via the filesystem instead of file
# descriptors (avoids "too many open files" with many workers).
# NOTE(review): `torch` is not imported in this chunk — presumably imported
# elsewhere in the original file; confirm.
torch.multiprocessing.set_sharing_strategy('file_system')

# Module-level prediction-statistics accumulator; both `main` variants below
# shadow this name with a local of their own.
preds_stats = utils.predictions_analysis()


def softmax(x):
    """Row-wise softmax of a 2-D array.

    Subtracts each row's maximum before exponentiating so large logits do
    not overflow; rows of the result sum to 1.
    """
    shifted = x - np.max(x, axis=1, keepdims=True)
    numerators = np.exp(shifted)
    return numerators / numerators.sum(axis=1, keepdims=True)


def import_model(model_name):
    """Dynamically load ``models.<model_name>`` and return the model built
    by that submodule's ``create()`` factory."""
    qualified = 'models.' + model_name
    # fromlist forces __import__ to return the submodule, not the package.
    submodule = __import__(qualified, fromlist=['models'])
    return submodule.create()


class Accuracies(object):
def main(args):
    start = timer()

    sys.path.append(str(Path(__file__).parent))

    utils.read_config_file(args.config)
    utils.config.update(args.__dict__)

    logger.debug('Running with config %s', utils.config)
    print('Running with threshold: ' + str(args.seg_threshold))
    preds_stats = utils.predictions_analysis()

    if not args.test:
        word2vec = gensim.models.KeyedVectors.load_word2vec_format(
            utils.config['word2vecfile'], binary=True)
    else:
        word2vec = None

    word2vec_done = timer()
    print 'Loading word2vec ellapsed: ' + str(word2vec_done -
                                              start) + ' seconds'
    dirname = 'test'

    if args.wiki:
        dataset_folders = [Path(utils.config['wikidataset']) / dirname]
        if (args.wiki_folder):
            dataset_folders = []
            dataset_folders.append(args.wiki_folder)
        print 'running on wikipedia'
    else:
        if (args.bySegLength):
            dataset_folders = getSegmentsFolders(utils.config['choidataset'])
            print 'run on choi by segments length'
        else:
            dataset_folders = [utils.config['choidataset']]
            print 'running on Choi'

    with open(args.model, 'rb') as f:
        model = torch.load(f)

    model = maybe_cuda(model)
    model.eval()

    if (args.naive):
        model = naive.create()

    for dataset_path in dataset_folders:

        if (args.bySegLength):
            print 'Segment is ', os.path.basename(dataset_path), " :"

        if args.wiki:
            if (args.wiki_folder):
                dataset = WikipediaDataSet(dataset_path,
                                           word2vec,
                                           folder=True,
                                           high_granularity=False)
            else:
                dataset = WikipediaDataSet(dataset_path,
                                           word2vec,
                                           high_granularity=False)
        else:
            dataset = ChoiDataset(dataset_path, word2vec)

        dl = DataLoader(dataset,
                        batch_size=args.bs,
                        collate_fn=collate_fn,
                        shuffle=False)

        with tqdm(desc='Testing', total=len(dl)) as pbar:
            total_accurate = 0
            total_count = 0
            total_loss = 0
            acc = accuracy.Accuracy()

            for i, (data, targets, paths) in enumerate(dl):
                if i == args.stop_after:
                    break

                pbar.update()
                output = model(data)
                targets_var = Variable(maybe_cuda(torch.cat(targets, 0),
                                                  args.cuda),
                                       requires_grad=False)
                batch_loss = 0
                output_prob = softmax(output.data.cpu().numpy())
                output_seg = output_prob[:, 1] > args.seg_threshold
                target_seg = targets_var.data.cpu().numpy()
                batch_accurate = (output_seg == target_seg).sum()
                total_accurate += batch_accurate
                total_count += len(target_seg)
                total_loss += batch_loss
                preds_stats.add(output_seg, target_seg)

                current_target_idx = 0
                for k, t in enumerate(targets):
                    document_sentence_count = len(t)
                    sentences_length = [s.size()[0] for s in data[k]
                                        ] if args.calc_word else None
                    to_idx = int(current_target_idx + document_sentence_count)
                    h = output_seg[current_target_idx:to_idx]

                    # hypothesis and targets are missing classification of last sentence, and therefore we will add
                    # 1 for both
                    h = np.append(h, [1])
                    t = np.append(t.cpu().numpy(), [1])

                    acc.update(h, t, sentences_length=sentences_length)

                    current_target_idx = to_idx

                logger.debug('Batch %s - error %7.4f, Accuracy: %7.4f', i,
                             batch_loss, batch_accurate / len(target_seg))
                pbar.set_description('Testing, Accuracy={:.4}'.format(
                    batch_accurate / len(target_seg)))

        average_loss = total_loss / len(dl)
        average_accuracy = total_accurate / total_count
        calculated_pk, _ = acc.calc_accuracy()

        logger.info('Finished testing.')
        logger.info('Average loss: %s', average_loss)
        logger.info('Average accuracy: %s', average_accuracy)
        logger.info('Pk: {:.4}.'.format(calculated_pk))
        logger.info('F1: {:.4}.'.format(preds_stats.get_f1()))

        end = timer()
        print('Seconds to execute to whole flow: ' + str(end - start))
# Ejemplo n.º 3 (score: 0) — scrape-page separator, kept as a comment
def main(args):
    """Evaluate a saved segmentation model and export per-sentence softmax
    probabilities.

    Variant of the plain evaluation ``main``: downloads the trained model
    from an S3 bucket, records every sentence's positive-class probability
    next to its gold label, writes them to an Excel sheet and a pickle file,
    and uploads both artifacts back to S3.

    NOTE(review): this redefines ``main`` and shadows the earlier version in
    the same module. Several names used here (torch, timer, logger, Variable,
    DataLoader, tqdm, collate_fn, ChoiDataset, naive, getSegmentsFolders,
    boto3, excel, pkl) are not imported in this chunk — presumably imported
    elsewhere in the original file.
    """
    start = timer()

    sys.path.append(str(Path(__file__).parent))

    utils.read_config_file(args.config)
    utils.config.update(args.__dict__)

    logger.debug('Running with config %s', utils.config)
    print('Running with threshold: ' + str(args.seg_threshold))
    # Local stats accumulator (shadows the module-level `preds_stats`).
    preds_stats = utils.predictions_analysis()
    # probs_stats[0]: per-batch model probabilities;
    # probs_stats[1]: per-batch gold labels.
    probs_stats = [[], []]
    # article_stats[batch] holds the sentence count of each document in that
    # batch; used below to slice the flat probability vectors per article.
    article_stats = []
    export = []
    #samples = []

    # Let's use Amazon S3
    s3 = boto3.resource(
        's3')  #s3 = boto3.client('s3', profile_name='signal-rnd')
    mybucket = s3.Bucket('data.data-science.signal')
    myfolder = 'summaries-segmentation'
    #pullBucketSamples(mybucket, myfolder+'/samples')
    print('Samples pulled successfully into container')

    workbook = excel.Workbook('output.xlsx')
    #workbook = excel.Workbook('/output/output.xlsx')#when running from container
    worksheet = workbook.add_worksheet()

    # word2vec is only needed to embed real input; skip the slow load in
    # --test mode.
    if not args.test:
        #key = myfolder + utils.config['word2vecfile']
        #word2vec = gensim.models.KeyedVectors.load_word2vec_format(mybucket.Object(key).get()['Body'].read(), binary=True)
        #word2vec = gensim.models.KeyedVectors.load_word2vec_format(io.BytesIO(mybucket.Object(key).get()['Body'].read()), binary=True)
        word2vec = gensim.models.KeyedVectors.load_word2vec_format(
            utils.config['word2vecfile'], binary=True)
        #response = urllib2.urlopen('https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing')
        #word2vec = gensim.models.KeyedVectors.load_word2vec_format(response.read(), binary=True)

        #mybucket.Object(key).download_file('GoogleNews_vectors')
        #word2vec = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews_vectors', binary=True)
    else:
        word2vec = None

    word2vec_done = timer()
    print 'Loading word2vec ellapsed: ' + str(word2vec_done -
                                              start) + ' seconds'
    dirname = 'test'

    # Resolve which dataset folder(s) to evaluate on.
    if args.wiki:
        dataset_folders = [Path(utils.config['wikidataset']) / dirname]
        if (args.wiki_folder):
            dataset_folders = []
            dataset_folders.append(args.wiki_folder)
        print 'running on wikipedia'
    else:
        if (args.bySegLength):
            dataset_folders = getSegmentsFolders(utils.config['choidataset'])
            print 'run on choi by segments length'
        else:
            dataset_folders = [utils.config['choidataset']]
            print 'running on Choi'

    # Fetch the pickled model from S3 into the working directory.
    key = myfolder + args.model
    #model = torch.load(mybucket.Object(key).get()['Body'].read())
    #fileobj = io.BytesIO()
    #mybucket.Object(key).download_fileobj(fileobj)
    mybucket.Object(key).download_file('trained_model')

    #with open(args.model, 'rb') as f:
    with open('trained_model', 'rb') as f:
        model = torch.load(f)

    model = maybe_cuda(model)
    model.eval()

    if (args.naive):
        model = naive.create()

    for dataset_path in dataset_folders:

        if (args.bySegLength):
            print 'Segment is ', os.path.basename(dataset_path), " :"

        if args.wiki:
            if (args.wiki_folder):
                dataset = WikipediaDataSet(dataset_path,
                                           word2vec,
                                           folder=True,
                                           high_granularity=False)
            else:
                dataset = WikipediaDataSet(dataset_path,
                                           word2vec,
                                           high_granularity=False)
        else:
            dataset = ChoiDataset(dataset_path, word2vec)

        dl = DataLoader(dataset,
                        batch_size=args.bs,
                        collate_fn=collate_fn,
                        shuffle=False)

        with tqdm(desc='Testing', total=len(dl)) as pbar:
            total_accurate = 0
            total_count = 0
            total_loss = 0
            acc = accuracy.Accuracy()

            for i, (data, targets, paths) in enumerate(dl):
                if i == args.stop_after:
                    break

                pbar.update()
                output = model(data)
                targets_var = Variable(maybe_cuda(torch.cat(targets, 0),
                                                  args.cuda),
                                       requires_grad=False)
                batch_loss = 0
                output_prob = softmax(output.data.cpu().numpy())
                #if i < 5:
                #print output_prob.shape
                # Keep the raw probabilities for the export step below.
                probs_stats[0].append(output_prob.tolist())
                #samples.append(data)
                # Positive-class probability above the threshold marks a
                # segment boundary.
                output_seg = output_prob[:, 1] > args.seg_threshold
                target_seg = targets_var.data.cpu().numpy()
                probs_stats[1].append(target_seg.tolist())
                batch_accurate = (output_seg == target_seg).sum()
                total_accurate += batch_accurate
                total_count += len(target_seg)
                total_loss += batch_loss
                preds_stats.add(output_seg, target_seg)

                # Split the flat prediction vector back into per-document
                # slices so Pk can be computed per document.
                current_target_idx = 0
                article_stats.append([])
                for k, t in enumerate(targets):
                    document_sentence_count = len(t)
                    # `article_stats[i]` is the list appended just above
                    # (equivalent to article_stats[-1]).
                    article_stats[i].append(document_sentence_count)
                    sentences_length = [s.size()[0] for s in data[k]
                                        ] if args.calc_word else None
                    to_idx = int(current_target_idx + document_sentence_count)
                    h = output_seg[current_target_idx:to_idx]

                    # hypothesis and targets are missing classification of last sentence, and therefore we will add
                    # 1 for both
                    h = np.append(h, [1])
                    t = np.append(t.cpu().numpy(), [1])

                    acc.update(h, t, sentences_length=sentences_length)

                    current_target_idx = to_idx

                # NOTE(review): under Python 2 `batch_accurate /
                # len(target_seg)` is integer division, so the logged
                # accuracy floors to 0 or 1 — wrap in float() to fix.
                logger.debug('Batch %s - error %7.4f, Accuracy: %7.4f', i,
                             batch_loss, batch_accurate / len(target_seg))
                pbar.set_description('Testing, Accuracy={:.4}'.format(
                    batch_accurate / len(target_seg)))

        average_loss = total_loss / len(dl)
        # NOTE(review): same Python 2 integer-division issue as above.
        average_accuracy = total_accurate / total_count
        calculated_pk, _ = acc.calc_accuracy()

        # Re-slice the flat per-batch probability/label lists into one
        # spreadsheet column pair per article (col 2a = probability,
        # col 2a+1 = gold label; one row per sentence).
        # NOTE(review): `article` resets to 0 for every dataset folder while
        # `export`/`probs_stats` persist, so with multiple folders later
        # probabilities are appended onto earlier articles' rows — appears to
        # assume a single folder; confirm.
        article = 0
        for batch, probs in enumerate(probs_stats[0]):
            boundary = 0
            for sentences in article_stats[batch]:
                export.append([])
                for sentence in range(0, sentences):
                    export[article].append(probs[boundary][1])
                    worksheet.write(sentence, 2 * article, probs[boundary][1])
                    worksheet.write(sentence, 2 * article + 1,
                                    probs_stats[1][batch][boundary])
                    #worksheet.write(sentence, 3*article + 2, " ".join(samples[batch][boundary][:5]))
                    boundary += 1
                article += 1

        #Save dataset as pickle
        #data_out = np.asarray(export)
        with open('LSTM_probs.pkl', 'wb') as f:
            #with open('/output/LSTM_probs.pkl', 'wb') as f:#when rnuning from container
            pkl.dump({'probs': export}, f, pkl.HIGHEST_PROTOCOL
                     )  #, 'labels': y_train }, f, pkl.HIGHEST_PROTOCOL)
        workbook.close()

        # NOTE(review): the S3 key ends in .jsonl but the uploaded file is a
        # pickle — confirm the key name is intentional.
        key = myfolder + '/testing/softmax_probs.jsonl'
        mybucket.Object(key).upload_file('LSTM_probs.pkl')
        key = myfolder + '/testing/output.xlsx'
        mybucket.Object(key).upload_file('output.xlsx')

        logger.info('Finished testing.')
        logger.info('Average loss: %s', average_loss)
        logger.info('Average accuracy: %s', average_accuracy)
        logger.info('Pk: {:.4}.'.format(calculated_pk))
        logger.info('F1: {:.4}.'.format(preds_stats.get_f1()))

        end = timer()
        print('Seconds to execute to whole flow: ' + str(end - start))