Ejemplo n.º 1
0
def get_top_sectors():
    """Return a JSON list of {'from', 'to'} sector pairs whose connecting
    routes mention one of the most-tweeted avenues.

    Reads the pickled Counter of route mentions from 'counter.txt', then
    scans every edge of the sector graph for route names that contain one
    of those avenues.
    """
    synonyms = load_synonyms('./datasets/sinonimos.csv')
    synonyms1 = load_synonyms('./datasets/sinonimos2.csv')
    dictionary = load_words()
    stop_words = load_stop_words('./datasets/stop-words.txt')
    # NOTE(review): loaded but unused here; kept in case load_routes has
    # side effects — confirm it can be dropped.
    routes = load_routes('./datasets/routes.txt')

    # Pickle data is binary: open with 'rb' so the load also works on
    # Python 3 (text mode only happened to work on Python 2).
    with open('counter.txt', 'rb') as fp:
        counter = pickle.load(fp)

    top_routes = list(set(counter.elements()))

    sector_graph = get_graph()
    top_sectors = []

    # Normalize every edge route ONCE instead of re-running process_tweet
    # for each avenue (the original recomputed it per avenue per edge).
    processed_edges = []
    for (x, y) in sector_graph.edges():
        for route in sector_graph.edge[x][y]['routes']:
            processed = process_tweet(route, synonyms, synonyms1,
                                      dictionary, stop_words)
            processed_edges.append((x, y, processed))

    for avenue in top_routes:
        for (x, y, processed) in processed_edges:
            if processed.find(avenue) > -1:
                top_sectors.append({'from': x, 'to': y})

    return json.dumps(top_sectors)
Ejemplo n.º 2
0
def get_top_sectors():
    """Return a JSON list of {'from', 'to'} sector pairs whose connecting
    routes mention one of the most-tweeted avenues.

    Reads the pickled Counter of route mentions from 'counter.txt', then
    scans every edge of the sector graph for route names that contain one
    of those avenues.
    """
    synonyms = load_synonyms('./datasets/sinonimos.csv')
    synonyms1 = load_synonyms('./datasets/sinonimos2.csv')
    dictionary = load_words()
    stop_words = load_stop_words('./datasets/stop-words.txt')
    # NOTE(review): loaded but unused here; kept in case load_routes has
    # side effects — confirm it can be dropped.
    routes = load_routes('./datasets/routes.txt')

    # Pickle data is binary: open with 'rb' so the load also works on
    # Python 3 (text mode only happened to work on Python 2).
    with open('counter.txt', 'rb') as fp:
        counter = pickle.load(fp)

    top_routes = list(set(counter.elements()))

    sector_graph = get_graph()
    top_sectors = []

    # Run process_tweet once per edge route instead of once per
    # (avenue, edge route) pair as the original did — same results,
    # far less repeated work.
    processed_edges = []
    for (x, y) in sector_graph.edges():
        for route in sector_graph.edge[x][y]['routes']:
            processed = process_tweet(route, synonyms, synonyms1,
                                      dictionary, stop_words)
            processed_edges.append((x, y, processed))

    for avenue in top_routes:
        for (x, y, processed) in processed_edges:
            if processed.find(avenue) > -1:
                top_sectors.append({'from': x, 'to': y})

    return json.dumps(top_sectors)
Ejemplo n.º 3
0
def count_routes():
    """Count mentions of each known route across the retrieved tweets and
    persist the running totals as a pickled Counter.
    """
    synonyms = load_synonyms('./datasets/sinonimos.csv')
    synonyms1 = load_synonyms('./datasets/sinonimos2.csv')
    dictionary = load_words()
    stop_words = load_stop_words('./datasets/stop-words.txt')
    routes = load_routes('./datasets/routes.txt')
    tweets = retrieve_tweets()
    counter = Counter()

    # Resume from previous totals when a non-empty counter file exists.
    # Pickle data is binary, so open with 'rb' (required on Python 3).
    # NOTE(review): the counter is read from './datasets/counter.txt' but
    # written to 'counter.txt' below — confirm which path is intended.
    if not file_is_empty('./datasets/counter.txt'):
        with open('./datasets/counter.txt', 'rb') as fp:
            counter = pickle.load(fp)

    for tweet in tweets:
        tweet_words = process_tweet(tweet.text, synonyms, synonyms1,
                                    dictionary, stop_words)
        # Search the normalized tweet text for each route name and count it.
        for route in routes:
            if re.search(route, tweet_words):
                counter[route] += 1

    print(counter)
    # The context manager closes the file; the explicit close() inside the
    # 'with' block was redundant and has been removed.
    with open('counter.txt', 'wb') as fp:
        pickle.dump(counter, fp)
Ejemplo n.º 4
0
def get_traffic(**kwargs):
    """Return the process-wide traffic classifier wrapper, creating it lazily.

    On the first call this either restores a previously serialized wrapper
    from 'wrappers/traffic_wrapper.json' or trains a new classifier on
    './datasets/traffic2.csv' and serializes it for later runs.

    Keyword Args:
        clf: classifier instance to train; defaults to
            LogisticRegression(C=8.5).
        cross_validate: when True (the default), cross-validate before
            training.

    Returns:
        ClassifierWrapper: the cached global TRAFFIC_WRAPPER.
    """
    global TRAFFIC_WRAPPER
    # t0 = time.time()
    if TRAFFIC_WRAPPER is None:
        wrapperFile = 'wrappers/traffic_wrapper.json'
        synonyms = load_synonyms('./datasets/sinonimos.csv')
        words = load_words()

        # Fast path: rebuild the wrapper from its JSON serialization and
        # re-attach the dataset, synonyms and word lists that are not part
        # of the serialized form.
        if os.path.isfile(wrapperFile):
            with open(wrapperFile,'r+') as rwjson:
                TRAFFIC_WRAPPER = ClassifierWrapper()
                TRAFFIC_WRAPPER.jsonLoads(rwjson.read())
                TRAFFIC_WRAPPER.dataset.dataset = list(load_file('./datasets/traffic2.csv'))
                # deepcopy keeps this wrapper from sharing mutable state
                # with other wrappers built from the same loaded lists.
                TRAFFIC_WRAPPER.synonyms = copy.deepcopy(synonyms)
                TRAFFIC_WRAPPER.words = copy.deepcopy(words)
                TRAFFIC_WRAPPER.dataset.synonyms = copy.deepcopy(synonyms)
                TRAFFIC_WRAPPER.dataset.words = copy.deepcopy(words)
                return TRAFFIC_WRAPPER

        # Slow path: train a fresh classifier from the CSV dataset.
        clf = kwargs.pop('clf', LogisticRegression(C=8.5))
        dataWrapperDataset = list(load_file('./datasets/traffic2.csv'))
        dataWrapper = DataWrapper(dataset=dataWrapperDataset,synonyms=copy.deepcopy(synonyms),words=copy.deepcopy(words))
        dataWrapper.resolveMatrix()

        wrapper = ClassifierWrapper(clf=clf,dataset=dataWrapper,synonyms=copy.deepcopy(synonyms),words=copy.deepcopy(words))
        cross_validate = kwargs.pop('cross_validate', True)
        if cross_validate:
            wrapper.cross_validate()
        wrapper.train()
        # print time.time() - t0, "seconds from the multiclass classifier"
        TRAFFIC_WRAPPER = wrapper
        # Persist the trained wrapper so subsequent runs take the fast path.
        with open(wrapperFile, 'w') as rw_json:
            json.dump(TRAFFIC_WRAPPER.toDict(), rw_json)
    return TRAFFIC_WRAPPER
Ejemplo n.º 5
0
def run_word_vectors():
    """Build vectors for the known vocabulary from the pre-trained binary
    embeddings and dump the id->vector mapping to JSON.
    """
    print('reading nyt_vec.bin')
    all_w2vec = utils.read_vec_bin()
    words2id = utils.load_words()
    print('prepare w2vec')
    w2vec = utils.word_vectors(words2id, all_w2vec)
    print('dumping')
    # Use a context manager so the output file is flushed and closed even if
    # json.dump raises (the original leaked the handle from a bare open()).
    with open(Const.words_id2vector_filename, 'w') as out_file:
        json.dump(w2vec, out_file)
Ejemplo n.º 6
0
def find_keyword(args):
    print "loading data"
    if args.word_vec_file == '':
        w2v_file = 'GoogleNews-vectors-negative300.bin'
        keywords = load_words()
        vocab = keywords
        w2v = load_bin_vec(w2v_file, vocab)
    else:
        w2v = cPickle.load(open(args.word_vec_file, "r"))
    print "finish loading data"
    W = dict2Mat(w2v)
    kmeans = KMeans(n_clusters=args.keyword_num, random_state=0).fit(W)
    # save index to file
    cPickle.dump(kmeans.labels_, open(args.idx_save_file, "wb"))
    # get center vectors
    ctr_vecs = np.zeros(shape=(args.keyword_num, W.shape[1]))
    for i in range(args.keyword_num):
        ctr_vecs[i] = np.mean(W[kmeans.labels_ == i], axis=0)
    cPickle.dump(ctr_vecs, open('test.p', "wb"))
    print "center vecters saved"
    # save center words
    # get index of the closest vector to center vectors
    nbrs = NearestNeighbors(n_neighbors=1, algorithm=args.tree_algo).fit(W)
    distances, indices = nbrs.kneighbors(ctr_vecs)
    indices = np.reshape(indices, (len(indices)))
    # print words to file
    f_landmark = open(args.word_save_file, 'w')
    for i in range(args.keyword_num):
        print >> f_landmark, w2v.items()[indices[i]][0]
    f_landmark.close()
    print 'landmark words saved'
    # save words for vectors in W
    f_words = open(args.dict_file, 'w')
    for i in range(W.shape[0]):
        print >> f_words, w2v.items()[i][0]
    f_words.close()
    print 'words saved'
    print 'all done'
Ejemplo n.º 7
0
def count_routes():
    """Count how many tweets mention each known route and persist the
    totals as a pickled Counter.
    """
    synonyms = load_synonyms('./datasets/sinonimos.csv')
    synonyms1 = load_synonyms('./datasets/sinonimos2.csv')
    dictionary = load_words()
    stop_words = load_stop_words('./datasets/stop-words.txt')
    routes = load_routes('./datasets/routes.txt')
    tweets = retrieve_tweets()
    counter = Counter()

    # Resume previous totals when a non-empty counter file is present.
    # Pickle data is binary, so open with 'rb' (required on Python 3).
    # NOTE(review): reads from './datasets/counter.txt' but saves to
    # 'counter.txt' below — verify which location is intended.
    if not file_is_empty('./datasets/counter.txt'):
        with open('./datasets/counter.txt', 'rb') as fp:
            counter = pickle.load(fp)

    for tweet in tweets:
        tweet_words = process_tweet(tweet.text, synonyms, synonyms1,
                                    dictionary, stop_words)
        # Search the normalized tweet text for each route name and count it.
        for route in routes:
            if re.search(route, tweet_words):
                counter[route] += 1

    print(counter)
    # The context manager closes the file; the explicit close() was
    # redundant and the trailing semicolons were non-idiomatic.
    with open('counter.txt', 'wb') as fp:
        pickle.dump(counter, fp)
Ejemplo n.º 8
0
def get_relevant(**kwargs):
    """Return the process-wide relevance classifier wrapper, creating it lazily.

    On the first call this either restores a previously serialized wrapper
    from 'wrappers/relevant_wrapper.json' or trains a new classifier on
    './datasets/relevant.csv' and serializes it for later runs.

    Keyword Args:
        clf: classifier instance to train; defaults to
            LogisticRegression(C=10).
        cross_validate: when True, cross-validate before training
            (default False).

    Returns:
        ClassifierWrapper: the cached global RELEVANT_WRAPPER.
    """
    global RELEVANT_WRAPPER
    # t0 = time.time()
    if RELEVANT_WRAPPER is None:
        wrapperFile = 'wrappers/relevant_wrapper.json'
        synonyms = load_synonyms('./datasets/sinonimos.csv')
        words = load_words()
        # Fast path: rebuild the wrapper from its JSON serialization and
        # re-attach the dataset, synonyms and word lists that are not part
        # of the serialized form.
        if os.path.isfile(wrapperFile):
            with open(wrapperFile,'r+') as rwjson:
                RELEVANT_WRAPPER = ClassifierWrapper()
                RELEVANT_WRAPPER.jsonLoads(rwjson.read())
                # deepcopy keeps this wrapper from sharing mutable state
                # with other wrappers built from the same loaded lists.
                RELEVANT_WRAPPER.synonyms = copy.deepcopy(synonyms)
                RELEVANT_WRAPPER.words = copy.deepcopy(words)
                RELEVANT_WRAPPER.dataset.dataset = list(load_file('./datasets/relevant.csv'))
                RELEVANT_WRAPPER.dataset.synonyms = copy.deepcopy(synonyms)

                RELEVANT_WRAPPER.dataset.words = copy.deepcopy(words)

                return RELEVANT_WRAPPER

        # Slow path: train a fresh classifier from the CSV dataset.
        clf = kwargs.pop('clf', LogisticRegression(C=10))
        dataWrapperDataset = list(load_file('./datasets/relevant.csv'))
        dataWrapper = DataWrapper(dataset=dataWrapperDataset,synonyms=copy.deepcopy(synonyms),words=copy.deepcopy(words))
        dataWrapper.resolveMatrix()

        wrapper = ClassifierWrapper(clf=clf,dataset=dataWrapper,synonyms=copy.deepcopy(synonyms),words=copy.deepcopy(words))

        cross_validate = kwargs.pop('cross_validate', False)
        if cross_validate:
            wrapper.cross_validate()
        wrapper.train()
        # print time.time() - t0, "seconds from relevant classifier"
        RELEVANT_WRAPPER = wrapper
        # Persist the trained wrapper so subsequent runs take the fast path.
        with open(wrapperFile, 'w') as rw_json:
            json.dump(RELEVANT_WRAPPER.toDict(), rw_json)
    return RELEVANT_WRAPPER
Ejemplo n.º 9
0
def main(args):
    """Train and evaluate a ParagraphRanker model end to end.

    Loads train/dev examples, builds or restores the model (from a
    checkpoint, a pretrained file, or from scratch), constructs the data
    loaders, runs the train/validate loop saving the best model by
    args.valid_metric, and finishes with a final ranking evaluation.

    Args:
        args: parsed command-line namespace with file paths, model,
            batching, and training options.
    """
    # --------------------------------------------------------------------------
    # DATA
    logger.info('-' * 100)
    logger.info('Load data files')
    train_exs = utils.load_data(args, args.train_file, skip_no_answer=True)
    logger.info('Num train examples = %d' % len(train_exs))
    dev_exs = utils.load_data(args, args.dev_file)
    logger.info('Num dev examples = %d' % len(dev_exs))

    # If we are doing official evals then we need to:
    # 1) Load the original text to retrieve spans from offsets.
    # 2) Load the (multiple) text answers for each question.
    if args.official_eval:
        dev_texts = utils.load_text(args.dev_json)
        dev_offsets = {ex['id']: ex['offsets'] for ex in dev_exs}
        dev_answers = utils.load_answers(args.dev_json)

    # --------------------------------------------------------------------------
    # MODEL
    logger.info('-' * 100)
    start_epoch = 0
    if args.checkpoint and os.path.isfile(args.model_file + '.checkpoint'):
        # Just resume training, no modifications.
        logger.info('Found a checkpoint...')
        checkpoint_file = args.model_file + '.checkpoint'
        model, start_epoch = ParagraphRanker.load_checkpoint(checkpoint_file, 
                                                             args)
    else:
        # Training starts fresh. But the model state is either pretrained or
        # newly (randomly) initialized.
        if args.pretrained:
            logger.info('Using pretrained model...')
            model = ParagraphRanker.load(args.pretrained, args)
            if args.expand_dictionary:
                logger.info('Expanding dictionary for new data...')
                # Add words in training + dev examples
                words = utils.load_words(args, train_exs + dev_exs)
                added = model.expand_dictionary(words)
                # Load pretrained embeddings for added words
                if args.embedding_file:
                    model.load_embeddings(added, args.embedding_file, args.fasttext)

        else:
            logger.info('Training model from scratch...')
            model = init_from_scratch(args, train_exs, dev_exs)

        # Set up partial tuning of embeddings
        if args.tune_partial > 0:
            logger.info('-' * 100)
            logger.info('Counting %d most frequent question words' %
                        args.tune_partial)
            top_words = utils.top_question_words(
                args, train_exs, model.word_dict
            )
            for word in top_words[:5]:
                logger.info(word)
            logger.info('...')
            for word in top_words[-6:-1]:
                logger.info(word)
            model.tune_embeddings([w[0] for w in top_words])

        # Set up optimizer
        model.init_optimizer()

    # Use the GPU?
    if args.cuda:
        model.cuda()

    # Use multiple GPUs?
    if args.parallel:
        model.parallelize()

    # --------------------------------------------------------------------------
    # DATA ITERATORS
    # Two datasets: train and dev. If we sort by length it's faster.
    logger.info('-' * 100)
    logger.info('Make data loaders')
    train_dataset = data.RankerDataset(train_exs, model, 
                                       args.neg_size, args.allowed_size)
    if args.sort_by_len:
        train_sampler = data.RankerBatchSampler(train_dataset.lengths(),
                                                args.batch_size,
                                                shuffle=True)
    else:
        train_sampler = torch.utils.data.sampler.RandomSampler(train_dataset)
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        sampler=train_sampler,
        num_workers=args.data_workers,
        collate_fn=vector.ranker_train_batchify,
        pin_memory=args.cuda,
    )
    # Dev uses neg_size=1 / allowed_size=1000 so every candidate is ranked.
    dev_dataset = data.RankerDataset(dev_exs, model,
                                     neg_size=1, allowed_size=1000)
    if args.sort_by_len:
        dev_sampler = data.RankerBatchSampler(dev_dataset.lengths(),
                                              args.test_batch_size,
                                              shuffle=False)
    else:
        dev_sampler = torch.utils.data.sampler.SequentialSampler(dev_dataset)
    dev_loader = torch.utils.data.DataLoader(
        dev_dataset,
        batch_size=args.test_batch_size,
        sampler=dev_sampler,
        num_workers=args.data_workers,
        collate_fn=vector.ranker_dev_batchify,
        pin_memory=args.cuda,
    )

    # -------------------------------------------------------------------------
    # PRINT CONFIG
    logger.info('-' * 100)
    logger.info('CONFIG:\n%s' %
                json.dumps(vars(args), indent=4, sort_keys=True))

    # --------------------------------------------------------------------------
    # TRAIN/VALID LOOP
    logger.info('-' * 100)
    logger.info('Starting training...')
    stats = {'timer': utils.Timer(), 'epoch': 0, 'best_valid': 0}
    for epoch in range(start_epoch, args.num_epochs):
        stats['epoch'] = epoch

        # Train
        train(args, train_loader, model, stats)

        # Filtering by questions
        # pre_selected_docs = filter_docs(args, dev_loader)

        # Encode documents for dev
        docs, qs = encode_docs_qs(args, dev_loader, model, stats, mode='dev')

        # Rank encoded documents
        result = rank_docs(args, docs, qs, stats, mode='dev')

        # Save best valid
        if result[args.valid_metric] > stats['best_valid']:
            logger.info('Best valid: %s = %.3f (epoch %d, %d updates)' %
                        (args.valid_metric, result[args.valid_metric],
                         stats['epoch'], model.updates))
            model.save(args.model_file)
            stats['best_valid'] = result[args.valid_metric]

    # Ranker final evaluation
    docs, qs = encode_docs_qs(args, dev_loader, model, stats, mode='dev')
    result = rank_docs(args, docs, qs, stats, mode='dev')
Ejemplo n.º 10
0
def main(args):
    """Train and evaluate a DocReader model end to end.

    Loads train/dev examples, builds or restores the model (from a
    checkpoint, a pretrained file — optionally expanding word and char
    dictionaries — or from scratch), builds the data loaders, optionally
    runs evaluation only (args.global_mode == "test"), and otherwise runs
    the train/validate loop saving the best model by args.valid_metric.

    Args:
        args: parsed command-line namespace with file paths, model,
            batching, and training options.
    """
    # --------------------------------------------------------------------------
    # DATA
    logger.info('-' * 100)
    logger.info('Load data files')
    train_exs = utils.load_data(args, args.train_file, skip_no_answer=True)
    logger.info('Num train examples = %d' % len(train_exs))
    dev_exs = utils.load_data(args, args.dev_file)
    logger.info('Num dev examples = %d' % len(dev_exs))

    # If we are doing official evals then we need to:
    # 1) Load the original text to retrieve spans from offsets.
    # 2) Load the (multiple) text answers for each question.
    if args.official_eval:
        dev_texts = utils.load_text(args.dev_json)
        dev_offsets = {ex['id']: ex['offsets'] for ex in dev_exs}
        dev_answers = utils.load_answers(args.dev_json)

    # --------------------------------------------------------------------------
    # MODEL
    logger.info('-' * 100)
    start_epoch = 0
    if args.checkpoint and os.path.isfile(args.model_file + '.checkpoint'):
        # Just resume training, no modifications.
        logger.info('Found a checkpoint...')
        checkpoint_file = args.model_file + '.checkpoint'
        model, start_epoch = DocReader.load_checkpoint(checkpoint_file, args)
    else:
        # Training starts fresh. But the model state is either pretrained or
        # newly (randomly) initialized.
        if args.pretrained:
            logger.info('Using pretrained model...')
            model = DocReader.load(args.pretrained, args)
            if args.expand_dictionary:
                logger.info('Expanding dictionary for new data...')
                # Add words in training + dev examples
                words = utils.load_words(args, train_exs + dev_exs)
                added_words = model.expand_dictionary(words)
                # Load pretrained embeddings for added words
                if args.embedding_file:
                    model.load_embeddings(added_words, args.embedding_file)

                logger.info('Expanding char dictionary for new data...')
                # Add chars in training + dev examples
                chars = utils.load_chars(args, train_exs + dev_exs)
                added_chars = model.expand_char_dictionary(chars)
                # Load pretrained embeddings for added chars
                if args.char_embedding_file:
                    model.load_char_embeddings(added_chars, args.char_embedding_file)

        else:
            logger.info('Training model from scratch...')
            model = init_from_scratch(args, train_exs, dev_exs)

        # Set up partial tuning of embeddings
        if args.tune_partial > 0:
            logger.info('-' * 100)
            logger.info('Counting %d most frequent question words' %
                        args.tune_partial)
            top_words = utils.top_question_words(
                args, train_exs, model.word_dict
            )
            for word in top_words[:5]:
                logger.info(word)
            logger.info('...')
            for word in top_words[-6:-1]:
                logger.info(word)
            model.tune_embeddings([w[0] for w in top_words])

        # Set up optimizer
        model.init_optimizer()

    # Use the GPU?
    if args.cuda:
        model.cuda()

    # Use multiple GPUs?
    if args.parallel:
        model.parallelize()

    # --------------------------------------------------------------------------
    # DATA ITERATORS
    # Two datasets: train and dev. If we sort by length it's faster.
    logger.info('-' * 100)
    logger.info('Make data loaders')

    train_dataset = data.ReaderDataset(train_exs, model, single_answer=True)
    if args.sort_by_len:
        train_sampler = data.SortedBatchSampler(train_dataset.lengths(),
                                                args.batch_size,
                                                shuffle=True)
    else:
        train_sampler = torch.utils.data.sampler.RandomSampler(train_dataset)
    # if args.use_sentence_selector:
    #     train_batcher = vector.sentence_batchifier(model, single_answer=True)
    #     batching_function = train_batcher.batchify
    # else:
    batching_function = vector.batchify
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        sampler=train_sampler,
        num_workers=args.data_workers,
        collate_fn=batching_function,
        pin_memory=args.cuda,
    )
    dev_dataset = data.ReaderDataset(dev_exs, model, single_answer=False)
    if args.sort_by_len:
        dev_sampler = data.SortedBatchSampler(dev_dataset.lengths(),
                                              args.test_batch_size,
                                              shuffle=False)
    else:
        dev_sampler = torch.utils.data.sampler.SequentialSampler(dev_dataset)
    # if args.use_sentence_selector:
    #     dev_batcher = vector.sentence_batchifier(model, single_answer=False)
    #     batching_function = dev_batcher.batchify
    # else:
    batching_function = vector.batchify
    dev_loader = torch.utils.data.DataLoader(
        dev_dataset,
        batch_size=args.test_batch_size,
        sampler=dev_sampler,
        num_workers=args.data_workers,
        collate_fn=batching_function,
        pin_memory=args.cuda,
    )

    # -------------------------------------------------------------------------
    # PRINT CONFIG
    logger.info('-' * 100)
    logger.info('CONFIG:\n%s' %
                json.dumps(vars(args), indent=4, sort_keys=True))

    # --------------------------------------------------------------------------
    # TRAIN/VALID LOOP
    logger.info('-' * 100)
    logger.info('Starting training...')
    stats = {'timer': utils.Timer(), 'epoch': 0, 'best_valid': 0}

    # --------------------------------------------------------------------------
    # QUICKLY VALIDATE ON PRETRAINED MODEL
    # In "test" mode: run the evaluations once and exit without training.
    if args.global_mode == "test":
        result1 = validate_unofficial(args, dev_loader, model, stats, mode='dev')
        result2 = validate_official(args, dev_loader, model, stats,
                                    dev_offsets, dev_texts, dev_answers)
        print(result2[args.valid_metric])
        print(result1["exact_match"])

        validate_adversarial(args, model, stats, mode="dev")
        exit(0)


    for epoch in range(start_epoch, args.num_epochs):
        stats['epoch'] = epoch

        # Train
        train(args, train_loader, model, stats)

        # Validate unofficial (train)
        validate_unofficial(args, train_loader, model, stats, mode='train')

        # Validate unofficial (dev)
        result = validate_unofficial(args, dev_loader, model, stats, mode='dev')

        # Validate official
        if args.official_eval:
            result = validate_official(args, dev_loader, model, stats,
                                       dev_offsets, dev_texts, dev_answers)

        # Save best valid
        if args.valid_metric is None or args.valid_metric == 'None':
            model.save(args.model_file)
        elif result[args.valid_metric] > stats['best_valid']:
            logger.info('Best valid: %s = %.2f (epoch %d, %d updates)' %
                        (args.valid_metric, result[args.valid_metric],
                         stats['epoch'], model.updates))
            model.save(args.model_file)
            stats['best_valid'] = result[args.valid_metric]
Ejemplo n.º 11
0
def main(args):
    """Train and evaluate a DocReader model with best-model retention and
    early stopping.

    Loads train/dev examples, builds or restores the model (checkpoint,
    pretrained — optionally expanding word and char dictionaries — or from
    scratch), builds the data loaders, then runs the train/validate loop.
    Each epoch's model is saved with an em/f1-tagged filename; when
    args.valid_metric stops improving for args.early_stop epochs, training
    stops and non-best snapshots are deleted.

    Args:
        args: parsed command-line namespace with file paths, model,
            batching, and training options.
    """
    # --------------------------------------------------------------------------
    # DATA
    logger.info('-' * 100)
    logger.info('Load data files')
    train_exs = utils.load_data(args, args.train_file, skip_no_answer=True)
    logger.info('Num train examples = %d' % len(train_exs))
    dev_exs = utils.load_data(args, args.dev_file)
    logger.info('Num dev examples = %d' % len(dev_exs))

    # If we are doing official evals then we need to:
    # 1) Load the original text to retrieve spans from offsets.
    # 2) Load the (multiple) text answers for each question.
    if args.official_eval:
        dev_texts = utils.load_text(args.dev_json)
        dev_offsets = {ex['id']: ex['offsets'] for ex in dev_exs}
        dev_answers = utils.load_answers(args.dev_json)
    else:
        dev_texts = None
        dev_offsets = None
        dev_answers = None

    # --------------------------------------------------------------------------
    # MODEL
    logger.info('-' * 100)
    start_epoch = 0
    if args.checkpoint and os.path.isfile(args.model_file + '.checkpoint'):
        # Just resume training, no modifications.
        logger.info('Found a checkpoint...')
        checkpoint_file = args.model_file + '.checkpoint'
        model, start_epoch = DocReader.load_checkpoint(checkpoint_file, args)
    else:
        # Training starts fresh. But the model state is either pretrained or
        # newly (randomly) initialized.
        if args.pretrained:
            logger.info('Using pretrained model...')
            model = DocReader.load(args.pretrained, args)
            if args.expand_dictionary:
                logger.info('Expanding dictionary for new data...')
                # Add words in training + dev examples
                words = utils.load_words(args, train_exs + dev_exs)
                added_words = model.expand_dictionary(words)
                # Load pretrained embeddings for added words
                if args.embedding_file:
                    model.load_embeddings(added_words, args.embedding_file)

                logger.info('Expanding char dictionary for new data...')
                # Add chars in training + dev examples
                chars = utils.load_chars(args, train_exs + dev_exs)
                added_chars = model.expand_char_dictionary(chars)
                # Load pretrained embeddings for added chars
                if args.char_embedding_file:
                    model.load_char_embeddings(added_chars,
                                               args.char_embedding_file)

        else:
            logger.info('Training model from scratch...')
            model = init_from_scratch(args, train_exs, dev_exs)

        # Set up partial tuning of embeddings
        if args.tune_partial > 0:
            logger.info('-' * 100)
            logger.info('Counting %d most frequent question words' %
                        args.tune_partial)
            top_words = utils.top_question_words(args, train_exs,
                                                 model.word_dict)
            for word in top_words[:5]:
                logger.info(word)
            logger.info('...')
            for word in top_words[-6:-1]:
                logger.info(word)
            model.tune_embeddings([w[0] for w in top_words])

        # Set up optimizer
        model.init_optimizer()

    # Use the GPU?
    if args.cuda:
        model.cuda()

    # Use multiple GPUs?
    if args.parallel:
        model.parallelize()

    # --------------------------------------------------------------------------
    # DATA ITERATORS
    # Two datasets: train and dev. If we sort by length it's faster.
    logger.info('-' * 100)
    logger.info('Make data loaders')

    train_dataset = data.ReaderDataset(train_exs, model, single_answer=True)
    if args.sort_by_len:
        train_sampler = data.SortedBatchSampler(train_dataset.lengths(),
                                                args.batch_size,
                                                shuffle=True)
    else:
        train_sampler = torch.utils.data.sampler.RandomSampler(train_dataset)
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        sampler=train_sampler,
        num_workers=args.data_workers,
        collate_fn=vector.batchify,
        pin_memory=args.cuda,
    )
    dev_dataset = data.ReaderDataset(dev_exs, model, single_answer=False)
    if args.sort_by_len:
        dev_sampler = data.SortedBatchSampler(dev_dataset.lengths(),
                                              args.test_batch_size,
                                              shuffle=False)
    else:
        dev_sampler = torch.utils.data.sampler.SequentialSampler(dev_dataset)
    dev_loader = torch.utils.data.DataLoader(
        dev_dataset,
        batch_size=args.test_batch_size,
        sampler=dev_sampler,
        num_workers=args.data_workers,
        collate_fn=vector.batchify,
        pin_memory=args.cuda,
    )

    # -------------------------------------------------------------------------
    # PRINT CONFIG
    logger.info('-' * 100)
    logger.info('CONFIG:\n%s' %
                json.dumps(vars(args), indent=4, sort_keys=True))

    # --------------------------------------------------------------------------
    # TRAIN/VALID LOOP
    logger.info('-' * 100)
    logger.info('Starting training...')
    stats = {'timer': utils.Timer(), 'epoch': 0, 'best_valid': 0}
    model_prefix = os.path.join(args.model_dir, args.model_name)

    # kept_models tracks non-best snapshots so they can be pruned; they also
    # drive early stopping once args.early_stop of them pile up.
    kept_models = []
    best_model_path = ''
    for epoch in range(start_epoch, args.num_epochs):
        stats['epoch'] = epoch

        # Train
        train(args, train_loader, model, stats)

        # Validate unofficial (train)
        logger.info('eval: train split unofficially...')
        validate_unofficial(args, train_loader, model, stats, mode='train')

        if args.official_eval:
            # Validate official (dev)
            logger.info('eval: dev split unofficially..')
            result = validate_official(args, dev_loader, model, stats,
                                       dev_offsets, dev_texts, dev_answers)
        else:
            # Validate unofficial (dev)
            logger.info(
                'train: evaluating dev split evaluating dev official...')
            result = validate_unofficial(args,
                                         dev_loader,
                                         model,
                                         stats,
                                         mode='dev')

        em = result['exact_match']
        f1 = result['f1']
        suffix = 'em_{:4.2f}-f1_{:4.2f}.mdl'.format(em, f1)
        # Save best valid
        model_file = '{}-epoch_{}-{}'.format(model_prefix, epoch, suffix)
        if args.valid_metric:
            if result[args.valid_metric] > stats['best_valid']:
                # New best: remove previous best snapshots, save this one
                # under a '-best-' name, and drop all kept non-best models.
                for f in glob.glob('{}-best*'.format(model_prefix)):
                    os.remove(f)
                logger.info('eval: dev best %s = %.2f (epoch %d, %d updates)' %
                            (args.valid_metric, result[args.valid_metric],
                             stats['epoch'], model.updates))
                model_file = '{}-best-epoch_{}-{}'.format(
                    model_prefix, epoch, suffix)
                best_model_path = model_file
                model.save(model_file)
                stats['best_valid'] = result[args.valid_metric]
                for f in kept_models:
                    os.remove(f)
                kept_models.clear()
            else:
                # Not an improvement: keep the snapshot and early-stop when
                # too many consecutive non-improving epochs accumulate.
                model.save(model_file)
                kept_models.append(model_file)
                if len(kept_models) >= args.early_stop:
                    logger.info(
                        'Finished training due to %s not improved for %d epochs, best model is at: %s'
                        %
                        (args.valid_metric, args.early_stop, best_model_path))
                    return
        else:
            # just save model every epoch since no validation metric is given
            model.save(model_file)
Ejemplo n.º 12
0
def p22():
    """Project Euler 22: sum of alphabetical-position-weighted name scores.

    Sorts the names and scores each one by its 1-based position in the
    sorted order.
    """
    names = load_words('resources/p022_names.txt')
    names.sort()
    # Generator expression avoids materializing an intermediate list.
    return sum(score_name(name, idx + 1) for idx, name in enumerate(names))
Ejemplo n.º 13
0
def p22():
    """Project Euler 22: total of the name scores, where each name's score
    is weighted by its 1-based position in alphabetical order.
    """
    name_list = load_words('resources/p022_names.txt')
    name_list.sort()
    total = 0
    for position, name in enumerate(name_list, start=1):
        total += score_name(name, position)
    return total
Ejemplo n.º 14
0
def p42():
    """Project Euler 42: count the triangle words in the word file.

    Uses a generator-based count instead of len(filter(...)), which breaks
    on Python 3 where filter() returns an iterator without a length.
    """
    words = load_words('resources/p042_words.txt')
    return sum(1 for word in words if is_triangle(word))
Ejemplo n.º 15
0
    def play(self):
        """Run one complete game of hangman.

        Sets up the candidate word set — a single secret word when
        ``self.no_cheating`` is on, otherwise the full set of words of the
        chosen length so the game can narrow it per guess — then loops
        reading letter guesses until the word is revealed or six wrong
        guesses end the game (``self.unlimited`` lets play continue past
        the loss).
        """

        if self.no_cheating:
            print("Playing without cheating!")
        if self.unlimited:
            print("Playing with unlimited guesses!")

        # Setting up new game: word length is random in [4, 12].
        words = None
        word_length = random.randint(4, 12)
        if self.no_cheating:
            # Honest mode: commit to one secret word up front.
            correct_word = random.choice(
                tuple(load_words(self.dictionary_file_name, word_length)))
            words = set([correct_word])
        else:
            # Cheating mode: keep every candidate of this length.
            words = load_words(self.dictionary_file_name, word_length)

        # '*' marks letters not yet discovered.
        discovered_letters = ["*" for i in range(word_length)]
        guessed_letters = set([])
        num_guesses = 0
        num_incorrect_guesses = 0

        Game.get_robot_ascii(num_incorrect_guesses)

        # Main game loop
        while True:
            print("\nGuessed letters: " + str(sorted(list(guessed_letters))))
            print(f'\nHint: {"".join(discovered_letters)}\n')

            # Re-prompt until a valid single-letter guess is entered;
            # repeated guesses are skipped without penalty.
            guess = ""
            while not Game.is_valid_guess(guess):
                guess = input("What letter would you like to guess? ").upper()
            if guess in guessed_letters:
                continue
            guessed_letters.add(guess)
            num_guesses += 1

            # Narrow the candidate set and find where the guess appears.
            guess_indices, words = Game.get_new_words_set(words, guess)

            if len(guess_indices) == 0:
                print("\nYou guessed wrongly!")
                num_incorrect_guesses += 1
            else:
                print("\nGood guess!")
                discovered_letters = Game.get_new_discovered_letters(
                    discovered_letters, guess_indices, guess)

            Game.get_robot_ascii(num_incorrect_guesses)

            # Six wrong guesses loses the game (unless unlimited mode).
            if num_incorrect_guesses == 6:
                print(game_over)
                if self.unlimited:
                    print(
                        f"Too bad! Don't worry, you can still keep guessing!\n"
                    )
                else:
                    print(f"\nThe correct word was {list(words)[0]}!\n")
                    break
            # All letters revealed: the player wins (or finishes, if they
            # already lost but kept guessing in unlimited mode).
            if "*" not in "".join(discovered_letters):
                if num_incorrect_guesses < 6:
                    print(you_win)
                print(
                    f'\nThe correct word was {"".join(discovered_letters)}!\n')
                print(f"\nYou only took {num_guesses} guesses!\n")
                break