Example #1
0
def test_rocchio(test_set=ts.TEST_2, samples=DEF_SAMPLES):
    """
    Sweeps the alpha and beta weights of the 'rocchio' vector function and
    records the recall obtained for every combination.

    For each sample percentage, alpha and beta, `samples` random samples of
    the relevant sentence ids are sent to sendVectorQuery together with all
    irrelevant ids. Recall is the fraction of relevant ids that were NOT in
    the sample but were present in the response.

    Keyword arguments:
    test_set -- a dictionary with the database 'instance', a list of
                'relevant' sentence ids and a list of 'irrelevant' sentence ids
    samples -- the number of random samples to draw per parameter combination

    Returns the name of the pickle file the results were written to. The
    pickled object is a nested dict:
    results[percent][alpha][beta]['recalls'] -> list of recall values.
    """
    instance = test_set['instance']
    relevant = test_set['relevant']
    irrelevant = test_set['irrelevant']

    # Built once: membership tests against the list would be O(n) each.
    relevant_set = set(relevant)

    percentages = util.step_range(.1, .4, .1)
    # Materialised so the ranges can be re-iterated for every outer iteration
    # even if util.step_range hands back a one-shot iterator.
    alphas = list(util.step_range(0, 1, .1))
    betas = list(util.step_range(0, 1, .1))

    results = dict()
    for percent in percentages:
        results[percent] = {}

        for alpha in alphas:
            results[percent][alpha] = {}

            for beta in betas:
                print("Percentage: {} Alpha: {} Beta: {}".format(percent, alpha, beta))
                recalls = []
                results[percent][alpha][beta] = {'recalls': recalls}

                for _ in range(samples):
                    relevant_sample = util.percent_sample(relevant, percent)

                    # All irrelevant ids are sent; only the relevant ids are sampled.
                    response = sendVectorQuery(vector_function='rocchio',
                                               relevant=relevant_sample,
                                               irrelevant=irrelevant,
                                               instance=instance,
                                               alpha=alpha,
                                               beta=beta)

                    returned_ids = {int(sentence['id'])
                                    for sentence in response['sentences']}
                    # Relevant ids the query was not given and therefore has to find.
                    to_find = relevant_set - set(relevant_sample)
                    found = returned_ids & to_find

                    # Nothing left to find (sample covered everything): record 0
                    # instead of dividing by zero.
                    if to_find:
                        recalls.append(len(found) / float(len(to_find)))
                    else:
                        recalls.append(0.0)

    # %M is minutes; %m (month) here would make names collide within an hour.
    fname = 'results_rocchio_{}'.format(datetime.datetime.now().strftime("%Y%m%d_%H%M%S"))
    # pickle writes bytes, so the file must be opened in binary mode.
    with open(fname, 'wb') as f:
        pickle.dump(results, f)
    return fname
Example #2
0
def test(test_set=ts.TEST_2, test_fns=VEC_ADJ_FNS, samples=DEF_SAMPLES,
         start=DEF_START, end=DEF_END, step=DEF_STEP, seed=random.random()):
    """
    Given a test set and a list of vector functions, checks recall rates
    over varied starting percentages of the test set.

    Recall is the fraction of relevant sentence ids that were NOT part of the
    sample sent to the query but were present in its response. The results
    are pickled to a timestamped file, which is then passed to graph().

    Keyword arguments:
    test_set -- a dictionary containing the 'name' of the set, the database
                'instance', the 'search_term' to use (only 'pseudo'), a list
                of 'relevant' sentence ids, and a list of 'irrelevant'
                sentence ids
    test_fns -- a list of names of vector adjustment functions to use
    samples -- the number of samples to run (default DEF_SAMPLES)
    start -- a float corresponding to the starting sample percentage
             (default DEF_START)
    end -- a float corresponding to the ending sample percentage
           (default DEF_END)
    step -- a float corresponding to the sample percentage to step by
            (default DEF_STEP)
    seed -- the random seed to use. NOTE: the default is evaluated once, when
            the function is defined, so every call in a process that omits
            `seed` reuses the same value.
    """
    random.seed(seed)

    # load information from the test set
    test_set_name = test_set['name']
    instance = test_set['instance']
    relevant = test_set['relevant']
    irrelevant = test_set['irrelevant']
    # read so that a test set missing the key fails early; not used below
    search_term = test_set['search_term']

    # Built once: membership tests against the list would be O(n) each.
    relevant_set = set(relevant)

    results = {}
    for percent in util.step_range(start, end, step):
        # one list of recall values per vector function
        results[percent] = {}
        for fn in test_fns:
            results[percent][fn] = {'recalls': []}

        for sample_num in range(samples):
            print('Percent: {} Sample: {}'.format(percent, sample_num + 1))

            # take a random percentage of the relevant sentence ids
            relevant_sample = util.percent_sample(relevant, percent)
            # relevant ids the query is not given and therefore has to find;
            # identical for every function evaluated on this sample
            to_find = relevant_set - set(relevant_sample)

            for fn in test_fns:
                # currently all irrelevant sentence ids are sent, so there is
                # no sampling on that side
                response = sendVectorQuery(vector_function=fn,
                                           relevant=relevant_sample,
                                           irrelevant=irrelevant,
                                           instance=instance)

                # find which of the outstanding relevant sentences were recalled
                returned_ids = {int(sentence['id'])
                                for sentence in response['sentences']}
                found = returned_ids & to_find

                # when the sample already covers every relevant id (e.g. the
                # percentage range reaches 100) there is nothing to find:
                # record 0 instead of dividing by zero
                if to_find:
                    recall = len(found) / float(len(to_find))
                else:
                    recall = 0.0
                results[percent][fn]['recalls'].append(recall)

    results = {'results': results,
               'test_set': test_set_name,
               'vector_functions': test_fns,
               'start_percentage': start,
               'end_percentage': end,
               'step_percentage': step,
               'seed': seed}

    # save file and graph
    # %M is minutes; %m (month) here would make names collide within an hour.
    fname = "results_name_{}".format(datetime.datetime.now().strftime("%Y%m%d_%H%M%S"))
    # pickle writes bytes, so the file must be opened in binary mode.
    with open(fname, 'wb') as f:
        pickle.dump(results, f)
    graph(fname)