Example #1
def difference_evaluation(name):
    queries = []
    with open(join(data_directory, name + '.tsv'), 'r') as f:
        for line in util.verboserate(f):
            items = line.split('\t')
            s, r, t = items[0], tuple(items[1].split(',')), items[2]
            q = PathQuery(s, r, t)
            q.aqs = [float(x) for x in items[3].split(',')]  # fresh loop variable so s is not rebound
            queries.append(q)

    aq_deltas = defaultdict(list)
    for q in queries:
        aqs = [1.0] + q.aqs
        for i in range(1, len(aqs)):
            r = q.r[i - 1]
            aq, prev_aq = aqs[i], aqs[i - 1]

            if prev_aq == 1.0:
                delta = 1.0  # no ground to gain
            elif prev_aq == 0.0:
                delta = np.nan  # no ground to lose
            else:
                diff = aq - prev_aq
                if diff >= 0:
                    delta = diff / (1.0 - prev_aq)  # portion recovered
                else:
                    delta = diff / prev_aq  # portion lost

            if not np.isnan(delta):
                aq_deltas[r].append(delta)

    return pd.DataFrame({
        'mean(aq_diff)':
        dict((r, np.nanmean(deltas)) for r, deltas in aq_deltas.iteritems())
    })
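A minimal worked illustration of the delta rule above, with made-up quantile values (not from any real evaluation run):

# invented values, only to illustrate the normalization used above
prev_aq, aq = 0.8, 0.9
print (aq - prev_aq) / (1.0 - prev_aq)   # ~0.5: half of the remaining gap recovered
prev_aq, aq = 0.8, 0.6
print (aq - prev_aq) / prev_aq           # ~-0.25: a quarter of the prior ground lost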
Example #3
def average_quantile(s, p):
    negatives, positives = neg_gen(PathQuery(s, p, ''), 't',
                                   return_positives=True)
    pos_query = PathQuery(s, p, positives)
    neg_query = PathQuery(s, p, negatives)
    return util.average_quantile(scores(pos_query), scores(neg_query))
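util.average_quantile itself is not shown in these snippets. As a point of reference, a plain-NumPy sketch of the usual definition (for each positive score, the fraction of negatives it beats, counting ties as half, averaged over positives) would look roughly like this; it is an assumption about the semantics, not the project's implementation:

import numpy as np

def average_quantile_sketch(pos_scores, neg_scores):
    # assumed semantics, not necessarily identical to util.average_quantile
    neg = np.asarray(neg_scores, dtype=float)
    quantiles = []
    for p in np.asarray(pos_scores, dtype=float):
        below = np.sum(neg < p)
        ties = np.sum(neg == p)
        quantiles.append((below + 0.5 * ties) / float(len(neg)))
    return np.mean(quantiles)

print average_quantile_sketch([2.0, 0.5], [0.0, 1.0, 1.0, 3.0])  # 0.5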
Example #4
def augment_dataset(train_triples,
                    dev_triples,
                    add_paths=False,
                    max_path_length=8):

    train_graph = Graph(train_triples)
    full_graph = Graph(train_triples + dev_triples)

    # start with original edges in the training and dev set
    train = PathQuery.from_triples(train_triples)
    dev = PathQuery.from_triples(dev_triples)
    test = []  # empty for now

    if add_paths:
        print 'adding paths'
        # number of paths to augment existing triples with
        num_augment = lambda triples: len(triples) * (max_path_length - 1)

        # augment with paths
        train.extend(
            sample_paths(train_graph, 3 * num_augment(train_triples),
                         max_path_length))  # previously 980000
        dev.extend(
            sample_paths(full_graph, num_augment(dev_triples),
                         max_path_length))  # previously 35000

    # make unique, and eliminate train dev overlap
    print 'before: train {}, dev {}'.format(len(train), len(dev))
    train, dev = set(train), set(dev)
    dev -= train

    # remove trivial queries (queries that type match all entities)
    trivial_train_paths = set(get_trivial_path_queries(train_graph, train))
    trivial_dev_paths = set(get_trivial_path_queries(train_graph, dev))

    train -= trivial_train_paths
    dev -= trivial_dev_paths

    train, dev = list(train), list(dev)
    random.shuffle(train)
    random.shuffle(dev)

    print 'after: train {}, dev {}'.format(len(train), len(dev))

    if platform.system() != 'Darwin':
        # save generated datasets
        print 'saving datasets'
        dsets = {'train': train, 'dev': dev, 'test': test}
        for name, dset in dsets.iteritems():
            with open('{}.cpkl'.format(name), 'w') as f:
                pickle.dump(dset, f)
        print 'done'

    return train, dev, test, train_graph, full_graph
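A quick sanity check of the augmentation arithmetic above (the dataset sizes are invented; this only reproduces the counts, not the sampling):

max_path_length = 8
num_augment = lambda n: n * (max_path_length - 1)
n_train, n_dev = 1000, 100           # made-up triple counts
print 3 * num_augment(n_train)       # 21000 extra training paths
print num_augment(n_dev)             # 700 extra dev paths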
Example #5
def segmented_evaluation(file_path, categorize=None):
    queries = []
    with open(file_path, 'r') as f:
        for line in util.verboserate(f):
            items = line.split('\t')
            s, r, t = items[0], tuple(items[1].split(',')), items[2]
            q = PathQuery(s, r, t)
            quantile_str = items[3]
            q.quantile = float(quantile_str)
            q.num_candidates = int(items[4])
            queries.append(q)

    def single_relation(query):
        if len(query.r) != 1:
            return False
        r = query.r[-1]
        if inverted(r):
            return False
        return r

    # group queries
    if categorize is None:
        categorize = single_relation

    groups = util.group(queries, categorize)

    print 'computing grouped stats'
    stats = defaultdict(dict)
    for key, queries in util.verboserate(groups.iteritems()):
        scores = [q.quantile for q in queries]
        score = np.nanmean(scores)

        def inv_sigmoid(y):
            return -np.log(1.0 / y - 1)

        score2 = inv_sigmoid(score)

        total = len(scores)
        nontrivial = np.count_nonzero(~np.isnan(scores))

        stats[key] = {
            'score': score,
            'score2': score2,
            'total_eval': total,
            'nontrivial_eval': nontrivial
        }

    stats.pop(False, None)
    return pd.DataFrame(stats).transpose()
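For orientation, a sketch of the tab-separated record this evaluator expects (entity and relation names are invented), plus the logit transform applied to the mean quantile:

# hypothetical input line:
#   paris\tcapital_of,member_of\teuropean_union\t0.93\t512
# i.e. s, comma-separated relation path, t, quantile, num_candidates

import numpy as np
inv_sigmoid = lambda y: -np.log(1.0 / y - 1)
print inv_sigmoid(0.93)   # ~2.59
print inv_sigmoid(0.25)   # ~-1.10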
Example #8
def load_socher_test(test_set_path):
    examples = []
    with open(join(data_directory, test_set_path), 'r') as f:
        for line in util.verboserate(f):
            items = line.split()
            s, r, t, label = items[0], tuple(items[1].split(',')), items[2], items[3]
            ex = PathQuery(s, r, t)

            if label == '1':
                ex.label = True
            elif label == '-1':
                ex.label = False
            else:
                raise ValueError(label)
            examples.append(ex)
    return examples
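A hedged illustration of the whitespace-separated line this parser expects (the triple and label below are made up):

line = 'barack_obama nationality united_states 1'   # invented example line
items = line.split()
print items[0], tuple(items[1].split(',')), items[2], items[3] == '1'
# barack_obama ('nationality',) united_states True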
Example #9
    def performance(query):
        s, r, t = query.s, query.r, query.t
        negatives = neg_gen(query, 't')
        pos_query = PathQuery(s, r, t)
        neg_query = PathQuery(s, r, negatives)

        # don't score queries with no negatives
        if len(negatives) == 0:
            query.quantile = np.nan
        else:
            query.quantile = util.average_quantile(scores(pos_query),
                                                   scores(neg_query))

        query.num_candidates = len(negatives) + 1

        attributes = (query.s, ','.join(query.r), query.t,
                      str(query.quantile), str(query.num_candidates))
        return '\t'.join(attributes)
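The string built above is one tab-separated evaluation record per query; a made-up instance of the layout (the same one segmented_evaluation parses back in):

# s \t comma-joined relation path \t t \t quantile \t num_candidates
attributes = ('paris', 'capital_of,member_of', 'european_union', '0.93', '512')
print '\t'.join(attributes)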
Example #11
    def predict(self, maximizer, ex):
        samples = self.neg_generator(ex, 't')
        samples.insert(0, ex.t)  # insert positive at front

        scores = maximizer.objective.predict(maximizer.params,
                                             PathQuery(ex.s, ex.r,
                                                       samples)).ravel()
        assert len(scores.shape) == 1

        ranks = util.ranks(scores, ascending=False)
        return samples, scores, ranks
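util.ranks is not shown in these snippets; below is a minimal NumPy sketch of descending ranking with rank 1 for the highest score, which is an assumption about its behavior (ties are not handled specially):

import numpy as np

def ranks_sketch(scores, ascending=False):
    # assumed behavior, not necessarily identical to util.ranks
    order = np.argsort(scores)
    if not ascending:
        order = order[::-1]
    out = np.empty(len(scores), dtype=int)
    out[order] = np.arange(1, len(scores) + 1)
    return out

print ranks_sketch(np.array([0.2, 0.9, 0.5]))   # [3 1 2]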