Beispiel #1
0
def build_demo_data(kvl):
    '''Fill a ``LabelStore`` built on ``kvl`` with a fixed set of demo labels.

    For every (subtopic, document) pair listed below, one positive
    label by annotator ``'John'`` is stored.  Each label links the demo
    topic to a stream id obtained from ``random_sid()``, and every
    stream id is printed as it is processed.

    :param kvl: storage handle handed straight to ``LabelStore``
    '''
    label_store = LabelStore(kvl)

    topic = 'where_are_aid_workers_housed_near_Monrovia'
    subtopics = ['Tanji_Fish_Curing_Site', 'Camp_Ramrod', 'Town_of_Wamba']

    def titled(start, name):
        # Passage id of the form "<start>-<len(name)>|<name>".
        return '%d-%d|%s' % (start, len(name), name)

    # One list of (stream_id, subtopic_id2, rating) triples per entry of
    # ``subtopics``, in the same order as ``subtopics``.
    documents_per_subtopic = [
        [(random_sid(), titled(2100, subtopics[0]), 3),
         (random_sid(),
          '15-93|we_drove_out_to_the_other_side_'
          'of_the_river_delta_to_a_small_fish_smoking_camp',
          2)],
        [(random_sid(), titled(3120, subtopics[1]), 2),
         (random_sid(), '200-217|Ramrod_(Facility)', 3)],
        [(random_sid(), titled(3120, subtopics[2]), 3),
         (random_sid(), '53-63|Wamba_Town', 2),
         (random_sid(), '44-50|Woomba', 1)],
    ]

    for subtopic, documents in zip(subtopics, documents_per_subtopic):
        for stream_id, subtopic_id2, rating in documents:
            # Parenthesised so the statement prints the same single value
            # whether ``print`` is a statement or a function.
            print(stream_id)

            label_store.put(Label(topic,
                                  stream_id,
                                  'John',
                                  CorefValue.Positive,
                                  subtopic_id1=subtopic,
                                  subtopic_id2=subtopic_id2,
                                  rating=rating))
Beispiel #2
0
 def _(cid1a=id_, cid1b=id_, ann1=id_, v1=coref_value, t1=time_value,
       cid2a=id_, cid2b=id_, ann2=id_, v2=coref_value, t2=time_value):
     '''Check ``everything(content_id=...)`` after storing two labels.

     The store is emptied, two labels are built from the supplied
     values and stored, and the labels listed for ``cid1a`` are
     compared with the expectation worked out below.
     '''
     label_store.delete_all()
     first = Label(cid1a, cid1b, ann1, v1, epoch_ticks=t1)
     second = Label(cid2a, cid2b, ann2, v2, epoch_ticks=t2)
     for stored in (first, second):
         label_store.put(stored)

     same_subject = first.same_subject_as(second)
     if same_subject and first.epoch_ticks == second.epoch_ticks:
         # Same subject and same timestamp: only the label stored last
         # is expected back.
         expected = [second]
     elif same_subject:
         # Same subject, different timestamps: only the label that
         # sorts first is expected.
         expected = sorted([first, second])[:1]
     elif cid1a == cid2a or cid1a == cid2b:
         # Different subjects, but both labels mention ``cid1a``.
         expected = sorted([first, second])
     else:
         expected = [first]

     found = list(label_store.everything(content_id=cid1a))
     assert found == expected
Beispiel #3
0
def test_list_deleted(app, label_store):
    '''``list --include-deleted`` prints every stored version of a label.

    Three labels for the same (c1, c2, a1) triple are stored with
    consecutive timestamps, plus one label by a second annotator; the
    command output is expected to contain all four lines.
    '''
    for ticks in (1234567890, 1234567891, 1234567892):
        label_store.put(
            Label('c1', 'c2', 'a1', CorefValue.Positive, epoch_ticks=ticks))
    label_store.put(
        Label('c1', 'c2', 'a2', CorefValue.Negative, epoch_ticks=1234567890))

    app.runcmd('list', ['--include-deleted'])

    expected_output = ''.join([
        'c1 ==(1) c2 by a1 at 2009-02-13 23:31:32\n',
        'c1 ==(1) c2 by a1 at 2009-02-13 23:31:31\n',
        'c1 ==(1) c2 by a1 at 2009-02-13 23:31:30\n',
        'c1 !=(0) c2 by a2 at 2009-02-13 23:31:30\n',
    ])
    assert app.stdout.getvalue() == expected_output
Beispiel #4
0
def test_negative_inference(label_store):
    '''``negative_inference('g')`` yields the expected content-id pairs.'''
    # (content_id1, content_id2, value), in construction and storage order.
    specs = [
        ('a', 'c', 1),
        ('b', 'c', 1),
        ('d', 'e', 1),
        ('d', 'f', 1),
        ('c', 'g', -1),
        ('d', 'g', -1),
        ('h', 'g', 1),
    ]
    # Build every label before storing any of them.
    labels = [Label(cid1, cid2, '', value) for cid1, cid2, value in specs]
    for stored in labels:
        label_store.put(stored)

    expected_pairs = frozenset([
        ('a', 'g'), ('b', 'g'), ('c', 'g'), ('c', 'h'),
        ('d', 'g'), ('d', 'h'), ('e', 'g'), ('f', 'g'),
    ])

    inferred = label_store.negative_inference('g')
    found_pairs = frozenset(
        (lab.content_id1, lab.content_id2) for lab in inferred)

    assert found_pairs == expected_pairs
Beispiel #5
0
def test_negative_label_inference(label_store):
    '''``negative_label_inference(cd)`` yields the expected pairs.'''
    # Labels are built in this order ...
    ac, bc, de, df, dg, cd, fh = [
        Label(cid1, cid2, '', value)
        for cid1, cid2, value in (
            ('a', 'c', 1),
            ('b', 'c', 1),
            ('d', 'e', 1),
            ('d', 'f', 1),
            ('d', 'g', -1),
            ('c', 'd', -1),
            ('f', 'h', 1),
        )
    ]
    # ... but stored in this one (``cd`` goes in before ``dg``).
    for stored in (ac, bc, de, df, cd, dg, fh):
        label_store.put(stored)

    expected_pairs = frozenset([
        ('a', 'd'), ('b', 'd'), ('c', 'd'), ('c', 'e'),
        ('c', 'f'), ('c', 'h'),
    ])
    # [but not (a,b) <-/-> (e,f)]

    inferred = label_store.negative_label_inference(cd)
    found_pairs = frozenset(
        (lab.content_id1, lab.content_id2) for lab in inferred)

    assert found_pairs == expected_pairs
Beispiel #6
0
def test_connected_component_many_most_recent(label_store):
    '''A newer positive label replaces an older negative one.

    With ``b``-``c`` negative, the component of ``a`` is expected to
    hold only ``ab``; once a newer positive ``b``-``c`` label is
    stored, it is expected to grow to ``ab``, ``bc`` and ``cd``.
    '''
    ab = Label('a', 'b', '', 1)
    bc_negative = Label('b', 'c', '', -1)
    cd = Label('c', 'd', '', 1)
    for stored in (ab, bc_negative, cd):
        label_store.put(stored)

    component = frozenset(list(label_store.connected_component('a')))
    assert component == frozenset([ab])

    # This label should overwrite the existing negative `bc` label and
    # expand the connected component to `cd` through transitivity.
    bc_positive = Label('b', 'c', '', 1,
                        epoch_ticks=bc_negative.epoch_ticks + 1)
    label_store.put(bc_positive)

    component = frozenset(list(label_store.connected_component('a')))
    assert component == frozenset([ab, bc_positive, cd])
Beispiel #7
0
def test_connected_component_many_most_recent_diff_value(label_store):
    '''A newer negative label replaces an older positive one.

    With all three labels positive, the component of ``a`` is expected
    to hold ``ab``, ``bc`` and ``cd``; once a newer negative ``b``-``c``
    label is stored, it is expected to shrink to ``ab`` alone.
    '''
    ab = Label('a', 'b', '', 1)
    bc_positive = Label('b', 'c', '', 1)
    cd = Label('c', 'd', '', 1)
    for stored in (ab, bc_positive, cd):
        label_store.put(stored)

    component = frozenset(list(label_store.connected_component('a')))
    assert component == frozenset([ab, bc_positive, cd])

    # This label should overwrite the existing positive `bc` label and
    # contract the connected component to just `ab`.
    bc_negative = Label('b', 'c', '', -1,
                        epoch_ticks=bc_positive.epoch_ticks + 1)
    label_store.put(bc_negative)

    component = frozenset(list(label_store.connected_component('a')))
    assert component == frozenset([ab])
Beispiel #8
0
def test_subtopic_order(cid=id_, s1=id_, s2=id_, ann=id_, v=coref_value):
    '''A label from a content id to itself keeps its subtopics ordered.

    Both content ids are ``cid``; the label is expected to expose the
    smaller subtopic id as ``subtopic_id1`` and the larger one as
    ``subtopic_id2``, whichever order they were passed in.
    '''
    lab = Label(cid, cid, ann, v, subtopic_id1=s1, subtopic_id2=s2)

    # Membership by bare content id and by (content id, subtopic) pair.
    for member in (cid, (cid, s1), (cid, s2)):
        assert member in lab

    assert lab.content_id1 == cid
    assert lab.content_id2 == cid

    smaller, larger = min(s1, s2), max(s1, s2)
    assert lab.subtopic_id1 <= lab.subtopic_id2
    assert lab.subtopic_id1 == smaller
    assert lab.subtopic_id2 == larger
Beispiel #9
0
 def _(cid1a=id_,
       cid1b=id_,
       ann1=id_,
       v1=coref_value,
       t1=time_value,
       cid2a=id_,
       cid2b=id_,
       ann2=id_,
       v2=coref_value,
       t2=time_value):
     '''Check ``everything(include_deleted=True)`` after storing two labels.

     Both labels are expected back in sorted order, unless they share
     subject and timestamp, in which case only the second one is
     expected.
     '''
     label_store.delete_all()
     first = Label(cid1a, cid1b, ann1, v1, epoch_ticks=t1)
     second = Label(cid2a, cid2b, ann2, v2, epoch_ticks=t2)
     for stored in (first, second):
         label_store.put(stored)

     collapsed = (first.same_subject_as(second)
                  and first.epoch_ticks == second.epoch_ticks)
     expected = [second] if collapsed else sorted([first, second])

     found = list(label_store.everything(include_deleted=True))
     assert found == expected
Beispiel #10
0
def test_store_legacy_compatibility(label_store):
    '''A label written in the legacy one-byte format can still be read.'''
    def legacy_put_label(label):
        # Write ``label`` under both of its keys, using a single byte as
        # the value: (coref value + 1) OR-ed with the rating shifted
        # left by four bits.
        key1, key2 = label_store._keys_from_label(label)
        packed = struct.pack(
            'B', (label.value.value + 1) | (label.rating << 4))
        label_store.kvl.put(label_store.TABLE, (key1, packed), (key2, packed))

    original = Label('a', 'b', '', 1, '1', '2')
    legacy_put_label(original)

    fetched = label_store.get('a', 'b', '', subid1='1', subid2='2')
    assert original == fetched
Beispiel #11
0
def test_meta_storage(label_store):
    '''``meta`` entries survive a put/get round trip through the store.'''
    stored = Label('a', 'b', '', 1, '1', '2')
    meta_entries = (
        ('hello', 'world'),
        ('subtopic1_name', 'foo'),
        ('some_num', 5),
        ('some_datastructure', [1, 2, 3]),
    )
    for key, value in meta_entries:
        stored.meta[key] = value

    label_store.put(stored)
    fetched = label_store.get('a', 'b', '', subid1='1', subid2='2')

    assert stored == fetched
    assert stored.meta == fetched.meta
Beispiel #12
0
    def _(cid1=id_, cid2=id_, ann=id_, v=coref_value):
        '''Put, get and delete a single label.

        After the put, ``get`` must return an equal label with the same
        value; after the delete, ``get`` must raise ``KeyError``.
        '''
        label_store.delete_all()

        stored = Label(cid1, cid2, ann, v)
        label_store.put(stored)

        fetched = label_store.get(cid1, cid2, ann)
        assert stored == fetched
        assert stored.value == fetched.value

        label_store.delete(stored)
        with pytest.raises(KeyError):
            label_store.get(cid1, cid2, ann)
Beispiel #13
0
def dict_to_label(d):
    '''Build a ``Label`` from the mapping ``d``.

    ``content_id1``, ``content_id2``, ``annotator_id`` and ``value``
    must be present (a missing one raises ``KeyError``); ``value`` is
    wrapped in ``CorefValue``.  ``subtopic_id1``, ``subtopic_id2``,
    ``epoch_ticks`` and ``rating`` are optional and are passed as
    ``None`` when absent.

    :param d: mapping of label field names to values
    :returns: the new ``Label``
    '''
    kwargs = dict(
        content_id1=d['content_id1'],
        content_id2=d['content_id2'],
        annotator_id=d['annotator_id'],
        value=CorefValue(d['value']),
    )
    # Per the original author's note, an ``epoch_ticks`` of None will
    # become time.time().
    for optional in ('subtopic_id1', 'subtopic_id2', 'epoch_ticks', 'rating'):
        kwargs[optional] = d.get(optional)
    return Label(**kwargs)
Beispiel #14
0
def test_list_short(app, label_store):
    '''``list`` with no arguments prints a single stored label.'''
    stored = Label('c1', 'c2', 'annotator', CorefValue.Positive,
                   epoch_ticks=1234567890)
    label_store.put(stored)

    app.runcmd('list', [])

    printed = app.stdout.getvalue()
    assert printed == 'c1 ==(1) c2 by annotator at 2009-02-13 23:31:30\n'
Beispiel #15
0
def dict_to_label(d):
    '''Build a ``Label`` from ``d``, normalising value types first.

    Every key of ``d`` is passed to ``Label`` as a keyword argument.
    ``unicode`` values are encoded to UTF-8 byte strings, ``int``
    values are converted with ``long``, and all other values are passed
    through unchanged.

    :param d: mapping of ``Label`` keyword names to values
    :returns: the new ``Label``
    '''
    def normalize(value):
        # A value cannot be both unicode and int, so at most one of the
        # two conversions applies.
        if isinstance(value, unicode):
            return value.encode('utf-8')
        if isinstance(value, int):
            return long(value)
        return value

    kwargs = {}
    for key, value in d.items():
        kwargs[key] = normalize(value)
    return Label(**kwargs)
Beispiel #16
0
def test_list_subtopics(app, label_store):
    '''``list`` shows subtopic ids in parentheses after each content id.'''
    stored = Label('c1', 'c2', 'a1', CorefValue.Positive,
                   epoch_ticks=1234567890,
                   subtopic_id1='s1',
                   subtopic_id2='s2')
    label_store.put(stored)

    app.runcmd('list', [])

    printed = app.stdout.getvalue()
    assert printed == 'c1(s1) ==(1) c2(s2) by a1 at 2009-02-13 23:31:30\n'
Beispiel #17
0
def label_from_truth_data_file_line(line_data):
    '''Create a label from a *parsed* truth_data_file line.

    Lines whose ``docno`` or ``passage_name`` is blank are dropped: a
    warning is logged and ``None`` is returned.  A ``grade`` that
    cannot be parsed as an integer is replaced by zero (with a
    warning).  A negative grade produces a negative coreference value
    with a rating of zero; any other grade produces a positive value
    with the grade as its rating.

    :param line_data: dict
    :returns: the new label, or ``None`` if the line was dropped
    :raises KeyError: if ``line_data`` lacks one of the fields read here
    '''
    # document data
    doc_id = line_data['docno']
    if not doc_id.strip():
        # ``warning`` replaces the deprecated ``warn`` alias; arguments
        # are passed separately so formatting only happens if the
        # record is actually emitted.
        logger.warning('dropping invalid truth data line: '
                       'bad docno: %r: %r', doc_id, line_data)
        return None

    if not line_data['passage_name'].strip():
        logger.warning('dropping empty passage: %r', line_data)
        return None

    # annotation data
    topic_id = line_data['topic_id']
    subtopic_id = line_data['subtopic_id']
    passage_id = line_data['passage_id']
    annotator = line_data['userid']

    # value data
    value = CorefValue.Positive
    try:
        rating = int(line_data['grade'])
    except ValueError:
        logger.warning('replacing bogus grade with zero = %r',
                       line_data['grade'])
        rating = 0

    if rating < 0:
        # Negative grades carry no rating of their own; they only flip
        # the coreference value.
        value = CorefValue.Negative
        rating = 0

    # meta data
    meta = {'domain_name': line_data['domain_name'],
            'domain_id': line_data['domain_id'],
            'username': line_data['username'],
            'topic_name': line_data['topic_name'],
            'topic_id': line_data['topic_id'],
            'subtopic_name': line_data['subtopic_name'],
            'passage_text': line_data['passage_name']}

    return Label(topic_id, doc_id, annotator, value,
                 subtopic_id1=subtopic_id, subtopic_id2=passage_id,
                 rating=rating, meta=meta)
Beispiel #18
0
def build_test_data(kvl):
    '''Fill a ``LabelStore`` built on ``kvl`` with a 3x3 grid of labels.

    One positive label by annotator ``'me'`` is stored for every
    (topic, subtopic) combination.  The document id is ``'doc'``
    followed by the topic index and the subtopic index.

    :param kvl: storage handle handed straight to ``LabelStore``
    '''
    topics = ['topic1', 'topic2', 'topic3']
    subtopics = ['subtopic1', 'subtopic2', 'subtopic3']
    # One row of relevance values per topic, one column per subtopic.
    relevances = [[1, 2, 3] for _ in topics]
    passage_id = '13-235' + '|' + 'some text'

    label_store = LabelStore(kvl)

    for t_idx, topic in enumerate(topics):
        for s_idx, subtopic in enumerate(subtopics):
            doc_id = 'doc%s%s' % (t_idx, s_idx)
            # NOTE(review): other call sites in this file pass
            # ``rating=``; confirm this ``Label`` accepts ``relevance``.
            label_store.put(Label(topic,
                                  doc_id,
                                  'me',
                                  CorefValue.Positive,
                                  subtopic_id1=subtopic,
                                  subtopic_id2=passage_id,
                                  relevance=relevances[t_idx][s_idx]))
Beispiel #19
0
    def _(pfx1=str_letters(length=int_(1, 8)),
          pfx2=str_letters(length=int_(1, 8)),
          sfx1=str_letters(length=int_(1, 12)),
          sfx2=str_letters(length=int_(1, 12)),
          ann=id_,
          v=coref_value):
        '''Check ``everything(prefix=...)`` against a single stored label.

        Both content ids of the label start with ``pfx1``; ``pfx2`` is
        used only as a second query prefix.
        '''
        label_store.delete_all()
        stored = Label(pfx1 + sfx1, pfx1 + sfx2, ann, v)
        label_store.put(stored)

        assert list(label_store.everything(prefix=pfx1)) == [stored]

        # Querying by ``pfx2`` is expected to find the label exactly
        # when ``pfx1`` itself starts with ``pfx2``.
        expected = [stored] if pfx1.startswith(pfx2) else []
        assert list(label_store.everything(prefix=pfx2)) == expected
Beispiel #20
0
def test_subtopic_id(cid1=id_,
                     cid2=id_,
                     s1=id_,
                     s2=id_,
                     ann=id_,
                     v=coref_value):
    '''Membership, ``other`` and ``subtopic_for`` on a label with subtopics.'''
    lab = Label(cid1, cid2, ann, v, subtopic_id1=s1, subtopic_id2=s2)

    # Each side must be found by bare content id, by (content id, None)
    # and by (content id, its subtopic id).
    for content_id, subtopic_id in ((cid1, s1), (cid2, s2)):
        assert content_id in lab
        assert (content_id, None) in lab
        assert (content_id, subtopic_id) in lab

    assert lab.other(cid1) == cid2
    assert lab.other(cid2) == cid1

    if cid1 == cid2:
        # With identical content ids, the smaller subtopic id is
        # expected back.
        assert lab.subtopic_for(cid1) == min(s1, s2)
    else:
        assert lab.subtopic_for(cid1) == s1
        assert lab.subtopic_for(cid2) == s2
Beispiel #21
0
def test_split_by_connected_component(label_store):
    '''``split_by_connected_component`` groups ids by their component.

    Three separate groups of positive labels are stored (a four-id
    cycle ``a1``..``a4``, three labels sharing ``b``, and a single
    ``c1``-``c2`` label).  The ids ``d`` and ``e`` appear in no label.
    '''
    pairs = [
        ('a1', 'a2'), ('a2', 'a3'), ('a3', 'a4'), ('a4', 'a1'),
        ('b', 'b1'), ('b', 'b2'), ('b', 'b3'),
        ('c1', 'c2'),
    ]
    labels = [Label(cid1, cid2, '', 1) for cid1, cid2 in pairs]
    # All labels go in through one variadic ``put`` call.
    label_store.put(*labels)

    ids = ['a2', 'a3', 'b1', 'b3', 'c1', 'd', 'e']
    splits = label_store.split_by_connected_component(ids)

    for expected_group in (['a2', 'a3'], ['b1', 'b3'], ['c1'], ['d'], ['e']):
        assert expected_group in splits
Beispiel #22
0
    def positive_subtopic_labels(self):
        '''Yield positive labels for items sharing a subfolder with the query.

        For every subfolder returned by ``folders.parent_subfolders``
        for the query (content id, subtopic id) pair, and for every
        item in that subfolder, this yields a manufactured positive
        label from the query to the item.  It then yields each positive
        label directly connected to the item whose subtopic for that
        item matches the item's subtopic id.
        '''
        query_cid = self.query_content_id
        query_subid = self.query_subtopic_id
        parents = list(
            self.folders.parent_subfolders((query_cid, query_subid)))

        for folder_id, subfolder_id in parents:
            members = self.folders.items(folder_id, subfolder_id)
            for item_cid, item_subid in members:
                # Since this item is in the same folder as our query, we
                # consider it a positive example. But there's no explicit
                # label for it, so manufacture one.
                #
                # TODO: Fix annotator id here. (We need to push annotator
                # information down into the search engine; the rest is
                # trivial.) ---AG
                manufactured = Label(query_cid, item_cid,
                                     Folders.DEFAULT_ANNOTATOR_ID,
                                     CorefValue.Positive,
                                     query_subid, item_subid)
                yield manufactured

                # Sometimes the user will directly attach a positive label
                # to an item in the folder. This will grab those.
                for direct in self.label_store.directly_connected(item_cid):
                    if direct.value == CorefValue.Positive:
                        if direct.subtopic_for(item_cid) == item_subid:
                            yield direct
Beispiel #23
0
    def add_item(self,
                 folder_id,
                 subfolder_id,
                 content_id,
                 subtopic_id=None,
                 ann_id=None):
        '''Add an item to a subfolder.

        The item is recorded as a positive label from the wrapped
        folder content id (with the wrapped subfolder id as its
        subtopic) to ``content_id`` / ``subtopic_id``.

        The format of ``content_id`` and ``subtopic_id`` is
        unspecified. It is application specific.

        If ``ann_id`` is set, then the item is owned by the given user.
        Otherwise, the item is owned and viewable by all anonymous
        users.

        :param str folder_id: Folder id
        :param str subfolder_id: Subfolder id
        :param str content_id: content identifier
        :param str subtopic_id: subtopic identifier
        :param str ann_id: Username
        :raises KeyError: if the store has no entry for the folder
        '''
        # Both ids go through the same validity check.
        for identifier in (folder_id, subfolder_id):
            self.assert_valid_folder_id(identifier)

        owner = self._annotator(ann_id)
        parent_cid = self.wrap_folder_content_id(owner, folder_id)
        parent_sid = self.wrap_subfolder_subtopic_id(subfolder_id)

        # Refuse to attach items to a folder the store does not have.
        if self.store.get(parent_cid) is None:
            raise KeyError(folder_id)

        item_label = Label(parent_cid,
                           content_id,
                           owner,
                           CorefValue.Positive,
                           subtopic_id1=parent_sid,
                           subtopic_id2=subtopic_id)
        self.label_store.put(item_label)
        logger.info('Added subfolder item: %r', item_label)
Beispiel #24
0
 def _(cid1=id_, cid2=id_, ann=id_, v=coref_value):
     '''After one put into an emptied store, ``everything()`` lists only it.'''
     label_store.delete_all()
     stored = Label(cid1, cid2, ann, v)
     label_store.put(stored)
     listed = list(label_store.everything())
     assert listed == [stored]
Beispiel #25
0
def test_expand(label_store):
    '''``expand`` returns all positive pairs within a connected component.

    ``a``-``b``-``c``-``d`` is a positive chain, ``a``-``e`` is
    negative and ``f``-``g`` is a separate positive pair.
    '''
    # (content_id1, content_id2, value), in construction and storage order.
    specs = (
        ('a', 'b', 1),
        ('b', 'c', 1),
        ('c', 'd', 1),
        ('a', 'e', -1),
        ('f', 'g', 1),
    )
    stored = [Label(cid1, cid2, '', value) for cid1, cid2, value in specs]
    for lab in stored:
        label_store.put(lab)

    # Every pair drawn from {a, b, c, d}, as a positive label.
    chain_pairs = (('a', 'b'), ('a', 'c'), ('a', 'd'),
                   ('b', 'c'), ('b', 'd'), ('c', 'd'))
    expected_for_a = [Label(cid1, cid2, '', 1) for cid1, cid2 in chain_pairs]

    assert frozenset(label_store.expand('a')) == frozenset(expected_for_a)
    assert len(label_store.expand('e')) == 0
    assert label_store.expand('f') == [Label('f', 'g', '', 1)]
Beispiel #26
0
def label(id1, id2, v=CorefValue.Positive, sid1=None, sid2=None):
    '''Build a ``Label`` between ``id1`` and ``id2`` by annotator ``'foo'``.

    :param id1: first content id
    :param id2: second content id
    :param v: coreference value; positive by default
    :param sid1: subtopic id passed as ``subtopic_id1``
    :param sid2: subtopic id passed as ``subtopic_id2``
    '''
    subtopics = {'subtopic_id1': sid1, 'subtopic_id2': sid2}
    return Label(id1, id2, 'foo', v, **subtopics)
Beispiel #27
0
def neg_label(id1, id2):
    '''Build a negative ``Label`` between ``id1`` and ``id2``.

    The annotator id is the empty string and no subtopics are given.
    '''
    annotator = ''
    coref = CorefValue.Negative
    return Label(id1, id2, annotator, coref)
Beispiel #28
0
def pos_label(id1, id2):
    '''Build a positive ``Label`` between ``id1`` and ``id2``.

    Annotators and subtopics are irrelevant to the callers, so the
    annotator id is the empty string and no subtopics are given.
    '''
    annotator = ''
    coref = CorefValue.Positive
    return Label(id1, id2, annotator, coref)
Beispiel #29
0
 def lab(cid1, sid1, cid2, sid2, neg=False):
     '''Build a ``Label`` by annotator ``'unknown'`` with subtopic ids.

     Note the argument order: each content id is followed by its
     subtopic id.  The label is negative when ``neg`` is true and
     positive otherwise.
     '''
     if neg:
         coref_val = CorefValue.Negative
     else:
         coref_val = CorefValue.Positive
     return Label(cid1, cid2, 'unknown', coref_val, sid1, sid2)