def _(cid1a=id_, cid1b=id_, ann1=id_, v1=coref_value, t1=time_value,
      cid2a=id_, cid2b=id_, ann2=id_, v2=coref_value, t2=time_value):
    """Store two arbitrary labels and check ``everything(content_id=cid1a)``.

    The expected result depends on whether the two labels share a subject
    and, if they do not, on whether the second label also touches ``cid1a``.
    """
    label_store.delete_all()
    first = Label(cid1a, cid1b, ann1, v1, epoch_ticks=t1)
    second = Label(cid2a, cid2b, ann2, v2, epoch_ticks=t2)
    for lab in (first, second):
        label_store.put(lab)

    if not first.same_subject_as(second):
        if cid1a == cid2a or cid1a == cid2b:
            # Different subjects, but both labels involve `cid1a`.
            expected = sorted([first, second])
        else:
            expected = [first]
    elif first.epoch_ticks == second.epoch_ticks:
        # Same subject and same timestamp: the second put is the one
        # expected back.
        expected = [second]
    else:
        # Same subject, different timestamps: only the label that sorts
        # first is expected.
        expected = sorted([first, second])[:1]

    found = list(label_store.everything(content_id=cid1a))
    assert found == expected
def _(cid1=id_, cid2=id_, ann=id_, v=coref_value, t=time_value):
    """Two labels differing only by timestamp: check both listing modes."""
    label_store.delete_all()
    older = Label(cid1, cid2, ann, v, epoch_ticks=t)
    newer = Label(cid1, cid2, ann, v, epoch_ticks=t + 1)
    for lab in (older, newer):
        label_store.put(lab)

    # The default listing is expected to contain only the newer label.
    current = list(label_store.everything())
    assert current == [newer]

    # With `include_deleted=True` both come back, newer first.
    history = list(label_store.everything(include_deleted=True))
    assert history == [newer, older]
def test_label_reverse_equality(cid1=id_, cid2=id_, ann=id_, v=coref_value,
                                t=time_value):
    """Swapping the two content ids must give an equal label with equal hash."""
    forward = Label(cid1, cid2, ann, v, epoch_ticks=t)
    backward = Label(cid2, cid1, ann, v, epoch_ticks=t)

    assert forward == backward
    assert hash(forward) == hash(backward)
def test_label_diff_empty():
    """Label lists differing only in ``epoch_ticks`` must produce an empty diff."""
    before = [Label('a', 'b', 'foo', 1, epoch_ticks=0)]
    after = [Label('a', 'b', 'foo', 1, epoch_ticks=1)]

    # The lists themselves are not equal ...
    assert before != after

    # ... yet the diff reports nothing to add, delete or change.
    nothing = {'add': set(), 'delete': set(), 'change': set()}
    assert diff_labels_sets(before, after) == nothing
def test_no_prefix_subtopic(label_store):
    """Looking up ``('Foo', 'Foo')`` must not also return the 'Foo Baz' label."""
    foo_bar = Label('Foo', 'Bar', '', 1, 'Foo', 'Bar')
    foobaz_bar = Label('Foo Baz', 'Bar', '', 1, 'Foo Baz', 'Bar')
    for lab in (foo_bar, foobaz_bar):
        label_store.put(lab)

    neighbours = list(label_store.directly_connected(('Foo', 'Foo')))
    assert neighbours == [foo_bar]
def test_same_subject(cid1=id_, cid2=id_, s1=id_, s2=id_, ann=id_,
                      v1=coref_value, v2=coref_value,
                      t1=time_value, t2=time_value):
    """Labels differing only in value and timestamp share a subject, both ways."""
    subtopics = dict(subtopic_id1=s1, subtopic_id2=s2)
    first = Label(cid1, cid2, ann, v1, epoch_ticks=t1, **subtopics)
    second = Label(cid1, cid2, ann, v2, epoch_ticks=t2, **subtopics)

    assert first.same_subject_as(second)
    assert second.same_subject_as(first)
def test_label_order_on_value(cid1=id_, cid2=id_, ann=id_, t=time_value,
                              v1=coref_value, v2=coref_value):
    """With everything else equal, labels must compare the way their values do."""
    lab1 = Label(cid1, cid2, ann, v1, epoch_ticks=t)
    lab2 = Label(cid1, cid2, ann, v2, epoch_ticks=t)

    if v1 < v2:
        assert lab1 < lab2
    elif v1 == v2:
        assert lab1 == lab2
    else:
        # Re-check `v1 > v2` explicitly so that incomparable values fail
        # the test instead of slipping through this branch.
        assert v1 > v2 and lab1 > lab2
def test_direct_connect_unordered(label_store):
    """``directly_connected('a')`` finds 'a' whichever side it was given on."""
    ab = Label('a', 'b', '', 1)
    ac = Label('c', 'a', '', 1)  # 'a' deliberately passed second
    bc = Label('b', 'c', '', 1)
    for lab in (ab, ac, bc):
        label_store.put(lab)

    neighbours = list(label_store.directly_connected('a'))
    assert neighbours == [ab, ac]
def test_connected_component_many_diff_value(label_store):
    """A -1 label between 'b' and 'c' keeps the component of 'a' at just ``ab``."""
    ab = Label('a', 'b', '', 1)
    bc = Label('b', 'c', '', -1)
    cd = Label('c', 'd', '', 1)
    for lab in (ab, bc, cd):
        label_store.put(lab)

    component = list(label_store.connected_component('a'))
    assert frozenset(component) == frozenset([ab])
def _(cid1=id_, cid2=id_, ann=id_, v1=coref_value, v2=coref_value):
    """After a newer label with swapped ids is stored, only it is connected."""
    label_store.delete_all()
    original = Label(cid1, cid2, ann, v1)
    # Same pair with the ids swapped, one tick later.
    replacement = Label(cid2, cid1, ann, v2,
                        epoch_ticks=original.epoch_ticks + 1)
    for lab in (original, replacement):
        label_store.put(lab)

    for cid in (cid1, cid2):
        assert list(label_store.directly_connected(cid)) == [replacement]
def test_connected_component_unordered(label_store):
    """The component of 'a' holds all three labels, whatever the id order."""
    ab = Label('a', 'b', '', 1)
    ac = Label('c', 'a', '', 1)  # 'a' deliberately passed second
    bc = Label('b', 'c', '', 1)
    for lab in (ab, ac, bc):
        label_store.put(lab)

    component = list(label_store.connected_component('a'))
    assert frozenset(component) == frozenset([ab, ac, bc])
def test_sub_connected(label_store):
    """The component of ``('a', '1')`` follows matching subtopic ids only."""
    a1b2 = Label('a', 'b', '', 1, '1', '2')
    b2c3 = Label('b', 'c', '', 1, '2', '3')
    b4c5 = Label('b', 'c', '', 1, '4', '5')
    for lab in (a1b2, b2c3, b4c5):
        label_store.put(lab)

    component = list(label_store.connected_component(('a', '1')))
    # `b4c5` is expected to be absent from the result.
    assert frozenset(component) == frozenset([a1b2, b2c3])
def test_meta_storage(label_store):
    """Values placed in ``label.meta`` must survive a put/get round trip."""
    label = Label('a', 'b', '', 1, '1', '2')
    meta_items = [
        ('hello', 'world'),
        ('subtopic1_name', 'foo'),
        ('some_num', 5),
        ('some_datastructure', [1, 2, 3]),
    ]
    for key, val in meta_items:
        label.meta[key] = val
    label_store.put(label)

    fetched = label_store.get('a', 'b', '', subid1='1', subid2='2')
    assert label == fetched
    assert label.meta == fetched.meta
def test_list_two(app, label_store):
    """The ``list`` command prints one line for each of two stored labels."""
    ticks = 1234567890
    label_store.put(
        Label('c1', 'c2', 'a1', CorefValue.Positive, epoch_ticks=ticks))
    label_store.put(
        Label('c1', 'c2', 'a2', CorefValue.Negative, epoch_ticks=ticks))

    app.runcmd('list', [])

    expected = ('c1 ==(1) c2 by a1 at 2009-02-13 23:31:30\n'
                'c1 !=(0) c2 by a2 at 2009-02-13 23:31:30\n')
    assert app.stdout.getvalue() == expected
def _(cid1=id_, cid2=id_, ann=id_, v1=coref_value, v2=coref_value):
    """``get`` must return the newer of two labels stored for the same pair."""
    label_store.delete_all()
    original = Label(cid1, cid2, ann, v1)
    # Same pair with the ids swapped, one tick later.
    replacement = Label(cid2, cid1, ann, v2,
                        epoch_ticks=original.epoch_ticks + 1)
    for lab in (original, replacement):
        label_store.put(lab)

    fetched = label_store.get(cid1, cid2, ann)
    assert fetched == replacement
    assert fetched != original
    assert fetched.value == replacement.value
def test_label_most_recent_first_unordered(cid1=id_, cid2=id_, ann=id_,
                                           v1=coref_value, v2=coref_value,
                                           t=time_value):
    """A newer label sorts before an older one, even with the ids swapped."""
    older = Label(cid1, cid2, ann, v1, epoch_ticks=t)
    newer = Label(cid2, cid1, ann, v2, epoch_ticks=t + 1)

    assert newer < older
    assert not (older == newer)
    assert sorted([older, newer]) == [newer, older]
    assert list(Label.most_recent([newer, older])) == [newer]
def test_sub_expand(label_store):
    """``expand(('a', '1'))`` returns stored labels plus the implied ``a1c3``."""
    a1b2 = Label('a', 'b', '', 1, '1', '2')
    b2c3 = Label('b', 'c', '', 1, '2', '3')
    # Stored as well, but must not appear in the subtopic expansion.
    b4c5 = Label('b', 'c', '', 1, '4', '5')
    for lab in (a1b2, b2c3, b4c5):
        label_store.put(lab)

    # Never physically written to the label table, yet part of the expansion.
    a1c3 = Label('a', 'c', '', 1, '1', '3')

    expansion = list(label_store.expand(('a', '1')))
    assert frozenset(expansion) == frozenset([a1b2, b2c3, a1c3])
def _(cid1a=id_, cid1b=id_, ann1=id_, v1=coref_value, t1=time_value,
      cid2a=id_, cid2b=id_, ann2=id_, v2=coref_value, t2=time_value):
    """Store two arbitrary labels and check ``everything(include_deleted=True)``."""
    label_store.delete_all()
    first = Label(cid1a, cid1b, ann1, v1, epoch_ticks=t1)
    second = Label(cid2a, cid2b, ann2, v2, epoch_ticks=t2)
    for lab in (first, second):
        label_store.put(lab)

    # Same subject and same timestamp: only the second put is expected
    # back. Otherwise both labels are expected, in sorted order.
    collided = (first.same_subject_as(second)
                and first.epoch_ticks == second.epoch_ticks)
    expected = [second] if collided else sorted([first, second])

    found = list(label_store.everything(include_deleted=True))
    assert found == expected
def test_sub_direct_connect(label_store):
    """``directly_connected(('a', '1'))`` only matches subtopic '1' of 'a'."""
    a1b2 = Label('a', 'b', '', 1, '1', '2')
    a1c3 = Label('a', 'c', '', 1, '1', '3')
    b2c3 = Label('b', 'c', '', 1, '2', '3')
    a4b2 = Label('a', 'b', '', 1, '4', '2')
    for lab in (a1b2, a1c3, b2c3, a4b2):
        label_store.put(lab)

    # `a4b2` touches 'a' through subtopic '4', so asking for subtopic '1'
    # must leave it out.
    neighbours = list(label_store.directly_connected(('a', '1')))
    assert neighbours == [a1b2, a1c3]
def test_connected_component_collision(label_store):
    """Ids whose xor'd hashes collide must still give the full component."""
    # Stored object hashes cannot be assumed collision-free. hash(str)
    # reacts weakly to small changes, and the usual advice of xoring the
    # field hashes together collides quickly: hash('test0') ^ hash('test1')
    # is 1, and so is hash('test2') ^ hash('test3').
    chain = [
        Label('test0', 'test1', '', 1),
        Label('test1', 'test2', '', 1),
        Label('test2', 'test3', '', 1),
    ]
    for lab in chain:
        label_store.put(lab)

    assert list(label_store.connected_component('test0')) == chain
def negative_subtopic_labels(label_store, folders, cid, subid):
    """Lazily yield negative labels for the item ``(cid, subid)``.

    Labels are produced in three passes, in this order:

    1. Stored labels directly connected to any item in a subfolder that
       contains ``(cid, subid)``, kept only when their value is
       ``CorefValue.Negative`` and their subtopic for that item matches.
    2. Newly built negative labels between ``(cid, subid)`` and every item
       in the other subfolders of the same folder(s).
    3. Newly built negative labels between ``(cid, subid)`` and every item
       in all remaining folders.
    """
    def negative_to(other_cid, other_subid):
        # TODO: Fix annotator id here. (Annotator information needs to be
        # pushed down into the search engine; the rest is trivial.) ---AG
        return Label(cid, other_cid, Folders.DEFAULT_ANNOTATOR_ID,
                     CorefValue.Negative, subid, other_subid)

    containing = list(folders.parent_subfolders((cid, subid)))

    # Pass 1: existing negative labels attached to anything that lives in
    # a containing subfolder.
    for fid, subfolder_id in containing:
        for cid2, subid2 in folders.items(fid, subfolder_id):
            for lab in label_store.directly_connected(cid2):
                is_negative = lab.value == CorefValue.Negative
                if is_negative and lab.subtopic_for(cid2) == subid2:
                    yield lab

    # Pass 2: items in sibling subfolders, staying inside the same folder
    # (topic) for now.
    #
    # `(cid, subid)` could sit in more than one subfolder, but in
    # SortingDesk `subid` is usually some kind of offset or hash, so that
    # is very unlikely. If it does happen it is a user error and we have
    # to hope the model figures it out.
    visited_fids = set()
    for fid, subfolder_id in containing:
        visited_fids.add(fid)
        for cousin_subid in folders.subfolders(fid):
            if cousin_subid == subfolder_id:
                # A subfolder is not its own cousin.
                continue
            for cid2, subid2 in folders.items(fid, cousin_subid):
                yield negative_to(cid2, subid2)

    # Pass 3: once the above is exhausted, move on to other topics.
    for other_fid in folders.folders():
        if other_fid in visited_fids:
            # Already handled as a containing folder above.
            continue
        # Everything in an unrelated folder gets a negative label.
        for other_subid in folders.subfolders(other_fid):
            for cid2, subid2 in folders.items(other_fid, other_subid):
                yield negative_to(cid2, subid2)
def v1_label_put(request, response, visid_to_dbid, config, label_hooks,
                 label_store, cid1, cid2, annotator_id):
    '''Store a single label.

    Route:
    ``PUT /dossier/v1/labels/<content_id1>/<content_id2>/<annotator_id>``.

    The two ``content_id`` values name the feature collections to
    associate; ``annotator_id`` is a string identifying the human who
    created the label.

    The request body carries the label value, which should be one of
    ``-1`` (not coreferent), ``0`` ("I don't know if they are
    coreferent") or ``1`` (coreferent).

    The optional query parameters ``subtopic_id1`` and ``subtopic_id2``
    may be given together, separately or not at all. ``subtopic_id1``
    refers to a subtopic in ``content_id1`` and ``subtopic_id2`` to a
    subtopic in ``content_id2``.

    On successful storage the response status is ``201``. Any existing
    labels with the given ids are overwritten.
    '''
    # Parse the body first so a malformed value fails before anything
    # else is looked up.
    value = CorefValue(int(request.body.read()))

    db_id1 = visid_to_dbid(cid1)
    db_id2 = visid_to_dbid(cid2)
    query = request.query
    label = Label(db_id1, db_id2, annotator_id, value,
                  subtopic_id1=query.get('subtopic_id1'),
                  subtopic_id2=query.get('subtopic_id2'))

    label_store.put(label)
    response.status = 201
def diff_labels_sets(old, new):
    """Run ``Label.diff(old, new)`` and return its three buckets as sets.

    The result has exactly the keys ``'add'``, ``'delete'`` and
    ``'change'``, each mapped to a ``set`` built from that bucket.
    """
    diff = Label.diff(old, new)
    result = {}
    for bucket in ('add', 'delete', 'change'):
        result[bucket] = set(diff[bucket])
    return result
def test_content_id_order(cid1=id_, cid2=id_, ann=id_, v=coref_value):
    """A label keeps its two content ids in sorted order."""
    lab = Label(cid1, cid2, ann, v)
    smaller, larger = min(cid1, cid2), max(cid1, cid2)

    assert cid1 in lab
    assert cid2 in lab
    assert lab.content_id1 <= lab.content_id2
    assert lab.content_id1 == smaller
    assert lab.content_id2 == larger
def test_subtopic_id(cid1=id_, cid2=id_, s1=id_, s2=id_, ann=id_,
                     v=coref_value):
    """Check membership, ``other`` and ``subtopic_for`` on a subtopic label."""
    lab = Label(cid1, cid2, ann, v, subtopic_id1=s1, subtopic_id2=s2)

    # Each side must match as a bare id, with a `None` subtopic, and with
    # its own subtopic.
    for cid, subid in ((cid1, s1), (cid2, s2)):
        assert cid in lab
        assert (cid, None) in lab
        assert (cid, subid) in lab

    assert lab.other(cid1) == cid2
    assert lab.other(cid2) == cid1

    if cid1 == cid2:
        # Both sides share one content id: the smaller subtopic is expected.
        assert lab.subtopic_for(cid1) == min(s1, s2)
    else:
        assert lab.subtopic_for(cid1) == s1
        assert lab.subtopic_for(cid2) == s2
def build_demo_data(kvl):
    """Fill a ``LabelStore`` built on ``kvl`` with a fixed set of demo labels.

    One topic with three subtopics is used. Each subtopic is linked to a
    few documents by a positive label from annotator ``'John'``, carrying
    the document's subtopic id and a rating. Every document gets a fresh
    id from ``random_sid()``, which is printed as its label is stored.

    :param kvl: storage handle passed straight to ``LabelStore``
    """
    label_store = LabelStore(kvl)
    topic = 'where_are_aid_workers_housed_near_Monrovia'
    subtopics = ['Tanji_Fish_Curing_Site', 'Camp_Ramrod', 'Town_of_Wamba']

    # Index into `subtopics` -> list of (stream id, subtopic id, rating).
    # All stream ids are drawn here, before any label is stored.
    subtopic_to_documents = {
        0: [(random_sid(),
             '2100-%d|%s' % (len(subtopics[0]), subtopics[0]), 3),
            (random_sid(),
             '15-93|we_drove_out_to_the_other_side_' +
             'of_the_river_delta_to_a_small_fish_smoking_camp', 2)],
        1: [(random_sid(),
             '3120-%d|%s' % (len(subtopics[1]), subtopics[1]), 2),
            (random_sid(), '200-217|Ramrod_(Facility)', 3)],
        2: [(random_sid(),
             '3120-%d|%s' % (len(subtopics[2]), subtopics[2]), 3),
            (random_sid(), '53-63|Wamba_Town', 2),
            (random_sid(), '44-50|Woomba', 1)],
    }

    for idx, subtopic in enumerate(subtopics):
        for stream_id, subtopic_id2, rating in subtopic_to_documents[idx]:
            # Parenthesised single-argument form: same output as the
            # Python 2 print statement, and valid syntax on Python 3.
            print(stream_id)
            label = Label(topic, stream_id, 'John', CorefValue.Positive,
                          subtopic_id1=subtopic, subtopic_id2=subtopic_id2,
                          rating=rating)
            label_store.put(label)
def _(cid1=id_, cid2=id_, ann=id_, v=coref_value):
    """A stored label can be fetched with its content ids swapped."""
    label_store.delete_all()
    stored = Label(cid1, cid2, ann, v)
    label_store.put(stored)

    fetched = label_store.get(cid2, cid1, ann)
    assert stored == fetched
    assert stored.value == fetched.value
def test_negative_label_inference(label_store):
    """Check the id pairs inferred from the negative label between 'c' and 'd'."""
    ac = Label('a', 'c', '', 1)
    bc = Label('b', 'c', '', 1)
    de = Label('d', 'e', '', 1)
    df = Label('d', 'f', '', 1)
    dg = Label('d', 'g', '', -1)
    cd = Label('c', 'd', '', -1)
    fh = Label('f', 'h', '', 1)
    for lab in (ac, bc, de, df, cd, dg, fh):
        label_store.put(lab)

    # Per the original note: (a, b) and (e, f) are not expected here.
    expected_pairs = frozenset([
        ('a', 'd'), ('b', 'd'), ('c', 'd'),
        ('c', 'e'), ('c', 'f'), ('c', 'h'),
    ])

    inferred = label_store.negative_label_inference(cd)
    found_pairs = frozenset(
        (lab.content_id1, lab.content_id2) for lab in inferred)
    assert found_pairs == expected_pairs
def test_negative_inference(label_store):
    """Check the id pairs returned by ``negative_inference('g')``."""
    ac = Label('a', 'c', '', 1)
    bc = Label('b', 'c', '', 1)
    de = Label('d', 'e', '', 1)
    df = Label('d', 'f', '', 1)
    cg = Label('c', 'g', '', -1)
    dg = Label('d', 'g', '', -1)
    hg = Label('h', 'g', '', 1)
    for lab in (ac, bc, de, df, cg, dg, hg):
        label_store.put(lab)

    expected_pairs = frozenset([
        ('a', 'g'), ('b', 'g'), ('c', 'g'), ('c', 'h'),
        ('d', 'g'), ('d', 'h'), ('e', 'g'), ('f', 'g'),
    ])

    inferred = label_store.negative_inference('g')
    found_pairs = frozenset(
        (lab.content_id1, lab.content_id2) for lab in inferred)
    assert found_pairs == expected_pairs
def test_connected_component_many_most_recent_diff_value(label_store):
    """A newer -1 label on the middle link must shrink the component."""
    ab = Label('a', 'b', '', 1)
    bc = Label('b', 'c', '', 1)
    cd = Label('c', 'd', '', 1)
    for lab in (ab, bc, cd):
        label_store.put(lab)

    before = list(label_store.connected_component('a'))
    assert frozenset(before) == frozenset([ab, bc, cd])

    # One tick later, a -1 label for the same pair replaces `bc`, which
    # should cut the component back to `ab` alone.
    bc = Label('b', 'c', '', -1, epoch_ticks=bc.epoch_ticks + 1)
    label_store.put(bc)

    after = list(label_store.connected_component('a'))
    assert frozenset(after) == frozenset([ab])
def test_connected_component_many_most_recent(label_store):
    """A newer +1 label on the middle link must grow the component."""
    ab = Label('a', 'b', '', 1)
    bc = Label('b', 'c', '', -1)
    cd = Label('c', 'd', '', 1)
    for lab in (ab, bc, cd):
        label_store.put(lab)

    before = list(label_store.connected_component('a'))
    assert frozenset(before) == frozenset([ab])

    # One tick later, a +1 label for the same pair replaces `bc`, so `cd`
    # becomes reachable through transitivity.
    bc = Label('b', 'c', '', 1, epoch_ticks=bc.epoch_ticks + 1)
    label_store.put(bc)

    after = list(label_store.connected_component('a'))
    assert frozenset(after) == frozenset([ab, bc, cd])
def test_subtopic_order(cid=id_, s1=id_, s2=id_, ann=id_, v=coref_value):
    """When both content ids are equal, the subtopic ids are kept sorted."""
    lab = Label(cid, cid, ann, v, subtopic_id1=s1, subtopic_id2=s2)
    smaller, larger = min(s1, s2), max(s1, s2)

    assert cid in lab
    assert (cid, s1) in lab
    assert (cid, s2) in lab
    assert lab.content_id1 == cid
    assert lab.content_id2 == cid
    assert lab.subtopic_id1 <= lab.subtopic_id2
    assert lab.subtopic_id1 == smaller
    assert lab.subtopic_id2 == larger
def dict_to_label(d):
    """Build a ``Label`` from a plain mapping.

    ``content_id1``, ``content_id2``, ``annotator_id`` and ``value`` are
    required keys; ``value`` is wrapped in ``CorefValue``. The keys
    ``subtopic_id1``, ``subtopic_id2``, ``epoch_ticks`` and ``rating``
    are optional and are passed as ``None`` when missing.
    """
    kwargs = {}
    for key in ('content_id1', 'content_id2', 'annotator_id'):
        kwargs[key] = d[key]
    kwargs['value'] = CorefValue(d['value'])

    # Per the original note, a `None` epoch_ticks "will become
    # time.time()"; that substitution is not visible in this function.
    for key in ('subtopic_id1', 'subtopic_id2', 'epoch_ticks', 'rating'):
        kwargs[key] = d.get(key, None)

    return Label(**kwargs)