def build_demo_data(kvl):
    '''Fill a label store with a small hand-written demo topic.

    A ``LabelStore`` is built on top of ``kvl``.  For each of three
    subtopics a few documents (with freshly generated stream ids) are
    attached to the topic as positive labels by annotator ``'John'``.
    Every generated stream id is printed.

    :param kvl: storage handle passed straight to ``LabelStore``
    '''
    store = LabelStore(kvl)
    topic = 'where_are_aid_workers_housed_near_Monrovia'
    subtopics = ['Tanji_Fish_Curing_Site', 'Camp_Ramrod', 'Town_of_Wamba']

    def named_passage(start, name):
        # Passage ids look like '<start>-<len(name)>|<name>'.
        return '%s-%d|%s' % (start, len(name), name)

    # One list of (stream_id, subtopic_id2, rating) per subtopic, in the
    # same order as `subtopics`.
    documents = [
        [(random_sid(), named_passage('2100', subtopics[0]), 3),
         (random_sid(),
          '15-93|we_drove_out_to_the_other_side_'
          'of_the_river_delta_to_a_small_fish_smoking_camp', 2)],
        [(random_sid(), named_passage('3120', subtopics[1]), 2),
         (random_sid(), '200-217|Ramrod_(Facility)', 3)],
        [(random_sid(), named_passage('3120', subtopics[2]), 3),
         (random_sid(), '53-63|Wamba_Town', 2),
         (random_sid(), '44-50|Woomba', 1)],
    ]

    for subtopic, docs in zip(subtopics, documents):
        for stream_id, subtopic_id2, rating in docs:
            print(stream_id)
            store.put(Label(topic, stream_id, 'John', CorefValue.Positive,
                            subtopic_id1=subtopic,
                            subtopic_id2=subtopic_id2,
                            rating=rating))
def _(cid1a=id_, cid1b=id_, ann1=id_, v1=coref_value, t1=time_value,
      cid2a=id_, cid2b=id_, ann2=id_, v2=coref_value, t2=time_value):
    '''Check ``everything(content_id=...)`` against two stored labels.

    Two labels are written to an emptied store, then the labels
    reported for ``cid1a`` are compared with the expected list.
    '''
    label_store.delete_all()
    first = Label(cid1a, cid1b, ann1, v1, epoch_ticks=t1)
    second = Label(cid2a, cid2b, ann2, v2, epoch_ticks=t2)
    for stored in (first, second):
        label_store.put(stored)

    if not first.same_subject_as(second):
        if cid1a == cid2a or cid1a == cid2b:
            # Different subjects, but the second label also mentions
            # the queried content id: both are expected, sorted.
            expected = sorted([first, second])
        else:
            expected = [first]
    elif first.epoch_ticks == second.epoch_ticks:
        # Same subject and same timestamp: the later put is expected.
        expected = [second]
    else:
        # Same subject, different timestamps: only the label that
        # sorts first is expected.
        expected = sorted([first, second])[:1]

    found = list(label_store.everything(content_id=cid1a))
    assert (found == expected)
def test_list_deleted(app, label_store):
    '''``list --include-deleted`` prints every stored version of a label.

    Three versions of the same ``a1`` label (consecutive timestamps)
    and one ``a2`` label are stored; the command output is compared
    against four expected lines.
    '''
    base_ticks = 1234567890
    for offset in range(3):
        label_store.put(Label('c1', 'c2', 'a1', CorefValue.Positive,
                              epoch_ticks=base_ticks + offset))
    label_store.put(Label('c1', 'c2', 'a2', CorefValue.Negative,
                          epoch_ticks=base_ticks))

    app.runcmd('list', ['--include-deleted'])

    expected_lines = [
        'c1 ==(1) c2 by a1 at 2009-02-13 23:31:32\n',
        'c1 ==(1) c2 by a1 at 2009-02-13 23:31:31\n',
        'c1 ==(1) c2 by a1 at 2009-02-13 23:31:30\n',
        'c1 !=(0) c2 by a2 at 2009-02-13 23:31:30\n',
    ]
    assert app.stdout.getvalue() == ''.join(expected_lines)
def test_negative_inference(label_store):
    '''``negative_inference('g')`` yields the expected content id pairs.'''
    stored = [
        Label('a', 'c', '', 1),
        Label('b', 'c', '', 1),
        Label('d', 'e', '', 1),
        Label('d', 'f', '', 1),
        Label('c', 'g', '', -1),
        Label('d', 'g', '', -1),
        Label('h', 'g', '', 1),
    ]
    for lab in stored:
        label_store.put(lab)

    expected = frozenset([
        ('a', 'g'), ('b', 'g'), ('c', 'g'), ('c', 'h'),
        ('d', 'g'), ('d', 'h'), ('e', 'g'), ('f', 'g'),
    ])
    inferred = label_store.negative_inference('g')
    # Only the pair of content ids matters for this comparison.
    found = frozenset((lab.content_id1, lab.content_id2)
                      for lab in inferred)
    assert found == expected
def test_negative_label_inference(label_store):
    '''``negative_label_inference(cd)`` yields the expected pairs.'''
    positives = [
        Label('a', 'c', '', 1),
        Label('b', 'c', '', 1),
        Label('d', 'e', '', 1),
        Label('d', 'f', '', 1),
    ]
    dg = Label('d', 'g', '', -1)
    cd = Label('c', 'd', '', -1)
    fh = Label('f', 'h', '', 1)
    for lab in positives + [cd, dg, fh]:
        label_store.put(lab)

    # [but not (a,b) <-/-> (e,f)]
    expected = frozenset([
        ('a', 'd'), ('b', 'd'), ('c', 'd'),
        ('c', 'e'), ('c', 'f'), ('c', 'h'),
    ])
    inferred = label_store.negative_label_inference(cd)
    found = frozenset((lab.content_id1, lab.content_id2)
                      for lab in inferred)
    assert found == expected
def test_connected_component_many_most_recent(label_store):
    '''A newer positive label replaces a negative one and grows the
    connected component of ``'a'``.'''
    ab = Label('a', 'b', '', 1)
    bc_negative = Label('b', 'c', '', -1)
    cd = Label('c', 'd', '', 1)
    for lab in (ab, bc_negative, cd):
        label_store.put(lab)

    before = frozenset(label_store.connected_component('a'))
    assert before == frozenset([ab])

    # This label should overwrite the existing `bc` label and expand
    # the connected component to `cd` through transitivity.
    bc_positive = Label('b', 'c', '', 1,
                        epoch_ticks=bc_negative.epoch_ticks + 1)
    label_store.put(bc_positive)

    after = frozenset(label_store.connected_component('a'))
    assert after == frozenset([ab, bc_positive, cd])
def test_connected_component_many_most_recent_diff_value(label_store):
    '''A newer negative label replaces a positive one and shrinks the
    connected component of ``'a'``.'''
    ab = Label('a', 'b', '', 1)
    bc_positive = Label('b', 'c', '', 1)
    cd = Label('c', 'd', '', 1)
    for lab in (ab, bc_positive, cd):
        label_store.put(lab)

    before = frozenset(label_store.connected_component('a'))
    assert before == frozenset([ab, bc_positive, cd])

    # This label should overwrite the existing `bc` label and contract
    # the connected component to just `ab`.
    bc_negative = Label('b', 'c', '', -1,
                        epoch_ticks=bc_positive.epoch_ticks + 1)
    label_store.put(bc_negative)

    after = frozenset(label_store.connected_component('a'))
    assert after == frozenset([ab])
def test_subtopic_order(cid=id_, s1=id_, s2=id_, ann=id_, v=coref_value):
    '''A label whose two content ids are equal keeps its subtopic ids
    in ascending order.'''
    lab = Label(cid, cid, ann, v, subtopic_id1=s1, subtopic_id2=s2)

    # Membership works for a bare content id and for (cid, subtopic).
    for member in (cid, (cid, s1), (cid, s2)):
        assert member in lab

    assert lab.content_id1 == cid
    assert lab.content_id2 == cid

    low, high = lab.subtopic_id1, lab.subtopic_id2
    assert low <= high
    assert low == min(s1, s2)
    assert high == max(s1, s2)
def _(cid1a=id_, cid1b=id_, ann1=id_, v1=coref_value, t1=time_value,
      cid2a=id_, cid2b=id_, ann2=id_, v2=coref_value, t2=time_value):
    '''Check ``everything(include_deleted=True)`` against two labels.

    Both labels are expected back in sorted order, unless they share a
    subject and a timestamp, in which case only the second is expected.
    '''
    label_store.delete_all()
    first = Label(cid1a, cid1b, ann1, v1, epoch_ticks=t1)
    second = Label(cid2a, cid2b, ann2, v2, epoch_ticks=t2)
    for stored in (first, second):
        label_store.put(stored)

    same_slot = (first.same_subject_as(second)
                 and first.epoch_ticks == second.epoch_ticks)
    if same_slot:
        expected = [second]
    else:
        expected = sorted([first, second])

    found = list(label_store.everything(include_deleted=True))
    assert (found == expected)
def test_store_legacy_compatibility(label_store):
    '''A label written in the one-byte legacy encoding is still
    readable through ``label_store.get``.'''

    def legacy_put_label(label):
        # Legacy value layout: a single unsigned byte holding
        # (coref value + 1) in the low bits and the rating shifted
        # left by four.
        first_key, second_key = label_store._keys_from_label(label)
        packed = struct.pack(
            'B', (label.value.value + 1) | (label.rating << 4))
        label_store.kvl.put(label_store.TABLE,
                            (first_key, packed), (second_key, packed))

    original = Label('a', 'b', '', 1, '1', '2')
    legacy_put_label(original)

    fetched = label_store.get('a', 'b', '', subid1='1', subid2='2')
    assert original == fetched
def test_meta_storage(label_store):
    '''``meta`` entries on a label survive a put/get round trip.'''
    original = Label('a', 'b', '', 1, '1', '2')
    meta_items = (
        ('hello', 'world'),
        ('subtopic1_name', 'foo'),
        ('some_num', 5),
        ('some_datastructure', [1, 2, 3]),
    )
    for key, value in meta_items:
        original.meta[key] = value
    label_store.put(original)

    fetched = label_store.get('a', 'b', '', subid1='1', subid2='2')
    assert original == fetched
    assert original.meta == fetched.meta
def _(cid1=id_, cid2=id_, ann=id_, v=coref_value):
    '''Put, get and delete one label; a later ``get`` must raise
    ``KeyError``.'''
    label_store.delete_all()
    stored = Label(cid1, cid2, ann, v)
    label_store.put(stored)

    fetched = label_store.get(cid1, cid2, ann)
    assert stored == fetched
    # The coref value is checked separately from label equality.
    assert stored.value == fetched.value

    label_store.delete(stored)
    with pytest.raises(KeyError):
        label_store.get(cid1, cid2, ann)
def dict_to_label(d):
    '''Build a ``Label`` from a plain dictionary.

    ``content_id1``, ``content_id2``, ``annotator_id`` and ``value``
    are required keys (a missing one raises ``KeyError``); ``value`` is
    passed through ``CorefValue``.  ``subtopic_id1``, ``subtopic_id2``,
    ``epoch_ticks`` and ``rating`` are optional and default to ``None``.

    :param dict d: label fields keyed by name
    '''
    fields = {
        'content_id1': d['content_id1'],
        'content_id2': d['content_id2'],
        'annotator_id': d['annotator_id'],
        'value': CorefValue(d['value']),
    }
    # A missing `epoch_ticks` is passed as None (will become
    # time.time(), per the original author's note).
    for optional in ('subtopic_id1', 'subtopic_id2',
                     'epoch_ticks', 'rating'):
        fields[optional] = d.get(optional)
    return Label(**fields)
def test_list_short(app, label_store):
    '''``list`` with no arguments prints one line for one stored label.'''
    stored = Label('c1', 'c2', 'annotator', CorefValue.Positive,
                   epoch_ticks=1234567890)
    label_store.put(stored)

    app.runcmd('list', [])

    output = app.stdout.getvalue()
    assert (output ==
            'c1 ==(1) c2 by annotator at 2009-02-13 23:31:30\n')
def dict_to_label(d):
    '''Build a ``Label`` from a dictionary of keyword arguments.

    Values are normalised before being handed to ``Label``: ``unicode``
    strings are encoded as UTF-8 bytes and ``int`` values are converted
    to ``long``.  Every other value is passed through unchanged.

    :param dict d: keyword arguments for ``Label``
    '''
    def normalize(value):
        if isinstance(value, unicode):
            return value.encode('utf-8')
        if isinstance(value, int):
            return long(value)
        return value

    kwargs = {}
    for key, value in d.items():
        kwargs[key] = normalize(value)
    return Label(**kwargs)
def test_list_subtopics(app, label_store):
    '''``list`` shows subtopic ids in parentheses after content ids.'''
    stored = Label('c1', 'c2', 'a1', CorefValue.Positive,
                   epoch_ticks=1234567890,
                   subtopic_id1='s1', subtopic_id2='s2')
    label_store.put(stored)

    app.runcmd('list', [])

    output = app.stdout.getvalue()
    assert (output ==
            'c1(s1) ==(1) c2(s2) by a1 at 2009-02-13 23:31:30\n')
def label_from_truth_data_file_line(line_data):
    '''Create a label from a *parsed* truth_data_file line.

    The label links ``topic_id`` to ``docno``, with ``subtopic_id`` and
    ``passage_id`` as the two subtopic ids and ``userid`` as annotator.
    The grade becomes the rating; a grade that is not an integer is
    replaced by zero, and a negative grade produces a negative label
    with a rating of zero.

    :param line_data: dict
    :returns: the new ``Label``, or ``None`` when the line is dropped
      because its ``docno`` or ``passage_name`` is blank
    :raises KeyError: if a required field is missing from ``line_data``
    '''
    # document data
    doc_id = line_data['docno']
    if not doc_id.strip():
        logger.warning('dropping invalid truth data line: '
                       'bad docno: %r: %r', doc_id, line_data)
        return None

    if not line_data['passage_name'].strip():
        logger.warning('dropping empty passage: %r', line_data)
        return None

    # annotation data
    topic_id = line_data['topic_id']
    subtopic_id = line_data['subtopic_id']
    passage_id = line_data['passage_id']
    annotator = line_data['userid']

    # value data
    value = CorefValue.Positive
    try:
        rating = int(line_data['grade'])
    except ValueError:
        logger.warning('replacing bogus grade with zero = %r',
                       line_data['grade'])
        rating = 0

    if rating < 0:
        # A negative grade is stored as a negative label; the rating
        # itself is reset to zero.
        value = CorefValue.Negative
        rating = 0

    # meta data
    meta = {'domain_name': line_data['domain_name'],
            'domain_id': line_data['domain_id'],
            'username': line_data['username'],
            'topic_name': line_data['topic_name'],
            'topic_id': line_data['topic_id'],
            'subtopic_name': line_data['subtopic_name'],
            'passage_text': line_data['passage_name']}

    return Label(topic_id, doc_id, annotator, value,
                 subtopic_id1=subtopic_id, subtopic_id2=passage_id,
                 rating=rating, meta=meta)
def build_test_data(kvl):
    '''Store one positive label for every topic/subtopic combination.

    Three topics times three subtopics gives nine labels, each pointing
    at a document named ``doc<topic index><subtopic index>`` and
    annotated by ``'me'``.

    :param kvl: storage handle passed straight to ``LabelStore``
    '''
    topics = ['topic1', 'topic2', 'topic3']
    subtopics = ['subtopic1', 'subtopic2', 'subtopic3']
    relevances = [[1, 2, 3]] * 3
    offset = '13-235'
    passage_id = '|'.join([offset, 'some text'])

    store = LabelStore(kvl)
    for t_idx, topic in enumerate(topics):
        topic_relevances = relevances[t_idx]
        for s_idx, subtopic in enumerate(subtopics):
            doc_id = 'doc%s%s' % (t_idx, s_idx)
            # NOTE(review): other Label call sites pass `rating=`;
            # confirm that `relevance` is a keyword Label accepts.
            store.put(Label(topic, doc_id, 'me', CorefValue.Positive,
                            subtopic_id1=subtopic,
                            subtopic_id2=passage_id,
                            relevance=topic_relevances[s_idx]))
def _(pfx1=str_letters(length=int_(1, 8)),
      pfx2=str_letters(length=int_(1, 8)),
      sfx1=str_letters(length=int_(1, 12)),
      sfx2=str_letters(length=int_(1, 12)),
      ann=id_, v=coref_value):
    '''Check ``everything(prefix=...)`` with one stored label whose
    content ids both start with ``pfx1``.'''
    label_store.delete_all()
    cid1, cid2 = pfx1 + sfx1, pfx1 + sfx2
    stored = Label(cid1, cid2, ann, v)
    label_store.put(stored)

    assert (list(label_store.everything(prefix=pfx1))) == [stored]

    # NOTE(review): the expectation compares the two prefixes only.  A
    # `pfx2` longer than `pfx1` that still matches a full content id
    # is expected to find nothing here -- confirm that is intended.
    expected = [stored] if pfx1.startswith(pfx2) else []
    assert (list(label_store.everything(prefix=pfx2))) == expected
def test_subtopic_id(cid1=id_, cid2=id_, s1=id_, s2=id_, ann=id_,
                     v=coref_value):
    '''Membership, ``other`` and ``subtopic_for`` on a label that
    carries subtopic ids.'''
    lab = Label(cid1, cid2, ann, v, subtopic_id1=s1, subtopic_id2=s2)

    # Each side is found by bare content id, by (cid, None) and by
    # (cid, subtopic).
    for cid, sid in ((cid1, s1), (cid2, s2)):
        assert cid in lab
        assert (cid, None) in lab
        assert (cid, sid) in lab

    assert lab.other(cid1) == cid2
    assert lab.other(cid2) == cid1

    if cid1 == cid2:
        # Same content id on both sides: the smaller subtopic id is
        # the one reported.
        assert lab.subtopic_for(cid1) == min(s1, s2)
    else:
        assert lab.subtopic_for(cid1) == s1
        assert lab.subtopic_for(cid2) == s2
def test_split_by_connected_component(label_store):
    '''``split_by_connected_component`` groups ids by the component
    they fall in; ids with no labels end up alone.'''
    a_labels = [
        Label('a1', 'a2', '', 1),
        Label('a2', 'a3', '', 1),
        Label('a3', 'a4', '', 1),
        Label('a4', 'a1', '', 1),
    ]
    b_labels = [
        Label('b', 'b1', '', 1),
        Label('b', 'b2', '', 1),
        Label('b', 'b3', '', 1),
    ]
    c_label = Label('c1', 'c2', '', 1)
    label_store.put(*(a_labels + b_labels + [c_label]))

    splits = label_store.split_by_connected_component(
        ['a2', 'a3', 'b1', 'b3', 'c1', 'd', 'e'])

    for group in (['a2', 'a3'], ['b1', 'b3'], ['c1'], ['d'], ['e']):
        assert group in splits
def positive_subtopic_labels(self):
    '''Yield positive labels for items sharing a subfolder with the query.

    For every subfolder that contains the query (content id, subtopic
    id) pair, each item of that subfolder yields one manufactured
    positive label, followed by any stored positive labels directly
    connected to the item on the same subtopic.
    '''
    query_cid = self.query_content_id
    query_subid = self.query_subtopic_id
    parents = list(self.folders.parent_subfolders((query_cid, query_subid)))
    for folder_id, subfolder_id in parents:
        for item_cid, item_subid in self.folders.items(folder_id,
                                                       subfolder_id):
            # Since this item is in the same folder as our query, we
            # consider it a positive example. But there's no explicit
            # label for it, so manufacture one.
            #
            # TODO: Fix annotator id here. (We need to push annotator
            # information down into the search engine; the rest is
            # trivial.) ---AG
            yield Label(query_cid, item_cid,
                        Folders.DEFAULT_ANNOTATOR_ID,
                        CorefValue.Positive, query_subid, item_subid)

            # Sometimes the user will directly attach a positive label
            # to an item in the folder. This will grab those.
            for direct in self.label_store.directly_connected(item_cid):
                if not (direct.value == CorefValue.Positive):
                    continue
                if direct.subtopic_for(item_cid) == item_subid:
                    yield direct
def add_item(self, folder_id, subfolder_id, content_id,
             subtopic_id=None, ann_id=None):
    '''Add an item to a subfolder.

    The item is recorded as a positive label between the folder and
    the given content, with the subfolder and ``subtopic_id`` as the
    two subtopic ids.

    The format of ``content_id`` and ``subtopic_id`` is
    unspecified. It is application specific.

    If ``ann_id`` is set, then the item is owned by the given user.
    Otherwise, the item is owned and viewable by all anonymous
    users.

    :param str folder_id: Folder id
    :param str subfolder_id: Subfolder id
    :param str content_id: content identifier
    :param str subtopic_id: subtopic identifier
    :param str ann_id: Username
    :raises KeyError: if the folder is not found in the store
    '''
    for candidate in (folder_id, subfolder_id):
        self.assert_valid_folder_id(candidate)

    owner = self._annotator(ann_id)
    folder_cid = self.wrap_folder_content_id(owner, folder_id)
    subfolder_sid = self.wrap_subfolder_subtopic_id(subfolder_id)
    if self.store.get(folder_cid) is None:
        raise KeyError(folder_id)

    item_label = Label(folder_cid, content_id, owner,
                       CorefValue.Positive,
                       subtopic_id1=subfolder_sid,
                       subtopic_id2=subtopic_id)
    self.label_store.put(item_label)
    logger.info('Added subfolder item: %r', item_label)
def _(cid1=id_, cid2=id_, ann=id_, v=coref_value):
    '''After one put into an emptied store, ``everything()`` returns
    exactly that label.'''
    label_store.delete_all()
    stored = Label(cid1, cid2, ann, v)
    label_store.put(stored)
    found = list(label_store.everything())
    assert found == [stored]
def test_expand(label_store):
    '''``expand`` returns every positive pair within the component of
    the given content id.'''
    stored = [
        Label('a', 'b', '', 1),
        Label('b', 'c', '', 1),
        Label('c', 'd', '', 1),
        Label('a', 'e', '', -1),
        Label('f', 'g', '', 1),
    ]
    for lab in stored:
        label_store.put(lab)

    expected_pairs = [('a', 'b'), ('a', 'c'), ('a', 'd'),
                      ('b', 'c'), ('b', 'd'), ('c', 'd')]
    expected = frozenset(Label(left, right, '', 1)
                         for left, right in expected_pairs)
    assert frozenset(label_store.expand('a')) == expected

    # 'e' is only reached through a negative label.
    assert len(label_store.expand('e')) == 0
    assert label_store.expand('f') == [Label('f', 'g', '', 1)]
def label(id1, id2, v=CorefValue.Positive, sid1=None, sid2=None):
    '''Build a ``Label`` by annotator ``'foo'``; positive by default,
    with optional subtopic ids.'''
    subtopics = {'subtopic_id1': sid1, 'subtopic_id2': sid2}
    return Label(id1, id2, 'foo', v, **subtopics)
def neg_label(id1, id2):
    '''Build a negative ``Label`` between two content ids, with an
    empty annotator id.'''
    annotator = ''
    return Label(id1, id2, annotator, CorefValue.Negative)
def pos_label(id1, id2):
    '''Build a positive ``Label`` between two content ids.

    Annotators and subtopics are irrelevant here, so the annotator id
    is left empty and no subtopic ids are given.
    '''
    annotator = ''
    return Label(id1, id2, annotator, CorefValue.Positive)
def lab(cid1, sid1, cid2, sid2, neg=False):
    '''Build a ``Label`` by annotator ``'unknown'`` from two
    (content id, subtopic id) pairs; negative when ``neg`` is true.'''
    if neg:
        coref_val = CorefValue.Negative
    else:
        coref_val = CorefValue.Positive
    return Label(cid1, cid2, 'unknown', coref_val, sid1, sid2)