def test_non_counter_features_bad_serialize():
    with pytest.raises(SerializationError):
        FeatureCollection({'NAME': 'foobaz'})

    fc = FeatureCollection()
    fc['NAME'] = 'foobaz'
    with pytest.raises(SerializationError):
        fc.dumps()
def test_readonly(counter_type):
    fc = FeatureCollection({
        'hello': counter_type(Counter('hello')),
        'goodbye': counter_type(Counter('goodbye'))})
    fc2 = FeatureCollection({
        'hello': counter_type(Counter('hello')),
        'goodbye': counter_type(Counter('goodbye'))})
    fc.read_only = True
    with pytest.raises(ReadOnlyException):
        fc += fc2
    with pytest.raises(ReadOnlyException):
        fc -= fc2
    with pytest.raises(ReadOnlyException):
        fc *= 2
    with pytest.raises(ReadOnlyException):
        fc['woof'] = StringCounter()
    if hasattr(counter_type, 'read_only'):
        with pytest.raises(ReadOnlyException):
            fc['hello']['l'] = 3
        with pytest.raises(ReadOnlyException):
            fc['hello']['l'] += 3
    fc.read_only = False
    fc += fc2
    assert Counter(map(abs, fc['hello'].values())) == Counter({2: 3, 4: 1})
    fc -= fc2
    fc -= fc2
    assert Counter(map(abs, fc['hello'].values())) == Counter()
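# The ``counter_type`` argument used by many of these tests is not defined in
# this section; it is presumably supplied by a parametrized pytest fixture.
# The sketch below is an assumption about what that fixture could look like,
# not the project's actual definition; the real parameter list may differ.
import pytest
from collections import Counter


@pytest.fixture(params=[Counter, StringCounter])  # assumed parameter list
def counter_type(request):
    # each test runs once per counter implementation
    return request.param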
def test_ft_roundtrip():
    fc = FeatureCollection()
    fc['@NAME']['foo'].append([
        ('nltk', 5, 2),
    ])
    fc2 = FeatureCollection.loads(fc.dumps())
    assert fc['@NAME'] == fc2['@NAME']
def test_read_only_preserved_after_serialized():
    fc = FeatureCollection({'NAME': {'foo': 1, 'baz': 2}})
    fc.read_only = True
    fcnew = FeatureCollection.loads(fc.dumps())
    assert fcnew.read_only
    with pytest.raises(ReadOnlyException):
        fcnew['NAME']['foo'] += 1
def test_string_counter_serialize():
    fc = FeatureCollection()
    fc['thing1'] = StringCounter()
    fc['thing1']['foo'] += 1
    fc_str = fc.dumps()
    fc2 = FeatureCollection.loads(fc_str)
    assert fc2['thing1']['foo'] == 1
def test_serialize_deserialize(counter_type):
    ## build entity, serialize, deserialize, and verify its multisets
    ent1 = FeatureCollection()
    ent1['bow'] += counter_type(Counter(['big', 'dog']))
    ent1['bow'] += counter_type(Counter('tall building'))
    ent1['bon'] += counter_type(Counter(['Super Cat', 'Small Cat',
                                         'Tiger Fish']))
    blob = ent1.dumps()
    ent2 = FeatureCollection.loads(blob)
    assert_same_fc(ent1, ent2)
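# ``assert_same_fc`` above is a helper that is not shown in this section. A
# minimal sketch of what it might do, assuming it only needs to compare
# feature names and the key/count pairs inside each feature; the real helper
# may check more, so the name here is suffixed to mark it as a sketch.
def _assert_same_fc_sketch(fc1, fc2):
    # same set of feature names
    assert sorted(fc1.keys()) == sorted(fc2.keys())
    for name in fc1:
        # same key/count pairs within each feature
        assert dict(fc1[name]) == dict(fc2[name])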
def test_type(counter_type):
    m1 = FeatureCollection()
    m1['bow'] += counter_type(Counter(['big', 'dog']))
    assert type(m1) == FeatureCollection
    m2 = FeatureCollection()
    m2['bow'] += counter_type(Counter(['cat']))
    m1 += m2
    assert type(m1) == FeatureCollection
def test_read_only_features():
    fc = FeatureCollection({'feat': StringCounter({'foo': 1})})
    fc['feat']['foo'] += 1
    fc.read_only = True
    with pytest.raises(ReadOnlyException):
        fc['feat']['foo'] += 1
    with pytest.raises(ReadOnlyException):
        fc['feat'].pop('foo')
    with pytest.raises(ReadOnlyException):
        del fc['feat']['foo']
def test_no_bytes_allowed():
    fc = FeatureCollection({'foo': u'bar'})
    fc.dumps()  # OK!

    with pytest.raises(SerializationError):
        fc = FeatureCollection({'foo': 'bar'})

    fc = FeatureCollection()
    fc['foo'] = 'bar'
    with pytest.raises(SerializationError):
        fc.dumps()
def test_type(counter_type):
    ent1 = FeatureCollection()
    ent1['bow'] += counter_type(Counter(['big', 'dog']))
    if counter_type.__name__ == 'StringCounter':
        ent1['bow']['a'] += 1
    assert isinstance(ent1, FeatureCollection)
    ent3 = FeatureCollection()
    ent3['bow'] += counter_type(Counter(['cat']))
    ent1 += ent3
    assert isinstance(ent1, FeatureCollection)
def test_binop_no_share():
    fc1 = FeatureCollection({'NAME': {'foo': 1, 'bar': 1}})
    fc2 = FeatureCollection({'NAME': {'foo': 2, 'bar': 2}})
    fc3 = fc1 + fc2
    assert fc1['NAME']['foo'] == 1
    assert fc2['NAME']['foo'] == 2

    fc1 += fc2
    assert fc1 == fc3
    assert fc1['NAME']['foo'] == 3
    assert fc2['NAME']['foo'] == 2
def test_binop_different_no_share():
    fc1 = FeatureCollection({'FOO': {'foo': 1}})
    fc2 = FeatureCollection({'BAR': {'bar': 1}})
    result = fc1 + fc2
    expected = FeatureCollection({'FOO': {'foo': 1}, 'BAR': {'bar': 1}})
    assert result == expected

    result['BAR']['bar'] = 2
    assert fc2['BAR']['bar'] == 1
    result['FOO']['foo'] = 2
    assert fc1['FOO']['foo'] == 1
def test_fc_eq(counter_type):
    fc1 = FeatureCollection({
        'hello': counter_type(Counter('hello')),
        'goodbye': counter_type(Counter('goodbye'))})
    fc2 = FeatureCollection({
        'hello': counter_type(Counter('hello')),
        'goodbye': counter_type(Counter('goodbye'))})
    fc3 = FeatureCollection({
        'hello': counter_type(Counter('hello')),
        'goodbye': counter_type(Counter('goodbye2'))})
    assert fc1 == fc2
    assert fc1 != fc3
def test_json_serializer():
    with registry:
        registry.add('StringCounter', JsonSerializer)
        fc = FeatureCollection()
        fc['thing2'] = StringCounter(dict(hello='people'))
        fc['thing2']['another'] = 5
        fc['thing3'] = StringCounter(dict(hello='people2'))
        fc_str = fc.dumps()
        fc2 = FeatureCollection.loads(fc_str)
        assert fc2['thing2']['another'] == 5
        assert fc2['thing2']['hello'] == 'people'
        assert fc2['thing3']['hello'] == 'people2'
def test_thing_serializer():
    with registry:
        registry.add('StringCounter', ThingSerializer)
        fc = FeatureCollection()
        fc['thing1'] = Thing(json.dumps(dict(hello='people')))
        fc['thing1']['another'] = 'more'
        fc['thing1'].do_more_things()
        fc_str = fc.dumps()
        fc2 = FeatureCollection.loads(fc_str)
        assert fc2['thing1']['another'] == 'more'
        assert fc2['thing1']['hello'] == 'people'
        assert fc2['thing1']['doing'] == 'something'
def perftest_throughput_feature_collection():
    with registry:
        registry.add('StringCounter', ThingSerializer)
        fc = FeatureCollection()
        fc['thing1'] = Thing(json.dumps(dict(one_mb=' ' * 2**20)))
        fc_str = fc.dumps()
        start_time = time.time()
        num = 1000
        for i in range(num):
            fc2 = FeatureCollection.loads(fc_str)
            fc2.dumps()
        elapsed = time.time() - start_time
        rate = float(num) / elapsed
        print('%d MB in %.1f sec --> %.1f MB per sec'
              % (num, elapsed, rate))
def test_eq(counter_type):
    mc1 = FeatureCollection({
        'hello': counter_type(Counter('hello')),
        'goodbye': counter_type(Counter('goodbye'))})
    mc2 = FeatureCollection({
        'hello': counter_type(Counter('hello')),
        'goodbye': counter_type(Counter('goodbye'))})
    mc3 = FeatureCollection({
        'hello': counter_type(Counter('hello')),
        'goodbye': counter_type(Counter('goodbye2'))})
    assert mc1 == mc2
    assert mc1 != mc3
def test_default(counter_type):
    'does a FC make a new counter that adds properly'
    mc = FeatureCollection()
    assert isinstance(mc['foo'], counter_type)
    mc['foo'] += counter_type(Counter('dog'))
    assert isinstance(mc['foo'], counter_type), \
        'failed and made %s' % type(mc['foo'])
    mc['foo'] -= counter_type(Counter('dog'))
    assert isinstance(mc['foo'], counter_type), \
        'failed and made %s' % type(mc['foo'])
    if hasattr(mc['foo'], 'subtract'):
        mc['foo'].subtract(counter_type(Counter('dog')))
        assert isinstance(mc['foo'], counter_type), \
            'failed and made %s' % type(mc['foo'])
    mc['foo'] += counter_type(Counter('dog'))
    mc['foo'] += counter_type(Counter('dog'))
    assert isinstance(mc['foo'], counter_type), \
        'failed and made %s' % type(mc['foo'])
    mc['foo'] += counter_type(Counter('dog'))
    mc['foo'] += counter_type(Counter('dog cat'))
    assert Counter(map(abs, mc['foo'].values())) == Counter({1: 4, 3: 3})
def test_multiset_change(counter_type):
    ent1 = FeatureCollection()
    ent1['bow'] += counter_type(Counter(['big', 'dog']))
    ent1.pop('bow')
    assert dict(ent1.items()) == dict()

    ## can pop empty -- fails
    #ent1.pop('foo')

    ## set equal to
    test_data = ['big2', 'dog2']
    ent1['bow'] = counter_type(Counter(test_data))
    assert list(map(abs, ent1['bow'].values())) == [1, 1]
    ent1['bow'] += counter_type(Counter(test_data))
    assert list(map(abs, ent1['bow'].values())) == [2, 2]
def test_ignored():
    fc = FeatureCollection()
    fc['foo'] = 'bar'
    with pytest.raises(SerializationError):
        fc.dumps()

    fc = FeatureCollection()
    fc['_foo'] = 'bar'
    fc.dumps()  # _foo is ignored!
def test_fc_chunk():
    fc1 = FeatureCollection({'NAME': {'foo': 2, 'baz': 1}})
    fc2 = FeatureCollection({'NAME': {'foo': 4, 'baz': 2}})
    fh = StringIO()
    chunk = FeatureCollectionChunk(file_obj=fh, mode='wb')
    chunk.add(fc1)
    chunk.add(fc2)
    chunk.flush()
    blob = fh.getvalue()
    assert blob

    fh = StringIO(blob)
    chunk = FeatureCollectionChunk(file_obj=fh, mode='rb')
    rfc1, rfc2 = list(chunk)
    assert fc1 == rfc1
    assert fc2 == rfc2
def test_meta_adding(counter_type):
    mc = FeatureCollection({
        'hello': counter_type(Counter('hello')),
        'goodbye': counter_type(Counter('goodbye'))})
    mc2 = mc + mc
    assert Counter(map(abs, mc2['hello'].values())) == Counter({2: 3, 4: 1})
def test_build_from_dict(counter_type):
    mc = FeatureCollection({
        'hello': counter_type(Counter('hello')),
        'goodbye': counter_type(Counter('goodbye'))})
    assert Counter(map(abs, mc['hello'].values())) == Counter({1: 3, 2: 1})
    assert isinstance(mc['hello'], counter_type)
def test_fc_meta_adding_complex(counter_type):
    fc = FeatureCollection({
        'hello': counter_type(Counter('hello')),
        'goodbye': counter_type(Counter('goodbye'))})
    fc2 = FeatureCollection({
        'hello': counter_type(Counter('hello')),
        'goodbye': counter_type(Counter('goodbye'))})
    fc3 = fc + fc2
    assert Counter(map(abs, fc3['hello'].values())) == Counter({2: 3, 4: 1})
    fc += fc2
    assert Counter(map(abs, fc['hello'].values())) == Counter({2: 3, 4: 1})
    fc3 -= fc2
    assert Counter(map(abs, fc3['hello'].values())) == Counter({1: 3, 2: 1})
    fc3 -= fc2
    assert Counter(map(abs, fc3['hello'].values())) == Counter()
def test_meta_adding_complex(counter_type):
    mc = FeatureCollection({
        'hello': counter_type(Counter('hello')),
        'goodbye': counter_type(Counter('goodbye'))})
    mc2 = FeatureCollection({
        'hello': counter_type(Counter('hello')),
        'goodbye': counter_type(Counter('goodbye'))})
    mc3 = mc + mc2
    assert Counter(map(abs, mc3['hello'].values())) == Counter({2: 3, 4: 1})
    mc += mc2
    assert Counter(map(abs, mc['hello'].values())) == Counter({2: 3, 4: 1})

    ## isub tests
    mc3 -= mc2
    assert Counter(map(abs, mc3['hello'].values())) == Counter({1: 3, 2: 1})
    mc3 -= mc2
    assert Counter(map(abs, mc3['hello'].values())) == Counter()
def test_read_only_binop():
    fc1 = FeatureCollection({'NAME': {'foo': 1, 'bar': 1}})
    fc2 = FeatureCollection({'NAME': {'foo': 2, 'bar': 2}})
    fc1.read_only = True
    fc2.read_only = True
    result = fc1 + fc2
    expected = FeatureCollection({'NAME': {'foo': 3, 'bar': 3}})
    assert result == expected
    assert not result.read_only
def v1_fc_put(request, response, store, kvlclient, tfidf, cid):
    '''Store a single feature collection.

    The route for this endpoint is:
    ``PUT /dossier/v1/feature-collections/<content_id>``.

    ``content_id`` is the id to associate with the given feature
    collection. The feature collection should be in the request
    body serialized as JSON.

    Alternatively, if the request's ``Content-type`` is ``text/html``,
    then a feature collection is generated from the HTML. The
    generated feature collection is then returned as a JSON payload.

    Otherwise, this endpoint returns status ``201`` upon successful
    storage. An existing feature collection with id ``content_id``
    is overwritten.
    '''
    tfidf = tfidf or None
    if request.headers.get('content-type', '').startswith('text/html'):
        url = urllib.unquote(cid.split('|', 1)[1])
        fc = etl.create_fc_from_html(url, request.body.read(), tfidf=tfidf)
        logger.info('created FC for %r', cid)
        store.put([(cid, fc)])
        return fc_to_json(fc)
    else:
        fc = FeatureCollection.from_dict(json.load(request.body))
        keywords = set()
        for subid in fc:
            if subid.startswith('subtopic'):
                ty = subtopic_type(subid)
                if ty in ('text', 'manual'):
                    # get the user selected string
                    data = typed_subtopic_data(fc, subid)
                    map(keywords.add, cleanse(data).split())
                    keywords.add(cleanse(data))
        folders = Folders(kvlclient)
        for fid, sid in folders.parent_subfolders(cid):
            if not isinstance(fid, unicode):
                fid = fid.decode('utf8')
            if not isinstance(sid, unicode):
                sid = sid.decode('utf8')
            keywords.add(cleanse(fid))
            keywords.add(cleanse(sid))
        fc[u'keywords'] = StringCounter(keywords)
        store.put([(cid, fc)])
        response.status = 201
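# A hedged usage sketch of the PUT endpoint described in the docstring above,
# using the ``requests`` library. The host, port, content id, and feature
# names are made up for illustration and are not part of this codebase.
def _put_fc_example_sketch():
    import json
    import requests
    fc_json = {'NAME': {'John Smith': 1}}
    resp = requests.put(
        'http://localhost:8080/dossier/v1/feature-collections/example-cid',
        data=json.dumps(fc_json),
        headers={'Content-Type': 'application/json'})
    # a 201 response indicates the feature collection was stored
    assert resp.status_code == 201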
def add_folder(self, folder_id, ann_id=None):
    '''Add a folder.

    If ``ann_id`` is set, then the folder is owned by the given
    user. Otherwise, the folder is owned and viewable by all
    anonymous users.

    :param str folder_id: Folder id
    :param str ann_id: Username
    '''
    self.assert_valid_folder_id(folder_id)
    ann_id = self._annotator(ann_id)
    cid = self.wrap_folder_content_id(ann_id, folder_id)
    self.store.put([(cid, FeatureCollection())])
    logger.info('Added folder %r with content id %r', folder_id, cid)
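# A small usage sketch of add_folder. Constructing Folders from a kvlclient
# mirrors the call in v1_fc_put above; the folder name is made up.
def _add_folder_example_sketch(kvlclient):
    folders = Folders(kvlclient)
    # no ann_id, so the folder is owned and viewable by anonymous users
    folders.add_folder('my_research_folder')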
def forum_post_features(row):
    fc = FeatureCollection()
    for k in row['author']:
        fc['post_author_' + k] = row['author'][k]

    if 'image_urls' in row:
        fc['image_url'] = StringCounter()
        for image_url in row['image_urls']:
            fc['image_url'][image_url] += 1

    others = ['parent_id', 'thread_id', 'thread_link', 'thread_name', 'title']
    for k in others:
        if k in row:
            fc['post_' + k] = uni(row[k])
    return fc
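# An illustrative call to forum_post_features with a made-up row; the field
# names follow what the function above reads (an ``author`` dict and an
# ``image_urls`` list), but the values are invented for the example.
def _forum_post_features_example():
    row = {
        'author': {'username': 'alice'},
        'image_urls': ['http://example.com/a.png', 'http://example.com/a.png'],
    }
    fc = forum_post_features(row)
    # repeated image URLs accumulate in the StringCounter
    assert fc['image_url']['http://example.com/a.png'] == 2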
def v1_fc_put(request, response, visid_to_dbid, store, cid):
    '''Store a single feature collection.

    The route for this endpoint is:
    ``PUT /dossier/v1/feature-collections/<content_id>``.

    ``content_id`` is the id to associate with the given feature
    collection. The feature collection should be in the request
    body serialized as JSON.

    This endpoint returns status ``201`` upon successful storage.
    An existing feature collection with id ``content_id`` is
    overwritten.
    '''
    fc = FeatureCollection.from_dict(json.load(request.body))
    store.put([(visid_to_dbid(cid), fc)])
    response.status = 201