def test_read_only_preserved_after_serialized():
    """The read_only flag must survive a dumps/loads round trip."""
    original = FeatureCollection({'NAME': {'foo': 1, 'baz': 2}})
    original.read_only = True
    restored = FeatureCollection.loads(original.dumps())
    assert restored.read_only
    # and the restored copy must actually enforce it
    with pytest.raises(ReadOnlyException):
        restored['NAME']['foo'] += 1
def test_ft_roundtrip():
    """A '@'-prefixed feature round-trips through serialization intact."""
    before = FeatureCollection()
    before['@NAME']['foo'].append([
        ('nltk', 5, 2),
    ])
    after = FeatureCollection.loads(before.dumps())
    assert before['@NAME'] == after['@NAME']
def test_non_counter_features_bad_serialize():
    """Byte-string feature values are rejected at construction and at dumps()."""
    with pytest.raises(SerializationError):
        FeatureCollection({"NAME": "foobaz"})
    coll = FeatureCollection()
    coll["NAME"] = "foobaz"
    with pytest.raises(SerializationError):
        coll.dumps()
def test_nsc_roundtrip():
    """NestedStringCounter features survive a dumps/loads round trip."""
    coll = FeatureCollection()
    coll['#testing'] = NestedStringCounter()
    coll['#testing']['foo'] = StringCounter({'foo': 1})
    coll['#testing']['bar'] = StringCounter({'foo': 2, 'bar': 1})
    blob = coll.dumps()
    assert FeatureCollection.loads(blob) == coll
def test_non_counter_features_bad_serialize():
    """Byte-string feature values fail both at construction and at dumps()."""
    with pytest.raises(SerializationError):
        FeatureCollection({'NAME': 'foobaz'})
    coll = FeatureCollection()
    coll['NAME'] = 'foobaz'
    with pytest.raises(SerializationError):
        coll.dumps()
def test_readonly(counter_type):
    """Every mutating operation must raise while read_only is set, then
    work normally once the flag is cleared."""
    def build():
        # two identical collections: one to freeze, one to mutate with
        return FeatureCollection({
            'hello': counter_type(Counter('hello')),
            'goodbye': counter_type(Counter('goodbye'))})
    frozen = build()
    other = build()
    frozen.read_only = True
    with pytest.raises(ReadOnlyException):
        frozen += other
    with pytest.raises(ReadOnlyException):
        frozen -= other
    with pytest.raises(ReadOnlyException):
        frozen *= 2
    with pytest.raises(ReadOnlyException):
        frozen['woof'] = StringCounter()
    # counters that support read_only themselves must also be frozen
    if hasattr(counter_type, 'read_only'):
        with pytest.raises(ReadOnlyException):
            frozen['hello']['l'] = 3
        with pytest.raises(ReadOnlyException):
            frozen['hello']['l'] += 3
    frozen.read_only = False
    frozen += other
    assert Counter(map(abs, frozen['hello'].values())) == Counter({2: 3, 4: 1})
    frozen -= other
    frozen -= other
    assert Counter(map(abs, frozen['hello'].values())) == Counter()
def test_string_counter_serialize():
    """A StringCounter feature round-trips through dumps/loads."""
    before = FeatureCollection()
    before['thing1'] = StringCounter()
    before['thing1']['foo'] += 1
    after = FeatureCollection.loads(before.dumps())
    assert after['thing1']['foo'] == 1
def test_serialize_deserialize(counter_type):
    """Build an entity, round-trip it, and verify all of its multisets."""
    before = FeatureCollection()
    before["bow"] += counter_type(Counter(["big", "dog"]))
    before["bow"] += counter_type(Counter("tall building"))
    before["bon"] += counter_type(Counter(["Super Cat", "Small Cat", "Tiger Fish"]))
    after = FeatureCollection.loads(before.dumps())
    assert_same_fc(before, after)
def test_serialize_deserialize(counter_type):
    """Build an entity, round-trip it, and verify all of its multisets."""
    before = FeatureCollection()
    before['bow'] += counter_type(Counter(['big', 'dog']))
    before['bow'] += counter_type(Counter('tall building'))
    before['bon'] += counter_type(Counter(['Super Cat', 'Small Cat', 'Tiger Fish']))
    after = FeatureCollection.loads(before.dumps())
    assert_same_fc(before, after)
def test_read_only_features():
    """Freezing the collection must also freeze its feature counters."""
    fc = FeatureCollection({'feat': StringCounter({'foo': 1})})
    fc['feat']['foo'] += 1  # writable before the flag is set
    fc.read_only = True
    with pytest.raises(ReadOnlyException):
        fc['feat']['foo'] += 1
    with pytest.raises(ReadOnlyException):
        fc['feat'].pop('foo')
    with pytest.raises(ReadOnlyException):
        del fc['feat']['foo']
def test_read_only_binop():
    """Adding two frozen collections produces a new, writable result."""
    lhs = FeatureCollection({'NAME': {'foo': 1, 'bar': 1}})
    rhs = FeatureCollection({'NAME': {'foo': 2, 'bar': 2}})
    lhs.read_only = True
    rhs.read_only = True
    summed = lhs + rhs
    assert summed == FeatureCollection({'NAME': {'foo': 3, 'bar': 3}})
    assert not summed.read_only
def test_thing_serializer():
    """A custom Thing serializer registered under 'StringCounter'
    round-trips feature data, including mutations made via do_more_things()."""
    with registry:
        registry.add('StringCounter', ThingSerializer)
        before = FeatureCollection()
        before['thing1'] = Thing(json.dumps(dict(hello='people')))
        before['thing1']['another'] = 'more'
        before['thing1'].do_more_things()
        after = FeatureCollection.loads(before.dumps())
        assert after['thing1']['another'] == 'more'
        assert after['thing1']['hello'] == 'people'
        assert after['thing1']['doing'] == 'something'
def test_json_serializer():
    """StringCounter features serialize correctly via the JSON serializer."""
    with registry:
        registry.add('StringCounter', JsonSerializer)
        before = FeatureCollection()
        before['thing2'] = StringCounter(dict(hello='people'))
        before['thing2']['another'] = 5
        before['thing3'] = StringCounter(dict(hello='people2'))
        after = FeatureCollection.loads(before.dumps())
        assert after['thing2']['another'] == 5
        assert after['thing2']['hello'] == 'people'
        assert after['thing3']['hello'] == 'people2'
def perftest_throughput_feature_collection():
    """Benchmark loads+dumps throughput on a 1 MB feature collection."""
    with registry:
        registry.add('StringCounter', ThingSerializer)
        fc = FeatureCollection()
        fc['thing1'] = Thing(json.dumps(dict(one_mb=' ' * 2**20)))
        payload = fc.dumps()
        start_time = time.time()
        num = 1000
        # each iteration round-trips ~1 MB, so `num` iterations ~= num MB
        for _ in range(num):
            FeatureCollection.loads(payload).dumps()
        elapsed = time.time() - start_time
        rate = float(num) / elapsed
        print('%d MB in %.1f sec --> %.1f MB per sec' % (num, elapsed, rate))
def test_multiset_change(counter_type):
    """pop() removes a feature; assignment and += behave like Counter math."""
    ent = FeatureCollection()
    ent['bow'] += counter_type(Counter(['big', 'dog']))
    ent.pop('bow')
    assert dict(ent.items()) == dict()
    ## can pop empty -- fails
    #ent.pop('foo')
    ## set equal to
    words = ['big2', 'dog2']
    ent['bow'] = counter_type(Counter(words))
    assert list(ent['bow'].values()) == [1, 1]
    ent['bow'] += counter_type(Counter(words))
    assert list(ent['bow'].values()) == [2, 2]
def test_read_only():
    """Collection-level mutations must all raise while read_only is set."""
    delta = FeatureCollection({'feat': {'foo': 1}})
    frozen = FeatureCollection()
    frozen['feat']['foo'] += 1
    frozen.read_only = True
    with pytest.raises(ReadOnlyException):
        frozen += delta
    with pytest.raises(ReadOnlyException):
        frozen -= delta
    # NOTE(review): `-=` is checked twice; the sibling test_readonly also
    # checks `*= 2` here, so this is probably a copy-paste slip -- confirm.
    with pytest.raises(ReadOnlyException):
        frozen -= delta
    with pytest.raises(ReadOnlyException):
        del frozen['feat']
    with pytest.raises(ReadOnlyException):
        frozen.pop('feat')
def cbor_iter(fh):
    """Yield a FeatureCollection for each consecutive CBOR value in ``fh``.

    Stops cleanly when the underlying stream is exhausted (EOFError).
    """
    while True:
        try:
            raw = cbor.load(fh)
        except EOFError:
            return
        yield FeatureCollection.from_dict(raw)
def test_multiset_change(counter_type):
    """pop() removes a feature; assignment and += behave like Counter math."""
    ent = FeatureCollection()
    ent["bow"] += counter_type(Counter(["big", "dog"]))
    ent.pop("bow")
    assert dict(ent.items()) == dict()
    ## can pop empty -- fails
    # ent.pop('foo')
    ## set equal to
    words = ["big2", "dog2"]
    ent["bow"] = counter_type(Counter(words))
    assert list(map(abs, ent["bow"].values())) == [1, 1]
    ent["bow"] += counter_type(Counter(words))
    assert list(map(abs, ent["bow"].values())) == [2, 2]
def test_add_facets():
    """add_facets inverts feature strings into a facet -> [content id] map."""
    cid1 = 'cid1'
    cid2 = 'cid2'
    fc1 = FeatureCollection()
    fc1['bowNP_sip'] = StringCounter([u'elephant', u'car'])
    fc2 = FeatureCollection()
    fc2['bowNP_sip'] = StringCounter([u'car', u'green'])
    enriched = mod_pairwise.add_facets(
        {'results': [(cid1, fc1), (cid2, fc2)]})
    assert 'facets' in enriched
    expected = {
        'elephant': [cid1],
        'car': [cid1, cid2],
        'green': [cid2],
    }
    assert enriched['facets'] == expected
def test_no_bytes_allowed():
    """Unicode feature values serialize; byte strings are rejected."""
    ok = FeatureCollection({'foo': u'bar'})
    ok.dumps()  # OK!
    with pytest.raises(SerializationError):
        ok = FeatureCollection({'foo': 'bar'})
    bad = FeatureCollection()
    bad['foo'] = 'bar'
    with pytest.raises(SerializationError):
        bad.dumps()
def test_entity(counter_type):
    """Build an entity, double it in place, then verify serialization.

    Checks that ``+=`` recurses into feature counters and that
    dumps/loads preserves the whole collection.
    """
    ## build entity, serialize, deserialize, and verify its multisets
    fc1 = FeatureCollection()
    fc1["bow"] += counter_type(Counter(["big", "dog"]))
    fc1["bow"] += counter_type(Counter("tall building"))
    fc1["bon"] += counter_type(Counter(["Super Cat", "Small Cat", "Tiger Fish"]))
    ## there should be ten items of size 1: the two words plus the eight
    ## characters of "tall building" that occur exactly once
    ## (original comment said "nine", contradicting the assertion below)
    assert Counter(map(abs, fc1["bow"].values()))[1] == 10, fc1["bow"].items()
    ## double the counts, should recurse down
    fc1 += fc1
    ## check values doubled
    assert Counter(map(abs, fc1["bow"].values()))[2] == 10, fc1["bow"].items()
    ## serialize/deserialize it
    blob = fc1.dumps()
    assert_same_fc(fc1, FeatureCollection.loads(blob))
    ## deserialize it via chunk
    fc2 = FeatureCollection.loads(fc1.dumps())
    assert_same_fc(fc1, fc2)
def v1_fc_put(request, response, store, kvlclient, tfidf, cid):
    '''Store a single feature collection.

    The route for this endpoint is:
    ``PUT /dossier/v1/feature-collections/<content_id>``.

    ``content_id`` is the id to associate with the given feature
    collection. The feature collection should be in the request
    body serialized as JSON.

    Alternatively, if the request's ``Content-type`` is ``text/html``,
    then a feature collection is generated from the HTML. The generated
    feature collection is then returned as a JSON payload.

    This endpoint returns status ``201`` upon successful storage.
    An existing feature collection with id ``content_id`` is
    overwritten.
    '''
    tfidf = tfidf or None
    if request.headers.get('content-type', '').startswith('text/html'):
        # HTML branch: derive the URL from the content id (format
        # appears to be '<prefix>|<url>' -- confirm against callers),
        # build an FC from the page, store it, and echo it back as JSON.
        url = urllib.unquote(cid.split('|', 1)[1])
        fc = etl.create_fc_from_html(url, request.body.read(), tfidf=tfidf)
        logger.info('created FC for %r', cid)
        store.put([(cid, fc)])
        return fc_to_json(fc)
    else:
        # JSON branch: deserialize the FC from the request body.
        fc = FeatureCollection.from_dict(json.load(request.body))
        # Collect cleansed keyword strings from user-supplied
        # text/manual subtopic features.
        keywords = set()
        for subid in fc:
            if subid.startswith('subtopic'):
                ty = subtopic_type(subid)
                if ty in ('text', 'manual'):
                    # get the user selected string
                    data = typed_subtopic_data(fc, subid)
                    # NOTE(review): map() is used for its side effect;
                    # this only works on Python 2 (py3 map is lazy).
                    map(keywords.add, cleanse(data).split())
                    keywords.add(cleanse(data))
        # Also index the names of every folder/subfolder containing
        # this content id, normalized to unicode.
        folders = Folders(kvlclient)
        for fid, sid in folders.parent_subfolders(cid):
            if not isinstance(fid, unicode):
                fid = fid.decode('utf8')
            if not isinstance(sid, unicode):
                sid = sid.decode('utf8')
            keywords.add(cleanse(fid))
            keywords.add(cleanse(sid))
        fc[u'keywords'] = StringCounter(keywords)
        store.put([(cid, fc)])
        response.status = 201
def test_entity(counter_type):
    """Build an entity, double it in place, then verify serialization.

    Checks that ``+=`` recurses into feature counters and that
    dumps/loads preserves the whole collection.
    """
    ## build entity, serialize, deserialize, and verify its multisets
    fc1 = FeatureCollection()
    fc1['bow'] += counter_type(Counter(['big', 'dog']))
    fc1['bow'] += counter_type(Counter('tall building'))
    fc1['bon'] += counter_type(Counter(['Super Cat', 'Small Cat', 'Tiger Fish']))
    ## there should be ten items of size 1: the two words plus the eight
    ## characters of "tall building" that occur exactly once
    ## (original comment said "nine", contradicting the assertion below)
    assert Counter(fc1['bow'].values())[1] == 10, fc1['bow'].items()
    ## double the counts, should recurse down
    fc1 += fc1
    ## check values doubled
    assert Counter(fc1['bow'].values())[2] == 10, fc1['bow'].items()
    ## serialize/deserialize it
    blob = fc1.dumps()
    assert_same_fc(fc1, FeatureCollection.loads(blob))
    ## deserialize it via chunk
    fc2 = FeatureCollection.loads(fc1.dumps())
    assert_same_fc(fc1, fc2)
def test_ignored():
    """Underscore-prefixed features are skipped during serialization."""
    bad = FeatureCollection()
    bad['foo'] = 'bar'
    with pytest.raises(SerializationError):
        bad.dumps()
    ok = FeatureCollection()
    ok['_foo'] = 'bar'
    ok.dumps()  # _foo is ignored!
def test_vectorizable_features():
    '''Make sure we only do learning on the right features.

    The "right" features means features that can be vectorized by
    sklearn. Translation: they must be instances of
    collections.Mapping.
    '''
    fc = FeatureCollection({
        u'yes': {'fubar': 1},
        u'no': u'haha',
    })
    assert mod_pairwise.vectorizable_features([fc]) == ['yes']
def forum_post_features(row):
    """Build a FeatureCollection from a forum-post row dict.

    Copies author attributes, counts image URLs, and carries over a
    fixed set of scalar post fields when present.
    """
    fc = FeatureCollection()
    # one feature per author attribute
    for key in row['author']:
        fc['post_author_' + key] = row['author'][key]
    if 'image_urls' in row:
        fc['image_url'] = StringCounter()
        for image_url in row['image_urls']:
            fc['image_url'][image_url] += 1
    # optional scalar fields, normalized to unicode
    for key in ('parent_id', 'thread_id', 'thread_link', 'thread_name',
                'title'):
        if key in row:
            fc['post_' + key] = uni(row[key])
    return fc
def add_folder(self, folder_id, ann_id=None):
    '''Add a folder.

    If ``ann_id`` is set, then the folder is owned by the given
    user. Otherwise, the folder is owned and viewable by all
    anonymous users.

    :param str folder_id: Folder id
    :param str ann_id: Username
    '''
    self.assert_valid_folder_id(folder_id)
    owner = self._annotator(ann_id)
    content_id = self.wrap_folder_content_id(owner, folder_id)
    # an empty FeatureCollection marks the folder's existence
    self.store.put([(content_id, FeatureCollection())])
    logger.info('Added folder %r with content id %r', folder_id, content_id)
def v1_fc_put(request, response, visid_to_dbid, store, cid):
    '''Store a single feature collection.

    The route for this endpoint is:
    ``PUT /dossier/v1/feature-collections/<content_id>``.

    ``content_id`` is the id to associate with the given feature
    collection. The feature collection should be in the request
    body serialized as JSON.

    This endpoint returns status ``201`` upon successful storage.
    An existing feature collection with id ``content_id`` is
    overwritten.
    '''
    payload = json.load(request.body)
    store.put([(visid_to_dbid(cid), FeatureCollection.from_dict(payload))])
    response.status = 201
def v1_fc_put(request, response, visid_to_dbid, store, cid):
    """Store a single feature collection.

    The route for this endpoint is:
    ``PUT /dossier/v1/feature-collections/<content_id>``.

    ``content_id`` is the id to associate with the given feature
    collection. The feature collection should be in the request
    body serialized as JSON.

    This endpoint returns status ``201`` upon successful storage.
    An existing feature collection with id ``content_id`` is
    overwritten.
    """
    payload = json.load(request.body)
    store.put([(visid_to_dbid(cid), FeatureCollection.from_dict(payload))])
    response.status = 201
def test_non_counter_features_serialize():
    """A unicode-valued feature survives dumps/loads without error."""
    fc = FeatureCollection({'NAME': u'foobaz'})
    fc = FeatureCollection.loads(fc.dumps())
def counter_fc(bow):
    """Wrap a bag-of-words counter in a single-feature collection."""
    fc = FeatureCollection({u'feature': bow})
    return fc
def fc_loads(raw):
    """Decompress a snappy blob and deserialize it into a FeatureCollection."""
    blob = snappy.decompress(raw)
    return FeatureCollection.loads(blob)
def test_read_only_not_preserved_via_dict():
    """to_dict() drops the read_only flag, so a rebuilt copy is writable."""
    frozen = FeatureCollection({'NAME': {'foo': 1, 'baz': 2}})
    frozen.read_only = True
    rebuilt = FeatureCollection(frozen.to_dict())
    assert not rebuilt.read_only
    rebuilt['NAME']['foo'] += 1  # must not raise
def test_identity():
    """Missing features on a read-only collection share one default object.

    The original body computed ``id(fc['one']) == id(fc['two'])`` but
    discarded the result, so the test asserted nothing; the comparison
    is now asserted.
    """
    fc = FeatureCollection()
    fc.read_only = True
    assert id(fc['one']) == id(fc['two'])
# !!! IMPORTANT !!! # Define features that you want to index. This will let you quickly scan # for feature collections in the database with matching values. # # You don't have to index everything, but it's probably a good idea to index # the most prominent features. e.g., phone or email or website. # # These should correspond to the names of the corresponding features. feature_indexes = [u'phone', u'email', u'website', u'rate'] # Create a "store," which knows how to store and index feature collections. store = Store(conn, feature_indexes=feature_indexes) # Create a fresh feature collection and add a 'rate' feature. fc = FeatureCollection() fc['rate'] = StringCounter({ u'5per30': 5, u'5per60': 1, u'10per20': 2, }) # Content ids are the unique identifier for each feature collection. # It's probably sufficient to use whatever you have for "ad id." content_id = 'some_unique_value' store.put([(content_id, fc)]) print store.get(content_id) # Use the index scan! print list(store.index_scan_prefix(u'rate', '10'))
def example_fc():
    """Return a minimal collection holding the example visible text."""
    fc = FeatureCollection()
    fc[u'meta_clean_visible'] = example_text
    return fc
def html_to_fc(html=None, clean_html=None, clean_visible=None, encoding=None,
               url=None, timestamp=None, other_features=None):
    '''Build a FeatureCollection from raw or pre-cleaned HTML.

    `html` is expected to be a raw string received over the wire from a
    remote webserver, and `encoding`, if provided, is used to decode
    it. Typically, encoding comes from the Content-Type header field.
    The :func:`~streamcorpus_pipeline._clean_html.make_clean_html`
    function handles character encodings.

    Returns the populated FeatureCollection, or None when raw HTML
    cleaning fails.
    '''
    def add_feature(name, xs):
        # Accumulate into a StringCounter, creating the feature on
        # demand. NOTE: closes over `fc`, which is assigned *below*;
        # this works because add_feature is only called after that.
        if name not in fc:
            fc[name] = StringCounter()
        fc[name] += StringCounter(xs)
    timestamp = timestamp or int(time.time() * 1000)
    other_features = other_features or {}
    # Normalize the three text inputs: derive clean_html from raw html
    # when needed, and clean_visible from clean_html when needed.
    if clean_html is None:
        if html is not None:
            try:
                clean_html_utf8 = make_clean_html(html, encoding=encoding)
            # NOTE(review): bare except deliberately drops the document
            # on *any* cleaning failure -- the function returns None.
            except:
                logger.warn('dropping doc because:', exc_info=True)
                return
            clean_html = clean_html_utf8.decode('utf-8')
        else:
            clean_html_utf8 = u''
            clean_html = u''
    else:
        clean_html_utf8 = u''
    if clean_visible is None or len(clean_visible) == 0:
        clean_visible = make_clean_visible(clean_html_utf8).decode('utf-8')
    elif isinstance(clean_visible, str):
        # Python 2: promote byte strings to unicode.
        clean_visible = clean_visible.decode('utf-8')
    fc = FeatureCollection()
    fc[u'meta_raw'] = html and uni(html, encoding) or u''
    fc[u'meta_clean_html'] = clean_html
    fc[u'meta_clean_visible'] = clean_visible
    fc[u'meta_timestamp'] = unicode(timestamp)
    url = url or u''
    fc[u'meta_url'] = uni(url)
    # Extract contact-style tokens from the visible text.
    add_feature(u'icq', features.ICQs(clean_visible))
    add_feature(u'skype', features.skypes(clean_visible))
    add_feature(u'phone', features.phones(clean_visible))
    add_feature(u'email', features.emails(clean_visible))
    bowNP, normalizations = features.noun_phrases(
        cleanse(clean_visible), included_unnormalized=True)
    add_feature(u'bowNP', bowNP)
    bowNP_unnorm = chain(*normalizations.values())
    add_feature(u'bowNP_unnorm', bowNP_unnorm)
    # URL-derived features come from the cleaned HTML, not visible text.
    add_feature(u'image_url', features.image_urls(clean_html))
    add_feature(u'a_url', features.a_urls(clean_html))
    ## get parsed versions, extract usernames
    fc[u'img_url_path_dirs'] = features.path_dirs(fc[u'image_url'])
    fc[u'img_url_hostnames'] = features.host_names(fc[u'image_url'])
    fc[u'usernames'] = features.usernames(fc[u'image_url'])
    fc[u'a_url_path_dirs'] = features.path_dirs(fc[u'a_url'])
    fc[u'a_url_hostnames'] = features.host_names(fc[u'a_url'])
    fc[u'usernames'] += features.usernames(fc[u'a_url'])
    #fc[u'usernames'] += features.usernames2(
    #    fc[u'meta_clean_visible'])
    # beginning of treating this as a pipeline...
    xform = features.entity_names()
    fc = xform.process(fc)
    # Merge any caller-supplied extra features.
    for feat_name, feat_val in other_features.iteritems():
        fc[feat_name] += StringCounter(feat_val)
    return fc
def test_random_no_name_index(store):  # noqa
    """The random search engine should run against a store with one FC."""
    store.put([('foo', FeatureCollection({u'NAME': {'bar': 1}}))])
    # just make sure it runs
    search_engines.random(store).set_query_id('foo').results()
def test_get():
    """get() returns the supplied default for a missing feature."""
    empty = FeatureCollection()
    assert empty.get("nada", 5) == 5
def test_fc_get(store):  # noqa
    """Fetching a stored collection by visible id returns its features."""
    store.put([(visid_to_dbid('abc'), FeatureCollection({'foo': {'a': 1}}))])
    fetched = routes.v1_fc_get(dbid_to_visid, store, 'abc')
    assert fetched['foo']['a'] == 1
def test_non_counter_features_total():
    """Non-counter (unicode) features contribute nothing to total()."""
    fc = FeatureCollection({'NAME': u'foobaz'})
    assert fc.total() == 0
def make_fc(text):
    """Return a collection whose '#nilsimsa_all' feature holds the
    nilsimsa hash of ``text``."""
    digest = nilsimsa_hash(text)
    fc = FeatureCollection()
    fc['#nilsimsa_all'] = StringCounter([digest])
    return fc