def test_read_only_preserved_after_serialized():
    '''The ``read_only`` flag is still set after a dumps/loads round trip.'''
    original = FeatureCollection({'NAME': {'foo': 1, 'baz': 2}})
    original.read_only = True

    # Round-trip through the serialized form.
    blob = original.dumps()
    restored = FeatureCollection.loads(blob)

    assert restored.read_only
    # The nested feature has to reject writes too, not just the collection.
    with pytest.raises(ReadOnlyException):
        restored['NAME']['foo'] += 1
def test_ft_roundtrip():
    '''An ``@``-prefixed feature compares equal after dumps/loads.'''
    source = FeatureCollection()
    entry = [('nltk', 5, 2)]
    source['@NAME']['foo'].append(entry)

    restored = FeatureCollection.loads(source.dumps())
    assert source['@NAME'] == restored['@NAME']
def test_non_counter_features_bad_serialize():
    '''A plain ``str`` feature value raises SerializationError.

    The error is expected both when the value is handed to the
    constructor and, for item assignment, when the collection is
    dumped.
    '''
    # Passed to the constructor.
    with pytest.raises(SerializationError):
        FeatureCollection({"NAME": "foobaz"})

    # Assigned after construction; the failure shows up at dump time.
    collection = FeatureCollection()
    collection["NAME"] = "foobaz"
    with pytest.raises(SerializationError):
        collection.dumps()
def test_nsc_roundtrip():
    '''A NestedStringCounter feature compares equal after dumps/loads.'''
    fc = FeatureCollection()
    fc['#testing'] = NestedStringCounter()

    # Fill the nested counter one inner StringCounter at a time.
    inner_counts = (
        ('foo', {'foo': 1}),
        ('bar', {'foo': 2, 'bar': 1}),
    )
    for key, counts in inner_counts:
        fc['#testing'][key] = StringCounter(counts)

    assert FeatureCollection.loads(fc.dumps()) == fc
def test_non_counter_features_bad_serialize():
    '''Plain ``str`` feature values must be rejected with SerializationError.'''
    # Case 1: the bad value is given directly to the constructor.
    with pytest.raises(SerializationError):
        FeatureCollection({'NAME': 'foobaz'})

    # Case 2: the bad value is assigned afterwards and only fails on dump.
    assigned = FeatureCollection()
    assigned['NAME'] = 'foobaz'
    with pytest.raises(SerializationError):
        assigned.dumps()
def test_readonly(counter_type):
    '''Check that ``read_only`` blocks mutation and can be switched off.

    While the flag is set, ``+=``, ``-=``, ``*=`` and item assignment on
    the collection must raise ReadOnlyException; counter types that have
    a ``read_only`` attribute must reject writes to their entries too.
    Once the flag is cleared, in-place arithmetic works again.
    '''
    def build_fc():
        # Two features keyed by the letters of each word.
        return FeatureCollection({
            'hello': counter_type(Counter('hello')),
            'goodbye': counter_type(Counter('goodbye'))})

    def magnitude_histogram(collection):
        # Histogram of the absolute counts stored under 'hello'.
        return Counter(abs(v) for v in collection['hello'].values())

    fc = build_fc()
    fc2 = build_fc()
    fc.read_only = True

    # Collection-level mutation.
    with pytest.raises(ReadOnlyException):
        fc += fc2
    with pytest.raises(ReadOnlyException):
        fc -= fc2
    with pytest.raises(ReadOnlyException):
        fc *= 2
    with pytest.raises(ReadOnlyException):
        fc['woof'] = StringCounter()

    # Feature-level mutation, only for counter types exposing read_only.
    if hasattr(counter_type, 'read_only'):
        with pytest.raises(ReadOnlyException):
            fc['hello']['l'] = 3
        with pytest.raises(ReadOnlyException):
            fc['hello']['l'] += 3

    # Writable again: add once, subtract twice.
    fc.read_only = False
    fc += fc2
    assert magnitude_histogram(fc) == Counter({2: 3, 4: 1})
    fc -= fc2
    fc -= fc2
    assert magnitude_histogram(fc) == Counter()
def test_string_counter_serialize():
    '''A StringCounter count is preserved across dumps/loads.'''
    original = FeatureCollection()
    original['thing1'] = StringCounter()
    original['thing1']['foo'] += 1

    restored = FeatureCollection.loads(original.dumps())
    assert restored['thing1']['foo'] == 1
def test_serialize_deserialize(counter_type):
    '''Round-trip a collection and compare it with ``assert_same_fc``.'''
    original = FeatureCollection()

    # "bow" is accumulated from a word list and then from the characters
    # of a string.
    for part in (["big", "dog"], "tall building"):
        original["bow"] += counter_type(Counter(part))
    names = ["Super Cat", "Small Cat", "Tiger Fish"]
    original["bon"] += counter_type(Counter(names))

    restored = FeatureCollection.loads(original.dumps())
    assert_same_fc(original, restored)
def test_serialize_deserialize(counter_type):
    '''Serialize a populated collection, load it back, and compare the two.'''
    def as_counter(items):
        # Wrap a plain Counter in the counter type under test.
        return counter_type(Counter(items))

    before = FeatureCollection()
    before['bow'] += as_counter(['big', 'dog'])
    before['bow'] += as_counter('tall building')
    before['bon'] += as_counter(['Super Cat', 'Small Cat', 'Tiger Fish'])

    serialized = before.dumps()
    after = FeatureCollection.loads(serialized)
    assert_same_fc(before, after)
Example #10
0
def test_read_only_features():
    '''Features of a read-only collection reject ``+=``, ``pop`` and ``del``.'''
    collection = FeatureCollection({'feat': StringCounter({'foo': 1})})
    # Still writable at this point, so this increment must succeed.
    collection['feat']['foo'] += 1

    collection.read_only = True

    with pytest.raises(ReadOnlyException):
        collection['feat']['foo'] += 1

    with pytest.raises(ReadOnlyException):
        collection['feat'].pop('foo')

    with pytest.raises(ReadOnlyException):
        del collection['feat']['foo']
Example #11
0
def test_read_only_binop():
    '''Adding two read-only collections yields a sum that is not read-only.'''
    operands = [FeatureCollection({'NAME': {'foo': n, 'bar': n}})
                for n in (1, 2)]
    for operand in operands:
        operand.read_only = True

    left, right = operands
    total = left + right

    assert total == FeatureCollection({'NAME': {'foo': 3, 'bar': 3}})
    assert not total.read_only
def test_thing_serializer():
    '''Round-trip a ``Thing`` feature with ThingSerializer registered.'''
    with registry:
        registry.add('StringCounter', ThingSerializer)

        payload = json.dumps({'hello': 'people'})
        original = FeatureCollection()
        original['thing1'] = Thing(payload)
        original['thing1']['another'] = 'more'
        original['thing1'].do_more_things()

        restored = FeatureCollection.loads(original.dumps())

        expected_items = (
            ('another', 'more'),
            ('hello', 'people'),
            ('doing', 'something'),
        )
        for key, value in expected_items:
            assert restored['thing1'][key] == value
def test_json_serializer():
    '''Round-trip StringCounter features with JsonSerializer registered.'''
    with registry:
        registry.add('StringCounter', JsonSerializer)

        original = FeatureCollection()
        original['thing2'] = StringCounter({'hello': 'people'})
        original['thing2']['another'] = 5
        original['thing3'] = StringCounter({'hello': 'people2'})

        restored = FeatureCollection.loads(original.dumps())

        expected_items = (
            ('thing2', 'another', 5),
            ('thing2', 'hello', 'people'),
            ('thing3', 'hello', 'people2'),
        )
        for feature, key, value in expected_items:
            assert restored[feature][key] == value
Example #14
0
def perftest_throughput_feature_collection():
    '''Time repeated loads/dumps of a collection and print the rate.

    The collection holds a single ``Thing`` built from a JSON document
    with a 2**20-character string, so each iteration is reported as
    one MB.
    '''
    with registry:
        registry.add('StringCounter', ThingSerializer)

        padding = ' ' * 2**20
        fc = FeatureCollection()
        fc['thing1'] = Thing(json.dumps({'one_mb': padding}))
        fc_str = fc.dumps()

        start_time = time.time()
        num = 1000
        for _ in range(num):
            FeatureCollection.loads(fc_str).dumps()
        elapsed = time.time() - start_time

        rate = float(num) / elapsed
        summary = '%d MB in %.1f sec --> %.1f MB per sec' % (num, elapsed, rate)
        print(summary)
def test_multiset_change(counter_type):
    '''Pop, assign and in-place add a counter feature.'''
    def bow_values(collection):
        return list(collection['bow'].values())

    ent1 = FeatureCollection()
    ent1['bow'] += counter_type(Counter(['big', 'dog']))
    ent1.pop('bow')
    assert dict(ent1.items()) == {}

    # NOTE: popping a key that is absent (``ent1.pop('foo')``) is not
    # exercised; the original author recorded that it fails.

    ## plain assignment replaces the feature
    words = ['big2', 'dog2']
    ent1['bow'] = counter_type(Counter(words))
    assert bow_values(ent1) == [1, 1]

    ## adding the same counter again doubles each count
    ent1['bow'] += counter_type(Counter(words))
    assert bow_values(ent1) == [2, 2]
Example #16
0
def test_read_only():
    '''In-place arithmetic and key removal raise on a read-only collection.'''
    fcwork = FeatureCollection({'feat': {'foo': 1}})

    fc = FeatureCollection()
    fc['feat']['foo'] += 1
    fc.read_only = True

    with pytest.raises(ReadOnlyException):
        fc += fcwork
    with pytest.raises(ReadOnlyException):
        fc -= fcwork
    # Scalar multiplication is the remaining in-place operator; checking
    # it here avoids asserting ``-=`` twice.
    with pytest.raises(ReadOnlyException):
        fc *= 2
    with pytest.raises(ReadOnlyException):
        del fc['feat']
    with pytest.raises(ReadOnlyException):
        fc.pop('feat')
Example #17
0
def cbor_iter(fh):
    '''Yield a FeatureCollection for each CBOR item read from ``fh``.

    Items are read one at a time with ``cbor.load``; iteration ends
    when it raises ``EOFError``.
    '''
    while True:
        try:
            record = cbor.load(fh)
        except EOFError:
            return
        yield FeatureCollection.from_dict(record)
def test_multiset_change(counter_type):
    '''Pop, assign and in-place add a counter feature (absolute counts).'''
    def bow_magnitudes(collection):
        return [abs(count) for count in collection["bow"].values()]

    ent1 = FeatureCollection()
    ent1["bow"] += counter_type(Counter(["big", "dog"]))
    ent1.pop("bow")
    assert dict(ent1.items()) == {}

    # NOTE: popping a key that is absent (``ent1.pop('foo')``) is not
    # exercised; the original author recorded that it fails.

    ## plain assignment replaces the feature
    words = ["big2", "dog2"]
    ent1["bow"] = counter_type(Counter(words))
    assert bow_magnitudes(ent1) == [1, 1]

    ## adding the same counter again doubles each count
    ent1["bow"] += counter_type(Counter(words))
    assert bow_magnitudes(ent1) == [2, 2]
Example #19
0
def cbor_iter(fh):
    '''Generate FeatureCollections from consecutive CBOR items in ``fh``.

    Stops as soon as ``cbor.load`` raises ``EOFError``.
    '''
    while True:
        try:
            item = cbor.load(fh)
        except EOFError:
            break
        else:
            yield FeatureCollection.from_dict(item)
Example #20
0
def test_add_facets():
    '''``add_facets`` maps each ``bowNP_sip`` term to the ids containing it.'''
    def fc_with_terms(terms):
        # A collection holding only a 'bowNP_sip' StringCounter.
        fc = FeatureCollection()
        fc['bowNP_sip'] = StringCounter(terms)
        return fc

    cid1, cid2 = 'cid1', 'cid2'
    fake_results = {'results': [
        (cid1, fc_with_terms([u'elephant', u'car'])),
        (cid2, fc_with_terms([u'car', u'green'])),
    ]}

    new_results = mod_pairwise.add_facets(fake_results)

    assert 'facets' in new_results
    expected_facets = {
        'elephant': [cid1],
        'car': [cid1, cid2],
        'green': [cid2],
    }
    assert new_results['facets'] == expected_facets
def test_no_bytes_allowed():
    '''``u''`` feature values serialize; plain ``str`` values do not.'''
    unicode_fc = FeatureCollection({'foo': u'bar'})
    unicode_fc.dumps()  # OK!

    # A plain str handed to the constructor is rejected.
    with pytest.raises(SerializationError):
        FeatureCollection({'foo': 'bar'})

    # A plain str assigned afterwards is rejected when dumping.
    assigned_fc = FeatureCollection()
    assigned_fc['foo'] = 'bar'
    with pytest.raises(SerializationError):
        assigned_fc.dumps()
def test_entity(counter_type):
    '''Build a collection, double it in place, and round-trip it.'''
    def bow_histogram(collection):
        # Histogram of absolute counts in the "bow" feature.
        return Counter(map(abs, collection["bow"].values()))

    fc1 = FeatureCollection()
    fc1["bow"] += counter_type(Counter(["big", "dog"]))
    fc1["bow"] += counter_type(Counter("tall building"))
    names = ["Super Cat", "Small Cat", "Tiger Fish"]
    fc1["bon"] += counter_type(Counter(names))

    ## ten keys start out with a count of 1
    assert bow_histogram(fc1)[1] == 10, fc1["bow"].items()

    ## adding the collection to itself doubles the counts
    fc1 += fc1
    assert bow_histogram(fc1)[2] == 10, fc1["bow"].items()

    ## round trip from a saved blob
    blob = fc1.dumps()
    assert_same_fc(fc1, FeatureCollection.loads(blob))

    ## round trip again from a fresh dump
    fc2 = FeatureCollection.loads(fc1.dumps())
    assert_same_fc(fc1, fc2)
Example #23
0
def v1_fc_put(request, response, store, kvlclient, tfidf, cid):
    '''Store a single feature collection.

    The route for this endpoint is:
    ``PUT /dossier/v1/feature-collections/<content_id>``.

    ``content_id`` is the id to associate with the given feature
    collection. The feature collection should be in the request
    body serialized as JSON.

    Alternatively, if the request's ``Content-type`` is
    ``text/html``, then a feature collection is generated from the
    HTML. The generated feature collection is then returned as a
    JSON payload.

    Otherwise, this endpoint sets status ``201`` upon successful
    storage. An existing feature collection with id ``content_id``
    is overwritten.
    '''
    # Normalize any falsy ``tfidf`` to None.
    tfidf = tfidf or None

    if request.headers.get('content-type', '').startswith('text/html'):
        # The URL is the percent-encoded part of ``cid`` after the
        # first '|'.
        url = urllib.unquote(cid.split('|', 1)[1])
        fc = etl.create_fc_from_html(url, request.body.read(), tfidf=tfidf)
        logger.info('created FC for %r', cid)
        store.put([(cid, fc)])
        return fc_to_json(fc)

    fc = FeatureCollection.from_dict(json.load(request.body))
    keywords = set()
    for subid in fc:
        if not subid.startswith('subtopic'):
            continue
        if subtopic_type(subid) not in ('text', 'manual'):
            continue
        # get the user selected string
        cleansed = cleanse(typed_subtopic_data(fc, subid))
        # Index both the individual words and the whole phrase.
        # ``set.update`` replaces a side-effect ``map`` call, which is a
        # silent no-op wherever ``map`` is lazy.
        keywords.update(cleansed.split())
        keywords.add(cleansed)

    # Names of the folders/subfolders containing ``cid`` are keywords too.
    folders = Folders(kvlclient)
    for fid, sid in folders.parent_subfolders(cid):
        if not isinstance(fid, unicode):
            fid = fid.decode('utf8')
        if not isinstance(sid, unicode):
            sid = sid.decode('utf8')
        keywords.add(cleanse(fid))
        keywords.add(cleanse(sid))

    fc[u'keywords'] = StringCounter(keywords)
    store.put([(cid, fc)])
    response.status = 201
def test_entity(counter_type):
    '''Populate a collection, double its counts, then serialize and reload.'''
    def bow_histogram(collection):
        # Histogram of the raw counts in the 'bow' feature.
        return Counter(collection['bow'].values())

    def wrapped(items):
        return counter_type(Counter(items))

    fc1 = FeatureCollection()
    fc1['bow'] += wrapped(['big', 'dog'])
    fc1['bow'] += wrapped('tall building')
    fc1['bon'] += wrapped(['Super Cat', 'Small Cat', 'Tiger Fish'])

    ## ten keys start out with a count of 1
    assert bow_histogram(fc1)[1] == 10, fc1['bow'].items()

    ## adding the collection to itself doubles the counts
    fc1 += fc1
    assert bow_histogram(fc1)[2] == 10, fc1['bow'].items()

    ## round trip from a saved blob
    blob = fc1.dumps()
    assert_same_fc(fc1, FeatureCollection.loads(blob))

    ## round trip again from a fresh dump
    fc2 = FeatureCollection.loads(fc1.dumps())
    assert_same_fc(fc1, fc2)
Example #25
0
def v1_fc_put(request, response, store, kvlclient, tfidf, cid):
    '''Store a single feature collection.

    The route for this endpoint is:
    ``PUT /dossier/v1/feature-collections/<content_id>``.

    ``content_id`` is the id to associate with the given feature
    collection. The feature collection should be in the request
    body serialized as JSON.

    Alternatively, if the request's ``Content-type`` is
    ``text/html``, then a feature collection is generated from the
    HTML. The generated feature collection is then returned as a
    JSON payload.

    Otherwise, this endpoint sets status ``201`` upon successful
    storage. An existing feature collection with id ``content_id``
    is overwritten.
    '''
    # Normalize any falsy ``tfidf`` to None.
    tfidf = tfidf or None
    if request.headers.get('content-type', '').startswith('text/html'):
        url = urllib.unquote(cid.split('|', 1)[1])
        fc = etl.create_fc_from_html(url, request.body.read(), tfidf=tfidf)
        logger.info('created FC for %r', cid)
        store.put([(cid, fc)])
        return fc_to_json(fc)
    else:
        fc = FeatureCollection.from_dict(json.load(request.body))
        keywords = set()
        for subid in fc:
            if subid.startswith('subtopic'):
                ty = subtopic_type(subid)
                if ty in ('text', 'manual'):
                    # get the user selected string
                    data = typed_subtopic_data(fc, subid)
                    phrase = cleanse(data)
                    # Add each word and the full phrase.  ``update``
                    # replaces a side-effect ``map`` call, which does
                    # nothing wherever ``map`` is lazy.
                    keywords.update(phrase.split())
                    keywords.add(phrase)

        folders = Folders(kvlclient)
        for fid, sid in folders.parent_subfolders(cid):
            if not isinstance(fid, unicode):
                fid = fid.decode('utf8')
            if not isinstance(sid, unicode):
                sid = sid.decode('utf8')
            keywords.add(cleanse(fid))
            keywords.add(cleanse(sid))

        fc[u'keywords'] = StringCounter(keywords)
        store.put([(cid, fc)])
        response.status = 201
def test_ignored():
    '''A ``str`` value fails to dump unless its key starts with ``_``.'''
    rejected = FeatureCollection()
    rejected['foo'] = 'bar'
    with pytest.raises(SerializationError):
        rejected.dumps()

    skipped = FeatureCollection()
    skipped['_foo'] = 'bar'
    skipped.dumps()  # _foo is ignored!
Example #27
0
def test_vectorizable_features():
    '''Make sure we only do learning on the right features.

    The "right" features means features that can be vectorized
    by sklearn. Translation: they must be instances of
    collections.Mapping.
    '''
    features = {
        u'yes': {'fubar': 1},  # a mapping, expected to be kept
        u'no': u'haha',        # a string, expected to be left out
    }
    fc = FeatureCollection(features)
    assert mod_pairwise.vectorizable_features([fc]) == ['yes']
Example #28
0
def forum_post_features(row):
    '''Build a FeatureCollection from a forum post ``row``.

    Every key of ``row['author']`` becomes a ``post_author_<key>``
    feature, ``row['image_urls']`` (when present) is counted into an
    ``image_url`` StringCounter, and a fixed list of optional post
    fields is copied through ``uni`` into ``post_<field>`` features.
    '''
    fc = FeatureCollection()

    author = row['author']
    for field in author:
        fc['post_author_' + field] = author[field]

    if 'image_urls' in row:
        fc['image_url'] = StringCounter()
        for link in row['image_urls']:
            fc['image_url'][link] += 1

    optional_fields = (
        'parent_id', 'thread_id', 'thread_link', 'thread_name', 'title',
    )
    for field in optional_fields:
        if field not in row:
            continue
        fc['post_' + field] = uni(row[field])
    return fc
Example #29
0
    def add_folder(self, folder_id, ann_id=None):
        '''Create a folder by storing an empty feature collection for it.

        If ``ann_id`` is set, then the folder is owned by the given user.
        Otherwise, the folder is owned and viewable by all anonymous
        users.

        :param str folder_id: Folder id
        :param str ann_id: Username
        '''
        self.assert_valid_folder_id(folder_id)
        owner = self._annotator(ann_id)
        cid = self.wrap_folder_content_id(owner, folder_id)
        # The folder is represented by an empty collection under ``cid``.
        placeholder = FeatureCollection()
        self.store.put([(cid, placeholder)])
        logger.info('Added folder %r with content id %r', folder_id, cid)
Example #30
0
def v1_fc_put(request, response, visid_to_dbid, store, cid):
    '''Store a single feature collection.

    The route for this endpoint is:
    ``PUT /dossier/v1/feature-collections/<content_id>``.

    ``content_id`` is the id to associate with the given feature
    collection. The feature collection should be in the request
    body serialized as JSON.

    This endpoint returns status ``201`` upon successful storage.
    An existing feature collection with id ``content_id`` is
    overwritten.
    '''
    payload = json.load(request.body)
    fc = FeatureCollection.from_dict(payload)
    # The store is keyed by the id that ``visid_to_dbid`` derives from cid.
    db_id = visid_to_dbid(cid)
    store.put([(db_id, fc)])
    response.status = 201
Example #31
0
def v1_fc_put(request, response, visid_to_dbid, store, cid):
    """Store a single feature collection.

    The route for this endpoint is:
    ``PUT /dossier/v1/feature-collections/<content_id>``.

    ``content_id`` is the id to associate with the given feature
    collection. The feature collection should be in the request
    body serialized as JSON.

    This endpoint returns status ``201`` upon successful storage.
    An existing feature collection with id ``content_id`` is
    overwritten.
    """
    # Decode the JSON body into a collection, then store it under the
    # id that ``visid_to_dbid`` returns for ``cid``.
    collection = FeatureCollection.from_dict(json.load(request.body))
    entry = (visid_to_dbid(cid), collection)
    store.put([entry])
    response.status = 201
def test_non_counter_features_serialize():
    '''A unicode (non-counter) feature value can be dumped and loaded.'''
    original = FeatureCollection({'NAME': u'foobaz'})
    # Only checks that the round trip does not raise; the result is unused.
    FeatureCollection.loads(original.dumps())
Example #33
0
def counter_fc(bow):
    '''Wrap ``bow`` in a FeatureCollection under the ``feature`` key.'''
    features = {u'feature': bow}
    return FeatureCollection(features)
Example #34
0
def fc_loads(raw):
    '''Load a FeatureCollection from snappy-compressed serialized data.'''
    serialized = snappy.decompress(raw)
    return FeatureCollection.loads(serialized)
Example #35
0
def fc_loads(raw):
    '''Decompress ``raw`` with snappy and deserialize the result.'''
    decompressed = snappy.decompress(raw)
    fc = FeatureCollection.loads(decompressed)
    return fc
Example #36
0
def test_read_only_not_preserved_via_dict():
    '''Rebuilding from ``to_dict()`` yields a writable collection.'''
    frozen = FeatureCollection({'NAME': {'foo': 1, 'baz': 2}})
    frozen.read_only = True

    as_dict = frozen.to_dict()
    rebuilt = FeatureCollection(as_dict)

    assert not rebuilt.read_only
    # Must not raise: the rebuilt collection accepts writes.
    rebuilt['NAME']['foo'] += 1
Example #37
0
def test_identity():
    '''Two missing keys on a read-only collection give the same object.'''
    fc = FeatureCollection()
    fc.read_only = True
    # Hold both results in locals: comparing ``id()`` of temporaries can
    # match by accident when a freed object's address is reused.
    first = fc['one']
    second = fc['two']
    # The bare comparison was previously discarded; assert it so the
    # test can actually fail.
    assert first is second
Example #38
0
# !!! IMPORTANT !!!
# Define features that you want to index. This will let you quickly scan
# for feature collections in the database with matching values.
#
# You don't have to index everything, but it's probably a good idea to index
# the most prominent features. e.g., phone or email or website.
#
# These should correspond to the names of the corresponding features.
feature_indexes = [u'phone', u'email', u'website', u'rate']

# Create a "store," which knows how to store and index feature collections.
store = Store(conn, feature_indexes=feature_indexes)

# Create a fresh feature collection and add a 'rate' feature.
fc = FeatureCollection()
fc['rate'] = StringCounter({
    u'5per30': 5,
    u'5per60': 1,
    u'10per20': 2,
})

# Content ids are the unique identifier for each feature collection.
# It's probably sufficient to use whatever you have for "ad id."
content_id = 'some_unique_value'
store.put([(content_id, fc)])
# Single-argument ``print(...)`` prints the same thing as the print
# statement and is also valid where ``print`` is a function.
print(store.get(content_id))

# Use the index scan!
print(list(store.index_scan_prefix(u'rate', '10')))
Example #39
0
def example_fc():
    '''Return a collection whose ``meta_clean_visible`` is ``example_text``.'''
    sample = FeatureCollection()
    sample[u'meta_clean_visible'] = example_text
    return sample
Example #40
0
def html_to_fc(html=None,
               clean_html=None,
               clean_visible=None,
               encoding=None,
               url=None,
               timestamp=None,
               other_features=None):
    '''Build a feature collection from a fetched HTML document.

    `html` is expected to be a raw string received over the wire from a
    remote webserver, and `encoding`, if provided, is used to decode
    it.  Typically, encoding comes from the Content-Type header field.
    The :func:`~streamcorpus_pipeline._clean_html.make_clean_html`
    function handles character encodings.

    :param html: raw HTML; only cleaned when `clean_html` is not given
    :param clean_html: already-cleaned HTML, used instead of cleaning
      `html`
    :param clean_visible: visible text; derived with
      ``make_clean_visible`` when missing or empty, and decoded as
      UTF-8 when given as a ``str``
    :param encoding: passed to ``make_clean_html`` and ``uni`` together
      with `html`
    :param url: stored as ``meta_url``; defaults to an empty string
    :param timestamp: stored as ``meta_timestamp``; defaults to the
      current time in milliseconds
    :param other_features: mapping of feature name to a value accepted
      by ``StringCounter``, added to the collection at the end
    :returns: the populated feature collection, or ``None`` when
      ``make_clean_html`` raises on `html`
    '''
    def add_feature(name, xs):
        # ``fc`` is looked up at call time, after it is created below.
        if name not in fc:
            fc[name] = StringCounter()
        fc[name] += StringCounter(xs)

    timestamp = timestamp or int(time.time() * 1000)
    other_features = other_features or {}

    if clean_html is None:
        if html is not None:
            try:
                clean_html_utf8 = make_clean_html(html, encoding=encoding)
            except Exception:
                # Best effort: a document that cannot be cleaned is
                # dropped.  Catching Exception rather than everything
                # lets KeyboardInterrupt/SystemExit propagate.
                logger.warning('dropping doc because:', exc_info=True)
                return None
            clean_html = clean_html_utf8.decode('utf-8')
        else:
            clean_html_utf8 = u''
            clean_html = u''
    else:
        # NOTE(review): with a caller-supplied clean_html and no
        # clean_visible, the visible text below is derived from this
        # empty string rather than from clean_html -- confirm intended.
        clean_html_utf8 = u''

    if not clean_visible:
        clean_visible = make_clean_visible(clean_html_utf8).decode('utf-8')
    elif isinstance(clean_visible, str):
        clean_visible = clean_visible.decode('utf-8')

    fc = FeatureCollection()
    fc[u'meta_raw'] = uni(html, encoding) if html else u''
    fc[u'meta_clean_html'] = clean_html
    fc[u'meta_clean_visible'] = clean_visible
    fc[u'meta_timestamp'] = unicode(timestamp)

    url = url or u''

    fc[u'meta_url'] = uni(url)

    add_feature(u'icq', features.ICQs(clean_visible))
    add_feature(u'skype', features.skypes(clean_visible))
    add_feature(u'phone', features.phones(clean_visible))
    add_feature(u'email', features.emails(clean_visible))
    bowNP, normalizations = features.noun_phrases(cleanse(clean_visible),
                                                  included_unnormalized=True)
    add_feature(u'bowNP', bowNP)
    bowNP_unnorm = chain(*normalizations.values())
    add_feature(u'bowNP_unnorm', bowNP_unnorm)

    add_feature(u'image_url', features.image_urls(clean_html))
    add_feature(u'a_url', features.a_urls(clean_html))

    ## get parsed versions, extract usernames
    fc[u'img_url_path_dirs'] = features.path_dirs(fc[u'image_url'])
    fc[u'img_url_hostnames'] = features.host_names(fc[u'image_url'])
    fc[u'usernames'] = features.usernames(fc[u'image_url'])

    fc[u'a_url_path_dirs'] = features.path_dirs(fc[u'a_url'])
    fc[u'a_url_hostnames'] = features.host_names(fc[u'a_url'])

    fc[u'usernames'] += features.usernames(fc[u'a_url'])

    # beginning of treating this as a pipeline...
    xform = features.entity_names()
    fc = xform.process(fc)

    for feat_name, feat_val in other_features.iteritems():
        fc[feat_name] += StringCounter(feat_val)

    return fc
Example #41
0
def test_random_no_name_index(store):  # noqa
    '''Smoke test: the random search engine runs on a one-item store.'''
    fc = FeatureCollection({u'NAME': {'bar': 1}})
    store.put([('foo', fc)])
    # just make sure it runs
    engine = search_engines.random(store)
    engine.set_query_id('foo').results()
def test_get():
    '''``get`` returns the supplied default for a missing key.'''
    empty = FeatureCollection()
    fallback = 5
    assert empty.get("nada", fallback) == fallback
Example #43
0
def test_fc_get(store):  # noqa
    '''``v1_fc_get`` returns the collection stored under the mapped id.'''
    stored = FeatureCollection({'foo': {'a': 1}})
    store.put([(visid_to_dbid('abc'), stored)])

    fetched = routes.v1_fc_get(dbid_to_visid, store, 'abc')
    assert fetched['foo']['a'] == 1
def test_non_counter_features_total():
    '''A collection holding only a unicode feature has a total of 0.'''
    text_only = FeatureCollection({'NAME': u'foobaz'})
    total = text_only.total()
    assert total == 0
def make_fc(text):
    '''Return a collection whose ``#nilsimsa_all`` counts ``text``'s hash.'''
    digest = nilsimsa_hash(text)
    collection = FeatureCollection()
    collection['#nilsimsa_all'] = StringCounter([digest])
    return collection