Example 1
def test_add_facets():
    cid1 = 'cid1'
    fc1 = FeatureCollection()
    fc1['bowNP_sip'] = StringCounter([u'elephant', u'car'])
    cid2 = 'cid2'
    fc2 = FeatureCollection()
    fc2['bowNP_sip'] = StringCounter([u'car', u'green'])
    fake_results = {'results': [(cid1, fc1), (cid2, fc2)]}

    new_results = mod_pairwise.add_facets(fake_results)

    assert 'facets' in new_results

    assert new_results['facets'] == {
        'elephant': [cid1],
        'car': [cid1, cid2],
        'green': [cid2],
    }
Example 2
    def add_folder(self, folder_id, ann_id=None):
        '''Add a folder.

        If ``ann_id`` is set, then the folder is owned by the given user.
        Otherwise, the folder is owned and viewable by all anonymous
        users.

        :param str folder_id: Folder id
        :param str ann_id: Username
        '''
        self.assert_valid_folder_id(folder_id)
        ann_id = self._annotator(ann_id)
        cid = self.wrap_folder_content_id(ann_id, folder_id)
        self.store.put([(cid, FeatureCollection())])
        logger.info('Added folder %r with content id %r', folder_id, cid)
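# A hedged usage sketch of the method above; `folders` stands in for an
# instance of whatever class defines add_folder, which this listing does
# not show.
folders.add_folder(u'my_reports')                    # owned/viewable by all anonymous users
folders.add_folder(u'my_reports', ann_id=u'alice')   # owned by the user 'alice'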
Example 3
def forum_post_features(row):
    fc = FeatureCollection()
    # Copy every author sub-field into its own 'post_author_*' feature.
    for k in row['author']:
        fc['post_author_' + k] = row['author'][k]

    # Count each image URL that appears in the post.
    if 'image_urls' in row:
        fc['image_url'] = StringCounter()
        for image_url in row['image_urls']:
            fc['image_url'][image_url] += 1

    # Carry over simple string metadata when present.
    others = ['parent_id', 'thread_id', 'thread_link', 'thread_name', 'title']
    for k in others:
        if k in row:
            fc['post_' + k] = uni(row[k])
    return fc
Example 4
def test_vectorizable_features():
    '''Make sure we only do learning on the right features.

    The "right" features are the ones that can be vectorized
    by sklearn. Translation: they must be instances of
    collections.Mapping.
    '''
    fc = FeatureCollection({
        u'yes': {
            'fubar': 1
        },
        u'no': u'haha',
    })
    got = mod_pairwise.vectorizable_features([fc])
    assert got == ['yes']
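# For reference, a hedged sketch of the selection rule the docstring above
# describes; the real mod_pairwise.vectorizable_features may differ in
# details such as ordering.
import collections

def mapping_feature_names(fc):
    # A feature is vectorizable only if its value is a collections.Mapping.
    return sorted(name for name, value in fc.items()
                  if isinstance(value, collections.Mapping))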
Example 5
def test_fc_get(store):  # noqa
    store.put([(visid_to_dbid('abc'), FeatureCollection({'foo': {'a': 1}}))])
    fc = routes.v1_fc_get(dbid_to_visid, store, 'abc')
    assert fc['foo']['a'] == 1
Example 6
def test_random_no_name_index(store):  # noqa
    store.put([('foo', FeatureCollection({u'NAME': {'bar': 1}}))])
    # just make sure it runs
    search_engines.random(store).set_query_id('foo').results()
Example 7
def html_to_fc(html=None,
               clean_html=None,
               clean_visible=None,
               encoding=None,
               url=None,
               timestamp=None,
               other_features=None):
    '''`html` is expected to be a raw string received over the wire from a
    remote webserver, and `encoding`, if provided, is used to decode
    it.  Typically, encoding comes from the Content-Type header field.
    The :func:`~streamcorpus_pipeline._clean_html.make_clean_html`
    function handles character encodings.

    '''
    def add_feature(name, xs):
        if name not in fc:
            fc[name] = StringCounter()
        fc[name] += StringCounter(xs)

    timestamp = timestamp or int(time.time() * 1000)
    other_features = other_features or {}

    if clean_html is None:
        if html is not None:
            try:
                clean_html_utf8 = make_clean_html(html, encoding=encoding)
            except Exception:
                logger.warning('dropping doc because:', exc_info=True)
                return
            clean_html = clean_html_utf8.decode('utf-8')
        else:
            clean_html_utf8 = u''
            clean_html = u''
    else:
        clean_html_utf8 = u''

    if clean_visible is None or len(clean_visible) == 0:
        clean_visible = make_clean_visible(clean_html_utf8).decode('utf-8')
    elif isinstance(clean_visible, str):
        clean_visible = clean_visible.decode('utf-8')

    fc = FeatureCollection()
    fc[u'meta_raw'] = uni(html, encoding) if html else u''
    fc[u'meta_clean_html'] = clean_html
    fc[u'meta_clean_visible'] = clean_visible
    fc[u'meta_timestamp'] = unicode(timestamp)

    url = url or u''

    fc[u'meta_url'] = uni(url)

    add_feature(u'icq', features.ICQs(clean_visible))
    add_feature(u'skype', features.skypes(clean_visible))
    add_feature(u'phone', features.phones(clean_visible))
    add_feature(u'email', features.emails(clean_visible))
    bowNP, normalizations = features.noun_phrases(cleanse(clean_visible),
                                                  included_unnormalized=True)
    add_feature(u'bowNP', bowNP)
    bowNP_unnorm = chain(*normalizations.values())
    add_feature(u'bowNP_unnorm', bowNP_unnorm)

    add_feature(u'image_url', features.image_urls(clean_html))
    add_feature(u'a_url', features.a_urls(clean_html))

    ## get parsed versions, extract usernames
    fc[u'img_url_path_dirs'] = features.path_dirs(fc[u'image_url'])
    fc[u'img_url_hostnames'] = features.host_names(fc[u'image_url'])
    fc[u'usernames'] = features.usernames(fc[u'image_url'])

    fc[u'a_url_path_dirs'] = features.path_dirs(fc[u'a_url'])
    fc[u'a_url_hostnames'] = features.host_names(fc[u'a_url'])

    fc[u'usernames'] += features.usernames(fc[u'a_url'])

    #fc[u'usernames'] += features.usernames2(
    #    fc[u'meta_clean_visible'])

    # beginning of treating this as a pipeline...
    xform = features.entity_names()
    fc = xform.process(fc)

    for feat_name, feat_val in other_features.iteritems():
        fc[feat_name] += StringCounter(feat_val)

    return fc
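# A hedged usage sketch of html_to_fc; `raw_bytes` and the URL below are
# placeholders rather than values taken from this example.
fc = html_to_fc(html=raw_bytes, encoding='utf-8',
                url=u'http://example.com/post/1')
if fc is not None:  # html_to_fc returns None when make_clean_html fails
    print fc[u'meta_url']
    print fc[u'phone']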
Example 8
def example_fc():
    fc = FeatureCollection()
    fc[u'meta_clean_visible'] = example_text
    return fc
Example 9
# !!! IMPORTANT !!!
# Define features that you want to index. This will let you quickly scan
# for feature collections in the database with matching values.
#
# You don't have to index everything, but it's probably a good idea to index
# the most prominent features, e.g., phone, email, or website.
#
# Each entry must match the name of a feature in your collections.
feature_indexes = [u'phone', u'email', u'website', u'rate']

# Create a "store," which knows how to store and index feature collections.
store = Store(conn, feature_indexes=feature_indexes)

# Create a fresh feature collection and add a 'rate' feature.
fc = FeatureCollection()
fc['rate'] = StringCounter({
    u'5per30': 5,
    u'5per60': 1,
    u'10per20': 2,
})

# A content id is the unique identifier for a feature collection.
# It's probably sufficient to use whatever you have for "ad id."
content_id = 'some_unique_value'
store.put([(content_id, fc)])
print store.get(content_id)

# Use the index scan!
print list(store.index_scan_prefix(u'rate', '10'))
Example 10
def counter_fc(bow):
    return FeatureCollection({u'feature': bow})
Example 11
def make_fc(text):
    # Store the nilsimsa digest of the text as a single-entry feature.
    nhash = nilsimsa_hash(text)
    fc = FeatureCollection()
    fc['#nilsimsa_all'] = StringCounter([nhash])
    return fc