Example 1
def __init__(self, *args, **kwargs):
    super(to_dossier_store, self).__init__(*args, **kwargs)
    kvl = kvlayer.client()
    # Pick up optional feature indexes from the global yakonfig config.
    feature_indexes = None
    try:
        conf = yakonfig.get_global_config('dossier.store')
        feature_indexes = conf['feature_indexes']
    except KeyError:
        pass
    self.store = Store(kvl, feature_indexes=feature_indexes)
    # `tfidf_path` defaults to None, so only load a model when one is set.
    tfidf_path = self.config.get('tfidf_path')
    self.tfidf = gensim.models.TfidfModel.load(tfidf_path) if tfidf_path else None
Example 2
def store(self):
    # Lazily construct the Store on first access and cache it.
    if self._store is None:
        feature_indexes = None
        try:
            conf = yakonfig.get_global_config('dossier.store')
            feature_indexes = conf['feature_indexes']
        except KeyError:
            pass
        self._store = Store(kvlayer.client(),
                            feature_indexes=feature_indexes)
    return self._store
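
In the class this accessor comes from, it is presumably exposed as a read-only property so callers can write `obj.store` directly. A minimal sketch of that pattern (the class name and the `_store = None` default are assumptions, and the `Store`/`kvlayer` imports come from the surrounding examples):

class LazyStoreHolder(object):
    # Hypothetical holder class illustrating the lazy-initialization
    # pattern used by the accessor above.
    _store = None

    @property
    def store(self):
        if self._store is None:
            self._store = Store(kvlayer.client())
        return self._store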
Example 3
def test_one_to_many_indexing(kvl):  # noqa
    # This config defines an index named `foo` that automatically indexes
    # values in the `bar` and `baz` features. This means that an index scan
    # on the `foo` index will check values in the `bar` and `baz` features.
    index_config = [{'foo': ['bar', 'baz']}]
    store = Store(kvl, feature_indexes=index_config)

    fcx, fcy, fcz = FC(), FC(), FC()
    fcx['unrelated']['a'] = 1
    fcy['bar']['a'] = 1
    fcy['baz']['a'] = 1
    fcy['baz']['c'] = 1
    fcz['baz']['a'] = 1
    fcz['baz']['b'] = 1

    store.put([('x', fcx), ('y', fcy), ('z', fcz)])

    assert list(store.index_scan('foo', 'a')) == ['y', 'z']
    assert list(store.index_scan('foo', 'b')) == ['z']
    assert list(store.index_scan('foo', 'c')) == ['y']
Example 4
@pytest.fixture
def store(kvl):
    # Yield a Store indexed on the 'feature' feature; wipe it after the test.
    client = Store(kvl, feature_indexes=[u'feature'])
    yield client
    client.delete_all()
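
A short sketch of how a test might consume this fixture. The test body is illustrative, not from the source (pytest injects `store`), and uses only `put`, `get`, and `index_scan` as shown in the other examples:

def test_put_and_scan(store):  # noqa
    # Store one collection and look it up through the 'feature' index.
    fc = FC()
    fc[u'feature'][u'v'] = 1
    store.put([('id-1', fc)])
    assert store.get('id-1') is not None
    assert list(store.index_scan(u'feature', u'v')) == ['id-1']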
Example 5
# There are more backends available like MySQL, PostgreSQL and Accumulo.
#
# See: https://github.com/diffeo/kvlayer

# !!! IMPORTANT !!!
# Define features that you want to index. This will let you quickly scan
# for feature collections in the database with matching values.
#
# You don't have to index everything, but it's probably a good idea to index
# the most prominent features, e.g., phone, email, or website.
#
# These names must match the names of the features they index.
feature_indexes = [u'phone', u'email', u'website', u'rate']

# Create a "store," which knows how to store and index feature collections.
store = Store(conn, feature_indexes=feature_indexes)

# Create a fresh feature collection and add a 'rate' feature.
fc = FeatureCollection()
fc['rate'] = StringCounter({
    u'5per30': 5,
    u'5per60': 1,
    u'10per20': 2,
})

# Content ids are the unique identifier for each feature collection.
# It's probably sufficient to use whatever you have for "ad id."
content_id = 'some_unique_value'
store.put([(content_id, fc)])
print(store.get(content_id))
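
The comments above promise fast scans over indexed features, but the walkthrough never performs one. A minimal sketch, reusing the `store` and the 'rate' feature defined above:

# Find every feature collection whose 'rate' feature contains '5per30'.
for cid in store.index_scan(u'rate', u'5per30'):
    print(store.get(cid))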
Example 6
def fcstore(kvl):
    # A Store with no feature indexes configured.
    return Store(kvl)
Example 7
class to_dossier_store(Configured):
    '''A :mod:`streamcorpus_pipeline` `writer` stage with one optional parameter:

    .. code-block:: yaml

        tfidf_path: path/to/tfidf.data

    '''
    config_name = 'to_dossier_store'
    default_config = {'tfidf_path': None}

    def __init__(self, *args, **kwargs):
        super(to_dossier_store, self).__init__(*args, **kwargs)
        kvl = kvlayer.client()
        feature_indexes = None
        try:
            conf = yakonfig.get_global_config('dossier.store')
            feature_indexes = conf['feature_indexes']
        except KeyError:
            pass
        self.store = Store(kvl, feature_indexes=feature_indexes)
        # `tfidf_path` defaults to None, so only load a model when one is set.
        tfidf_path = self.config.get('tfidf_path')
        self.tfidf = gensim.models.TfidfModel.load(tfidf_path) if tfidf_path else None

    def process(self, t_path, name_info, i_str):
        '''converts each :attr:`streamcorpus.StreamItem.body.clean_html` from
        `t_path` into a :class:`~dossier.fc.FeatureCollection` and saves it in
        a :class:`~dossier.store.Store` configured with the global `kvlayer`
        config.

        '''
        def cids_and_fcs():
            count = 0
            seen = set()
            for si in Chunk(t_path):
                clean_html = getattr(si.body, 'clean_html', '')
                if clean_html is None or len(clean_html.strip()) == 0:
                    logger.warn('dropping SI lacking clean_html: %r',
                                si.abs_url)
                    continue
                if 'other_features' in si.other_content:
                    other_features = json.loads(
                        si.other_content['other_features'].raw)
                else:
                    other_features = None
                fc = html_to_fc(
                    clean_html=si.body.clean_html.decode('utf-8'),
                    clean_visible=si.body.clean_visible.decode('utf-8'),
                    encoding='utf-8',
                    url=si.abs_url,
                    timestamp=si.stream_time.epoch_ticks,
                    other_features=other_features,
                )
                add_sip_to_fc(fc, self.tfidf)
                content_id = mk_content_id(str(fc.get(u'meta_url')))
                if content_id in seen:
                    logger.warn('dropping duplicate content_id=%r', content_id)
                else:
                    seen.add(content_id)
                    yield content_id, fc
                    count += 1
            logger.info('saved %d FCs from %d SIs', count, len(seen))

        self.store.put(cids_and_fcs())
        ## interface spec of streamcorpus_pipeline writers requires
        ## returning a list of locally generated paths.
        return []

    __call__ = process
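
For completeness, a hedged sketch of driving the stage by hand; in practice streamcorpus_pipeline constructs and invokes the writer itself, and the chunk path below is a placeholder:

# Assuming `writer` is an already-configured to_dossier_store instance:
paths = writer('/tmp/chunk.sc', name_info={}, i_str='')
assert paths == []  # writers return locally generated paths; this stage has none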