def __init__(self, *args, **kwargs):
    super(to_dossier_store, self).__init__(*args, **kwargs)
    kvl = kvlayer.client()
    feature_indexes = None
    try:
        conf = yakonfig.get_global_config('dossier.store')
        feature_indexes = conf['feature_indexes']
    except KeyError:
        pass
    self.store = Store(kvl, feature_indexes=feature_indexes)
    tfidf_path = self.config.get('tfidf_path')
    # `tfidf_path` is optional; only load a model when one is configured.
    self.tfidf = gensim.models.TfidfModel.load(tfidf_path) if tfidf_path else None
def store(self):
    if self._store is None:
        feature_indexes = None
        try:
            conf = yakonfig.get_global_config('dossier.store')
            feature_indexes = conf['feature_indexes']
        except KeyError:
            pass
        self._store = Store(kvlayer.client(), feature_indexes=feature_indexes)
    return self._store
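# The accessor above builds the Store lazily on first use and caches it on
# `self._store`. A minimal, self-contained sketch of how such an accessor is
# typically wired up; the class name, the `__init__` body, and the use of
# `@property` are illustrative assumptions, not from the original:
class LazyStoreHolder(object):
    def __init__(self):
        self._store = None  # filled in on first access

    @property
    def store(self):
        if self._store is None:
            feature_indexes = None
            try:
                conf = yakonfig.get_global_config('dossier.store')
                feature_indexes = conf['feature_indexes']
            except KeyError:
                pass
            self._store = Store(kvlayer.client(),
                                feature_indexes=feature_indexes)
        return self._store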
def test_one_to_many_indexing(kvl):  # noqa
    # This config defines an index named `foo` that automatically indexes
    # values in the `bar` and `baz` features. This means that an index scan
    # on the `foo` index will check values in the `bar` and `baz` features.
    index_config = [{'foo': ['bar', 'baz']}]
    store = Store(kvl, feature_indexes=index_config)

    fcx, fcy, fcz = FC(), FC(), FC()
    fcx['unrelated']['a'] = 1
    fcy['bar']['a'] = 1
    fcy['baz']['a'] = 1
    fcz['baz']['a'] = 1
    fcy['baz']['c'] = 1
    fcz['baz']['b'] = 1
    store.put([('x', fcx), ('y', fcy), ('z', fcz)])

    assert list(store.index_scan('foo', 'a')) == ['y', 'z']
    assert list(store.index_scan('foo', 'b')) == ['z']
    assert list(store.index_scan('foo', 'c')) == ['y']
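# For comparison, listing a plain feature name in `feature_indexes` (as the
# fixtures below do) creates a one-to-one index keyed on that single feature.
# A minimal sketch under that assumption, reusing the same kind of feature
# collections as the test above; the function name is illustrative:
def example_one_to_one_indexing(kvl):
    store = Store(kvl, feature_indexes=[u'baz'])
    fcy, fcz = FC(), FC()
    fcy['baz']['a'] = 1
    fcz['baz']['b'] = 1
    store.put([('y', fcy), ('z', fcz)])
    assert list(store.index_scan('baz', 'a')) == ['y']
    assert list(store.index_scan('baz', 'b')) == ['z']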
def store(kvl):
    client = Store(kvl, feature_indexes=[u'feature'])
    yield client
    client.delete_all()
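# The generator above reads like a pytest fixture: it yields a Store indexed
# on the u'feature' feature and wipes the backend after the test finishes.
# A hypothetical test consuming it might look like this (the fixture
# decoration and the test body are assumptions, not from the original):
def test_feature_index(store):
    fc = FC()
    fc[u'feature'][u'some value'] = 1
    store.put([('cid-1', fc)])
    assert list(store.index_scan(u'feature', u'some value')) == ['cid-1']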
from dossier.fc import FeatureCollection, StringCounter
from dossier.store import Store

# There are more backends available, such as MySQL, PostgreSQL and Accumulo.
#
# See: https://github.com/diffeo/kvlayer

# !!! IMPORTANT !!!
# Define the features that you want to index. This lets you quickly scan
# for feature collections in the database with matching values.
#
# You don't have to index everything, but it's probably a good idea to index
# the most prominent features, e.g., phone, email or website.
#
# The entries should correspond to the names of the features you want indexed.
feature_indexes = [u'phone', u'email', u'website', u'rate']

# Create a "store," which knows how to store and index feature collections.
# `conn` is the kvlayer client connection created above.
store = Store(conn, feature_indexes=feature_indexes)

# Create a fresh feature collection and add a 'rate' feature.
fc = FeatureCollection()
fc['rate'] = StringCounter({
    u'5per30': 5,
    u'5per60': 1,
    u'10per20': 2,
})

# Content ids are the unique identifier for each feature collection.
# It's probably sufficient to use whatever you have for "ad id."
content_id = 'some_unique_value'
store.put([(content_id, fc)])
print store.get(content_id)
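# Because u'rate' is listed in `feature_indexes` above, the stored collection
# can be found again by scanning that index for one of its values. A minimal
# sketch, assuming the `store` and `content_id` from the snippet above:
for cid in store.index_scan(u'rate', u'5per30'):
    print cid  # prints 'some_unique_value'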
def fcstore(kvl):
    return Store(kvl)
class to_dossier_store(Configured):
    '''A :mod:`streamcorpus_pipeline` `writer` stage with one optional
    parameter:

    .. code-block:: yaml

        tfidf_path: path/to/tfidf.data
    '''
    config_name = 'to_dossier_store'
    default_config = {'tfidf_path': None}

    def __init__(self, *args, **kwargs):
        super(to_dossier_store, self).__init__(*args, **kwargs)
        kvl = kvlayer.client()
        feature_indexes = None
        try:
            conf = yakonfig.get_global_config('dossier.store')
            feature_indexes = conf['feature_indexes']
        except KeyError:
            pass
        self.store = Store(kvl, feature_indexes=feature_indexes)
        tfidf_path = self.config.get('tfidf_path')
        # `tfidf_path` is optional; only load a model when one is configured.
        self.tfidf = gensim.models.TfidfModel.load(tfidf_path) if tfidf_path else None

    def process(self, t_path, name_info, i_str):
        '''converts each :attr:`streamcorpus.StreamItem.body.clean_html` from
        `t_path` into a :class:`~dossier.fc.FeatureCollection` and saves it in
        a :class:`~dossier.store.Store` configured with the global `kvlayer`
        config.
        '''
        def cids_and_fcs():
            count = 0
            seen = set()
            for si in Chunk(t_path):
                clean_html = getattr(si.body, 'clean_html', '')
                if clean_html is None or len(clean_html.strip()) == 0:
                    logger.warn('dropping SI lacking clean_html: %r',
                                si.abs_url)
                    continue
                if 'other_features' in si.other_content:
                    other_features = json.loads(
                        si.other_content['other_features'].raw)
                else:
                    other_features = None
                fc = html_to_fc(
                    clean_html=si.body.clean_html.decode('utf-8'),
                    clean_visible=si.body.clean_visible.decode('utf-8'),
                    encoding='utf-8',
                    url=si.abs_url,
                    timestamp=si.stream_time.epoch_ticks,
                    other_features=other_features,
                )
                add_sip_to_fc(fc, self.tfidf)
                content_id = mk_content_id(str(fc.get(u'meta_url')))
                if content_id in seen:
                    logger.warn('dropping duplicate content_id=%r',
                                content_id)
                else:
                    seen.add(content_id)
                    yield content_id, fc
                    count += 1
            logger.info('saved %d FCs from %d SIs', count, len(seen))

        self.store.put(cids_and_fcs())
        ## interface spec of streamcorpus_pipeline writers requires
        ## returning a list of locally generated paths.
        return []

    __call__ = process