def v1_fc_put(request, response, store, kvlclient, tfidf, cid):
    '''Store a single feature collection.

    The route for this endpoint is:
    ``PUT /dossier/v1/feature-collections/<content_id>``.

    ``content_id`` is the id to associate with the given feature
    collection. The feature collection should be in the request body
    serialized as JSON.

    Alternatively, if the request's ``Content-type`` is ``text/html``,
    then a feature collection is generated from the HTML. The generated
    feature collection is then returned as a JSON payload.

    Otherwise, this endpoint returns status ``201`` upon successful
    storage. An existing feature collection with id ``content_id`` is
    overwritten.
    '''
    tfidf = tfidf or None
    if request.headers.get('content-type', '').startswith('text/html'):
        url = urllib.unquote(cid.split('|', 1)[1])
        fc = etl.create_fc_from_html(url, request.body.read(), tfidf=tfidf)
        logger.info('created FC for %r', cid)
        store.put([(cid, fc)])
        return fc_to_json(fc)
    else:
        fc = FeatureCollection.from_dict(json.load(request.body))
        keywords = set()
        for subid in fc:
            if subid.startswith('subtopic'):
                ty = subtopic_type(subid)
                if ty in ('text', 'manual'):
                    # get the user-selected string
                    data = typed_subtopic_data(fc, subid)
                    map(keywords.add, cleanse(data).split())
                    keywords.add(cleanse(data))

        folders = Folders(kvlclient)
        for fid, sid in folders.parent_subfolders(cid):
            if not isinstance(fid, unicode):
                fid = fid.decode('utf8')
            if not isinstance(sid, unicode):
                sid = sid.decode('utf8')
            keywords.add(cleanse(fid))
            keywords.add(cleanse(sid))

        fc[u'keywords'] = StringCounter(keywords)
        store.put([(cid, fc)])
        response.status = 201
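# A minimal client-side sketch of exercising the endpoint above, assuming the
# dossier web service is listening on localhost:8080; the host, port, content
# id, and payload are illustrative, not part of the source.
import json
import requests

fc_payload = {u'NAME': {u'Example Name': 1}}  # hypothetical serialized FC
resp = requests.put(
    'http://localhost:8080/dossier/v1/feature-collections/example-cid',
    data=json.dumps(fc_payload),
    headers={'Content-Type': 'application/json'})
assert resp.status_code == 201  # feature collection stored (or overwritten)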
def main():
    p = argparse.ArgumentParser(
        description='SortingDesk report generation tool')
    p.add_argument('-c', '--config', required=True,
                   help='dossier stack YAML config file')
    p.add_argument('-o', '--output', required=True,
                   help='path to write Excel workbook file')
    p.add_argument('-u', '--user', default='unknown',
                   help='user name (default: unknown)')
    p.add_argument('folder', help='folder name')
    p.add_argument('subfolder', nargs='?', default=None,
                   help='subfolder name (default=ALL)')
    args = p.parse_args()

    config = yakonfig.set_default_config([kvlayer], filename=args.config)
    factory = Factory(config)
    store = factory.create(Store)

    # Instantiate and run the report generator.
    folders = Folders(kvlayer.client())
    gen = ReportGenerator(store, folders, args.folder,
                          subfolder_name=args.subfolder,
                          user=args.user)
    with open(args.output, 'wb+') as out:
        gen.run(out)
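# A hedged usage sketch: driving the report tool above programmatically by
# setting sys.argv before calling main(). The script name, config path,
# output path, user, and folder names are illustrative.
import sys

sys.argv = ['sortingdesk-report',
            '-c', 'config.yaml',      # dossier stack YAML config file
            '-o', 'report.xlsx',      # Excel workbook to write
            '-u', 'alice',            # user name
            'My Folder',              # folder name
            'My Subfolder']           # optional subfolder name
main()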
def _generate_for_subfolder(self, sid):
    '''Generate the report for a subfolder.

    :param sid: The subfolder id; assumed valid
    '''
    # TODO: the following assumes subfolder names can be constructed from a
    # subfolder id, which might not be the case in the future.
    name = self._sanitise_sheetname(uni(Folders.id_to_name(sid)))
    ws = self.workbook.add_worksheet(name)
    fmt = self.formats

    ws.write("A1", "Dossier report", fmt['title'])
    ws.write("A2", "%s | %s" % (uni(self.folder_name), name))

    # Column dimensions
    ws.set_column('A:A', 37)
    ws.set_column('B:B', 37)
    ws.set_column('C:C', 37)
    ws.set_column('D:D', 8)
    ws.set_column('E:E', 30)
    ws.set_column('F:F', 37)

    # Header
    ws.write("A4", "Id", fmt['header'])
    ws.write("B4", "URL", fmt['header'])
    ws.write("C4", "Subtopic Id", fmt['header'])
    ws.write("D4", "Type", fmt['header'])
    ws.write("E4", "Content", fmt['header'])
    ws.write("F4", "Image URL", fmt['header'])

    # TODO: we probably want to wrap the following in a try/except block,
    # in case the call to subtopics fails.
    row = 4
    for i in subtopics(self.store, self.folders, self.folder_id, sid,
                       self.user):
        Item.construct(self, i).generate_to(ws, row)
        row += 1
class same_subfolder(web.Filter):
    def __init__(self, kvlclient, label_store):
        super(same_subfolder, self).__init__()
        self.kvl = kvlclient
        self.label_store = label_store
        self.folders = Folders(self.kvl)

    def create_predicate(self):
        subfolders = self.folders.parent_subfolders(self.query_content_id)
        cids = set()
        for folder_id, subfolder_id in subfolders:
            for cid, subid in self.folders.items(folder_id, subfolder_id):
                cids.add(cid)
                # Add directly connected labels too.
                for lab in self.label_store.directly_connected((cid, subid)):
                    cids.add(lab.other(cid))
        return lambda (content_id, fc): content_id not in cids
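# A minimal sketch of applying this filter by hand, assuming `kvlclient` and
# `label_store` are already configured, and assigning `query_content_id`
# directly for illustration (dossier.web normally supplies it to filters).
flt = same_subfolder(kvlclient, label_store)
flt.query_content_id = 'example-query-cid'       # illustrative content id
pred = flt.create_predicate()

# Keep only candidates that are not already in the query's subfolders.
candidates = [('cid-1', fc1), ('cid-2', fc2)]    # hypothetical (cid, fc) pairs
fresh = [pair for pair in candidates if pred(pair)]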
def _generate_report_single(self, sid):
    '''Generate a report for the subfolder given by ``sid``.

    The main purpose of this method is to make sure the subfolder
    given by ``sid`` does indeed exist. All real work is delegated
    to ``_generate_for_subfolder``.

    :param sid: The subfolder id

    Private method.
    '''
    assert self.workbook is not None
    assert sid is not None

    # Ensure the subfolder exists.
    if sid not in self.folders.subfolders(self.folder_id, self.user):
        subfolder = Folders.id_to_name(sid)
        print("E: subfolder not found: %s" % subfolder, file=sys.stderr)
        return

    self._generate_for_subfolder(sid)
def subfolder_id(self):
    return Folders.name_to_id(self.subfolder_name)
def folder_id(self):
    return Folders.name_to_id(self.folder_name)
def traverse_extract_fetch(config, wukey, stop_after_extraction=False):
    '''Given a config and a ``wukey = cbor.dumps((folder_name, subfolder_name))``,
    traverse the folders to generate queries, issue them to Google, fetch
    the results, and ingest them.
    '''
    config.kvlclient.setup_namespace({'openquery': (str,)})
    try:
        data = list(config.kvlclient.get('openquery', (wukey,)))
        if data:
            if data[0][1]:
                logger.info('found existing query results: %r', data)
                return
            else:
                config.kvlclient.delete('openquery', (wukey,))
    except Exception:
        logger.error('failed to get data from existing table', exc_info=True)

    fid, sid = cbor.loads(wukey)
    tfidf = config.tfidf
    folders = Folders(config.kvlclient)
    fetcher = Fetcher()

    ## To disable the keyword extractor model, uncomment the next three
    ## lines (`get_subfolder_queries`) and comment out the following two
    ## lines (`extract_keyword_queries`).
    #keyword_feature_keys = []
    #queries = get_subfolder_queries(
    #    config.store, config.label_store, folders, fid, sid)

    queries, keyword_feature_keys, has_observations = extract_keyword_queries(
        config.store, config.label_store, folders, fid, sid)

    logger.info('Model found %d queries: %r', len(queries), queries)

    if stop_after_extraction:
        return

    keywords = set()
    for key in keyword_feature_keys:
        ckey = cleanse(key.decode('utf8'))
        keywords.add(ckey)
        for part in ckey.split():
            keywords.add(part)

    #link2queries = defaultdict(set)
    links = set()
    logger.info('searching google for: %r', queries)
    for q in queries:
        for result in config.google.web_search_with_paging(q, limit=10):
            links.add(result['link'])
            #map(link2queries[result['link']].add,
            #    cleanse(q.decode('utf8')).split())
            logger.info('discovered %r', result['link'])
    result = None

    #logger.info('got %d URLs from %d queries', len(link2queries), len(queries))
    logger.info('got %d URLs from %d queries', len(links), len(queries))

    # content_ids gets modified within the `callback` closure.
    content_ids = []

    #for link, queries in link2queries.items():
    def callback(si, link):
        if si is None:
            return
        cid_url = hashlib.md5(str(link)).hexdigest()
        cid = etl.interface.mk_content_id(cid_url)
        content_ids.append(cid)

        # hack alert!
        # We currently use FCs to store subtopic text data, which
        # means we cannot overwrite existing FCs with reckless
        # abandon. So we adopt a heuristic: check if an FC already
        # exists, and if it does, check if it is being used to store
        # user data. If so, don't overwrite it and move on.
        fc = config.store.get(cid)
        if fc is not None and any(k.startswith('subtopic|')
                                  for k in fc.iterkeys()):
            logger.info('skipping ingest for %r (abs url: %r) because '
                        'an FC with user data already exists.', cid, link)
            return

        other_features = {
            u'keywords': StringCounter(keywords),  # list(queries)),
        }
        try:
            fc = etl.create_fc_from_html(
                link, si.body.raw,
                encoding=si.body.encoding or 'utf-8', tfidf=tfidf,
                other_features=other_features,
            )
            if not fc:
                logger.info('failed to get an FC, moving on')
                return
            logger.info('created FC for %r (abs url: %r)', cid, link)
            config.store.put([(cid, fc)])
        except Exception:
            logger.info('trapped ingest failure on %r (abs url: %r)',
                        cid, link, exc_info=True)

    logger.info('FETCHING using ASYNC')
    fetcher.get_async(islice(links, None), callback)

    data = json.dumps({'content_ids': content_ids})
    logger.info('saving %d content_ids in %d bytes on wukey %r',
                len(content_ids), len(data), wukey)
    config.kvlclient.put('openquery', ((wukey,), data))
    logger.info('done saving for %r', wukey)
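# A hedged sketch of building a work-unit key and driving the traversal,
# following the wukey convention in the docstring above. `config` stands for
# the configured object with kvlclient/store/tfidf/google attributes that the
# function expects; the folder names are illustrative.
import cbor

wukey = cbor.dumps(('My_Folder', 'My_Subfolder'))
# Extract queries only, without searching or ingesting:
traverse_extract_fetch(config, wukey, stop_after_extraction=True)
# Full pipeline: extract queries, search Google, fetch, and ingest results:
traverse_extract_fetch(config, wukey)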
def worker(work_unit, max_sample=1000):
    '''Expects a coordinate WorkUnit for DragNet and runs the following steps:

    1. scans all dossiers at the *folder* level and assembles a feature
       vector for each folder -- see `make_feature`

    2. trains a multinomial naive Bayes classifier that treats each
       *folder* as a classifier target

    3. samples the corpus by scanning up to `max_sample` items and applying
       the classifier to each item to get an approximate "size" of the folder

    4. bootstraps by treating those classifier predictions as truth data
       and extracts the learned features that are predictive as new query
       strings

    5. puts the data in kvlayer for the web service endpoint to return to
       the polling client -- see dossier.models.routes
    '''
    if 'config' not in work_unit.spec:
        raise coordinate.exceptions.ProgrammerError(
            'could not run dragnet without global config')

    web_conf = Config()
    unitconf = work_unit.spec['config']
    with yakonfig.defaulted_config([coordinate, kvlayer, dblogger, web_conf],
                                   config=unitconf):
        labels = []
        D = list()
        label2fid = dict()
        rejects = set()
        keepers = set()

        # 1. Make a classifier target for each *folder*, ignoring
        # subfolder structure.
        FT = Folders(web_conf.kvlclient)
        for idx, fid in enumerate(FT.folders()):
            label2fid[idx] = fid
            for sid in FT.subfolders(fid):
                for cid, subtopic_id in FT.items(fid, sid):
                    fc = web_conf.store.get(cid)
                    if not fc:
                        # Skip items with no feature collection; appending a
                        # stale feature vector here would corrupt the
                        # training data.
                        continue
                    # NB: first call to make_feature
                    feat, _rejects, _keepers = make_feature(fc)
                    D.append(feat)
                    labels.append(idx)
                    rejects.update(_rejects)
                    keepers.update(_keepers)
                    logger.info('fid=%r, observation: %r', fid, cid)

        # 2. Convert the StringCounters into an sklearn format and
        # train MultinomialNB.
        logger.info('transforming...')
        v = DictVectorizer(sparse=False)
        X = v.fit_transform(D)
        logger.info('transform fit done.')
        labels = np.array(labels)

        # Fit the sklearn multinomial naive Bayes classifier.
        clf = MultinomialNB()
        clf.fit(X, labels)
        logger.info('fit MultinomialNB')

        # 3. Scan the corpus up to max_sample, assigning each item to a
        # target to get an approximate "size" of the folder.
        counts = Counter()
        for cid, fc in islice(web_conf.store.scan(), max_sample):
            # Build the same feature vector as the training process.
            feat, _rejects, _keepers = make_feature(fc)
            X = v.transform([feat])
            # Predict which folder it belongs in.
            target = clf.predict(X[0])[0]
            # Count the effective size of that folder in this sample.
            counts[label2fid[target]] += 1
        logger.info('counts done')

        ## 4. Bootstrap by treating those classifier predictions as
        ## truth data and extract the learned features that are
        ## predictive as new query strings.
        clusters = []
        for idx in sorted(set(labels)):
            logger.debug('considering cluster: %d', idx)
            try:
                all_features = \
                    v.inverse_transform(clf.feature_log_prob_[idx])[0]
            except Exception:
                logger.warn('beyond edge on cluster %d', idx)
                continue
            words = Counter(all_features)
            ordered = sorted(words.items(), key=operator.itemgetter(1),
                             reverse=True)
            filtered = []
            for it in ordered:
                if is_bad_token(it[0]):
                    continue
                if is_username(it[0]):
                    logger.debug('%r is_username', it[0])
                #else:
                #    continue
                filtered.append(it)
                if len(filtered) > 100:
                    # hard cutoff
                    break

            # Normalize cluster size exponentially.
            biggest = exp(filtered[0][1])
            # Rescale all by biggest.
            filtered = [(key, int(round(counts[label2fid[idx]]
                                        * exp(w) / biggest)))
                        for key, w in filtered]

            # Describe what we just figured out.
            logger.info('%s --> %r', label2fid[idx],
                        ['%s: %d' % it for it in filtered[:10]])

            # Build the JSON-serializable format for the DragNet UI
            # embedded inside SortingDesk.
            cluster = []
            cluster.append({'caption': label2fid[idx],
                            'weight': counts[label2fid[idx]],
                            'folder_id': None,
                            })
            cluster += [{'caption': caption,
                         'weight': weight,
                         'folder_id': label2fid[idx]}
                        for caption, weight in filtered
                        if weight > 0]
            clusters.append(cluster)

        # 5. Put the data in kvlayer for the web service endpoint to
        # return to the polling client.
        web_conf.kvlclient.setup_namespace({'dragnet': (str,)})
        web_conf.kvlclient.put(
            'dragnet',
            (('dragnet',), json.dumps({'clusters': clusters})))
        return dict(counts)
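# A minimal sketch of how a polling client might read back what the worker
# stored, using the same 'dragnet' table and key written above. `kvlclient`
# is assumed to be a configured kvlayer client.
import json

kvlclient.setup_namespace({'dragnet': (str,)})
for key, value in kvlclient.get('dragnet', ('dragnet',)):
    if value:
        data = json.loads(value)
        for cluster in data['clusters']:
            # The first entry carries the folder caption and its estimated
            # weight; the rest are predictive keywords for that folder.
            print(cluster[0]['caption'], cluster[0]['weight'])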
class PairwiseFeatureLearner(object):
    '''A pairwise active learning model.

    This active learning model applies
    :class:`~sklearn.linear_model.LogisticRegression` on-the-fly as a
    user (or simulated user) interacts with content via the web services
    provided by :mod:`dossier.web`.

    This reads :class:`~dossier.label.Label` objects from
    :class:`~dossier.label.LabelStore` and provides predictions of
    pairwise equivalence, which can be used for coreference resolution,
    clustering, and ranking.

    .. automethod:: dossier.models.PairwiseFeatureLearner.__init__
    .. automethod:: dossier.models.PairwiseFeatureLearner.probabilities
    '''
    def __init__(self, store, label_store, content_id, subtopic_id=None,
                 canopy_limit=None, label_limit=None):
        '''Build a new model.

        :param store: A store of feature collections.
        :type store: :class:`dossier.store.Store`
        :param label_store: A store of labels (ground truth data).
        :type label_store: :class:`dossier.label.LabelStore`
        :param str content_id: The query content id (which should
            correspond to a feature collection in the ``store``). If it
            doesn't, no results are returned.
        :param int canopy_limit: A limit on the number of results to
            return in the canopy (the initial index scan). This is meant
            to be a mechanism for resource control.
        :param int label_limit: A limit on the number of labels to use
            in training. This is meant to be a mechanism for resource
            control.
        '''
        self.store = store
        self.label_store = label_store
        self.folders = Folders(store.kvl)
        self.query_content_id = content_id
        self.query_subtopic_id = subtopic_id
        self.query_fc = None
        self.canopy_limit = canopy_limit
        self.label_limit = label_limit

    def as_result(self, cid, fc, p):
        fnames = sorted(set(self.query_fc.keys()).intersection(fc.keys()))
        intermediates = dict([(n, {
            'kernel': 'cosine',
            'feature1': n,
            'feature2': n,
            'kernel_value': None,
            'weight': None,
            'common_feature_values': [],
        }) for n in fnames])
        for n in fnames:
            intermediates[n]['weight'] = self.feature_weights.get(n)
        for n, qfeat, cfeat in ((n, self.query_fc[n], fc[n]) for n in fnames):
            if not isinstance(qfeat, StringCounter) \
                    or not isinstance(cfeat, StringCounter):
                continue
            vals = set(qfeat.keys()).intersection(cfeat.keys())
            intermediates[n]['common_feature_values'] = \
                sorted(filter(None, vals))

            all_vals = sorted(set(qfeat.keys()).union(cfeat.keys()))
            if len(all_vals) > 0:
                qcounts = [qfeat.get(v, 0) for v in all_vals]
                ccounts = [cfeat.get(v, 0) for v in all_vals]
                sim = cosine(qcounts, ccounts)
                if not math.isnan(sim):
                    intermediates[n]['kernel_value'] = sim
        return (cid, fc, {
            'probability': p,
            'intermediate_model_results': intermediates.values(),
        })

    def probabilities(self):
        '''Trains a model and predicts recommendations.

        If the query feature collection could not be found or if there is
        insufficient training data, an empty list is returned.

        Otherwise, a list of content objects (tuples of content id and
        feature collection) and probabilities is returned. The probability
        is generated from the model, and reflects confidence of the model
        that the corresponding content object is related to the query
        based on the ground truth data.

        On a large database, random samples are used for training, so this
        function is not deterministic.

        :rtype: ``list`` of
            ((``content_id``, :class:`dossier.fc.FeatureCollection`),
            probability)
        '''
        self.query_fc = self.store.get(self.query_content_id)
        if self.query_fc is None:
            logger.warning('Could not find FC for %s', self.query_content_id)
            return []

        # Try the canopy query before training, because if the canopy query
        # gives us nothing, then there's no point in the additional work.
        #
        # Possible optimization: If the canopy query yields fewer than N
        # results, then can we just return all of them? ---AG
        #
        # N.B. Doing the canopy query first will cause things to be slower
        # when there is insufficient training data.
        candidates = self.canopy(limit=self.canopy_limit)
        if len(candidates) == 0:
            logger.info(
                'Could not find any candidates in a canopy query by '
                'scanning the following indexes: %s',
                ', '.join(self.store.index_names()))
            return []

        # Get labels from the database and translate them to the form
        # `[{-1, 1}, i, j]` where `i, j` are indices into the list
        # `content_objs`, which has type `[(content_id, FeatureCollection)]`.
        logger.info('Fetching labels...')
        labels = list(self.labels_from_query(limit=self.label_limit))
        logger.info('Fetching FCs from labels...')
        content_objs = self.content_objs_from_labels(labels)
        indexed_labels = labels_to_indexed_coref_values(content_objs, labels)

        logger.info('Training...')
        model = self.train(content_objs, indexed_labels)
        if model is None:
            logger.info(
                'Could not train model: insufficient training data. '
                '(query content id: %s)', self.query_content_id)
            raise InsufficientTrainingData
        feature_names, classifier, transformer = model

        return zip(candidates, self.classify(
            feature_names, classifier, transformer, candidates))

    def train(self, content_objs, idx_labels):
        '''Trains and returns a model using sklearn.

        If there are new labels to add, they can be added before calling
        this method; it returns an sklearn model which can be used for
        prediction and getting features.

        This method may return ``None`` if there is insufficient training
        data to produce a model.

        :param idx_labels: Ground truth data.
        :type idx_labels: list of ``({-1, 1}, index1, index2)``.
        '''
        # We have insufficient training data when there is only one or
        # fewer classes of labels.
        if len(set([lab[0] for lab in idx_labels])) <= 1:
            return None

        fcs = [fc for _, fc in content_objs]
        feature_names = vectorizable_features(fcs)
        dis = dissimilarities(feature_names, fcs)

        phi_dicts, labels = [], []  # lists are in correspondence
        for coref_value, i, j in idx_labels:
            # i, j are indices into the list `fcs`
            labels.append(coref_value)  # either -1 or 1
            phi_dict = dict([(name, dis[name][i, j])
                             for name in feature_names])
            phi_dicts.append(phi_dict)

        vec = dict_vector()
        training_data = vec.fit_transform(phi_dicts)

        model = LogisticRegression(class_weight='auto', penalty='l1')
        model.fit(training_data, labels)
        self.feature_weights = dict([(name, model.coef_[0][i])
                                     for i, name in enumerate(feature_names)])
        return feature_names, model, vec

    def classify(self, feature_names, classifier, transformer, candidates,
                 query_fc=None):
        '''Returns ``[probability]`` in correspondence with ``candidates``.

        Each ``probability`` corresponds to the probability that the
        corresponding candidate is classified with a positive label given
        the training data.

        The list returned is in correspondence with the list of candidates
        given.

        N.B. The contract of this method should be simplified by bundling
        ``feature_names``, ``classifier`` and ``transformer`` into one
        thing known as "the model." ---AG
        '''
        if query_fc is None:
            query_fc = self.query_fc
        dis = {}
        for name in feature_names:
            vec = dict_vector()
            query = vec.fit_transform([get_feat(query_fc, name)])
            cans = vec.transform(get_feat(fc, name) for _, fc in candidates)
            dis[name] = 1 - pairwise_distances(
                cans, query, metric='cosine', n_jobs=1)[:, 0]

        # in correspondence with `candidates`
        phi_dicts = transformer.transform([
            dict([(name, dis[name][i]) for name in feature_names])
            for i in xrange(len(candidates))
        ])
        return classifier.predict_proba(phi_dicts)[:, 1]

    def canopy(self, limit=None):
        ids = web.streaming_sample(
            self.canopy_ids(limit_hint=hard_limit(limit)),
            limit, hard_limit(limit))
        # I don't think it ever makes sense to include the query
        # as part of the candidate set.
        return filter(lambda (_, fc): fc is not None,
                      self.store.get_many(ids))

    def canopy_ids(self, limit_hint=None):
        limit_hint = limit_hint or 1000
        # TODO: It seems like this should pre-emptively discard content
        # ids that have already participated in a *direct* label with
        # the query. But I think this is a premature optimization since
        # the filtering functions will take care of it. (This optimization
        # would mean fewer kernel computations.)
        blacklist = set([self.query_content_id])
        cids = set()

        # OK, so it turns out that a naive index scan is pretty inflexible
        # and arbitrary. The issue is that in a big enough data set, the
        # first index scan will probably exhaust all of our result set,
        # which means result sets will never see any variety.
        #
        # Instead, we'll try to sample from each index in small batch sizes.
        # This is a heuristic; not a principled approach. ---AG
        index_names = self.store.index_names()
        batch_size = limit_hint / 10
        progress = {}  # idx, name |--> last end
        # When `progress` is empty, the following loop will terminate.
        # An index is removed from `progress` when it no longer produces
        # results.
        for idx_name in index_names:
            feat = self.query_fc.get(idx_name)
            if isinstance(feat, StringCounter):
                for name in feat:
                    if len(name) > 0:
                        progress[(idx_name, name)] = 0
        logger.info('starting index scan (query content id: %s)',
                    self.query_content_id)
        while len(progress) > 0:
            for idx_name in index_names:
                for name in self.query_fc.get(idx_name, []):
                    key = (idx_name, name)
                    if key not in progress:
                        continue
                    logger.info('[StringCounter index: %s] scanning for "%s"',
                                idx_name, name)
                    scanner = self.store.index_scan(idx_name, name)
                    progressed = 0
                    for cid in islice(scanner, progress[key], None):
                        if progressed >= batch_size:
                            break
                        if cid not in cids and cid not in blacklist:
                            cids.add(cid)
                            progressed += 1
                            yield cid
                    if progressed == 0:
                        progress.pop(key)
                    else:
                        progress[key] += progressed

    def labels_from_query(self, limit=None):
        '''ContentId -> [Label]'''
        return self.infer_subtopic_labels(limit=limit)

    def infer_subtopic_labels(self, limit=None):
        # The basic idea here is to aggressively gather truth data while
        # avoiding cross contamination with other subfolders. Since our
        # query is a (content_id, subtopic_id), we can use subtopic
        # connected components to achieve this.

        # Short aliases.
        cid, subid = self.query_content_id, self.query_subtopic_id

        # For positive labels, the only thing we can do is traverse the
        # subtopic connected component.
        # Don't impose a hard limit on positive labels. (There are probably
        # very few of them.)
        logger.info('Inferring positive labels for: %r', (cid, subid))
        pos_labels = (self.label_store.expand((cid, subid))
                      + list(self.positive_subtopic_labels()))
        logger.info('Inferring negative labels for: %r', (cid, subid))
        neg_labels = self.negative_subtopic_labels()

        pos_sample = web.streaming_sample(
            pos_labels, limit, limit=hard_limit(limit))
        neg_sample = web.streaming_sample(
            neg_labels, limit, limit=hard_limit(limit))
        print('-' * 79)
        print('POSITIVES\n', '\n'.join(map(repr, pos_sample)), '\n')
        print('-' * 79)
        print('NEGATIVES\n', '\n'.join(map(repr, neg_sample)))
        print('-' * 79)
        return pos_sample + neg_sample

    def positive_subtopic_labels(self):
        cid, subid = self.query_content_id, self.query_subtopic_id
        subfolders = list(self.folders.parent_subfolders((cid, subid)))
        for fid, subfolder_id in subfolders:
            for cid2, subid2 in self.folders.items(fid, subfolder_id):
                # Since this item is in the same folder as our query, we
                # consider it a positive example. But there's no explicit
                # label for it, so manufacture one.
                #
                # TODO: Fix annotator id here. (We need to push annotator
                # information down into the search engine; the rest is
                # trivial.) ---AG
                yield Label(cid, cid2,
                            Folders.DEFAULT_ANNOTATOR_ID,
                            CorefValue.Positive,
                            subid, subid2)

                # Sometimes the user will directly attach a positive label
                # to an item in the folder. This will grab those.
                for lab in self.label_store.directly_connected(cid2):
                    if lab.value == CorefValue.Positive \
                            and lab.subtopic_for(cid2) == subid2:
                        yield lab

    def negative_subtopic_labels(self):
        cid, subid = self.query_content_id, self.query_subtopic_id
        for lab in negative_subtopic_labels(self.label_store, self.folders,
                                            cid, subid):
            yield lab

    def content_objs_from_labels(self, labels):
        '''[Label] -> [(content_id, FeatureCollection)]'''
        is_mapping = lambda obj: isinstance(obj, collections.Mapping)

        def is_valid_fc((cid, fc)):
            if fc is None:
                return False
            if sum(1 for name in fc if is_mapping(fc[name])) == 0:
                return False
            return True

        ids = set()
        for lab in labels:
            ids.add(lab.content_id1)
            ids.add(lab.content_id2)
        return list(ifilter(is_valid_fc, self.store.get_many(ids)))
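# A hedged usage sketch for the class above, assuming `store`
# (dossier.store.Store) and `label_store` (dossier.label.LabelStore) are
# already constructed; the content and subtopic ids are illustrative.
learner = PairwiseFeatureLearner(
    store, label_store, 'example-query-cid',
    subtopic_id='subtopic|text|example',
    canopy_limit=100, label_limit=100)
try:
    # probabilities() yields ((content_id, FeatureCollection), probability)
    # pairs, as documented in its :rtype:.
    for (cid, fc), prob in learner.probabilities():
        print(cid, prob)
except InsufficientTrainingData:
    print('not enough labeled data to train a model')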
def new_folders(kvlclient, request):
    conf = {}
    if 'annotator_id' in request.query:
        conf['owner'] = request.query['annotator_id']
    return Folders(kvlclient, **conf)
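# A minimal sketch, assuming a bottle-style `request` whose `query` mapping
# may carry an `annotator_id` parameter (as the route machinery passes it
# through), and a configured `kvlclient`.
folders = new_folders(kvlclient, request)
for fid in folders.folders():
    for sid in folders.subfolders(fid):
        print(fid, sid)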