Example 1
    def __init__(self,
                 store,
                 label_store,
                 content_id,
                 subtopic_id=None,
                 canopy_limit=None,
                 label_limit=None):
        '''Build a new model.

        :param store: A store of feature collections.
        :type store: :class:`dossier.store.Store`
        :param label_store: A store of labels (ground truth data).
        :type label_store: :class:`dossier.label.LabelStore`
        :param str content_id: The query content id (which should correspond
                               to a feature collection in the ``store``).
                               If it doesn't, no results are returned.
        :param int canopy_limit: A limit on the number of results to return
                                 in the canopy (the initial index scan).
                                 This is meant to be a mechanism for resource
                                 control.
        :param int label_limit: A limit on the number of labels to use in
                                training. This is meant to be a mechanism for
                                resource control.
        '''
        self.store = store
        self.label_store = label_store
        self.folders = Folders(store.kvl)
        self.query_content_id = content_id
        self.query_subtopic_id = subtopic_id
        self.query_fc = None
        self.canopy_limit = canopy_limit
        self.label_limit = label_limit
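A minimal construction sketch follows, assuming this initializer belongs to dossier.models.PairwiseFeatureLearner (shown in full in a later example) and that store and label_store were already built on top of a kvlayer client; the content and subtopic ids are purely illustrative.

from dossier.models import PairwiseFeatureLearner

learner = PairwiseFeatureLearner(
    store, label_store,
    'web|example-content-id',          # illustrative query content id
    subtopic_id='subtopic|text|1',     # illustrative subtopic id
    canopy_limit=100,                  # cap on the initial index scan
    label_limit=100)                   # cap on labels used for training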
Example 2
def v1_fc_put(request, response, store, kvlclient, tfidf, cid):
    '''Store a single feature collection.

    The route for this endpoint is:
    ``PUT /dossier/v1/feature-collections/<content_id>``.

    ``content_id`` is the id to associate with the given feature
    collection. The feature collection should be in the request
    body serialized as JSON.

    Alternatively, if the request's ``Content-type`` is
    ``text/html``, then a feature collection is generated from the
    HTML. The generated feature collection is then returned as a
    JSON payload.

    This endpoint returns status ``201`` upon successful
    storage. An existing feature collection with id
    ``content_id`` is overwritten.
    '''
    tfidf = tfidf or None
    if request.headers.get('content-type', '').startswith('text/html'):
        url = urllib.unquote(cid.split('|', 1)[1])
        fc = etl.create_fc_from_html(url, request.body.read(), tfidf=tfidf)
        logger.info('created FC for %r', cid)
        store.put([(cid, fc)])
        return fc_to_json(fc)
    else:
        fc = FeatureCollection.from_dict(json.load(request.body))
        keywords = set()
        for subid in fc:
            if subid.startswith('subtopic'):
                ty = subtopic_type(subid)
                if ty in ('text', 'manual'):
                    # get the user selected string
                    data = typed_subtopic_data(fc, subid)
                    map(keywords.add, cleanse(data).split())
                    keywords.add(cleanse(data))

        folders = Folders(kvlclient)
        for fid, sid in folders.parent_subfolders(cid):
            if not isinstance(fid, unicode):
                fid = fid.decode('utf8')
            if not isinstance(sid, unicode):
                sid = sid.decode('utf8')
            keywords.add(cleanse(fid))
            keywords.add(cleanse(sid))

        fc[u'keywords'] = StringCounter(keywords)
        store.put([(cid, fc)])
        response.status = 201
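As a client-side illustration of this endpoint, the sketch below uses the requests library against a locally running service; the host, port, content id, and JSON payload are assumptions for illustration only.

import requests

base = 'http://localhost:8080/dossier/v1/feature-collections/'
cid = 'web|example-content-id'

# Store a feature collection serialized as JSON; expect 201 on success.
resp = requests.put(base + cid, json={'keywords': {'example': 1}})
assert resp.status_code == 201

# Or send HTML and get the generated feature collection back as JSON.
resp = requests.put(base + cid,
                    data='<html><body>hello</body></html>',
                    headers={'Content-type': 'text/html'})
fc_json = resp.json()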
Example 3
def v1_fc_put(request, response, store, kvlclient, tfidf, cid):
    '''Store a single feature collection.

    The route for this endpoint is:
    ``PUT /dossier/v1/feature-collections/<content_id>``.

    ``content_id`` is the id to associate with the given feature
    collection. The feature collection should be in the request
    body serialized as JSON.

    Alternatively, if the request's ``Content-type`` is
    ``text/html``, then a feature collection is generated from the
    HTML. The generated feature collection is then returned as a
    JSON payload.

    This endpoint returns status ``201`` upon successful
    storage. An existing feature collection with id
    ``content_id`` is overwritten.
    '''
    tfidf = tfidf or None
    if request.headers.get('content-type', '').startswith('text/html'):
        url = urllib.unquote(cid.split('|', 1)[1])
        fc = etl.create_fc_from_html(url, request.body.read(), tfidf=tfidf)
        logger.info('created FC for %r', cid)
        store.put([(cid, fc)])
        return fc_to_json(fc)
    else:
        fc = FeatureCollection.from_dict(json.load(request.body))
        keywords = set()
        for subid in fc:
            if subid.startswith('subtopic'):
                ty = subtopic_type(subid)
                if ty in ('text', 'manual'):
                    # get the user selected string
                    data = typed_subtopic_data(fc, subid)
                    map(keywords.add, cleanse(data).split())
                    keywords.add(cleanse(data))

        folders = Folders(kvlclient)
        for fid, sid in folders.parent_subfolders(cid):
            if not isinstance(fid, unicode):
                fid = fid.decode('utf8')
            if not isinstance(sid, unicode):
                sid = sid.decode('utf8')
            keywords.add(cleanse(fid))
            keywords.add(cleanse(sid))

        fc[u'keywords'] = StringCounter(keywords)
        store.put([(cid, fc)])
        response.status = 201
Example 4
def main():
    p = argparse.ArgumentParser(
        description='SortingDesk report generation tool')
    p.add_argument('-c',
                   '--config',
                   required=True,
                   help='dossier stack YAML config file')
    p.add_argument('-o',
                   '--output',
                   required=True,
                   help='path to write Excel workbook file')
    p.add_argument('-u',
                   '--user',
                   default='unknown',
                   help='user name (default=ALL)')
    p.add_argument('folder', help='folder name')
    p.add_argument('subfolder',
                   nargs='?',
                   default=None,
                   help='subfolder name (default=ALL)')
    args = p.parse_args()

    config = yakonfig.set_default_config([kvlayer], filename=args.config)
    factory = Factory(config)
    store = factory.create(Store)

    # Instantiate and run report generator.
    folders = Folders(kvlayer.client())
    gen = ReportGenerator(store,
                          folders,
                          args.folder,
                          subfolder_name=args.subfolder,
                          user=args.user)
    with open(args.output, 'wb+') as out:
        gen.run(out)
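A hypothetical invocation follows; the entry-point name dossier_report is an assumption, since it is not shown in this code.

    dossier_report -c config.yaml -o report.xlsx 'My Folder' 'My Subfolder'

Omitting the subfolder argument reports on every subfolder of the folder, and -u selects the user whose folders are read.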
Example 5
    def __init__(self, store, label_store, content_id, subtopic_id=None,
                 canopy_limit=None, label_limit=None):
        '''Build a new model.

        :param store: A store of feature collections.
        :type store: :class:`dossier.store.Store`
        :param label_store: A store of labels (ground truth data).
        :type label_store: :class:`dossier.label.LabelStore`
        :param str content_id: The query content id (which should correspond
                               to a feature collection in the ``store``).
                               If it doesn't, no results are returned.
        :param int canopy_limit: A limit on the number of results to return
                                 in the canopy (the initial index scan).
                                 This is meant to be a mechanism for resource
                                 control.
        :param int label_limit: A limit on the number of labels to use in
                                training. This is meant to be a mechanism for
                                resource control.
        '''
        self.store = store
        self.label_store = label_store
        self.folders = Folders(store.kvl)
        self.query_content_id = content_id
        self.query_subtopic_id = subtopic_id
        self.query_fc = None
        self.canopy_limit = canopy_limit
        self.label_limit = label_limit
Example 6
    def _generate_for_subfolder(self, sid):
        '''Generate a report for a subfolder.

        :param sid: The subfolder id; assumed valid
        '''
        # TODO: the following assumes subfolder names can be constructed from a
        # subfolder id, which might not be the case in the future.
        name = self._sanitise_sheetname(uni(Folders.id_to_name(sid)))
        ws = self.workbook.add_worksheet(name)
        fmt = self.formats
        ws.write("A1", "Dossier report", fmt['title'])
        ws.write("A2", "%s | %s" % (uni(self.folder_name), name))

        # Column dimensions
        ws.set_column('A:A', 37)
        ws.set_column('B:B', 37)
        ws.set_column('C:C', 37)
        ws.set_column('D:D', 8)
        ws.set_column('E:E', 30)
        ws.set_column('F:F', 37)

        # Header
        ws.write("A4", "Id", fmt['header'])
        ws.write("B4", "URL", fmt['header'])
        ws.write("C4", "Subtopic Id", fmt['header'])
        ws.write("D4", "Type", fmt['header'])
        ws.write("E4", "Content", fmt['header'])
        ws.write("F4", "Image URL", fmt['header'])

        # TODO: we probably want to wrap the following in a try/except block,
        # in case the call to subtopics fails.
        row = 4
        for i in subtopics(self.store, self.folders, self.folder_id, sid,
                           self.user):
            Item.construct(self, i).generate_to(ws, row)
            row += 1
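The worksheet calls above assume an xlsxwriter workbook and a formats dict prepared elsewhere in the generator; the sketch below shows a minimal version of that setup (the specific format options are illustrative, not taken from the report generator).

import xlsxwriter

workbook = xlsxwriter.Workbook('report.xlsx')
formats = {
    'title': workbook.add_format({'bold': True, 'font_size': 14}),
    'header': workbook.add_format({'bold': True, 'bottom': 1}),
}
ws = workbook.add_worksheet('example')
ws.write('A1', 'Dossier report', formats['title'])
workbook.close()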
Example 7
class same_subfolder(web.Filter):
    def __init__(self, kvlclient, label_store):
        super(same_subfolder, self).__init__()
        self.kvl = kvlclient
        self.label_store = label_store
        self.folders = Folders(self.kvl)

    def create_predicate(self):
        subfolders = self.folders.parent_subfolders(self.query_content_id)
        cids = set()
        for folder_id, subfolder_id in subfolders:
            for cid, subid in self.folders.items(folder_id, subfolder_id):
                cids.add(cid)
                # Also add directly connected labels.
                for lab in self.label_store.directly_connected((cid, subid)):
                    cids.add(lab.other(cid))
        return lambda (content_id, fc): content_id not in cids
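A usage sketch for this filter follows; it assumes the dossier.web framework normally supplies query_content_id (set by hand here for illustration) and that candidates is a list of (content_id, FeatureCollection) pairs.

flt = same_subfolder(kvlclient, label_store)
flt.query_content_id = 'web|example-content-id'  # normally set by dossier.web
pred = flt.create_predicate()

# Keep only candidates not already foldered with the query.
kept = [(cid, fc) for cid, fc in candidates if pred((cid, fc))]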
Example 8
class same_subfolder(web.Filter):
    def __init__(self, kvlclient, label_store):
        super(same_subfolder, self).__init__()
        self.kvl = kvlclient
        self.label_store = label_store
        self.folders = Folders(self.kvl)

    def create_predicate(self):
        subfolders = self.folders.parent_subfolders(self.query_content_id)
        cids = set()
        for folder_id, subfolder_id in subfolders:
            for cid, subid in self.folders.items(folder_id, subfolder_id):
                cids.add(cid)
                # Also add directly connected labels.
                for lab in self.label_store.directly_connected((cid, subid)):
                    cids.add(lab.other(cid))
        return lambda (content_id, fc): content_id not in cids
Example 9
    def _generate_report_single(self, sid):
        '''Generate a report for the subfolder given by ``sid``.

        The main purpose of this method is to make sure the subfolder given
        by ``sid`` does indeed exist. All real work is delegated to
        ``_generate_for_subfolder``.

        :param sid: The subfolder id

        Private method.
        '''
        assert self.workbook is not None
        assert sid is not None

        # Ensure subfolder exists
        if sid not in self.folders.subfolders(self.folder_id, self.user):
            subfolder = Folders.id_to_name(sid)
            print("E: subfolder not found: %s" % subfolder, file=sys.stderr)
            return

        self._generate_for_subfolder(sid)
Example 10
    def _generate_report_single(self, sid):
        '''Generate a report for the subfolder given by ``sid``.

        The main purpose of this method is to make sure the subfolder given
        by ``sid`` does indeed exist. All real work is delegated to
        ``_generate_for_subfolder``.

        :param sid: The subfolder id

        Private method.
        '''
        assert self.workbook is not None
        assert sid is not None

        # Ensure subfolder exists
        if sid not in self.folders.subfolders(self.folder_id, self.user):
            subfolder = Folders.id_to_name(sid)
            print("E: subfolder not found: %s" % subfolder, file=sys.stderr)
            return

        self._generate_for_subfolder(sid)
Example 11
    def _generate_for_subfolder(self, sid):
        '''Generate a report for a subfolder.

        :param sid: The subfolder id; assumed valid
        '''
        # TODO: the following assumes subfolder names can be constructed from a
        # subfolder id, which might not be the case in the future.
        name = self._sanitise_sheetname(uni(Folders.id_to_name(sid)))
        ws = self.workbook.add_worksheet(name)
        fmt = self.formats
        ws.write("A1", "Dossier report", fmt['title'])
        ws.write("A2", "%s | %s" % (uni(self.folder_name), name))

        # Column dimensions
        ws.set_column('A:A', 37)
        ws.set_column('B:B', 37)
        ws.set_column('C:C', 37)
        ws.set_column('D:D', 8)
        ws.set_column('E:E', 30)
        ws.set_column('F:F', 37)

        # Header
        ws.write("A4", "Id", fmt['header'])
        ws.write("B4", "URL", fmt['header'])
        ws.write("C4", "Subtopic Id", fmt['header'])
        ws.write("D4", "Type", fmt['header'])
        ws.write("E4", "Content", fmt['header'])
        ws.write("F4", "Image URL", fmt['header'])

        # TODO: we probably want to wrap the following in a try/except block,
        # in case the call to subtopics fails.
        row = 4
        for i in subtopics(self.store, self.folders, self.folder_id, sid,
                           self.user):
            Item.construct(self, i).generate_to(ws, row)
            row += 1
Example 12
    def subfolder_id(self):
        return Folders.name_to_id(self.subfolder_name)
Example 13
    def folder_id(self):
        return Folders.name_to_id(self.folder_name)
Example 14
def traverse_extract_fetch(config, wukey, stop_after_extraction=False):
    '''Given a config and a
    `wukey=cbor.dumps((folder_name,subfolder_name))`, traverse the
    folders to generate queries, issue them to Google, fetch the
    results, and ingest them.

    '''

    config.kvlclient.setup_namespace({'openquery': (str,)})
    try:
        data = list(config.kvlclient.get('openquery', (wukey,)))
        if data:
            if data[0][1]:
                logger.info('found existing query results: %r', data)
                return
            else:
                config.kvlclient.delete('openquery', (wukey,))
    except:
        logger.error('failed to get data from existing table', exc_info=True)

    fid, sid = cbor.loads(wukey)
    tfidf = config.tfidf
    folders = Folders(config.kvlclient)
    fetcher = Fetcher()

    ## To disable the keyword extractor model, uncomment the next
    ## three lines (`get_subfolder_queries`) and comment out the
    ## following two lines (`extract_keyword_queries`).
    #keyword_feature_keys = []
    #queries = get_subfolder_queries(
    #    config.store, config.label_store, folders, fid, sid)

    queries, keyword_feature_keys, has_observations = extract_keyword_queries(
        config.store, config.label_store, folders, fid, sid)

    logger.info('Model found %d queries: %r', len(queries), queries)

    if stop_after_extraction:
        return

    keywords = set()
    for key in keyword_feature_keys:
        ckey = cleanse(key.decode('utf8'))
        keywords.add(ckey)
        for part in ckey.split():
            keywords.add(part)

    #link2queries = defaultdict(set)
    links = set()
    logger.info('searching google for: %r', queries)
    for q in queries:
        for result in config.google.web_search_with_paging(q, limit=10):
            links.add(result['link'])
            #map(link2queries[result['link']].add, cleanse(q.decode('utf8')).split())
            logger.info('discovered %r', result['link'])

    result = None

    #logger.info('got %d URLs from %d queries', len(link2queries), len(queries))
    logger.info('got %d URLs from %d queries', len(links), len(queries))

    # content_ids gets modified within the 'callback' closure
    content_ids = []
    #for link, queries in link2queries.items():

    def callback(si, link):
        if si is None: return
        cid_url = hashlib.md5(str(link)).hexdigest()
        cid = etl.interface.mk_content_id(cid_url)
        content_ids.append(cid)

        # hack alert!
        # We currently use FCs to store subtopic text data, which
        # means we cannot overwrite existing FCs with reckless
        # abandon. So we adopt a heuristic: check if an FC already
        # exists, and if it does, check if it is being used to store
        # user data. If so, don't overwrite it and move on.
        fc = config.store.get(cid)
        if fc is not None and any(k.startswith('subtopic|')
                                  for k in fc.iterkeys()):
            logger.info('skipping ingest for %r (abs url: %r) because '
                        'an FC with user data already exists.',
                        cid, link)
            return

        other_features = {
            u'keywords': StringCounter(keywords), #list(queries)),
        }

        try:
            fc = etl.create_fc_from_html(
                link, si.body.raw,
                encoding=si.body.encoding or 'utf-8', tfidf=tfidf,
                other_features=other_features,
            )
            if not fc:
                logger.info('failed to get an FC, moving on')
                return
            logger.info('created FC for %r (abs url: %r)',
                        cid, link)
            config.store.put([(cid, fc)])
        except Exception:
            logger.info('trapped ingest failure on %r (abs url: %r)',
                        cid, link, exc_info=True)

    logger.info('FETCHING using ASYNC')
    fetcher.get_async(islice(links, None), callback)

    data = json.dumps({'content_ids': content_ids})
    logger.info('saving %d content_ids in %d bytes on wukey %r',
                len(content_ids), len(data), wukey)
    config.kvlclient.put('openquery', ((wukey,), data))
    logger.info('done saving for %r', wukey)
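A minimal sketch of building a work-unit key and invoking the traversal; config is assumed to be the same web config object used above, and the folder/subfolder ids are illustrative (they can be derived from names with Folders.name_to_id, as in the earlier examples).

import cbor

wukey = cbor.dumps(('my_folder_id', 'my_subfolder_id'))
traverse_extract_fetch(config, wukey, stop_after_extraction=True)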
Example 15
def worker(work_unit, max_sample=1000):
    '''Expects a coordinate WorkUnit for DragNet and runs the following
    steps:

    1. Scan all dossiers at the *folder* level and assemble a feature
    vector for each folder -- see `make_feature`.

    2. Train a multinomial naive Bayes classifier that treats each
    *folder* as a classifier target.

    3. Sample the corpus by scanning up to `max_sample` items and
    applying the classifier to each one to get an approximate "size"
    of each folder.

    4. Bootstrap by treating those classifier predictions as truth
    data and extract the learned features that are predictive as new
    query strings.

    5. Put the data in kvlayer for the webservice endpoint to return
    to the polling client -- see ``dossier.models.routes``.

    '''
    if 'config' not in work_unit.spec:
        raise coordinate.exceptions.ProgrammerError(
            'could not run dragnet without global config')

    web_conf = Config()
    unitconf = work_unit.spec['config']
    with yakonfig.defaulted_config([coordinate, kvlayer, dblogger, web_conf],
                                   config=unitconf):

        labels = []
        D = list()

        label2fid = dict()

        rejects = set()
        keepers = set()

        # 1. make a classifier target for each *folder*, ignoring
        # subfolder structure
        FT = Folders(web_conf.kvlclient)
        for idx, fid in enumerate(FT.folders()):
            label2fid[idx] = fid
            for sid in FT.subfolders(fid):
                for cid, subtopic_id in FT.items(fid, sid):
                    fc = web_conf.store.get(cid)
                    if fc:
                        # NB: first call to make_feature
                        feat, _rejects, _keepers = make_feature(fc)
                    else:
                        # No FC stored for this item: skip it so that
                        # `feat` below is never undefined or stale.
                        continue
                    D.append(feat)
                    labels.append(idx)
                    rejects.update(_rejects)
                    keepers.update(_keepers)
                    logger.info('fid=%r, observation: %r', fid, cid)

        # 2. Convert the StringCounters into an sklearn format and
        # train MultinomialNB
        logger.info('transforming...')
        v = DictVectorizer(sparse=False)
        X = v.fit_transform(D)
        logger.info('transform fit done.')

        labels = np.array(labels)

        # Fit the sklearn multinomial naive Bayes classifier
        clf = MultinomialNB()
        clf.fit(X, labels)
        logger.info('fit MultinomialNB')

        # 3. Scan the corpus up to max_sample putting the items into
        # each target to get an approx "size" of the Folder
        counts = Counter()
        for cid, fc in islice(web_conf.store.scan(), max_sample):
            # build the same feature vector as the training process
            feat, _rejects, _keepers = make_feature(fc)
            X = v.transform([feat])
            # predict which folder it belongs in
            target = clf.predict(X[0])[0]
            # count the effective size of that folder in this sample
            counts[label2fid[target]] += 1

        logger.info('counts done')

        ## 4. Bootstrap by treating those classifier predictions as
        ## truth data and extract the learned features that are
        ## predictive as new query strings.
        clusters = []
        for idx in sorted(set(labels)):
            logger.debug('considering cluster: %d', idx)
            try:
                all_features = v.inverse_transform(clf.feature_log_prob_[idx])[0]
            except:
                logger.warn('beyond edge on cluster %d', idx)
                continue
            words = Counter(all_features)
            ordered = sorted(words.items(),
                             key=operator.itemgetter(1), reverse=True)
            filtered = []
            for it in ordered:
                if is_bad_token(it[0]): continue

                if is_username(it[0]):
                    logger.debug('%r is_username', it[0])
                #else:
                #    continue
                filtered.append(it)
                if len(filtered) > 100: # hard cutoff
                    break

            # normalize cluster size exponentially
            if not filtered:
                # every candidate token was filtered out for this cluster
                continue
            biggest = exp(filtered[0][1])
            # rescale all by biggest
            filtered = [(key, int(round(counts[label2fid[idx]] * exp(w) / biggest))) for key, w in filtered]
            # describe what we just figured out
            logger.info('%s --> %r', label2fid[idx], ['%s: %d' % it for it in filtered[:10]])

            # return build the JSON-serializable format for the
            # DragNet UI embedded inside SortingDesk
            cluster = []
            cluster.append({'caption': label2fid[idx],
                            'weight': counts[label2fid[idx]],
                            'folder_id': None,
                            })
            cluster += [{'caption': caption, 'weight': weight, 'folder_id': label2fid[idx]} for caption, weight in filtered if weight > 0]
            clusters.append(cluster)

        # 5. Put the data in kvlayer for webservice end point to
        # return to polling client
        web_conf.kvlclient.setup_namespace({'dragnet': (str,)})
        web_conf.kvlclient.put('dragnet', (('dragnet',), json.dumps({'clusters': clusters})))
        return dict(counts)
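Steps 2 and 3 hinge on sklearn's DictVectorizer and MultinomialNB; the self-contained sketch below shows the same vectorize/fit/predict cycle on toy word-count dicts (the data is illustrative, not the dossier feature pipeline).

from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import MultinomialNB

D = [{'boston': 3, 'hotel': 1}, {'python': 2, 'numpy': 1}]  # one counter per item
labels = [0, 1]                                             # folder index per item

v = DictVectorizer(sparse=False)
X = v.fit_transform(D)

clf = MultinomialNB()
clf.fit(X, labels)

# Predict the folder index of a new item, as in the corpus scan above.
target = clf.predict(v.transform([{'python': 1, 'hotel': 2}]))[0]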
Example 16
    def folder_id(self):
        return Folders.name_to_id(self.folder_name)
Example 17
class PairwiseFeatureLearner(object):
    '''A pairwise active learning model.

    This active learning model applies
    :class:`~sklearn.linear_model.LogisticRegression` on-the-fly
    as a user (or simulated user) interacts with content
    via the web services provided by :mod:`dossier.web`.

    This reads :class:`~dossier.label.Label` objects from
    :class:`~dossier.label.LabelStore` and provides predictions of
    pairwise equivalence, which can be used for coreference resolution,
    clustering, and ranking.

    .. automethod:: dossier.models.PairwiseFeatureLearner.__init__
    .. automethod:: dossier.models.PairwiseFeatureLearner.probabilities
    '''
    def __init__(self,
                 store,
                 label_store,
                 content_id,
                 subtopic_id=None,
                 canopy_limit=None,
                 label_limit=None):
        '''Build a new model.

        :param store: A store of feature collections.
        :type store: :class:`dossier.store.Store`
        :param label_store: A store of labels (ground truth data).
        :type label_store: :class:`dossier.label.LabelStore`
        :param str content_id: The query content id (which should correspond
                               to a feature collection in the ``store``).
                               If it doesn't, no results are returned.
        :param int canopy_limit: A limit on the number of results to return
                                 in the canopy (the initial index scan).
                                 This is meant to be a mechanism for resource
                                 control.
        :param int label_limit: A limit on the number of labels to use in
                                training. This is meant to be a mechanism for
                                resource control.
        '''
        self.store = store
        self.label_store = label_store
        self.folders = Folders(store.kvl)
        self.query_content_id = content_id
        self.query_subtopic_id = subtopic_id
        self.query_fc = None
        self.canopy_limit = canopy_limit
        self.label_limit = label_limit

    def as_result(self, cid, fc, p):
        fnames = sorted(set(self.query_fc.keys()).intersection(fc.keys()))
        intermediates = dict([(n, {
            'kernel': 'cosine',
            'feature1': n,
            'feature2': n,
            'kernel_value': None,
            'weight': None,
            'common_feature_values': []
        }) for n in fnames])
        for n in fnames:
            intermediates[n]['weight'] = self.feature_weights.get(n)
        for n, qfeat, cfeat in ((n, self.query_fc[n], fc[n]) for n in fnames):
            if not isinstance(qfeat, StringCounter) \
                    or not isinstance(cfeat, StringCounter):
                continue
            vals = set(qfeat.keys()).intersection(cfeat.keys())
            intermediates[n]['common_feature_values'] = \
                sorted(filter(None, vals))

            all_vals = sorted(set(qfeat.keys()).union(cfeat.keys()))
            if len(all_vals) > 0:
                qcounts = [qfeat.get(v, 0) for v in all_vals]
                ccounts = [cfeat.get(v, 0) for v in all_vals]
                sim = cosine(qcounts, ccounts)
                if not math.isnan(sim):
                    intermediates[n]['kernel_value'] = sim
        return (cid, fc, {
            'probability': p,
            'intermediate_model_results': intermediates.values(),
        })

    def probabilities(self):
        '''Trains a model and predicts recommendations.

        If the query feature collection could not be found or if there
        is insufficient training data, an empty list is returned.

        Otherwise, a list of content objects (tuples of content
        id and feature collection) and probabilities is returned.
        The probability is generated from the model, and reflects
        confidence of the model that the corresponding content object
        is related to the query based on the ground truth data.

        On a large database, random samples are used for training, so
        this function is not deterministic.

        :rtype: ``list`` of
          ((``content_id``, :class:`dossier.fc.FeatureCollection`),
          probability)
        '''
        self.query_fc = self.store.get(self.query_content_id)
        if self.query_fc is None:
            logger.warning('Could not find FC for %s', self.query_content_id)
            return []

        # Try the canopy query before training, because if the canopy query
        # gives us nothing, then there's no point in the additional work.
        #
        # Possible optimization: If the canopy query yields fewer than N
        # results, then can we just return all of them? ---AG
        #
        # N.B Doing the canopy query first will cause things to be slower
        # when there is insufficient training data.
        candidates = self.canopy(limit=self.canopy_limit)
        if len(candidates) == 0:
            logger.info(
                'Could not find any candidates in a canopy query by '
                'scanning the following indexes: %s',
                ', '.join(self.store.index_names()))
            return []

        # Get labels from the database and translate them to the form
        # `[{-1, 1}, i, j]` where `i, j` are indices into the list
        # `content_objs`, which has type `[(content_id, FeatureCollection)]`.
        logger.info('Fetching labels...')
        labels = list(self.labels_from_query(limit=self.label_limit))
        logger.info('Fetching FCs from labels...')
        content_objs = self.content_objs_from_labels(labels)
        indexed_labels = labels_to_indexed_coref_values(content_objs, labels)

        logger.info('Training...')
        model = self.train(content_objs, indexed_labels)
        if model is None:
            logger.info(
                'Could not train model: insufficient training data. '
                '(query content id: %s)', self.query_content_id)
            raise InsufficientTrainingData

        feature_names, classifier, transformer = model
        return zip(
            candidates,
            self.classify(feature_names, classifier, transformer, candidates))

    def train(self, content_objs, idx_labels):
        '''Trains and returns a model using sklearn.

        If there are new labels to add, they can be added, returns an
        sklearn model which can be used for prediction and getting
        features.

        This method may return ``None`` if there is insufficient
        training data to produce a model.

        :param labels: Ground truth data.
        :type labels: list of ``({-1, 1}, index1, index2)``.
        '''
        # We have insufficient training data when there is only one or
        # fewer classes of labels.
        if len(set([lab[0] for lab in idx_labels])) <= 1:
            return None

        fcs = [fc for _, fc in content_objs]
        feature_names = vectorizable_features(fcs)
        dis = dissimilarities(feature_names, fcs)

        phi_dicts, labels = [], []  # lists are in correspondence
        for coref_value, i, j in idx_labels:
            # i, j are indices into the list `fcs`
            labels.append(coref_value)  # either -1 or 1
            phi_dict = dict([(name, dis[name][i, j])
                             for name in feature_names])
            phi_dicts.append(phi_dict)

        vec = dict_vector()
        training_data = vec.fit_transform(phi_dicts)

        model = LogisticRegression(class_weight='auto', penalty='l1')
        model.fit(training_data, labels)
        self.feature_weights = dict([(name, model.coef_[0][i])
                                     for i, name in enumerate(feature_names)])
        return feature_names, model, vec

    def classify(self,
                 feature_names,
                 classifier,
                 transformer,
                 candidates,
                 query_fc=None):
        '''Returns ``[probability]`` in correspondence with
        ``candidates``.

        Where each ``probability`` corresponds to the probability that
        the corresponding candidate is classified with a positive label
        given the training data.

        The list returned is in correspondence with the list of
        candidates given.

        N.B. The contract of this method should be simplified by
        bundling ``feature_names``, ``classifier`` and ``transformer``
        into one thing known as "the model." ---AG
        '''
        if query_fc is None:
            query_fc = self.query_fc
        dis = {}
        for name in feature_names:
            vec = dict_vector()
            query = vec.fit_transform([get_feat(query_fc, name)])
            cans = vec.transform(get_feat(fc, name) for _, fc in candidates)
            dis[name] = 1 - pairwise_distances(
                cans, query, metric='cosine', n_jobs=1)[:, 0]

        # in correspondence with `candidates`
        phi_dicts = transformer.transform([
            dict([(name, dis[name][i]) for name in feature_names])
            for i in xrange(len(candidates))
        ])
        return classifier.predict_proba(phi_dicts)[:, 1]

    def canopy(self, limit=None):
        ids = web.streaming_sample(
            self.canopy_ids(limit_hint=hard_limit(limit)), limit,
            hard_limit(limit))
        # I don't think it ever makes sense to include the query
        # as part of the candidate set.
        return filter(lambda (_, fc): fc is not None, self.store.get_many(ids))

    def canopy_ids(self, limit_hint=None):
        limit_hint = limit_hint or 1000
        # TODO: It seems like this should pre-emptively discard content
        # ids that have already participated in a *direct* label with
        # the query. But I think this is a premature optimization since
        # the filtering functions will take care of it. (This optimization
        # would mean fewer kernel computations.)
        blacklist = set([self.query_content_id])
        cids = set()

        # OK, so it turns out that a naive index scan is pretty inflexible and
        # arbitrary. The issue is that in a big enough data set, the first
        # index scan will probably exhaust all of our result set, which
        # means result sets will never see any variety.
        #
        # Instead, we'll try to sample from each index in small batch sizes.
        # This is a heuristic; not a principled approach. ---AG
        index_names = self.store.index_names()
        batch_size = limit_hint / 10
        progress = {}  # idx, name |--> last end
        # When `progress` is empty, the following loop will terminate.
        # An index is removed from `progress` when it no longer produces
        # results.
        for idx_name in index_names:
            feat = self.query_fc.get(idx_name)
            if isinstance(feat, StringCounter):
                for name in feat:
                    if len(name) > 0:
                        progress[(idx_name, name)] = 0

        logger.info('starting index scan (query content id: %s)',
                    self.query_content_id)
        while len(progress) > 0:
            for idx_name in index_names:
                for name in self.query_fc.get(idx_name, []):
                    key = (idx_name, name)
                    if key not in progress:
                        continue
                    logger.info('[StringCounter index: %s] scanning for "%s"',
                                idx_name, name)
                    scanner = self.store.index_scan(idx_name, name)
                    progressed = 0
                    for cid in islice(scanner, progress[key], None):
                        if progressed >= batch_size:
                            break
                        if cid not in cids and cid not in blacklist:
                            cids.add(cid)
                            progressed += 1
                            yield cid
                    if progressed == 0:
                        progress.pop(key)
                    else:
                        progress[key] += progressed

    def labels_from_query(self, limit=None):
        '''ContentId -> [Label]'''
        return self.infer_subtopic_labels(limit=limit)

    def infer_subtopic_labels(self, limit=None):
        # The basic idea here is to aggressively gather truth data while
        # avoiding cross contamination with other subfolders. Since our query
        # is a (content_id, subtopic_id), we can use subtopic connected
        # components to achieve this.

        # Short aliases.
        cid, subid = self.query_content_id, self.query_subtopic_id

        # For positive labels, the only thing we can do is traverse the
        # subtopic connected component.
        # Don't impose a hard limit on positive labels. (There are probably
        # very few of them.)
        logger.info('Inferring positive labels for: %r', (cid, subid))
        pos_labels = (self.label_store.expand(
            (cid, subid)) + list(self.positive_subtopic_labels()))
        logger.info('Inferring negative labels for: %r', (cid, subid))
        neg_labels = self.negative_subtopic_labels()

        pos_sample = web.streaming_sample(pos_labels,
                                          limit,
                                          limit=hard_limit(limit))
        neg_sample = web.streaming_sample(neg_labels,
                                          limit,
                                          limit=hard_limit(limit))
        print('-' * 79)
        print('POSITIVES\n', '\n'.join(map(repr, pos_sample)), '\n')
        print('-' * 79)
        print('NEGATIVES\n', '\n'.join(map(repr, neg_sample)))
        print('-' * 79)
        return pos_sample + neg_sample

    def positive_subtopic_labels(self):
        cid, subid = self.query_content_id, self.query_subtopic_id
        subfolders = list(self.folders.parent_subfolders((cid, subid)))

        for fid, subfolder_id in subfolders:
            for cid2, subid2 in self.folders.items(fid, subfolder_id):
                # Since this item is in the same folder as our query, we
                # consider it a positive example. But there's no explicit
                # label for it, so manufacture one.
                #
                # TODO: Fix annotator id here. (We need to push annotator
                # information down into the search engine; the rest is
                # trivial.) ---AG
                yield Label(cid, cid2, Folders.DEFAULT_ANNOTATOR_ID,
                            CorefValue.Positive, subid, subid2)

                # Sometimes the user will directly attach a positive label
                # to an item in the folder. This will grab those.
                for lab in self.label_store.directly_connected(cid2):
                    if lab.value == CorefValue.Positive \
                            and lab.subtopic_for(cid2) == subid2:
                        yield lab

    def negative_subtopic_labels(self):
        cid, subid = self.query_content_id, self.query_subtopic_id
        for lab in negative_subtopic_labels(self.label_store, self.folders,
                                            cid, subid):
            yield lab

    def content_objs_from_labels(self, labels):
        '''[Label] -> [(content_id, FeatureCollection)]'''
        is_mapping = lambda obj: isinstance(obj, collections.Mapping)

        def is_valid_fc((cid, fc)):
            if fc is None:
                return False
            if sum(1 for name in fc if is_mapping(fc[name])) == 0:
                return False
            return True

        ids = set()
        for lab in labels:
            ids.add(lab.content_id1)
            ids.add(lab.content_id2)
        return list(ifilter(is_valid_fc, self.store.get_many(ids)))
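A usage sketch for the full model; it assumes a learner constructed as in __init__ above and that some folders and labels already exist.

try:
    results = learner.probabilities()
except InsufficientTrainingData:
    results = []

# Rank candidates by the model's probability, highest first.
ranked = sorted(results, key=lambda pair: pair[1], reverse=True)
for (cid, fc), prob in ranked[:10]:
    print('%s %.3f' % (cid, prob))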
Example 18
class PairwiseFeatureLearner(object):
    '''A pairwise active learning model.

    This active learning model applies
    :class:`~sklearn.linear_model.LogisticRegression` on-the-fly
    as a user (or simulated user) interacts with content
    via the web services provided by :mod:`dossier.web`.

    This reads :class:`~dossier.label.Label` objects from
    :class:`~dossier.label.LabelStore` and provides predictions of
    pairwise equivalence, which can be used for coreference resolution,
    clustering, and ranking.

    .. automethod:: dossier.models.PairwiseFeatureLearner.__init__
    .. automethod:: dossier.models.PairwiseFeatureLearner.probabilities
    '''
    def __init__(self, store, label_store, content_id, subtopic_id=None,
                 canopy_limit=None, label_limit=None):
        '''Build a new model.

        :param store: A store of feature collections.
        :type store: :class:`dossier.store.Store`
        :param label_store: A store of labels (ground truth data).
        :type label_store: :class:`dossier.label.LabelStore`
        :param str content_id: The query content id (which should correspond
                               to a feature collection in the ``store``).
                               If it doesn't, no results are returned.
        :param int canopy_limit: A limit on the number of results to return
                                 in the canopy (the initial index scan).
                                 This is meant to be a mechanism for resource
                                 control.
        :param int label_limit: A limit on the number of labels to use in
                                training. This is meant to be a mechanism for
                                resource control.
        '''
        self.store = store
        self.label_store = label_store
        self.folders = Folders(store.kvl)
        self.query_content_id = content_id
        self.query_subtopic_id = subtopic_id
        self.query_fc = None
        self.canopy_limit = canopy_limit
        self.label_limit = label_limit

    def as_result(self, cid, fc, p):
        fnames = sorted(set(self.query_fc.keys()).intersection(fc.keys()))
        intermediates = dict([(n, {'kernel': 'cosine',
                             'feature1': n,
                             'feature2': n,
                             'kernel_value': None,
                             'weight': None,
                             'common_feature_values': []}) for n in fnames])
        for n in fnames:
            intermediates[n]['weight'] = self.feature_weights.get(n)
        for n, qfeat, cfeat in ((n, self.query_fc[n], fc[n]) for n in fnames):
            if not isinstance(qfeat, StringCounter) \
                    or not isinstance(cfeat, StringCounter):
                continue
            vals = set(qfeat.keys()).intersection(cfeat.keys())
            intermediates[n]['common_feature_values'] = \
                sorted(filter(None, vals))

            all_vals = sorted(set(qfeat.keys()).union(cfeat.keys()))
            if len(all_vals) > 0:
                qcounts = [qfeat.get(v, 0) for v in all_vals]
                ccounts = [cfeat.get(v, 0) for v in all_vals]
                sim = cosine(qcounts, ccounts)
                if not math.isnan(sim):
                    intermediates[n]['kernel_value'] = sim
        return (cid, fc, {
            'probability': p,
            'intermediate_model_results': intermediates.values(),
        })

    def probabilities(self):
        '''Trains a model and predicts recommendations.

        If the query feature collection could not be found or if there
        is insufficient training data, an empty list is returned.

        Otherwise, a list of content objects (tuples of content
        id and feature collection) and probabilities is returned.
        The probability is generated from the model, and reflects
        confidence of the model that the corresponding content object
        is related to the query based on the ground truth data.

        On a large database, random samples are used for training, so
        this function is not deterministic.

        :rtype: ``list`` of
          ((``content_id``, :class:`dossier.fc.FeatureCollection`),
          probability)
        '''
        self.query_fc = self.store.get(self.query_content_id)
        if self.query_fc is None:
            logger.warning('Could not find FC for %s', self.query_content_id)
            return []

        # Try the canopy query before training, because if the canopy query
        # gives us nothing, then there's no point in the additional work.
        #
        # Possible optimization: If the canopy query yields fewer than N
        # results, then can we just return all of them? ---AG
        #
        # N.B Doing the canopy query first will cause things to be slower
        # when there is insufficient training data.
        candidates = self.canopy(limit=self.canopy_limit)
        if len(candidates) == 0:
            logger.info(
                'Could not find any candidates in a canopy query by '
                'scanning the following indexes: %s',
                ', '.join(self.store.index_names()))
            return []

        # Get labels from the database and translate them to the form
        # `[{-1, 1}, i, j]` where `i, j` are indices into the list
        # `content_objs`, which has type `[(content_id, FeatureCollection)]`.
        logger.info('Fetching labels...')
        labels = list(self.labels_from_query(limit=self.label_limit))
        logger.info('Fetching FCs from labels...')
        content_objs = self.content_objs_from_labels(labels)
        indexed_labels = labels_to_indexed_coref_values(content_objs, labels)

        logger.info('Training...')
        model = self.train(content_objs, indexed_labels)
        if model is None:
            logger.info(
                'Could not train model: insufficient training data. '
                '(query content id: %s)', self.query_content_id)
            raise InsufficientTrainingData

        feature_names, classifier, transformer = model
        return zip(candidates, self.classify(
            feature_names, classifier, transformer, candidates))

    def train(self, content_objs, idx_labels):
        '''Trains and returns a model using sklearn.

        If there are new labels to add, they can be added, returns an
        sklearn model which can be used for prediction and getting
        features.

        This method may return ``None`` if there is insufficient
        training data to produce a model.

        :param labels: Ground truth data.
        :type labels: list of ``({-1, 1}, index1, index2)``.
        '''
        # We have insufficient training data when there is only one or
        # fewer classes of labels.
        if len(set([lab[0] for lab in idx_labels])) <= 1:
            return None

        fcs = [fc for _, fc in content_objs]
        feature_names = vectorizable_features(fcs)
        dis = dissimilarities(feature_names, fcs)

        phi_dicts, labels = [], []  # lists are in correspondence
        for coref_value, i, j in idx_labels:
            # i, j are indices into the list `fcs`
            labels.append(coref_value)  # either -1 or 1
            phi_dict = dict([(name, dis[name][i,j]) for name in feature_names])
            phi_dicts.append(phi_dict)

        vec = dict_vector()
        training_data = vec.fit_transform(phi_dicts)

        model = LogisticRegression(class_weight='auto', penalty='l1')
        model.fit(training_data, labels)
        self.feature_weights = dict([(name, model.coef_[0][i])
                                     for i, name in enumerate(feature_names)])
        return feature_names, model, vec

    def classify(self, feature_names, classifier, transformer, candidates,
                 query_fc=None):
        '''Returns ``[probability]`` in correspondence with
        ``candidates``.

        Where each ``probability`` corresponds to the probability that
        the corresponding candidate is classified with a positive label
        given the training data.

        The list returned is in correspondence with the list of
        candidates given.

        N.B. The contract of this method should be simplified by
        bundling ``feature_names``, ``classifier`` and ``transformer``
        into one thing known as "the model." ---AG
        '''
        if query_fc is None:
            query_fc = self.query_fc
        dis = {}
        for name in feature_names:
            vec = dict_vector()
            query = vec.fit_transform([get_feat(query_fc, name)])
            cans = vec.transform(get_feat(fc, name) for _, fc in candidates)
            dis[name] = 1 - pairwise_distances(
                cans, query, metric='cosine', n_jobs=1)[:,0]

        # in correspondence with `candidates`
        phi_dicts = transformer.transform(
            [dict([(name, dis[name][i]) for name in feature_names])
             for i in xrange(len(candidates))])
        return classifier.predict_proba(phi_dicts)[:,1]

    def canopy(self, limit=None):
        ids = web.streaming_sample(
            self.canopy_ids(limit_hint=hard_limit(limit)),
            limit, hard_limit(limit))
        # I don't think it ever makes sense to include the query
        # as part of the candidate set.
        return filter(lambda (_, fc): fc is not None, self.store.get_many(ids))

    def canopy_ids(self, limit_hint=None):
        limit_hint = limit_hint or 1000
        # TODO: It seems like this should pre-emptively discard content
        # ids that have already participated in a *direct* label with
        # the query. But I think this is a premature optimization since
        # the filtering functions will take care of it. (This optimization
        # would mean fewer kernel computations.)
        blacklist = set([self.query_content_id])
        cids = set()

        # OK, so it turns out that a naive index scan is pretty inflexible and
        # arbitrary. The issue is that in a big enough data set, the first
        # index scan will probably exhaust all of our result set, which
        # means result sets will never see any variety.
        #
        # Instead, we'll try to sample from each index in small batch sizes.
        # This is a heuristic; not a principled approach. ---AG
        index_names = self.store.index_names()
        batch_size = limit_hint / 10
        progress = {}  # idx, name |--> last end
        # When `progress` is empty, the following loop will terminate.
        # An index is removed from `progress` when it no longer produces
        # results.
        for idx_name in index_names:
            feat = self.query_fc.get(idx_name)
            if isinstance(feat, StringCounter):
                for name in feat:
                    if len(name) > 0:
                        progress[(idx_name, name)] = 0

        logger.info('starting index scan (query content id: %s)',
                    self.query_content_id)
        while len(progress) > 0:
            for idx_name in index_names:
                for name in self.query_fc.get(idx_name, []):
                    key = (idx_name, name)
                    if key not in progress:
                        continue
                    logger.info('[StringCounter index: %s] scanning for "%s"',
                                idx_name, name)
                    scanner = self.store.index_scan(idx_name, name)
                    progressed = 0
                    for cid in islice(scanner, progress[key], None):
                        if progressed >= batch_size:
                            break
                        if cid not in cids and cid not in blacklist:
                            cids.add(cid)
                            progressed += 1
                            yield cid
                    if progressed == 0:
                        progress.pop(key)
                    else:
                        progress[key] += progressed

    def labels_from_query(self, limit=None):
        '''ContentId -> [Label]'''
        return self.infer_subtopic_labels(limit=limit)

    def infer_subtopic_labels(self, limit=None):
        # The basic idea here is to aggressively gather truth data while
        # avoiding cross contamination with other subfolders. Since our query
        # is a (content_id, subtopic_id), we can use subtopic connected
        # components to achieve this.

        # Short aliases.
        cid, subid = self.query_content_id, self.query_subtopic_id

        # For positive labels, the only thing we can do is traverse the
        # subtopic connected component.
        # Don't impose a hard limit on positive labels. (There are probably
        # very few of them.)
        logger.info('Inferring positive labels for: %r', (cid, subid))
        pos_labels = (self.label_store.expand((cid, subid))
                      + list(self.positive_subtopic_labels()))
        logger.info('Inferring negative labels for: %r', (cid, subid))
        neg_labels = self.negative_subtopic_labels()

        pos_sample = web.streaming_sample(
            pos_labels, limit, limit=hard_limit(limit))
        neg_sample = web.streaming_sample(
            neg_labels, limit, limit=hard_limit(limit))
        print('-' * 79)
        print('POSITIVES\n', '\n'.join(map(repr, pos_sample)), '\n')
        print('-' * 79)
        print('NEGATIVES\n', '\n'.join(map(repr, neg_sample)))
        print('-' * 79)
        return pos_sample + neg_sample

    def positive_subtopic_labels(self):
        cid, subid = self.query_content_id, self.query_subtopic_id
        subfolders = list(self.folders.parent_subfolders((cid, subid)))

        for fid, subfolder_id in subfolders:
            for cid2, subid2 in self.folders.items(fid, subfolder_id):
                # Since this item is in the same folder as our query, we
                # consider it a positive example. But there's no explicit
                # label for it, so manufacture one.
                #
                # TODO: Fix annotator id here. (We need to push annotator
                # information down into the search engine; the rest is
                # trivial.) ---AG
                yield Label(cid, cid2,
                            Folders.DEFAULT_ANNOTATOR_ID,
                            CorefValue.Positive,
                            subid, subid2)

                # Sometimes the user will directly attach a positive label
                # to an item in the folder. This will grab those.
                for lab in self.label_store.directly_connected(cid2):
                    if lab.value == CorefValue.Positive \
                            and lab.subtopic_for(cid2) == subid2:
                        yield lab

    def negative_subtopic_labels(self):
        cid, subid = self.query_content_id, self.query_subtopic_id
        for lab in negative_subtopic_labels(self.label_store, self.folders,
                                            cid, subid):
            yield lab

    def content_objs_from_labels(self, labels):
        '''[Label] -> [(content_id, FeatureCollection)]'''
        is_mapping = lambda obj: isinstance(obj, collections.Mapping)
        def is_valid_fc((cid, fc)):
            if fc is None:
                return False
            if sum(1 for name in fc if is_mapping(fc[name])) == 0:
                return False
            return True

        ids = set()
        for lab in labels:
            ids.add(lab.content_id1)
            ids.add(lab.content_id2)
        return list(ifilter(is_valid_fc, self.store.get_many(ids)))
Example 19
    def __init__(self, kvlclient, label_store):
        super(same_subfolder, self).__init__()
        self.kvl = kvlclient
        self.label_store = label_store
        self.folders = Folders(self.kvl)
Example 20
    def __init__(self, kvlclient, label_store):
        super(same_subfolder, self).__init__()
        self.kvl = kvlclient
        self.label_store = label_store
        self.folders = Folders(self.kvl)
Example 21
def new_folders(kvlclient, request):
    conf = {}
    if 'annotator_id' in request.query:
        conf['owner'] = request.query['annotator_id']
    return Folders(kvlclient, **conf)
Example 22
    def subfolder_id(self):
        return Folders.name_to_id(self.subfolder_name)