def v0_highlighter_post(request, response, tfidf, cid):
    '''Obtain highlights for a document POSTed as the body, which is the
    pre-design-thinking structure of the highlights API.  See v1 below.

    NB: This end point will soon be deleted.

    The route for this endpoint is:
    ``POST /dossier/v0/highlighter/<cid>``.

    ``content_id`` is the id to associate with the given feature
    collection. The feature collection should be in the request body
    serialized as JSON.
    '''
    logger.info('got %r', cid)
    tfidf = tfidf or None
    content_type = request.headers.get('content-type', '')
    if not content_type.startswith('text/html'):
        logger.critical('content-type=%r', content_type)
        response.status = 415
        return {
            'error': {
                'code': 0,
                'message': 'content_type=%r and should be text/html'
                           % content_type,
            }
        }
    # cid is expected to look like '<prefix>|<percent-encoded-url>'
    url = urllib.unquote(cid.split('|', 1)[1])
    body = request.body.read()
    if len(body) == 0:
        response.status = 420
        return {'error': {'code': 1, 'message': 'empty body'}}
    logger.info('parsing %d bytes for url: %r', len(body), url)
    fc = etl.create_fc_from_html(url, body, tfidf=tfidf)
    if fc is None:
        logger.critical('failed to get FC using %d bytes from %r',
                        len(body), url)
        response.status = 506
        return {
            'error': {
                'code': 2,
                'message': 'FC not generated for that content',
            }
        }
    highlights = dict()
    for feature_name, pretty_name in feature_pretty_names:
        if feature_name not in fc:
            continue
        total = sum(fc[feature_name].values())
        if total == 0:
            # guard: an empty/zeroed counter would raise ZeroDivisionError
            continue
        # BUG FIX: force true division.  Under Python 2, integer counts
        # made ``count / total`` integer division, so every phrase's
        # weight (except a phrase holding the whole mass) came out 0.
        norm = float(total)
        highlights[pretty_name] = [
            (phrase, count / norm, [], [])
            for phrase, count in sorted(
                fc[feature_name].items(), key=itemgetter(1), reverse=True)
        ]
        logger.info('%r and %d keys', feature_name,
                    len(highlights[pretty_name]))
    return {'highlights': highlights}
def v1_fc_put(request, response, store, kvlclient, tfidf, cid):
    '''Store a single feature collection.

    The route for this endpoint is:
    ``PUT /dossier/v1/feature-collections/<content_id>``.

    ``content_id`` is the id to associate with the given feature
    collection. The feature collection should be in the request body
    serialized as JSON.

    Alternatively, if the request's ``Content-type`` is ``text/html``,
    then a feature collection is generated from the HTML. The generated
    feature collection is then returned as a JSON payload.

    This endpoint returns status ``201`` upon successful storage
    otherwise. An existing feature collection with id ``content_id`` is
    overwritten.
    '''
    tfidf = tfidf or None
    if request.headers.get('content-type', '').startswith('text/html'):
        # cid carries a percent-encoded URL after the first '|'.
        url = urllib.unquote(cid.split('|', 1)[1])
        fc = etl.create_fc_from_html(url, request.body.read(), tfidf=tfidf)
        logger.info('created FC for %r', cid)
        # NOTE(review): fc may be None if FC generation failed; that None
        # would be stored under cid and passed to fc_to_json -- confirm
        # this is intended (the v0 highlighter endpoint checks for None).
        store.put([(cid, fc)])
        return fc_to_json(fc)
    else:
        # JSON body: deserialize the FC posted by the client.
        fc = FeatureCollection.from_dict(json.load(request.body))
        # Collect keywords from any user-entered subtopic text features.
        keywords = set()
        for subid in fc:
            if subid.startswith('subtopic'):
                ty = subtopic_type(subid)
                if ty in ('text', 'manual'):
                    # get the user selected string; index both the whole
                    # phrase and its individual cleansed tokens.
                    data = typed_subtopic_data(fc, subid)
                    map(keywords.add, cleanse(data).split())
                    keywords.add(cleanse(data))
        # Also index the names of every folder/subfolder containing cid.
        folders = Folders(kvlclient)
        for fid, sid in folders.parent_subfolders(cid):
            # Folder ids may come back as UTF-8 bytes; normalize to unicode.
            if not isinstance(fid, unicode):
                fid = fid.decode('utf8')
            if not isinstance(sid, unicode):
                sid = sid.decode('utf8')
            keywords.add(cleanse(fid))
            keywords.add(cleanse(sid))
        fc[u'keywords'] = StringCounter(keywords)
        store.put([(cid, fc)])
        response.status = 201
def create_highlights(data, tfidf):
    '''compute highlights for `data`, store it in the store using
    `kvlclient`, and return a `highlights` response payload.

    ``data`` must provide ``'content-location'`` (the source URL) and
    ``'body'`` (the raw HTML).  On failure to build a feature
    collection, returns an error payload with ``state: ERROR``.

    NOTE(review): as written, the success path falls through and
    returns None -- presumably the rest of this function lives
    elsewhere; confirm callers handle this.
    '''
    try:
        fc = etl.create_fc_from_html(
            data['content-location'], data['body'],
            tfidf=tfidf, encoding=None)
    except Exception:
        logger.critical('failed to build FC', exc_info=True)
        return {
            'state': ERROR,
            'error': {
                'code': 7,
                # BUG FIX: format_exc takes a traceback *limit*, not an
                # exception object; passing exc only worked by accident
                # under Python 2's arbitrary cross-type comparisons.
                'message': 'internal error: %s' % traceback.format_exc(),
            }
        }
def v0_highlighter_post(request, response, tfidf, cid):
    '''Obtain highlights for a document POSTed as the body, which is the
    pre-design-thinking structure of the highlights API.  See v1 below.

    NB: This end point will soon be deleted.

    The route for this endpoint is:
    ``POST /dossier/v0/highlighter/<cid>``.

    ``content_id`` is the id to associate with the given feature
    collection. The feature collection should be in the request body
    serialized as JSON.
    '''
    logger.info('got %r', cid)
    tfidf = tfidf or None
    content_type = request.headers.get('content-type', '')
    if not content_type.startswith('text/html'):
        logger.critical('content-type=%r', content_type)
        response.status = 415
        return {'error': {
            'code': 0,
            'message': 'content_type=%r and should be text/html'
                       % content_type}}
    # cid carries a percent-encoded URL after the first '|'.
    url = urllib.unquote(cid.split('|', 1)[1])
    body = request.body.read()
    if len(body) == 0:
        response.status = 420
        return {'error': {'code': 1, 'message': 'empty body'}}
    logger.info('parsing %d bytes for url: %r', len(body), url)
    fc = etl.create_fc_from_html(url, body, tfidf=tfidf)
    if fc is None:
        logger.critical('failed to get FC using %d bytes from %r',
                        len(body), url)
        response.status = 506
        return {'error': {
            'code': 2,
            'message': 'FC not generated for that content'}}
    highlights = dict()
    for feature_name, pretty_name in feature_pretty_names:
        if feature_name not in fc:
            continue
        total = sum(fc[feature_name].values())
        if total == 0:
            # guard: an empty/zeroed counter would raise ZeroDivisionError
            continue
        # BUG FIX: force true division.  Under Python 2, integer counts
        # made ``count / total`` integer division, zeroing every weight.
        norm = float(total)
        highlights[pretty_name] = [
            (phrase, count / norm, [], [])
            for phrase, count in sorted(fc[feature_name].items(),
                                        key=itemgetter(1), reverse=True)]
        logger.info('%r and %d keys', feature_name,
                    len(highlights[pretty_name]))
    return {'highlights': highlights}
def create_highlights(data, tfidf):
    '''compute highlights for `data`, store it in the store using
    `kvlclient`, and return a `highlights` response payload.

    ``data`` must provide ``'content-location'`` (the source URL) and
    ``'body'`` (the raw HTML).  On failure to build a feature
    collection, returns an error payload with ``state: ERROR``.

    NOTE(review): as written, the success path falls through and
    returns None -- presumably the rest of this function lives
    elsewhere; confirm callers handle this.
    '''
    try:
        fc = etl.create_fc_from_html(data['content-location'],
                                     data['body'],
                                     tfidf=tfidf, encoding=None)
    except Exception:
        logger.critical('failed to build FC', exc_info=True)
        return {
            'state': ERROR,
            'error': {
                'code': 7,
                # BUG FIX: format_exc takes a traceback *limit*, not an
                # exception object; passing exc only worked by accident
                # under Python 2's arbitrary cross-type comparisons.
                'message': 'internal error: %s' % traceback.format_exc(),
            }
        }
def callback(si, link):
    '''Ingest one fetched page: build an FC for `link` and store it.

    Does nothing when the fetch produced no stream item.  Existing FCs
    that carry user-entered subtopic data are left untouched.
    '''
    if si is None:
        return
    content_id = etl.interface.mk_content_id(
        hashlib.md5(str(link)).hexdigest())
    content_ids.append(content_id)
    # hack alert!
    # FCs double as storage for user-entered subtopic text, so an FC
    # already holding such data must never be clobbered by ingest.
    # Heuristic: any key prefixed 'subtopic|' marks user data.
    existing = config.store.get(content_id)
    if existing is not None and \
            any(k.startswith('subtopic|') for k in existing.iterkeys()):
        logger.info('skipping ingest for %r (abs url: %r) because '
                    'an FC with user data already exists.',
                    content_id, link)
        return
    extra_features = {u'keywords': StringCounter(keywords)}
    try:
        fc = etl.create_fc_from_html(
            link, si.body.raw,
            encoding=si.body.encoding or 'utf-8',
            tfidf=tfidf,
            other_features=extra_features,
        )
        if not fc:
            logger.info('failed to get an FC, moving on')
            return
        logger.info('created FC for %r (abs url: %r)', content_id, link)
        config.store.put([(content_id, fc)])
    except Exception:
        # Best-effort ingest: log and continue with the next link.
        logger.info('trapped ingest failure on %r (abs url: %r)',
                    content_id, link, exc_info=True)