def v1_fc_put(request, response, store, kvlclient, tfidf, cid):
    '''Store a single feature collection.

    The route for this endpoint is:
    ``PUT /dossier/v1/feature-collections/<content_id>``.

    ``content_id`` is the id to associate with the given feature
    collection. The feature collection should be in the request body
    serialized as JSON.

    Alternatively, if the request's ``Content-type`` is ``text/html``,
    then a feature collection is generated from the HTML. The generated
    feature collection is then returned as a JSON payload.

    This endpoint returns status ``201`` upon successful storage.
    An existing feature collection with id ``content_id`` is
    overwritten.
    '''
    tfidf = tfidf or None
    if request.headers.get('content-type', '').startswith('text/html'):
        url = urllib.unquote(cid.split('|', 1)[1])
        fc = etl.create_fc_from_html(url, request.body.read(), tfidf=tfidf)
        logger.info('created FC for %r', cid)
        store.put([(cid, fc)])
        return fc_to_json(fc)
    else:
        fc = FeatureCollection.from_dict(json.load(request.body))
        keywords = set()
        for subid in fc:
            if subid.startswith('subtopic'):
                ty = subtopic_type(subid)
                if ty in ('text', 'manual'):
                    # get the user selected string
                    data = typed_subtopic_data(fc, subid)
                    map(keywords.add, cleanse(data).split())
                    keywords.add(cleanse(data))

        # Also record the (cleansed) names of the parent folders and
        # subfolders that contain this item as keywords.
        folders = Folders(kvlclient)
        for fid, sid in folders.parent_subfolders(cid):
            if not isinstance(fid, unicode):
                fid = fid.decode('utf8')
            if not isinstance(sid, unicode):
                sid = sid.decode('utf8')
            keywords.add(cleanse(fid))
            keywords.add(cleanse(sid))

        fc[u'keywords'] = StringCounter(keywords)

        store.put([(cid, fc)])
        response.status = 201
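
# --- Hedged usage sketch (not part of the original module) ----------------
# The helper below only illustrates how a client might exercise the
# ``PUT /dossier/v1/feature-collections/<content_id>`` route served by
# ``v1_fc_put`` above. The ``requests`` dependency, the ``base_url``
# parameter, and the helper name itself are assumptions made for this
# sketch; they are not defined or required by this module.
def _example_put_fc_over_http(base_url, cid, fc_dict):
    '''Send ``fc_dict`` as a JSON feature collection for ``cid``.

    Returns the HTTP status code, which should be 201 on successful
    storage per the docstring of ``v1_fc_put``.
    '''
    import requests  # assumed client-side dependency, used only here
    resp = requests.put(
        '%s/dossier/v1/feature-collections/%s' % (base_url, cid),
        data=json.dumps(fc_dict),
        headers={'Content-Type': 'application/json'})
    return resp.status_code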
def process(self, fc, context=None):
    '''Extract named entities from the configured ``text_source``
    feature and add one ``StringCounter`` feature per entity type
    (e.g. ``PERSON``, ``ORGANIZATION``) to the feature collection.
    '''
    text_source = self.config.get('text_source')
    if text_source and text_source in fc:
        text = fc[text_source]
    else:
        return fc

    names = defaultdict(StringCounter)
    for sent in nltk.sent_tokenize(text):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
            # Named entities come back as labeled subtrees; plain
            # (token, tag) tuples have no ``label`` attribute.
            if hasattr(chunk, 'label'):
                label = chunk.label()
                name = ' '.join(c[0] for c in chunk.leaves())
                if not isinstance(name, unicode):
                    name = unicode(name, 'utf-8')
                name = cleanse(name)
                names[label][name] += 1

    for entity_type, name_counts in names.items():
        fc[entity_type] = name_counts
    return fc
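
# --- Hedged sketch (not part of the original module) ----------------------
# A standalone illustration of the NLTK pipeline that ``process`` relies on:
# sentence tokenization, POS tagging, and named-entity chunking. It mirrors
# the inner loop above so the shape of ``chunk.label()`` / ``chunk.leaves()``
# is easier to see. It assumes the NLTK data packages (punkt, a POS tagger
# model, and the NE chunker) are installed; this module does not manage
# those itself.
def _example_named_entities(text):
    '''Return {entity_type: StringCounter of entity names} for ``text``.

    For example, 'Barack Obama visited Paris.' would typically yield
    something like {'PERSON': {'Barack Obama': 1}, 'GPE': {'Paris': 1}},
    though the exact labels depend on the NLTK models in use.
    '''
    names = defaultdict(StringCounter)
    for sent in nltk.sent_tokenize(text):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
            if hasattr(chunk, 'label'):
                name = ' '.join(c[0] for c in chunk.leaves())
                names[chunk.label()][name] += 1
    return dict(names)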
def traverse_extract_fetch(config, wukey, stop_after_extraction=False):
    '''Given a config and a `wukey=cbor.dumps((folder_name,subfolder_name))`,
    traverse the folders to generate queries, issue them to Google,
    fetch the results, and ingest them.
    '''
    config.kvlclient.setup_namespace({'openquery': (str,)})
    try:
        data = list(config.kvlclient.get('openquery', (wukey,)))
        if data:
            if data[0][1]:
                logger.info('found existing query results: %r', data)
                return
            else:
                config.kvlclient.delete('openquery', (wukey,))
    except:
        logger.error('failed to get data from existing table', exc_info=True)

    fid, sid = cbor.loads(wukey)
    tfidf = config.tfidf
    folders = Folders(config.kvlclient)
    fetcher = Fetcher()

    ## To disable the keyword extractor model, uncomment the next three
    ## lines (`get_subfolder_queries`) and comment out the following two
    ## lines (`extract_keyword_queries`).
    #keyword_feature_keys = []
    #queries = get_subfolder_queries(
    #    config.store, config.label_store, folders, fid, sid)
    queries, keyword_feature_keys, has_observations = extract_keyword_queries(
        config.store, config.label_store, folders, fid, sid)
    logger.info('Model found %d queries: %r', len(queries), queries)

    if stop_after_extraction:
        return

    keywords = set()
    for key in keyword_feature_keys:
        ckey = cleanse(key.decode('utf8'))
        keywords.add(ckey)
        for part in ckey.split():
            keywords.add(part)

    #link2queries = defaultdict(set)
    links = set()
    logger.info('searching google for: %r', queries)
    for q in queries:
        for result in config.google.web_search_with_paging(q, limit=10):
            links.add(result['link'])
            #map(link2queries[result['link']].add,
            #    cleanse(q.decode('utf8')).split())
            logger.info('discovered %r', result['link'])
    result = None

    #logger.info('got %d URLs from %d queries', len(link2queries), len(queries))
    logger.info('got %d URLs from %d queries', len(links), len(queries))

    # content_ids gets modified within the 'callback' closure
    content_ids = []

    #for link, queries in link2queries.items():
    def callback(si, link):
        if si is None:
            return
        cid_url = hashlib.md5(str(link)).hexdigest()
        cid = etl.interface.mk_content_id(cid_url)
        content_ids.append(cid)

        # hack alert!
        # We currently use FCs to store subtopic text data, which
        # means we cannot overwrite existing FCs with reckless
        # abandon. So we adopt a heuristic: check if an FC already
        # exists, and if it does, check if it is being used to store
        # user data. If so, don't overwrite it and move on.
        fc = config.store.get(cid)
        if fc is not None and any(k.startswith('subtopic|')
                                  for k in fc.iterkeys()):
            logger.info('skipping ingest for %r (abs url: %r) because '
                        'an FC with user data already exists.', cid, link)
            return

        other_features = {
            u'keywords': StringCounter(keywords),  #list(queries)),
        }
        try:
            fc = etl.create_fc_from_html(
                link, si.body.raw,
                encoding=si.body.encoding or 'utf-8',
                tfidf=tfidf,
                other_features=other_features,
            )
            if not fc:
                logger.info('failed to get an FC, moving on')
                return
            logger.info('created FC for %r (abs url: %r)', cid, link)
            config.store.put([(cid, fc)])
        except Exception:
            logger.info('trapped ingest failure on %r (abs url: %r)',
                        cid, link, exc_info=True)

    logger.info('FETCHING using ASYNC')
    fetcher.get_async(islice(links, None), callback)

    data = json.dumps({'content_ids': content_ids})
    logger.info('saving %d content_ids in %d bytes on wukey %r',
                len(content_ids), len(data), wukey)
    config.kvlclient.put('openquery', ((wukey,), data))
    logger.info('done saving for %r', wukey)
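
# --- Hedged sketch (not part of the original module) ----------------------
# Shows how a work-unit key (``wukey``) for ``traverse_extract_fetch`` is
# expected to be built: a CBOR-encoded ``(folder_name, subfolder_name)``
# pair, per the docstring above. The folder names here are made up for
# illustration, and ``config`` is assumed to be the same object the
# surrounding worker code normally supplies.
def _example_extract_only(config):
    '''Run just the query-extraction phase for a hypothetical subfolder.

    Passing ``stop_after_extraction=True`` avoids issuing Google searches
    or fetching pages, which is handy for inspecting (via the log output)
    what the keyword model produces.
    '''
    wukey = cbor.dumps(('my_folder', 'my_subfolder'))  # hypothetical names
    traverse_extract_fetch(config, wukey, stop_after_extraction=True)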
            logger.error(
                'got other than list of length at least two from service: '
                '%r --> %r', url, results)
            continue
        query_ack = results[0]
        query_suggestions = results[1]
        if not isinstance(query_suggestions, list):
            logger.error('got other than list of query suggestions: %r --> %r',
                         url, results)
            continue
        suggestions += query_suggestions
        logger.info('%d suggestions from %r', len(query_suggestions), url)

    logger.info('found %d suggestions for %r', len(suggestions), query)
    cleansed_query = cleanse(query)
    if cleansed_query not in suggestions:
        suggestions.insert(0, query)
    return [query, suggestions]  #list(set(suggestions))]


feature_pretty_names = [
    ('ORGANIZATION', 'Organizations'),
    ('PERSON', 'Persons'),
    ('FACILITY', 'Facilities'),
    ('GPE', 'Geo-political Entities'),
    ('LOCATION', 'Locations'),
    ('skype', 'Skype Handles'),
    ('phone', 'Phone Numbers'),
    ('email', 'Email Addresses'),
    ('bowNP_unnorm', 'Noun Phrases'),