# Imports assumed by this excerpt (the module header and enclosing class are
# not shown here):
import json

from elasticsearch.helpers import scan
from elasticsearch_dsl import Q, Search

import lib


def tag_by_email(self, emails, breached):
    '''Build bulk-update actions marking submissions whose contact email
    appears in the given list as breached (or not). Returns the actions
    without indexing them.'''
    docs = []
    s = (Search(using=self.es)
         .filter(Q({'terms': {'contact_email.keyword': emails}}))
         .source(['id_submission']))
    print('%s emails breached=%s' % (len(emails), breached))
    for hit in s.scan():
        docs.append(
            lib.bulk_update_doc(hit['id_submission'], {'breached': breached}))
        if not len(docs) % 500:
            print('\tfetched %s' % len(docs))
    print('\t%s matches' % len(docs))
    return docs
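# Hedged sketch of the lib helpers used throughout this module (assumption --
# the real implementations live in lib and are not shown here). From the call
# sites, bulk_update_doc appears to build a partial-update action for the
# Elasticsearch bulk helper, nesting fields under the 'analysis' object that
# the queries below filter on, and bulk_update appears to submit the actions
# and return a success count:
#
#   def bulk_update_doc(doc_id, fields):
#       return {
#           '_op_type': 'update',
#           '_index': 'fcc-comments',
#           '_id': doc_id,
#           'doc': {'analysis': fields},
#       }
#
#   def bulk_update(es, docs):
#       success, _ = helpers.bulk(es, docs)
#       return success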
def tag_by_phrase(self, tag_query, source):
    '''Tag up to self.limit documents matching tag_query with the given
    source label, via bulk partial updates.'''
    print('query=%s source=%s' % (json.dumps(tag_query), source))
    resp = self.es.search(index='fcc-comments', body=tag_query, size=0)
    total = resp['hits']['total']
    print('tagging %s / %s matches' % (self.limit, total))
    docs = []
    for doc in scan(self.es, index='fcc-comments', query=tag_query,
                    size=1000):
        docs.append(lib.bulk_update_doc(doc['_id'], {'source': source}))
        if not len(docs) % 1000:
            print('\tfetched %s\n%s\t%s' %
                  (len(docs), doc['_id'], doc['_source']['text_data'][:400]))
        if len(docs) >= self.limit:
            break
    print('indexing %s' % (len(docs)))
    tagged = lib.bulk_update(self.es, docs)
    print('tagged %s / %s matches' % (tagged, total))
    return tagged
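# Example tag_query in the shape tag_by_phrase expects: a standard
# Elasticsearch body usable both for the count search and for scan. This is
# an illustrative sketch (the query and source label are made up, not taken
# from the project):
#
#   tag_query = {
#       '_source': 'text_data',
#       'query': {
#           'match_phrase': {
#               'text_data': {'query': 'repeal net neutrality', 'slop': 3}
#           }
#       }
#   }
#   self.tag_by_phrase(tag_query, source='es_terms_negative')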
def tag_mlt(self, text, src_doc_id, min_cluster_size=100):
    # set up query: more like this, with aggregations
    query = {}
    query.update(self.mlt_query)
    query.update(self.mlt_aggs)
    query['query']['bool']['must']['more_like_this']['like'][0][
        '_id'] = src_doc_id
    # cap query terms at 40 or the seed text length, whichever is smaller
    terms = min([40, len(text.split(' '))])
    query['query']['bool']['must']['more_like_this'][
        'max_query_terms'] = terms
    print('mlt query=%s' % json.dumps(query))

    # get one page of more-like-this results rather than all of them, because
    # we first want to check for existing clusters
    resp = self.es.search(index='fcc-comments', body=query, size=10)
    mlt_matches = resp['hits']['total']
    # only need aggs for the first query
    del query['aggs']
    print('%s more like this' % mlt_matches)
    print('aggregations=%s' % resp['aggregations'])

    # if matching documents have a source other than unknown, use that
    src = 'unknown'
    source_buckets = resp['aggregations']['source']['buckets']
    if source_buckets:
        src = source_buckets[0]['key']
        print('using source %s' % src)

    # mlt aggregation result is a count of documents per cluster
    mlt_buckets = resp['aggregations']['mlt']['buckets']
    # if the MLT query returns an existing cluster, join it if either:
    # - it's bigger than the current MLT result
    # - it's bigger than the minimum interesting cluster size
    join_cluster = False
    cluster_size = mlt_buckets[0]['doc_count'] if mlt_buckets else 0
    if cluster_size:
        join_cluster = cluster_size >= min([min_cluster_size, mlt_matches])
    if join_cluster:
        # add these results to the existing cluster
        src_doc_id = mlt_buckets[0]['key']
        print('found existing cluster from %s (%s docs)' %
              (src_doc_id, cluster_size))
        # update query to exclude docs already in this cluster
        query['query']['bool']['filter']['bool']['must_not'] = [{
            'term': {
                'analysis.%s.src_doc_id' % TAG: src_doc_id
            }
        }]
        if src != 'unknown':
            # no need to re-tag docs that already have this source
            query['query']['bool']['filter']['bool']['must_not'].append(
                {'term': {
                    'analysis.source': src
                }})
        print('updated mlt query=%s' % json.dumps(query))
        # query again to get the new size
        resp = self.es.search(index='fcc-comments', body=query, size=0)
        mlt_matches = resp['hits']['total']

    # fetch matching untagged docs
    if not mlt_matches:
        return 0
    print('fetching %s' % mlt_matches)
    docs = []
    for doc in scan(self.es, index='fcc-comments', query=query, size=1000):
        mlt = {}
        mlt[TAG] = {
            'src_doc_id': src_doc_id,
            'matches': mlt_matches,
        }
        if src != 'unknown':
            mlt[TAG]['source'] = src
        docs.append(lib.bulk_update_doc(doc['_id'], mlt))
        if not len(docs) % 1000:
            print('\tfetched %s / %s' % (len(docs), mlt_matches))

    # update with analysis.more_like_this
    if not docs:
        return 0
    print('indexing %s' % (len(docs)))
    return lib.bulk_update(self.es, docs)
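# Illustrative shapes for self.mlt_query and self.mlt_aggs, inferred from the
# key paths tag_mlt mutates above (assumption -- the real definitions live
# elsewhere in the class). TAG is a constant, defined outside this excerpt,
# naming the analysis sub-field these updates write to:
#
#   self.mlt_query = {
#       'query': {
#           'bool': {
#               'must': {
#                   'more_like_this': {
#                       'fields': ['text_data'],
#                       'like': [{'_index': 'fcc-comments', '_id': None}],
#                       'max_query_terms': 40,
#                   }
#               },
#               'filter': {'bool': {'must_not': []}},
#           }
#       }
#   }
#   self.mlt_aggs = {
#       'aggs': {
#           'source': {'terms': {'field': 'analysis.source'}},
#           'mlt': {'terms': {'field': 'analysis.%s.src_doc_id' % TAG}},
#       }
#   }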
def tag_positive_terms(self):
    '''
    get documents without a sentiment tag that match a phrase with slop:
        - protect|support|keep|need net neutrality
        - let the new neutrality stand
    for a broader result set than the regex in analyze
    '''
    query = {
        "_source": "text_data",
        "query": {
            "bool": {
                "filter": {
                    "bool": {
                        "should": [],
                        "must": [{
                            "term": {
                                "analysis.source": "unknown"
                            }
                        }],
                        "must_not": [{
                            "exists": {
                                "field": "analysis.titleii"
                            }
                        }, {
                            "exists": {
                                "field": "analysis.sentiment_manual"
                            }
                        }, {
                            "exists": {
                                "field": "analysis.sentiment_sig_terms_ordered"
                            }
                        }]
                    }
                }
            }
        }
    }
    # note: the original list was missing commas after two entries, which
    # silently concatenated adjacent string literals; fixed here
    phrases = [
        'essential net neutrality',
        'keep net neutrality',
        'maintain net neutrality',
        'need net neutrality',
        'preserve net neutrality',
        'protect net neutrality',
        'save net neutrality',
        'support net neutrality',
        'support title 2',
        'support title II',
        'let the new neutrality stand',
        'net neutrality rules are extremely important',
        'net neutrality is important',
    ]
    for phrase in phrases:
        subq = {
            "match_phrase": {
                "text_data": {
                    "query": phrase,
                    "slop": 3
                }
            }
        }
        query['query']['bool']['filter']['bool']['should'].append(subq)
    print(json.dumps(query))
    resp = self.es.search(index='fcc-comments', body=query, size=0)
    total = resp['hits']['total']
    print('tagging %s / %s matches' % (self.limit, total))
    docs = []
    for doc in scan(self.es, index='fcc-comments', query=query, size=1000):
        docs.append(
            lib.bulk_update_doc(doc['_id'], {'source': 'es_terms_positive'}))
        if not len(docs) % 1000:
            print('\tfetched %s\n%s\t%s' %
                  (len(docs), doc['_id'], doc['_source']['text_data'][:400]))
        if len(docs) >= self.limit:
            break
    print('indexing %s' % (len(docs)))
    tagged = lib.bulk_update(self.es, docs)
    print('tagged %s / %s matches' % (tagged, total))
    return tagged
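# Hypothetical end-to-end usage (class name, constructor, and variables here
# are illustrative; the enclosing class is not shown in this excerpt). Note
# that tag_by_email returns actions without indexing them, so the caller is
# responsible for the bulk update:
#
#   tagger = Tagger(es=Elasticsearch(), limit=10000)
#   tagger.tag_positive_terms()              # phrase-based sentiment tagging
#   tagger.tag_mlt(seed_text, seed_doc_id)   # grow a cluster from one doc
#   lib.bulk_update(tagger.es,
#                   tagger.tag_by_email(breached_emails, breached=True))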