Example #1
    def tag_by_email(self, emails, breached):
        # exact-match the email list against the keyword sub-field,
        # fetching only the submission id for each hit
        docs = []
        s = (Search(using=self.es)
             .filter(Q({'terms': {'contact_email.keyword': emails}}))
             .source(['id_submission']))
        print('%s emails breached=%s' % (len(emails), breached))
        for hit in s.scan():
            docs.append(
                lib.bulk_update_doc(hit['id_submission'],
                                    {'breached': breached}))
            if not len(docs) % 500:
                print('\tfetched %s' % len(docs))
        print('\t%s matches' % len(docs))
        return docs
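
These snippets assume import json, from elasticsearch.helpers import scan, and from elasticsearch_dsl import Search, Q, plus a lib module that is not shown. A minimal sketch of what the two lib helpers might look like, assuming actions in the elasticsearch.helpers.bulk format; the index name and the nesting of fields under the analysis object are assumptions inferred from the analysis.* paths the queries filter on:

from elasticsearch.helpers import bulk

def bulk_update_doc(doc_id, fields):
    # Hypothetical: build a partial-update action, nesting the new fields
    # under the analysis object that the queries in these examples filter on.
    # On Elasticsearch 6.x a '_type' key would also be required.
    return {
        '_op_type': 'update',
        '_index': 'fcc-comments',
        '_id': doc_id,
        'doc': {'analysis': fields},
    }

def bulk_update(es, docs):
    # Send the accumulated update actions in one bulk request;
    # return the number of successful updates.
    success, _ = bulk(es, docs)
    return success
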
Example #2
    def tag_by_phrase(self, tag_query, source):
        print('query=%s source=%s' % (json.dumps(tag_query), source))
        # count the matches first so progress can be reported against the total
        resp = self.es.search(index='fcc-comments', body=tag_query, size=0)
        total = resp['hits']['total']
        print('tagging %s / %s matches' % (self.limit, total))
        docs = []
        # stream hits with scan() and queue a partial update for each,
        # stopping once self.limit docs have been collected
        for doc in scan(self.es, index='fcc-comments', query=tag_query, size=1000):
            docs.append(lib.bulk_update_doc(doc['_id'], {'source': source}))
            if not len(docs) % 1000:
                print('\tfetched %s\n%s\t%s' % (len(docs), doc['_id'], doc['_source']['text_data'][:400]))
            if len(docs) >= self.limit:
                break

        print('indexing %s' % len(docs))
        tagged = lib.bulk_update(self.es, docs)
        print('tagged %s / %s matches' % (tagged, total))
        return tagged
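
A hypothetical call, assuming an instance (tagger here) with self.limit set; the phrase, the variable name, and the source label are made up for illustration:

tag_query = {
    'query': {
        'bool': {
            'filter': {
                'match_phrase': {
                    'text_data': {'query': 'repeal net neutrality', 'slop': 2}
                }
            }
        }
    }
}
tagger.tag_by_phrase(tag_query, source='es_terms_negative')
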
Example #3
    def tag_mlt(self, text, src_doc_id, min_cluster_size=100):
        # set up query: more like this with aggregations
        query = {}
        query.update(self.mlt_query)
        query.update(self.mlt_aggs)
        mlt_clause = query['query']['bool']['must']['more_like_this']
        mlt_clause['like'][0]['_id'] = src_doc_id
        # cap the MLT query terms at 40 or the word count of the source text
        mlt_clause['max_query_terms'] = min(40, len(text.split(' ')))
        print('mlt query=%s' % json.dumps(query))

        # fetch just one page of results here; the aggregations tell us whether
        # an existing cluster should absorb these matches before we scan everything
        resp = self.es.search(index='fcc-comments', body=query, size=10)
        mlt_matches = resp['hits']['total']
        # only need aggs for the first query
        del query['aggs']
        print('%s more like this' % mlt_matches)
        print('aggregations=%s' % resp['aggregations'])

        # if matching documents have a source other than unknown, use that
        src = 'unknown'
        source_buckets = resp['aggregations']['source']['buckets']
        if source_buckets:
            src = source_buckets[0]['key']
            print('using source %s' % src)
        # mlt aggregation result is a count of documents per cluster
        mlt_buckets = resp['aggregations']['mlt']['buckets']

        # if the MLT returns an existing cluster, join it if either:
        # - it's bigger than the current MLT result
        # - it's bigger than the minimum interesting cluster size
        join_cluster = False
        cluster_size = mlt_buckets[0]['doc_count'] if mlt_buckets else 0
        if cluster_size:
            join_cluster = cluster_size >= min(min_cluster_size, mlt_matches)
        if join_cluster:
            # add these results to the existing cluster
            src_doc_id = mlt_buckets[0]['key']
            print('found existing cluster from %s (%s docs)' %
                  (src_doc_id, cluster_size))
            # update query to exclude docs already in this cluster
            query['query']['bool']['filter']['bool']['must_not'] = [{
                'term': {
                    'analysis.%s.src_doc_id' % TAG: src_doc_id
                }
            }]
            if src != 'unknown':
                # no need to re-tag docs that already have this source
                query['query']['bool']['filter']['bool']['must_not'].append(
                    {'term': {
                        'analysis.source': src
                    }})
            print('updated mlt query=%s' % json.dumps(query))
            # query to get new size
            resp = self.es.search(index='fcc-comments', body=query, size=0)
            mlt_matches = resp['hits']['total']

        # fetch matching untagged docs
        if not mlt_matches:
            return 0
        print('fetching %s' % mlt_matches)
        docs = []
        for doc in scan(self.es, index='fcc-comments', query=query, size=1000):
            mlt = {}
            mlt[TAG] = {
                'src_doc_id': src_doc_id,
                'matches': mlt_matches,
            }
            if src != 'unknown':
                mlt[TAG]['source'] = src
            docs.append(lib.bulk_update_doc(doc['_id'], mlt))
            if not len(docs) % 1000:
                print('\tfetched %s / %s' % (len(docs), mlt_matches))

        # update with analysis.more_like_this
        if not docs:
            return 0
        print('indexing %s' % (len(docs)))
        return lib.bulk_update(self.es, docs)
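
tag_mlt mutates two query templates held on the instance, self.mlt_query and self.mlt_aggs, neither of which is shown. A plausible shape for both, inferred from the dict paths the method writes into and the aggregation names it reads back; the field names, the exclude clause, and the sizes are assumptions:

# inferred from the "update with analysis.more_like_this" step above
TAG = 'more_like_this'

mlt_query = {
    'query': {
        'bool': {
            'must': {
                'more_like_this': {
                    'fields': ['text_data'],
                    # tag_mlt fills in the seed document id and max_query_terms
                    'like': [{'_index': 'fcc-comments', '_id': None}],
                    'max_query_terms': 40,
                }
            },
            # tag_mlt assigns into filter.bool.must_not, so this nesting must exist
            'filter': {'bool': {}},
        }
    }
}

mlt_aggs = {
    'aggs': {
        # most common existing source among the matches, ignoring 'unknown'
        'source': {'terms': {'field': 'analysis.source', 'exclude': 'unknown', 'size': 1}},
        # documents per existing cluster, keyed by the cluster's seed doc id
        'mlt': {'terms': {'field': 'analysis.%s.src_doc_id' % TAG, 'size': 1}},
    }
}
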
Example #4
    def tag_positive_terms(self):
        '''
            Tag documents that have no sentiment tag yet and that match one of
            the positive phrases below (with slop), e.g.:
              - protect|support|keep|need net neutrality
              - let the new neutrality stand
            This casts a wider net than the regex pass in analyze.
        '''
        query = {
            "_source": "text_data",
            "query": {
                "bool": {
                    "filter": {
                        "bool": {
                            "should": [],
                            "must": [{
                                "term": {
                                    "analysis.source": "unknown"
                                }
                            }],
                            "must_not": [{
                                "exists": {
                                    "field": "analysis.titleii"
                                }
                            }, {
                                "exists": {
                                    "field": "analysis.sentiment_manual"
                                }
                            }, {
                                "exists": {
                                    "field":
                                    "analysis.sentiment_sig_terms_ordered"
                                }
                            }]
                        }
                    }
                }
            }
        }

        phrases = [
            'essential net neutrality', 'keep net neutrality',
            'maintain net neutrality', 'need net neutrality',
            'preserve net neutrality',
            'protect net neutrality', 'save net neutrality',
            'support net neutrality', 'support title 2', 'support title II',
            'let the new neutrality stand',
            'net neutrality rules are extremely important',
            'net neutrality is important'
        ]
        for phrase in phrases:
            subq = {
                "match_phrase": {
                    "text_data": {
                        "query": phrase,
                        "slop": 3
                    }
                }
            }
            query['query']['bool']['filter']['bool']['should'].append(subq)
        print(json.dumps(query))
        resp = self.es.search(index='fcc-comments', body=query, size=0)
        total = resp['hits']['total']
        print('tagging %s / %s matches' % (self.limit, total))
        docs = []
        for doc in scan(self.es, index='fcc-comments', query=query, size=1000):
            docs.append(
                lib.bulk_update_doc(doc['_id'],
                                    {'source': 'es_terms_positive'}))
            if not len(docs) % 1000:
                print(
                    '\tfetched %s\n%s\t%s' %
                    (len(docs), doc['_id'], doc['_source']['text_data'][:400]))
            if len(docs) == self.limit:
                break

        print('indexing %s' % (len(docs)))
        tagged = lib.bulk_update(self.es, docs)
        print('tagged %s / %s matches' % (tagged, total))
        return tagged
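
For reference, slop allows the phrase terms to sit up to that many positions apart, so 'keep net neutrality' with slop 3 also matches text like 'keep our net neutrality rules'. A quick standalone sanity check of one phrase, assuming es is the same client and index used above:

probe = {
    'query': {
        'match_phrase': {
            'text_data': {'query': 'keep net neutrality', 'slop': 3}
        }
    }
}
resp = es.search(index='fcc-comments', body=probe, size=3)
print(resp['hits']['total'])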