def get_correspondents_for_search():
        """Facet the current search by sender and build the correspondent list.

        Reads the ``dataset``, ``sort`` and ``filters`` request arguments via
        ``Controller.get_arg``. ``filters`` is a JSON string (default ``'{}'``)
        whose optional ``searchTerm`` entry drives the fuzzy query.

        Returns whatever ``Correspondents.build_correspondents_for_search_result``
        produces from the Solr result, the dataset and the sort argument.
        """
        dataset = Controller.get_arg('dataset')
        solr_connection = get_config(dataset)['SOLR_CONNECTION']
        core_topics_name = solr_connection['Core-Topics']
        sort = Controller.get_arg('sort', arg_type=str, required=False)

        raw_filters = Controller.get_arg(
            'filters', arg_type=str, default='{}', required=False)
        filters = json.loads(raw_filters)
        fq = build_filter_query(filters, core_type=core_topics_name)
        search_term = filters.get('searchTerm', '')

        # Only facet counts are requested (limit=0); the facet groups the
        # matching mails by the sender's identifying name.
        sender_facet_params = {
            'q': build_fuzzy_solr_query(search_term),
            'facet': 'true',
            'facet.mincount': '1',
            'facet.limit': str(FACET_LIMIT),
            'facet.field': 'header.sender.identifying_name'
        }
        solr_result = QueryBuilder(
            dataset=dataset,
            query=sender_facet_params,
            fq=fq,
            limit=0
        ).send()

        return Correspondents.build_correspondents_for_search_result(
            solr_result, dataset, sort)
    def get_classes_for_correspondent():
        """Return the share of each top subcategory among a correspondent's mails.

        Reads ``dataset``, ``identifying_name`` and ``filters`` from the
        request. The identifying name is passed through ``re.escape`` before
        it is inserted into the Solr query string.

        Returns a list of ``{'key', 'num', 'share'}`` dicts, one per Solr
        group, where ``share`` is the group's ``numFound`` divided by the
        overall number of matches, rounded to four decimals. Returns ``[]``
        when there are no matches.
        """
        dataset = Controller.get_arg('dataset')
        core_topics_name = get_config(dataset)['SOLR_CONNECTION']['Core-Topics']
        identifying_name = re.escape(Controller.get_arg('identifying_name'))

        filters = json.loads(
            Controller.get_arg('filters', arg_type=str, default='{}', required=False))
        fq = build_filter_query(filters, False, core_type=core_topics_name)

        solr_result = QueryBuilder(
            dataset=dataset,
            query={
                'q': 'header.sender.identifying_name:' + identifying_name
                     + ' AND ' + build_fuzzy_solr_query(filters.get('searchTerm', '')),
                'group': 'true',
                'group.field': 'category.top_subcategory'
            },
            fq=fq,
            fl='groupValue'
        ).send()

        grouping = solr_result['grouped']['category.top_subcategory']
        groups = grouping['groups']
        total_matches = grouping['matches']

        # Guard against dividing by zero below.
        if total_matches == 0:
            return []

        classes = []
        for group in groups:
            key = group['groupValue']
            found = group['doclist']['numFound']
            classes.append({
                'key': key,
                'num': found,
                'share': round(group['doclist']['numFound'] / total_matches, 4)
            })
        return classes
Example #3
0
    def get_date_facet_result(dataset, filter_query, term,
                              identifying_name_filter, start_date, end_date,
                              bin_size, category):
        """Run a date range facet for one category and parse it into date bins.

        ``bin_size`` selects the facet gap: ``'day'`` gives ``+1DAY``,
        ``'week'`` gives ``+7DAYS`` and any other value gives ``+1MONTH``.
        The range facet runs over ``header.date`` from ``start_date`` to
        ``end_date``.

        Returns the result of ``Dates.build_dates_for_search_result`` for the
        Solr response and ``bin_size``.
        """
        # Monthly bins are the fallback for every bin size that is not
        # explicitly recognised.
        facet_gap = '+1MONTH'
        if bin_size == 'day':
            facet_gap = '+1DAY'
        elif bin_size == 'week':
            facet_gap = '+7DAYS'

        range_facet_params = {
            'q': build_fuzzy_solr_query(term),
            'facet': 'true',
            'facet.range': 'header.date',
            'facet.range.start': start_date,
            'facet.range.end': end_date,
            'facet.range.gap': facet_gap,
            # NOTE(review): this 'fq' entry travels inside the query dict in
            # addition to the fq= keyword below; how QueryBuilder combines
            # the two is not visible here.
            'fq': '+category.top_category:' + category + '+' + identifying_name_filter
        }
        solr_result = QueryBuilder(
            dataset=dataset,
            query=range_facet_params,
            limit=0,
            fq=filter_query
        ).send()

        return Dates.build_dates_for_search_result(solr_result, bin_size)
Example #4
0
    def get_topics_for_correspondent():
        """Return aggregated and per-mail topic distributions for a correspondent.

        Reads ``dataset``, ``identifying_name`` and ``filters`` from the
        request. All filter clauses are prefixed with a Solr join
        (``doc_id`` -> ``doc_id`` from the configured ``Core``).

        Returns a dict with two keys:
            ``'main'``: ``{'topics': ...}`` -- the aggregated distribution,
                completed and ranked against all topics.
            ``'singles'``: the per-mail distributions, each completed, ranked
                and passed through ``Topics.remove_words``.
        """
        dataset = Controller.get_arg('dataset')
        solr_connection = get_config(dataset)['SOLR_CONNECTION']
        core_name = solr_connection['Core']
        core_topics_name = get_config(dataset)['SOLR_CONNECTION']['Core-Topics']
        identifying_name = re.escape(Controller.get_arg('identifying_name'))

        join_string = '{!join from=doc_id fromIndex=' + core_name + ' to=doc_id}'

        filter_object = json.loads(
            Controller.get_arg('filters', arg_type=str, default='{}', required=False))

        # The filter clauses plus the sender/search-term clause together
        # form the join query that selects the correspondent's mails.
        join_query = build_filter_query(
            filter_object, False, True, join_string, core_type=core_topics_name)
        join_query.append(
            join_string + 'header.sender.identifying_name:' + identifying_name
            + ' AND ' + build_fuzzy_solr_query(filter_object.get('searchTerm', '')))

        main_topics = Topics.get_aggregated_distribution(
            dataset, core_topics_name, identifying_name, filter_object, join_query)
        all_topics = Topics.get_all_topics(dataset)
        single_distributions = Topics.get_distributions_for_mails(dataset, join_query)

        for single in single_distributions:
            ranked_topics = Topics.complete_distribution_and_add_ranks(
                single['topics'], all_topics)
            single['topics'] = Topics.remove_words(ranked_topics)

        return {
            'main': {
                'topics': Topics.complete_distribution_and_add_ranks(
                    main_topics, all_topics)
            },
            'singles': single_distributions
        }
Example #5
0
    def search_correspondences_for_term(dataset, filter_string):
        """Collect, per sender, the recipients found in mails matching a search.

        ``filter_string`` is a JSON string; its optional ``searchTerm`` entry
        is used for the fuzzy query and the whole object for the filter query.

        Returns a list of ``{'source': <sender>, 'targets': [<recipient>, ...]}``
        dicts with duplicate recipients removed, or ``[]`` when the facet
        count is zero.
        """
        filter_object = json.loads(filter_string)
        core_topics_name = get_config(dataset)['SOLR_CONNECTION']['Core-Topics']
        filter_query = build_filter_query(filter_object, core_type=core_topics_name)
        term = filter_object.get('searchTerm', '')

        # Nested terms facet: senders on the outside, their recipients inside.
        recipients_facet = {
            'type': 'terms',
            'field': 'header.recipients',
            'limit': FACET_LIMIT,
            'refine': True
        }
        senders_facet = {
            'type': 'terms',
            'field': 'header.sender.identifying_name',
            'facet': {'recipients': recipients_facet},
            'limit': SOLR_MAX_INT,
            'refine': True
        }

        solr_result = QueryBuilder(
            dataset=dataset,
            query={
                'q': build_fuzzy_solr_query(term),
                'json.facet': json.dumps({'senders': senders_facet})
            },
            limit=0,
            fq=filter_query
        ).send()

        facets = solr_result['facets']
        if facets['count'] == 0:
            return []

        correspondences = []
        for sender_bucket in facets['senders']['buckets']:
            source = sender_bucket.get('val', '')
            # Each recipient facet value is parsed with literal_eval and read
            # as a dict; only its 'identifying_name' entry is kept.
            unique_targets = {
                literal_eval(recipient.get('val', '')).get('identifying_name', '')
                for recipient in sender_bucket['recipients']['buckets']
            }
            correspondences.append({
                'source': source,
                'targets': list(unique_targets)
            })

        return correspondences
Example #6
0
    def get_keyphrases_for_correspondent():
        """Return the keyphrases faceted over a correspondent's matching mails.

        Reads ``dataset``, ``identifying_name`` and ``filters`` from the
        request and facets the ``keyphrases`` field (``facet.mincount`` 1)
        over the mails sent by that correspondent.

        Returns the raw, empty facet result when there are no keyphrases,
        otherwise the output of ``Keyphrases.parse_keyphrases``.
        """
        dataset = Controller.get_arg('dataset')
        core_topics_name = get_config(dataset)['SOLR_CONNECTION']['Core-Topics']
        identifying_name = re.escape(Controller.get_arg('identifying_name'))

        filter_object = json.loads(
            Controller.get_arg('filters', arg_type=str, default='{}', required=False))
        filter_query = build_filter_query(filter_object, core_type=core_topics_name)

        keyphrase_facet_params = {
            'q': 'header.sender.identifying_name:' + identifying_name
                 + ' AND ' + build_fuzzy_solr_query(filter_object.get('searchTerm', '')),
            'facet': 'true',
            'facet.field': 'keyphrases',
            'facet.mincount': '1'
        }
        solr_result = QueryBuilder(
            dataset=dataset,
            query=keyphrase_facet_params,
            fq=filter_query,
            limit=0,
        ).send()

        facet_fields = parse_solr_result(solr_result)['facet_counts']['facet_fields']
        keyphrase_facets = facet_fields['keyphrases']

        # Nothing to parse: hand the empty facet result back unchanged.
        if len(keyphrase_facets) == 0:
            return keyphrase_facets

        return Keyphrases.parse_keyphrases(keyphrase_facets)
Example #7
0
    def get_aggregated_distribution(dataset, core_topics_name,
                                    identifying_name, filter_object,
                                    join_query):
        """Aggregate the topic distribution over the mails selected by ``join_query``.

        Up to two Solr requests are sent:

        1. A JSON-facet query (``q=*:*``, ``core_type='Core-Topics'``,
           ``fq=join_query``) that buckets by ``topic_id`` and sums
           ``topic_conf`` per bucket.
        2. A count query for the mails of ``identifying_name`` that match the
           search term and filters in ``filter_object``. Its ``numFound`` is
           handed to ``Topics.parse_topic_closure_wrapper``.

        The second request is skipped when the facet result is empty, because
        the function returns ``[]`` in that case regardless of the count.

        Args:
            dataset: dataset identifier passed on to ``QueryBuilder``.
            core_topics_name: core type used when building the filter query.
            identifying_name: sender name, inserted verbatim into the count
                query string.
            filter_object: parsed filter dict; ``searchTerm`` is optional.
            join_query: filter query list for the topic facet request.

        Returns:
            A list with one parsed entry per topic bucket, or ``[]`` when the
            facet count is zero or no mail was counted.
        """
        facet_query = {
            'facet_topic_id': {
                'type': 'terms',
                'field': 'topic_id',
                'facet': {
                    'sum_of_confs_for_topic': 'sum(topic_conf)',
                    'facet_terms': {
                        'type': 'terms',
                        'field': 'terms',
                        'limit': 1
                    }
                },
                'sort': 'index asc',
                'limit': FACET_LIMIT,
                'refine': True
            }
        }

        # Get all topics that the pipeline returned with confidences for the
        # correspondent.
        solr_result_topic_distribution = QueryBuilder(
            dataset=dataset,
            query={
                'q': '*:*',
                'json.facet': json.dumps(facet_query)
            },
            fq=join_query,
            limit=0,
            core_type='Core-Topics').send()

        # Without any topic facets the result is empty no matter how many
        # mails exist, so return before issuing the count request.
        if solr_result_topic_distribution['facets']['count'] == 0:
            return []

        filter_query = build_filter_query(filter_object,
                                          False,
                                          core_type=core_topics_name)

        solr_result_email_count = QueryBuilder(
            dataset=dataset,
            query='header.sender.identifying_name:' + identifying_name +
            ' AND ' +
            build_fuzzy_solr_query(filter_object.get('searchTerm', '')),
            fq=filter_query,
            limit=0).send()
        total_email_count = solr_result_email_count['response']['numFound']

        # A zero mail count yields an empty distribution; the count is only
        # ever passed to the parser when it is non-zero.
        if not total_email_count:
            return []

        parse_topic = Topics.parse_topic_closure_wrapper(total_email_count)
        buckets = solr_result_topic_distribution['facets']['facet_topic_id']['buckets']
        return [parse_topic(bucket) for bucket in buckets]
    def search():
        """Run the main mail search and attach topic distributions to the hits.

        Reads ``dataset``, ``limit``, ``offset``, ``sort`` and ``filters``
        from the request. ``filters`` is a JSON string (default ``'{}'``)
        whose optional ``searchTerm`` entry drives the fuzzy query.

        Returns a dict with the keys:
            ``'results'``: the parsed mail list for the requested page.
            ``'searchTopics'``: ``{'main': {'topics': [...]}, 'singles': [...]}``
                -- the topic distribution aggregated over the returned mails
                and the per-mail distributions. Both are empty when no mail
                was found.
            ``'numFound'``: ``numFound`` from the Solr response.
            ``'searchTerm'``: the search term that was used.
        """
        dataset = Controller.get_arg('dataset')
        core_topics_name = get_config(dataset)['SOLR_CONNECTION']['Core-Topics']
        limit = Controller.get_arg('limit', arg_type=int, required=False)
        offset = Controller.get_arg('offset', arg_type=int, required=False)
        sort = Controller.get_arg('sort', arg_type=str, required=False)

        filter_string = Controller.get_arg('filters', arg_type=str, default='{}', required=False)
        filter_object = json.loads(filter_string)
        filter_query = build_filter_query(filter_object, core_type=core_topics_name)
        term = filter_object.get('searchTerm', '')

        query = build_fuzzy_solr_query(term)

        query_builder = QueryBuilder(
            dataset=dataset,
            query=query,
            limit=limit,
            offset=offset,
            fq=filter_query,
            sort=sort
        )
        solr_result = query_builder.send()

        parsed_solr_result = parse_solr_result(solr_result)
        results = parse_email_list(parsed_solr_result['response']['docs'])

        # No hits: skip the topic lookup and return empty distributions.
        if len(results) == 0:
            return {
                'results': results,
                'searchTopics': {
                    'main': {
                        'topics': []
                    },
                    'singles': [],
                },
                'numFound': parsed_solr_result['response']['numFound'],
                'searchTerm': term
            }

        # Restrict the topic query to exactly the mails on this result page.
        # str.join builds the OR-chain in a single pass and does not depend
        # on `reduce` being available in the module namespace.
        doc_id_filter_query = ' OR '.join(
            'doc_id:' + result_element['doc_id'] for result_element in results)

        facet_query = {
            'facet_topic_id': {
                'type': 'terms',
                'field': 'topic_id',
                'facet': {
                    'sum_of_confs_for_topic': 'sum(topic_conf)',
                    'facet_terms': {
                        'type': 'terms',
                        'field': 'terms',
                        'limit': 1
                    }
                },
                'sort': 'index asc',
                'limit': FACET_LIMIT,
                'refine': True
            }
        }

        # One request delivers both views: grouping by doc_id gives the
        # per-mail distributions, the JSON facet gives the aggregated one.
        query_builder = QueryBuilder(
            dataset=dataset,
            query={
                'q': doc_id_filter_query,
                'group': 'true',
                'group.field': 'doc_id',
                'group.limit': '100',
                'json.facet': json.dumps(facet_query)
            },
            limit=SOLR_MAX_INT,
            core_type='Core-Topics'
        )
        solr_topic_result = query_builder.send()
        topic_dists_for_emails = solr_topic_result['grouped']['doc_id']['groups']
        topic_dists_for_emails_parsed = Search.parse_grouped_topic_distributions(topic_dists_for_emails)

        # The number of doc_id groups is passed to the parser factory for the
        # aggregated distribution.
        parse_topic = Topics.parse_topic_closure_wrapper(len(topic_dists_for_emails))
        aggregated_topic_dist_parsed = [
            parse_topic(bucket)
            for bucket in solr_topic_result['facets']['facet_topic_id']['buckets']
        ]

        all_topics = Topics.get_all_topics(dataset)

        for distribution in topic_dists_for_emails_parsed:
            topics = Topics.complete_distribution_and_add_ranks(distribution['topics'], all_topics)
            topics = Topics.remove_words(topics)
            distribution['topics'] = topics

        aggregated_topic_dist_parsed = Topics.complete_distribution_and_add_ranks(
            aggregated_topic_dist_parsed, all_topics)

        return {
            'results': results,
            'searchTopics': {
                'main': {
                    'topics': aggregated_topic_dist_parsed
                },
                'singles': topic_dists_for_emails_parsed,
            },
            'numFound': parsed_solr_result['response']['numFound'],
            'searchTerm': term
        }
Example #9
0
    def get_sender_recipient_email_list():
        """List the mails exchanged between a sender and a recipient.

        Request arguments: ``dataset``, ``sender`` and ``recipient`` (both
        default to ``'*'``), ``sender_or_recipient`` (optional; when given it
        replaces both and the two conditions are OR-ed instead of AND-ed),
        ``limit``, ``offset`` and ``filters``.

        Returns a dict with the parsed mails (``'results'``), ``'numFound'``,
        the Solr query string that was used (``'query'``) and the sender and
        recipient exactly as they were passed in (``'senderEmail'``,
        ``'recipientEmail'``).

        Raises:
            SyntaxError: if neither sender, recipient nor sender_or_recipient
                was provided.
        """
        dataset = Controller.get_arg('dataset')
        core_topics_name = get_config(dataset)['SOLR_CONNECTION']['Core-Topics']
        requested_sender = Controller.get_arg('sender', default='*')
        requested_recipient = Controller.get_arg('recipient', default='*')
        sender_or_recipient = Controller.get_arg('sender_or_recipient', required=False)
        current_app.logger.debug(
            '########## %s ###### %s ###### %s ###########',
            requested_sender, requested_recipient, sender_or_recipient)
        limit = Controller.get_arg('limit', int, default=DEFAULT_LIMIT)
        offset = Controller.get_arg('offset', int, default=DEFAULT_OFFSET)

        filter_object = json.loads(
            Controller.get_arg('filters', arg_type=str, default='{}', required=False))
        filter_query = build_filter_query(filter_object, False, core_type=core_topics_name)

        nothing_requested = (
            requested_sender == '*'
            and requested_recipient == '*'
            and not sender_or_recipient
        )
        if nothing_requested:
            raise SyntaxError(
                'Please provide sender or recipient or both or sender_or_recipient.'
            )

        if sender_or_recipient:
            operator = 'OR'
            sender_term = recipient_term = sender_or_recipient
        else:
            operator = 'AND'
            sender_term, recipient_term = requested_sender, requested_recipient

        # The '*' wildcard must reach Solr unescaped, so only real names are escaped.
        if sender_term != '*':
            sender_term = re.escape(sender_term)
        if recipient_term != '*':
            # All non-alphanumerics must be escaped so that Solr matches only
            # the identifying_name part of the field. Sender name, email and
            # identifying_name are stored together in one big 'recipients'
            # string, so without naming 'identifying_name' here the other two
            # attributes would be searched as well.
            recipient_term = '*"\'identifying_name\': \'{}\'"*'.format(
                re.escape(recipient_term))

        fuzzy_clause = build_fuzzy_solr_query(filter_object.get('searchTerm', ''))
        q = '(header.sender.identifying_name:{} {} header.recipients:{}) AND {}'.format(
            sender_term, operator, recipient_term, fuzzy_clause)

        solr_result = QueryBuilder(
            dataset=dataset,
            query=q,
            fq=filter_query,
            limit=limit,
            offset=offset,
            sort='Newest first'
        ).send()

        response = parse_solr_result(solr_result)['response']

        return {
            'results': parse_email_list(response['docs']),
            'numFound': response['numFound'],
            'query': q,
            'senderEmail': requested_sender,
            'recipientEmail': requested_recipient
        }