def get_correspondents_for_search():
    """Return correspondents matching the current search request.

    Reads ``dataset``, optional ``sort`` and optional ``filters`` (a JSON
    string) from the request args, runs a Solr query faceted on the sender
    field, and delegates result shaping to
    ``Correspondents.build_correspondents_for_search_result``.
    """
    dataset = Controller.get_arg('dataset')
    topics_core = get_config(dataset)['SOLR_CONNECTION']['Core-Topics']
    sort_order = Controller.get_arg('sort', arg_type=str, required=False)
    raw_filters = Controller.get_arg(
        'filters', arg_type=str, default='{}', required=False)
    parsed_filters = json.loads(raw_filters)
    fq = build_filter_query(parsed_filters, core_type=topics_core)
    search_term = parsed_filters.get('searchTerm', '')

    # Facet (group) on the sender so each bucket is one correspondent.
    solr_query = {
        'q': build_fuzzy_solr_query(search_term),
        'facet': 'true',
        'facet.mincount': '1',
        'facet.limit': str(FACET_LIMIT),
        'facet.field': 'header.sender.identifying_name',
    }
    solr_result = QueryBuilder(
        dataset=dataset, query=solr_query, fq=fq, limit=0).send()
    return Correspondents.build_correspondents_for_search_result(
        solr_result, dataset, sort_order)
def get_classes_for_correspondent():
    """Return each top subcategory's share of a correspondent's mails.

    Groups the correspondent's matching mails by
    ``category.top_subcategory`` and returns, per group, its key, absolute
    count and share of the total (rounded to 4 decimals). Returns an empty
    list when no mail matches.
    """
    dataset = Controller.get_arg('dataset')
    topics_core = get_config(dataset)['SOLR_CONNECTION']['Core-Topics']
    escaped_name = re.escape(Controller.get_arg('identifying_name'))
    parsed_filters = json.loads(Controller.get_arg(
        'filters', arg_type=str, default='{}', required=False))
    fq = build_filter_query(parsed_filters, False, core_type=topics_core)

    fuzzy_term = build_fuzzy_solr_query(parsed_filters.get('searchTerm', ''))
    solr_result = QueryBuilder(
        dataset=dataset,
        query={
            'q': 'header.sender.identifying_name:' + escaped_name
                 + ' AND ' + fuzzy_term,
            'group': 'true',
            'group.field': 'category.top_subcategory'
        },
        fq=fq,
        fl='groupValue'
    ).send()

    grouped = solr_result['grouped']['category.top_subcategory']
    total = grouped['matches']
    if total == 0:
        return []
    return [
        {
            'key': bucket['groupValue'],
            'num': bucket['doclist']['numFound'],
            'share': round(bucket['doclist']['numFound'] / total, 4)
        }
        for bucket in grouped['groups']
    ]
def get_date_facet_result(dataset, filter_query, term, identifying_name_filter,
                          start_date, end_date, bin_size, category):
    """Facet matching mails over a date range in day/week/month bins.

    ``bin_size`` selects the Solr range gap: 'day' -> +1DAY,
    'week' -> +7DAYS, anything else -> +1MONTH. The category and
    identifying-name constraints are applied via the query's own 'fq'
    entry, in addition to ``filter_query``.
    """
    gap_by_bin = {'day': '+1DAY', 'week': '+7DAYS'}
    facet_gap = gap_by_bin.get(bin_size, '+1MONTH')
    solr_result = QueryBuilder(
        dataset=dataset,
        query={
            'q': build_fuzzy_solr_query(term),
            'facet': 'true',
            'facet.range': 'header.date',
            'facet.range.start': start_date,
            'facet.range.end': end_date,
            'facet.range.gap': facet_gap,
            'fq': '+category.top_category:' + category + '+'
                  + identifying_name_filter
        },
        limit=0,
        fq=filter_query
    ).send()
    return Dates.build_dates_for_search_result(solr_result, bin_size)
def get_topics_for_correspondent():
    """Aggregate topic distributions for a single correspondent.

    Builds a cross-core join filter (main core -> topics core via doc_id),
    fetches the correspondent's aggregated topic distribution plus the
    per-mail distributions, then completes every distribution with ranks
    against the full topic list and strips the word lists from the
    per-mail entries.
    """
    dataset = Controller.get_arg('dataset')
    main_core = get_config(dataset)['SOLR_CONNECTION']['Core']
    topics_core = get_config(dataset)['SOLR_CONNECTION']['Core-Topics']
    escaped_name = re.escape(Controller.get_arg('identifying_name'))
    # Join from the main core into the topics core via doc_id.
    join_prefix = '{!join from=doc_id fromIndex=' + main_core + ' to=doc_id}'
    parsed_filters = json.loads(Controller.get_arg(
        'filters', arg_type=str, default='{}', required=False))
    join_query = build_filter_query(
        parsed_filters, False, True, join_prefix, core_type=topics_core)
    fuzzy_term = build_fuzzy_solr_query(parsed_filters.get('searchTerm', ''))
    join_query.append(
        join_prefix + 'header.sender.identifying_name:' + escaped_name
        + ' AND ' + fuzzy_term)

    aggregated_distribution = {
        'topics': Topics.get_aggregated_distribution(
            dataset, topics_core, escaped_name, parsed_filters, join_query)
    }
    all_topics = Topics.get_all_topics(dataset)
    per_mail_distributions = Topics.get_distributions_for_mails(
        dataset, join_query)
    all_topic_distributions = {
        'main': aggregated_distribution,
        'singles': per_mail_distributions
    }

    # Normalise each per-mail distribution: fill in missing topics,
    # assign ranks, then drop the word lists.
    for distribution in all_topic_distributions['singles']:
        completed = Topics.complete_distribution_and_add_ranks(
            distribution['topics'], all_topics)
        distribution['topics'] = Topics.remove_words(completed)
    all_topic_distributions['main']['topics'] = \
        Topics.complete_distribution_and_add_ranks(
            all_topic_distributions['main']['topics'], all_topics)
    return all_topic_distributions
def search_correspondences_for_term(dataset, filter_string):
    """Build a sender -> recipients correspondence list for a search term.

    Facets matching mails by sender and, nested inside each sender bucket,
    by recipient. Recipient facet values are Python-literal dicts stored
    as strings, hence the ``literal_eval``. Returns a list of
    ``{'source': ..., 'targets': [...]}`` entries (empty when nothing
    matched).
    """
    filter_object = json.loads(filter_string)
    topics_core = get_config(dataset)['SOLR_CONNECTION']['Core-Topics']
    fq = build_filter_query(filter_object, core_type=topics_core)
    term = filter_object.get('searchTerm', '')

    facet_query = {
        'senders': {
            'type': 'terms',
            'field': 'header.sender.identifying_name',
            'facet': {
                'recipients': {
                    'type': 'terms',
                    'field': 'header.recipients',
                    'limit': FACET_LIMIT,
                    'refine': True
                }
            },
            'limit': SOLR_MAX_INT,
            'refine': True
        }
    }
    solr_result = QueryBuilder(
        dataset=dataset,
        query={
            'q': build_fuzzy_solr_query(term),
            'json.facet': json.dumps(facet_query)
        },
        limit=0,
        fq=fq
    ).send()

    if solr_result['facets']['count'] == 0:
        return []

    correspondences = []
    for sender_bucket in solr_result['facets']['senders']['buckets']:
        # De-duplicate targets via a set before listing them.
        targets = {
            literal_eval(recipient_bucket.get('val', '')).get(
                'identifying_name', '')
            for recipient_bucket in sender_bucket['recipients']['buckets']
        }
        correspondences.append({
            'source': sender_bucket.get('val', ''),
            'targets': list(targets)
        })
    return correspondences
def get_keyphrases_for_correspondent():
    """Return aggregated keyphrases over one correspondent's mails.

    Facets the correspondent's matching mails on the ``keyphrases`` field
    and hands the raw facet counts to ``Keyphrases.parse_keyphrases``.
    Returns the empty facet list unchanged when nothing matched.
    """
    dataset = Controller.get_arg('dataset')
    topics_core = get_config(dataset)['SOLR_CONNECTION']['Core-Topics']
    escaped_name = re.escape(Controller.get_arg('identifying_name'))
    filter_object = json.loads(Controller.get_arg(
        'filters', arg_type=str, default='{}', required=False))
    fq = build_filter_query(filter_object, core_type=topics_core)

    fuzzy_term = build_fuzzy_solr_query(filter_object.get('searchTerm', ''))
    solr_result = QueryBuilder(
        dataset=dataset,
        query={
            'q': 'header.sender.identifying_name:' + escaped_name
                 + ' AND ' + fuzzy_term,
            'facet': 'true',
            'facet.field': 'keyphrases',
            'facet.mincount': '1'
        },
        fq=fq,
        limit=0,
    ).send()

    keyphrase_counts = parse_solr_result(
        solr_result)['facet_counts']['facet_fields']['keyphrases']
    if not keyphrase_counts:
        return keyphrase_counts
    return Keyphrases.parse_keyphrases(keyphrase_counts)
def get_aggregated_distribution(dataset, core_topics_name, identifying_name,
                                filter_object, join_query):
    """Aggregate per-topic confidences over one correspondent's mails.

    Runs two queries: a JSON facet on the topics core summing topic
    confidences per topic_id, and a count of the correspondent's emails
    in the main core. Each topic bucket is then normalised by that email
    count via ``Topics.parse_topic_closure_wrapper``. Returns an empty
    list when the facet matched nothing or the email count is zero.
    """
    topic_facets = {
        'facet_topic_id': {
            'type': 'terms',
            'field': 'topic_id',
            'facet': {
                'sum_of_confs_for_topic': 'sum(topic_conf)',
                'facet_terms': {
                    'type': 'terms',
                    'field': 'terms',
                    'limit': 1
                }
            },
            'sort': 'index asc',
            'limit': FACET_LIMIT,
            'refine': True
        }
    }
    # All topics the pipeline returned (with confidences) for this correspondent.
    distribution_result = QueryBuilder(
        dataset=dataset,
        query={'q': '*:*', 'json.facet': json.dumps(topic_facets)},
        fq=join_query,
        limit=0,
        core_type='Core-Topics'
    ).send()

    fq = build_filter_query(filter_object, False, core_type=core_topics_name)
    count_result = QueryBuilder(
        dataset=dataset,
        query='header.sender.identifying_name:' + identifying_name
              + ' AND '
              + build_fuzzy_solr_query(filter_object.get('searchTerm', '')),
        fq=fq,
        limit=0
    ).send()
    total_email_count = count_result['response']['numFound']

    if distribution_result['facets']['count'] == 0:
        return []
    if not total_email_count:
        return []
    parse_bucket = Topics.parse_topic_closure_wrapper(total_email_count)
    buckets = distribution_result['facets']['facet_topic_id']['buckets']
    return [parse_bucket(bucket) for bucket in buckets]
def search():
    """Full-text email search enriched with topic distributions.

    Runs the fuzzy search over the main core, then — restricted to the
    doc_ids on the current result page — queries the topics core for an
    aggregated topic distribution ('main') and one distribution per email
    ('singles'). Returns results, both distributions, the total hit count
    and the search term.
    """
    dataset = Controller.get_arg('dataset')
    topics_core = get_config(dataset)['SOLR_CONNECTION']['Core-Topics']
    limit = Controller.get_arg('limit', arg_type=int, required=False)
    offset = Controller.get_arg('offset', arg_type=int, required=False)
    sort_order = Controller.get_arg('sort', arg_type=str, required=False)
    parsed_filters = json.loads(Controller.get_arg(
        'filters', arg_type=str, default='{}', required=False))
    fq = build_filter_query(parsed_filters, core_type=topics_core)
    term = parsed_filters.get('searchTerm', '')

    solr_result = QueryBuilder(
        dataset=dataset,
        query=build_fuzzy_solr_query(term),
        limit=limit,
        offset=offset,
        fq=fq,
        sort=sort_order
    ).send()
    parsed_result = parse_solr_result(solr_result)
    emails = parse_email_list(parsed_result['response']['docs'])

    if not emails:
        return {
            'results': emails,
            'searchTopics': {
                'main': {'topics': []},
                'singles': [],
            },
            'numFound': parsed_result['response']['numFound'],
            'searchTerm': term
        }

    # Restrict the topic query to exactly the emails on this result page.
    doc_id_query = ' OR '.join(
        'doc_id:' + email['doc_id'] for email in emails)
    facet_query = {
        'facet_topic_id': {
            'type': 'terms',
            'field': 'topic_id',
            'facet': {
                'sum_of_confs_for_topic': 'sum(topic_conf)',
                'facet_terms': {
                    'type': 'terms',
                    'field': 'terms',
                    'limit': 1
                }
            },
            'sort': 'index asc',
            'limit': FACET_LIMIT,
            'refine': True
        }
    }
    topic_result = QueryBuilder(
        dataset=dataset,
        query={
            'q': doc_id_query,
            'group': 'true',
            'group.field': 'doc_id',
            'group.limit': '100',
            'json.facet': json.dumps(facet_query)
        },
        limit=SOLR_MAX_INT,
        core_type='Core-Topics'
    ).send()

    grouped_dists = topic_result['grouped']['doc_id']['groups']
    singles = Search.parse_grouped_topic_distributions(grouped_dists)
    # Aggregated distribution is normalised by the number of grouped emails.
    aggregated = list(map(
        Topics.parse_topic_closure_wrapper(len(grouped_dists)),
        topic_result['facets']['facet_topic_id']['buckets']
    ))

    all_topics = Topics.get_all_topics(dataset)
    for distribution in singles:
        completed = Topics.complete_distribution_and_add_ranks(
            distribution['topics'], all_topics)
        distribution['topics'] = Topics.remove_words(completed)
    aggregated = Topics.complete_distribution_and_add_ranks(
        aggregated, all_topics)

    return {
        'results': emails,
        'searchTopics': {
            'main': {'topics': aggregated},
            'singles': singles,
        },
        'numFound': parsed_result['response']['numFound'],
        'searchTerm': term
    }
def get_sender_recipient_email_list():
    """List emails exchanged between a sender and a recipient.

    Accepts 'sender', 'recipient' or the combined 'sender_or_recipient'
    request argument; at least one must be provided or a SyntaxError is
    raised. Returns the paged email list, the hit count, the Solr query
    used and the originally requested sender/recipient values.
    """
    dataset = Controller.get_arg('dataset')
    topics_core = get_config(dataset)['SOLR_CONNECTION']['Core-Topics']
    sender = Controller.get_arg('sender', default='*')
    recipient = Controller.get_arg('recipient', default='*')
    sender_or_recipient = Controller.get_arg(
        'sender_or_recipient', required=False)
    current_app.logger.debug(
        '########## %s ###### %s ###### %s ###########',
        sender, recipient, sender_or_recipient)
    limit = Controller.get_arg('limit', int, default=DEFAULT_LIMIT)
    offset = Controller.get_arg('offset', int, default=DEFAULT_OFFSET)
    parsed_filters = json.loads(Controller.get_arg(
        'filters', arg_type=str, default='{}', required=False))
    fq = build_filter_query(parsed_filters, False, core_type=topics_core)

    if sender == '*' and recipient == '*' and not sender_or_recipient:
        raise SyntaxError(
            'Please provide sender or recipient or both or sender_or_recipient.'
        )

    original_sender, original_recipient = sender, recipient
    if sender_or_recipient:
        sender = recipient = sender_or_recipient
    if sender != '*':
        sender = re.escape(sender)
    if recipient != '*':
        # all non-alphanumerics must be escaped in order for Solr to match only
        # the identifying_name field-part: if we DIDN'T specify
        # 'identifying_name' for 'recipients' here, also 'name' and 'email'
        # would be searched because all these three attributes are stored in
        # one big 'recipients' string in Solr!
        recipient = '*"\'identifying_name\': \'{}\'"*'.format(
            re.escape(recipient))

    operator = 'OR' if sender_or_recipient else 'AND'
    q = '(header.sender.identifying_name:{} {} header.recipients:{}) AND {}'.format(
        sender, operator, recipient,
        build_fuzzy_solr_query(parsed_filters.get('searchTerm', '')))

    solr_result = QueryBuilder(
        dataset=dataset,
        query=q,
        fq=fq,
        limit=limit,
        offset=offset,
        sort='Newest first'
    ).send()
    parsed_result = parse_solr_result(solr_result)
    return {
        'results': parse_email_list(parsed_result['response']['docs']),
        'numFound': parsed_result['response']['numFound'],
        'query': q,
        'senderEmail': original_sender,
        'recipientEmail': original_recipient
    }