def get_subjects_for_doc_ids(doc_ids, dataset):
    """Look up the subject line for every given doc id.

    For each id one email lookup is issued via ``Emails.get_email_from_solr``.
    Ids for which the lookup reports ``numFound == 0`` get the placeholder
    subject ``'NO THREAD DATA FOUND'``.

    Args:
        doc_ids: iterable of document ids to resolve.
        dataset: dataset identifier passed through to the lookup.

    Returns:
        A list of ``{'subject': ..., 'doc_id': ...}`` dicts, in the same
        order as ``doc_ids``.
    """
    def subject_entry(doc_id):
        # One lookup per id; only the first parsed document is used.
        response = parse_solr_result(Emails.get_email_from_solr(dataset, doc_id))['response']
        if response['numFound'] == 0:
            subject = 'NO THREAD DATA FOUND'
        else:
            subject = parse_email_list(response['docs'])[0]['header']['subject']
        return {'subject': subject, 'doc_id': doc_id}

    return [subject_entry(doc_id) for doc_id in doc_ids]
def search():
    """Run an email search and attach topic distributions for the hits.

    Request arguments read via ``Controller.get_arg``: ``dataset``, ``limit``,
    ``offset``, ``sort`` and ``filters`` (a JSON string; its ``searchTerm``
    entry is the search term).

    Two queries are sent: the email search itself, and - only when it yields
    results - a second query against the ``'Core-Topics'`` core that groups
    topic entries by ``doc_id`` and facets them by ``topic_id``.

    Returns:
        A dict with ``results``, ``searchTopics`` (``main`` aggregated
        distribution and per-email ``singles``), ``numFound`` and
        ``searchTerm``. With no results the topic parts are empty lists.
    """
    dataset = Controller.get_arg('dataset')
    core_topics_name = get_config(dataset)['SOLR_CONNECTION']['Core-Topics']

    limit = Controller.get_arg('limit', arg_type=int, required=False)
    offset = Controller.get_arg('offset', arg_type=int, required=False)
    sort = Controller.get_arg('sort', arg_type=str, required=False)

    filter_object = json.loads(
        Controller.get_arg('filters', arg_type=str, default='{}', required=False)
    )
    filter_query = build_filter_query(filter_object, core_type=core_topics_name)
    term = filter_object.get('searchTerm', '')

    email_query = QueryBuilder(
        dataset=dataset,
        query=build_fuzzy_solr_query(term),
        limit=limit,
        offset=offset,
        fq=filter_query,
        sort=sort,
    )
    parsed_solr_result = parse_solr_result(email_query.send())
    results = parse_email_list(parsed_solr_result['response']['docs'])

    # Nothing found: skip the topic query entirely.
    if len(results) == 0:
        return {
            'results': results,
            'searchTopics': {
                'main': {
                    'topics': []
                },
                'singles': [],
            },
            'numFound': parsed_solr_result['response']['numFound'],
            'searchTerm': term
        }

    # Restrict the topic query to exactly the emails found above.
    doc_id_filter_query = ' OR '.join('doc_id:' + hit['doc_id'] for hit in results)

    facet_query = {
        'facet_topic_id': {
            'type': 'terms',
            'field': 'topic_id',
            'facet': {
                'sum_of_confs_for_topic': 'sum(topic_conf)',
                'facet_terms': {
                    'type': 'terms',
                    'field': 'terms',
                    'limit': 1
                }
            },
            'sort': 'index asc',
            'limit': FACET_LIMIT,
            'refine': True
        }
    }
    topic_query = QueryBuilder(
        dataset=dataset,
        query={
            'q': doc_id_filter_query,
            'group': 'true',
            'group.field': 'doc_id',
            'group.limit': '100',
            'json.facet': json.dumps(facet_query)
        },
        limit=SOLR_MAX_INT,
        core_type='Core-Topics'
    )
    solr_topic_result = topic_query.send()

    groups = solr_topic_result['grouped']['doc_id']['groups']
    single_distributions = Search.parse_grouped_topic_distributions(groups)

    # The bucket parser is parameterised with the number of grouped emails.
    parse_bucket = Topics.parse_topic_closure_wrapper(len(groups))
    aggregated_distribution = [
        parse_bucket(bucket)
        for bucket in solr_topic_result['facets']['facet_topic_id']['buckets']
    ]

    all_topics = Topics.get_all_topics(dataset)

    for single in single_distributions:
        completed = Topics.complete_distribution_and_add_ranks(single['topics'], all_topics)
        single['topics'] = Topics.remove_words(completed)

    aggregated_distribution = Topics.complete_distribution_and_add_ranks(
        aggregated_distribution, all_topics)

    return {
        'results': results,
        'searchTopics': {
            'main': {
                'topics': aggregated_distribution
            },
            'singles': single_distributions,
        },
        'numFound': parsed_solr_result['response']['numFound'],
        'searchTerm': term
    }
def get_email_by_doc_id():
    """Return a single email with topic distributions and thread neighbours.

    Request arguments read via ``Controller.get_arg``: ``dataset`` and
    ``doc_id``.

    Returns:
        * the parsed lookup result unchanged when ``numFound`` is 0;
        * otherwise a dict with ``email``, ``numFound`` and ``searchTerm``
          (the requested ``doc_id``). The email is extended with ``topics``
          (its own distribution under ``main`` and those of the similar
          emails under ``singles``) and with ``predecessor`` / ``successor``
          resolved to ``{'subject', 'doc_id'}`` entries;
        * a dict with only ``numFound`` and ``searchTerm`` when the first raw
          document is falsy.
    """
    dataset = Controller.get_arg('dataset')
    doc_id = Controller.get_arg('doc_id')

    solr_result = Emails.get_email_from_solr(dataset, doc_id, True)
    parsed_solr_result = parse_solr_result(solr_result)
    response = parsed_solr_result['response']
    num_found = response['numFound']

    if num_found == 0:
        return parsed_solr_result

    email = parse_email_list(response['docs'])[0]

    # Similar emails are keyed by the id of the first raw document.
    first_raw_id = solr_result['response']['docs'][0]['id']
    similar_ids = [similar['doc_id'] for similar in solr_result['moreLikeThis'][first_raw_id]['docs']]

    # Recipients and keyphrases are stored as Python-literal strings; the
    # placeholder values mark "nothing stored" and are left untouched.
    header = email['header']
    if header['recipients'][0] != 'NO RECIPIENTS FOUND':
        header['recipients'] = [literal_eval(entry) for entry in header['recipients']]

    if email['keyphrases'][0] != 'NO KEYPHRASES FOUND':
        email['keyphrases'] = [literal_eval(entry)[0] for entry in email['keyphrases']]

    if not response['docs'][0]:
        return {
            'numFound': num_found,
            'searchTerm': doc_id
        }

    own_topics = Emails.parse_topics(Emails.get_topic_distribution_for_email(dataset, doc_id))
    all_topics_parsed = parse_all_topics(Emails.get_all_topics(dataset)['response']['docs'])
    own_topics = Topics.complete_distribution_and_add_ranks(own_topics, all_topics_parsed)

    singles = []
    if similar_ids:
        # Phase 1: fetch and parse every similar email's distribution.
        raw_distributions = []
        for similar_id in similar_ids:
            distribution_result = Emails.get_topic_distribution_for_email(dataset, similar_id)
            raw_distributions.append(Emails.parse_topics(distribution_result))

        # Phase 2: complete each distribution and tag it with its email id.
        for similar_id, raw_distribution in zip(similar_ids, raw_distributions):
            completed = Topics.complete_distribution_and_add_ranks(raw_distribution, all_topics_parsed)
            singles.append({
                'topics': Topics.remove_words(completed),
                'highlightId': similar_id
            })

    email['topics'] = {
        'main': {
            'topics': own_topics
        },
        'singles': singles
    }

    predecessor = email['predecessor']
    if predecessor == 'NO THREAD DATA FOUND':
        email['predecessor'] = {
            'subject': predecessor,
            'doc_id': ''
        }
    else:
        email['predecessor'] = Emails.get_subjects_for_doc_ids([predecessor], dataset)[0]

    successors = email['successor']
    if successors[0] == 'NO THREAD DATA FOUND':
        # Only the first slot is replaced, in place.
        successors[0] = {
            'subject': successors[0],
            'doc_id': ''
        }
    else:
        email['successor'] = Emails.get_subjects_for_doc_ids(successors, dataset)

    return {
        'email': email,
        'numFound': num_found,
        'searchTerm': doc_id
    }
def get_similar_emails_by_doc_id():
    """Return the emails similar to one email plus a per-day category histogram.

    Request arguments read via ``Controller.get_arg``: ``dataset`` and
    ``doc_id``.

    Returns:
        ``[]`` when the email is not found or has no similar emails.
        Otherwise a dict with

        * ``docs``: the parsed similar emails;
        * ``dates``: ``{'month': [], 'week': [], 'day': [...]}`` where ``day``
          holds one entry per calendar day between the earliest and latest
          dated email, with ``business`` / ``personal`` / ``spam`` counters.
          The entry of the requested email's day additionally carries
          ``'this email': 1``. Emails without a date are not represented, and
          ``day`` is empty when no email has a date at all.

    Raises:
        KeyError: if a similar email's ``category`` is not one of the three
            counted categories (unchanged from the previous behaviour).
    """
    def day_of(mail):
        # Keep only the part of the timestamp before "T"; None marks "no date".
        raw_date = mail['header']['date']
        return raw_date.split("T")[0] if raw_date != 'NO DATE FOUND' else None

    def empty_bucket(day):
        return {
            'date': day,
            'business': 0,
            'personal': 0,
            'spam': 0
        }

    dataset = Controller.get_arg('dataset')
    doc_id = Controller.get_arg('doc_id')

    solr_result = Emails.get_email_from_solr(dataset, doc_id, more_like_this=True)

    if solr_result['response']['numFound'] == 0 or \
            solr_result['moreLikeThis'][solr_result['response']['docs'][0]['id']]['numFound'] == 0:
        return []

    parsed_solr_result = parse_solr_result(solr_result)
    main_email = parse_email_list(parsed_solr_result['response']['docs'])[0]

    # Wrap the similar documents in the response shape parse_solr_result expects.
    result = {
        'response': {
            'docs': solr_result['moreLikeThis'][main_email['id']]['docs']
        }
    }
    parsed_similar_result = parse_solr_result(result)
    parsed_similar_mails = parse_email_list(parsed_similar_result['response']['docs'])

    main_day = day_of(main_email)
    main_bucket = empty_bucket(main_day)
    main_bucket['this email'] = 1
    similar_dates = {main_day: main_bucket}

    for mail in parsed_similar_mails:
        day = day_of(mail)
        if day not in similar_dates:
            similar_dates[day] = empty_bucket(day)
        similar_dates[day][mail['category']] += 1

    known_days = [day for day in similar_dates if day is not None]

    # Without any dated email there is no range to fill; min()/max() on an
    # empty list would raise ValueError, so the gap filling is skipped.
    if known_days:
        start_date = datetime.datetime.strptime(min(known_days), '%Y-%m-%d')
        end_date = datetime.datetime.strptime(max(known_days), '%Y-%m-%d')

        # Insert zero buckets for days without emails so the axis is continuous.
        for offset in range((end_date - start_date).days):
            day = (start_date + datetime.timedelta(days=offset)).strftime('%Y-%m-%d')
            if day not in similar_dates:
                similar_dates[day] = empty_bucket(day)

    day_buckets = sorted(
        (bucket for bucket in similar_dates.values() if bucket['date'] is not None),
        key=lambda bucket: bucket['date']
    )

    for bucket in day_buckets:
        bucket['date'] = Dates.format_date_for_axis(bucket['date'], 'day')

    return {
        'docs': parsed_similar_mails,
        'dates': {
            'month': [],
            'week': [],
            'day': day_buckets
        }
    }
def get_sender_recipient_email_list():
    """List emails exchanged between a sender and a recipient.

    Request arguments read via ``Controller.get_arg``: ``dataset``,
    ``sender`` and ``recipient`` (both default to ``'*'``),
    ``sender_or_recipient``, ``limit``, ``offset`` and ``filters`` (a JSON
    string; its ``searchTerm`` entry narrows the result).

    When ``sender_or_recipient`` is given it replaces both names and the two
    conditions are joined with ``OR`` instead of ``AND``.

    Returns:
        A dict with ``results``, ``numFound``, the Solr ``query`` string and
        the sender / recipient values as originally requested
        (``senderEmail``, ``recipientEmail``).

    Raises:
        SyntaxError: if neither sender, recipient nor sender_or_recipient
            is provided.
    """
    dataset = Controller.get_arg('dataset')
    core_topics_name = get_config(dataset)['SOLR_CONNECTION']['Core-Topics']

    sender = Controller.get_arg('sender', default='*')
    recipient = Controller.get_arg('recipient', default='*')
    sender_or_recipient = Controller.get_arg('sender_or_recipient', required=False)
    current_app.logger.debug(
        '########## %s ###### %s ###### %s ###########', sender, recipient, sender_or_recipient)

    limit = Controller.get_arg('limit', int, default=DEFAULT_LIMIT)
    offset = Controller.get_arg('offset', int, default=DEFAULT_OFFSET)

    filter_object = json.loads(
        Controller.get_arg('filters', arg_type=str, default='{}', required=False)
    )
    filter_query = build_filter_query(filter_object, False, core_type=core_topics_name)

    anything_given = sender != '*' or recipient != '*' or sender_or_recipient
    if not anything_given:
        raise SyntaxError(
            'Please provide sender or recipient or both or sender_or_recipient.'
        )

    # Remember the values as requested; they are echoed back in the response.
    original_sender, original_recipient = sender, recipient

    if sender_or_recipient:
        sender = recipient = sender_or_recipient

    if sender != '*':
        sender = re.escape(sender)

    if recipient != '*':
        # Name, email and identifying_name of a recipient are stored together
        # in one big 'recipients' string in Solr. The pattern therefore pins
        # the match to the identifying_name part (with all non-alphanumerics
        # escaped); otherwise 'name' and 'email' would be searched as well.
        recipient = '*"\'identifying_name\': \'{}\'"*'.format(re.escape(recipient))

    term_query = build_fuzzy_solr_query(filter_object.get('searchTerm', ''))
    joiner = 'OR' if sender_or_recipient else 'AND'
    q = '(header.sender.identifying_name:{} {} header.recipients:{}) AND {}'.format(
        sender, joiner, recipient, term_query)

    solr_result = QueryBuilder(
        dataset=dataset,
        query=q,
        fq=filter_query,
        limit=limit,
        offset=offset,
        sort='Newest first'
    ).send()
    response = parse_solr_result(solr_result)['response']

    return {
        'results': parse_email_list(response['docs']),
        'numFound': response['numFound'],
        'query': q,
        'senderEmail': original_sender,
        'recipientEmail': original_recipient
    }