def _index_warc(filename, index, counter):
    """
    Index individual WARC file.

    :param filename: WARC file name
    :param index: Elasticsearch index
    :param counter: Spark counter
    """
    try:
        nlp = spacy.load('en_core_web_sm')
        nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
        helpers.bulk(util.get_es_client(),
                     _generate_docs(index, filename, nlp, counter))
    except Exception as e:
        logger.error(e)
def _retrieve_messages(slice_id, max_slices, scroll_size, index):
    """
    Scroll through one slice of the annotated messages in the given index and
    yield them as (document ID, reduced source document) tuples.

    :param slice_id: scroll slice ID of this worker
    :param max_slices: total number of scroll slices
    :param scroll_size: batch size per scroll request
    :param index: Elasticsearch index
    """
    logger.info('Retrieving initial batch (slice {}/{})'.format(slice_id, max_slices))
    es = util.get_es_client()
    results = util.es_retry(es.search, index=index, scroll='3h', request_timeout=360,
                            size=scroll_size, body={
        "query": {
            "bool": {
                "must": [
                    {"range": {"annotation_version": {"gte": ANNOTATION_VERSION}}},
                    {"wildcard": {"group": "gmane.*"}}
                ]
            }
        },
        "sort": ["group", "headers.date"],
        "_source": ["group", "lang", "headers", "text_plain", "segments"],
        "slice": {
            "id": slice_id,
            "max": max_slices,
            "field": "id_hash"
        }
    })

    try:
        while results['hits']['hits']:
            batch = results['hits']['hits']
            for doc in batch:
                out_doc = doc['_source'].copy()
                # Keep only relevant, non-empty header fields
                out_doc['headers'] = {k: v for k, v in out_doc['headers'].items() if v and k in (
                    'date', 'message_id', 'from', 'to', 'cc', 'in_reply_to',
                    'references', 'subject', 'list_id'
                )}
                yield doc['_id'], out_doc

            logger.info('Retrieving next batch (slice {}/{})'.format(slice_id, max_slices))
            results = util.es_retry(es.scroll, scroll_id=results['_scroll_id'],
                                    scroll='3h', request_timeout=360)
    finally:
        es.clear_scroll(scroll_id=results['_scroll_id'])
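# --- Illustrative usage sketch (assumption, not part of the original module) ---
# Shows how the sliced scroll generator above could be fanned out across Spark
# workers, mirroring the slice/partition pattern used by start_indexer() below.
# The index name 'gmane-mails', the slice count and the scroll size are
# hypothetical placeholders.
def _example_export_messages(index='gmane-mails', max_slices=4, scroll_size=200):
    sc = util.get_spark_context(
        'Mail Export Example',
        additional_conf={'spark.default.parallelism': max_slices})
    # One partition per scroll slice; each partition streams its own slice
    return sc.range(0, max_slices).repartition(max_slices).flatMap(
        lambda i: _retrieve_messages(i, max_slices, scroll_size, index))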
def main(index, output_file, **kwargs):
    """
    Sample mails from Elasticsearch index.

    :param index: Elasticsearch index to sample from
    :param output_file: output file (prefix without extension in case
                        multiple formats are specified)
    """
    output_jsonl = None
    output_text = None

    if 'json' in kwargs['output_format']:
        fname = output_file if len(kwargs['output_format']) == 1 else output_file + '.jsonl'
        output_jsonl = open(fname, 'w')
    if 'text' in kwargs['output_format']:
        fname = output_file if len(kwargs['output_format']) == 1 else output_file + '.txt'
        output_text = open(fname, 'w')

    if kwargs.get('query') is not None:
        query = json.load(kwargs.get('query'))
    else:
        query = {
            "sort": ["warc_id"],
            "size": 200,
            "query": {
                "bool": {
                    "filter": {
                        "bool": {
                            "must_not": [{
                                "query_string": {
                                    "analyze_wildcard": True,
                                    "default_field": "*",
                                    "query": """group:(*.patches OR *.commits* OR *.dist-commits* OR *.version-control* OR *.git* OR *.cvs* OR *.svn* OR *.trunk* OR *.scm* OR *.pkg*) OR (group:(*.bugs* OR *.issues* OR *.bugzilla* OR *.codereview*) OR headers.subject.keyword:(*jira* OR *bugzilla*) OR headers.from_email.keyword:(*bugs* OR *bugzilla* OR *jira* OR *jboss*))"""
                                }
                            }],
                            "must": {
                                "term": {"lang": "en"}
                            },
                            "minimum_should_match": 1,
                            "should": [
                                {"wildcard": {"group": "gmane.culture.*"}},
                                {"wildcard": {"group": "gmane.politics.*"}},
                                {"wildcard": {"group": "gmane.science.*"}},
                                {"wildcard": {"group": "gmane.education.*"}},
                                {"wildcard": {"group": "gmane.music.*"}},
                                {"wildcard": {"group": "gmane.games.*"}},
                                {"wildcard": {"group": "gmane.recreation.*"}}
                            ]
                        }
                    }
                }
            }
        }

    logger.info('Retrieving initial batch')
    es = util.get_es_client()
    results = util.es_retry(es.search, index=index, scroll='10m',
                            size=kwargs['scroll_size'], body=query)

    skip = kwargs['skip']
    if skip > 0:
        logger.info('Skipping ahead {} messages'.format(skip))

    sampled_groups = {}
    num_samples = 0
    num_skipped = 0

    try:
        with tqdm(desc='Calculating progress', unit=' messages') as progress_bar:
            while num_samples < kwargs['total_mails'] and len(results['hits']['hits']) > 0:
                for hit in results['hits']['hits']:
                    if skip > 0 and num_skipped < skip:
                        progress_bar.set_description('Skipping messages')
                        progress_bar.total = skip
                        num_skipped += 1
                        progress_bar.update()
                        continue
                    elif (skip == 0 or num_skipped >= skip) and num_samples == 0:
                        progress_bar.set_description('Sampling messages')
                        progress_bar.total = kwargs['total_mails']
                        progress_bar.n = 0
                        progress_bar.last_print_n = 0
                        progress_bar.update(0)

                    src = hit['_source']
                    text_plain = src['text_plain']

                    prev_samples = sampled_groups.get(src['group'], 0)
                    if kwargs['group_limit'] and prev_samples > kwargs['group_limit']:
                        continue
                    sampled_groups[src['group']] = prev_samples + 1

                    num_samples += 1
                    progress_bar.update()

                    if output_jsonl:
                        json.dump({
                            'text': text_plain,
                            'meta': {k: src[k] for k in src.keys()
                                     if k not in ['text_plain', 'text_html']},
                            'labels': []
                        }, output_jsonl)
                        output_jsonl.write('\n')

                    if output_text:
                        output_text.write(util.normalize_message_text(text_plain))
                        output_text.write('\n')

                    if num_samples >= kwargs['total_mails']:
                        break

                results = util.es_retry(es.scroll, scroll_id=results['_scroll_id'],
                                        scroll='10m')
    finally:
        es.clear_scroll(scroll_id=results['_scroll_id'])

        if output_jsonl:
            output_jsonl.close()
        if output_text:
            output_text.close()
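# --- Illustrative configuration sketch (assumption, not part of the original module) ---
# The sampler accepts a user-supplied query file via the `query` keyword
# argument (read with json.load() above), which replaces the default Gmane
# sampling query. The hypothetical helper below writes such a file; the file
# name and the group filter are placeholders chosen for illustration.
def _example_write_query_file(path='sample_query.json'):
    example_query = {
        "sort": ["warc_id"],
        "size": 200,
        "query": {
            "bool": {
                "must": {"term": {"lang": "en"}},
                "filter": {"wildcard": {"group": "gmane.science.*"}}
            }
        }
    }
    with open(path, 'w') as f:
        json.dump(example_query, f, indent=2)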
def start_indexer(index, segmentation_model, fasttext_model, **kwargs):
    """
    Start annotation indexer.

    :param index: Elasticsearch index
    :param segmentation_model: HDF5 email segmentation model
    :param fasttext_model: fastText email embedding model

    Keyword Args:
        dry_run (bool): Perform dry run, do not actually index anything
        progress_bar (bool): Show indexing progress bar
    """
    if kwargs.get('dry_run'):
        logger.warning('Started in dry run mode, nothing will be indexed.')

    es = util.get_es_client()
    if not es.indices.exists(index=index):
        raise RuntimeError('Index has to exist.')

    logger.info('Updating Elasticsearch index mapping')
    es.indices.put_mapping(index=index, body={
        "properties": {
            "main_content": {"type": "text"},
            "segments": {
                "type": "nested",
                "properties": {
                    "begin": {"type": "integer"},
                    "end": {"type": "integer"},
                    "label": {"type": "keyword"}
                }
            },
            "label_stats": {
                "properties": {
                    "paragraph_quotation.num_ratio": {"type": "float"},
                    "paragraph_quotation.lines_ratio": {"type": "float"}
                }
            },
            "annotation_version": {"type": "short"}
        },
        "dynamic": True,
        "dynamic_templates": [{
            "stats": {
                "path_match": "label_stats.*",
                "mapping": {
                    "properties": {
                        "num": {"type": "integer"},
                        "chars": {"type": "integer"},
                        "lines": {"type": "integer"},
                        "avg_len": {"type": "float"}
                    }
                }
            }
        }]
    })

    slices = kwargs.get('scroll_slices', 2)
    sc = util.get_spark_context(
        'Mail Annotation Indexer',
        additional_conf={'spark.default.parallelism': slices})
    rdd = sc.range(0, slices)
    rdd = rdd.repartition(slices)
    rdd.foreach(partial(_start_spark_worker, index=index,
                        segmentation_model=segmentation_model,
                        fasttext_model=fasttext_model, **kwargs))
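# --- Illustrative invocation sketch (assumption, not part of the original module) ---
# Hypothetical call to the annotation indexer; the index name, model paths and
# scroll settings are placeholders, and the actual CLI wiring is not shown here.
def _example_run_annotation_indexer():
    start_indexer('gmane-mails',
                  segmentation_model='models/segmentation_model.hdf5',
                  fasttext_model='models/fasttext_email.bin',
                  scroll_size=200,
                  scroll_slices=2,
                  dry_run=True)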
def _start_spark_worker(slice_id, index, segmentation_model, fasttext_model, **kwargs):
    """
    Spark worker that annotates and re-indexes one scroll slice of the index.

    :param slice_id: scroll slice ID assigned to this worker
    :param index: Elasticsearch index
    :param segmentation_model: HDF5 email segmentation model
    :param fasttext_model: fastText email embedding model
    """
    # Fix to circumvent Yarn's buggy HOME override
    os.environ['HOME'] = os.environ.get('HADOOP_HOME', os.environ['HOME'])

    logger.info('Loading SpaCy')
    if not spacy.util.is_package('en_core_web_sm'):
        # Install the model into the worker's user site if it is missing
        oldbase = site.USER_BASE
        site.USER_BASE = os.path.join(os.environ['HOME'], '.local')
        site.USER_SITE = site.USER_SITE.replace(oldbase, site.USER_BASE)
        os.makedirs(site.USER_SITE, exist_ok=True)
        sys.path.insert(0, site.USER_SITE)
        spacy.cli.download('en_core_web_sm')

    import en_core_web_sm
    nlp = en_core_web_sm.load()
    nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)

    logger.info('Loading segmentation model')
    load_fasttext_model(fasttext_model)
    segmentation_model = models.load_model(segmentation_model)

    max_slices = kwargs.get('scroll_slices', 2)
    logger.info('Retrieving initial batch (slice {}/{})'.format(slice_id, max_slices))
    es = util.get_es_client()
    results = util.es_retry(es.search, index=index, scroll='45m',
                            size=kwargs['scroll_size'], body={
        'sort': ['_id'],
        'slice': {
            'id': slice_id,
            'max': max_slices,
            'field': 'id_hash'
        },
        'query': {
            'bool': {
                'must': {
                    'wildcard': {'group': 'gmane.*'}
                },
                'must_not': {
                    'range': {
                        'annotation_version': {'gte': ANNOTATION_VERSION}
                    }
                }
            }
        }
    })

    try:
        while results['hits']['hits']:
            logger.info('Processing batch.')
            doc_gen = _generate_docs(results['hits']['hits'], index, segmentation_model, nlp,
                                     progress_bar=False,
                                     anonymize=kwargs.get('anonymize', False))
            try:
                if kwargs.get('dry_run'):
                    while True:
                        next(doc_gen)
                else:
                    # Only start bulk request if generator has at least one element
                    peek = next(doc_gen)
                    helpers.bulk(es, itertools.chain([peek], doc_gen))
                    logger.info('Finished indexing batch.')
            except StopIteration:
                pass

            logger.info('Retrieving next batch (slice {}/{})'.format(slice_id, max_slices))
            results = util.es_retry(es.scroll, scroll_id=results['_scroll_id'],
                                    scroll='45m')
    finally:
        es.clear_scroll(scroll_id=results['_scroll_id'])
def index_directory(input_dir, index):
    """
    Index WARC files from the given directory.

    :param input_dir: input directory containing raw WARC files
    :param index: Elasticsearch index
    """
    es = util.get_es_client()
    sc = util.get_spark_context('Mail WARC Indexer',
                                'Mail WARC Indexer for {}'.format(input_dir))

    if not es.indices.exists(index=index):
        es.indices.create(index=index, body={
            "settings": {
                "number_of_replicas": 0,
                "number_of_shards": 30
            },
            "mappings": {
                "properties": {
                    "modified": {
                        "type": "date",
                        "format": "epoch_millis"
                    },
                    "headers": {
                        "properties": {
                            "date": {
                                "type": "date",
                                "format": "yyyy-MM-dd HH:mm:ssXXX"
                            }
                        }
                    },
                    "id_hash": {"type": "long"},
                    "group": {"type": "keyword"},
                    "warc_file": {"type": "keyword"},
                    "warc_offset": {"type": "long"},
                    "warc_id": {"type": "keyword"},
                    "news_url": {"type": "keyword"},
                    "lang": {"type": "keyword"},
                    "text_plain": {"type": "text"},
                    "text_html": {"type": "text"}
                }
            }
        })

    counter = sc.accumulator(0)

    logger.info('Listing group directories')
    group_dirs = glob(os.path.join(input_dir, 'gmane.*'))
    group_dirs = sc.parallelize(group_dirs, len(group_dirs) // 5)

    logger.info('Listing WARCs')
    warcs = group_dirs.flatMap(lambda d: glob(os.path.join(d, '*.warc.gz')))
    warcs.cache()

    logger.info('Indexing messages')
    warcs.foreach(partial(_index_warc, index=index, counter=counter))
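# --- Illustrative invocation sketch (assumption, not part of the original module) ---
# Hypothetical first-pass call that indexes raw Gmane WARCs from a local
# directory; the directory path and index name are placeholders. The annotation
# indexer above would typically run as a second pass over the resulting index.
def _example_index_raw_warcs():
    index_directory('/data/gmane/warcs', 'gmane-mails')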