def _index_warc(filename, index, counter):
    """
    Index individual WARC file.

    :param filename: WARC file name
    :param index: Elasticsearch index
    :param counter: Spark counter
    """
    try:
        nlp = spacy.load('en_core_web_sm')
        nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
        helpers.bulk(util.get_es_client(),
                     _generate_docs(index, filename, nlp, counter))
    except Exception as e:
        logger.error(e)
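

# For reference, helpers.bulk() consumes standard Elasticsearch bulk actions.
# A minimal sketch of the action dicts that _generate_docs() (not shown here)
# is expected to yield per message; the '_source' fields are illustrative
# assumptions:
#
#     {
#         '_index': index,
#         '_id': doc_id,
#         '_source': {'group': ..., 'headers': ..., 'text_plain': ...}
#     }
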
def _retrieve_messages(slice_id, max_slices, scroll_size, index):
    """
    Stream annotated Gmane messages from one scroll slice of the given index
    as (ID, document) tuples.

    :param slice_id: scroll slice ID
    :param max_slices: total number of scroll slices
    :param scroll_size: number of messages per scroll batch
    :param index: Elasticsearch index
    """
    logger.info('Retrieving initial batch (slice {}/{})'.format(slice_id, max_slices))
    es = util.get_es_client()
    results = util.es_retry(
        es.search, index=index, scroll='3h', request_timeout=360, size=scroll_size, body={
            "query": {
                "bool": {
                    "must": [
                        {"range": {"annotation_version": {"gte": ANNOTATION_VERSION}}},
                        {"wildcard": {"group": "gmane.*"}}
                    ]
                }
            },
            "sort": ["group", "headers.date"],
            "_source": ["group", "lang", "headers", "text_plain", "segments"],
            "slice": {
                "id": slice_id,
                "max": max_slices,
                "field": "id_hash"
            }
        })

    try:
        while results['hits']['hits']:
            batch = results['hits']['hits']

            for doc in batch:
                out_doc = doc['_source'].copy()
                out_doc['headers'] = {k: v for k, v in out_doc['headers'].items() if v and k in (
                    'date', 'message_id', 'from', 'to', 'cc', 'in_reply_to', 'references', 'subject', 'list_id'
                )}
                yield doc['_id'], out_doc

            logger.info('Retrieving next batch (slice {}/{})'.format(slice_id, max_slices))
            results = util.es_retry(es.scroll, scroll_id=results['_scroll_id'], scroll='3h', request_timeout=360)

    finally:
        es.clear_scroll(scroll_id=results['_scroll_id'])
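

# A minimal usage sketch (hypothetical helper, not part of the original code):
# fan the sliced scrolls out over Spark so that each executor streams one
# slice of the index in parallel.
def _example_retrieve_all_slices(index, max_slices=2, scroll_size=100):
    sc = util.get_spark_context('Message Retriever')
    return sc.range(0, max_slices).repartition(max_slices).flatMap(
        lambda i: _retrieve_messages(i, max_slices, scroll_size, index))

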
def main(index, output_file, **kwargs):
    """
    Sample mails from Elasticsearch index.

    Arguments:
        index: Elasticsearch index to sample from
        output_file: output file (prefix without extension in case multiple formats are specified)
    """

    output_jsonl = None
    output_text = None
    if 'json' in kwargs['output_format']:
        fname = output_file if len(
            kwargs['output_format']) == 1 else output_file + '.jsonl'
        output_jsonl = open(fname, 'w')
    if 'text' in kwargs['output_format']:
        fname = output_file if len(
            kwargs['output_format']) == 1 else output_file + '.txt'
        output_text = open(fname, 'w')

    if kwargs.get('query') is not None:
        query = json.load(kwargs.get('query'))
    else:
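        # Default sampling query: English messages from non-technical Gmane
        # groups (culture, politics, science, education, music, games,
        # recreation), excluding version-control and bug-tracker traffic.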
        query = {
            "sort": ["warc_id"],
            "size": 200,
            "query": {
                "bool": {
                    "filter": {
                        "bool": {
                            "must_not": [{
                                "query_string": {
                                    "analyze_wildcard":
                                    True,
                                    "default_field":
                                    "*",
                                    "query":
                                    """group:(*.patches OR *.commits* OR
                                            *.dist-commits* OR *.version-control* OR *.git* OR *.cvs* OR *.svn*
                                            OR *.trunk* OR *.scm* OR *.pkg*) OR (group:(*.bugs* OR *.issues*
                                            OR *.bugzilla* OR *.codereview*) OR 
                                            headers.subject.keyword:(*jira* OR *bugzilla*) OR
                                            headers.from_email.keyword:(*bugs* OR *bugzilla* OR *jira* OR *jboss*))"""
                                }
                            }],
                            "must": {
                                "term": {
                                    "lang": "en"
                                }
                            },
                            "minimum_should_match":
                            1,
                            "should": [{
                                "wildcard": {
                                    "group": "gmane.culture.*"
                                }
                            }, {
                                "wildcard": {
                                    "group": "gmane.politics.*"
                                }
                            }, {
                                "wildcard": {
                                    "group": "gmane.science.*"
                                }
                            }, {
                                "wildcard": {
                                    "group": "gmane.education.*"
                                }
                            }, {
                                "wildcard": {
                                    "group": "gmane.music.*"
                                }
                            }, {
                                "wildcard": {
                                    "group": "gmane.games.*"
                                }
                            }, {
                                "wildcard": {
                                    "group": "gmane.recreation.*"
                                }
                            }]
                        }
                    }
                }
            }
        }

    logger.info('Retrieving initial batch')
    es = util.get_es_client()
    results = util.es_retry(es.search,
                            index=index,
                            scroll='10m',
                            size=kwargs['scroll_size'],
                            body=query)

    skip = kwargs['skip']
    if skip > 0:
        logger.info('Skipping ahead {} messages'.format(skip))

    sampled_groups = {}
    num_samples = 0
    num_skipped = 0

    try:
        with tqdm(desc='Calculating progress',
                  unit=' messages') as progress_bar:
            while num_samples < kwargs['total_mails'] and len(
                    results['hits']['hits']) > 0:
                for hit in results['hits']['hits']:
                    if skip > 0 and num_skipped < skip:
                        progress_bar.set_description('Skipping messages')
                        progress_bar.total = skip
                        num_skipped += 1
                        progress_bar.update()
                        continue
                    elif (skip == 0
                          or num_skipped >= skip) and num_samples == 0:
                        progress_bar.set_description('Sampling messages')
                        progress_bar.total = kwargs['total_mails']
                        progress_bar.n = 0
                        progress_bar.last_print_n = 0
                        progress_bar.update(0)

                    src = hit['_source']
                    text_plain = src['text_plain']

                    prev_samples = sampled_groups.get(src['group'], 0)
                    if kwargs['group_limit'] and prev_samples >= kwargs[
                            'group_limit']:
                        continue
                    sampled_groups[src['group']] = prev_samples + 1

                    num_samples += 1
                    progress_bar.update()

                    if output_jsonl:
                        json.dump(
                            {
                                'text': text_plain,
                                'meta': {
                                    k: src[k]
                                    for k in src.keys()
                                    if k not in ['text_plain', 'text_html']
                                },
                                'labels': []
                            }, output_jsonl)
                        output_jsonl.write('\n')

                    if output_text:
                        output_text.write(
                            util.normalize_message_text(text_plain))
                        output_text.write('\n')

                    if num_samples >= kwargs['total_mails']:
                        break

                results = util.es_retry(es.scroll,
                                        scroll_id=results['_scroll_id'],
                                        scroll='10m')
    finally:
        es.clear_scroll(scroll_id=results['_scroll_id'])

    if output_jsonl:
        output_jsonl.close()
    if output_text:
        output_text.close()
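

# Hypothetical invocation sketch (keyword names are the ones this function
# reads; the index name and sizes are assumed example values):
#
#     main('gmane-mails', 'sample', output_format=['json', 'text'],
#          query=None, scroll_size=200, skip=0, total_mails=1000,
#          group_limit=50)

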
def start_indexer(index, segmentation_model, fasttext_model, **kwargs):
    """
    Start annotation indexer.

    :param index: Elasticsearch index
    :param segmentation_model: HDF5 Email Segmentation model
    :param fasttext_model: fastText email embedding

    Keyword Args:
        dry_run (bool): Perform dry run, do not actually index anything
        progress_bar (bool): Show indexing progress bar
    """

    if kwargs.get('dry_run'):
        logger.warning('Started in dry run mode, nothing will be indexed.')

    es = util.get_es_client()

    if not es.indices.exists(index=index):
        raise RuntimeError('Index has to exist.')

    logger.info('Updating Elasticsearch index mapping')
    es.indices.put_mapping(index=index,
                           body={
                               "properties": {
                                   "main_content": {
                                       "type": "text"
                                   },
                                   "segments": {
                                       "type": "nested",
                                       "properties": {
                                           "begin": {
                                               "type": "integer"
                                           },
                                           "end": {
                                               "type": "integer"
                                           },
                                           "label": {
                                               "type": "keyword"
                                           },
                                       }
                                   },
                                   "label_stats": {
                                       "properties": {
                                           "paragraph_quotation.num_ratio": {
                                               "type": "float"
                                           },
                                           "paragraph_quotation.lines_ratio": {
                                               "type": "float"
                                           }
                                       }
                                   },
                                   "annotation_version": {
                                       "type": "short"
                                   }
                               },
                               "dynamic":
                               True,
                               "dynamic_templates": [{
                                   "stats": {
                                       "path_match": "label_stats.*",
                                       "mapping": {
                                           "properties": {
                                               "num": {
                                                   "type": "integer"
                                               },
                                               "chars": {
                                                   "type": "integer"
                                               },
                                               "lines": {
                                                   "type": "integer"
                                               },
                                               "avg_len": {
                                                   "type": "float"
                                               }
                                           }
                                       }
                                   }
                               }]
                           })

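    # Use one scroll slice per Spark partition so that each worker annotates
    # exactly one disjoint slice of the index.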
    slices = kwargs.get('scroll_slices', 2)
    sc = util.get_spark_context(
        'Mail Annotation Indexer',
        additional_conf={'spark.default.parallelism': slices})
    rdd = sc.range(0, slices)
    rdd = rdd.repartition(slices)
    rdd.foreach(
        partial(_start_spark_worker,
                index=index,
                segmentation_model=segmentation_model,
                fasttext_model=fasttext_model,
                **kwargs))
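

# Hypothetical invocation sketch (paths and index name are assumed example
# values; keyword names match the ones read here and in _start_spark_worker):
#
#     start_indexer('gmane-mails', 'segmentation_model.h5', 'fasttext.bin',
#                   dry_run=True, scroll_size=100, scroll_slices=2)

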
def _start_spark_worker(slice_id, index, segmentation_model, fasttext_model,
                        **kwargs):
    """
    Spark worker: annotate and index one scroll slice of the given index.

    :param slice_id: scroll slice ID assigned to this worker
    :param index: Elasticsearch index
    :param segmentation_model: HDF5 email segmentation model
    :param fasttext_model: fastText email embedding model
    """
    # Fix to circumvent Yarn's buggy HOME override
    os.environ['HOME'] = os.environ.get('HADOOP_HOME', os.environ['HOME'])

    logger.info('Loading SpaCy')
    if not spacy.util.is_package('en_core_web_sm'):
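        # Point the user site-packages at a writable location under the
        # corrected HOME so the downloaded model can be installed and
        # imported on the executor.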
        oldbase = site.USER_BASE
        site.USER_BASE = os.path.join(os.environ['HOME'], '.local')
        site.USER_SITE = site.USER_SITE.replace(oldbase, site.USER_BASE)
        os.makedirs(site.USER_SITE, exist_ok=True)
        sys.path.insert(0, site.USER_SITE)
        spacy.cli.download('en_core_web_sm')
    import en_core_web_sm
    nlp = en_core_web_sm.load()
    nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)

    logger.info('Loading segmentation model')
    load_fasttext_model(fasttext_model)
    segmentation_model = models.load_model(segmentation_model)

    max_slices = kwargs.get('scroll_slices', 2)
    logger.info('Retrieving initial batch (slice {}/{})'.format(
        slice_id, max_slices))
    es = util.get_es_client()
    results = util.es_retry(es.search,
                            index=index,
                            scroll='45m',
                            size=kwargs['scroll_size'],
                            body={
                                'sort': ['_id'],
                                'slice': {
                                    'id': slice_id,
                                    'max': max_slices,
                                    'field': 'id_hash'
                                },
                                'query': {
                                    'bool': {
                                        "must": {
                                            "wildcard": {
                                                "group": "gmane.*"
                                            }
                                        },
                                        'must_not': {
                                            'range': {
                                                'annotation_version': {
                                                    'gte': ANNOTATION_VERSION
                                                }
                                            }
                                        }
                                    }
                                }
                            })

    try:
        while results['hits']['hits']:
            logger.info('Processing batch.')
            doc_gen = _generate_docs(results['hits']['hits'],
                                     index,
                                     segmentation_model,
                                     nlp,
                                     progress_bar=False,
                                     anonymize=kwargs.get('anonymize', False))
            try:
                if kwargs.get('dry_run'):
                    while True:
                        next(doc_gen)
                else:
                    # only start bulk request if generator has at least one element
                    peek = next(doc_gen)
                    helpers.bulk(es, itertools.chain([peek], doc_gen))
                logger.info('Finished indexing batch.')
            except StopIteration:
                pass

            logger.info('Retrieving next batch (slice {}/{})'.format(
                slice_id, max_slices))
            results = util.es_retry(es.scroll,
                                    scroll_id=results['_scroll_id'],
                                    scroll='45m')
    finally:
        es.clear_scroll(scroll_id=results['_scroll_id'])
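

# The peek-and-chain idiom used above generalizes. A minimal sketch, using a
# hypothetical helper name, for skipping empty bulk requests with any action
# generator:
def _bulk_if_nonempty(es, actions):
    actions = iter(actions)
    try:
        first = next(actions)
    except StopIteration:
        return  # nothing to index: skip the bulk request entirely
    helpers.bulk(es, itertools.chain([first], actions))

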
def index_directory(input_dir, index):
    """
    Index WARC files from the given directory.

    :param input_dir: input directory containing raw WARC files
    :param index: Elasticsearch index
    """

    es = util.get_es_client()
    sc = util.get_spark_context('Mail WARC Indexer',
                                'Mail WARC Indexer for {}'.format(input_dir))

    if not es.indices.exists(index=index):
        es.indices.create(index=index,
                          body={
                              "settings": {
                                  "number_of_replicas": 0,
                                  "number_of_shards": 30
                              },
                              "mappings": {
                                  "properties": {
                                      "modified": {
                                          "type": "date",
                                          "format": "epoch_millis"
                                      },
                                      "headers": {
                                          "properties": {
                                              "date": {
                                                  "type":
                                                  "date",
                                                  "format":
                                                  "yyyy-MM-dd HH:mm:ssXXX"
                                              }
                                          }
                                      },
                                      "id_hash": {
                                          "type": "long"
                                      },
                                      "group": {
                                          "type": "keyword"
                                      },
                                      "warc_file": {
                                          "type": "keyword"
                                      },
                                      "warc_offset": {
                                          "type": "long"
                                      },
                                      "warc_id": {
                                          "type": "keyword"
                                      },
                                      "news_url": {
                                          "type": "keyword"
                                      },
                                      "lang": {
                                          "type": "keyword"
                                      },
                                      "text_plain": {
                                          "type": "text"
                                      },
                                      "text_html": {
                                          "type": "text"
                                      }
                                  }
                              }
                          })

    counter = sc.accumulator(0)

    logger.info("Listing group directories")
    group_dirs = glob(os.path.join(input_dir, 'gmane.*'))
    group_dirs = sc.parallelize(group_dirs, max(1, len(group_dirs) // 5))

    logger.info('Listing WARC files')
    warcs = group_dirs.flatMap(lambda d: glob(os.path.join(d, '*.warc.gz')))
    warcs.cache()

    logger.info('Indexing messages')
    warcs.foreach(partial(_index_warc, index=index, counter=counter))
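
# Hypothetical invocation sketch ('/data/warcs' and 'gmane-mails' are assumed
# example values; the directory must contain gmane.* subdirectories of
# .warc.gz files):
#
#     index_directory('/data/warcs', 'gmane-mails')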