Exemple #1
0
def extract_from_file(source_settings, destination):
    """Load bug documents from a file into ``destination``, batch by batch.

    The file named by ``source_settings.filename`` is iterated and grouped
    via ``Q.groupby(..., size=BATCH_SIZE)``.  Every entry of a batch is passed
    through ``fix_json``, ``CNV.JSON2object`` and
    ``transform_bugzilla.normalize``, wrapped as ``{"id": ..., "value": ...}``
    and the whole batch is handed to ``destination.add``.

    A failing batch does not abort the run: the batch is written to a file
    named ``Error_<Random.hex(20)>.txt``, a warning is logged, and the loop
    moves on to the next batch.

    :param source_settings: settings object; only ``filename`` is read here.
    :param destination: sink exposing ``add(docs)``.
    """
    with File(source_settings.filename).iter() as handle:
        for g, d in Q.groupby(handle, size=BATCH_SIZE):
            try:
                # NORMALIZE THE WHOLE BATCH FIRST, THEN WRAP FOR INDEXING
                normalized = [
                    transform_bugzilla.normalize(
                        CNV.JSON2object(fix_json(line)))
                    for line in d
                ]
                docs = [{"id": bug.id, "value": bug} for bug in normalized]
                destination.add(docs)
            except Exception as e:
                # KEEP THE OFFENDING BATCH ON DISK SO IT CAN BE INSPECTED LATER
                filename = "Error_" + Random.hex(20) + ".txt"
                File(filename).write(d)
                # PLACEHOLDER MUST MATCH THE "filename" PARAMETER BELOW
                D.warning(
                    "Can not convert block {{block}} (file={{filename}})", {
                        "block": g,
                        "filename": filename
                    }, e)
Exemple #2
0
def main(settings):
    """Run the ETL, either from a file or by syncing with a source ES index.

    When ``settings.source.filename`` is set, a new timestamped index is
    created from the schema file, filled by ``extract_from_file``, and the
    alias is pointed at it.  Otherwise the bugs reported by ``get_pending``
    (modified since seven days before the destination's last update) are
    fetched from the source index in batches, normalized, and added to the
    destination.

    :param settings: settings object with ``source`` and ``destination``.
    """
    # USE A FILE
    if settings.source.filename is not None:
        target = settings.destination
        # THE CONFIGURED INDEX NAME BECOMES THE ALIAS; THE REAL INDEX GETS A
        # TIMESTAMP SUFFIX
        target.alias = target.index
        stamp = CNV.datetime2string(datetime.utcnow(), "%Y%m%d_%H%M%S")
        target.index = target.alias + stamp
        schema = CNV.JSON2object(File(settings.source.schema_filename).read())

        index = ElasticSearch.create_index(settings.destination, schema)
        index.set_refresh_interval(-1)
        extract_from_file(settings.source, index)
        index.set_refresh_interval(1)

        index.delete_all_but(settings.destination.alias,
                             settings.destination.index)
        index.add_alias(settings.destination.alias)
        return

    # SYNCH WITH source ES INDEX
    es_source = ElasticSearch(settings.source)
    es_dest = get_or_create_index(settings["destination"], es_source)
    since = get_last_updated(es_dest) - timedelta(days=7)
    todo = get_pending(es_source, since)

    # pending IS IN {"bug_id":b, "count":c} FORM
    # MAIN ETL LOOP
    for _, bugs in Q.groupby(todo, max_size=BATCH_SIZE):
        in_batch = {"terms": {"bug_id": bugs}}
        recently_modified = {
            "range": {
                "modified_ts": {"gte": CNV.datetime2milli(since)}
            }
        }
        request = {
            "query": {
                "filtered": {
                    "query": {"match_all": {}},
                    "filter": {"and": [in_batch, recently_modified]}
                }
            },
            "from": 0,
            "size": 200000,
            "sort": []
        }
        result = es_source.search(request)

        # NORMALIZE EVERY HIT FIRST, THEN WRAP FOR INDEXING
        normalized = [
            transform_bugzilla.normalize(
                transform_bugzilla.rename_attachments(hit))
            for hit in result.hits.hits
        ]
        docs = [{"id": bug.id, "value": bug} for bug in normalized]
        es_dest.add(docs)