def replicate(source, destination, pending, last_updated):
    """
    COPY source RECORDS TO destination
    """
    for g, bugs in Q.groupby(pending, max_size=BATCH_SIZE):
        with Timer("Replicate {{num_bugs}} bug versions", {"num_bugs": len(bugs)}):
            data = source.search({
                "query": {"filtered": {
                    "query": {"match_all": {}},
                    "filter": {"and": [
                        {"terms": {"bug_id": set(bugs)}},
                        {"range": {"modified_ts":
                            {"gte": CNV.datetime2milli(last_updated)}
                        }}
                    ]}
                }},
                "from": 0,
                "size": 200000,
                "sort": []
            })

            d2 = map(
                lambda x: {"id": x.id, "value": x},
                map(
                    lambda x: transform_bugzilla.normalize(transform_bugzilla.rename_attachments(x._source), old_school=True),
                    data.hits.hits
                )
            )
            destination.extend(d2)
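
Q.groupby(pending, max_size=BATCH_SIZE) evidently splits the pending list into numbered batches of at most BATCH_SIZE items, yielding the (g, bugs) pairs the loop unpacks. A stdlib-only stand-in under that assumption (groupby_batches is a hypothetical name, not the project's API):

# Hypothetical equivalent of Q.groupby(seq, max_size=n): yields
# (group_number, batch) pairs, each batch holding at most n items.
def groupby_batches(seq, max_size):
    batch, group = [], 0
    for item in seq:
        batch.append(item)
        if len(batch) == max_size:
            yield group, batch
            group, batch = group + 1, []
    if batch:
        yield group, batch

# Usage: for g, bugs in groupby_batches(pending, BATCH_SIZE): ...
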
Example #2
def extract_from_file(source_settings, destination):
    with File(source_settings.filename).iter() as handle:
        for g, d in Q.groupby(handle, size=BATCH_SIZE):
            try:
                d2 = map(
                    lambda x: {"id": x.id, "value": x},
                    map(
                        lambda x: transform_bugzilla.normalize(CNV.JSON2object(fix_json(x))),
                        d
                    )
                )
                destination.add(d2)
            except Exception as e:
                filename="Error_"+Random.hex(20)+".txt"
                File(filename).write(d)
                D.warning("Can not convert block {{block}} (file={{host}})", {"block":g, "filename":filename}, e)
Example #3
def main(settings):
    #USE A FILE
    if settings.source.filename is not None:
        settings.destination.alias = settings.destination.index
        settings.destination.index = settings.destination.alias + CNV.datetime2string(datetime.utcnow(), "%Y%m%d_%H%M%S")
        schema = CNV.JSON2object(File(settings.source.schema_filename).read())

        dest=ElasticSearch.create_index(settings.destination, schema)
        dest.set_refresh_interval(-1)
        extract_from_file(settings.source, dest)
        dest.set_refresh_interval(1)

        dest.delete_all_but(settings.destination.alias, settings.destination.index)
        dest.add_alias(settings.destination.alias)
        return

    # SYNCH WITH source ES INDEX
    source = ElasticSearch(settings.source)
    destination = get_or_create_index(settings["destination"], source)
    last_updated = get_last_updated(destination) - timedelta(days=7)
    pending = get_pending(source, last_updated)

    # pending IS IN {"bug_id":b, "count":c} FORM
    # MAIN ETL LOOP
    for g, bugs in Q.groupby(pending, max_size=BATCH_SIZE):
        data = source.search({
            "query": {"filtered": {
                "query": {"match_all": {}},
                "filter": {"and": [
                    {"terms": {"bug_id": bugs}},
                    {"range": {"modified_ts": {"gte": CNV.datetime2milli(last_updated)}}}
                ]}
            }},
            "from": 0,
            "size": 200000,
            "sort": []
        })

        d2 = map(
            lambda x: {"id": x.id, "value": x},
            map(
                lambda x: transform_bugzilla.normalize(transform_bugzilla.rename_attachments(x)),
                data.hits.hits
            )
        )
        destination.add(d2)
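
The file branch of main (timestamped index, bulk load with refresh disabled, alias flip, then deleting the superseded indexes) is the standard zero-downtime reindex pattern. A sketch of those steps against the Elasticsearch REST API using requests, with a placeholder cluster address and alias name, and the bulk load itself elided:

from datetime import datetime
import requests

ES = "http://localhost:9200"   # placeholder cluster address
alias = "bugs"                 # placeholder alias name
index = alias + datetime.utcnow().strftime("%Y%m%d_%H%M%S")

# 1. Create the timestamped index with refresh paused for the bulk load.
requests.put("%s/%s" % (ES, index),
             json={"settings": {"refresh_interval": "-1"}})
# ... bulk-load documents into `index` here ...
requests.put("%s/%s/_settings" % (ES, index),
             json={"index": {"refresh_interval": "1s"}})

# 2. Atomically point the alias at the new index; readers never see a
#    half-loaded index because they only ever query the alias.
requests.post("%s/_aliases" % ES,
              json={"actions": [{"add": {"index": index, "alias": alias}}]})

Old indexes left behind by previous runs (what delete_all_but does here) can then be dropped with plain DELETE requests.
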
Example #4
def extract_from_file(source_settings, destination):
    with File(source_settings.filename) as handle:
        for g, d in Q.groupby(handle, size=BATCH_SIZE):
            try:
                d2 = map(
                    lambda x: {"id": x.id, "value": x},
                    map(
                        lambda x: transform_bugzilla.normalize(CNV.JSON2object(fix_json(x))),
                        d
                    )
                )
                destination.add(d2)
            except Exception as e:
                filename = "Error_" + unicode(g) + ".txt"
                File(filename).write(d)
                Log.warning("Can not convert block {{block}} (file={{host}})", {
                    "block": g,
                    "filename": filename
                }, e)
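
Log.warning (like D.warning earlier) takes a moustache-style template, a parameter dict, and the caught exception. A minimal stand-in with the same shape, assuming plain {{name}} substitution:

import re

def log_warning(template, params, cause=None):
    # Expand {{name}} placeholders from params, then report, appending
    # the causing exception if one was passed.
    message = re.sub(r"\{\{(\w+)\}\}",
                     lambda m: str(params.get(m.group(1), m.group(0))),
                     template)
    if cause is not None:
        message += " (caused by: %r)" % (cause,)
    print(message)

# log_warning("Can not convert block {{block}}", {"block": 3}, ValueError("bad json"))
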
Example #5
def main(settings):
    #USE A FILE
    if settings.source.filename is not None:
        settings.destination.alias = settings.destination.index
        settings.destination.index = settings.destination.alias + CNV.datetime2string(
            datetime.utcnow(), "%Y%m%d_%H%M%S")
        schema = CNV.JSON2object(File(settings.source.schema_filename).read())

        dest = ElasticSearch.create_index(settings.destination, schema)
        dest.set_refresh_interval(-1)
        extract_from_file(settings.source, dest)
        dest.set_refresh_interval(1)

        dest.delete_all_but(settings.destination.alias,
                            settings.destination.index)
        dest.add_alias(settings.destination.alias)
        return

    # SYNCH WITH source ES INDEX
    source = ElasticSearch(settings.source)
    destination = get_or_create_index(settings["destination"], source)
    last_updated = get_last_updated(destination) - timedelta(days=7)
    pending = get_pending(source, last_updated)

    # pending IS IN {"bug_id":b, "count":c} FORM
    # MAIN ETL LOOP
    for g, bugs in Q.groupby(pending, max_size=BATCH_SIZE):
        data = source.search({
            "query": {
                "filtered": {
                    "query": {
                        "match_all": {}
                    },
                    "filter": {
                        "and": [{
                            "terms": {
                                "bug_id": bugs
                            }
                        }, {
                            "range": {
                                "modified_ts": {
                                    "gte": CNV.datetime2milli(last_updated)
                                }
                            }
                        }]
                    }
                }
            },
            "from": 0,
            "size": 200000,
            "sort": []
        })

        d2 = map(
            lambda x: {"id": x.id, "value": x},
            map(
                lambda x: transform_bugzilla.normalize(
                    transform_bugzilla.rename_attachments(x)),
                data.hits.hits
            )
        )
        destination.add(d2)
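
Note that the filtered query with an and filter is Elasticsearch 1.x syntax; both were deprecated in 2.x and later removed. On a current cluster the same search is expressed as a bool query with non-scoring filter clauses, roughly:

bugs = [123456, 123457]   # placeholder batch of bug ids
last_updated_millis = 0   # placeholder for CNV.datetime2milli(last_updated)

# Modern equivalent of the 1.x "filtered"/"and" query used above.
query = {
    "query": {
        "bool": {
            "filter": [
                {"terms": {"bug_id": bugs}},
                {"range": {"modified_ts": {"gte": last_updated_millis}}}
            ]
        }
    },
    "from": 0,
    "size": 200000,
    "sort": []
}

Also note that "size": 200000 exceeds the default index.max_result_window of 10,000 on modern clusters, so a scroll or search_after pagination would be needed there.
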