def get_existing_ids(es, settings, branches):
    #FIND WHAT'S IN ES
    bad_ids = []
    int_ids = set()

    # DEFAULT: EVERY RECORD IN ES COUNTS AS EXISTING
    demand_pushlog = {"match_all": {}}
    if branches:
        # WHEN BRANCHES ARE GIVEN, ONLY COUNT RECORDS ALREADY ANNOTATED WITH
        # PUSHLOG INFO (push_date) OR FLAGGED AS HAVING NONE (no_pushlog);
        # ALL OTHERS WILL BE PULLED AGAIN
        demand_pushlog = {"or": [
            {"not": {"missing": {"field": "test_build.push_date"}}},
            {"not": {"missing": {"field": "test_build.no_pushlog"}}}
        ]}

    if settings.elasticsearch.debug and settings.production.step < 10:
        # DEBUG RUNS COVER ONLY A SMALL RANGE; SIMPLY RELOAD IT ALL
        return set()

    with ESQuery(es) as esq:
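        # THE HIGHEST datazilla.id IN THE INDEX BOUNDS THE SCAN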
        max_id = esq.query({
            "from": es.settings.alias,
            "select": {"value": "datazilla.id", "aggregate": "max"}
        })

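        # SCAN THE id SPACE IN FIXED-SIZE INTERVALS SO EACH FACET REQUEST STAYS BOUNDED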
        interval_size = 200000
        for mini, maxi in Q.intervals(settings.production.min, max_id+interval_size, interval_size):
            existing_ids = es.search({
                "query": {
                    "filtered": {
                        "query": {"match_all": {}},
                        "filter": {"and": [
                            {"range": {"datazilla.id": {"gte": mini, "lt": maxi}}},
                            demand_pushlog
                        ]}
                    }
                },
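                # size=0: RETURN NO DOCUMENTS, ONLY FACET COUNTS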
                "from": 0,
                "size": 0,
                "sort": [],
                "facets": {
                    "ids": {"terms": {"field": "datazilla.id", "size": interval_size}}
                }
            })

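            # FACET TERMS COME BACK AS STRINGS; COLLECT THE INTEGERS, REPORT THE REST AS BAD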
            for t in existing_ids.facets.ids.terms:
                try:
                    int_ids.add(int(t.term))
                except Exception, e:
                    bad_ids.append(t.term)

        existing_ids = int_ids
        Log.println("Number of ids in ES: " + str(len(existing_ids)))
        Log.println("BAD ids in ES: " + str(bad_ids))
        return existing_ids
    def __del__(self):
        try:
            Log.println("Branches missing from pushlog:\n{{list}}", {"list": self.unknown_branches})
        except Exception, e:
            pass
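
        # SKETCH OF THE ELIDED ENCLOSING LOOP (input_file IS A HYPOTHETICAL
        # NAME; all, output_file, arrays_add, MINIMUM_ID AND MINIMUM_DATE
        # COME FROM THE ELIDED ENCLOSING SCOPE)
        for line in input_file:
            try: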
                if len(line.strip()) == 0:
                    continue

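                # FIRST COLUMN IS THE NUMERIC datazilla id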
                col = line.split("\t")
                id = int(col[0])
                if id < MINIMUM_ID:
                    continue

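                # SOME LINES APPARENTLY CARRY AN EXTRA NUMERIC COLUMN BEFORE THE JSON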
                json = col[1]
                if Math.is_number(json):
                    json = col[2]
                data = CNV.JSON2object(json).json_blob
                date = CNV.unix2datetime(data.testrun.date)

                if id % 1000 == 0:
                    Log.println("loading id " + str(id) + " date: " + CNV.datetime2string(date, "%Y-%m-%d %H:%M:%S"))

                if date < MINIMUM_DATE:
                    continue

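                # LOAD EACH id ONLY ONCE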
                if id in all:
                    continue
                all.add(id)

                arrays_add(id, "[" + data.test_build.branch + "][" + data.testrun.suite + "]", data)
                output_file.write(str(id) + "\t" + json)
            except Exception, e:
                Log.warning("can not process line:\n\t" + line, e)

        smallest = min(all)  # min(*all) WOULD FAIL ON A SINGLE-ELEMENT SET
        Log.println("First id >= date: {{min}}", {"min": smallest})
    try:
        if content.startswith("Id not found"):
            Log.note("{{id}} not found {{url}}", {"id": id, "url": url})
            # MISSING ids BELOW max_id ARE EXPECTED HOLES; PAST max_id WE ARE
            # PRESUMABLY OFF THE END, SO REPORT FAILURE
            return id < max_id

        data = CNV.JSON2object(content.decode('utf-8'))
        content = CNV.object2JSON(data)  #ENSURE content HAS NO crlf

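        # A test_run_id PRESUMABLY MARKS A VALID RESULT BLOB; TRANSFORM AND SINK IT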
        if data.test_run_id:
            Log.println("Add {{id}} for revision {{revision}} ({{bytes}} bytes)", {
                "id": id,
                "revision": data.json_blob.test_build.revision,
                "bytes": len(content)
            })
            with Profiler("transform"):
                result = transformer.transform(id, data)

            if result:
                Log.println("{{num}} records to add", {
                    "num": len(result)
                })
                es_sink.extend({"value": d} for d in result)

            file_sink.add(str(id) + "\t" + content + "\n")
        elif data.error_flag == 'Y':
            error = data.json_blob
            error.datazilla = data