Example #1
# Assumes module-level helpers from the surrounding codebase:
# get_connection(), get_source_value(), indexName, and logger.
import csv
import json

import elasticsearch.helpers


def query_to_csv(outf, t, body, header_fields, fields, id_field, raw, tabs,
                 id_func):
    # Note: raw is accepted for signature parity with callers but unused here.
    es = get_connection()
    cw = csv.writer(outf, dialect=csv.excel_tab if tabs else csv.excel)

    cw.writerow([id_field] + header_fields)

    # Stream every hit with the scan helper instead of paging manually.
    for r in elasticsearch.helpers.scan(es,
                                        index=indexName,
                                        query=body,
                                        size=1000,
                                        doc_type=t):
        try:
            r_fields = [id_func(r)]
            for k in fields:
                v = get_source_value(r["_source"], k)
                if v is None:
                    r_fields.append("")
                elif isinstance(v, basestring):  # Python 2 str or unicode
                    r_fields.append(v)
                else:
                    # Serialize lists/dicts so they fit in a single cell.
                    r_fields.append(json.dumps(v))
            cw.writerow(r_fields)
        except Exception:
            logger.exception("Error generating csv")
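
A minimal usage sketch for the function above: it streams a match-all query to stdout and assumes the surrounding module provides get_connection() and indexName. The field names and id_func here are illustrative, not the caller's real arguments.

import sys

body = {"query": {"match_all": {}}}
# Tab-separated dump of the "uuid" index field, one row per record,
# keyed by each hit's Elasticsearch _id.
query_to_csv(sys.stdout, "records", body,
             header_fields=["idigbio:uuid"],
             fields=["uuid"],
             id_field="id",
             raw=False,
             tabs=True,
             id_func=lambda r: r["_id"])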
Example #2
# Assumes module-level helpers: get_connection(), queryFromShim(),
# generate_queries(), and indexName.
def get_recordsets(params, generate=True):
    if generate:
        record_query = None
        mediarecord_query = None
        if params["rq"] is not None:
            record_query = queryFromShim(params["rq"])["query"]
        if params["mq"] is not None:
            mediarecord_query = queryFromShim(params["mq"])["query"]
        rq, mq = generate_queries(record_query, mediarecord_query)
    else:
        rq, mq = params["rq"], params["mq"]

    # Both core types share the same aggregation; only the query differs.
    if params["core_type"] == "mediarecords":
        t, inner_query = "mediarecords", mq
    else:
        t, inner_query = "records", rq

    q = {
        "query": inner_query,
        "aggs": {
            "recordsets": {
                "terms": {
                    "field": "recordset",
                    "size": 10000
                }
            }
        }
    }

    es = get_connection()
    ro = es.search(index=indexName, doc_type=t, body=q)
    # Map each recordset bucket key to its document count.
    recsets = {}
    for b in ro["aggregations"]["recordsets"]["buckets"]:
        recsets[b["key"]] = b["doc_count"]
    return (q, recsets)
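
A hedged sketch of calling get_recordsets; the shim query is illustrative and assumes queryFromShim and generate_queries from the surrounding module accept it.

params = {
    "rq": {"genus": "acer"},  # record query in shim syntax (illustrative)
    "mq": None,               # no media-record query
    "core_type": "records",
}
q, recordset_counts = get_recordsets(params)
# recordset_counts maps recordset key -> number of matching documents.
for rsid, count in recordset_counts.items():
    print("%s\t%d" % (rsid, count))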
Example #3
# Assumes module-level helpers: get_connection(), get_source_value(),
# index_field_to_longname, indexName, and logger.
import csv
from collections import Counter

import elasticsearch.helpers


def query_to_uniquevals(outf, t, body, val_field, tabs, val_func):
    es = get_connection()
    cw = csv.writer(outf, dialect=csv.excel_tab if tabs else csv.excel)

    # Raw ("data.*") fields keep their short name; indexed fields map to
    # their long, namespaced name.
    if val_field.startswith("data."):
        ifn = val_field.split(".")[-1]
    else:
        ifn = index_field_to_longname[t][val_field]

    cw.writerow(["id", ifn, "idigbio:itemCount"])

    values = Counter()
    for r in elasticsearch.helpers.scan(es,
                                        index=indexName,
                                        query=body,
                                        size=1000,
                                        doc_type=t):
        v = get_source_value(r["_source"], val_field)
        try:
            if v is None:
                values[""] += 1
            else:
                if val_field.lower().endswith("scientificname"):
                    # Normalize case so name variants collapse together.
                    v = v.capitalize()
                values[v] += 1
        except Exception:
            logger.exception("Error generating uniquevals")

    # One row per distinct value, most frequent first.
    for k, v in values.most_common():
        cw.writerow([val_func(k), k, v])
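
A sketch of calling query_to_uniquevals directly, under the same module assumptions; it writes to stdout with str as a stand-in val_func (real callers pass a helper such as identifiy_locality, as in Example #7).

import sys

body = {"_source": ["scientificname"], "query": {"match_all": {}}}
# Count distinct scientific names across all records; rows come out
# most frequent first.
query_to_uniquevals(sys.stdout, "records", body,
                    val_field="scientificname",
                    tabs=False,
                    val_func=str)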
Example #4
def search(index=indexName, body=None, doc_type=typeName, es=None):
    if es is None:
        # Import lazily so the indexer is only pulled in when needed.
        from idb.indexing.indexer import get_connection
        es = get_connection()
    return es.search(index=index, body=body, doc_type=doc_type)
Example #5
def index(index=indexName, body=None, doc_type=typeName, es=None):
    if es is None:
        # Import lazily, as in search() above.
        from idb.indexing.indexer import get_connection
        es = get_connection()
    return es.index(index=index, doc_type=doc_type, body=body)
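
A sketch of the two wrappers together, assuming indexName and typeName point at a live cluster; the document body is illustrative, and a refresh may be needed before the search sees the new document.

index(body={"scientificname": "acer rubrum"})
results = search(body={"query": {"match": {"scientificname": "acer rubrum"}}})
print(results["hits"]["total"])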
Example #6
def runQuery(query):
    # Search records and mediarecords in one request; Elasticsearch accepts
    # a comma-separated doc_type list.
    return get_connection().search(index=get_indexname(),
                                   doc_type="records,mediarecords",
                                   body=query)
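
For example, a size-0 body counts matches across both document types without fetching hits (assuming get_connection and get_indexname are configured):

result = runQuery({
    "query": {"term": {"genus": "acer"}},
    "size": 0,
})
print(result["hits"]["total"])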
Example #7
def make_file(t,
              query,
              raw=False,
              tabs=False,
              fields=None,
              core_type="records",
              core_source="indexterms",
              file_prefix="",
              final_filename=""):
    file_extension = ".tsv" if tabs else ".csv"
    final_filename = final_filename + file_extension
    # Core file: type matches the requested core type and the raw flag
    # matches whether the core source is "raw".
    core = t == core_type and raw == (core_source == "raw")

    id_func, core_id_field = type_core_type_ids[(core_type, t, core_source)]

    outfile_name = file_prefix + t + file_extension
    if raw:
        outfile_name = file_prefix + t + ".raw" + file_extension
    logger.debug("Creating %r", outfile_name)

    if t in ["records", "mediarecords"]:
        id_field = "id"
        if not core:
            id_field = "coreid"

        exclude_from_fields = ["data"]
        if raw:
            exclude_from_fields = ["id", "coreid"]

        es = get_connection()
        mapping = es.indices.get_mapping(index=indexName, doc_type=t)
        # get_mapping returns {index_name: {...}}; take the single index entry.
        mapping_root = list(mapping.values())[0]["mappings"][t]["properties"]
        if raw:
            mapping_root = mapping_root["data"]["properties"]

        if fields is None:
            fields = []
            for f in mapping_root:
                if f not in exclude_from_fields and acceptable_field_name(f):
                    if raw:
                        fields.append("data." + f)
                    else:
                        fields.append(f)
            fields = sorted(fields)
        elif len(fields) == 0:
            return None

        if raw:
            # Strip the leading "data." to recover the verbatim field names.
            converted_fields = [f[5:] for f in fields]
        else:
            converted_fields = []
            filtered_fields = []
            for f in fields:
                if f in index_field_to_longname[t]:
                    converted_fields.append(index_field_to_longname[t][f])
                    filtered_fields.append(f)
            fields = filtered_fields

        meta_block = make_file_block(filename=final_filename,
                                     core=core,
                                     tabs=tabs,
                                     fields=converted_fields,
                                     t=t)

        if core_id_field is not None:
            fields_include = fields + [core_id_field]
        else:
            fields_include = fields

        body = {"_source": fields_include, "query": query}

        with AtomicFile(outfile_name, "wb") as outf:
            query_to_csv(outf, t, body, converted_fields, fields, id_field,
                         raw, tabs, id_func)
        return FileArtifact(outfile_name, final_filename, meta_block)
    elif t.startswith("unique"):
        if t == "uniquelocality":
            unique_field = "locality"
            if raw:
                unique_field = "data.dwc:locality"
        elif t == "uniquenames":
            unique_field = "scientificname"
            if raw:
                unique_field = "data.dwc:scientificName"

        body = {"_source": [unique_field], "query": query}

        converted_fields = None
        if unique_field.startswith("data."):
            converted_fields = [unique_field[5:], "idigbio:itemCount"]
        else:
            converted_fields = [
                index_field_to_longname["records"][unique_field],
                "idigbio:itemCount"
            ]

        meta_block = make_file_block(filename=final_filename,
                                     core=core,
                                     tabs=tabs,
                                     fields=converted_fields,
                                     t=t)

        with AtomicFile(outfile_name, "wb") as outf:
            query_to_uniquevals(outf, "records", body, unique_field, tabs,
                                identifiy_locality)
        return FileArtifact(outfile_name, final_filename, meta_block)
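
A hedged sketch of driving make_file; the query clause and paths are illustrative, and the returned FileArtifact is assumed to carry the output filename, final filename, and meta block as constructed above.

artifact = make_file("records",
                     {"term": {"genus": "acer"}},  # inner query clause
                     tabs=True,
                     core_type="records",
                     core_source="indexterms",
                     file_prefix="/tmp/dl-",       # illustrative prefix
                     final_filename="occurrence")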
Example #8
def count_query(t, query):
    es = get_connection()
    # es.count returns {"count": n, ...}; extract just the number.
    return es.count(index=indexName, doc_type=t, body=query)["count"]
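
Usage is a one-liner; note the body wraps the query clause in a top-level "query" key, matching what the count API expects.

n = count_query("records", {"query": {"term": {"genus": "acer"}}})
print(n)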