def query_to_csv(outf, t, body, header_fields, fields, id_field, raw, tabs, id_func):
    es = get_connection()
    if tabs:
        cw = csv.writer(outf, dialect=csv.excel_tab)
    else:
        cw = csv.writer(outf)
    cw.writerow([id_field] + header_fields)

    for r in elasticsearch.helpers.scan(es, index=indexName, query=body, size=1000, doc_type=t):
        try:
            r_fields = [id_func(r)]
            for k in fields:
                v = get_source_value(r["_source"], k)
                if v is not None:
                    # Strings pass through as-is; any other value is serialized to JSON.
                    if isinstance(v, (str, unicode)):
                        r_fields.append(v)
                    else:
                        r_fields.append(json.dumps(v))
                else:
                    r_fields.append("")
            cw.writerow(r_fields)
        except Exception:
            logger.exception("Error generating csv")

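# Usage sketch (hypothetical field names and id_func): query_to_csv expects an
# open binary file, a doc type, and a search body whose "_source" list matches
# the fields being written. The id_func here just pulls the Elasticsearch
# document id; the real callers look one up in type_core_type_ids.
def _example_query_to_csv():
    fields = ["uuid", "scientificname", "country"]
    header_fields = ["idigbio:uuid", "dwc:scientificName", "dwc:country"]
    body = {"_source": fields, "query": {"match_all": {}}}
    with open("example_records.csv", "wb") as outf:
        query_to_csv(outf, "records", body, header_fields, fields,
                     "id", False, False, lambda r: r["_id"])
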
def get_recordsets(params, generate=True):
    rq, mq = None, None
    if generate:
        record_query = None
        mediarecord_query = None

        if params["rq"] is not None:
            record_query = queryFromShim(params["rq"])["query"]

        if params["mq"] is not None:
            mediarecord_query = queryFromShim(params["mq"])["query"]

        rq, mq = generate_queries(record_query, mediarecord_query)
    else:
        rq = params["rq"]
        mq = params["mq"]

    if params["core_type"] == "mediarecords":
        t = "mediarecords"
        q = {
            "query": mq,
            "aggs": {
                "recordsets": {
                    "terms": {
                        "field": "recordset",
                        "size": 10000
                    }
                }
            }
        }
    else:
        t = "records"
        q = {
            "query": rq,
            "aggs": {
                "recordsets": {
                    "terms": {
                        "field": "recordset",
                        "size": 10000
                    }
                }
            }
        }

    es = get_connection()
    ro = es.search(index=indexName, doc_type=t, body=q)

    recsets = {}
    for b in ro["aggregations"]["recordsets"]["buckets"]:
        recsets[b["key"]] = b["doc_count"]

    return (q, recsets)

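# Usage sketch (hypothetical shim query): count matching records per recordset.
# Assumes params follows the shape the download API passes in: "rq"/"mq" hold
# shim-style queries (or None) and "core_type" selects which index type to
# aggregate over.
def _example_get_recordsets():
    params = {
        "rq": {"genus": "acer"},
        "mq": None,
        "core_type": "records",
    }
    q, recordset_counts = get_recordsets(params)
    # recordset_counts maps recordset key -> matching document count
    return q, recordset_counts
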
def query_to_uniquevals(outf, t, body, val_field, tabs, val_func):
    es = get_connection()
    if tabs:
        cw = csv.writer(outf, dialect=csv.excel_tab)
    else:
        cw = csv.writer(outf)

    # Header column name: the long name for index fields, or the last dotted
    # component (e.g. "dwc:locality") for raw "data." fields.
    if val_field.startswith("data."):
        ifn = val_field.split(".")[-1]
    else:
        ifn = index_field_to_longname[t][val_field]
    cw.writerow(["id", ifn, "idigbio:itemCount"])

    values = Counter()
    for r in elasticsearch.helpers.scan(es, index=indexName, query=body, size=1000, doc_type=t):
        source = get_source_value(r["_source"], val_field)
        try:
            if source is not None:
                v = source
                if val_field.lower().endswith("scientificname"):
                    v = v.capitalize()
                values[v] += 1
            else:
                values[""] += 1
        except Exception:
            logger.exception("Error generating uniquevals")

    for k, v in values.most_common():
        cw.writerow([val_func(k), k, v])

def search(index=indexName, body=None, doc_type=typeName, es=None):
    if es is None:
        from idb.indexing.indexer import get_connection
        es = get_connection()
    return es.search(index=index, body=body, doc_type=doc_type)

def index(index=indexName, body=None, doc_type=typeName, es=None):
    if es is None:
        from idb.indexing.indexer import get_connection
        es = get_connection()
    return es.index(index=index, doc_type=doc_type, body=body)

def runQuery(query):
    return get_connection().search(
        index=get_indexname(), doc_type="records,mediarecords", body=query)

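# Usage sketch (hypothetical query body and placeholder recordset id): runQuery
# searches the records and mediarecords doc types together, so the body is a
# plain Elasticsearch search body.
def _example_runQuery():
    query = {
        "query": {"term": {"recordset": "00000000-0000-0000-0000-000000000000"}},
        "size": 10,
    }
    return runQuery(query)
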
def make_file(t, query, raw=False, tabs=False, fields=None, core_type="records",
              core_source="indexterms", file_prefix="", final_filename=""):
    file_extension = ".tsv" if tabs else ".csv"
    final_filename = final_filename + file_extension

    # This file is the core of the archive when its type matches the requested
    # core type and its rawness agrees with the requested core source.
    core = t == core_type and raw == (core_source == "raw")
    id_func, core_id_field = type_core_type_ids[(core_type, t, core_source)]

    outfile_name = file_prefix + t + file_extension
    if raw:
        outfile_name = file_prefix + t + ".raw" + file_extension

    logger.debug("Creating %r", outfile_name)

    if t in ["records", "mediarecords"]:
        id_field = "id"
        if not core:
            id_field = "coreid"

        exclude_from_fields = ["data"]
        if raw:
            exclude_from_fields = ["id", "coreid"]

        es = get_connection()
        mapping = es.indices.get_mapping(index=indexName, doc_type=t)
        mapping_root = mapping.values()[0]["mappings"][t]["properties"]
        if raw:
            mapping_root = mapping_root["data"]["properties"]

        if fields is None:
            # No explicit field list: take every acceptable field from the mapping.
            fields = []
            for f in mapping_root:
                if f not in exclude_from_fields and acceptable_field_name(f):
                    if raw:
                        fields.append("data." + f)
                    else:
                        fields.append(f)
            fields = sorted(fields)
        elif len(fields) == 0:
            return None

        if raw:
            # Strip the leading "data." prefix for the output header.
            converted_fields = [f[5:] for f in fields]
        else:
            converted_fields = []
            filtered_fields = []
            for f in fields:
                if f in index_field_to_longname[t]:
                    converted_fields.append(index_field_to_longname[t][f])
                    filtered_fields.append(f)
            fields = filtered_fields

        meta_block = make_file_block(filename=final_filename, core=core,
                                     tabs=tabs, fields=converted_fields, t=t)

        if core_id_field is not None:
            fields_include = fields + [core_id_field]
        else:
            fields_include = fields

        body = {"_source": fields_include, "query": query}

        with AtomicFile(outfile_name, "wb") as outf:
            query_to_csv(outf, t, body, converted_fields, fields, id_field,
                         raw, tabs, id_func)
        return FileArtifact(outfile_name, final_filename, meta_block)

    elif t.startswith("unique"):
        if t == "uniquelocality":
            unique_field = "locality"
            if raw:
                unique_field = "data.dwc:locality"
        elif t == "uniquenames":
            unique_field = "scientificname"
            if raw:
                unique_field = "data.dwc:scientificName"

        body = {"_source": [unique_field], "query": query}

        if unique_field.startswith("data."):
            converted_fields = [unique_field[5:], "idigbio:itemCount"]
        else:
            converted_fields = [
                index_field_to_longname["records"][unique_field],
                "idigbio:itemCount"
            ]

        meta_block = make_file_block(filename=final_filename, core=core,
                                     tabs=tabs, fields=converted_fields, t=t)

        with AtomicFile(outfile_name, "wb") as outf:
            query_to_uniquevals(outf, "records", body, unique_field, tabs,
                                identifiy_locality)
        return FileArtifact(outfile_name, final_filename, meta_block)

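# Usage sketch (hypothetical query and paths): build a CSV of index terms for
# records matching an Elasticsearch query clause. make_file returns a
# FileArtifact, or None when an explicit empty field list is passed.
def _example_make_file():
    query = {"term": {"genus": "acer"}}
    artifact = make_file(
        "records", query,
        raw=False, tabs=False,
        core_type="records", core_source="indexterms",
        file_prefix="/tmp/example-", final_filename="occurrence")
    return artifact
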
def count_query(t, query):
    es = get_connection()
    return es.count(index=indexName, doc_type=t, body=query)["count"]

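# Usage sketch (hypothetical query): count_query wraps the Elasticsearch count
# API, so the body takes the same {"query": ...} shape as a search body.
def _example_count_query():
    return count_query("records", {"query": {"exists": {"field": "geopoint"}}})
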