def crawlResourceForSubject(resourceName, pat_ids, collection, key, value, name, resource_val_path):
    # Dynamically load the fhirclient model class for the requested resource
    try:
        resource = getattr(importlib.import_module("fhirclient.models." + resourceName.lower()), resourceName)
    except Exception:
        logger.error("Resource " + resourceName + " does not exist", exc_info=1)
        raise

    # Perform the FHIR search
    try:
        if resourceName == 'Patient':
            serverSearchParams = {"_id": pat_ids}
        else:
            serverSearchParams = {"patient": pat_ids, key: value}
        search = resource.where(serverSearchParams)
        ret = search.perform(server.server)
    except Exception:
        logger.error("Search failed", exc_info=1)
        raise

    if ret.entry is None or len(ret.entry) == 0:
        logger.info("No values found for search for patients " + pat_ids + " on resource " + resourceName)
        return

    insert_list = []
    next_page = True
    while next_page:
        for entry_elem in ret.entry:
            ret_element = entry_elem.resource
            element = resource.as_json(ret_element)
            element["_id"] = str(ObjectId())
            element["feature"] = value
            element["name"] = name if name is not None else value

            if resourceName == "Patient":
                element["patient_id"] = pat_ids
            elif resourceName == "Condition":
                element["patient_id"] = ret_element.patient.reference.replace("Patient/", "")
            else:
                element["patient_id"] = ret_element.subject.reference.replace("Patient/", "")

            if resource_val_path is not None:
                element["resource_val_path"] = resource_val_path

            insert_list.append(element)

        # Follow the bundle's "next" link for paginated results
        if len(ret.link) < 2 or ret.link[1].relation != "next":
            next_page = False
            break

        res = server.server.request_json(ret.link[1].url)
        ret = bundle.Bundle(res)

    mongodbConnection.get_db()[collection].insert_many(insert_list)

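# Hypothetical usage sketch (argument values are assumptions, not taken from the
# repository): pat_ids is a comma-separated string of patient ids (executeCrawlerJob
# builds it with ','.join(...)), key/value are a FHIR search parameter and its value,
# and resource_val_path points at the value element inside the resource.
#
# crawlResourceForSubject(
#     resourceName="Observation",
#     pat_ids="123,456",
#     collection=crawler_id,              # results go into the per-job MongoDB collection
#     key="code",
#     value="http://loinc.org|55284-4",
#     name="blood pressure",
#     resource_val_path="valueQuantity/value",
# )
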
def insert_resource_config(resource_name, resource_value_relative_path, sort_order):
    mongodbConnection.get_db().resourceConfig.find_one_and_delete({"_id": resource_name})
    mongodbConnection.get_db().resourceConfig.insert_one({
        "_id": resource_name,
        "resource_value_relative_path": resource_value_relative_path,
        "sort_order": sort_order,
        "resource_name": resource_name
    })

    ret = mongodbConnection.get_db().resourceConfig.find_one({"_id": resource_name}, {"_id": False})
    resourceLoader.writeResource(ret)
    return ret

def setUp(self):
    # app_context() is necessary because mongodbConnection uses Flask functionality
    # TODO: it would be better to start MongoDB from here instead of relying on it already being started
    with api.app.app_context():
        mongodbConnection.get_db()[self.aggregatorTestTable].delete_many({})

        with open(self.crawler_data_path, "r") as crawler_data:
            crawler_data_json = json.loads(crawler_data.read())
        # insert_many assumes the test data file contains a list of documents
        mongodbConnection.get_db()[self.aggregatorTestTable].insert_many(crawler_data_json)

        with open(self.crawler_job_path, "r") as job_info:
            self.job_info_json = json.loads(job_info.read())

def get(self, crawler_id):
    args = parser.parse_args()
    aggregation_type = args["aggregation_type"]
    output_type = args["output_type"]

    crawlerJob = mongodbConnection.get_db().crawlerJobs.find_one({"_id": crawler_id})

    if crawlerJob is None:
        return "Crawler Job not found", 404
    if crawlerJob["status"] != "finished":
        return "Crawler Job has not finished yet", 404
    if aggregation_type.lower() not in allowedAggregationTypes:
        return "Wrong aggregation type provided: " + aggregation_type, 400
    if aggregation_type.lower() == "all" and output_type == "csv":
        return "Time series not supported", 400

    aggregator = Aggregator(crawler_id, aggregation_type.lower(),
                            crawlerJob["feature_set"], crawlerJob["resource_configs"])
    aggregator.aggregate()

    if output_type == "csv":
        return Response(aggregator.getCSVOfAggregated(), mimetype='text/csv')
    elif output_type == "json":
        return aggregator.getRestructured()
    else:
        return aggregator.getAggregated()

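# Example request against this endpoint (the URL prefix and route are hypothetical;
# only the two query parameters parsed above are taken from the code):
#
#   GET /aggregation/<crawler_id>?aggregation_type=latest&output_type=csv
#
# aggregation_type must be one of allowedAggregationTypes; output_type selects
# between csv, json (restructured) and the raw aggregated result.
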
def loadResources():
    searchDir = os.path.join(os.path.dirname(__file__), '../fhir_resource_configs')

    for f in os.listdir(searchDir):
        path = os.path.join(searchDir, f)
        if not os.path.isfile(path):
            continue
        try:
            with open(path, 'r') as config_file:
                file_content = json.loads(config_file.read())

            if file_content.get("resource_name") is None or file_content.get("resource_val_path") is None:
                raise ValueError('Wrong format of file. Must contain fields "resource_name" and "resource_val_path".')

            mongodbConnection.get_db().resourceConfig.find_one_and_delete({"_id": file_content["resource_name"]})
            mongodbConnection.get_db().resourceConfig.insert_one({
                "_id": file_content["resource_name"],
                "resource_val_path": file_content["resource_val_path"],
                "sort_order": file_content["sort_order"],
                "resource_name": file_content["resource_name"],
                "key_path": file_content.get("key_path"),
                "key": file_content.get("key"),
            })
            logger.info("Added resource " + file_content["resource_name"] + " of file " + path + " to db.")
        except Exception:
            logger.error("Reading resource file " + path + " failed. Skipping.", exc_info=1)
            continue

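# A minimal sketch of a config file that loadResources() accepts (the file name and
# the concrete FHIR paths are assumptions for illustration; only the field names are
# taken from the code above). "resource_name" and "resource_val_path" are validated
# explicitly, "sort_order" is read directly as well, and "key_path"/"key" are
# optional and stored as None when absent:
#
#   ../fhir_resource_configs/Observation.json
#   {
#       "resource_name": "Observation",
#       "resource_val_path": "valueQuantity/value",
#       "sort_order": ["effectiveDateTime"],
#       "key_path": "code/coding/code",
#       "key": "code"
#   }
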
def crawlObservationForSubject(subject, collection, key, name):
    url_params = {"_pretty": "true", "subject": subject, "_format": "json", "_count": 100, key: name}
    next_page = configuration.HAPIFHIR_URL + "Observation" + '?' + urllib.parse.urlencode(url_params)

    all_entries = []
    while next_page is not None:
        try:
            response = requests.get(next_page)
        except Exception:
            # this avoids "connection refused" when too many variables are requested
            time.sleep(10)
            continue

        response_json = response.json()

        if "entry" not in response_json:
            return
        entries = response_json["entry"]

        if len(response_json["link"]) > 1 and response_json["link"][1]["relation"] == "next":
            next_page = response_json["link"][1]["url"]
        else:
            next_page = None

        all_entries += entries

    observations = []
    for entry in all_entries:
        reducer = ObservationReducer(entry["resource"])
        reduced = reducer.getReduced()
        #patient = reducer.getEntity()
        observations.append(reduced)

    mongodbConnection.get_db()[collection].find_one_and_update(
        {"_id": subject},
        {"$push": {"observations": {"$each": observations}}},
        {"resource": "Observation"},
        upsert=True
    )

def process_search_results(ret, resource_name, values, code_inf_map, resource_configs, key_path, collection):
    insert_list = []
    next_page = True
    while next_page:
        for entry_elem in ret.entry:
            if resource_name == 'Patient':
                process_patient_resource(insert_list, entry_elem, values, code_inf_map, resource_configs)
            else:
                process_resource(insert_list, entry_elem, resource_name, key_path, values, code_inf_map, resource_configs)

        # Follow the bundle's "next" link for paginated results
        if len(ret.link) < 2 or ret.link[1].relation != "next":
            next_page = False
            break

        res = server.server.request_json(ret.link[1].url)
        ret = bundle.Bundle(res)

    mongodbConnection.get_db()[collection].insert_many(insert_list)

def run(self):
    with self.app.app_context():
        while True:
            next_job = mongodbConnection.get_db().crawlerJobs.find_one({"status": "queued"})
            if next_job is None:
                time.sleep(self.interval)
                continue

            logger.info("executing new job")
            crawler.executeCrawlerJob(next_job)

def createCrawlerJob(crawler_id, crawler_status, patient_ids, feature_set, aggregation_type, resource_configs):
    from api import api

    if isinstance(patient_ids, str):
        patient_ids = [patient_ids]

    url_params = {"output_type": "csv", "aggregation_type": aggregation_type}
    url = "http://" + configuration.HOSTEXTERN + ":" + str(configuration.WSPORT) \
        + api.url_for(aggregationResource.Aggregation, crawler_id=crawler_id) \
        + "?" + urllib.parse.urlencode(url_params)

    crawlerJob = {
        "_id": crawler_id,
        "patient_ids": patient_ids,
        "feature_set": feature_set,
        "resource_configs": resource_configs,
        "status": crawler_status,
        "finished": [],
        "queued_time": str(datetime.now()),
        "start_time": None,
        "url": url
    }

    mongodbConnection.get_db().crawlerJobs.insert_one(crawlerJob)
    return crawlerJob

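# Hypothetical usage sketch (values are assumptions; the structure of feature_set
# and resource_configs is defined elsewhere in the crawler and not shown here):
#
# job = createCrawlerJob(
#     crawler_id=str(uuid.uuid4()),
#     crawler_status="queued",          # picked up by the background worker in run()
#     patient_ids=["123", "456"],       # a single id string is also accepted
#     feature_set=feature_set,
#     aggregation_type="latest",
#     resource_configs=None,
# )
# # job["url"] points at the aggregation endpoint with output_type=csv preselected.
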
def executeCrawlerJob(crawlerJob):
    mongodbConnection.get_db().crawlerJobs.update_one(
        {"_id": crawlerJob["_id"]},
        {"$set": {"status": "running", "start_time": str(datetime.now())}})

    try:
        resource_map = create_resource_map(crawlerJob["feature_set"])
        pat_ids = ','.join(crawlerJob["patient_ids"])

        for resource_name, resource in resource_map.items():
            crawlResourceGroupsForSubjects(resource_name, pat_ids, crawlerJob["_id"],
                                           resource['feature_list'], resource['feature_maps'],
                                           crawlerJob['resource_configs'])
            #crawlResourceGroupsForSubjects(resource_name, pat_ids, crawlerJob["_id"], feature["value"], feature.get('name'), feature.get('resource_val_path'))

        mongodbConnection.get_db().crawlerJobs.update_one(
            {"_id": crawlerJob["_id"]},
            {"$set": {"status": "finished", "end_time": str(datetime.now())}})
    except Exception as e:
        print("error executing crawler", e, file=sys.stderr)
        traceback.print_exc()
        logger.error("Execution of Crawler " + crawlerJob["_id"] + " failed", exc_info=1)
        mongodbConnection.get_db().crawlerJobs.update_one(
            {"_id": crawlerJob["_id"]},
            {"$set": {"status": "error", "end_time": str(datetime.now())}})
        return "error"

def get(self):
    return list(mongodbConnection.get_db().resourceConfig.find({}, {"_id": False}))

def delete(self):
    ret = mongodbConnection.get_db().crawlerJobs.delete_many({})
    return ret.deleted_count

def getResourceConfig(resource_name, resource_configs):
    # If no matching config was provided in the crawler job, fall back to the config stored in MongoDB
    resource_configs = [] if resource_configs is None else resource_configs
    return next((c for c in resource_configs if c["resource_name"] == resource_name),
                mongodbConnection.get_db().resourceConfig.find_one({"_id": resource_name}))

def get(self, crawler_id):
    return mongodbConnection.get_db().crawlerJobs.find_one({"_id": crawler_id})

def get(self):
    print(request.environ['REMOTE_ADDR'], file=sys.stderr)
    return list(mongodbConnection.get_db().crawlerJobs.find())

def remove_resource_config(resource_name):
    resourceLoader.deleteResource(resource_name)
    return mongodbConnection.get_db().resourceConfig.find_one_and_delete({"_id": resource_name}, {"_id": False})

def delete(self, crawler_id):
    ret = mongodbConnection.get_db().crawlerJobs.delete_many({"_id": crawler_id})
    return ret.deleted_count

def aggregateObservations(self):
    mongorequest = [
        {"$unwind": "$observations"},
        {"$group": {"_id": {"attribute": "$observations.attribute", "patient_id": "$_id"},
                    "entry": {"$push": "$$CURRENT.observations"}}},
        {"$unwind": "$entry"},
        {"$sort": {"entry.timestamp": 1}}
    ]

    if self.aggregation_type == "" or self.aggregation_type == "all":
        mongorequest += [
            {"$group": {"_id": "$_id", "observations": {"$push": "$entry"}}},
            {"$group": {"_id": "$_id.patient_id", "observations": {"$push": "$$CURRENT.observations"}}}
        ]
    elif self.aggregation_type == "latest" or self.aggregation_type == "oldest":
        tmp = "first" if self.aggregation_type == "oldest" else "last"
        mongorequest += [
            {"$group": {"_id": "$_id", "observations": {"$" + tmp: "$entry"}}},
            {"$group": {"_id": "$_id.patient_id", "observations": {"$push": "$$CURRENT.observations"}}}
        ]
    elif self.aggregation_type == "avg":
        mongorequest += [
            {"$group": {"_id": "$_id", "attribute": {"$first": "$_id.attribute"},
                        "observations": {"$avg": "$entry.value"}}},
            {"$group": {"_id": "$_id.patient_id",
                        "observations": {"$push": {"avg": "$$CURRENT.observations", "attribute": "$_id.attribute"}}}}
        ]
    else:
        return None

    result = list(mongodbConnection.get_db()[self.crawler_id].aggregate(mongorequest))
    for res in result:
        res["resourceType"] = "Observation"

    self.aggregatedElements.extend(result)

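# Sketch of the per-patient document shape this pipeline assumes (field names are
# inferred from the stages above; the concrete values are made up):
#
#   {"_id": "<patient_id>",
#    "observations": [
#        {"attribute": "bodyweight", "timestamp": "2017-01-01T00:00:00", "value": 80},
#        {"attribute": "bodyweight", "timestamp": "2018-01-01T00:00:00", "value": 82}
#    ]}
#
# With aggregation_type "latest"/"oldest" only the newest/oldest entry per
# (patient, attribute) pair survives the sort on entry.timestamp; with "avg" the
# pipeline emits {"avg": ..., "attribute": ...} per attribute instead.
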
def aggregateFeature(self, resource, selection):
    resource_config = self.getResourceConfig(resource)

    mongorequest = [{"$match": {"feature": selection, "resourceType": resource}}]
    foundSearchPath = False

    allElementsForFeature = list(mongodbConnection.get_db()[self.crawler_id].aggregate(
        mongorequest + [{"$group": {"_id": None, "count": {"$sum": 1}}}]))

    if len(allElementsForFeature) == 0:
        raise ValueError("Feature " + selection + " of resource " + resource + " has no elements.")

    numAllElementsForFeature = allElementsForFeature[0]["count"]

    if resource_config["sort_order"] is not None and resource_config["sort_order"] != "None":
        # Before actually sorting, check whether every single element has the attribute
        # that should be sorted by -> throw an error below if none of the paths does
        for sortPath in resource_config["sort_order"]:
            mongoSortPath = ".".join(sortPath.split("/"))  # change "/" to "."

            elementsWithPath = list(mongodbConnection.get_db()[self.crawler_id].aggregate(
                mongorequest + [{"$match": {mongoSortPath: {"$exists": True}}},
                                {"$group": {"_id": None, "count": {"$sum": 1}}}]))

            if len(elementsWithPath) == 0:
                continue

            numElementsWithPath = elementsWithPath[0]["count"]

            # Check if every element has the attribute search path
            if numAllElementsForFeature == numElementsWithPath:
                mongorequest += [{"$sort": {mongoSortPath: 1}}]
                foundSearchPath = True
                break
    else:
        logger.warning("No sort order provided.")
        foundSearchPath = True

    if foundSearchPath:
        tmp = "first" if self.aggregation_type == "oldest" else "last"
        mongorequest += [
            {"$group": {"_id": "$patient_id", "elements": {"$push": "$$CURRENT"}}},
            {"$group": {"_id": "$_id", "elements": {"$first": "$elements"}}}
        ]

        sortedFeature = list(mongodbConnection.get_db()[self.crawler_id].aggregate(mongorequest))

        if self.aggregation_type == "" or self.aggregation_type == "all":
            self.aggregatedElements.extend(sortedFeature)
        elif self.aggregation_type == "latest":
            for res in sortedFeature:
                res["resourceType"] = resource
            self.aggregatedElements.extend(sortedFeature)
            #self.aggregatedElements.append(sortedFeature[-1])
        elif self.aggregation_type == "oldest":
            self.aggregatedElements.append(sortedFeature[0])
    else:
        raise ValueError("Elements have different fields to sort! Sorting not possible.")

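# Example of how sort_order is interpreted (the FHIR paths are hypothetical): with a
# resource config such as {"sort_order": ["effectivePeriod/start", "effectiveDateTime"]},
# the first path that exists on every element of the feature is turned into a MongoDB
# sort key by replacing "/" with "." (e.g. "effectivePeriod.start"), and the elements
# are sorted ascending on it before the per-patient selection above is applied.
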