import csv
import logging
from datetime import datetime

import numpy as np
import pandas as pd
from tqdm import tqdm

# Requester, constants, config, read_csv_file, and batches are
# project-local; reference sketches of Requester and the CSV helpers
# appear further down.


def execute(self, context):
    requester = Requester(self.datastore_url, self.access_token)
    data = pd.read_csv(self.filename, dtype=str).to_dict('records')
    upload_data = []
    row_count = 0
    for record in data:
        # Patch in the owner and the normally auto-populated timestamps.
        record['project_owner_id'] = self.project_owner
        record['added'] = datetime.utcnow().isoformat()
        record['updated'] = datetime.utcnow().isoformat()
        upload_data.append(record)
        if len(upload_data) >= self.bulk_size:
            row_count += len(upload_data)
            logging.info("Loading {} rows, {} so far".format(
                len(upload_data), row_count))
            response = requester.upload_chunk(
                constants.PROJECT_BULK_UPSERT_URL, upload_data)
            if response.status_code != 200:
                raise RuntimeError('Bad response attempting to upsert')
            upload_data = []

    # leftovers
    if len(upload_data) > 0:
        row_count += len(upload_data)
        logging.info("Loading {} rows, {} so far".format(
            len(upload_data), row_count))
        response = requester.upload_chunk(
            constants.PROJECT_BULK_UPSERT_URL, upload_data)
        if response.status_code != 200:
            raise RuntimeError('Bad response attempting to upsert')

    logging.info("Loaded {} project rows".format(len(data)))

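# The Requester used by every loader here is defined elsewhere. The
# sketch below is for reference only and reflects how it is called in
# this module: the constructor takes a base URL and an access token, and
# get, post, and upload_chunk return requests-style responses exposing
# .status_code and .json(). The Authorization header format is an
# assumption, not the datastore's documented scheme.
import requests


class Requester(object):

    def __init__(self, base_url, access_token):
        self.base_url = base_url.rstrip('/')
        # Assumed token scheme; adjust to the datastore's actual auth.
        self.headers = {'Authorization': 'Bearer {}'.format(access_token)}

    def get(self, path):
        return requests.get(self.base_url + path, headers=self.headers)

    def post(self, path, data):
        return requests.post(
            self.base_url + path, json=data, headers=self.headers)

    # upload_chunk is called interchangeably with post for bulk payloads.
    def upload_chunk(self, path, data):
        return self.post(path, data)
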
def bulk_load_trace_csv(f, method="upsert", skip_trace_records=False):
    requester = Requester(config.oeem.url, config.oeem.access_token)

    try:
        data = pd.read_csv(f, dtype=str).to_dict('records')
    except ValueError:
        # Assume this is an empty-file error, which is ok.
        return True

    # Upsert each distinct trace once, however many records it has.
    unique_traces = list(set([
        (d["trace_id"], d["interpretation"], d["unit"], d.get("interval", None))
        for d in data
    ]))

    trace_data = [
        {
            "trace_id": trace[0],
            "interpretation": trace[1],
            "unit": trace[2],
            "interval": trace[3],
            "added": datetime.utcnow().isoformat(),
            "updated": datetime.utcnow().isoformat(),
        }
        for trace in unique_traces
    ]

    trace_response = requester.post(
        constants.TRACE_BULK_UPSERT_VERBOSE_URL, trace_data)

    if skip_trace_records:
        return trace_response.status_code < 300

    # Map natural trace ids to the primary keys returned by the
    # verbose upsert.
    trace_pks_by_id = {
        record["trace_id"]: record["id"]
        for record in trace_response.json()
    }

    def maybe_float(value):
        try:
            return float(value)
        except (TypeError, ValueError):
            return np.nan

    trace_record_data = [
        {
            "trace_id": trace_pks_by_id[record["trace_id"]],
            "value": maybe_float(record["value"]),
            "start": record["start"],
            "estimated": record["estimated"],
        }
        for record in data
    ]

    if method == "upsert":
        trace_record_response = requester.post(
            constants.TRACE_RECORD_BULK_UPSERT_URL, trace_record_data)
    elif method == "insert":
        trace_record_response = requester.post(
            constants.TRACE_RECORD_BULK_INSERT_URL, trace_record_data)
    else:
        raise ValueError("Unknown method: {}".format(method))

    return trace_record_response.status_code == 200

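# Hypothetical invocation of the loader above, assuming a local CSV with
# the columns it reads (trace_id, interpretation, unit, optional
# interval, plus start, value, and estimated); the filename is
# illustrative.
#
#     with open("traces.csv") as f:
#         ok = bulk_load_trace_csv(f, method="upsert")
#         if not ok:
#             raise SystemExit("trace load failed")
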
def bulk_load_project_metadata_csv(f):
    requester = Requester(config.oeem.url, config.oeem.access_token)
    input_data = read_csv_file(f)

    if len(input_data) == 0:
        print("No data to upload.")
        return True

    # Auto-detect tall (unpivoted) vs. wide (pivoted) format.
    columns = input_data[0].keys()
    data = []
    if set(columns) == set(['project_id', 'key', 'value']):
        # tall format: one key/value pair per row
        for row in input_data:
            key = row['key']
            value = row['value']
            if value is None or value.strip() == '':
                continue
            data.append({
                'project_id': row['project_id'],
                'key': key,
                'value': value,
            })
    else:
        # wide format: one column per metadata key
        for row in input_data:
            for key, value in row.items():
                if value is None or value.strip() == '':
                    continue
                if key == 'project_id':
                    continue
                data.append({
                    'project_id': row['project_id'],
                    'key': key,
                    'value': value,
                })

    n = len(data)
    batch_size = 500
    n_batches = (n + batch_size - 1) // batch_size  # round up
    print(
        "Uploading {} rows of metadata in {} batches of {}"
        .format(n, n_batches, batch_size)
    )

    success = []
    for batch in tqdm(batches(data, batch_size)):
        response = requester.post(
            constants.PROJECT_METADATA_BULK_UPSERT_URL, batch)
        success.append(response.status_code == 200)
    return all(success)

def bulk_load_trace_blacklist(f):
    requester = Requester(config.oeem.url, config.oeem.access_token)
    input_data = read_csv_file(f)
    successes = []
    for batch in tqdm(batches(input_data, 500)):
        response = requester.post(
            constants.TRACE_BLACKLIST_UPSERT_VERBOSE_URL, batch)
        successes.append(response.status_code == 201)
    return all(successes)

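# read_csv_file and batches are referenced throughout this module but
# defined elsewhere; the sketches below are inferred only from how they
# are called here (a list of header-keyed row dicts, and fixed-size
# slices of a list).
def read_csv_file(f):
    # Sketch: return every row as a dict keyed by the CSV header.
    reader = csv.DictReader(f, skipinitialspace=True)
    return list(reader)


def batches(items, batch_size):
    # Sketch: yield consecutive slices of batch_size items; the last
    # slice may be shorter.
    for i in range(0, len(items), batch_size):
        yield items[i:i + batch_size]
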
def bulk_load_project_csv(f):
    requester = Requester(config.oeem.url, config.oeem.access_token)
    data = pd.read_csv(f, dtype=str).to_dict('records')
    for record in data:
        # have to patch in project owner field from config
        record['project_owner_id'] = config.oeem.project_owner
        # have to patch in fields that are normally autopopulated
        record['added'] = datetime.utcnow().isoformat()
        record['updated'] = datetime.utcnow().isoformat()
    # only support upsert for now
    response = requester.post(constants.PROJECT_BULK_UPSERT_URL, data)
    return response.status_code == 200

def execute(self, context):
    requester = Requester(self.datastore_url, self.access_token)

    # Map natural ids to datastore primary keys.
    response = requester.get(constants.PROJECT_ID_LIST_URL)
    loaded_project_ids = response.json()
    response = requester.get(constants.TRACE_ID_LIST_URL)
    loaded_trace_ids = response.json()
    trace_ids = {d["trace_id"]: d["id"] for d in loaded_trace_ids}
    project_ids = {d["project_id"]: d["id"] for d in loaded_project_ids}

    raw_matches = pd.read_csv(self.filename, dtype=str).to_dict('records')

    data = []
    row_count = 0
    for match in raw_matches:
        trace_id = trace_ids.get(match["trace_id"], None)
        project_id = project_ids.get(match["project_id"], None)
        # Skip matches whose trace or project has not been loaded yet.
        if trace_id is not None and project_id is not None:
            data.append({
                "trace_id": trace_id,
                "project_id": project_id,
            })
        if len(data) >= self.bulk_size:
            response = requester.upload_chunk(
                constants.PROJECT_TRACE_MAPPING_BULK_UPSERT_VERBOSE_URL, data)
            if response.status_code < 200 or response.status_code >= 300:
                raise RuntimeError('Bad response attempting to upsert')
            row_count += len(data)
            logging.info("Loaded {} proj-trace maps, {} so far".format(
                len(data), row_count))
            data = []

    # leftovers
    if len(data) > 0:
        response = requester.upload_chunk(
            constants.PROJECT_TRACE_MAPPING_BULK_UPSERT_VERBOSE_URL, data)
        if response.status_code < 200 or response.status_code >= 300:
            raise RuntimeError('Bad response attempting to upsert')
        row_count += len(data)
        logging.info("Loaded {} proj-trace maps, {} so far".format(
            len(data), row_count))

    logging.info("Completed loading {} proj-trace map records".format(row_count))

def bulk_load_project_trace_mapping_csv(f):
    requester = Requester(config.oeem.url, config.oeem.access_token)

    trace_ids = {d["trace_id"]: d["id"] for d in loaded_trace_ids()}
    project_ids = {d["project_id"]: d["id"] for d in loaded_project_ids()}

    raw_matches = pd.read_csv(f, dtype=str).to_dict('records')

    data = []
    for match in raw_matches:
        trace_id = trace_ids.get(match["trace_id"], None)
        project_id = project_ids.get(match["project_id"], None)
        if trace_id is not None and project_id is not None:
            data.append({
                "trace_id": trace_id,
                "project_id": project_id,
            })

    # Track success per batch so one failed batch fails the whole load.
    successes = []
    for batch in tqdm(batches(data, 800)):
        response = requester.post(
            constants.PROJECT_TRACE_MAPPING_BULK_UPSERT_VERBOSE_URL, batch)
        successes.append(response.status_code == 201)
    return all(successes)

def execute(self, context):
    upload_data = []
    rows_loaded = 0
    requester = Requester(self.datastore_url, self.access_token)

    with open(self.filename, 'r') as f_in:
        reader = csv.DictReader(f_in, skipinitialspace=True)
        for record in reader:
            project_id = record['project_id']
            for key, value in record.items():
                if value is None or value.strip() == '':
                    continue
                if key == 'project_id':
                    continue
                upload_data.append({
                    'project_id': project_id,
                    'key': key,
                    'value': value,
                })
                if len(upload_data) >= self.bulk_size:
                    response = requester.upload_chunk(
                        constants.PROJECT_METADATA_BULK_UPSERT_URL, upload_data)
                    if response.status_code != 200:
                        raise RuntimeError('Bad response attempting to upsert')
                    rows_loaded += len(upload_data)
                    logging.info("Loading {} rows, {} total so far".format(
                        len(upload_data), rows_loaded))
                    upload_data = []

    # upload the leftovers
    if len(upload_data) > 0:
        response = requester.upload_chunk(
            constants.PROJECT_METADATA_BULK_UPSERT_URL, upload_data)
        if response.status_code != 200:
            raise RuntimeError('Bad response attempting to upsert')
        rows_loaded += len(upload_data)

    logging.info("{} metadata records loaded.".format(rows_loaded))

def loaded_project_ids():
    requester = Requester(config.oeem.url, config.oeem.access_token)
    response = requester.get(constants.PROJECT_ID_LIST_URL)
    return response.json()

def loaded_trace_ids():
    requester = Requester(config.oeem.url, config.oeem.access_token)
    response = requester.get(constants.TRACE_ID_LIST_URL)
    return response.json()

def execute(self, context):
    requester = Requester(self.datastore_url, self.access_token)
    trace_row_count = 0
    trace_record_row_count = 0

    def maybe_float(value):
        try:
            return float(value)
        except (TypeError, ValueError):
            return np.nan

    for chunk in pd.read_csv(
            self.filename, dtype=str, chunksize=self.bulk_size * 10):
        try:
            data = chunk.to_dict('records')
        except ValueError:
            return True

        unique_traces = list(set([
            (d["trace_id"], d["interpretation"], d["unit"])
            for d in data
        ]))

        upload_data = []
        trace_pks_by_id = {}
        for trace in unique_traces:
            upload_data.append({
                "trace_id": trace[0],
                "interpretation": trace[1],
                "unit": trace[2],
                "added": datetime.utcnow().isoformat(),
                "updated": datetime.utcnow().isoformat(),
            })
            # trace endpoint seems slow by comparison, so use half-size chunks
            if len(upload_data) >= self.bulk_size // 2:
                trace_response = requester.upload_chunk(
                    constants.TRACE_BULK_UPSERT_VERBOSE_URL, upload_data)
                trace_row_count += len(upload_data)
                if trace_response.status_code < 200 or trace_response.status_code >= 300:
                    raise RuntimeError('Bad response attempting to upsert traces')
                logging.info("Loaded {} trace rows, {} so far".format(
                    len(upload_data), trace_row_count))
                upload_data = []
                for record in trace_response.json():
                    trace_pks_by_id[record["trace_id"]] = record["id"]

        # leftovers
        if len(upload_data) > 0:
            trace_response = requester.upload_chunk(
                constants.TRACE_BULK_UPSERT_VERBOSE_URL, upload_data)
            trace_row_count += len(upload_data)
            if trace_response.status_code < 200 or trace_response.status_code >= 300:
                raise RuntimeError('Bad response attempting to upsert traces')
            logging.info("Loaded {} trace rows, {} so far".format(
                len(upload_data), trace_row_count))
            upload_data = []
            for record in trace_response.json():
                trace_pks_by_id[record["trace_id"]] = record["id"]

        logging.info("Loaded {} trace rows".format(trace_row_count))

        upload_data = []
        for record in data:
            upload_data.append({
                "trace_id": trace_pks_by_id[record["trace_id"]],
                "value": maybe_float(record["value"]),
                "start": record["start"],
                "estimated": record["estimated"],
            })
            if len(upload_data) >= self.bulk_size:
                if self.method == "upsert":
                    response = requester.upload_chunk(
                        constants.TRACE_RECORD_BULK_UPSERT_URL, upload_data)
                elif self.method == "insert":
                    response = requester.upload_chunk(
                        constants.TRACE_RECORD_BULK_INSERT_URL, upload_data)
                else:
                    raise ValueError("Unknown method: {}".format(self.method))
                if response.status_code < 200 or response.status_code >= 300:
                    raise RuntimeError('Bad response attempting to upsert')
                trace_record_row_count += len(upload_data)
                logging.info("Loaded {} trace records, {} so far".format(
                    len(upload_data), trace_record_row_count))
                upload_data = []

        # load leftovers
        if len(upload_data) > 0:
            if self.method == "upsert":
                response = requester.upload_chunk(
                    constants.TRACE_RECORD_BULK_UPSERT_URL, upload_data)
            elif self.method == "insert":
                response = requester.upload_chunk(
                    constants.TRACE_RECORD_BULK_INSERT_URL, upload_data)
            else:
                raise ValueError("Unknown method: {}".format(self.method))
            if response.status_code < 200 or response.status_code >= 300:
                raise RuntimeError('Bad response attempting to upsert')
            trace_record_row_count += len(upload_data)
            logging.info("Loaded {} trace records, {} so far".format(
                len(upload_data), trace_record_row_count))

    logging.info("Completed loading {} trace records".format(trace_record_row_count))