def execute(self, context):
    """Bulk-upsert project rows from a CSV file into the datastore.

    Reads ``self.filename`` with all columns as strings, stamps each
    record with ``self.project_owner`` and added/updated timestamps,
    and uploads in chunks of ``self.bulk_size`` to the bulk-upsert
    endpoint.

    Raises:
        RuntimeError: if any chunk upload returns a non-200 status.
    """
    requester = Requester(self.datastore_url, self.access_token)

    def _flush(chunk, total):
        # Upload one chunk and return the new running total.
        total += len(chunk)
        logging.info("Loading {} rows, {} so far".format(len(chunk), total))
        response = requester.upload_chunk(constants.PROJECT_BULK_UPSERT_URL, chunk)
        if response.status_code != 200:
            raise RuntimeError('Bad response attempting to upsert')
        return total

    data = pd.read_csv(self.filename, dtype=str).to_dict('records')
    upload_data = []
    row_count = 0
    for record in data:
        # Single timestamp per record so 'added' and 'updated' agree
        # (original called utcnow() twice, producing slightly different values).
        now = datetime.utcnow().isoformat()
        record['project_owner_id'] = self.project_owner
        record['added'] = now
        record['updated'] = now
        upload_data.append(record)
        if len(upload_data) >= self.bulk_size:
            row_count = _flush(upload_data, row_count)
            upload_data = []

    # leftovers
    if upload_data:
        row_count = _flush(upload_data, row_count)

    logging.info("Loaded {} project rows".format(len(data)))
def execute(self, context):
    """Bulk-upsert project-to-trace mapping rows from a CSV file.

    Fetches the id lists for projects and traces from the datastore,
    translates the CSV's external ids into datastore primary keys, and
    uploads the mappings in chunks of ``self.bulk_size``. Rows whose
    trace or project id is unknown to the datastore are skipped (and
    now counted and logged, instead of being dropped silently).

    Raises:
        RuntimeError: if any chunk upload returns a non-2xx status.
    """
    requester = Requester(self.datastore_url, self.access_token)

    project_ids = {
        d["project_id"]: d["id"]
        for d in requester.get(constants.PROJECT_ID_LIST_URL).json()
    }
    trace_ids = {
        d["trace_id"]: d["id"]
        for d in requester.get(constants.TRACE_ID_LIST_URL).json()
    }

    def _flush(chunk, total):
        # Upload one chunk, failing fast on any non-2xx response.
        response = requester.upload_chunk(
            constants.PROJECT_TRACE_MAPPING_BULK_UPSERT_VERBOSE_URL, chunk)
        if response.status_code < 200 or response.status_code >= 300:
            raise RuntimeError('Bad response attempting to upsert')
        total += len(chunk)
        logging.info("Loaded {} proj-trace maps, {} so far".format(len(chunk), total))
        return total

    raw_matches = pd.read_csv(self.filename, dtype=str).to_dict('records')
    data = []
    row_count = 0
    skipped = 0
    for match in raw_matches:
        trace_id = trace_ids.get(match["trace_id"])
        project_id = project_ids.get(match["project_id"])
        if trace_id is None or project_id is None:
            # Unknown id on either side; the mapping cannot be created.
            skipped += 1
            continue
        data.append({
            "trace_id": trace_id,
            "project_id": project_id,
        })
        if len(data) >= self.bulk_size:
            row_count = _flush(data, row_count)
            data = []

    # leftovers
    if data:
        row_count = _flush(data, row_count)

    if skipped:
        logging.warning("Skipped {} rows with unmatched ids".format(skipped))
    logging.info("Completed loading {} proj-trace map records".format(row_count))
def execute(self, context):
    """Bulk-upsert project metadata key/value rows from a CSV file.

    Every non-empty cell (other than the ``project_id`` column) becomes
    one ``{project_id, key, value}`` record, uploaded in chunks of
    ``self.bulk_size``.

    Raises:
        RuntimeError: if any chunk upload returns a non-200 status.
    """
    requester = Requester(self.datastore_url, self.access_token)

    def _flush(chunk, total):
        # Upload one chunk and return the new running total.
        response = requester.upload_chunk(constants.PROJECT_METADATA_BULK_UPSERT_URL, chunk)
        if response.status_code != 200:
            raise RuntimeError('Bad response attempting to upsert')
        total += len(chunk)
        logging.info("Loading {} rows, {} total so far".format(len(chunk), total))
        return total

    upload_data = []
    rows_loaded = 0
    with open(self.filename, 'r') as f_in:
        reader = csv.DictReader(f_in, skipinitialspace=True)
        for record in reader:
            project_id = record['project_id']
            for key, value in record.items():
                # Skip the id column and empty/missing cells.
                if key == 'project_id':
                    continue
                if value is None or value.strip() == '':
                    continue
                # BUGFIX: the original did value.decode('utf-8').encode('utf-8'),
                # which raises AttributeError on Python 3 (str has no .decode)
                # and is a no-op round trip anyway; csv yields text already.
                upload_data.append({
                    'project_id': project_id,
                    'key': key,
                    'value': value,
                })
                if len(upload_data) >= self.bulk_size:
                    rows_loaded = _flush(upload_data, rows_loaded)
                    upload_data = []

    # upload the leftovers
    if upload_data:
        rows_loaded = _flush(upload_data, rows_loaded)

    logging.info("{} metadata records loaded.".format(rows_loaded))
def execute(self, context):
    """Stream traces and their records from a CSV into the datastore.

    Reads the CSV in chunks of ``self.bulk_size * 10`` rows. For each
    chunk, first upserts the chunk's unique traces (capturing each
    trace's datastore primary key from the verbose response), then
    uploads the individual trace records using the endpoint selected by
    ``self.method`` ("upsert" or "insert").

    Raises:
        RuntimeError: if any upload returns a non-2xx status.
    """
    requester = Requester(self.datastore_url, self.access_token)
    trace_row_count = 0
    trace_record_row_count = 0

    def maybe_float(value):
        # Coerce to float; malformed or missing values become NaN.
        # (Original used a bare except; narrowed to the exceptions
        # float() can actually raise.)
        try:
            return float(value)
        except (TypeError, ValueError):
            return np.nan

    def flush_traces(chunk, pks_by_id, total):
        # Upsert one chunk of traces and collect the returned primary keys.
        response = requester.upload_chunk(constants.TRACE_BULK_UPSERT_VERBOSE_URL, chunk)
        total += len(chunk)
        if response.status_code < 200 or response.status_code >= 300:
            raise RuntimeError('Bad response attempting to upsert traces')
        logging.info("Loaded {} trace rows, {} so far".format(len(chunk), total))
        for record in response.json():
            pks_by_id[record["trace_id"]] = record["id"]
        return total

    def flush_trace_records(chunk, total):
        # Upload one chunk of trace records via the configured method.
        if self.method == "upsert":
            response = requester.upload_chunk(constants.TRACE_RECORD_BULK_UPSERT_URL, chunk)
        elif self.method == "insert":
            response = requester.upload_chunk(constants.TRACE_RECORD_BULK_INSERT_URL, chunk)
        if response.status_code < 200 or response.status_code >= 300:
            raise RuntimeError('Bad response attempting to upsert')
        total += len(chunk)
        logging.info("Loaded {} trace records, {} so far".format(len(chunk), total))
        return total

    for chunk in pd.read_csv(self.filename, dtype=str, chunksize=self.bulk_size * 10):
        try:
            data = chunk.to_dict('records')
        except ValueError:
            # Deliberate best-effort: a malformed chunk ends the load
            # without failing the task (preserves original behavior).
            return True

        unique_traces = list(set(
            (d["trace_id"], d["interpretation"], d["unit"]) for d in data
        ))

        trace_pks_by_id = {}
        upload_data = []
        for trace_id, interpretation, unit in unique_traces:
            upload_data.append({
                "trace_id": trace_id,
                "interpretation": interpretation,
                "unit": unit,
                "added": datetime.utcnow().isoformat(),
                "updated": datetime.utcnow().isoformat(),
            })
            # Trace endpoint seems slow by comparison, so use half-size chunks.
            if len(upload_data) >= self.bulk_size / 2:
                trace_row_count = flush_traces(upload_data, trace_pks_by_id, trace_row_count)
                upload_data = []

        # leftovers
        if upload_data:
            trace_row_count = flush_traces(upload_data, trace_pks_by_id, trace_row_count)
        logging.info("Loaded {} trace rows".format(trace_row_count))

        # NOTE: the original also built a `trace_record_data` list
        # comprehension here that was never used (dead code, removed);
        # the per-record loop below produces the identical payloads.
        upload_data = []
        for record in data:
            upload_data.append({
                "trace_id": trace_pks_by_id[record["trace_id"]],
                "value": maybe_float(record["value"]),
                "start": record["start"],
                "estimated": record["estimated"],
            })
            if len(upload_data) >= self.bulk_size:
                trace_record_row_count = flush_trace_records(upload_data, trace_record_row_count)
                upload_data = []

        # load leftovers
        if upload_data:
            trace_record_row_count = flush_trace_records(upload_data, trace_record_row_count)

    logging.info("Completed loading {} trace records".format(trace_record_row_count))