Example #1
    def execute(self, context):
        requester = Requester(self.datastore_url, self.access_token)
        data = pd.read_csv(self.filename, dtype=str).to_dict('records')

        upload_data = []
        row_count = 0
        for record in data:
            record['project_owner_id'] = self.project_owner
            record['added'] = datetime.utcnow().isoformat()
            record['updated'] = datetime.utcnow().isoformat()
            upload_data.append(record)

            if len(upload_data) >= self.bulk_size:
                row_count += len(upload_data)
                logging.info("Loading {} rows, {} so far".format(len(upload_data), row_count))
                response = requester.upload_chunk(constants.PROJECT_BULK_UPSERT_URL, upload_data)
                upload_data = []
                if response.status_code != 200:
                    raise RuntimeError('Bad response attempting to upsert')

        # leftovers
        if len(upload_data) > 0:
            row_count += len(upload_data)
            logging.info("Loading {} rows, {} so far".format(len(upload_data), row_count))
            response = requester.upload_chunk(constants.PROJECT_BULK_UPSERT_URL, upload_data)
            if response.status_code != 200:
                raise RuntimeError('Bad response attempting to upsert')

        logging.info("Loaded {} project rows".format(len(data)))
Example #2
def bulk_load_trace_csv(f, method="upsert", skip_trace_records=False):
    requester = Requester(config.oeem.url, config.oeem.access_token)

    try:
        data = pd.read_csv(f, dtype=str).to_dict('records')
    except ValueError:
        # pandas raises EmptyDataError (a ValueError subclass)
        # for an empty file, which is ok
        return True

    unique_traces = list(set([
        (d["trace_id"], d["interpretation"], d["unit"], d.get("interval", None)) for d in data
    ]))

    trace_data = [
        {
            "trace_id": trace[0],
            "interpretation": trace[1],
            "unit": trace[2],
            "interval": trace[3],
            "added": datetime.utcnow().isoformat(),
            "updated": datetime.utcnow().isoformat(),
        } for trace in unique_traces
    ]

    trace_response = requester.post(
        constants.TRACE_BULK_UPSERT_VERBOSE_URL, trace_data)

    if skip_trace_records:
        return trace_response.status_code < 300

    trace_pks_by_id = {
        record["trace_id"]: record["id"]
        for record in trace_response.json()
    }

    def maybe_float(value):
        try:
            return float(value)
        except (TypeError, ValueError):
            return np.nan

    trace_record_data = [
        {
            "trace_id": trace_pks_by_id[record["trace_id"]],
            "value": maybe_float(record["value"]),
            "start": record["start"],
            "estimated": record["estimated"],
        }
        for record in data
    ]

    if method == "upsert":
        trace_record_response = requester.post(
            constants.TRACE_RECORD_BULK_UPSERT_URL, trace_record_data)
    elif method == "insert":
        trace_record_response = requester.post(
            constants.TRACE_RECORD_BULK_INSERT_URL, trace_record_data)
    else:
        raise ValueError("Unknown method: {}".format(method))
    return trace_record_response.status_code == 200
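A minimal usage sketch for the loader above, assuming `config` and `constants` are set up as in the other examples. The column set (trace_id, interpretation, unit, interval, value, start, estimated) is inferred from the fields the function reads; the concrete values are hypothetical.

import io

csv_body = io.StringIO(
    "trace_id,interpretation,unit,interval,value,start,estimated\n"
    "T1,ELECTRICITY_CONSUMPTION_SUPPLIED,KWH,hourly,1.5,2017-01-01T00:00:00,False\n"
    "T1,ELECTRICITY_CONSUMPTION_SUPPLIED,KWH,hourly,,2017-01-01T01:00:00,True\n"
)
ok = bulk_load_trace_csv(csv_body, method="upsert")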
Example #3
def bulk_load_project_metadata_csv(f):
    requester = Requester(config.oeem.url, config.oeem.access_token)

    input_data = read_csv_file(f)

    if len(input_data) == 0:
        print("No data to upload.")
        return True

    # auto-detect wide/tall (pivoted, unpivoted) format
    columns = input_data[0].keys()
    data = []
    if set(columns) == set(['project_id', 'key', 'value']):
        # tall format
        for row in input_data:
            key = row['key']
            value = row['value']
            if value is None or value.strip() == '':
                continue
            data.append({
                'project_id': row['project_id'],
                'key': key,
                'value': value,
            })
    else:
        # wide format
        for row in input_data:
            for key, value in row.items():
                if value is None:
                    continue
                if value.strip() == '':
                    continue
                if key == 'project_id':
                    continue
                data.append({
                    'project_id': row['project_id'],
                    'key': key,
                    'value': value,
                })

    n = len(data)
    batch_size = 500
    n_batches = (n + batch_size - 1) // batch_size  # ceiling division
    print(
        "Uploading {} rows of metadata in {} batches of up to {}"
        .format(n, n_batches, batch_size)
    )

    success = []
    for batch in tqdm(batches(data, batch_size)):
        response = requester.post(constants.PROJECT_METADATA_BULK_UPSERT_URL, batch)
        success.append(response.status_code == 200)
    return all(success)
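The `batches` helper is not shown in these examples; below is a minimal sketch consistent with how it is called here (yield the data in lists of at most `batch_size` records).

def batches(data, batch_size):
    for i in range(0, len(data), batch_size):
        yield data[i:i + batch_size]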
Example #4
def bulk_load_trace_blacklist(f):
    requester = Requester(config.oeem.url, config.oeem.access_token)

    input_data = read_csv_file(f)

    successes = []
    for batch in tqdm(batches(input_data, 500)):
        response = requester.post(
            constants.TRACE_BLACKLIST_UPSERT_VERBOSE_URL, batch)

        successes.append(response.status_code == 201)

    return all(successes)
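`read_csv_file` is likewise not defined in these examples; a plausible stand-in that matches its usage (return the CSV rows as a list of dicts), assuming the argument is a file path:

import csv

def read_csv_file(path):
    with open(path, 'r') as f_in:
        return list(csv.DictReader(f_in, skipinitialspace=True))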
Example #5
def bulk_load_project_csv(f):
    requester = Requester(config.oeem.url, config.oeem.access_token)
    data = pd.read_csv(f, dtype=str).to_dict('records')

    for record in data:
        # have to patch in project owner field from config
        record['project_owner_id'] = config.oeem.project_owner

        # have to patch in fields that are normally autopopulated
        record['added'] = datetime.utcnow().isoformat()
        record['updated'] = datetime.utcnow().isoformat()

    # only support upsert for now
    response = requester.post(constants.PROJECT_BULK_UPSERT_URL, data)
    return response.status_code == 200
Example #6
def bulk_load_project_trace_mapping_csv(f):
    requester = Requester(config.oeem.url, config.oeem.access_token)

    trace_ids = {d["trace_id"]: d["id"] for d in loaded_trace_ids()}
    project_ids = {d["project_id"]: d["id"] for d in loaded_project_ids()}

    raw_matches = pd.read_csv(f, dtype=str).to_dict('records')

    data = []
    for match in raw_matches:
        trace_id = trace_ids.get(match["trace_id"], None)
        project_id = project_ids.get(match["project_id"], None)
        if trace_id is not None and project_id is not None:
            data.append({
                "trace_id": trace_id,
                "project_id": project_id
            })

    successes = []
    for batch in tqdm(batches(data, 800)):
        response = requester.post(
            constants.PROJECT_TRACE_MAPPING_BULK_UPSERT_VERBOSE_URL, batch)
        successes.append(response.status_code == 201)

    return all(successes)
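The `loaded_trace_ids` and `loaded_project_ids` helpers used above are defined in Examples #9 and #10 below.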
Example #7
    def execute(self, context):
        upload_data = []
        rows_loaded = 0
        requester = Requester(self.datastore_url, self.access_token)
        with open(self.filename, 'r') as f_in:
            reader = csv.DictReader(f_in, skipinitialspace=True)
            for record in reader:
                project_id = record['project_id']
                for key, value in record.items():
                    if value is None:
                        continue
                    if value.strip() == '':
                        continue
                    if key == 'project_id':
                        continue
                    upload_data.append({
                        'project_id': project_id,
                        'key': key,
                        'value': value,
                    })

                    if len(upload_data) >= self.bulk_size:
                        response = requester.upload_chunk(constants.PROJECT_METADATA_BULK_UPSERT_URL, upload_data)
                        if response.status_code != 200:
                            raise RuntimeError('Bad response attempting to upsert')
                        rows_loaded += len(upload_data)
                        logging.info("Loading {} rows, {} total so far".format(len(upload_data), rows_loaded))
                        upload_data = []

        # upload the leftovers
        if len(upload_data) > 0:
            response = requester.upload_chunk(constants.PROJECT_METADATA_BULK_UPSERT_URL, upload_data)
            if response.status_code != 200:
                raise RuntimeError('Bad response attempting to upsert')
            rows_loaded += len(upload_data)
        logging.info("{} metadata records loaded.".format(rows_loaded))
Example #8
    def execute(self, context):
        requester = Requester(self.datastore_url, self.access_token)

        response = requester.get(constants.PROJECT_ID_LIST_URL)
        loaded_project_ids = response.json()

        response = requester.get(constants.TRACE_ID_LIST_URL)
        loaded_trace_ids = response.json()

        trace_ids = {d["trace_id"]: d["id"] for d in loaded_trace_ids}
        project_ids = {d["project_id"]: d["id"] for d in loaded_project_ids}

        raw_matches = pd.read_csv(self.filename, dtype=str).to_dict('records')

        data = []
        row_count = 0
        for match in raw_matches:
            trace_id = trace_ids.get(match["trace_id"], None)
            project_id = project_ids.get(match["project_id"], None)

            if trace_id is not None and project_id is not None:
                data.append({
                    "trace_id": trace_id,
                    "project_id": project_id,
                })
                if len(data) >= self.bulk_size:
                    response = requester.upload_chunk(constants.PROJECT_TRACE_MAPPING_BULK_UPSERT_VERBOSE_URL, data)
                    if response.status_code < 200 or response.status_code >= 300:
                        raise RuntimeError('Bad response attempting to upsert')
                    row_count += len(data)
                    logging.info("Loaded {} proj-trace maps, {} so far".format(len(data), row_count))
                    data = []

        # leftovers
        if len(data) > 0:
            response = requester.upload_chunk(constants.PROJECT_TRACE_MAPPING_BULK_UPSERT_VERBOSE_URL, data)
            if response.status_code < 200 or response.status_code >= 300:
                raise RuntimeError('Bad response attempting to upsert')
            row_count += len(data)
            logging.info("Loaded {} proj-trace maps, {} so far".format(len(data), row_count))

        logging.info("Completed loading {} proj-trace map records".format(row_count))
Example #9
def loaded_project_ids():
    requester = Requester(config.oeem.url, config.oeem.access_token)
    response = requester.get(constants.PROJECT_ID_LIST_URL)
    return response.json()
Example #10
def loaded_trace_ids():
    requester = Requester(config.oeem.url, config.oeem.access_token)
    response = requester.get(constants.TRACE_ID_LIST_URL)
    return response.json()
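None of these examples define `Requester`; the following is a minimal sketch of the client they assume, based only on the calls made above (get, post, and upload_chunk, each returning a requests-style response). The Bearer authorization header is an assumption.

import requests

class Requester(object):

    def __init__(self, base_url, access_token):
        self.base_url = base_url.rstrip('/')
        # assumed auth scheme; the real client may differ
        self.headers = {'Authorization': 'Bearer {}'.format(access_token)}

    def get(self, url):
        return requests.get(self.base_url + url, headers=self.headers)

    def post(self, url, data):
        return requests.post(self.base_url + url, json=data,
                             headers=self.headers)

    # the operator-style examples call this name for chunked uploads
    upload_chunk = post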
Example #11
    def execute(self, context):
        requester = Requester(self.datastore_url, self.access_token)

        trace_row_count = 0
        trace_record_row_count = 0

        for chunk in pd.read_csv(self.filename, dtype=str, chunksize=self.bulk_size * 10):
            data = chunk.to_dict('records')

            unique_traces = list(set([
                (d["trace_id"], d["interpretation"], d["unit"]) for d in data
            ]))

            upload_data = []
            trace_pks_by_id = {}

            for trace in unique_traces:
                upload_data.append({
                    "trace_id": trace[0],
                    "interpretation": trace[1],
                    "unit": trace[2],
                    "added": datetime.utcnow().isoformat(),
                    "updated": datetime.utcnow().isoformat(),
                })

                if len(upload_data) >= self.bulk_size / 2: # trace endpoint seems slow by comparison
                    trace_response = requester.upload_chunk(constants.TRACE_BULK_UPSERT_VERBOSE_URL, upload_data)
                    trace_row_count += len(upload_data)

                    if trace_response.status_code < 200 or trace_response.status_code >= 300:
                        raise RuntimeError('Bad response attempting to upsert traces')

                    logging.info("Loaded {} trace rows, {} so far".format(len(upload_data), trace_row_count))
                    upload_data = []

                    for record in trace_response.json():
                        trace_pks_by_id[record["trace_id"]] = record["id"]

            # leftovers
            if len(upload_data) > 0:
                trace_response = requester.upload_chunk(constants.TRACE_BULK_UPSERT_VERBOSE_URL, upload_data)
                trace_row_count += len(upload_data)

                if trace_response.status_code < 200 or trace_response.status_code >= 300:
                    raise RuntimeError('Bad response attempting to upsert traces')

                logging.info("Loaded {} trace rows, {} so far".format(len(upload_data), trace_row_count))
                upload_data = []

                for record in trace_response.json():
                    trace_pks_by_id[record["trace_id"]] = record["id"]

            logging.info("Loaded {} trace rows".format(trace_row_count))

            def maybe_float(value):
                try:
                    return float(value)
                except (TypeError, ValueError):
                    return np.nan

            upload_data = []
            for record in data:
                trace_record = {
                    "trace_id": trace_pks_by_id[record["trace_id"]],
                    "value": maybe_float(record["value"]),
                    "start": record["start"],
                    "estimated": record["estimated"],
                }

                upload_data.append(trace_record)
                if len(upload_data) >= self.bulk_size:
                    if self.method == "upsert":
                        response = requester.upload_chunk(constants.TRACE_RECORD_BULK_UPSERT_URL, upload_data)
                    elif self.method == "insert":
                        response = requester.upload_chunk(constants.TRACE_RECORD_BULK_INSERT_URL, upload_data)
                    else:
                        raise ValueError("Unknown method: {}".format(self.method))
                    if response.status_code < 200 or response.status_code >= 300:
                        raise RuntimeError('Bad response attempting to upsert')
                    trace_record_row_count += len(upload_data)
                    logging.info("Loaded {} trace records, {} so far".format(len(upload_data), trace_record_row_count))
                    upload_data = []

            # load leftovers
            if len(upload_data) > 0:
                if self.method == "upsert":
                    response = requester.upload_chunk(constants.TRACE_RECORD_BULK_UPSERT_URL, upload_data)
                elif self.method == "insert":
                    response = requester.upload_chunk(constants.TRACE_RECORD_BULK_INSERT_URL, upload_data)
                else:
                    raise ValueError("Unknown method: {}".format(self.method))
                if response.status_code < 200 or response.status_code >= 300:
                    raise RuntimeError('Bad response attempting to upsert')
                trace_record_row_count += len(upload_data)
                logging.info("Loaded {} trace records, {} so far".format(len(upload_data), trace_record_row_count))

            logging.info("Completed loading {} trace records".format(trace_record_row_count))