Example #1
def load_signatureset_json_iter(data, ksize=None, ignore_md5sum=False, ijson=ijson):
    """
    - data: file handle (or file handle-like) object
    - ksize:
    - ignore_md5sum:
    - ijson: ijson backend
    """

    parser = ijson.parse(data)

    prefix, event, value = next(parser)
    assert prefix == '' and event == 'start_array' and value is None

    n = 0  # number of signatures parsed so far
    while True:
        try:
            sig = load_signature_json(parser,
                                      prefix_item='item.signatures.item.mins.item',
                                      ignore_md5sum=ignore_md5sum,
                                      ijson=ijson)
            if not ksize or ksize == sig.minhash.ksize:
                yield sig
        except ValueError:
            # possible end of the array of signatures
            try:
                prefix, event, value = next(parser)
                assert event == 'end_array'
            except StopIteration:
                pass
            finally:
                break
        n += 1
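
All of these examples build on the same primitive: ijson.parse yields a
stream of (prefix, event, value) triples. A minimal, self-contained sketch of
what that event stream looks like (the sample document is invented for
illustration):

import io
import ijson

doc = io.BytesIO(b'{"signatures": [{"ksize": 31, "mins": [1, 2]}]}')
for prefix, event, value in ijson.parse(doc):
    print(prefix, event, value)
# prints ('', 'start_map', None), ('', 'map_key', 'signatures'),
# ('signatures', 'start_array', None), and so on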
Example #3
def process_tests_json(content_metadata, custom_metadata):
    sys.stdout.write('[')
    ijson = get_json_parser()
    # parser = ijson.parse(open('c:\jenkins-stories.json', mode='rb'))
    parser = ijson.parse(sys.stdin.buffer)
    count = 0
    test_execution = None
    expected_key = None
    for prefix, event, value in parser:
        # we only care about suites cases, don't care about root object or suites properties
        if prefix == 'suites.item.cases.item':
            if event == 'start_map':
                test_execution = {}
            elif event == 'end_map':
                if count > 0:
                    sys.stdout.write(',')
                process_test_execution(content_metadata, custom_metadata, test_execution)
                count = count + 1
            elif event == 'map_key':
                expected_key = value
        else:
            if expected_key is not None and prefix == ('suites.item.cases.item.' + expected_key):
                test_execution[expected_key] = value
            expected_key = None
        # print(prefix + ':' + event + ':' + str(value))
    sys.stdout.write(']')
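
Tracking map_key/value pairs by hand, as above, keeps memory constant, but
when each object should be rebuilt in full, ijson.items yields complete
Python objects for a given prefix. A sketch of the same traversal (the sample
JSON shape is inferred from the prefixes used above):

import io
import ijson

data = io.BytesIO(b'{"suites": [{"cases": [{"name": "t1", "status": "PASSED"}]}]}')
for case in ijson.items(data, 'suites.item.cases.item'):
    print(case)  # {'name': 't1', 'status': 'PASSED'}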
Example #4
def usingIjson(file):
    # file="/home/lnvp-linux-wkst1/Desktop/future/ctl_records_subsample"

    certs = []  # for holding all of the ordered data
    # from https://stackoverflow.com/questions/37200302/using-python-ijson-to-read-a-large-json-file-with-multiple-json-objects
    with open(file, encoding="UTF-8") as json_file:
        # cursor = 0
        for line_number, line in enumerate(json_file):
            # for line in enumerate(json_file):
            # print ("Processing line", line_number + 1,"at cursor index:", cursor)
            line_as_file = io.StringIO(line)
            # Use a new parser for each line
            json_parser = ijson.parse(line_as_file)
            cert = {}
            # print("json_parser: ", json_parser)
            for prefix, kind, value in json_parser:
                # print ("prefix=",prefix, "type=",kind, "value=",value)
                if "string" == kind:
                    cert.update({prefix: value})
            certs.append(cert)
            # cursor += len(line)

    certsDF = pd.DataFrame(certs)
    # print("dim(certsDF): ", certsDF.shape)
    # print("data.columns: ", certsDF.columns)
    # certsDF.to_csv("subsample_ijson_pd.csv")

    # print()
    dups = certsDF.duplicated(subset="data.leaf_cert.fingerprint")
    dups = certsDF[dups]  # rows whose leaf-cert fingerprint appears more than once
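
Because this function parses one JSON document per line, the streaming parser
only pays off when individual lines are very large; for modest lines,
json.loads plus a small recursive walk is a simpler equivalent. A sketch
(flatten_strings is a hypothetical helper, and unlike the loop above it skips
arrays for brevity):

import json

def flatten_strings(obj, prefix=''):
    # collect string leaves under dotted-prefix keys, mirroring
    # what the ijson loop above gathers into each cert dict
    out = {}
    if isinstance(obj, dict):
        for key, val in obj.items():
            child = prefix + '.' + key if prefix else key
            out.update(flatten_strings(val, child))
    elif isinstance(obj, str):
        out[prefix] = obj
    return out

cert = flatten_strings(json.loads('{"data": {"leaf_cert": {"fingerprint": "ab:cd"}}}'))
# {'data.leaf_cert.fingerprint': 'ab:cd'}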
Example #5
def parse_snapshot(resp, callback):
    """
    Iteratively parses the response to the etcd snapshot, calling the
    callback with each key/value pair found.

    :raises ResyncRequired: if the snapshot contains an error response.
    """
    _log.debug("Parsing snapshot response...")
    if resp.status != 200:
        raise ResyncRequired("Read from etcd failed.  HTTP status code %s" %
                             resp.status)
    parser = ijson.parse(resp)  # urllib3 response is file-like.

    try:
        prefix, event, value = next(parser)
        _log.debug("Read first token from response %s, %s, %s", prefix, event,
                   value)
        if event == "start_map":
            # As expected, response is a map.
            _parse_map(parser, callback)
        else:
            _log.error("Response from etcd did non contain a JSON map.")
            raise ResyncRequired("Bad response from etcd")
    except JSONError:
        _log.exception("Response from etcd containers bad JSON.")
        raise ResyncRequired("Bad JSON from etcd")
Example #6
def process_provider_into_mongo(fname, db, conn):
    provider_count = 0
    facilities_count = 0
    status = False
    with open(fname, 'r') as infile:
        # use the float override for the ijson parser to prevent Decimal values
        event = map(floaten, yajl2.parse(infile))
        data = common.items(event, 'item')
        try:
            for doc in data:
                if doc['type'] == 'INDIVIDUAL':
                    db.providers.save(doc)
                    provider_count += 1
                else:
                    db.facilities.save(doc)
                    facilities_count += 1
            status = True
        except (KeyboardInterrupt, SystemExit):
            conn.rollback()
            raise
        except (UnicodeDecodeError, ValueError, JSONError) as ex:
            print("{0}\n".format(ex))
    if provider_count > 0:
        print("Wrote {0} provider documents to MongoDB\n".format(
            provider_count))
    if facilities_count > 0:
        print("Wrote {0} facility documents to MongoDB\n".format(
            facilities_count))
    return status
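
floaten is imported from elsewhere in these loaders. ijson reports JSON
numbers as decimal.Decimal, which MongoDB cannot store directly, so a
plausible sketch of such a helper (the project's real version may differ):

from decimal import Decimal

def floaten(event):
    # rewrite 'number' events so Decimal values become plain floats
    prefix, ev, value = event
    if ev == 'number' and isinstance(value, Decimal):
        return (prefix, ev, float(value))
    return event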
Example #7
def process_plan_into_mongo(fname, db, conn):
    status = False
    count = 0
    with open(fname, 'r') as infile:
        # use the float override for ijson parser to prevent Decimal values
        event = map(floaten, yajl2.parse(infile))
        data = common.items(event, 'item')
        try:
            for doc in data:
                # not everyone adheres to the ISO date requirement
                if 'last_updated_on' in doc:
                    inferred_date_format = dateinfer.infer(
                        [doc['last_updated_on']])
                    _date = time.strptime(doc['last_updated_on'],
                                          inferred_date_format)
                    doc['last_updated_on'] = time.strftime('%Y-%m-%d', _date)

                # make sure the coinsurance rate and copay amount are numbers
                # (not strings) and convert Decimal values to floats
                if 'formulary' in doc:
                    # normalize a lone formulary dict into a one-element list
                    if isinstance(doc['formulary'], dict):
                        doc['formulary'] = [doc['formulary']]

                    if isinstance(doc['formulary'], list):
                        for f in doc['formulary']:
                            if 'cost_sharing' in f:
                                if not isinstance(f['cost_sharing'], list):
                                    if f['cost_sharing']['coinsurance_rate']:
                                        f['cost_sharing']['coinsurance_rate'] = \
                                            ensure_is_float(f['cost_sharing']['coinsurance_rate'])
                                    if f['cost_sharing']['copay_amount']:
                                        f['cost_sharing']['copay_amount'] = \
                                            ensure_is_float(f['cost_sharing']['copay_amount'])
                                else:
                                    for item in f['cost_sharing']:
                                        if 'coinsurance_rate' in item:
                                            item['coinsurance_rate'] = \
                                                ensure_is_float(item['coinsurance_rate'])
                                        if 'copay_amount' in item:
                                            item['copay_amount'] = \
                                                ensure_is_float(item['copay_amount'])

                db.plans.save(doc)
                count += 1
            status = True
            print "Wrote {0} plan docs to mongodb\n".format(count)
        except (KeyboardInterrupt, SystemExit):
            conn.rollback()
            raise
        except (UnicodeDecodeError, ValueError, JSONError) as ex:
            print("{0}\n".format(ex))
        except Exception as ex:
            print("{0}\n".format(ex))
    return status
Example #9
def process_tests_json(content_metadata, custom_metadata):
    sys.stdout.write('[')
    ijson = get_json_parser()
    # parser = ijson.parse(open('c:\circleci-tests.json', mode='rb'))
    parser = ijson.parse(sys.stdin.buffer)
    count = 0
    test_execution = None
    expected_key = None
    for prefix, event, value in parser:
        if prefix == 'tests.item':
            if event == 'start_map':
                test_execution = {}
            elif event == 'end_map':
                if count > 0:
                    sys.stdout.write(',')
                process_test_execution(content_metadata, custom_metadata,
                                       test_execution)
                count = count + 1
            elif event == 'map_key':
                expected_key = value
        else:
            if expected_key is not None and prefix == ('tests.item.' + expected_key):
                test_execution[expected_key] = value
            expected_key = None
        # print(prefix + ':' + event + ':' + str(value))
    sys.stdout.write(']')
Example #11
    def generate_spec(self, output_path, whitelist=None):
        """
        Enriches state from an existing chain with the target chain spec.
        :param output_path: path the generated spec is written to
        :param whitelist: optional list of account addresses to include;
            when None or empty, every exported account is included
        :return: None
        """
        whitelist = whitelist or []
        with open(self.state_export, 'rb') as state_fd:
            with open(self.target_spec, 'rb') as template_fd:
                parser = ijson.parse(template_fd)
                depth_map = {}
                depth_val = -1
                with open(output_path, 'w') as out:
                    for prefix, event, value in parser:
                        if event == "string":
                            out.write("\"{1}\"".format(prefix.split('.')[-1], value))
                        elif event == "number":
                            out.write("{1}".format(prefix.split('.')[-1], value))
                        elif event == 'null':
                            out.write('null')
                        elif event == 'start_map':
                            depth_val += 1
                            depth_map[depth_val] = 0
                            out.write('{')

                            if prefix == 'accounts':
                                whitelist_size = len(whitelist)
                                whitelist_hit = 0
                                for _exported_state in json_states(state_fd):

                                    _address = _exported_state['address']
                                    _exported_state["balance"] = str(int(_exported_state["balance"], 16))
                                    _exported_state["nonce"] = str(int(_exported_state["nonce"], 16))
                                    del _exported_state['address']

                                    # determine if all whitelisted addresses have been processed
                                    if whitelist and whitelist_hit == whitelist_size:
                                        break

                                    # include whitelisted addresses only or if whitelist not defined
                                    if not whitelist or _address in whitelist:
                                        whitelist_hit += 1
                                        _json_acc = "\"{0}\": {1}".format(
                                            _address,
                                            json.dumps(_exported_state)
                                        )
                                        out.write(_json_acc)
                                        out.write(",")
                                    depth_map[depth_val] = 1

                        elif event == 'map_key':
                            if depth_map[depth_val] == 0:
                                out.write('"{0}":'.format(value))
                            else:
                                out.write(',"{0}":'.format(value))
                            depth_map[depth_val] += 1
                        elif event == 'end_map':
                            depth_val -= 1
                            out.write('}')
Example #12
    def upload(self):
        """Uploads the contents of the given file by parsing it as an ijson
        stream.

        Prints a closing message with the number of genomes processed and
        errors encountered.
        """
        with open(generate_path(self.filename), "r") as fd:
            data = ijson.parse(fd)
            self.parse_metadata(data)

        print "%d genomes parsed, %d errors occurred." % (self.progress, self.error)
Example #13
def load_json(filename):
    with open(filename, 'r') as fd:
        parser = ijson.parse(fd)
        ret = {'builders': {}}
        for prefix, event, value in parser:
            if (prefix, event) == ('builders', 'map_key'):
                buildername = value
                ret['builders'][buildername] = {}
            elif prefix.endswith('.shortname'):
                ret['builders'][buildername]['shortname'] = value

        return ret
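
A quick way to sanity-check the input shape this expects (a top-level
"builders" map whose entries each carry a "shortname"); the sample data here
is invented:

import json
import os
import tempfile

sample = {"builders": {"linux64": {"shortname": "lx"}, "win32": {"shortname": "w32"}}}
with tempfile.NamedTemporaryFile('w', suffix='.json', delete=False) as tmp:
    json.dump(sample, tmp)
print(load_json(tmp.name))
# {'builders': {'linux64': {'shortname': 'lx'}, 'win32': {'shortname': 'w32'}}}
os.unlink(tmp.name)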
Example #14
def json_states(fd):
    parser = ijson.parse(fd)

    for prefix, event, value in parser:

        if event == 'start_map' and prefix and 'storage' not in prefix:
            # address based
            balance = ""
            nonce = ""
            address = ""

            # code based
            code = ""
            code_hash = ""
            code_storage_root = ""
            code_storage = {}

        elif event == 'end_map' and prefix and 'storage' not in prefix:
            if code:
                yield {
                    'address': address,
                    'nonce': nonce,
                    'balance': balance,
                    'code': code,
                    #'code_hash': code_hash,
                    'storage': code_storage,
                    #'storageRoot': code_storage_root
                }
            elif address:
                yield {'address': address, 'nonce': nonce, 'balance': balance}

        elif '.' in prefix:
            if 'balance' in prefix:
                balance = value
                address = prefix.split('.')[1]
            elif 'nonce' in prefix:
                nonce = value
            elif 'code_hash' in prefix:
                code_hash = value
            elif prefix.endswith('code'):
                code = value
            elif 'storage_root' in prefix:
                code_storage_root = value
            elif 'storage' in prefix and event == 'string':
                _storage_address = prefix.split('.')[-1]
                code_storage[_storage_address] = value
Example #15
def process_formulary_into_mongo(fname, db, conn):
    status = False
    count = 0
    with open(fname, 'r') as infile:
        event = map(floaten, yajl2.parse(infile))
        data = common.items(event, 'item')
        try:
            for doc in data:
                db.drugs.save(doc)
                count += 1
            status = True
            print "Wrote {0} drug docs to MongoDB\n".format(count)
        except (KeyboardInterrupt, SystemExit):
            conn.rollback()
            raise
        except (UnicodeDecodeError, ValueError, JSONError) as ex:
            print("{0}\n".format(ex))
            print()
    return status
Example #16
def process_file(file_url, org_feature_mappings):
    print('Loading', file_url)
    some_engine = create_engine(
        os.getenv('DATABASE_URL', 'postgresql://localhost/digital_land'))
    Session = sessionmaker(bind=some_engine)
    session = Session()
    total = 0
    try:
        if file_url.startswith('http'):
            f = urlopen(file_url)
        else:
            f = open(file_url, 'rb')
        events = map(floaten, ijson.parse(f))
        data = common.items(events, 'features.item')
        records = []
        orgs_to_save = []
        processed = set()

        for feature in data:
            # avoid shadowing the builtin id()
            feature_ref = feature['properties'].get('feature')
            item = 'item:%s' % feature['properties'].get('item')
            publication = feature['properties'].get('publication')
            feature_id = feature_ref if feature_ref is not None else item

            if session.query(Feature).get(
                    feature_id) is None and feature_id not in processed:
                geo = json.dumps(feature['geometry'])
                geometry = session.execute(json_to_geo_query %
                                           geo).fetchone()[0]

                if feature_id in org_feature_mappings:
                    org = session.query(Organisation).get(
                        org_feature_mappings[feature_id])
                    org.feature_id = feature_id
                    orgs_to_save.append(org)

                records.append(
                    dict(feature=feature_id,
                         data=feature,
                         geometry=geometry,
                         item=item,
                         publication=publication))

                processed.add(feature_id)

                if len(records) % 10000 == 0:
                    session.bulk_insert_mappings(Feature, records)
                    session.bulk_save_objects(orgs_to_save)
                    session.commit()
                    print('Saved', len(records), 'features from', file_url)
                    total += len(records)
                    records = []
                    orgs_to_save = []

        session.bulk_insert_mappings(Feature, records)
        session.bulk_save_objects(orgs_to_save)
        session.commit()

        print('Saved', len(records), 'features from', file_url)
        total += len(records)
        print('Finished loading', file_url)

    except Exception as e:
        print(e)
        print('Error loading', file_url)
    finally:
        try:
            f.close()
        except Exception:
            pass

    return 'Loaded total of %d features from %s' % (total, file_url)
Example #17
    def load_JSON(self):
        path = os.path.join(self.currdir, self.filename)
        fd = open(path, "r")
        self.parser = ijson.parse(fd)
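
Note that fd is deliberately left open here: ijson.parse is lazy, so the file
must remain open until self.parser is fully consumed. A sketch of an eager
variant, assuming the same attributes, for when the event list fits in
memory:

    def load_JSON(self):
        path = os.path.join(self.currdir, self.filename)
        with open(path, "r") as fd:
            # materialize every event so the file can be closed right away
            self.events = list(ijson.parse(fd))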
Example #18
def stream_ijson(big_file):
    start = time.time()


    # with open(big_file) as f1:
    #     thingy = json.load(f1)
    #     print thingy.keys()

    a_url = 'https://www.nomisweb.co.uk/api/v01/dataset/NM_548_1.data.json?geography=2092957700TYPE298&&measures=20100&&RecordLimit=100&&RecordOffset=0&&uid=0x7065df0e03b2e953ecf3027601a11084f1e87469'
    filename_to_save = '/home/ubuntu/to_save.json'

    # with open(filename_to_save, 'w') as file_to_save:
    #     req = requests.get(a_url, stream=True)
    #     for chunk in req.iter_content(chunk_size=1024):
    #         if chunk: # filter out keep-alive new chunks
    #             file_to_save.write(chunk)
    #             file_to_save.flush()

    complete = {}
    current_object = None

    with open(filename_to_save) as f3:
        t2 = time.time()
        print(t2 - start)
        skip = False
        for prefix, event, value in ijson.parse(f3):

            # print prefix, event, value

            if prefix == 'obs.item' and event == 'end_map':
                # Tidy away old object
                if current_object is not None:
                    if not skip:
                        if current_object['geography_code'] in complete:
                            print(',', end=' ')
                            # print('replacing', current_object['geography_code'])
                        else:
                            print(len(complete))
                            # print('creating', current_object['geography_code'])

                        complete[current_object['geography_code']] = current_object

                    skip = False


            if not skip:
                if prefix == 'obs.item' and event == 'start_map':
                    # Create a new object to populate
                    current_object = {}

                # Populate object
                if prefix == 'obs.item.geography.description':
                    current_object['geography'] = value

                elif prefix == 'obs.item.geography.geogcode':
                    if value in complete:
                        skip = True

                    current_object['geography_code'] = value

                elif prefix == 'obs.item.geography.value':
                    current_object['geography_id'] = value

                elif prefix == 'obs.item.obs_value.description':
                    current_object['name'] = value

                elif prefix == 'obs.item.obs_value.value':
                    current_object['value'] = value

    t3 = time.time()
    print(t3 - t2)
    print(pprint.pformat(complete))

    t4 = time.time()
    print(t4 - t3)