def load_signatureset_json_iter(data, ksize=None, ignore_md5sum=False, ijson=ijson):
    """
    - data: file handle (or file handle-like) object
    - ksize: if given, only yield signatures with this k-mer size
    - ignore_md5sum: if True, skip md5sum verification of each signature
    - ijson: ijson backend
    """
    parser = ijson.parse(data)

    prefix, event, value = next(parser)
    assert prefix == '' and event == 'start_array' and value is None

    n = 0
    while True:
        try:
            sig = load_signature_json(
                parser,
                prefix_item='item.signatures.item.mins.item',
                ignore_md5sum=ignore_md5sum,
                ijson=ijson)
            if not ksize or ksize == sig.minhash.ksize:
                yield sig
        except ValueError:
            # possible end of the array of signatures
            try:
                prefix, event, value = next(parser)
                assert event == 'end_array'
            except StopIteration:
                pass
            finally:
                break
        n += 1
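# Usage sketch for load_signatureset_json_iter: 'signatures.json' and ksize=31
# are hypothetical, and load_signature_json (called internally) must be
# defined in this module. The file is opened in binary mode so any ijson
# backend can consume it.
with open('signatures.json', 'rb') as fp:  # hypothetical input file
    for sig in load_signatureset_json_iter(fp, ksize=31):
        print(sig.minhash.ksize)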
def process_tests_json(content_metadata, custom_metadata):
    sys.stdout.write('[')
    ijson = get_json_parser()
    # parser = ijson.parse(open('c:\jenkins-stories.json', mode='rb'))
    parser = ijson.parse(sys.stdin.buffer)
    count = 0
    test_execution = None
    expected_key = None
    for prefix, event, value in parser:
        # we only care about suite cases, not the root object or suite properties
        if prefix == 'suites.item.cases.item':
            if event == 'start_map':
                test_execution = {}
            elif event == 'end_map':
                if count > 0:
                    sys.stdout.write(',')
                process_test_execution(content_metadata, custom_metadata, test_execution)
                count += 1
            elif event == 'map_key':
                expected_key = value
        else:
            if expected_key is not None and prefix == 'suites.item.cases.item.' + expected_key:
                test_execution[expected_key] = value
                expected_key = None
        # print(prefix + ':' + event + ':' + str(value))
    sys.stdout.write(']')
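# Self-contained sketch of the map_key/expected_key pairing used above, run
# against a made-up one-case document instead of stdin:
import io
import ijson

doc = io.BytesIO(b'{"suites": [{"cases": [{"name": "t1", "status": "PASSED"}]}]}')
case, expected_key = None, None
for prefix, event, value in ijson.parse(doc):
    if prefix == 'suites.item.cases.item':
        if event == 'start_map':
            case = {}
        elif event == 'end_map':
            print(case)  # {'name': 't1', 'status': 'PASSED'}
        elif event == 'map_key':
            expected_key = value
    elif expected_key is not None and prefix == 'suites.item.cases.item.' + expected_key:
        case[expected_key] = value
        expected_key = None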
def usingIjson(file):
    # file = "/home/lnvp-linux-wkst1/Desktop/future/ctl_records_subsample"
    certs = []  # for holding all of the ordered data
    # from https://stackoverflow.com/questions/37200302/using-python-ijson-to-read-a-large-json-file-with-multiple-json-objects
    with open(file, encoding="UTF-8") as json_file:
        # cursor = 0
        for line_number, line in enumerate(json_file):
            # print("Processing line", line_number + 1, "at cursor index:", cursor)
            line_as_file = io.StringIO(line)
            # Use a new parser for each line
            json_parser = ijson.parse(line_as_file)
            cert = {}
            for prefix, kind, value in json_parser:
                # print("prefix=", prefix, "type=", kind, "value=", value)
                if kind == "string":
                    cert.update({prefix: value})
            certs.append(cert)
            # cursor += len(line)
    certsDF = pd.DataFrame(certs)
    # print("dim(certsDF): ", certsDF.shape)
    # print("data.columns: ", certsDF.columns)
    # certsDF.to_csv("subsample_ijson_pd.csv")
    dups = certsDF.duplicated(subset="data.leaf_cert.fingerprint")
    dups = certsDF[dups]
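# Self-contained sketch of the parser-per-line pattern used above: each input
# line is a complete JSON document (newline-delimited JSON), so a fresh ijson
# parser is built per line. The two sample records are made up.
import io
import ijson

ndjson = ('{"data": {"leaf_cert": {"fingerprint": "aa:bb"}}}\n'
          '{"data": {"leaf_cert": {"fingerprint": "cc:dd"}}}\n')
for line in io.StringIO(ndjson):
    record = {}
    for prefix, kind, value in ijson.parse(io.StringIO(line)):
        if kind == "string":
            record[prefix] = value
    print(record)  # {'data.leaf_cert.fingerprint': 'aa:bb'}, then 'cc:dd'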
def parse_snapshot(resp, callback):
    """
    Iteratively parses the response to the etcd snapshot, calling the
    callback with each key/value pair found.

    :raises ResyncRequired if the snapshot contains an error response.
    """
    _log.debug("Parsing snapshot response...")
    if resp.status != 200:
        raise ResyncRequired("Read from etcd failed. HTTP status code %s",
                             resp.status)
    parser = ijson.parse(resp)  # urllib3 response is file-like.
    try:
        prefix, event, value = next(parser)
        _log.debug("Read first token from response %s, %s, %s",
                   prefix, event, value)
        if event == "start_map":
            # As expected, response is a map.
            _parse_map(parser, callback)
        else:
            _log.error("Response from etcd did not contain a JSON map.")
            raise ResyncRequired("Bad response from etcd")
    except JSONError:
        _log.exception("Response from etcd contained bad JSON.")
        raise ResyncRequired("Bad JSON from etcd")
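# _parse_map itself is not shown here; the following is only a generic sketch
# (not the author's implementation) of how such a helper can drain ijson's
# event stream, invoking the callback for each scalar found inside the map:
def _sketch_parse_map(parser, callback):
    depth = 1  # the caller has already consumed the outer start_map event
    for prefix, event, value in parser:
        if event in ('start_map', 'start_array'):
            depth += 1
        elif event in ('end_map', 'end_array'):
            depth -= 1
            if depth == 0:
                return  # matching end of the outer map
        elif event in ('string', 'number', 'integer', 'double', 'boolean', 'null'):
            callback(prefix, value)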
def process_provider_into_mongo(fname, db, conn):
    provider_count = 0
    facilities_count = 0
    status = False
    with open(fname, 'r') as infile:
        # use the float override for the ijson parser to prevent Decimal values
        event = imap(floaten, yajl2.parse(infile))
        data = common.items(event, 'item')
        try:
            for doc in data:
                if doc['type'] == 'INDIVIDUAL':
                    db.providers.save(doc)
                    provider_count += 1
                else:
                    db.facilities.save(doc)
                    facilities_count += 1
            status = True
        except (KeyboardInterrupt, SystemExit):
            conn.rollback()
            raise
        except (UnicodeDecodeError, ValueError, JSONError) as ex:
            print "{0}\n".format(str(ex))
    if provider_count > 0:
        print "Wrote {0} provider documents to MongoDB\n".format(provider_count)
    if facilities_count > 0:
        print "Wrote {0} facility documents to MongoDB\n".format(facilities_count)
    return status
def process_plan_into_mongo(fname, db, conn):
    status = False
    count = 0
    with open(fname, 'r') as infile:
        # use the float override for the ijson parser to prevent Decimal values
        event = imap(floaten, yajl2.parse(infile))
        data = common.items(event, 'item')
        try:
            for doc in data:
                # not everyone adheres to the ISO date requirement
                if 'last_updated_on' in doc:
                    inferred_date_format = dateinfer.infer([doc['last_updated_on']])
                    _date = time.strptime(doc['last_updated_on'], inferred_date_format)
                    doc['last_updated_on'] = time.strftime('%Y-%m-%d', _date)
                # make sure the coinsurance rate and copay amount are numbers,
                # not strings, converting any Decimal values to floats
                if 'formulary' in doc:
                    if type(doc['formulary']) == dict:
                        doc['formulary'] = [doc['formulary']]
                    if type(doc['formulary']) == list:
                        for f in doc['formulary']:
                            if 'cost_sharing' in f:
                                if type(f['cost_sharing']) != list:
                                    if f['cost_sharing']['coinsurance_rate']:
                                        f['cost_sharing']['coinsurance_rate'] = \
                                            ensure_is_float(f['cost_sharing']['coinsurance_rate'])
                                    if f['cost_sharing']['copay_amount']:
                                        f['cost_sharing']['copay_amount'] = \
                                            ensure_is_float(f['cost_sharing']['copay_amount'])
                                else:
                                    for item in f['cost_sharing']:
                                        if 'coinsurance_rate' in item:
                                            item['coinsurance_rate'] = \
                                                ensure_is_float(item['coinsurance_rate'])
                                        if 'copay_amount' in item:
                                            item['copay_amount'] = \
                                                ensure_is_float(item['copay_amount'])
                db.plans.save(doc)
                count += 1
            status = True
            print "Wrote {0} plan docs to MongoDB\n".format(count)
        except (KeyboardInterrupt, SystemExit):
            conn.rollback()
            raise
        except (UnicodeDecodeError, ValueError, JSONError) as ex:
            print "{0}\n".format(str(ex))
        except Exception as ex:
            print "{0}\n".format(str(ex))
    return status
def process_tests_json(content_metadata, custom_metadata):
    sys.stdout.write('[')
    ijson = get_json_parser()
    # parser = ijson.parse(open('c:\circleci-tests.json', mode='rb'))
    parser = ijson.parse(sys.stdin.buffer)
    count = 0
    test_execution = None
    expected_key = None
    for prefix, event, value in parser:
        if prefix == 'tests.item':
            if event == 'start_map':
                test_execution = {}
            elif event == 'end_map':
                if count > 0:
                    sys.stdout.write(',')
                process_test_execution(content_metadata, custom_metadata, test_execution)
                count += 1
            elif event == 'map_key':
                expected_key = value
        else:
            if expected_key is not None and prefix == 'tests.item.' + expected_key:
                test_execution[expected_key] = value
                expected_key = None
        # print(prefix + ':' + event + ':' + str(value))
    sys.stdout.write(']')
def generate_spec(self, output_path, whitelist=[]):
    """
    Enriches state from existing chain with target chain spec

    :param output_path: file the generated spec is written to
    :param whitelist: addresses to include; all accounts are included when empty
    :return: None (the spec is written to output_path)
    """
    with open(self.state_export, 'rb') as state_fd:
        with open(self.target_spec, 'rb') as template_fd:
            parser = ijson.parse(template_fd)
            depth_map = {}
            depth_val = -1
            with open(output_path, 'w') as out:
                for prefix, event, value in parser:
                    if event == 'string':
                        out.write('"{0}"'.format(value))
                    elif event == 'number':
                        out.write('{0}'.format(value))
                    elif event == 'null':
                        out.write('null')
                    elif event == 'start_map':
                        depth_val += 1
                        depth_map[depth_val] = 0
                        out.write('{')
                        if prefix == 'accounts':
                            whitelist_size = len(whitelist)
                            whitelist_hit = 0
                            for _exported_state in json_states(state_fd):
                                _address = _exported_state['address']
                                _exported_state['balance'] = str(int(_exported_state['balance'], 16))
                                _exported_state['nonce'] = str(int(_exported_state['nonce'], 16))
                                del _exported_state['address']
                                # determine if all whitelisted addresses have been processed
                                if whitelist and whitelist_hit == whitelist_size:
                                    break
                                # include whitelisted addresses only, or everything if no whitelist is defined
                                if not whitelist or _address in whitelist:
                                    whitelist_hit += 1
                                    _json_acc = '"{0}": {1}'.format(
                                        _address, json.dumps(_exported_state))
                                    out.write(_json_acc)
                                    out.write(',')
                                    depth_map[depth_val] = 1
                    elif event == 'map_key':
                        if depth_map[depth_val] == 0:
                            out.write('"{0}":'.format(value))
                        else:
                            out.write(',"{0}":'.format(value))
                        depth_map[depth_val] += 1
                    elif event == 'end_map':
                        depth_val -= 1
                        out.write('}')
def upload(self):
    """Uploads the contents of the given file by parsing it as an ijson
    stream.

    Prints an ending message with the number of genomes processed and
    errors encountered.
    """
    with open(generate_path(self.filename), "r") as fd:
        data = ijson.parse(fd)
        self.parse_metadata(data)
    print "%d genomes parsed, %d errors occurred." % (self.progress, self.error)
def load_json(filename):
    with open(filename, 'r') as fd:
        parser = ijson.parse(fd)
        ret = {'builders': {}}
        for prefix, event, value in parser:
            if (prefix, event) == ('builders', 'map_key'):
                buildername = value
                ret['builders'][buildername] = {}
            elif prefix.endswith('.shortname'):
                ret['builders'][buildername]['shortname'] = value
        return ret
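# Self-contained demonstration of the (prefix, event, value) stream that
# load_json consumes above; the sample document is made up:
import io
import ijson

sample = io.BytesIO(b'{"builders": {"linux64": {"shortname": "l64"}}}')
for prefix, event, value in ijson.parse(sample):
    print(prefix, event, value)
# builders map_key linux64                        <- buildername captured here
# builders.linux64.shortname string l64           <- matches prefix.endswith('.shortname')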
def json_states(fd):
    parser = ijson.parse(fd)
    for prefix, event, value in parser:
        if event == 'start_map' and prefix and 'storage' not in prefix:
            # address based
            balance = ""
            nonce = ""
            address = ""
            # code based
            code = ""
            code_hash = ""
            code_storage_root = ""
            code_storage = {}
        elif event == 'end_map' and prefix and 'storage' not in prefix:
            if code:
                yield {
                    'address': address,
                    'nonce': nonce,
                    'balance': balance,
                    'code': code,
                    # 'code_hash': code_hash,
                    'storage': code_storage,
                    # 'storageRoot': code_storage_root
                }
            elif address:
                yield {'address': address, 'nonce': nonce, 'balance': balance}
        elif '.' in prefix:
            if 'balance' in prefix:
                balance = value
                address = prefix.split('.')[1]
            elif 'nonce' in prefix:
                nonce = value
            elif 'code_hash' in prefix:
                code_hash = value
            elif prefix.endswith('code'):
                code = value
            elif 'storage_root' in prefix:
                code_storage_root = value
            elif 'storage' in prefix and event == 'string':
                _storage_address = prefix.split('.')[-1]
                code_storage[_storage_address] = value
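# Usage sketch for json_states: 'state_export.json' is a hypothetical dump
# whose second-level keys are account addresses (that is what
# prefix.split('.')[1] recovers above), each holding balance/nonce and
# optionally code and storage:
with open('state_export.json', 'rb') as fd:
    for account in json_states(fd):
        print(account['address'], account['balance'])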
def process_formulary_into_mongo(fname, db, conn):
    status = False
    count = 0
    with open(fname, 'r') as infile:
        event = imap(floaten, yajl2.parse(infile))
        data = common.items(event, 'item')
        try:
            for doc in data:
                db.drugs.save(doc)
                count += 1
            status = True
            print "Wrote {0} drug docs to MongoDB\n".format(count)
        except (KeyboardInterrupt, SystemExit):
            conn.rollback()
            raise
        except (UnicodeDecodeError, ValueError, JSONError) as ex:
            print "{0}\n".format(str(ex))
    print
    return status
def process_file(file_url, org_feature_mappings):
    print('Loading', file_url)
    some_engine = create_engine(
        os.getenv('DATABASE_URL', 'postgresql://localhost/digital_land'))
    Session = sessionmaker(bind=some_engine)
    session = Session()
    total = 0
    try:
        if file_url.startswith('http'):
            f = urlopen(file_url)
        else:
            f = open(file_url, 'rb')
        events = map(floaten, ijson.parse(f))
        data = common.items(events, 'features.item')
        records = []
        orgs_to_save = []
        processed = set()
        for feature in data:
            id = feature['properties'].get('feature')
            item = 'item:%s' % feature['properties'].get('item')
            publication = feature['properties'].get('publication')
            feature_id = id if id is not None else item
            if session.query(Feature).get(feature_id) is None and feature_id not in processed:
                geo = json.dumps(feature['geometry'])
                geometry = session.execute(json_to_geo_query % geo).fetchone()[0]
                if feature_id in org_feature_mappings:
                    org = session.query(Organisation).get(org_feature_mappings[feature_id])
                    org.feature_id = feature_id
                    orgs_to_save.append(org)
                records.append(
                    dict(feature=feature_id,
                         data=feature,
                         geometry=geometry,
                         item=item,
                         publication=publication))
                processed.add(feature_id)
                if len(records) % 10000 == 0:
                    session.bulk_insert_mappings(Feature, records)
                    session.bulk_save_objects(orgs_to_save)
                    session.commit()
                    print('Saved', len(records), 'features from', file_url)
                    total += len(records)
                    records = []
                    orgs_to_save = []
        # flush any remaining records
        session.bulk_insert_mappings(Feature, records)
        session.bulk_save_objects(orgs_to_save)
        session.commit()
        print('Saved', len(records), 'features from', file_url)
        total += len(records)
        print('Finished loading', file_url)
    except Exception as e:
        print(e)
        print('Error loading', file_url)
    finally:
        try:
            f.close()
        except Exception:
            pass
    return 'Loaded total of %d features from %s' % (total, file_url)
def load_JSON(self):
    path = os.path.join(self.currdir, self.filename)
    fd = open(path, "r")
    self.parser = ijson.parse(fd)
def stream_ijson(big_file):
    start = time.time()
    # with open(big_file) as f1:
    #     thingy = json.load(f1)
    #     print thingy.keys()
    a_url = 'https://www.nomisweb.co.uk/api/v01/dataset/NM_548_1.data.json?geography=2092957700TYPE298&&measures=20100&&RecordLimit=100&&RecordOffset=0&&uid=0x7065df0e03b2e953ecf3027601a11084f1e87469'
    filename_to_save = '/home/ubuntu/to_save.json'
    # with open(filename_to_save, 'w') as file_to_save:
    #     req = requests.get(a_url, stream=True)
    #     for chunk in req.iter_content(chunk_size=1024):
    #         if chunk:  # filter out keep-alive new chunks
    #             file_to_save.write(chunk)
    #             file_to_save.flush()
    complete = {}
    current_object = None
    with open(filename_to_save) as f3:
        t2 = time.time()
        print t2 - start
        skip = False
        for prefix, event, value in ijson.parse(f3):
            # print prefix, event, value
            if prefix == 'obs.item' and event == 'end_map':
                # Tidy away old object
                if current_object is not None:
                    if not skip:
                        if current_object['geography_code'] in complete:
                            print ',',
                            # print 'replacing', current_object['geography_code']
                        else:
                            print len(complete.keys())
                            # print 'creating', current_object['geography_code']
                        complete[current_object['geography_code']] = current_object
                skip = False
            if not skip:
                if prefix == 'obs.item' and event == 'start_map':
                    # Create a new object to populate
                    current_object = {}
                # Populate object
                if prefix == 'obs.item.geography.description':
                    current_object['geography'] = value
                elif prefix == 'obs.item.geography.geogcode':
                    if value in complete:
                        skip = True
                    current_object['geography_code'] = value
                elif prefix == 'obs.item.geography.value':
                    current_object['geography_id'] = value
                elif prefix == 'obs.item.obs_value.description':
                    current_object['name'] = value
                elif prefix == 'obs.item.obs_value.value':
                    current_object['value'] = value
    t3 = time.time()
    print t3 - t2
    print pprint.pformat(complete)
    t4 = time.time()
    print t4 - t3