import bigjson
import numpy as np


def loadExpObjectFast(filename):
    """Loads a CiPdeN object from a JSON file.

    Ignores generation data, except the first and the last.

    Parameters
    ----------
    filename : str
        includes path and filename

    Returns
    -------
    dict
        returns a dict if it worked, else returns None
    """
    try:
        with open(filename, 'rb') as f:
            result = bigjson.load(f)
            obj_dict = dict()
            obj_dict["pde"] = result["pde"]
            obj_dict["kernel_type"] = result["kernel_type"]
            obj_dict["opt_algo"] = result["opt_algo"]
            obj_dict["exec_time"] = result["exec_time"]
            obj_dict["mem_consumption"] = result["mem_consumption"]
            obj_dict["normL2"] = result["normL2"]
            obj_dict["sol_kernel"] = np.array(result["sol_kernel"].to_python())
            return obj_dict
    except Exception as e:
        print(str(e))
        return None
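# --- Usage sketch, added for illustration; the file path below is an
# --- assumption, not from the original source. ---
obj = loadExpObjectFast("results/cipde_run_0.json")
if obj is not None:
    print(obj["opt_algo"], obj["exec_time"])
    print(obj["sol_kernel"].shape)  # sol_kernel was materialized as a numpy array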
def handle(self, *args, **options):
    with open('parsed_dissected.json', 'rb') as json_file2:
        data2 = bigjson.load(json_file2)['features']
        for entry in data2:
            coords = []
            for cor in entry.get('geometry').get('coordinates'):
                coords.append([cor[0], cor[1]])
            sc = Section.objects.create(
                km=try_to_num(entry.get('properties').get('km'), 'int'),
                section_start=try_to_num(
                    entry.get('properties').get('section_start'), 'int'),
                section_end=try_to_num(
                    entry.get('properties').get('section_end'), 'int'),
                coordinates=json.dumps(coords))
            contracts = entry['properties']['contract']
            for contract in contracts:
                try:
                    ct = Contract.objects.get(id=contract.get('id'))
                    sc.contracts.add(ct)
                except Contract.DoesNotExist:
                    # Referenced contract was never imported; skip the link.
                    pass
            sc.save()
            rd = Road.objects.create(
                type=entry.get('geometry').get('type'),
                road_title=entry.get('properties').get('road_title'))
            rd.sections.add(sc)
            rd.save()
def bigjson_test(filename):
    with open(filename, 'rb') as f:
        for k in range(100):
            j = bigjson.load(f)
            element = j
            print(element)
def dispatcher(filename,
               host='redis',
               queued_tasks=100,
               queue_check_time=5,
               job_timeout=24 * 60 * 60,
               result_ttl=24 * 60 * 60):
    with open(filename, 'rb') as f, open("/data/loading.logs", "w") as log:
        j = bigjson.load(f)
        qin, qout = rq_launch(Redis(host=host))
        qin.queue.empty()  # reset previous queue
        log.write("number of tasks loaded to queue\n")
        for n, i in enumerate(j, start=1):  # put tasks into queue
            # Report progress once per batch and throttle the producer
            # while the Redis queue is still full.
            if n % queued_tasks == 0:
                log.write(f"{n}\n")
                print("started tasks", n)
                while queued_tasks < qin.queue.count:
                    time.sleep(queue_check_time)
            qin.put(**dict(i),
                    index=n,
                    job_timeout=job_timeout,
                    result_ttl=result_ttl)
        log.write(
            "Loading to queue complete, dispatcher stops at this point,\n"
            f"still there are about {qin.queue.count} tasks in Redis queue\n")
def test_missing_open_quote(self):
    file = BytesIO(MISSING_OPEN_QUOTE_JSON_FILE)
    data = bigjson.load(file)
    with self.assertRaises(Exception) as e:
        _ = len(data)
    self.assertEqual(e.exception.args[0],
                     "Unexpected bytes! Value 'y' Position 32")
def test_corrupt_backslash_encoding(self):
    file = BytesIO(CORRUPT_BACKSLASH_ENCODING_JSON_FILE)
    data = bigjson.load(file)
    with self.assertRaises(Exception) as e:
        _ = len(data)
    self.assertEqual(e.exception.args[0],
                     "Unexpected \\q in backslash encoding! Position 19")
def test_missing_digit_after_dot(self):
    file = BytesIO(MISSING_DIGIT_AFTER_DOT_JSON_FILE)
    data = bigjson.load(file)
    with self.assertRaises(Exception) as e:
        _ = len(data)
    self.assertEqual(e.exception.args[0],
                     "Expected digit after dot! Position 21")
def test_crlf_line_terminators(self):
    file_in = open(DATA_JSON_PATH, 'rb')
    json_obj = bigjson.load(file_in, encoding='utf-8')
    self.assertEqual(len(json_obj['nested_list1']), 2)
    self.assertEqual(len(json_obj['nested_list1'][0]['sub_nested_arr1']), 2)
    self.assertEqual(
        json_obj['nested_list1'][0]['sub_nested_arr1'][0]['ssn_1'], 'v1')
def bigjson_load(self, file, size=None):
    self.dataset = []
    with open('%s/%s.json' % (self.base_path, file), mode='rb') as outfile:
        if size is None:
            # Small enough to materialize at once with the stdlib parser.
            self.dataset = json.load(outfile)
        elif isinstance(size, int):
            temp = bigjson.load(outfile)
            count = 0
            while count != size:
                self.dataset.append(temp[count].to_python())
                count += 1
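# --- Standalone sketch of the same "first N records" pattern without the
# --- class wrapper; the file name is an assumption, not from the original
# --- source. ---
import bigjson


def load_first_n(path, n):
    # bigjson arrays are lazy: indexing parses only as far into the file as
    # needed, and to_python() converts one element to plain dicts/lists.
    with open(path, 'rb') as f:
        data = bigjson.load(f)
        return [data[i].to_python() for i in range(n)]


sample = load_first_n('reviews.json', 10)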
def bigjson_read(filename, fun):
    with open(filename, 'r', encoding="utf-16") as f:
        j = bigjson.load(f)
        k = 0
        while True:
            try:
                element = j[k]
            except EOFError:
                return
            else:
                fun(element)
                k = k + 1
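# --- Usage sketch: stream every element through a callback; the file name
# --- is an assumption, not from the original source. ---
bigjson_read('records.json', print)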
def test_basics(self):
    file = BytesIO(JSON_FILE)
    data = bigjson.load(file)
    self.assertEqual(len(data), 7)
    self.assertEqual(data['string'], 'blah')
    self.assertEqual(data['number'], 123)
    self.assertEqual(data['true'], True)
    self.assertEqual(data['false'], False)
    self.assertEqual(data['null'], None)
    self.assertEqual(len(data['array']), 3)
    self.assertEqual(data['array'][0], 1)
    self.assertEqual(data['array'][1], 2)
    self.assertEqual(data['array'][2], 3)
    self.assertEqual(len(data['object']), 1)
    self.assertEqual(data['object']['x'], 'y')
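# --- A fixture consistent with the assertions above, reconstructed for
# --- illustration; the project's actual JSON_FILE constant may differ. ---
JSON_FILE = b'''{
    "string": "blah",
    "number": 123,
    "true": true,
    "false": false,
    "null": null,
    "array": [1, 2, 3],
    "object": {"x": "y"}
}'''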
def load_some_data(self, num_reviews):
    column_names = ['stars', 'useful', 'funny', 'cool', 'text']
    df = pd.DataFrame(columns=column_names)
    with open(os.path.join(self.processed_folder, 'yelp_review.json'),
              'rb') as file:
        j = bigjson.load(file)
        for idx in tqdm(range(num_reviews)):
            element = j[idx]
            # Look fields up by key; relying on the positional order of
            # values() is fragile for a mapping type.
            new_row = {
                'stars': element['stars'],
                'useful': element['useful'],
                'funny': element['funny'],
                'cool': element['cool'],
                'text': element['text']
            }
            df.loc[idx] = new_row
    return df
def convert_to_tsv(self, destination_file: str):
    with bz2.open(self.dump_file, "rb") as bzinput:
        with open(destination_file, 'w') as output:
            output.write('type\tid\tlabel\tdescription\taliases\n')
            j = bigjson.load(bzinput)
            for index, element in enumerate(j):
                if index % 1000 == 0:
                    print(f'done up until {index}!')
                entity_type = self.extract_type(element)
                entity_id = self.extract_id(element)
                entity_label = self.extract_label(element)
                entity_description = self.extract_description(element)
                entity_aliases = self.extract_aliases(element)
                try:
                    output.write(
                        f'{entity_type}\t{entity_id}\t{entity_label}\t'
                        f'{entity_description}\t{"|||".join(entity_aliases)}\n')
                except UnicodeEncodeError:
                    print('skipped!')
                    continue
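# --- Standalone sketch of the same streaming pattern: bigjson reads directly
# --- from the bz2 file object, so the dump is never decompressed on disk.
# --- The dump path, and the assumption that each entity carries an 'id'
# --- field (as the Wikidata dump format does), are illustrative only. ---
import bz2

import bigjson

with bz2.open('wikidata-dump.json.bz2', 'rb') as bzinput:
    entities = bigjson.load(bzinput)
    for index, entity in enumerate(entities):
        if index >= 3:
            break
        print(entity['id'])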
def convert_big_destination_geojson_file_to_source_csv(job, **kwparameters):
    input_filepath = job.destination_file_path
    filename = input_filepath.split('/')[-1]
    basename, extension = filename.split('.')
    output_filepath = job.local_directory + basename + '.csv'
    with open(input_filepath, 'rb') as f:
        j = bigjson.load(f)
        features = j['features']
        csv_rows = []
        count = 0
        keys = None
        for feature in features:
            csv_row = convert_geojson_row_to_dict(feature.to_python())
            csv_rows.append(csv_row)
            if len(csv_rows) == 1000:
                count += len(csv_rows)
                if keys is None:
                    keys = detect_keys(csv_rows)
                write_or_append_to_csv(output_filepath, csv_rows, keys)
                csv_rows = []
                print(f"{count} rows have been written.")
        # Flush any remaining rows that did not fill a complete batch.
        if csv_rows:
            count += len(csv_rows)
            if keys is None:
                keys = detect_keys(csv_rows)
            write_or_append_to_csv(output_filepath, csv_rows, keys)
            print(f"{count} rows have been written.")
def json_to_csv(infile, outfile, sorted_file_keys=None):
    with open(infile, 'rb') as f, open(outfile, "a") as csvfile:
        # Read infile with bigjson and init csv writer
        data = bigjson.load(f)
        writer = csv.writer(csvfile, delimiter='|', quoting=csv.QUOTE_MINIMAL)

        # Counter for tracking number of completed rows
        completed_row_count = 0
        for json_row in data:
            # Convert bigjson object to python dictionary
            row = json_row.to_python()
            compiled_data = {}

            # Parse each key and add to compiled_data
            for key in row.keys():
                parse_data(key, row[key], compiled_data)

            # If no sorted_file_keys are provided, the keys of the first row are used
            if sorted_file_keys is None:
                sorted_file_keys = list(sorted(compiled_data.keys()))

            # Print column headers to file
            if completed_row_count == 0:
                writer.writerow(sorted_file_keys)

            # Sort compiled data and print to csv
            sorted_compiled_data = [
                compiled_data.get(key, '') for key in sorted_file_keys
            ]
            writer.writerow(sorted_compiled_data)
            completed_row_count += 1
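# --- Usage sketch; the file names and key list are assumptions, not from
# --- the original source. Passing sorted_file_keys pins the column order;
# --- otherwise the sorted keys of the first flattened row are used. ---
json_to_csv('export.json', 'export.csv',
            sorted_file_keys=['id', 'name', 'price'])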
#with open("log.txt") as infile: # for line in infile: # do_something_with(line) import sys sys.path.append('./bigjson/') import bigjson with open('auction-130001112018.json', 'rb') as f: j = bigjson.load(f) element = j[4] print(element['type']) print(element['id'])
def handle(self, *args, **options):

    def clean(d, key, default=0.0):
        # The export stores missing numbers as the string 'null'; fall back
        # to the default in that case. The original inline expressions of the
        # form `d.get(key, default) != 'null' or default` evaluated to a
        # boolean (or the default), which was almost certainly not intended.
        value = d.get(key, default)
        return default if value == 'null' else value

    with open('odh_prepared.json', 'rb') as data_file:
        data = bigjson.load(data_file)
        for entry in data:
            owner_id_entry = entry.get('owner_id')
            print(owner_id_entry)
            owner_id = OwnerId.objects.create(
                legal_person_id=owner_id_entry.get('legal_person_id'),
                legal_person_version_id=owner_id_entry.get(
                    'legal_person_version_id'),
                start_date=owner_id_entry.get('start_date'),
                end_date=owner_id_entry.get('end_date'))
            owner_id.save()
            calc_attribute_entry = entry.get('calc_attribute')
            calc_attribute = CalcAttribute.objects.create(
                info=clean(calc_attribute_entry, 'info'),
                sign=clean(calc_attribute_entry, 'sign'),
                buffer=clean(calc_attribute_entry, 'buffer'),
                pointer=clean(calc_attribute_entry, 'pointer'),
                asperity=clean(calc_attribute_entry, 'asperity'),
                tpu_area=clean(calc_attribute_entry, 'tpu_area'),
                engin_qty=clean(calc_attribute_entry, 'engin_qty', 0),
                other_area=clean(calc_attribute_entry, 'other_area'),
                total_area=clean(calc_attribute_entry, 'total_area'),
                guiding_qty=clean(calc_attribute_entry, 'guiding_qty', 0),
                margin_area=clean(calc_attribute_entry, 'margin_area'),
                station_qty=clean(calc_attribute_entry, 'station_qty', 0),
                bicycle_area=clean(calc_attribute_entry, 'bicycle_area'),
                footway_area=clean(calc_attribute_entry, 'footway_area'),
                guiding_area=clean(calc_attribute_entry, 'guiding_area'),
                inbound_area=clean(calc_attribute_entry, 'inbound_area'),
                roadway_area=clean(calc_attribute_entry, 'roadway_area'),
                station_area=clean(calc_attribute_entry, 'station_area'),
                bar_antinoise=clean(calc_attribute_entry, 'bar_antinoise'),
                cleaning_area=clean(calc_attribute_entry, 'cleaning_area'),
                bar_new_jersey=clean(calc_attribute_entry, 'bar_new_jersey'),
                bicycle_length=clean(calc_attribute_entry, 'bicycle_length'),
                guiding_length=clean(calc_attribute_entry, 'guiding_length'),
                gutters_length=clean(calc_attribute_entry, 'gutters_length'),
                station_number=clean(calc_attribute_entry, 'station_number', 0),
                traff_light_qty=clean(calc_attribute_entry, 'traff_light_qty', 0),
                auto_footway_area=clean(calc_attribute_entry, 'auto_footway_area'),
                traffic_signs_qty=clean(calc_attribute_entry, 'traffic_signs_qty', 0),
                tram_rails_length=clean(calc_attribute_entry, 'tram_rails_length'),
                bound_stone_length=clean(calc_attribute_entry, 'bound_stone_length'),
                manual_footway_area=clean(calc_attribute_entry, 'manual_footway_area'),
                cleaning_guiding_qty=clean(calc_attribute_entry, 'cleaning_guiding_qty', 0),
                cleaning_guiding_length=clean(calc_attribute_entry, 'cleaning_guiding_length'),
                roadway_prkg_auto_clean_area=clean(
                    calc_attribute_entry, 'roadway_prkg_auto_clean_area'),
                roadway_noprkg_auto_clean_area=clean(
                    calc_attribute_entry, 'roadway_noprkg_auto_clean_area'),
                roadway_prkg_manual_clean_area=clean(
                    calc_attribute_entry, 'roadway_prkg_manual_clean_area'),
                roadway_noprkg_manual_clean_area=clean(
                    calc_attribute_entry, 'roadway_noprkg_manual_clean_area'))
            calc_attribute.save()
            points = []
            for coordinates in entry.get('geometry').get('coordinates')[0]:
                point = Point.objects.create(
                    coordinates=f'{coordinates[0]};{coordinates[1]}')
                point.save()
                points.append(point)
            polygon = Polygon.objects.create()
            polygon.points.set(points)
            polygon.save()
            _object = Object.objects.create(
                name=entry.get('name'),
                owner_id=owner_id,
                calc_attribute=calc_attribute,
            )
            _object.geometry.set([polygon])
            _object.save()
import bigjson
from jsonpath_ng import parse

jsonpath_expr = parse('features[*].properties.test')

with open('huge.json', 'rb') as file:
    obj = bigjson.load(file)
    match = jsonpath_expr.find(obj)
    for k in match:
        print(k.value)

# for feature in obj['features']:
#     print(feature['properties']['MAPBLKLOT'])
# one time script to generate a list of keys to work with
# runtime was about 4 hours on my computer, key list file is ~250 MB
import datetime
import json

import bigjson

print(datetime.datetime.now())
print("getting keys")
with open("../../data/dnssec-resolver.lz4", "rb") as fin:
    data = bigjson.load(fin)
    # print(data["pdnssec-ba-29-211-894"].keys())
    key_list = list(data.keys())

print(datetime.datetime.now())
print("writing file")
with open("../../data/key_list.json", "w") as fout:
    json.dump(key_list, fout)

print(datetime.datetime.now())
print("done")
import bigjson
from sqlalchemy.orm import scoped_session, sessionmaker

from db import engine, User, Counts

db_session = scoped_session(sessionmaker(bind=engine))

if __name__ == "__main__":
    with open("user.json", "rb") as file:
        data = bigjson.load(file)
        for item in data:
            # Materialize the lazy bigjson object so it can be unpacked
            # with ** into the model constructors.
            item = item.to_python()
            counts = Counts(**item['counts'])
            user = User(counts, **item)
            db_session.add(counts)
            db_session.add(user)
            db_session.commit()
async def main():
    user = '******'
    password = '******'
    database = 'globi'
    host = 'geodb.cldo1feu6uzi.us-east-2.rds.amazonaws.com'
    props_names = [
        'locality', 'type', 'localityId', 'bodyPartId', 'lifeStageId',
        'bodyPartLabel', 'lifeStageLabel', 'externalUrl', 'kingdomId',
        'kingdomName', 'phylumId', 'phylumName', 'classId', 'className',
        'orderId', 'orderName', 'familyId', 'familyName', 'genusId',
        'genusName', 'speciesId', 'speciesName', 'externalId', 'name', 'rank'
    ]
    script, in_file = argv
    insert_sql = '''INSERT INTO observations (locality,type,localityId,bodyPartId,lifeStageId,bodyPartLabel,
        lifeStageLabel,externalUrl,kingdomId,kingdomName,phylumId,phylumName,classId,className,
        orderId,orderName,familyId,familyName,genusId,genusName,speciesId,speciesName,externalId,name,rank,geom)
        VALUES($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18,
        $19, $20, $21, $22, $23, $24, $25, $26::geometry)'''
    conn = await asyncpg.connect(user=user,
                                 password=password,
                                 database=database,
                                 host=host)
    await define_geom_type(conn)
    stmt = await conn.prepare(insert_sql)
    with open(in_file, 'rb') as f:
        data = bigjson.load(f)
        for n in data:
            d = n['n']
            default = {k: '' for k in props_names}
            # to_python() replaces the Python 2-only .iteritems() calls and
            # yields plain dicts that can be merged directly.
            props = {
                **d['loc']['properties'].to_python(),
                **d['obs']['properties'].to_python(),
                **d['sp']['properties'].to_python()
            }
            props = {**default, **props}
            # Build the row in the column order the INSERT expects rather
            # than in whatever order the dict happens to iterate, and avoid
            # shadowing the `data` object being iterated.
            row = tuple(props[k] for k in props_names)
            point = shapely.geometry.Point(props['longitude'],
                                           props['latitude'])
            row = row + (point, )
            if conn.is_closed():
                conn = await asyncpg.connect(user=user,
                                             password=password,
                                             database=database,
                                             host=host)
                await define_geom_type(conn)
                stmt = await conn.prepare(insert_sql)
            try:
                await stmt.executemany([row])
            except Exception as e:
                print(e)
                await writeLog(props)
    await conn.close()
import os

import psutil

import bigjson

process = psutil.Process(os.getpid())
suffixes = ['B', 'KB', 'MB', 'GB', 'TB', 'PB']


def humansize(nbytes):
    i = 0
    while nbytes >= 1024 and i < len(suffixes) - 1:
        nbytes /= 1024.
        i += 1
    f = ('%.2f' % nbytes).rstrip('0').rstrip('.')
    return '%s %s' % (f, suffixes[i])


def print_memory():
    print(humansize(process.memory_info().rss))


print_memory()  # 9.16 MB

with open('huge.json', 'rb') as f:  # file size 18.6 MB
    print_memory()  # 9.18 MB
    data = bigjson.load(f)  # renamed from `dict` to avoid shadowing the builtin
    print(data['type'])  # FeatureCollection
    print(data['features'][0]['properties']['MAPBLKLOT'])
    print_memory()  # 10.18 MB
import json

import bigjson

tweet_parts = 10_000
file_count = 5

with open('location_merged_unduplicated_sentiment.json', 'rb') as f:
    tweets = bigjson.load(f)
    part = []
    file_index = 1
    for index, tweet in enumerate(tweets, start=1):
        inner_tweet = {
            'user_id': tweet['user_id'],
            'user_name': tweet['name'],
            'text': tweet['tweet'],
            'tweet_id': tweet['tweet_id'],
            'favorites': tweet['favorites'],
            'retweets': tweet['retweets'],
            'created': tweet['created'],
            'verified': tweet['is_user_verified']
        }
        part.append(inner_tweet)
        if index % tweet_parts == 0:
            # Flush a full chunk to its own numbered file.
            with open('data-%d.json' % file_index, 'w') as outfile:
                json.dump(part, outfile)
            file_index += 1
            part = []
            print(index)
        if (file_index - 1) == file_count:
            break
def handle(self, *args, **options):
    with open('full_export.json', 'rb') as json_file, \
            open('parsed_dissected.json', 'rb') as json_file2:
        data = bigjson.load(json_file)
        for entry in data:
            budget_entry = None
            try:
                budget_entry = entry.get('contract').get('finances')
            except Exception:
                log('Budget F****D UP', False)
            date = None
            try:
                date = datetime(
                    int(budget_entry['budgetFunds']['stages'][0]['payments']
                        ['paymentYear']),
                    int(budget_entry['budgetFunds']['stages'][0]['payments']
                        ['paymentMonth']), 1)
            except Exception:
                pass
            try:
                budget_entry.get('budget').get('code')
            except Exception:
                log('Budget F****d Up', False)
                continue
            try:
                bg = Budget.objects.create(
                    code=budget_entry.get('budget').get('code'),
                    endDate=try_to_date(
                        budget_entry.get('budgetFunds')['stages'][0]
                        ['endDate'][:-1]),
                    KBK=budget_entry.get('budgetFunds')['stages'][0]
                    ['payments'].get('KBK'),
                    paymentSumRUR=try_to_num(
                        try_get_first(
                            budget_entry.get('budgetFunds').get('stages')).get(
                                'payments').get('paymentSumRUR'), "float"),
                    name=budget_entry.get('budget').get('name'),
                    paymentYearMonth=date)
                bg.save()
            except django.db.utils.IntegrityError:
                log('SAVE ID ERROR', False)
            contract_entry = entry.get('contract')
            try:
                ct = Contract.objects.create(
                    id=contract_entry.get('_id'),
                    contractUrl=contract_entry.get('contractUrl'),
                    documentBase=contract_entry.get('documentBase'),
                    startDate=datetime.fromisoformat(
                        contract_entry.get('execution').get('startDate')[:-1]),
                    endDate=datetime.fromisoformat(
                        contract_entry['execution']['endDate'][:-1]),
                    fz=try_to_num(contract_entry.get('fz'), "int"),
                    price=try_to_num(contract_entry.get('price'), "decimal"),
                    printFromUrl=contract_entry.get('printFormUrl'),
                    protocolDate=try_to_date(
                        contract_entry.get('protocolDate')),
                    publishDate=datetime.fromisoformat(
                        contract_entry.get('publishDate')),
                    signDate=datetime.fromisoformat(
                        contract_entry['signDate'][:-1]),
                    regionCode=try_to_num(contract_entry.get('regionCode'),
                                          "int"),
                    scanUrl=contract_entry['scan'][0]['url'],
                    budget=bg)
                customer_entry = contract_entry.get('customer')
                try:
                    cm = Customer.objects.create(
                        inn=try_to_num(customer_entry.get('inn'), "int"),
                        fullName=customer_entry.get('fullName'),
                        kpp=try_to_num(customer_entry.get('kpp'), "int"),
                        postalAddress=customer_entry.get('postalAddress'),
                        regNum=try_to_num(customer_entry.get('regNum'), "int"))
                    cm.save()
                    ct.customers.add(cm)
                except django.db.utils.IntegrityError:
                    log('THE SAME CUSTOMER', False)
                suppliers_entry = contract_entry.get('suppliers')
                for supplier_entry in suppliers_entry:
                    try:
                        sp = Supplier.objects.create(
                            inn=try_to_num(supplier_entry.get('inn'), "int"),
                            kpp=try_to_num(supplier_entry.get('kpp'), "int"),
                            factualAddress=supplier_entry.get('factualAddress'),
                            organizationName=supplier_entry.get(
                                'organizationName'),
                            singularName=try_get_get(
                                supplier_entry.get('legalForm'),
                                'singularName'),
                            middleName=try_get_get(
                                supplier_entry.get('contactInfo'),
                                'middleName'),
                            lastName=try_get_get(
                                supplier_entry.get('contactInfo'), 'lastName'),
                            firstName=try_get_get(
                                supplier_entry.get('contactInfo'),
                                'firstName'))
                        sp.save()
                        ct.suppliers.add(sp)
                    except django.db.utils.IntegrityError:
                        log('THE SAME SUPPLIER', False)
                ct.save()
                obj = Object.objects.create(
                    id=entry.get('_id'),
                    title=entry.get('title'),
                    region=try_to_num(entry.get('region'), "int"),
                    signDate=datetime.fromisoformat(
                        entry.get('signDate')[:-1]),
                    contract=ct)
                obj.save()
                products_entry = contract_entry.get('products')
                for product_entry in products_entry:
                    try:
                        # Use the current product, not the first one each
                        # iteration (the original called try_get_first here,
                        # which re-created the first product every pass).
                        pd = Product.objects.create(
                            sid=try_to_num(product_entry.get('sid'), "int"),
                            name=product_entry.get('name'),
                            price=try_to_num(product_entry.get('price'),
                                             "decimal"),
                            contract=ct)
                        pd.save()
                    except django.db.utils.IntegrityError:
                        log('THE SAME PRODUCT', False)
            except django.db.utils.IntegrityError:
                log('SAME ID!', False)
        log('THE END :)')

        data2 = bigjson.load(json_file2)['features']
        for entry in data2:
            coords = []
            for cor in entry.get('geometry').get('coordinates'):
                coords.append([cor[0], cor[1]])
            sc = Section.objects.create(
                km=try_to_num(entry.get('properties').get('km'), 'int'),
                section_start=try_to_num(
                    entry.get('properties').get('section_start'), 'int'),
                section_end=try_to_num(
                    entry.get('properties').get('section_end'), 'int'),
                coordinates=json.dumps(coords))
            contracts = entry['properties']['contract']
            for contract in contracts:
                try:
                    ct = Contract.objects.get(id=contract.get('id'))
                    sc.contracts.add(ct)
                except Contract.DoesNotExist:
                    # Referenced contract was never imported; skip the link.
                    pass
            sc.save()
            rd = Road.objects.create(
                type=entry.get('geometry').get('type'),
                road_title=entry.get('properties').get('road_title'))
            rd.sections.add(sc)
            rd.save()