def calculate_centroids(self):
    # Recompute all abstract centroids from scratch, removing any stale file first.
    if os.path.exists(self.centroids_file):
        os.remove(self.centroids_file)
    f = open(self.corpus_file, 'r')
    objects = ijson.items(f, 'articles.item')
    i = 0
    idmap = {}
    cent_array = []
    for article in objects:
        abstract_text = article["abstractText"]
        abstract_id = article["pmid"]
        text = article["title"] + " " + abstract_text
        centroid = get_centroid_idf(text, self.emb, self.idf,
                                    self.stopwords, self.dim)
        cent_array.append(np.array(centroid, dtype=np.float32))
        idmap[i] = abstract_id
        i += 1
    f.close()
    final_cent_array = np.array(cent_array, dtype=np.float32).reshape((i, self.dim))
    print final_cent_array.shape
    np.save(self.centroids_file, final_cent_array)
    fout = open(self.idmap_file, 'wb')
    pickle.dump(idmap, fout)
    fout.close()
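# Hedged sketch: one plausible shape for the get_centroid_idf helper used above.
# It assumes emb maps token -> vector of length dim and idf maps token -> float
# weight; this is an illustration under those assumptions, not the original code.
import numpy as np

def get_centroid_idf_sketch(text, emb, idf, stopwords, dim):
    centroid = np.zeros(dim, dtype=np.float32)
    total_weight = 0.0
    for token in text.lower().split():
        if token in stopwords or token not in emb:
            continue
        weight = idf.get(token, 1.0)
        centroid += weight * np.asarray(emb[token], dtype=np.float32)
        total_weight += weight
    if total_weight > 0:
        centroid /= total_weight
    return centroid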
def _lean_load_json_file(filepath):
    """Helper function to load json contents from a file using ijson."""
    LOG.debug("About to load %s." % filepath)
    fd = open(filepath, 'rb')
    gzipper = gzip.GzipFile(fileobj=fd)
    builds = ijson.items(gzipper, 'builds.item')
    ret = {'builds': []}
    try:
        # We are going to store only the information we need from builds-.js
        # and ignore the rest.
        ret['builds'] = [{
            'properties': {
                key: value
                for (key, value) in b["properties"].iteritems()
                if key in ('buildername', 'request_ids', 'revision',
                           'packageUrl', 'testPackagesUrl', 'testsUrl')
            },
            'request_ids': b['request_ids']
        } for b in builds]
    except IOError, e:
        LOG.warning(str(e))
        raise
    return ret
def _lean_load_json_file(filepath):
    """Helper function to load json contents from a file using ijson."""
    LOG.debug("About to load %s." % filepath)
    fd = open(filepath, 'rb')
    gzipper = gzip.GzipFile(fileobj=fd)
    builds = ijson.items(gzipper, 'builds.item')
    ret = {'builds': []}
    # We are going to store only the information we need from builds-.js
    # and drop the rest.
    ret['builds'] = [{
        "properties": {
            key: value
            for (key, value) in b["properties"].iteritems()
            if key in ('buildername', 'request_ids', 'revision',
                       'packageUrl', 'testPackagesUrl', 'testsUrl')
        },
        "request_ids": b["request_ids"]
    } for b in builds]
    fd.close()
    gzipper.close()
    return ret
def load_data(self):
    """Lazily stream items from data/music-sample.json using ijson."""
    f = open("data/music-sample.json", 'rb')
    data = ijson.items(f, "item")
    return data
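# Hedged usage sketch for the lazy loader above: the returned ijson generator
# parses items only as they are consumed, so large files never sit fully in
# memory. MusicCatalog is a hypothetical stand-in for the owning class.
import ijson

class MusicCatalog(object):
    def load_data(self):
        f = open("data/music-sample.json", 'rb')
        return ijson.items(f, "item")

catalog = MusicCatalog()
for record in catalog.load_data():
    print(record)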
def parser(files, accuracy):
    datapoints = []
    for filename in files:
        with open(filename, 'rb') as f:
            print "Parsing %s" % (filename)
            if args.fast:
                jsondata = json.load(f)
                data = jsondata["locations"]
            else:
                objects = ijson.items(f, 'locations.item')
                data = (o for o in objects)
            for entry in data:
                try:
                    if entry["accuracy"] > accuracy:
                        lat = str(float(entry["latitudeE7"]) / 1e7)
                        long = str(float(entry["longitudeE7"]) / 1e7)
                        location = (lat, long, 1)
                        if len(datapoints) > 2:
                            # make sure the list is long enough to bisect, then
                            # bisect based on the current location from google
                            lookuplocation = bisect.bisect_left(datapoints, location)
                            if lookuplocation + 1 < len(datapoints):
                                # not yet at the end of the data set
                                if datapoints[lookuplocation + 1][0] == lat:
                                    if datapoints[lookuplocation + 1][1] == long:
                                        # duplicate found: bump its counter instead of inserting
                                        datapoints[lookuplocation + 1] = (
                                            lat, long, datapoints[lookuplocation + 1][2] + 1)
                                    else:
                                        # longitude didn't match, insert as a new point
                                        datapoints.insert(lookuplocation, location)
                                else:
                                    # latitude didn't match, insert as a new point
                                    datapoints.insert(lookuplocation, location)
                            else:
                                # the data goes at the end, so append instead of inserting
                                datapoints.append(location)
                        else:
                            datapoints.append(location)
                except KeyError:
                    # entry contained no accuracy information, excluding
                    continue
    count, frequent = 0, []
    for data in datapoints:
        if data[2] > count:
            frequent = data
            count = frequent[2]
    return datapoints, frequent
def handle(self, *args, **options):
    self.groups = list(Groupe.objects.values_list('nom', flat=True))
    for source in args:
        with open(source, 'r') as f:
            for item in ijson.items(f, 'item'):
                if item['groupe'] in self.groups:
                    continue
                groupe, created = Groupe.objects.get_or_create(
                    nom=item['groupe'])
                self.groups.append(item['groupe'])
def run(self):
    with open(self.file, "r") as f:
        packets = ijson.items(f, 'item')
        batch = []
        for packet in packets:
            batch.append(packet)
            if len(batch) == self.batch_size:
                self.queue.put(batch)
                batch = []
        if len(batch) > 0:
            self.queue.put(batch)
    self.finished = True
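# Hedged consumer sketch for the batching reader above: drain the queue until
# the producer has flagged itself as finished and no batches remain. The reader
# argument is any object exposing the queue and finished attributes that run()
# relies on; everything else here is hypothetical.
import queue

def drain_batches(reader):
    while not (reader.finished and reader.queue.empty()):
        try:
            batch = reader.queue.get(timeout=0.5)
        except queue.Empty:
            continue
        for packet in batch:
            print(packet)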
def initialize(self):
    f = open(self.ret_file, 'r')
    data_q = json.load(f)
    abstracts_needed = set()
    for i in range(len(data_q["questions"])):
        abstracts_needed = abstracts_needed | set(data_q["questions"][i]["retrieved"])
    f.close()

    print "Collecting Abstracts.."
    f = open(self.corpus_file, 'r')
    corpus = ijson.items(f, 'articles.item')
    for article in corpus:
        pmid = article["pmid"]
        if pmid in abstracts_needed:
            self.corpus_index[pmid] = article["title"] + ' ' + article["abstractText"]
            abstracts_needed.remove(pmid)
            if not abstracts_needed:
                break
    f.close()
    print len(self.corpus_index)

    q_array_q = []
    q_array_d = []
    q_array_max = []
    print "Reranking.."
    n_questions = len(data_q["questions"])
    for i in range(n_questions):
        progress(i + 1, n_questions, 'questions')
        q_id = data_q["questions"][i]["id"]
        q_body = data_q["questions"][i]["body"]
        q_retrieved = data_q["questions"][i]["retrieved"]
        retr_array_q, retr_array_d, retr_array_max = self.rerank(q_body, q_retrieved)
        q_array_q.append(Question(q_body, q_id,
                                  [x[0] for x in retr_array_q],
                                  [x[1] for x in retr_array_q]))
        q_array_d.append(Question(q_body, q_id,
                                  [x[0] for x in retr_array_d],
                                  [x[1] for x in retr_array_d]))
        q_array_max.append(Question(q_body, q_id,
                                    [x[0] for x in retr_array_max],
                                    [x[1] for x in retr_array_max]))

    base = '.'.join(self.ret_file.split('.')[:-1])
    with open(base + '_RWMD_Q.json', 'w+') as outfile:
        outfile.write(json.dumps({"questions": [ob.__dict__ for ob in q_array_q]}, indent=2))
    with open(base + '_RWMD_D.json', 'w+') as outfile:
        outfile.write(json.dumps({"questions": [ob.__dict__ for ob in q_array_d]}, indent=2))
    with open(base + '_RWMD_MAX.json', 'w+') as outfile:
        outfile.write(json.dumps({"questions": [ob.__dict__ for ob in q_array_max]}, indent=2))
def load_json_from_file(file_path):
    """Loads JSON data from a file.

    The file at ``file_path`` is opened and its top-level JSON value is
    extracted with ijson.

    :param str file_path: Path to the JSON file.
    :returns: JSON data.
    :rtype: `dict`
    """
    with open(file_path, 'rb') as file:
        items_generator = ijson.items(file, '')
        list_items = [item for item in items_generator]
        json_dict = list_items[0]
    return json_dict
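# Hedged usage sketch: load_json_from_file() above materialises the whole
# top-level object, so it behaves much like json.load() while still going
# through ijson. The file name below is a hypothetical placeholder.
settings = load_json_from_file("settings.json")
print(sorted(settings.keys()))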
def extract_jD(inputFile):
    print('Loading data file...')
    f = open(inputFile, 'rb')
    valid = []  # This part appends surrounding evidences
    done = 0
    ignored = 0
    allJavaDocSeqs = []
    for program in ijson.items(f, 'programs.item'):
        javadoc = program['javaDoc']
        if (javadoc is None) or (len(javadoc) == 0):
            ignored += 1
        else:
            javadoc_list = javadoc.strip().split()
            # replace all non alphabetical chars with underscores
            javadoc_list = [re.sub("[^a-zA-Z]", '_', w) for w in javadoc_list]
            # break the terms using underscores
            tmp_list = []
            for t in javadoc_list:
                s = re.split("_+", t)
                tmp_list.extend(s)
            result_list = []
            for x in tmp_list:
                if len(x) > 1:
                    x = LEMMATIZER.lemmatize(x)
                    result_list.extend(camelCaseSplit(x))
            done += 1
            allJavaDocSeqs.append(result_list)
        print("Done with this many programs :: " + str(done + ignored), end='\r')
    f.close()
    print("Number of programs with javadoc :: " + str(done))
    print("Number of programs without javadoc :: " + str(ignored))
    return allJavaDocSeqs
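# Hedged sketch of the camelCaseSplit helper used above: one plausible way to
# break a camelCase/PascalCase token into lower-cased words. This is an assumed
# implementation, not the project's original helper.
import re

def camelCaseSplit_sketch(token):
    parts = re.findall(r'[A-Z]?[a-z]+|[A-Z]+(?![a-z])|\d+', token)
    return [p.lower() for p in parts] if parts else [token]

# e.g. camelCaseSplit_sketch('parseJSONFile') -> ['parse', 'json', 'file']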
def validate_catalog_datasets(agency_id, schema='DATASET_1.0'):
    agency = audit = tasks = resp = None
    with transaction.atomic():
        try:
            # Get agency
            agency = Agency.objects.get(id=agency_id)
        except Agency.DoesNotExist as e:
            logger.exception(e)
            raise e
    # Get schema info (schema path, dataset_prefix)
    schema_info = JSON_SCHEMAS.get(schema, None)
    with transaction.atomic():
        audit = Audit.objects.create(agency_id=agency_id,
                                     audit_type=Audit.DATA_CATALOG_VALIDATION)
    try:
        with closing(open_streaming_response('GET', agency.data_json_url)) as resp:
            # Use the schema dataset_prefix to get an iterator for the items
            # to be validated.
            objects = ijson.items(resp.raw, schema_info.get('dataset_prefix', ''))
            default_args = {
                'json_schema_name': schema,
                'source_url': agency.data_json_url
            }
            if audit:
                default_args.update({'audit_id': audit.id})
            # We're going to spin off async validation tasks, one per object.
            tasks = []
            for num, obj in enumerate(objects):
                args = default_args.copy()
                args.update({'json_object': obj, 'object_position': num})
                task = validate_json_object.apply_async(
                    args=(args, ), countdown=(num % COUNTDOWN_MODULO))
                tasks.append(task)
    except Exception as e:
        logger.exception(e)
    return tasks
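# Hedged sketch of the open_streaming_response helper used above, assuming it
# wraps the requests library and hands a file-like .raw stream to ijson. The
# real project helper may differ; this only illustrates the streaming idea.
import requests

def open_streaming_response_sketch(method, url):
    resp = requests.request(method, url, stream=True)
    resp.raise_for_status()
    # Ensure gzip/deflate content is decoded transparently when ijson reads .raw.
    resp.raw.decode_content = True
    return resp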
def handle(self, *args, **options):
    self.insert = options['insert']
    if not self.insert and Vote.objects.count() == 0:
        print '--insert not specified and no vote in db, forcing --insert'
        self.insert = True

    self.deputes = {}
    for depute in Depute.objects.values_list('nom', 'prenom', 'pk'):
        self.deputes[depute[0] + depute[1]] = depute[2]

    self.scrutins = {}
    for scrutin in Scrutin.objects.values_list('uri', 'pk'):
        self.scrutins[scrutin[0]] = scrutin[1]

    self.groupes = {}
    for groupe in Groupe.objects.values_list('pk', 'nom'):
        self.groupes[groupe[1]] = groupe[0]

    print 'loaded cache'
    votes = []
    for source in args:
        with open(source, 'r') as f:
            print 'opening json'
            items = ijson.items(f, 'item')
            print 'loaded json'
            for item in items:
                try:
                    votes.append(self.get_vote(item))
                except KeyError:
                    pass
                if not self.insert:
                    continue
                if len(votes) > 20000:
                    Vote.objects.bulk_create(votes)
                    votes = []
                    print 'flushing'
    if self.insert:
        Vote.objects.bulk_create(votes)
def _json_parse(stdout):
    try:
        results = list(ijson.items(stdout, ''))[0]
        return results
    except ijson.backends.python.UnexpectedSymbol:
        return []
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('location_file',
                        help='json extraction of your location')
    parser.add_argument(
        'home_coord',
        help='coordinate of home locations. (list of comma separated lat/lon)')
    parser.add_argument(
        'work_coord',
        help='coordinate of work locations. (list of comma separated lat/lon)')
    parser.add_argument(
        '--start_date',
        help='only parse data since this date. Format is YYYY/MM/DD')
    parser.add_argument('--tolerance', default=0.5, type=float,
                        help='default tolerance when detecting a location')
    args = parser.parse_args()

    start_date = args.start_date
    if start_date:
        try:
            start_date = datetime.strptime(start_date, '%Y/%m/%d')
        except ValueError:
            print('start date "{}" doesn\'t match the expected format: '
                  'YYYY/MM/DD'.format(start_date))
    start_date_ms = int(start_date.timestamp() * 1000) if start_date else 0

    # Lat, Lon
    home = [[float(c) for c in l.split(':')] for l in args.home_coord.split(',')]
    work = [[float(c) for c in l.split(':')] for l in args.work_coord.split(',')]
    distance_helper = DistanceHelper(home, work, args.tolerance)
    print('Distance between work and home: {}'.format(
        distance_helper.distance(home[0], work[0])))

    days_at_work = {}
    days_at_home = {}
    with open(args.location_file, 'rb') as loc:
        records = ijson.items(loc, 'locations.item')
        n = 0
        for rec in records:
            n += 1
            if n % 10000 == 0:
                print('At {}'.format(n))
            timestamp = int(rec['timestampMs'])
            if timestamp < start_date_ms:
                break
            location = (rec['latitudeE7'] / 10000000.,
                        rec['longitudeE7'] / 10000000.)
            timestamp = timestamp / 1000.
            d = datetime.fromtimestamp(timestamp)
            date_tuple = (d.year, d.month, d.day)
            if distance_helper.at_work(location):
                day = days_at_work.setdefault(date_tuple,
                                              DaySomewhere(MAX_DATE, MIN_DATE))
                if day.arrived.timestamp() > timestamp:
                    day.arrived = d
                if day.left.timestamp() < timestamp:
                    day.left = d
            elif distance_helper.at_home(location):
                day = days_at_home.setdefault(date_tuple,
                                              DaySomewhere(MAX_DATE, MIN_DATE))
                if day.arrived.timestamp() > timestamp and d.hour > 13:
                    day.arrived = d
                elif day.left.timestamp() < timestamp and d.hour < 13:
                    day.left = d
    print('Processed {} records'.format(n))
    print('worked for {} days'.format(len(days_at_work)))
    print('was home for {} days'.format(len(days_at_home)))

    # Print the work day length.
    worked_seconds = []
    for k in sorted(days_at_work.keys()):
        arrived = days_at_work[k].arrived
        left = days_at_work[k].left
        t = left - arrived
        worked_hours = int(t.total_seconds() / 3600)
        worked_minutes = int((t.total_seconds() % 3600) / 60)
        if worked_hours > 2:
            worked_seconds.append(t.total_seconds())
        print('{}: Worked {} hours and {} minutes, from {} to {}'.format(
            arrived.strftime('%a %b %d'), worked_hours, worked_minutes,
            arrived.strftime('%H:%M'), left.strftime('%H:%M')))
    average_seconds = sum(worked_seconds) / len(worked_seconds)
    print('On average: {} hours and {} minutes'.format(
        int(average_seconds / 3600), int((average_seconds % 3600) / 60)))

    # Print the commute length.
    print('\n\n\nCommute lengths:')
    commute_seconds = []
    for k in sorted(days_at_work.keys()):
        if k not in days_at_home:
            continue
        work = days_at_work[k]
        home = days_at_home[k]
        morning_commute = int((work.arrived - home.left).total_seconds() / 60)
        evening_commute = int((home.arrived - work.left).total_seconds() / 60)
        print('{}: Morning {}min, Evening {}min'.format(
            work.arrived.strftime('%a %b %d'), morning_commute, evening_commute))
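# Hedged sketch of a DistanceHelper compatible with the calls above
# (distance(), at_work(), at_home()). The haversine-based implementation and
# the kilometre units for the tolerance are assumptions, not the original code.
import math

class DistanceHelperSketch(object):
    def __init__(self, home_coords, work_coords, tolerance_km):
        self.home = home_coords
        self.work = work_coords
        self.tolerance = tolerance_km

    def distance(self, a, b):
        # great-circle distance in kilometres between two (lat, lon) pairs
        lat1, lon1, lat2, lon2 = map(math.radians, (a[0], a[1], b[0], b[1]))
        h = (math.sin((lat2 - lat1) / 2) ** 2 +
             math.cos(lat1) * math.cos(lat2) * math.sin((lon2 - lon1) / 2) ** 2)
        return 2 * 6371.0 * math.asin(math.sqrt(h))

    def at_work(self, location):
        return any(self.distance(location, w) <= self.tolerance for w in self.work)

    def at_home(self, location):
        return any(self.distance(location, h) <= self.tolerance for h in self.home)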
def _pkginfo_iterator():
    url = 'https://skimdb.npmjs.com/registry/_all_docs?include_docs=true'
    fh = urllib.urlopen(url)
    # temporary patch to use locally cached file
    # fh = open("../dataset/npm.cache/npm.json")
    return ijson.items(fh, 'rows.item')
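# Hedged usage sketch: each row streamed from the CouchDB _all_docs view above
# should carry the full package document under its 'doc' key because
# include_docs=true is requested; any field names beyond that are assumptions.
for row in _pkginfo_iterator():
    doc = row.get('doc', {})
    print(doc.get('name'), doc.get('dist-tags', {}).get('latest'))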
'''
Split apart test.json so I can run the damn thing on my computer.
Because apparently I don't have enough memory.
'''
import ijson.backends.yajl2 as ijson
import simplejson as json

jsonFile = open("data/test.json", "rb")
curSplit = 0
objBuffer = []

objs = ijson.items(jsonFile, "item")
for obj in objs:
    objBuffer.append(obj)
    if len(objBuffer) == 1000:
        with open("data/test." + str(curSplit) + ".json", "w") as outfile:
            json.dump(objBuffer, outfile, separators=(',', ':'))
        objBuffer = []
        curSplit += 1
        print("Wrote " + str(curSplit * 1000) + " records to disk")

# dump whatever is left
with open("data/test." + str(curSplit) + ".json", "w") as outfile:
    json.dump(objBuffer, outfile, separators=(',', ':'))
def parse(self, data):
    import ijson.backends.yajl2 as ijson

    bugs = ijson.items(data, 'result.bugs.item')
    return (self.service.item(service=self.service, bug=bug) for bug in bugs)
def _process_json(filename):
    if filename is None:
        return None

    cnt = 0
    selected = 0
    onlyArticles = True

    PO_NAME = 'wikidata.po'
    SAVE_INTERVAL = 1000
    PROCESS_NOF_ENTRIES = 2 * 1000 * 1000

    po_file = _create_empty_po_file()
    with open(filename, 'r') as json_data:
        items = ijson.items(json_data, 'item')
        for item in items:
            label = item.get('labels')
            if label is None:
                continue

            item_id = item['id']
            if onlyArticles is True:
                if item_id is None or item_id.startswith("Q") is False:
                    continue

            comment = u'Article {0}'.format(item_id)
            en_label = label.get('en')
            ca_label = label.get('ca')
            if en_label is None or ca_label is None:
                continue

            cnt = cnt + 1
            value = en_label['value']
            if _is_segment_valid(value) is False:
                continue

            exists = exists_in_tm(value)
            if exists > 0:
                selected = selected + 1
            else:
                continue

            entry = polib.POEntry(msgid=en_label['value'],
                                  msgstr=ca_label['value'],
                                  tcomment=comment + " (label)")
            _insert_entry_inpofile(po_file, entry)

            desc = item.get('descriptions')
            if desc is not None:
                en_desc = desc.get('en')
                ca_desc = desc.get('ca')
                if en_desc is not None and ca_desc is not None:
                    entry = polib.POEntry(msgid=en_desc['value'],
                                          msgstr=ca_desc['value'],
                                          tcomment=comment + " (description)")
                    _insert_entry_inpofile(po_file, entry)

            if cnt % SAVE_INTERVAL == 0:
                po_file.save(PO_NAME)

            if cnt > PROCESS_NOF_ENTRIES:
                break

    po_file.save(PO_NAME)
    print("Total entries: " + str(cnt))
    print("Selected: {0} (%{1})".format(str(selected),
                                        str(percentage(selected, cnt))))
for rep in range(times):
    try:
        counter = 0
        now = time.time()
        with open(sys.argv[3]) as fileobj:
            if parser.startswith('jsaone'):
                if parser == 'jsaone_cyt':
                    from jsaone_cyt import load
                elif parser == 'jsaone_py':
                    from jsaone_py import load
                elif parser == 'jsaone':
                    from jsaone import load
                for obj in load(fileobj):
                    counter += 1
            elif parser == 'json':
                from json import load
                for item in load(fileobj):
                    counter += 1
            elif parser == 'ijson':
                import ijson.backends.yajl2 as ijson
                for obj in ijson.items(fileobj, ''):
                    counter += 1
        print("Read %d objects in %f seconds with parser %s" %
              (counter, time.time() - now, parser))
    except Exception as exc:
        raise exc
with TestCase('ijson.yajl2_c', 'bytes', args.json_size, results):
    gen = io.BytesIO(jsondata.encode('utf-8'))
    parser = ijson_yajl2_c.items(gen, b'level1.level2.item')
    for n, item in enumerate(parser):
        assert (item['id'] == n)

with TestCase('ijson.yajl2_cffi', 'bytes', args.json_size, results):
    gen = io.BytesIO(jsondata.encode('utf-8'))
    parser = ijson_yajl2_cffi.items(gen, b'level1.level2.item')
    for n, item in enumerate(parser):
        assert (item['id'] == n)

with TestCase('ijson.yajl2', 'bytes', args.json_size, results):
    gen = io.BytesIO(jsondata.encode('utf-8'))
    parser = ijson_yajl2.items(gen, 'level1.level2.item')
    for n, item in enumerate(parser):
        assert (item['id'] == n)

with TestCase('ijson.python', 'str', args.json_size, results):
    gen = io.StringIO(jsondata)
    parser = ijson_python.items(gen, 'level1.level2.item')
    for n, item in enumerate(parser):
        assert (item['id'] == n)

print(
    tabulate(
        results,
        headers=['Facility', 'Type', 'Objects/sec'],
        stralign='right',
        # colalign=['left', 'center', 'right'],
    ))
def load_ijson(filepath, itempath):
    return ijson.items(open(filepath, "rb"), itempath)
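# Hedged usage sketch for the one-liner above: stream an array of objects
# without loading the file into memory. The path and item prefix below are
# hypothetical placeholders.
for obj in load_ijson("data/records.json", "records.item"):
    print(obj)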