def _get_subjects(f):
    """Yield one attribute dict per subject found under ``cudSubjects``.

    Args:
        f: File-like object holding the JSON document to stream.

    Yields:
        dict: Every ``remote`` name from ``cud_attributes`` mapped to the
        subject's value (``None`` when the subject does not carry it), with
        the ``cud_id`` entry converted to ``int``.
    """
    # Create the streaming parser exactly once; the previous version built a
    # second, never-consumed parser over the same file object.
    subjects = ijson.items(f, 'cudSubjects.item')
    for subject in subjects:
        attributes = {a.remote: None for a in cud_attributes}
        attributes.update(
            {a['name']: a['value'] for a in subject['attributes']})
        attributes[cud_id] = int(attributes[cud_id])
        yield attributes
def do_import(args, db): pdb = db() friend_name = args.friend_name friend = pdb.get_friend_by_name(friend_name) if not friend: print >> sys.stderr, "No friend by that name, check your spelling or create a new friend using add_friend" return False friend_id = friend['id'] print "Importing Authors" with open(args.file_name) as import_file: authors_to_insert = [] author_docs = ijson.items(import_file, 'authors.item') for author_doc in author_docs: authors_to_insert.append(author_doc) if len(authors_to_insert) >= INSERT_BATCH_SIZE: print "." pdb.load_author_documents_from_friend(friend_id, authors_to_insert) authors_to_insert = [] if authors_to_insert: pdb.load_author_documents_from_friend(friend_id, authors_to_insert) print "Importing Tomes" with open(args.file_name) as import_file: tomes_to_insert = [] tome_docs = ijson.items(import_file, 'tomes.item') for tome_doc in tome_docs: tomes_to_insert.append(tome_doc) if len(tomes_to_insert) >= INSERT_BATCH_SIZE: print "." pdb.load_tome_documents_from_friend(friend_id, tomes_to_insert) tomes_to_insert = [] if tomes_to_insert: pdb.load_tome_documents_from_friend(friend_id, tomes_to_insert)
def parser(base, objconf, skip, **kwargs):
    """ Parses the pipe content

    Args:
        base (str): The base currency (exchanging from)
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        assign (str): Attribute to assign parsed content (default:
            exchangerate)
        stream (dict): The original item

    Returns:
        Tuple(dict, bool): Tuple of (item, skip)

    Examples:
        >>> from riko import get_path
        >>> from riko.lib.utils import Objectify
        >>>
        >>> url = get_path('quote.json')
        >>> conf = {'url': url, 'currency': 'USD', 'sleep': 0, 'precision': 6}
        >>> item = {'content': 'GBP'}
        >>> objconf = Objectify(conf)
        >>> kwargs = {'stream': item, 'assign': 'content'}
        >>> result, skip = parser(item['content'], objconf, False, **kwargs)
        >>> result
        Decimal('1.545801')
    """
    if skip:
        # Nothing to compute: hand the original item straight back.
        return kwargs['stream'], skip

    if objconf.url.startswith('http'):
        fetch = partial(requests.get, stream=True)
        if objconf.memoize:
            fetch = utils.memoize(utils.HALF_DAY)(fetch)
        response = fetch(objconf.url, params=objconf.params)
        payload = next(items(response.raw, ''))
    else:
        context = utils.SleepyDict(delay=objconf.sleep)
        url = utils.get_abspath(objconf.url)
        try:
            with closing(urlopen(url, context=context)) as f:
                payload = next(items(f, ''))
        except TypeError:
            # Retry without the ``context`` keyword.
            with closing(urlopen(url)) as f:
                payload = next(items(f, ''))

    places = Decimal(10) ** -objconf.precision
    rates = parse_response(payload)
    rate = calc_rate(base, objconf.currency, rates, places=places)
    return rate, skip
def do_import_self(args, db): with open(args.file_name) as import_file: author_docs = ijson.items(import_file, 'authors.item') for author_doc in author_docs: print "Author: ", author_doc db().load_own_author_document(author_doc) with open(args.file_name) as import_file: tome_docs = ijson.items(import_file, 'tomes.item') for tome_doc in tome_docs: print "Tome: ", tome_doc db().load_own_tome_document(tome_doc)
def handle(self, *args: Any, **options: Any) -> None:
    """Compare two message dumps item by item and report content changes.

    Reads ``options['dump1']`` and ``options['dump2']`` in lockstep, stops at
    the first pair whose ids differ, and writes the number of compared and
    changed messages to stdout.
    """
    total_count = 0
    changed_count = 0
    with open(options['dump1'], 'r') as dump1, open(options['dump2'], 'r') as dump2:
        pairs = zip(ijson.items(dump1, 'item'), ijson.items(dump2, 'item'))
        for total_count, (m1, m2) in enumerate(pairs, start=1):
            if m1['id'] != m2['id']:
                self.stderr.write('Inconsistent messages dump')
                break
            if m1['content'] == m2['content']:
                continue
            changed_count += 1
            self.stdout.write('Changed message id: {id}'.format(id=m1['id']))
    self.stdout.write('Total messages: {count}'.format(count=total_count))
    self.stdout.write('Changed messages: {count}'.format(count=changed_count))
def annotate(self, string, bypass_exceptions=True):
    """Send ``string`` to the WSD service and return its factored output.

    Args:
        string: Sentence to analyse; it is lower-cased before being sent.
        bypass_exceptions: Not used by this method.

    Returns:
        The first value returned by ``read_wsd_output`` for the first
        document in the response. When the service answers with HTTP 500,
        every whitespace-separated token of ``string`` is returned with
        ``|-`` appended instead.

    Exits the process (``sys.exit``) on any other error, or when no response
    was obtained after 30 attempts.
    """
    params = urlencode([("document", string.lower()), ("id", 1)])
    url_string = "{}/disambiguate?{}".format(self.url, params)
    sys.stderr.write(url_string + "\n")
    efforts = 0
    response = None
    while response is None and efforts < 30:
        efforts += 1
        try:
            response = urllib2.urlopen(url_string).read()
        except Exception as e:
            message = str(e)
            if "[Errno 104]" in message:
                sys.stderr.write("Connection with WSD analyzer reset by peer. Trying again in a minute\n")
                sleep(60)
            elif "HTTP Error 500" in message:
                sys.stderr.write("WSD analyzer could not analyze this sentence and returned Error 500\n")
                return " ".join(["{}|-".format(t) for t in string.split()])
            else:
                sys.exit("Error while talking with WSD: {}".format(e))
    if response is None:
        sys.exit("WSD analyzer did not respond for more than 30 minutes")
    sys.stderr.write(response + "\n")

    # Wrap the response in a "documents" array before handing it to ijson.
    responsefile = StringIO.StringIO()
    try:
        responsefile.write('{"documents": [' + response + ' ]}')
        responsefile.seek(0)
        items = list(ijson.items(responsefile, "documents.item"))
        sys.stderr.write("items: {}\n".format(items))
        factored_output, _ = read_wsd_output(items[0], string)
    finally:
        # Release the buffer even when parsing or post-processing fails.
        responsefile.close()
    return factored_output
def write_data(dict_words):
    """Encode the posts as packed bit vectors and write train/eval sets.

    For every post that matches at least one entry of ``CATEGORIES`` a
    ``(labels, words)`` pair is built: ``labels`` has one bit per category,
    ``words`` one bit per vocabulary slot (the last slot marks words missing
    from ``dict_words``). Both are packed with ``np.packbits``. The shuffled
    pairs are split by ``BagOfWords.TRAIN_RATIO`` and written with
    ``_write_set``.

    Args:
        dict_words: Mapping from word to its vocabulary slot index.
    """
    items = []
    with open(POSTS_FILE_PATH, 'r') as f:
        for post in ijson.items(f, 'item'):
            label_bits = [
                int(any(hub in category for hub in post['hubs']))
                for category in CATEGORIES
            ]
            if not any(label_bits):
                # Post belongs to none of the categories: skip it.
                continue

            word_bits = [0] * BagOfWords.NUM_VOCABULARY_SIZE
            unknown_slot = BagOfWords.NUM_VOCABULARY_SIZE - 1
            for word in post['content'] + post['title']:
                slot = dict_words[word] if word in dict_words else unknown_slot
                word_bits[slot] = 1

            items.append((np.packbits(label_bits).tolist(),
                          np.packbits(word_bits).tolist()))

    shuffle(items)
    train_set_size = int(len(items) * BagOfWords.TRAIN_RATIO)
    _write_set(TF_ONE_SHOT_TRAIN_FILE_PATH, items[:train_set_size])
    _write_set(TF_ONE_SHOT_EVAL_FILE_PATH, items[train_set_size:])
    print('Set size : ', len(items))
    print('Train set size : ', train_set_size)
    print('Eval set size : ', len(items) - train_set_size)
def get_l1(ss_ontology):
    """Return the distinct ``level1`` values of an ontology file.

    Args:
        ss_ontology: Path of a JSON file whose top-level object has a
            ``data`` list of dicts.

    Returns:
        list: ``level1`` values in first-seen order, without duplicates.
    """
    # Open through a context manager so the handle is closed once the
    # document has been read (it used to be leaked).
    with open(ss_ontology) as ontology_file:
        for item in ijson.items(ontology_file, ''):
            l1 = []
            for x in item["data"]:
                if "level1" in x and x["level1"] not in l1:
                    l1.append(x["level1"])
            return l1
def get_l2(ss_ontology):
    """Return the distinct ``level2`` values of an ontology file.

    Args:
        ss_ontology: Path of a JSON file whose top-level object has a
            ``data`` list of dicts.

    Returns:
        list: ``level2`` values in first-seen order, without duplicates.
    """
    # Open through a context manager so the handle is closed once the
    # document has been read (it used to be leaked).
    with open(ss_ontology) as ontology_file:
        for item in ijson.items(ontology_file, ''):
            l2 = []
            for x in item["data"]:
                if "level2" in x and x["level2"] not in l2:
                    l2.append(x["level2"])
            return l2
def parse(self, destination_folder, fname, from_date, to_date):
    """Count the submissions of ``fname`` accepted by ``submission_filter``.

    Emits ``parsing_started`` first and always finishes by storing the count
    in ``self.nb_instances`` and emitting
    ``parsing_ended(success, nb_instances, error_message)``.

    Args:
        destination_folder: Output folder; created if missing.
        fname: Path of the JSON file (a top-level array) to read.
        from_date: Stored on ``self.from_date``.
        to_date: Stored on ``self.to_date``.
    """
    self.parsing_started.emit()
    self.fname = fname
    self.destination_folder = destination_folder
    Path(destination_folder).makedirs_p()
    self.from_date = from_date
    self.to_date = to_date
    nb_instances = 0
    success = False
    error_message = ""
    try:
        with open(self.fname, encoding="UTF-8", mode='r') as f:
            items = ijson.items(f, 'item')
            # Count lazily instead of materialising every accepted item.
            nb_instances = sum(
                1 for _ in filter(self.submission_filter, items))
    except IOError:
        error_message = "Impossible de lire le fichier."
    except (ValueError, ijson.JSONError):
        # ijson reports malformed JSON with its own JSONError, which is not
        # a ValueError, so it has to be listed explicitly to get this message.
        error_message = "Le fichier n'est pas un fichier JSON valide."
    except Exception as e:
        error_message = repr(e)
    else:
        success = True
    finally:
        self.nb_instances = nb_instances
        self.parsing_ended.emit(success, nb_instances, error_message)
def iter_data(self, fil, **kwargs):
    '''Iterate over the file and yield rows of location, parameter, date and value.

    Args:
        fil: File-like object with a JSON document containing ``features``.
        **kwargs: ``meetlocatie`` (a name or ``MeetLocatie``) and
            ``parameter`` (a name or ``Parameter``, defaulting to
            ``self.parm``) restrict the rows that are produced.

    Yields:
        tuple: ``(location code, parameter code, datetime, float value)``.
    '''
    from acacia.data.models import MeetLocatie, Parameter
    location = kwargs.get('meetlocatie', None)
    if location and isinstance(location, MeetLocatie):
        location = location.name
    parameter = kwargs.get('parameter', self.parm)
    if parameter and isinstance(parameter, Parameter):
        parameter = parameter.name
    for p in ijson.items(fil, 'features.item.properties'):
        loc = p['Meetpuntcode']
        if location and loc != location:
            continue
        par = p['Parametercode']
        if parameter and par != parameter:
            continue
        for d in p['data']:
            try:
                val = float(d['Waarde'])
                dat = datetime.datetime.strptime(d['datum'], '%Y-%m-%d')
            except (KeyError, TypeError, ValueError):
                # problem with datapoint: skip it
                continue
            # Yield outside the try block: a bare ``except`` around the yield
            # swallowed GeneratorExit, so closing the generator early raised
            # "generator ignored GeneratorExit".
            yield (loc, par, dat, val)
def build_fwdtable(V, E, tf, args):
    """Build the forwarding table described by the JSON stream ``tf``.

    Args:
        V: Iterable of vertices; each exposes ``vnum`` and ``ports``.
        E: Iterable of edges; each exposes ``enum``, ``v1``/``p1`` and
            ``v2``/``p2``.
        tf: File-like object holding a JSON array of subtables.
        args: Not used by this function.

    Returns:
        Fwdtable: maps a vertex to a dict keyed by ``(dst, ctreei)`` whose
        values are lists of ``((utreei, edge), (port, mark_failed))``.
    """
    # Lookup tables from the numeric ids used in the JSON to the objects.
    vertex_by_num = {v.vnum: v for v in V}
    edge_by_num = {e.enum: e for e in E}

    fwdtable = Fwdtable()
    #TODO: If this actually takes too much time, we need a parser from
    # ijson.parse
    for subtable in ijson.items(tf, "item"):
        for vnum in subtable:
            v = vertex_by_num[int(vnum)]
            if v not in fwdtable:
                fwdtable[v] = {}
            for (dsti, ctreei, utreei, ei), (port, mark_failed) in subtable[vnum]:
                dst = vertex_by_num[int(dsti)]
                e = edge_by_num[int(ei)]

                #XXX: TODO: fail less extremely
                assert ((e.v1 == v and e.p1 == port) or
                        (e.v2 == v and e.p2 == port))
                assert port in v.ports

                routes = fwdtable[v]
                if (dst, ctreei) not in routes:
                    routes[dst, ctreei] = []
                routes[dst, ctreei].append(((utreei, e), (port, mark_failed)))
    return fwdtable
def main(argv): vocabDict = {} labelDict = {} with open("labelIndex.json") as f: labelDict = json.load(f) numTokens = len(labelDict) print numTokens with open('2mdumpR.json') as f: i = 0 articles = ijson.items(f, 'item') finalRow = [] finalCol = [] for article in articles: if i%1000 == 0: print i row = [] col = [] labels = article['meshMajor'] for label in labels: if labelDict.has_key(label): row.append(i) col.append(labelDict[label]) finalRow.extend(row) finalCol.extend(col) i += 1 # if (i+1)%250000 == 0: # break; freq = np.ones(len(finalRow)) output = csr_matrix((freq, (finalRow, finalCol)), shape = (2000000, 26840)) with open('labelMat.pkl', 'wb') as fp: cPickle.dump(output, fp, -1)
def main():
    """Upload the ``docs`` array of a JSON file to a database in batches.

    Expects two command-line arguments: the path of the bulk JSON file and
    the database URL. Documents are serialised one by one and passed to
    ``send_req`` in groups of ``bulksize``.
    """
    if len(sys.argv) != 3:
        print('usage: {0} bulk.json http://localhost:5984/my_db'.format(sys.argv[0]))
        return
    FILE = sys.argv[1]
    DB_NAME = sys.argv[2]
    bulksize = 10000
    batch_count = 0
    docs = []
    # The input used to be opened without ever being closed.
    with open(FILE, 'r') as f:
        for doc in items(f, 'docs.item'):
            docs.append(json.dumps(doc, use_decimal=True))
            if len(docs) == bulksize:
                send_req(DB_NAME, docs)
                docs = []
                print('finished batch: {0}'.format(batch_count))
                batch_count += 1
    # send remaining docs (if any)
    if docs:
        send_req(DB_NAME, docs)
def process_provider_into_es(fname, es, conn): status = False with open(fname, 'r') as infile: actions = [] try: for doc in ijson.items(infile, "item"): if doc['type'] == 'INDIVIDUAL': action = { "_index": "data", "_type": "provider", "_source": doc } else: action = { "_index": "data", "_type": "facility", "_source": doc } actions.append(action) if len(actions) > 0 and len(actions) % 50 == 0: helpers.bulk(es, actions) status = True actions = [] except (KeyboardInterrupt, SystemExit): conn.rollback() raise except (UnicodeDecodeError, ValueError, ijson.JSONError): print "{0}\n".format(str(ex)) return status
def run():
    """Removes all content from database and creates new tables

    Streams the records found at ``DATA_LOCATION`` in the document served at
    ``DATA_URL`` and inserts them in chunks, stopping once ``ROW_LIMIT``
    records (if set) have been inserted.
    """
    limit = 0

    with app.app_context():
        f = urlopen(app.config["DATA_URL"])
        try:
            objects = items(f, app.config["DATA_LOCATION"])
            row_limit = app.config["ROW_LIMIT"]
            # Use a numeric infinity: comparing the string "inf" with an int
            # raises TypeError on Python 3.
            chunk_size = min(row_limit or float("inf"), app.config["CHUNK_SIZE"])
            debug = app.config["DEBUG"]

            if app.config["TESTING"]:
                createdb()

            for records in utils.chunk(objects, chunk_size):
                count = len(records)
                limit += count
                flattened = [dict(utils.flatten_fields(r)) for r in records]

                if debug:
                    print ("Inserting %s records into the database..." % count)
                    # pprint(flattened)

                db.engine.execute(Data.__table__.insert(), flattened)

                if row_limit and limit >= row_limit:
                    break
        finally:
            # The response used to be left open.
            f.close()

        if debug:
            print ("Successfully inserted %s records into the database!" % limit)
def read_json(filename, mode='rt', encoding=None, prefix=''):
    """
    Stream the JSON values of a file that match ``prefix``.

    Handy for walking a large JSON array one element at a time
    (``prefix='item'``) or one sub-field at a time
    (``prefix='item.fieldname'``).

    Args:
        filename (str): /path/to/file on disk from which json items will be
            streamed, such as items in a JSON array; for example::

                [
                    {"title": "Harrison Bergeron", "text": "The year was 2081, and everybody was finally equal."},
                    {"title": "2BR02B", "text": "Everything was perfectly swell."}
                ]

        mode (str, optional): mode passed to ``io.open``
        encoding (str, optional): encoding passed to ``io.open``
        prefix (str, optional): if '', the whole JSON document is loaded at
            once; if 'item', each element of a top-level array is produced
            in turn; if 'item.text', each element's 'text' value is produced
            in turn

    Yields:
        next matching JSON value; could be a dict, list, int, float, str,
        depending on the value of ``prefix``

    Notes:
        Refer to ``ijson`` at https://pypi.python.org/pypi/ijson/ for usage
        details.
    """
    with io.open(filename, mode=mode, encoding=encoding) as f:
        if prefix != '':
            for matched in ijson.items(f, prefix):
                yield matched
        else:
            yield json.load(f)
def _LoadHistogramList(input_file):
  """Incrementally decode a JSON array of histograms into a list.

  The elements of the top-level array in input_file are parsed one at a time
  and every Decimal found inside them is converted to a float.

  Returns
    A list() of the decoded elements of input_file.

  Raises
    ValueError if the content of input_file is not valid JSON.
  """

  def NormalizeDecimals(obj):
    # Traverse every object in obj to turn Decimal objects into floats.
    if isinstance(obj, decimal.Decimal):
      return float(obj)
    if isinstance(obj, dict):
      for key in obj:
        obj[key] = NormalizeDecimals(obj[key])
    elif isinstance(obj, list):
      obj = [NormalizeDecimals(element) for element in obj]
    return obj

  try:
    with timing.WallTimeLogger('json.load'):
      objects = []
      for histogram in ijson.items(input_file, 'item'):
        objects.append(NormalizeDecimals(histogram))
  except ijson.JSONError as e:
    # Wrap exception in a ValueError
    raise ValueError('Failed to parse JSON: %s' % (e))

  return objects
def _get_dataset_indexes():
    """Randomly split the usable posts into train and eval index sets.

    A post is usable when at least one of its hubs appears in an entry of
    ``CATEGORIES``. Each usable post goes to the eval set when ``random()``
    exceeds ``TRAIN_RATIO``, otherwise to the train set. With ``DEBUG`` set,
    reading stops after 101 usable posts.

    Returns:
        tuple: ``(train_indexes, eval_indexes)``, two sets of post numbers.
    """
    train_indexes = set()
    eval_indexes = set()
    with open(POSTS_FILE_PATH, 'r') as f:
        accepted = 0
        for post_num, post in enumerate(ijson.items(f, 'item')):
            matches_category = any(
                hub in category
                for hub in post['hubs']
                for category in CATEGORIES)
            if not matches_category:
                continue
            if random() > TRAIN_RATIO:
                eval_indexes.add(post_num)
            else:
                train_indexes.add(post_num)
            if DEBUG:
                accepted += 1
                if accepted > 100:
                    break
    accepted_posts_num = len(eval_indexes) + len(train_indexes)
    # Guard against an empty selection, which used to raise
    # ZeroDivisionError here.
    if accepted_posts_num:
        eval_ratio = len(eval_indexes) / float(accepted_posts_num)
    else:
        eval_ratio = 0.0
    print('eval data set ratio', eval_ratio)
    return train_indexes, eval_indexes
def handle(self, *args, **options): print "Clean old downloaded files" os.system("rm %s %s" % (join(settings.MEMOPOL_TMP_DIR, "ep_votes.json"), join(settings.MEMOPOL_TMP_DIR, "ep_votes.json.xz"))) print "Download vote data from parltrack" os.system("wget -O %s http://parltrack.euwiki.org/dumps/ep_votes.json.xz" % join(settings.MEMOPOL_TMP_DIR, "ep_votes.json.xz")) print "unxz it" os.system("unxz %s" % join(settings.MEMOPOL_TMP_DIR, "ep_votes.json.xz")) print "cleaning old votes data..." connection.cursor().execute("DELETE FROM votes_recommendationdata") transaction.commit_unless_managed() print RecommendationData.objects.count() print "read file" a = 1 with transaction.commit_on_success(): # I need to parse the json file by hand, otherwise this eat way to much memory for vote in ijson.items(open(join(settings.MEMOPOL_TMP_DIR, "ep_votes.json")), 'item'): RecommendationData.objects.create(proposal_name=vote.get("report", vote["title"]), title=vote["title"], data=dumps(vote, indent=4), date=parse(vote["ts"])), reset_queries() sys.stdout.write("%s\r" % a) sys.stdout.flush() a += 1 sys.stdout.write("\n")
def main():
    """Extract text from every abstract listed in the input JSON file.

    ``sys.argv[1]`` holds the input file and the output directory separated
    by whitespace. The worker count is the second field of the first line of
    the file named by the ``PE_HOSTFILE`` environment variable.
    """
    argv = sys.argv[1].split()
    inFile = argv[0]
    outDir = argv[1]

    # Read the host file through a context manager so it is closed again.
    with open(os.environ['PE_HOSTFILE']) as hostfile:
        cpu_count = int([node.split()[1] for node in hostfile][0])

    pool = mp.Pool(cpu_count)
    logger.info("using no. of CPUs: {0}...".format(cpu_count))

    manager = mp.Manager()
    q = manager.Queue()

    abstracts = []
    with open(inFile, 'r') as data:
        for abstract in ijson.items(data, "item"):
            abstracts.append((abstract["filepath"], abstract["pmid"]))
    logger.info("extracting text from {} abstracts...".format(len(abstracts)))

    # The listener is started first; it is handed the queue and output dir.
    watcher = pool.apply_async(listener, (q, outDir,))
    jobs = [pool.apply_async(worker, (filepath, pmid, q))
            for filepath, pmid in abstracts]

    # Wait for every worker before telling the listener to stop.
    for job in jobs:
        job.get()

    logger.info("adding kill to queue...")
    q.put(["KILL"])
    pool.close()
    pool.join()
    logger.info("Job complete.")
def test_items(self):
    """``items`` yields every ``meta`` value of the ``docs`` array in order."""
    expected = [
        [[1], {}],
        {'key': 'value'},
        None,
    ]
    meta = list(items(StringIO(JSON), 'docs.item.meta'))
    self.assertEqual(meta, expected)
def run():
    """Populates db with most recent data

    Streams the records found at ``DATA_LOCATION`` in the document served at
    ``DATA_URL`` and inserts them in chunks, stopping once ``ROW_LIMIT``
    records (if set) have been inserted.
    """
    limit = 0

    with app.app_context():
        f = urlopen(app.config['DATA_URL'])
        try:
            objects = items(f, app.config['DATA_LOCATION'])
            row_limit = app.config['ROW_LIMIT']
            # Use a numeric infinity: comparing the string 'inf' with an int
            # raises TypeError on Python 3.
            chunk_size = min(row_limit or float('inf'), app.config['CHUNK_SIZE'])
            debug = app.config['DEBUG']

            if app.config['TESTING']:
                createdb()

            for records in utils.chunk(objects, chunk_size):
                count = len(records)
                limit += count
                flattened = [dict(utils.flatten_fields(r)) for r in records]

                if debug:
                    print('Inserting %s records into the database...' % count)
                    # pprint(flattened)

                db.engine.execute(Data.__table__.insert(), flattened)

                if row_limit and limit >= row_limit:
                    break
        finally:
            # The response used to be left open.
            f.close()

        if debug:
            print('Successfully inserted %s records into the database!' % limit)
def generate_mols_from_json(json):
    """Lazily yield one parsed molecule per element of a top-level JSON array.

    Args:
        json: File-like object containing the JSON array.

    Yields:
        The result of ``parse_mol_json`` for each array element.
    """
    for raw_mol in items(json, "item"):
        yield parse_mol_json(raw_mol)
def exec_worker_map_filter(self, endpoint, args, request):
    """Proxy a search request and stream the mapped results back.

    The incoming request is replayed against ``self.url`` (forwarding the
    ``Authorization`` header when present) and the items of the JSON
    response are passed through ``process_by_client`` before being returned
    as a streamed ``application/json`` response.

    Raises:
        APIException: if ``endpoint`` is not ``'search'`` or the external
            service answers with a non-OK status.
    """
    if endpoint != 'search':
        raise APIException("service of type 'map_filter' does "
                           "not support /list")

    use_tls1 = is_https(self.url) and request.method == 'GET'
    method = tls1_get if use_tls1 else getattr(requests,
                                               request.method.lower())

    try:
        headers = {'Authorization': request.headers['Authorization']}
    except KeyError:
        headers = {}

    response = method(self.url,
                      params=request.args,
                      headers=headers,
                      stream=True)
    if not response.ok:
        raise APIException('response from external service: {}'
                           .format(response))

    path = '.'.join(filter(None, [self.json_path, 'item']))
    results = ijson.items(FileLikeWrapper(response), path)
    body = result_generator(process_by_client(self, results), lambda: {})
    return Response(body, mimetype='application/json')
def main(filepath, outputpath): include_from = 'Gene Kogan <*****@*****.**>' exclude_to = 'GeneKogan<*****@*****.**>' output_file = open(outputpath, 'w') input_file = open(filepath) # num_items = len(ijson.items(input_file, 'item')) num_found_items = 0 errors = 0 for (i,t) in enumerate(ijson.items(input_file, 'item')): if (i % 100 == 0): print "try email %d, found %d so far" % (i, num_found_items) try: if t['From'] == include_from and str(t['To'][0]) != exclude_to: for p in t['parts']: if p['contentType']=='text/plain': content = parse_email_content(t["parts"][0]["content"]) output_file.write(removeNonAscii(content)) output_file.flush() num_found_items += 1 except: print "Ooops, error "+str(errors)+"...." errors += 1 output_file.close() input_file.close()
def call(self):
    """
    Query the cghub server and stream the results.

    For the JSON format the raw ``response.docs`` items are yielded. For XML
    every ``doc`` element is turned into a ``Result`` and passed through
    ``patch_result``, and ``self.hits`` is set from the ``numFound``
    attribute of the ``result`` element.
    """
    self.patch_input_data()
    query = self.build_query()
    url = '%s%s' % (self.server_url, self.uri)
    if query:
        url = '%s?%s' % (url, query)
    xml = self.get_source_file(url)

    if self.format == self.FORMAT_JSON:
        for item in ijson.items(xml, 'response.docs.item'):
            yield item
        return

    # http://docs.python.org/dev/library/xml.dom.pulldom.html
    doc = pulldom.parse(xml)
    for event, node in doc:
        if event != pulldom.START_ELEMENT:
            continue
        tag = node.tagName
        if tag == 'doc':
            doc.expandNode(node)
            # convert to python object
            # http://docs.python.org/2/library/xml.etree.elementtree.html
            result_xml = node.toxml(encoding='utf-8')
            result = Result(ElementTree.fromstring(result_xml))
            yield self.patch_result(result, result_xml)
        elif tag == 'result':
            self.hits = int(node.getAttribute('numFound'))
def executer(self, *args):
    """Execute remotely

    Posts ``args[1:]`` as JSON to the daemon URL selected by
    ``self.options`` and yields each item of the JSON array in the response,
    converted with ``from_json``.

    Raises:
        The exception decoded from the body of an HTTP error response.
        Other errors are logged and not propagated.
    """
    options = self.options
    try:
        # from dbnav import daemon
        # if not daemon.is_running(options):
        #     daemon.start_server(options)
        url = 'http://{host}:{port}/{path}'.format(
            host=options.host,
            port=options.port,
            path=COMMANDS[options.prog])
        request = json.dumps(args[1:])
        log.logger.debug('Request to %s:\n%s', url, request)
        response = urllib2.urlopen(url, request)
        for i in ijson.items(response, 'item'):
            yield from_json(i)
    except GeneratorExit:
        # Closing the generator early is normal; without this clause the
        # BaseException handler below logged it as an error.
        raise
    except urllib2.HTTPError as e:
        raise from_json(json.load(e))
    except urllib2.URLError as e:
        log.logger.error('Daemon not available: %s', e)
    except BaseException as e:
        log.logger.exception(e)
def _get_labels(train_indexes, eval_indexes):
    """Build the category label vectors for the train and eval posts.

    Args:
        train_indexes: Post numbers belonging to the train set.
        eval_indexes: Post numbers belonging to the eval set.

    Returns:
        tuple: ``(train_labels, eval_labels)``; every label is a list with
        one entry per ``CATEGORIES`` element, set to 1 when one of the
        post's hubs appears in that category.
    """
    train_labels, eval_labels = [], []
    seen = 0
    with open(POSTS_FILE_PATH, 'r') as f:
        for post_num, post in enumerate(ijson.items(f, 'item')):
            if DEBUG:
                seen += 1
                if seen > 100:
                    break

            in_train = post_num in train_indexes
            if not (in_train or post_num in eval_indexes):
                continue

            label = [0.] * len(CATEGORIES)
            for category_num, category in enumerate(CATEGORIES):
                if any(hub in category for hub in post['hubs']):
                    label[category_num] = 1

            target = train_labels if in_train else eval_labels
            target.append(label)
    return train_labels, eval_labels
def search_and_match(coord_index):
    """Stream the reads for one coordinate and match them against its reports.

    Reads are fetched with ``read_search`` for ``coordinates[coord_index]``.
    Reports fully covered by a read are tested with ``match``; matches are
    appended to the module-level ``matched_reports``. Reads are saved through
    ``CachedReads.save``. Iteration stops once every report for this
    coordinate has been visited.

    NOTE(review): the nesting of the caching statements and of the final
    ``break`` was reconstructed from a flattened source line - confirm it
    against the original file.
    """
    coord = coordinates[coord_index]
    result = read_search(coord, coord_index)
    reads = ijson.items(HTTPStream(result), 'reads.item')
    reports = coord_indices[coord_index]
    # Ids of the reports already covered by some read.
    reports_visited = []
    for read in reads:
        # Span of the read on the reference: start position plus the
        # reference length derived from its CIGAR string.
        read_coord = Coordinate(read['referenceSequenceName'],
                                read['position'],
                                read['position']+get_ref_length(read['cigar']))
        # Reports on the same chromosome that lie within the read's span.
        covered_reports = [report for report in reports
                           if (read_coord.chrom == report['chrom'] and
                               read_coord.start <= report['seqStart'] and
                               read_coord.end > report['seqEnd'])]
        if covered_reports:
            # Only reports not seen before are tested against this read.
            new_matched_reports = [report for report in covered_reports
                                   if (report['reportId'] not in reports_visited and
                                       match(report, read))]
            matched_reports.extend(new_matched_reports)
            reports_visited.extend([report['reportId'] for report in covered_reports
                                    if report['reportId'] not in reports_visited])
        #push read into cache:
        read['repository'] = report_set['repository']
        read['readsetId'] = report_set['readsetId']
        read['start'] = read['position']
        read['end'] = read['position'] + get_ref_length(read['cigar'])
        CachedReads.save(read)
        # Every report has been covered at least once: nothing left to do.
        if len(reports_visited) >= len(reports):
            break
def load_and_write_content(filename, filename2):
    """Write the title and text of every item in ``filename`` to ``filename2``.

    Args:
        filename: Path of the input JSON file (a top-level array of objects
            with ``title`` and ``text``).
        filename2: Path of the UTF-8 text file to write.
    """
    count = 0
    # Both files are managed by ``with`` so the output is flushed and closed
    # even when an item is malformed; the local no longer shadows ``file``.
    with codecs.open(filename2, 'w', encoding='utf-8') as out, \
            open(filename, 'r') as fd:
        for item in ijson.items(fd, 'item'):
            count += 1
            out.write('[[제목]]: ')
            out.write(item['title'])
            out.write('\n')
            out.write('[[내용]]: \n')
            out.write(item['text'])
            out.write("\n")
    print('contents count=', count)
def any2dict(f, ext='xml', html5=False, path=None):
    """Load the content found at ``path`` of an XML, HTML or JSON source.

    Args:
        f: Source handed to ``xml2etree`` or ``items``.
        ext: One of ``'xml'``, ``'html'`` or ``'json'``.
        html5: Forwarded to ``xml2etree``.
        path: Dotted path of the wanted element; empty means the root.

    Returns:
        The first matching JSON value, or the dict built from the matching
        tree.

    Raises:
        TypeError: if ``ext`` is not a supported type.
    """
    path = path or ''

    if ext == 'json':
        return next(items(f, path))

    if ext not in {'xml', 'html'}:
        raise TypeError('Invalid file type %s' % ext)

    root = xml2etree(f, ext == 'xml', html5).getroot()
    # The dotted path becomes a slash-separated xpath.
    replaced = '/'.join(path.split('.'))
    tree = next(xpath(root, replaced)) if replaced else root
    return etree2dict(tree)
def parse_json_items(self, tag, limit=0):
    """Collect the values matching ``tag`` from the start of the file.

    Args:
        tag: ijson prefix selecting the values to collect.
        limit: Maximum number of values to read; 0 means no limit.

    Returns:
        list: The collected values, each round-tripped through
        ``json.dumps``/``json.loads``.
    """
    self.__items = []
    self.__file.seek(0)
    for cnt, obj in enumerate(ijson.items(self.__file, tag), start=1):
        serialized = json.dumps(obj, sort_keys=True, indent=4,
                                ensure_ascii=True)
        self.__items.append(json.loads(serialized))
        if limit != 0 and cnt >= limit:
            break
    return self.__items
def get_main_blocks(ast):
    """Return id, name and src of the children of the second top-level node.

    Args:
        ast: Path of the JSON file whose top-level ``children`` is read.

    Returns:
        list: One ``{"id", "name", "src"}`` dict per entry in the
        ``children`` of element 1 of each matched ``children`` value.
    """
    cont_main_blks = []
    with open(ast, 'r') as f:
        for child in ijson.items(f, 'children'):
            node = dict(child[1])
            cont_main_blks.extend(
                {"id": sub['id'], "name": sub['name'], "src": sub['src']}
                for sub in node['children']
            )
    return cont_main_blks
def migrate_data(args, table, config):
    """Migrate the rows of ``<data_dir>/<table.name>.json`` using a worker pool.

    Every row of ``results`` is processed by ``handle_rows`` in a
    ``multiprocessing.Pool``; dict results go to ``handle_fault_data``, all
    others to ``insert_data``.
    """
    global gargs
    gargs = args
    change_table_info(table, config)
    with open(os.path.join(args.data_dir, "%s.json" % table.name)) as f:
        rows = ijson.items(f, "results.item")
        pool = multiprocessing.Pool()
        try:
            for row in pool.imap(handle_rows, rows):
                if isinstance(row, dict):
                    handle_fault_data(row)
                else:
                    insert_data(row)
        except BaseException:
            # Stop outstanding work right away on failure.
            pool.terminate()
            raise
        else:
            pool.close()
        finally:
            # The pool used to be abandoned without close/join, leaving the
            # worker processes behind.
            pool.join()
def parse_data_to_case_class(input):
    """Load every conversation of ``<data_path>.json`` into objects.

    Args:
        input: Mapping providing ``data_path`` (file path without the
            ``.json`` extension).

    Returns:
        list: One ``Conversation.Conversation`` per conversation, each
        holding its ``Message.Message`` objects.
    """
    json_path = input["data_path"] + ".json"
    conversations = []
    with open(json_path, encoding="utf8") as data:
        print("Successfully opened " + json_path + "...")
        for conversation in ijson.items(data, 'conversations.conversation.item'):
            conversation_id = conversation["@id"]
            messages = [
                Message.Message(entry["author"], entry["time"], str(entry["text"]))
                for entry in conversation["message"]
            ]
            conversations.append(
                Conversation.Conversation(conversation_id, messages))
    return conversations
def _getEsriRESTJSON(self, url, params, attempt=1, useIjson=False, debug=None):
    """Helper function to query an Esri REST endpoint and return json

    Args:
        url: Endpoint URL.
        params: Query parameters; ``token`` is added when the object has one.
        attempt: Number of the current attempt (1-based); at most 5 are made.
        useIjson: When true, return a lazy ``ijson`` iterator over
            ``features`` instead of the decoded document.
        debug: Optional object with a ``log`` method.

    Raises:
        MapServiceError: after five failed attempts.
    """
    #Wait five seconds if previous error
    if attempt > 1 and attempt != 6:
        time.sleep(5)

    #Set token if registered with object
    if self.token is not None:
        params['token'] = self.token

    if attempt > 5:
        if debug:
            debug.log("Too many attempts")
        raise MapServiceError("Error Accessing Map Service " + self.url)

    #all other attempts...
    data = urllib.urlencode(params)
    req = urllib2.Request(url, data)
    try:
        response = urllib2.urlopen(req)
    except httplib.BadStatusLine as e:
        if debug:
            # The attempt number used to be left unformatted in this message.
            debug.log("Bad Status Line at attempt %d" % attempt)
        return self._getEsriRESTJSON(url, params, attempt + 1, useIjson=useIjson, debug=debug)
    except urllib2.HTTPError as e:
        if debug:
            # "%n" is not a valid conversion and raised ValueError here.
            debug.log("HTTP Error at attempt %d: sleeping" % attempt)
        return self._getEsriRESTJSON(url, params, attempt + 1, useIjson=useIjson, debug=debug)

    if useIjson:
        if debug:
            debug.log("Using ijson")
        return ijson.items(response, "features.item")

    final = json.loads(response.read())
    if 'error' in final.keys():
        if debug:
            debug.log("Error in json loads " + str(final))
        return self._getEsriRESTJSON(url, params, attempt + 1, debug=debug)
    return final
def readJsonInput(self, test):
    """Return the JSON text of the section ``test`` of ``self.jsonFile``.

    Only the values matching the prefix ``test`` are read, to avoid loading
    a very large JSON file into memory.

    Returns:
        str: The concatenated ``json.dumps`` of every matching value; on
        error, whatever had been read before the failure (possibly empty).
    """
    # Collected outside the try block so the return value is always defined.
    fragments = []
    try:
        # load test specific Dictionary, using Key = func
        # this is to avoid loading very large JSON in memory
        log.debug(" Read JSON Section: " + test)
        with open(self.jsonFile, 'rb') as f:
            for it in ijson.items(f, test):
                fragments.append(json.dumps(it))
        log.debug("Read json JIn: {}".format("".join(fragments)))
    except Exception:
        printExceptionDetails()
    return "".join(fragments)
def get_moving_average(self, symbol, days):
    """Return the average closing price over the last ``days`` entries.

    Args:
        symbol: Symbol passed to ``self.get_history``.
        days: Number of most recent history entries to average; also the
            divisor, even when fewer entries are available.

    Returns:
        The sum of the last ``days`` closing prices divided by ``days``.
    """
    # Divide by a float so the ceiling is applied to the real quotient; on
    # Python 2 ``days / 22`` floors first and requests too little history.
    months_necessary = math.ceil(days / 22.0)
    raw_history = ijson.items(
        self.get_history(symbol, months_necessary), 'history.day.item')
    closes = [day['close'] for day in raw_history]
    return sum(closes[-days:]) / days
def __init__(self, jsonInput):
    """Load every element of the ``Document`` array of ``jsonInput``.

    Args:
        jsonInput: Path of the JSON file to read.
    """
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(jsonInput, 'r') as jsonInputFile:
        blocks = list(ijson.items(jsonInputFile, "Document.item"))

    self.__rawJson = blocks
    self.__dataFrame = pd.DataFrame()
    self.__response = ""
def read_big_json_file(file_path, prefix=""):
    """Read a large JSON file with ijson and print every matching value.

    prefix: empty reads the whole content
    prefix: "earth.europe.item" reads the entries of europe

    Example document:
    {
        "earth": {
            "europe": [
                {
                    "name": "Paris",
                    "type": "city",
                    "info": "aaa"
                },
                {
                    "name": "Thames",
                    "type": "river",
                    "info": "sss"
                },
                {
                    "name": "yyy",
                    "type": "city",
                    "info": "aaa"
                },
                {
                    "name": "eee",
                    "type": "river",
                    "info": "sss"
                }
            ],
            "america": [
                {
                    "name": "Texas",
                    "type": "state",
                    "info": "jjj"
                }
            ]
        }
    }
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        for value in ijson.items(f, prefix):
            print(value)
        # Printed once the stream is exhausted.
        print("数据读取完成")
def Deserializer(stream_or_string, **options):
    """Lazily deserialize a JSON array of serialized objects.

    Args:
        stream_or_string: A file-like object, or a string that is encoded as
            UTF-8 and wrapped in a bytes stream.
        **options: Forwarded to ``PythonDeserializer``.

    Yields:
        Every object produced by ``PythonDeserializer``.

    Raises:
        DeserializationError: wrapping any error raised while parsing or
            deserializing, with the original traceback.
    """
    if isinstance(stream_or_string, six.string_types):
        stream_or_string = six.BytesIO(stream_or_string.encode('utf-8'))
    try:
        parsed_objects = ijson.items(stream_or_string, 'item')
        deserialized = PythonDeserializer(parsed_objects, **options)
        for obj in deserialized:
            yield obj
    except GeneratorExit:
        # Early close of the generator must pass through untouched.
        raise
    except Exception as e:
        # Map to deserializer error
        six.reraise(DeserializationError, DeserializationError(e),
                    sys.exc_info()[2])
def handle(self, *args, **kwargs):
    """Entry point for load data command

    Streams the questions from ``SAMPLE_PATH`` (when ``kwargs['sample']`` is
    set) or ``DATA_PATH`` and creates one ``Question`` per item. A failed
    item is written to the error log and marked with an ``X``.

    Raises:
        CommandError: if at least one item could not be loaded.
    """
    self.stdout.write(self.style.SUCCESS('Starting load task:'))
    data_path = SAMPLE_PATH if kwargs['sample'] else DATA_PATH
    log = self.get_error_log()
    was_error = False
    try:
        # ``with`` and ``finally`` release the data file and the log on
        # every path; they used to leak when CommandError was raised.
        with open(data_path) as f:
            for o in ijson.items(f, 'item'):
                try:
                    if not o['value']:
                        o['value'] = DEFAULT_QUESTION_VALUE
                    Question.objects.create(
                        question=unescape(o['question']),
                        air_date=o['air_date'],
                        answer=unescape(o['answer']),
                        # Drop the leading character and thousands separators.
                        value=int(o['value'][1:].replace(',', '')),
                        round=o['round'],
                        show_number=o['show_number'],
                    )
                    print(self.style.SUCCESS('.'), sep=' ', end='', file=sys.stdout, flush=True)
                except Exception as e:
                    print(self.style.ERROR('X'), sep=' ', end='', file=sys.stdout, flush=True)
                    was_error = True
                    log.write('{} - {}\n'.format(e, str(o)))
    finally:
        log.close()

    if was_error:
        self.stdout.write('')
        raise CommandError(
            'Error loading question data. Check logs/import_data.error for more details'
        )
    self.stdout.write(self.style.SUCCESS('\nDone'))
def DumpReader(lang, local_file, from_point=None):
    """Stream the entries of a bz2-compressed JSON array dump.

    Args:
        lang: Value passed through unchanged in every yielded tuple.
        local_file: Path of the ``.bz2`` dump.
        from_point: Optional ``id``; when given, entries up to and including
            the dict with this ``id`` are skipped.

    Yields:
        tuple: ``(data, lang, index)`` for every remaining entry.
    """
    with bz2.open(local_file, "rb") as fin:
        reader = enumerate(ijson.items(fin, "item"))

        if from_point:
            # Fast-forward the shared iterator to the resume point.
            for _, data in reader:
                if (data and isinstance(data, dict)
                        and data.get('id', None) == from_point):
                    break  # OK. found

        for i, data in reader:
            yield (data, lang, i)
def split_into_batches(cookbook_file):
    """Split the JSON array in ``cookbook_file`` into numbered batch files.

    Batches are written to ``BATCH_PATH + <index> + OUTFILE``, with the
    index starting at 0. A batch is written whenever the running item index
    is a positive multiple of ``BATCH_SIZE``; items left over at the end go
    into one final file.
    """
    def write_batch(index, batch):
        with open(BATCH_PATH + str(index) + OUTFILE, 'w') as outfile:
            json.dump(batch, outfile)

    current = []
    batches_written = 0
    with open(cookbook_file, 'r') as json_file:
        for count, item in enumerate(ijson.items(json_file, "item")):
            current.append(item)
            if count > 0 and count % BATCH_SIZE == 0:
                print("Finished Item " + str(count))
                write_batch(int(count / BATCH_SIZE - 1), current)
                batches_written += 1
                current = []
    # Items read after the last full batch used to be dropped; write them to
    # the next batch file.
    if current:
        write_batch(batches_written, current)
def download_item(url, filename='data.txt', *, num_retries=2, max_page=300,
                  page_size=20, page_no=1, proxy=None):
    """Get items

    Downloads result pages from ``page_no`` up to ``max_page`` and appends
    ``item_id,title,price`` lines to ``filename``.

    Args:
        url: URL containing exactly one ``?``; only its query string is used.
        filename: File the items are appended to.
        num_retries: Remaining retries for HTTP 5xx errors.
        max_page: Last page number to fetch.
        page_size: Items requested per page.
        page_no: Page to fetch in this call.
        proxy: Only passed through to the next call.

    Returns:
        None.
    """
    if page_no > max_page:
        return

    # check, only one '?'
    if url.count('?') != 1:
        # Stop here: splitting such a URL raised ValueError a line later.
        print("Can't use this way, set url to None...")
        return
    _, tl = url.split('?')
    head = 'https://list.tmall.com/m/search_items.htm?page_size=%d&page_no=%d&' % (page_size, page_no)
    url_req = head + tl

    print('Downloading %s...' % url_req)
    try:
        html_response = urllib.request.urlopen(url_req)
    except urllib.error.URLError as e:
        print("Downloading error: ", e.reason)
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            return download_item(url, filename, num_retries=num_retries-1,
                                 max_page=max_page, page_size=page_size,
                                 page_no=page_no, proxy=proxy)
        # No response to parse: give up instead of handing None to ijson.
        return
    print('Request done...')

    # Close the HTTP response once the page has been consumed.
    with html_response, open(filename, "a+") as f:
        for item in ijson.items(html_response, 'item.item'):
            if not item['item_id']:  # can set filter
                continue
            print(item['item_id'], item['title'], item['price'],
                  sep=",", file=f)

    # TODO delay
    if page_no % 10 == 0:
        time.sleep(2)

    return download_item(url, filename, num_retries=num_retries,
                         max_page=max_page, page_size=page_size,
                         page_no=page_no+1, proxy=proxy)
def __getItems(self, prefix, chunksize):
    """Return all values matching ``prefix``, using the cache when possible.

    On a cache miss the file is re-parsed from ``self.startpos`` and the
    previous file offset is restored afterwards. The result is cached only
    when ``chunksize`` is 0 or the result has at most ``chunksize`` entries.
    """
    if prefix in self.cache:
        return self.cache[prefix]

    # cache miss, gotta parse the JSON again
    saved_offset = self.fd.tell()
    self.fd.seek(self.startpos)
    parsed = list(ijson.items(self.fd, prefix))
    self.fd.seek(saved_offset)

    cacheable = chunksize == 0 or len(parsed) <= chunksize
    if cacheable:
        self.cache[prefix] = parsed
    return parsed
def get_paragraph_questions(json_file_name, vocab_file=None):
    """Extract (sentence, question) pairs and the vocabulary of a data set.

    The pairs are pickled to ``EMBEDDING_DIR/<name>.pkl`` (``<name>`` being
    the part of ``json_file_name`` before the first ``-``) and the
    vocabulary to ``EMBEDDING_DIR/vocab.pkl``.

    :param: json_file_name of the squad data set to parse
    :param: existing vocab to build on

    NOTE(review): the nesting of the vocabulary updates was reconstructed
    from a flattened source line - confirm it against the original file.
    """
    data = []
    # All files are opened via ``with`` so none of the handles leak.
    with open(json_file_name, "r") as json_file:
        if vocab_file is None:
            vocab = set()
        else:
            # NOTE: pickle.load can execute arbitrary code; only pass
            # trusted vocabulary files.
            with open(vocab_file, "rb") as vocab_in:
                vocab = set(pickle.load(vocab_in))
        print("Start processing data set %s" % json_file_name)
        for item in ijson.items(json_file, "data.item"):
            for paragraphs in item["paragraphs"]:
                paragraph = paragraphs["context"]
                indices, sentences = extract_sentences(paragraph)
                for qa in paragraphs["qas"]:
                    if not qa["is_impossible"]:
                        # add (sentence, question) pairs to data
                        if len(qa["answers"]) != 0:
                            answer = qa["answers"][0]
                            answer_start_index = int(answer["answer_start"])
                            # Last sentence in the leading run whose start
                            # index lies before the answer.
                            sentence_index = 0
                            for i, start in enumerate(indices):
                                if answer_start_index > start:
                                    sentence_index = i
                                else:
                                    break
                            sentence = sentences[sentence_index]
                            data.append((normalize_string(sentence),
                                         normalize_string(qa["question"])))
                        # add words in question to vocabulary
                        vocab.update(get_words(qa["question"]))
                # add words in sentences to vocabulary
                vocab.update(flatten(map(get_words, sentences)))
    vocab.add(START_TOKEN)
    vocab.add(END_TOKEN)
    vocab.add(UNKNOWN_WORD)
    with open(os.path.join(EMBEDDING_DIR, "vocab.pkl"), 'wb') as vocab_out:
        pickle.dump(list(vocab), vocab_out)
    pairs_path = os.path.join(EMBEDDING_DIR,
                              "%s.pkl" % json_file_name.split("-")[0])
    with open(pairs_path, 'wb') as pairs_out:
        pickle.dump(data, pairs_out)
    print("Done processing data set %s" % json_file_name)
def build_index():
    """Build a FAISS L2 index over docstring and Stack Overflow texts.

    Streams ``results.bindings.item`` from the JSON dump and keeps only rows
    whose ``class_func_type`` is ``Class``. For each kept row two texts are
    embedded with the loaded TF-Hub encoder and added to the index: the
    docstring, and the question plus answer. ``<code>`` elements are removed
    from both before embedding.

    Returns:
        tuple: ``(index, docMessages, embeddingtolabelmap, labeltotextmap)``
        where ``docMessages`` lists the embeddings as Python lists,
        ``embeddingtolabelmap`` maps an embedding (as a tuple) to its label
        and ``labeltotextmap`` maps a label to its cleaned text.
    """
    embed = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
    index = faiss.IndexFlatL2(512)
    docMessages = []
    embeddingtolabelmap = {}
    labeltotextmap = {}

    def add_entry(entry_label, raw_text):
        # Drop <code> elements, embed the remaining text and record it in
        # the index and in the three lookup structures.
        soup = BeautifulSoup(raw_text, 'html.parser')
        for code in soup.find_all('code'):
            code.decompose()
        clean_text = soup.get_text()
        embedding = embed([clean_text])[0]
        index.add(np.asarray(embedding, dtype=np.float32).reshape(1, -1))
        vector = embedding.numpy().tolist()
        docMessages.append(vector)
        embeddingtolabelmap[tuple(vector)] = entry_label
        labeltotextmap[entry_label] = clean_text

    def is_class(row):
        kind = row['class_func_type']['value'].replace(
            'http://purl.org/twc/graph4code/ontology/', '')
        return kind == 'Class'

    with open(
            '../../data/codeGraph/stackoverflow_questions_per_class_func_1M_filtered.json',
            'r') as data:
        rows = ijson.items(data, 'results.bindings.item')
        class_rows = (row for row in rows if is_class(row))
        # The running number only counts rows that passed the Class filter.
        for i, row in enumerate(class_rows):
            label = row['class_func_label']['value']
            suffix = ' ' + str(i)

            add_entry(label + " docstring " + str(i),
                      row['docstr']['value'] + suffix)

            stackLabel = label + " stack " + str(i)
            question = row['content']['value']
            answer = row['answerContent']['value']
            add_entry(stackLabel, question + " " + answer + suffix)

    return (index, docMessages, embeddingtolabelmap, labeltotextmap)
def __getEsriRESTJSON(self, url, params, attempt=1, useIjson=False):
    """Helper function to query an Esri REST endpoint and return json.

    The request is POSTed with ``params`` url-encoded as the body. On a bad
    status line, an HTTP error, a URL error or an ``error`` key in the
    decoded response, the call is repeated with ``attempt + 1``. Every
    repeated attempt waits five seconds first.

    :param url: Endpoint to query.
    :param params: Dict of request parameters. ``token`` is added to it in
        place when ``self.token`` is set.
    :param attempt: Number of the current attempt (1-based).
    :param useIjson: When true, return a lazy ``ijson.items`` iterator over
        ``features.item`` instead of decoding the whole response.
    :return: The ijson iterator, the ``features`` value of the decoded
        response, or the decoded response itself when it has no
        ``features`` key.
    :raises MapServiceError: After more than five attempts.
    """
    # Wait five seconds if previous error
    if attempt > 1 and attempt != 6:
        time.sleep(5)

    # Set token if registered with object
    if self.token is not None:
        params['token'] = self.token

    if attempt > 5:
        self.__logMsg(30, "Too many attempts")
        raise MapServiceError("Error Accessing Map Service " + self.url)

    # all other attempts...
    data = urllib.urlencode(params)
    req = urllib2.Request(url, data)
    try:
        response = urllib2.urlopen(req)
    except httplib.BadStatusLine:
        # The attempt number is now actually interpolated into the message.
        self.__logMsg(40, "Bad Status Line at attempt %d" % attempt)
        return self.__getEsriRESTJSON(url, params, attempt + 1, useIjson=useIjson)
    except urllib2.HTTPError:
        # "%n" is not a valid conversion and made this handler raise
        # ValueError instead of retrying.
        self.__logMsg(40, "HTTP Error at attempt %d: sleeping" % attempt)
        return self.__getEsriRESTJSON(url, params, attempt + 1, useIjson=useIjson)
    except urllib2.URLError:
        self.__logMsg(40, "Verify SSL Cert Error")
        dontVerifySSL()
        return self.__getEsriRESTJSON(url, params, attempt + 1, useIjson=useIjson)

    if useIjson:
        # need to figure out a way to deal with this if error is returned,
        # possibly stop using ijson
        return ijson.items(response, "features.item")

    final = json.loads(response.read())
    if 'error' in final:
        self.__logMsg(40, "Error in json loads " + str(final))
        return self.__getEsriRESTJSON(url, params, attempt + 1, useIjson=useIjson)
    if 'features' in final:
        return final['features']
    return final
def parser(base, objconf, skip=False, **kwargs):
    """ Parses the pipe content

    Args:
        base (str): The base currency (exchanging from)
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        assign (str): Attribute to assign parsed content (default:
            exchangerate)
        stream (dict): The original item

    Returns:
        The original item when `skip` is set, `Decimal(1)` when `base`
        equals the configured currency, otherwise the rate computed by
        `calc_rate` from the fetched response.

    Examples:
        >>> from riko import get_path
        >>> from meza.fntools import Objectify
        >>>
        >>> url = get_path('quote.json')
        >>> conf = {'url': url, 'currency': 'USD', 'delay': 0, 'precision': 6}
        >>> item = {'content': 'GBP'}
        >>> objconf = Objectify(conf)
        >>> kwargs = {'stream': item, 'assign': 'content'}
        >>> parser(item['content'], objconf, **kwargs)
        Decimal('1.545801')
    """
    no_conversion = base == objconf.currency

    if skip:
        return kwargs['stream']

    if no_conversion:
        return Decimal(1)

    decode = objconf.url.startswith('http')

    if objconf.memoize and not objconf.cache_type:
        objconf.cache_type = 'auto'

    with fetch(decode=decode, **objconf) as f:
        payload = next(items(f, ''))

    places = Decimal(10) ** -objconf.precision
    rates = parse_response(payload)
    return calc_rate(base, objconf.currency, rates, places=places)
def read_json(data_result_file_path):
    """Load every element of a top-level JSON array into a list.

    ``value_a`` and ``value_b``, when present on an element, are replaced
    by the result of ``parse_decimal``. The number of elements read is
    logged at INFO level.
    """
    logging.info('opening {}'.format(data_result_file_path))
    records = []
    with open(data_result_file_path, 'r') as json_file:
        for record in ijson.items(json_file, 'item'):
            for key in ('value_a', 'value_b'):
                if key in record:
                    record[key] = parse_decimal(record[key])
            records.append(record)
    logging.info('JSON size: {}'.format(len(records)))
    return records
def getcontent():
    """Load articles from ``nstream.json`` into the module-level globals.

    Sets the globals ``articles``, ``headlines`` (each article's ``title``)
    and ``summary`` (each article's ``description``), prints a short report
    including the distinct source names, and returns
    ``(headlines, summary)``.
    """
    global articles
    global headlines
    global summary

    with open('nstream.json', 'r') as stream:
        matches = list(ijson.items(stream, 'articles'))

    # The first match is the value stored under the 'articles' key.
    articles = list(matches[0])
    headlines = [entry['title'] for entry in articles]
    summary = [entry['description'] for entry in articles]
    sources = list(set(entry["source"]["name"] for entry in articles))

    print('successfully fetched articles')
    print("Total articles", len(headlines))
    print("Sources: ", sources)
    return headlines, summary
def iter_file(fd, root):
    """Lazily yield each element of the `root` array of an open JSON file.

    Elements are parsed with ijson from the prefix ``"<root>.item"`` and
    JSON objects are built as `OrderedDict` instances.

    :param fd: Open file object to read from
    :param str root: Array field name inside file
    :return: Iterator over the parsed items

    >>> [r for r in iter_file(open('tests/data/ocds-sample-data.json', 'rb'), 'records')]
    []
    >>> len([r for r in iter_file(open('tests/data/ocds-sample-data.json', 'rb'), 'releases')])
    6
    """
    yield from ijson.items(fd, f"{root}.item", map_type=OrderedDict)
def parse_data_to_case_class(input):
    """Parse ``<data_path>.json`` into a list of ``Conversation`` objects.

    ``input["data_path"]`` is the file path without the ``.json``
    extension. Every value found under ``conversations.conversation`` is
    iterated as a sequence of conversations; each one becomes a
    ``Conversation`` built from its ``@id`` and a list of ``Message``
    objects (author, time, text).
    """
    json_path = input["data_path"] + ".json"
    conversations = []
    with open(json_path) as data:
        print("Successfully opened " + json_path + "...")
        for root in ijson.items(data, 'conversations.conversation'):
            print("Start to process conversations...")
            for conversation in root:
                conversation_id = conversation["@id"]
                messages = [
                    Message(entry["author"], entry["time"], entry["text"])
                    for entry in conversation["message"]
                ]
                conversations.append(Conversation(conversation_id, messages))
    return conversations
def _read_json(file_path: Path) -> List[Record]:
    """Read the ``fields`` array of a JSON file into ``Record`` objects.

    Within each object, keys containing ``"D"`` go to the record's
    dimensions. Of the remaining keys, those containing ``"M"`` go to its
    marks. All other keys are ignored.
    """
    records = []
    with open(file_path, "rb") as f:
        for obj in ijson.items(f, "fields.item"):
            dimensions = {key: value for key, value in obj.items()
                          if "D" in key}
            marks = {key: value for key, value in obj.items()
                     if "D" not in key and "M" in key}
            records.append(Record(dimensions, marks))
    return records
def __init__(self, jsonInput):
    """Read the ``Document`` array of a JSON file and set up empty frames.

    :param jsonInput: Path of the JSON file to read.

    Reading is best-effort: if the file content cannot be parsed, the raw
    block list is left empty instead of raising. A file that cannot be
    opened still raises.
    """
    # Read json file as input
    blocks = []
    with open(jsonInput, 'r') as jsonInputFile:
        try:
            blocks = list(ijson.items(jsonInputFile, "Document.item"))
        except Exception:
            # Keep the best-effort behaviour, but no longer swallow
            # KeyboardInterrupt / SystemExit as the bare ``except`` did.
            pass
    # The ``with`` block closes the file; no explicit close() is needed.

    self.__rawJson = blocks
    self.__dataFrame = pd.DataFrame()
    self.__tableDataFrame = pd.DataFrame()
def read_nvd_dir(cls, nvd_dir):
    """
    Iterate over all the CVEs contained in NIST Vulnerability Database
    feeds since NVD_START_YEAR.
    If the files are missing or outdated in nvd_dir, a fresh copy will be
    downloaded, and kept in .json.gz

    Yields one ``cls`` instance per entry of the ``CVE_Items`` array of
    each yearly feed. If a feed cannot be opened, decompressed or parsed,
    an error message naming the file is printed and the exception is
    re-raised.
    """
    end_of_feed = object()
    for year in range(NVD_START_YEAR, datetime.datetime.now().year + 1):
        filename = CVE.download_nvd_year(nvd_dir, year)
        error_msg = ("ERROR: cannot read %s. Please remove the file then rerun this script"
                     % filename)
        try:
            feed = gzip.GzipFile(filename)
            content = iter(ijson.items(feed, 'CVE_Items.item'))
        except Exception:
            print(error_msg)
            raise

        # ijson parses lazily: corrupt gzip or JSON data only fails while
        # items are pulled, so that step needs the error report as well.
        # The yield stays outside the try so that failures in cls() or in
        # the consumer are not reported as read errors.
        with feed:
            while True:
                try:
                    cve = next(content, end_of_feed)
                except Exception:
                    print(error_msg)
                    raise
                if cve is end_of_feed:
                    break
                yield cls(cve)
def extract_words():
    """Collect words from every post in ``POSTS_FILE_PATH``.

    Each post's ``content``, ``title`` and ``tags`` are passed to
    ``add_words`` with 1, 2 and 3 as the third argument respectively. A
    progress line is printed every ``SHOW_PROGRESS_EVERY`` posts.

    Returns:
        defaultdict: The ``defaultdict(int)`` filled by ``add_words``.
    """
    words = defaultdict(int)
    fields = (('content', 1), ('title', 2), ('tags', 3))
    with open(POSTS_FILE_PATH, 'r') as f:
        for iteration, post in enumerate(ijson.items(f, 'item'), 1):
            for field, number in fields:
                add_words(words, post[field], number)
            if iteration % SHOW_PROGRESS_EVERY == 0:
                format_str = 'Words parsing {} iterations passed'
                print(format_str.format(iteration))
    return words
def getcontent():
    """Load articles from ``nstream.json`` into the module-level globals.

    Sets the globals ``articles``, ``headlines`` (each article's ``title``)
    and ``summary`` (each article's ``description``), prints how many of
    each were found, and returns ``(headlines, summary)``.
    """
    global articles
    global headlines
    global summary

    with open('nstream.json', 'r') as stream:
        matches = list(ijson.items(stream, 'articles'))

    # The first match is the value stored under the 'articles' key.
    articles = list(matches[0])
    headlines = [entry['title'] for entry in articles]
    summary = [entry['description'] for entry in articles]

    print('successfully fetched articles.')
    print("total headlines", len(headlines))
    print("total summaries", len(summary))
    print(" ")
    return headlines, summary