def test__json_next_signature():
    name = 'Foo Bar'
    filename = '/tmp/foobar'

    minhash = (2, 3, 4, 5, 6)
    t = OrderedDict((('ksize', 21),
                     ('num', len(minhash)),
                     #('md5sum', ),
                     ('cardinality', 123456),
                     ('mins', minhash)))
    s = json.dumps(t)
    if sys.version_info[0] < 3:
        s = unicode(s)
    it = ijson.parse(io.StringIO(s))
    # no MD5SUM
    sig = _json_next_signature(it, name, filename,
                               ignore_md5sum=True,
                               ijson=ijson)

    ## check MD5SUM
    minhash = (5,)
    t = OrderedDict((('ksize', 20),
                     ('num', len(minhash)),
                     ('md5sum', 'eae27d77ca20db309e056e3d2dcd7d69'),
                     ('cardinality', 123456),
                     ('mins', minhash)))
    s = json.dumps(t)
    if sys.version_info[0] < 3:
        s = unicode(s)
    it = ijson.parse(io.StringIO(s))
    sig = _json_next_signature(it, name, filename,
                               ignore_md5sum=False,
                               ijson=ijson)
def get_metadata(self, fname):
    """
    If the file is not too large, return its metadata.
    """
    if os.stat(fname).st_size > self.filesize_limit > 0:
        return
    if fname == self.lastFileName and self.lastDocMeta is not None:
        self.insert_meta_year(self.lastDocMeta)
        return self.lastDocMeta
    self.lastFileName = fname
    if self.format == 'json':
        fIn = open(fname, 'r', encoding='utf-8-sig')
    elif self.format == 'json-gzip':
        fIn = gzip.open(fname, 'rt', encoding='utf-8-sig')
    else:
        return {}
    metadata = {}
    curMetaField = ''
    JSONParser = ijson.parse(fIn)
    for prefix, event, value in JSONParser:
        if (prefix, event) == ('meta', 'map_key'):
            curMetaField = value
        elif len(curMetaField) > 0 and prefix.startswith('meta.'):
            metadata[curMetaField] = value
        elif (prefix, event) == ('meta', 'end_map'):
            break
    self.lastDocMeta = metadata
    fIn.close()
    self.insert_meta_year(metadata)
    return metadata
def geojson_converter(inputfile, count):
    """ The main entry point """
    if inputfile is None:
        raise Exception("Missing ")

    max_count = count
    current_count = 0

    parser = ijson.parse(inputfile)
    f = GeojsonFsm()
    for prefix, event, value in parser:
        try:
            f.submitEvent(prefix, event, value)
        except EndOfFile:
            sys.exit(0)  # no error
            break
        except Exception as e:
            logging.error(e)
            sys.exit(1)

        if count != None:
            current_count += 1
            if current_count == max_count:
                break
def parse_location(stream, filter):
    """ Given a stream and a filter, parse JSON data that fits filter to GeoJSON file """
    parser = ijson.parse(stream)

    reading = False
    obj = {}
    key = None
    value = None

    for prefix, event, value in parser:
        #print "prefix: " + str(prefix)
        if prefix == 'locations' and event == 'start_array':
            reading = True
        elif prefix == 'locations' and event == 'end_array':
            reading = False
        elif reading:
            if event == 'start_map' and prefix == 'locations.item':
                obj = {}
                activities = {}
            elif event == 'end_map' and prefix == 'locations.item':
                obj['activities'] = activities
                yield create_feature(obj, filter)
            elif event == 'map_key':
                key = value
            elif prefix == 'locations.item.%s' % key and value is not None:
                obj[key] = value
            elif prefix == 'locations.item.activitys.item.activities.item.type':
                activity = value
            elif prefix == 'locations.item.activitys.item.activities.item.confidence':
                confidence = value
            elif prefix == 'locations.item.activitys.item.activities.item' and event == 'end_map':
                activities[activity] = confidence
def main():
    # Establish communication queues
    tasks = multiprocessing.JoinableQueue(16)
    results = multiprocessing.Queue()

    # Start consumers
    num_consumers = multiprocessing.cpu_count() * 2
    # num_consumers = 1
    print('Creating %d consumers' % num_consumers)
    consumers = [Consumer(tasks, results) for i in xrange(num_consumers)]
    for w in consumers:
        w.start()

    r = requests.post(NEO4J_CREATE_TRAN_URL, headers=generateheaders(),
                      data=json.dumps(GET_PDB_IDS_NEO4J_STATEMENT), stream=True)
    parser = ijson.parse(r.raw)
    i = 0
    pdb_id_list = []
    for prefix, event, value in parser:
        if (prefix, event) == ('results.item.data.item.row.item', 'string'):
            # process_pdb_id(value)
            tasks.put(MapPDBOrganism(value), True, None)

    # Add a poison pill for each consumer
    for i in xrange(num_consumers):
        tasks.put(None)

    # Wait for all of the tasks to finish
    tasks.join()
def is_geojson(path):
    with open(path) as fp:
        try:
            parser = ijson.parse(fp)
            data = {}
            top_keys = ['coordinates', 'features', 'geometry', 'geometries']
            for prefix, event, value in parser:
                if (prefix, event) == ('type', 'string'):
                    data['type'] = value
                elif (prefix, event) == ('', 'map_key') and value in top_keys:
                    data[value] = True
            gtype = data.get('type')
            geo_types = [
                'LineString', 'MultiLineString', 'MultiPoint',
                'MultiPolygon', 'Point', 'Polygon'
            ]
            return any([
                gtype in geo_types and 'coordinates' in data,
                gtype == 'Feature' and 'geometry' in data,
                gtype == 'FeatureCollection' and 'features' in data,
                gtype == 'GeometryCollection' and 'geometries' in data,
            ])
        except Exception as exc:
            logger.debug('Exception during geojson validation: {}'.format(exc))
            return False
def stream_geojson(stream):
    '''
    '''
    data = ijson.parse(stream)

    for (prefix1, event1, value1) in data:
        if event1 != 'start_map':
            # A root GeoJSON object is a map.
            raise ValueError((prefix1, event1, value1))

        for (prefix2, event2, value2) in data:
            if event2 == 'map_key' and value2 == 'type':
                prefix3, event3, value3 = next(data)
                if event3 != 'string' and value3 != 'FeatureCollection':
                    # We only want GeoJSON feature collections
                    raise ValueError((prefix3, event3, value3))

            elif event2 == 'map_key' and value2 == 'features':
                prefix4, event4, value4 = next(data)
                if event4 != 'start_array':
                    # We only want lists of features here.
                    raise ValueError((prefix4, event4, value4))

                for (prefix5, event5, value5) in data:
                    if event5 == 'end_array':
                        break

                    # let _build_value() handle the feature.
                    _data = chain([(prefix5, event5, value5)], data)
                    feature = _build_value(_data)
                    yield feature
def extract_threatname(self, report):
    parser = ijson.parse(report)
    for prefix, event, value in parser:
        if prefix == "analysis.signaturedetections.strategy.item.threatname" \
                and value is not None and str(value).lower() != "unknown":
            self.add_probable_name(str(value))
            self.add_tag(str(value).lower())
def _detect_fields_in_geojson(self, resource_dict):
    geo_columns_dict = {}
    try:
        upload = uploader.ResourceUpload(resource_dict)
        with io.open(upload.get_path(resource_dict['id']), 'rb') as f, \
                io.TextIOWrapper(f, encoding='utf-8-sig') as tf:
            parser = ijson.parse(tf)
            geo_columns = set()
            i = 0
            for prefix, event, value in parser:
                if prefix == u'features.item.properties' and event == u'map_key':
                    geo_columns.add(value)
                    i += 1
                    if i > 10:
                        break
            pass
        geo_columns_dict = [{'value': item, 'text': item}
                            for item in sorted(geo_columns)]
    except Exception as e:
        log.warn(
            u'Error accessing resource size for resource {}: {}'.format(
                resource_dict.get('name', ''), str(e)))
        geo_columns_dict = {}
    return geo_columns_dict
def test_parse(self):
    events = parse(StringIO(JSON))
    events = [value
              for prefix, event, value in events
              if prefix == 'docs.item.meta.item.item']
    self.assertEqual(events, [1])
def extract_info(self, report):
    # First, build an array with every antivirus information that might be
    # of interest
    av_prefixes = []
    for av in self._analysis._file['antivirus']:
        av_prefixes.append('data.signatures.item.data.item.{}'.format(av))

    parser = ijson.parse(report)
    self.results['signatures'] = []
    signature = dict()

    for prefix, event, value in parser:
        if prefix == "data.signatures.item" and event == "end_map":
            self.results['signatures'].append(signature)
            signature = dict()
        elif prefix == "data.signatures.item.name":
            signature['name'] = value
            self.add_tag(value)
        elif prefix == "data.signatures.item.severity":
            signature['severity'] = value
        elif prefix == "data.signatures.item.description":
            signature['description'] = value
        elif ('name' in signature and signature['name'] == 'antivirus_virustotal'
              and prefix in av_prefixes):
            self._analysis._file.update_value(['antivirus', prefix.split('.')[-1]], value)
        elif prefix == "data.malfamily":
            self.results['classification'] = value
        elif prefix == "data.malscore":
            self.results['score'] = str(value)
        elif prefix in ["data.network.domains.item.domain",
                        "data.network.hosts.item.ip",
                        "data.network.traffic.http.item.uri"]:
            self.add_ioc(value)
def read_regions():
    VG_VERSION = '1.2'
    VG_PATH = '/home/joe/git/VG_raw_data'
    VG_REGION_PATH = '%s/%s/region_descriptions.json' % (VG_PATH, VG_VERSION)

    # parser = ijson.parse(open('test_region.json'))
    parser = ijson.parse(open(VG_REGION_PATH))
    last_value = None
    Dic = {}
    regions = []
    dic = {}
    for prefix, event, value in parser:
        if value == 'regions':
            Dic = {}
            regions = []
            last_value = None
        elif last_value == 'id':
            Dic['regions'] = regions
            Dic['id'] = value
            with open('test_id_%s.json' % value, 'w') as f:
                json.dump(Dic, f)
            break
        elif event == 'map_key':
            last_value = value
        elif event == 'end_map':
            regions.append(dic)
            dic = {}
            last_value = None
        elif last_value:
            dic[last_value] = value
def test_load_signature_json():
    email = '*****@*****.**'
    name = 'Foo Bar'
    filename = '/tmp/foobar'
    minhash = (2, 3, 4, 5, 6)
    t = OrderedDict((
        ('email', email),
        ('name', name),
        ('filename', filename),
        ('signatures', (
            OrderedDict((('ksize', 21),
                         ('num', len(minhash)),
                         #('md5sum', ),
                         ('cardinality', 123456),
                         ('mins', minhash))),
        ))))
    s = json.dumps(t)
    if sys.version_info[0] < 3:
        s = unicode(s)
    it = ijson.parse(io.StringIO(s))
    # no MD5SUM
    sig_entry = load_signature_json(it, ignore_md5sum=True)
def implement_tags(self):
    """Applies the ingredient tags to the Recipe1M+ recipe corpus."""
    tags_dict = instance.tags_cleaner()  # Import the {ingredient: tags} dictionary
    recipe = ijson.parse(open("layer1.json"))  # Import the corpus to python chunk by chunk
    for prefix, event, value in recipe:  # Look into the recipe currently in memory
        if prefix == "item.ingredients.item.text":  # Grab a recipe instruction
            tokenized = nltk.word_tokenize(value)  # Tokenize it -> [word1, word2, word3]
            dracula = 0
            new_string = ""
            for word in tokenized:  # For each word in the list
                if dracula > 1:  # Avoid infinite loop (see below)
                    continue  # Avoid infinite loop (see below)
                elif word in tags_dict.keys():  # If the word is in the tag dictionary
                    for i in range(len(tokenized)):  # Find index of word
                        if tokenized[i] == word:  # Find index of word
                            tokenized.insert(i + 1, tags_dict[word])  # Insert the associated tag behind the word
                            new_string = " ".join(tokenized)  # Merge the list into a string
                            dracula += 1  # Avoid infinite loop created by this for loop
            print(new_string)  # Print each instruction with tags.
def dump_candidate_dict_to_db():
    with open(constant.CANDIDATE_DICT_JSON_PATH, 'r') as fd:
        parser = ijson.parse(fd)
        candidate = Candidate()
        for prefix, event, value in parser:
            print(prefix, event, value)
            if (prefix, event) == ("", "map_key"):
                if candidate is not None:
                    candidate.save()
                candidate = Candidate()
                candidate.word = value
                candidate.left_set = {}
                candidate.right_set = {}
            elif event == "map_key" and prefix.endswith("left_set"):
                key = value
                left_temp_dict = {key: None}
            elif event == "number" and prefix.endswith("left_set.%s" % key):
                left_temp_dict[key] = str(value)
                candidate.left_set.update(left_temp_dict)
            elif event == "map_key" and prefix.endswith("right_set"):
                key = value
                right_temp_dict = {key: None}
            elif event == "number" and prefix.endswith("right_set.%s" % key):
                right_temp_dict[key] = str(value)
                candidate.right_set.update(right_temp_dict)
            elif event == "number" and prefix.endswith("count"):
                candidate.count = value
def _parse_response(self):
    """Looks for `result.item` (array), `result` (object) and `error` (object) keys and
    parses the raw response content (stream of bytes)

    :raise:
        - ResponseError: If there's an error in the response
        - MissingResult: If no result nor error was found
    """

    response = self._get_response()

    has_result_single = False
    has_result_many = False
    has_error = False

    builder = ObjectBuilder()

    for prefix, event, value in ijson.parse(response.raw, buf_size=self._chunk_size):
        if (prefix, event) == ('error', 'start_map'):
            # Matched ServiceNow `error` object at the root
            has_error = True
        elif prefix == 'result' and event in ['start_map', 'start_array']:
            # Matched ServiceNow `result`
            if event == 'start_map':  # Matched object
                has_result_single = True
            elif event == 'start_array':  # Matched array
                has_result_many = True

        if has_result_many:
            # Build the result
            if (prefix, event) == ('result.item', 'end_map'):
                # Reached end of object. Set count and yield
                builder.event(event, value)
                self.count += 1
                yield getattr(builder, 'value')
            elif prefix.startswith('result.item'):
                # Build the result object
                builder.event(event, value)
        elif has_result_single:
            if (prefix, event) == ('result', 'end_map'):
                # Reached end of the result object. Set count and yield.
                builder.event(event, value)
                self.count += 1
                yield getattr(builder, 'value')
            elif prefix.startswith('result'):
                # Build the result object
                builder.event(event, value)
        elif has_error:
            if (prefix, event) == ('error', 'end_map'):
                # Reached end of the error object - raise ResponseError exception
                raise ResponseError(getattr(builder, 'value'))
            elif prefix.startswith('error'):
                # Build the error object
                builder.event(event, value)

    if (has_result_single or has_result_many) and self.count == 0:
        # Results empty
        return

    if not (has_result_single or has_result_many or has_error):
        # None of the expected keys were found
        raise MissingResult('The expected `result` key was missing in the response. Cannot continue')
def load_signatureset_json_iter(data, ksize=None, ignore_md5sum=False, ijson=ijson):
    """
    - data: file handle (or file handle-like) object
    - ksize:
    - ignore_md5sum:
    - ijson: ijson backend
    """
    parser = ijson.parse(data)

    prefix, event, value = next(parser)
    assert prefix == '' and event == 'start_array' and value is None

    siglist = []
    n = 0
    while True:
        try:
            sig = load_signature_json(
                parser,
                prefix_item='item.signatures.item.mins.item',
                ignore_md5sum=ignore_md5sum,
                ijson=ijson)
            if not ksize or ksize == sig.minhash.ksize:
                yield sig
        except ValueError:
            # possible end of the array of signatures
            prefix, event, value = next(parser)
            assert event == 'end_array'
            break
        n += 1
def dump(path):  # pragma: no cover
    """prints all the data ijson finds in a file in ijson event form;
    Not Recommended for large files"""
    with PrefixedJSON(path) as json_file:
        for prefix, event, value in ijson.parse(json_file):
            print("prefix=" + prefix + ", event=" + event + ", value=" + str(value))
def _validate_(self, level):
    # doi.org/10.1371/journal.pone.0031009
    keys_found = set()

    # Can't self.open(mode='rb'), so we defer to the backing pathlib object
    with self.path.open(mode='rb') as fh:
        root_element = None
        for prefix, event, value in ijson.parse(fh):
            if root_element is None:
                if event != 'start_map':
                    raise ValidationError('Root element of file must be a '
                                          'JSON object')
                else:
                    root_element = True

            # Skip parsing attributes that could be prohibitively large
            if prefix.startswith('placements') \
                    or prefix.startswith('tree'):
                continue

            # Restricted to only checking root-level keys
            if event == 'map_key' and prefix == '':
                keys_found.add(value)

    if keys_found != self.fields:
        raise ValidationError('Expected the following fields: %s, found '
                              '%s.' % (sorted(self.fields), sorted(keys_found)))
def prefix_finder(path):  # pragma: no cover
    "returns all the prefixes ijson finds in a file; used for parser development"
    with PrefixedJSON(path) as json_file:
        prefixes = set()
        for p, _, _ in ijson.parse(json_file):
            prefixes.add(p)
        return prefixes
def jsonToTxt(json_filename, goal_path):
    path = goal_path  # this will be the path where the code puts the txts
    dont_allow = [
        "reviewerID", "asin", "reviewerName", "helpful", "reviewText",
        "overall", "summary", "reviewTime", "unixReviewTime"
    ]
    with open(json_filename, encoding="UTF-8") as json_file:
        count = 0  # counts the txt generated
        bandera = False
        for line_number, line in enumerate(json_file):
            if count >= 10481487:
                line_as_file = io.StringIO(line)
                # Use a new parser for each line
                json_parser = ijson.parse(line_as_file)
                filebody = " "
                for prefix, type, value in json_parser:
                    # each item from the line, only we need the value
                    if value is None:
                        pass
                    else:
                        if value not in dont_allow:  # don't allow trash
                            filebody += str(value)  # txt body
                            filebody += " "
                path += str(count)
                path += ".txt"
                file = open(path, "w")
                file.write(filebody)
                file.close()
                path = goal_path
                count += 1
            else:
                count += 1
        json_file.close()
def is_wallet_file_valid(cls, path):
    """
    Check if the given wallet file is valid.

    .. note:: This method only continues reading the file until its
              validity can be determined, and should be preferred instead
              of :meth:`is_wallet_data_valid` when checking a file.

    :param str path: Path to the wallet file
    :returns: True if valid, False otherwise
    :rtype: bool
    """
    with open(path, "rb") as f:
        try:
            for pfx, _, _ in ijson.parse(f):
                if pfx == "properties.gap_limit":
                    return True
                if pfx == "wallet_data":
                    return True
        except ijson.JSONError:
            return False
    return False
def __extract_books_from_json(self):
    with open(JSON_FILE_PATH, encoding='utf-8-sig') as input_file:
        logging.info('reading from json file')
        parser = ijson.parse(input_file)
        books = []
        __found_book = False
        for prefix, event, value in parser:
            if prefix == "items.item.title":
                __found_book = True
                book = Book()
                book.title = value
                book.author = " "
                book.publisher = " "
                logging.info(book.title)
            if prefix == "items.item.products.item.price":
                book.price = value
            if prefix == "items.item.authors.item.firstName":
                book.author = value
            if prefix == "items.item.authors.item.lastName":
                book.author = str(book.author) + " " + value
            if prefix == "items.item.audioPublisherTitle":
                book.publisher = value
            if __found_book:
                __found_book = False
                books.append(book)
        return books
def AnotherWayToGetData():
    # Get a seq of the Ledger EINs that we want to watch out for.
    our_eins = GetListOfOurEINs()

    # Set up our index file client.
    s3_resource = boto3.resource('s3')
    indicies = Forms990Indicies(s3_resource)
    indicies.save_all_indicies()

    # Start a stream of the index JSON.
    # If an EIN comes through that's in our_eins, take note of the following URL
    should_grab = False
    i = 0
    for fd in indicies.saved_jsons.values():
        parser = ijson.parse(fd)
        for prefix, event, value in parser:
            if event == 'string':
                if prefix.endswith('.item.EIN'):
                    should_grab = value in our_eins
                if should_grab == True and prefix.endswith('.item.URL'):
                    # Uncommenting this would actually grab the resource & log it:
                    #print GetScheduleIUrlOnly(value, s3_resource)
                    i += 1
    print "done, would grab this many: " + str(i)
def doAFile(thisFilename, myEventID):
    global itemList, OBJStack, fileEntry

    FDUseful = open(usefulName, "w")
    FDUseful.write(f"""USEFULPREFIX = {C.OBSSTR}{C.NEWLINE}""")
    FDUseful.flush()
    FDUseful.close()

    with open(f"""{C.CACHEDIR}ijsonOut.json""", "w") as FDOut:
        for prefix, the_type, value in IJ.parse(open(thisFilename)):
            if prefix.find(myEventID) > -1:
                prefix = prefix.replace(myEventID, "$EVENTID$")
            thisTuple = (prefix, the_type, value)
            # itemList.append(thisTuple)
            COMBO = f"""{prefix}::{the_type}"""
            if COMBO not in usefulKeys:
                FDOut.write(str(thisTuple))
                FDOut.flush()
                usefulKeys.append(prefix)

                FDUseful = open(usefulName, "ta")
                outStr = f"""{C.TABSTR}{C.DQTSTR}{COMBO}{C.DQTSTR},{C.NEWLINE}"""
                FDUseful.write(outStr)
                FDUseful.flush()
                FDUseful.close()

                FDMoreUseful = open(moreUsefulName, "ta")
                outStr = f"""({C.DQTSTR}{prefix}{C.DQTSTR}, {C.DQTSTR}{the_type}{C.DQTSTR}, C.TYPE, """
                outStr += f"""{C.DQTSTR}headerName{C.DQTSTR}, SQLDEFAULT, SCNDEFAULT,),{C.NEWLINE}"""
                FDMoreUseful.write(f"""{str(thisTuple)}{C.NEWLINE}""")
                FDMoreUseful.flush()
                FDMoreUseful.close()

    FDUseful = open(usefulName, "ta")
    outStr = f"""{C.CBSSTR}{C.NEWLINE}"""
    FDUseful.write(outStr)
    FDUseful.flush()
    FDUseful.close()
def extractAllDescriptions(from_path, to_path, extension=".json", chunk_dim=50000, continue_with=0):
    count = 0
    fileNumber = 0
    with open(from_path, 'r') as f:
        parser = ijson.parse(f)
        obj = {}
        if fileNumber >= continue_with:
            w = codecs.open(to_path + "-" + str(fileNumber) + extension, "w", "utf-8")
        for prefix, event, value in parser:
            if prefix == "item.id":
                obj["id"] = value
            elif prefix == "item.descriptions.en.value" and len(value.split()) > 1:
                obj["description"] = value
                if fileNumber >= continue_with:
                    json.dump(obj, w)
                obj = {}
                count += 1
                if count % chunk_dim == 0:
                    if fileNumber >= continue_with:
                        w.close()
                    fileNumber += 1
                    if fileNumber >= continue_with:
                        w = codecs.open(to_path + "-" + str(fileNumber) + extension, "w", "utf-8")
                    print(count)
                else:
                    w.write("\n")
        w.close()
def get_sku(location, instance_type, products_file):
    """
    Optimized JSON parsing to find the SKU for the provided location and type.
    """
    # SKU dicts have prefixes like 76V3SF2FJC3ZR3GH
    sku_dict_prefix = re.compile('^[a-zA-Z0-9]+$')
    sku = ''
    matches = 0
    event_count = 0
    with open(products_file) as f:
        parser = ijson.parse(f)
        for prefix, event, value in parser:
            event_count += 1
            if prefix.endswith('.sku'):
                # Save the SKU of the current SKU dict
                sku = value
            elif prefix.endswith('.productFamily') and value == "Compute Instance":
                matches += 1
            elif prefix.endswith('.location') and value == location:
                matches += 1
            elif prefix.endswith('.instanceType') and value == instance_type:
                matches += 1
            elif event == 'end_map' and sku_dict_prefix.match(prefix):
                # We've reached the end of the SKU dict, is this the right one?
                if matches == 3:
                    # All three values matched, this is our sku
                    logger.debug("SKU: {}".format(sku))
                    return sku
                else:
                    # This wasn't the right SKU dict, reset our matches
                    matches = 0
def doAFile(thisFilename, myEventID=""):
    global fileEntry, usefulName, moreUsefulName, usefulBit, moreUsefulBit, dirPFX

    DBLinks = DB.doOpen(C.SQLCONFIG)
    # FDUseful = open(usefulName, "w")
    # FDUseful.write(f"""USEFULPREFIX = {C.OBSSTR}{C.NEWLINE}""")
    # FDUseful.flush()
    # FDUseful.close()
    for prefix, the_type, value in IJ.parse(open(thisFilename)):
        if prefix == "" or prefix is None:
            continue
        if prefix.find(myEventID) > -1 and myEventID != "":
            prefix = prefix.replace(myEventID, "$EVENTID$")
        thisTuple = (prefix, the_type, value)
        # itemList.append(thisTuple)
        if DB.checkUpdatePrefix(DBLinks, prefix) is False:
            emptyPFXDict = C.PREFIXEMPTYDICT()
            emptyPFXDict[C.PFXlastSeen] = TDS.nowStrSql(TDS.DT.now())
            emptyPFXDict[C.PFXfirstSeen] = TDS.nowStrSql(TDS.DT.now())
            emptyPFXDict[C.PFXprefixStr] = prefix
            emptyPFXDict[C.PFXkeyType] = the_type
            DB.insertDict(DBLinks, C.GEOJSONPREFIXTABLENAME, emptyPFXDict, C.MYFIELDTYPESDICT)
            # FDUseful = open(usefulName, "ta")
            # outStr = f"""{C.TABSTR}{C.DQTSTR}{COMBO}{C.DQTSTR},{C.NEWLINE}"""
            # FDUseful.write(outStr)
            # FDUseful.flush()
            # FDUseful.close()
            FDMoreUseful = open(moreUsefulName, "ta")
            outStr = f"""({C.DQTSTR}{prefix}{C.DQTSTR}, {C.DQTSTR}{the_type}{C.DQTSTR}, C.TYPE, """
            outStr += f"""{C.DQTSTR}headerName{C.DQTSTR}, SQLDEFAULT, SCNDEFAULT,),{C.NEWLINE}"""
            FDMoreUseful.write(f"""{str(thisTuple)}{C.NEWLINE}""")
            FDMoreUseful.flush()
            FDMoreUseful.close()
def extract_path_value_pairs_from_json_iter(inp):
    if isinstance(inp, str):
        inp = io.StringIO(inp)

    path_set = set()
    stack = []

    def get_path():
        q = ''.join(stack)
        if not q.startswith('.'):
            return '.' + q
        return q

    for prefix, event, value in ijson.parse(inp):
        if event == 'start_array':
            stack.append('[]')
        elif event == 'map_key':
            stack.pop()  # remove a previous key or a dummy (in case of the first element).
            stack.append('.' + value)
        elif event == 'start_map':
            stack.append(None)  # dummy
        elif event in ('end_array', 'end_map'):
            stack.pop()
        else:
            assert event in ('boolean', 'number', 'string', 'null')
            q = get_path()
            if q not in path_set:
                yield q, value
                path_set.add(q)
def parse(self, response):
    f = urllib.urlopen(open("spiders/amazon/query.txt").read())
    parser = ijson.parse(f)
    for prefix, event, value in parser:
        if prefix == "jobs.item.url":
            yield Request("http://www.amazon.jobs%s" % value, callback=self.parse_link)
def detect_format(path, root_path=''):
    """
    Returns the format of OCDS data, and whether the OCDS data is concatenated or in an array.
    If the OCDS data is concatenated or in an array, assumes that all items have the same
    format as the first item.

    :param str path: the path to a file
    :param str root_path: the path to the OCDS data within the file
    :returns: the format, whether data is concatenated, and whether data is in an array
    :rtype: tuple
    :raises UnknownFormatError: if the format cannot be detected
    """
    with open(path, 'rb') as f:
        events = iter(ijson.parse(f, multiple_values=True))

        while True:
            prefix, event, value = next(events)
            if prefix == root_path:
                break

        if prefix:
            prefix += '.'

        if event == 'start_array':
            prefix += 'item.'
        elif event != 'start_map':
            raise UnknownFormatError('top-level JSON value is a {}'.format(event))

        records_prefix = '{}records'.format(prefix)
        releases_prefix = '{}releases'.format(prefix)
        ocid_prefix = '{}ocid'.format(prefix)
        tag_item_prefix = '{}tag.item'.format(prefix)

        has_records = False
        has_releases = False
        has_ocid = False
        has_tag = False
        is_compiled = False
        is_array = event == 'start_array'

        for prefix, event, value in events:
            if prefix == records_prefix:
                has_records = True
            elif prefix == releases_prefix:
                has_releases = True
            elif prefix == ocid_prefix:
                has_ocid = True
            elif prefix == tag_item_prefix:
                has_tag = True
                if value == 'compiled':
                    is_compiled = True

            if not prefix and event not in ('end_array', 'end_map', 'map_key'):
                return _detect_format_result(True, is_array, has_records, has_releases,
                                             has_ocid, has_tag, is_compiled)

        return _detect_format_result(False, is_array, has_records, has_releases,
                                     has_ocid, has_tag, is_compiled)
def collect_data(self):
    outfile = open('/opt/projects/domain_nlp/sentences', 'w+', buffering=False)
    story_title = None
    parsed = ijson.parse(open(self.json_filename), buf_size=1024)
    comments = []
    line = 0
    while True:
        key, val = parsed.next()
        if not key and not val:
            break
        line += 1
        if key == 'map_key':
            next_key, next_val = parsed.next()
            line += 1
            if val == 'points':
                points = next_val
                if points < 1:
                    continue
            elif val == 'story_title':
                story_title = next_val
            elif val == 'comment_text' and next_val:
                comment_text = self.sanitize_text(next_val, story_title)
                outfile.writelines(comment_text)
                # comments.append({'comment':comment_text,'points':points,
                #                  'story_title':story_title})
        if line % 100000 == 0:
            print len(comments)
    return comments
def retrieve_from_mgrast(meta_id, stage_name="upload"):
    url = "http://api.metagenomics.anl.gov/1/download/mgm" + meta_id
    print("Retrieving " + meta_id + " from " + url)
    res = urllib2.urlopen(url)
    #page = res.read()
    #ip = ijson.parse(page)
    ip = ijson.parse(res)
    elements = {}
    state = 0
    for p, e, v in ip:
        #print str(p) + ":" + str(e)
        if state == 2:
            #print str(p) + ":" + str(e)
            if str(p) == data_item_file_name:
                elements[data_item_file_name] = str(v)
            if str(p) == data_item_url:
                elements[data_item_url] = str(v)
        if state == 1:
            if str(p) == 'data.item.stage_name' and str(v) == stage_name:
                state = 2
        if str(p) == 'data.item' and str(e) == 'start_map':
            #print("start_map")
            state = 1
        if str(p) == 'data.item' and str(e) == 'end_map':
            #print("end_map")
            state = 0
    return elements
def distributed_save():
    f = open('namuwiki_20190312.json')
    data = ijson.parse(f)
    p = open('namu_sentences.txt', 'w', encoding='utf-8')
    for prefix, event, value in data:
        if prefix == "item.text":
            sentences = re.compile('[\n]').split(value)
            for i, s in enumerate(sentences):
                try:
                    sentences[i] = get_namuwiki_text(s.strip())
                except Exception as e:
                    print(e, "in sentence", s)
            new_s = '\n'.join(sentences)
            new_s = re.compile('[]]|[[]|[|][|]|[{][{][{]|[}][}][}]').sub('', new_s)
            new_s = re.compile('[.]+|[?]|[\n]').split(new_s)
            new_s = [s.strip() for s in new_s if len(s) > 2]
            p.write('\n'.join(new_s) + '\n')
            print(new_s)
        elif prefix == "item.title":
            p.write(value + '\n')
    p.close()
def get_tables(self):
    prefixes = dict()
    with open(self._filename) as f:
        for prefix, event, _ in ijson.parse(f):
            if event in ("start_map", "start_array"):
                prefixes[".".join([self._root_prefix, prefix])] = None
    return [self._root_prefix] + list(prefixes.keys())[1:]
def load_json(filename):
    with open(filename, 'r') as fd:
        parser = ijson.parse(fd)
        ret = {'signatures': {}, 'antivirus': {}}
        for prefix, event, value in parser:
            if prefix == "signatures.item.name":
                signature = value
                ret['signatures'][signature] = {}
                ret['signatures'][signature]['markcount'] = markcount
                ret['signatures'][signature]['severity'] = severity
                ret['signatures'][signature]['description'] = description
            if prefix == "signatures.item.markcount":
                markcount = str(value)
            if prefix == "signatures.item.severity":
                severity = str(value)
            if prefix == "signatures.item.description":
                description = str(value)
            if prefix.startswith("virustotal.scans") and prefix.endswith('.result'):
                # print(value)
                # av_name=prefix.split('.')[2]
                if value is not None:
                    av_name = prefix.split('.')[2]
                    ret['antivirus'][av_name] = value
        return ret
def _parse_categories_json(cls):
    categories_map = {}

    categories_json_stream = cls._request_epg_json(VaderStreamsConstants.CATEGORIES_PATH,
                                                   VaderStreamsConstants.CATEGORIES_JSON_FILE_NAME,
                                                   {})

    logger.debug('Processing VaderStreams JSON categories\n'
                 'File name => {0}'.format(VaderStreamsConstants.CATEGORIES_JSON_FILE_NAME))

    try:
        ijson_parser = ijson.parse(categories_json_stream)

        for (prefix, event, value) in ijson_parser:
            if event == 'string':
                categories_map[int(prefix)] = value

        logger.debug('Processed VaderStreams JSON categories\n'
                     'File name => {0}'.format(VaderStreamsConstants.CATEGORIES_JSON_FILE_NAME))
    except Exception:
        logger.debug('Failed to process VaderStreams JSON categories\n'
                     'File name => {0}'.format(VaderStreamsConstants.CATEGORIES_JSON_FILE_NAME))

        raise

    return categories_map
def extract_iocs(self, report):
    iocs = set()
    parser = ijson.parse(report)
    lines = ""
    for prefix, event, value in parser:
        if prefix in [
                "analysis.behavior.network.tcp.packet.item.srcip",
                "analysis.behavior.network.tcp.packet.item.dstip",
                "analysis.behavior.network.udp.packet.item.srcip",
                "analysis.behavior.network.udp.packet.item.dstip",
                "analysis.behavior.network.dns.packet.item.name",
        ]:
            if not value.startswith("192.168."):
                iocs.add(value)
        elif prefix in [
                "analysis.behavior.network.http.packet.item.header",
                "analysis.behavior.network.https.packet.item.header",
                "analysis.behavior.network.sslhttp.packet.item.header",
        ]:
            lines = ""
        elif prefix == "analysis.behavior.network.http.packet.item.header.line.item":
            lines += "{}\n".format(value)
            self.extract_url("http", iocs, lines)
        elif prefix in [
                "analysis.behavior.network.https.packet.item.header.line.item",
                "analysis.behavior.network.sslhttp.packet.item.header.line.item"
        ]:
            lines += "{}\n".format(value)
            self.extract_url("https", iocs, lines)

    for ioc in iocs:
        self.add_ioc(ioc)
def yield_offenders(stream, matcher=KEY_RE):
    parser = ijson.parse(stream)
    for prefix, event, value in parser:
        if event != 'map_key':
            continue
        if matcher.match(value):
            continue
        yield prefix, value
def iter_large_json(json_file, prefix_value, event_value):
    import ijson

    parser = ijson.parse(open(json_file))

    for prefix, event, value in parser:
        # For Flowdock data ('item.content', 'string')
        if (prefix, event) == (prefix_value, event_value):
            yield value
def test__json_next_atomic_array():
    t = (2, 3, 4, 5, 6)
    s = json.dumps(t)
    if sys.version_info[0] < 3:
        s = unicode(s)
    it = ijson.parse(io.StringIO(s))
    a = _json_next_atomic_array(it)
    assert len(t) == len(a)
    assert all(x == y for x, y in zip(t, a))
def json_usage():
    file = r'E:\store_data\meituan.json'
    start = 0
    with codecs.open(file, 'r', encoding='utf8') as f:
        objects = ijson.parse(f)
        for prefix, event, value in objects:
            print(prefix, event, value)
            if start > 200:
                break
            start += 1
def getJsonFile():
    if LOCAL == None:
        downloadFile()
        curr_path = os.path.dirname(os.path.abspath(__file__))
        json_data = open(curr_path + '/' + FILENAME)
    else:
        json_data = open(LOCAL)
    f = ijson.parse(json_data)
    return f
def fromJson(self, s, *args):
    import ijson
    it = ijson.parse(Stream(s))

    def _read():
        try:
            e = it.next()
        except StopIteration:
            return self.eos
        return [[e[1], e[2]], Lazy(_read)]

    return _read()
def exec_command(self, string_return=False):
    """
    :param string_return: if true, returns a string summary. Otherwise, writes to out_file
    :return:
    """
    json_stream = open(self._in_file)
    elem_id = None
    elem_type = None
    desc_en = None
    label_en = None
    properties = {}
    current_claim_key = None
    elem_count = 1
    for prefix, event, value in ijson.parse(json_stream):
        if event == "end_map":
            if prefix == "item":
                self._process_current_data(elem_id, elem_type, desc_en, label_en, properties)
                elem_id = None
                elem_type = None
                desc_en = None
                current_claim_key = None
                label_en = None
                properties = {}
                elem_count += 1
                if elem_count % 500 == 0:
                    print "Llevamos " + str(elem_count) + " elementos"
            if prefix == "item.claims." + str(current_claim_key) + ".item":
                # print 'item.claims.' + str(current_claim_key) + '.item'
                properties[current_claim_key] += 1
        elif event == "string":
            if prefix == "item.id":
                elem_id = value
            elif prefix == "item.type":
                elem_type = value
            elif prefix == "item.descriptions.en.value":
                desc_en = value
            elif prefix == "item.labels.en.value":
                label_en = value
        elif event == "map_key" and prefix == "item.claims":
            properties[value] = 0
            current_claim_key = value

    print "Errores en propiedades: ", self._err_count_prop
    print "Errores en items: ", self._err_count_item

    if not string_return:
        self._write_to_file()
    else:
        return self._get_string_return()
def _parse_dump_file(self):
    json_stream = open(self._in_dump_file, "r")
    elem_id = None
    elem_type = None
    desc_en = None
    label_en = None
    datatype = None
    datavalue_type = None
    current_claim_key = None
    datavalue_num_id = None
    possible_edges = []
    elem_count = 1
    for prefix, event, value in ijson.parse(json_stream):
        if event == 'end_map':
            if prefix == 'item':
                for tuple_4 in possible_edges:
                    if self._is_valid_edge(elem_type, tuple_4[0], tuple_4[1]):
                        # triple: datatype, datavalue_type, datavalue_num_id
                        self._add_triple_if_proceed(elem_id, tuple_4[2], 'Q' + tuple_4[3],
                                                    label_en, desc_en)
                        # print elem_id, tuple_4[2], 'Q' + tuple_4[3]
                elem_id = None
                elem_type = None
                current_claim_key = None
                # label_en = None
                datavalue_num_id = None
                datavalue_type = None
                elem_count += 1
                possible_edges = []
                if elem_count % 10000 == 0:
                    print 'Llevamos ' + str(elem_count)
            elif prefix == "item.claims." + str(current_claim_key) + ".item":
                possible_edges.append((datatype, datavalue_type, current_claim_key,
                                       str(datavalue_num_id)))
        elif event == 'string':
            if prefix == 'item.id':
                elem_id = value
            elif prefix == 'item.type':
                elem_type = value
            elif prefix == 'item.claims.' + str(current_claim_key) + '.item.mainsnak.datatype':
                datatype = value
            elif prefix == 'item.claims.' + str(current_claim_key) + '.item.mainsnak.datavalue.value.entity-type':
                datavalue_type = value
            elif prefix == 'item.labels.en.value':
                label_en = value
            elif prefix == 'item.descriptions.en.value':
                desc_en = value
        elif event == 'map_key' and prefix == 'item.claims':
            current_claim_key = value
        elif event == 'number' and prefix == 'item.claims.' + str(
                current_claim_key) + '.item.mainsnak.datavalue.value.numeric-id':
            datavalue_num_id = value
def __init__(self, in_stream):
    if in_stream is None:
        raise AssertionError('input stream must not be none')
    self.__parser = ijson.parse(in_stream)
    self.__pre_meta_data = []
    self.__post_meta_data = []
    self.__aspect_element_counts = {}
    self.__aspect_names = set()
    self.__first_element = None
    for e in self.aspect_elements():
        if e is not None:
            self.__first_element = e
        break
def main(args):
    print("started at {0}".format(time.time()))
    parser = ijson.parse(open(args.json_file))
    session = requests.Session()
    parallelJobs = pp.Server()
    parallelJobs.set_ncpus(args.threads)
    for prefix, event, value in parser:
        if value is not None and event != 'map_key':
            # ijson sends the prefix as a string of keys connected by periods,
            # but Firebase uses periods for special values such as priority.
            # 1. Find '..', and store the indexes of the second period
            doublePeriodIndexes = [m.start() + 1 for m in re.finditer('\.\.', prefix)]
            # 2. Replace all '.' with ' '
            prefix = prefix.replace('.', ' ')
            # 3. Use stored indexes of '..' to recreate second periods in the pairs of periods
            prefixList = list(prefix)
            for index in doublePeriodIndexes:
                prefixList[index] = '.'
            prefix = "".join(prefixList)
            # 4. Split on whitespace
            prefixes = prefix.split(' ')
            lastPrefix = prefixes[-1]
            prefixes = prefixes[:-1]

            url = args.firebase_url
            for prefix in prefixes:
                url += prefix + '/'
            url += '.json'
            if args.silent:
                url += '?print=silent'

            if not args.priority_mode:
                if lastPrefix == '.priority':
                    continue
            else:
                if lastPrefix != '.priority':
                    continue

            if event == 'number':
                dataObj = {lastPrefix: float(value)}
            else:
                dataObj = {lastPrefix: value}

            try:
                parallelJobs.submit(sendData, (url, dataObj, session, args), (),
                                    ("json", "requests"))
            except Exception, e:
                print('Caught an error: ' + traceback.format_exc())
            print prefix, event, value
def jsonObjectReader(filepath):
    """
    Creates a generator that parses an array of json objects from a valid
    json array file, yielding each top level json object in the array.

    :param filepath: path to json file.
    """
    top_level_array = False
    array_stack = 0
    top_level_object = False
    object_stack = 0
    parser = ijson.parse(open(filepath, 'r'))
    for prefix, event, value in parser:
        if event == 'start_array':
            if not top_level_array:
                top_level_array = True
                continue
            else:
                array_stack += 1

        if event == 'start_map':
            if not top_level_object:
                top_level_object = True
                builder = ijson.ObjectBuilder()
            else:
                object_stack += 1

        if event == 'end_map':
            if not top_level_object:
                raise Exception('end_map without a top level object')
            else:
                if object_stack == 0:
                    top_level_object = False
                    yield builder.value
                else:
                    object_stack -= 1

        if event == 'end_array':
            if not top_level_array:
                raise Exception('end_array without a top level array')
            else:
                if array_stack == 0:
                    top_level_array = False
                else:
                    array_stack -= 1

        # convert Decimal to float because mongo can't serialize Decimal
        # TODO is this the right place to do this? Should it be done instead
        # upon save?
        if isinstance(value, decimal.Decimal):
            # TODO this has different behavior on python 2.6 vs 2.7 due to
            # different rounding behavior
            value = float(value)

        builder.event(event, value)
def mergeDetails(cids, worker, project):
    data = []
    missed = []
    for cid in cids:
        res = getGerritChangeRequest(cid)
        parser = ijson.parse(res)
        author = ''
        date = ''
        time = ''
        patch = ''
        missedpatch = ''
        for prefix, event, value in parser:
            if prefix == 'messages.item.author.name':
                author = value
            if prefix == 'messages.item.date':
                cutoff = len(value) - 3
                date = datetime.strptime(value[:cutoff], "%Y-%m-%d %H:%M:%S.%f")
            if prefix == 'messages.item.message' and author == worker:
                dat = value.strip().split(':')
                if 'patch' in dat[0].lower():
                    patch = dat[0].split()
                    patch = re.sub("[^0-9]", "", patch[len(patch) - 1])
                if 'build successful' in value.lower() or 'build succeeded' in value.lower() and not "FAILURE" in value:
                    success = True
                elif 'build failed' in value.lower() or 'build unsuccessful' in value.lower() or "FAILURE" in value:
                    success = False
                else:
                    continue
                try:
                    item = [int(cid), int(patch), date.date(), date.time(), success]
                    data += [item]
                except:
                    continue
            elif prefix == 'messages.item.message' and author != worker:
                if 'starting check jobs' in value.lower():
                    continue
                dat = value.strip().split(':')
                if 'patch' in dat[0].lower():
                    missedpatch = dat[0].split()
                    missedpatch = re.sub("[^0-9]", "", missedpatch[len(missedpatch) - 1])
                try:
                    missed += [[int(cid), int(missedpatch), date.date(), date.time()]]
                except:
                    continue
        if len(data) >= 10:
            mergeChunk(data, missed, worker, project)
            data = []
            missed = []
    mergeChunk(data, missed, worker, project)
def __superficial_check(cls, fd):
    """Check that the cis and links fields are arrays. If not, raise a
    jsonschema.ValidationError. It moves the cursor of the fd back to 0."""
    # cis and links store whether cis and links fields are found in the import
    # *_start store whether the beginning of a json array was found in the cis
    # and links fields of an import
    # *_end store whether the end of a json array was found in the cis and
    # links fields of an import
    cis = False
    cis_start = False
    cis_end = False
    links = False
    links_start = False
    links_end = False

    parser = ijson.parse(fd)
    for prefix, event, _ in parser:
        if prefix == "cis":
            cis = True
            if event == "end_array":
                cis_end = True
            if event == "start_array":
                cis_start = True
        if prefix == "links":
            links = True
            if event == "end_array":
                links_end = True
            if event == "start_array":
                links_start = True

    fd.seek(0)

    cis_status = (cis, cis_start, cis_end)
    links_status = (links, links_start, links_end)

    # ok is a filter to ascertain if a cis/link field of an import is
    # correct.
    ok = [(True, True, True), (False, False, False)]

    if cis_status in ok and links_status in ok:
        return True
    elif cis_status not in ok and links_status not in ok:
        raise jsonschema.ValidationError(
            "CIS and LINKS should be an array.")
    elif cis_status not in ok:
        raise jsonschema.ValidationError("CIS should be an array.")
    elif links_status not in ok:
        raise jsonschema.ValidationError("LINKS should be an array.")
def sniff(self):
    with self.open() as fh:
        try:
            parser = ijson.parse(fh)
            for prefix, event, value in parser:
                if (prefix, event) == ('', 'map_key'):
                    # `format_url` seems pretty unique to BIOM 1.0.
                    if value == 'format_url':
                        return True
                    elif value not in self.top_level_keys:
                        return False
        except (ijson.JSONError, UnicodeDecodeError):
            pass
    return False
def _read_edges(self):
    json_stream = open(self._in_file)
    elem_id = None
    elem_type = None
    # desc_en = None
    # label_en = None
    datatype = None
    datavalue_type = None
    current_claim_key = None
    datavalue_num_id = None
    possible_edges = []
    elem_count = 1
    for prefix, event, value in ijson.parse(json_stream):
        if event == 'end_map':
            if prefix == 'item':
                for tuple_4 in possible_edges:
                    if self._is_valid_edge(elem_type, tuple_4[0], tuple_4[1]):
                        # triple: datatype, datavalue_type, datavalue_num_id
                        yield (elem_id, 'Q' + tuple_4[3])
                        # pass
                elem_id = None
                elem_type = None
                current_claim_key = None
                # label_en = None
                datavalue_num_id = None
                datavalue_type = None
                elem_count += 1
                possible_edges = []
                if elem_count % 10 == 0:
                    print 'Llevamos ' + str(elem_count) + ' elementos'
            elif prefix == "item.claims." + str(current_claim_key) + ".item":
                possible_edges.append((datatype, datavalue_type, current_claim_key,
                                       str(datavalue_num_id)))
        elif event == 'string':
            if prefix == 'item.id':
                elem_id = value
            elif prefix == 'item.type':
                elem_type = value
            elif prefix == 'item.claims.' + str(current_claim_key) + '.item.mainsnak.datatype':
                datatype = value
            elif prefix == 'item.claims.' + str(current_claim_key) + '.item.mainsnak.datavalue.value.entity-type':
                datavalue_type = value
            # elif prefix == 'item.labels.en.value':
            #     label_en = value
        elif event == 'map_key' and prefix == 'item.claims':
            current_claim_key = value
        elif event == 'number' and prefix == 'item.claims.' + str(
                current_claim_key) + '.item.mainsnak.datavalue.value.numeric-id':
            datavalue_num_id = value
def _analyze_possibly_broken_json(line: str) -> Iterator[dict]:
    readers: Any = {}  # too complicated for proper types
    f = io.StringIO(line)
    data: Union[Dict, List]
    try:
        for key, type_, value in ijson.parse(f):
            listeners = registry.get(key, [])
            for reader in readers.values():
                _, stack, property_name = reader
                parent = stack[-1]
                if type_ == 'start_map':
                    data = {}
                    stack.append(data)
                    if isinstance(parent, dict):
                        parent[property_name] = data
                    else:
                        parent.append(data)
                    continue
                if type_ in ('end_map', 'end_array'):
                    stack.pop()
                    continue
                if type_ == 'map_key':
                    reader[2] = value
                    continue
                if type_ == 'start_array':
                    data = []
                    stack.append(data)
                    if isinstance(parent, dict):
                        parent[property_name] = data
                    else:
                        parent.append(data)
                    continue
                if isinstance(parent, dict):
                    parent[property_name] = value
                else:
                    parent.append(value)
            for func in listeners:
                if type_ == 'start_map':
                    # Start reading
                    initial_data: Dict = {}
                    readers[func] = [initial_data, [initial_data], None]
                elif type_ == 'end_map':
                    yield from func(readers.pop(func)[0])
    except (ijson.common.IncompleteJSONError, ijson.backends.python.UnexpectedSymbol):
        pass
    finally:
        f.close()
    for func, reader in readers.items():
        for result in func(reader[0]):
            yield result
def yield_elements(self):
    json_stream = open(self._in_file)
    elem_id = None
    elem_type = None
    desc_en = None
    label_en = None
    properties = []
    current_claim_key = None
    elem_count = 1
    for prefix, event, value in ijson.parse(json_stream):
        if event == 'end_map':
            if prefix == 'item':
                if elem_type == 'item':
                    yield WikidataEntity(entity_id=elem_id,
                                         label=label_en,
                                         description=desc_en,
                                         outcoming_properties_id=properties)
                elif elem_type == 'property':
                    yield WikidataProperty(property_id=elem_id,
                                           label=label_en,
                                           description=desc_en,
                                           outcoming_properties_id=properties)
                elem_id = None
                elem_type = None
                desc_en = None
                current_claim_key = None
                label_en = None
                properties = []
                elem_count += 1
                if elem_count % 500 == 0:
                    print 'Llevamos ' + str(elem_count) + ' elementos'
            if prefix == 'item.claims.' + str(current_claim_key) + '.item':
                # print 'item.claims.' + str(current_claim_key) + '.item'
                properties.append(current_claim_key)
        elif event == 'string':
            if prefix == 'item.id':
                elem_id = value
            elif prefix == 'item.type':
                elem_type = value
            elif prefix == 'item.descriptions.en.value':
                desc_en = value
            elif prefix == 'item.labels.en.value':
                label_en = value
        elif event == 'map_key' and prefix == 'item.claims':
            current_claim_key = value
def get_bucket_primary_dict(json_file_path):
    bucket_primary_dict = {}
    bucket_name = None
    primary_peer = None
    with open(json_file_path, 'r') as f:
        parser = ijson.parse(f)
        for prefix, event, value in parser:
            if prefix == "entry.item.name":
                bucket_name = value
            if value == "0xffffffffffffffff":
                primary_peer = prefix
            if bucket_name is not None and primary_peer is not None:
                bucket_primary_dict[bucket_name] = primary_peer
                bucket_name = None
                primary_peer = None
    return bucket_primary_dict
def main():
    # Establish communication queues
    tasks = multiprocessing.JoinableQueue(16)
    results = multiprocessing.Queue()

    # Start consumers
    num_consumers = multiprocessing.cpu_count() * 2
    print('Creating %d consumers' % num_consumers, file=sys.stderr)
    consumers = [Consumer(tasks, results) for i in xrange(num_consumers)]
    for w in consumers:
        w.start()

    r = requests.post(NEO4J_CREATE_TRAN_URL, headers=generateheaders(),
                      data=json.dumps(GET_PDB_IDS_NEO4J_STATEMENT), stream=True)
    parser = ijson.parse(r.raw)
    i = 0
    buildingObject = False
    pdb_id_list = []
    builder = None
    counter = 0
    for prefix, event, value in parser:
        if (not buildingObject) & ((prefix, event) == ('results.item.data.item', 'start_map')):
            buildingObject = True
            builder = ijson.ObjectBuilder()
            builder.event(event, value)
        elif buildingObject & ((prefix, event) == ('results.item.data.item', 'end_map')):
            buildingObject = False
            builder.event(event, value)
            # put builder.value as object to work on
            # process_pdb_id(value)
            tasks.put(betaSheetOrder(builder.value["row"], counter), True, None)
            counter += 1
        elif buildingObject:
            builder.event(event, value)

    print("%d domains found" % counter, file=sys.stderr)

    # Add a poison pill for each consumer
    for i in xrange(num_consumers):
        tasks.put(None)

    # Wait for all of the tasks to finish
    tasks.join()
def get_json_via_ijson(url):
    """
    This function creates a parser object from a json url

    Parameters
    -----------
    url - a URL ending in .json

    Returns
    -------
    An ijson parsing object
    """
    from ijson import parse

    # load the json
    f = urllib2.urlopen(url, context=gcontext)
    # Parse the json
    parser = parse(f)
    return parser
def __init__(self, in_stream):
    """ Creates a new CxReader for reading from "in_stream".

    :param in_stream: object
                      A file-like object to read from
    """
    if in_stream is None:
        raise AssertionError('input stream must not be none')
    self.__parser = ijson.parse(in_stream)
    self.__pre_meta_data = []
    self.__post_meta_data = []
    self.__aspect_element_counts = {}
    self.__aspect_names = set()
    self.__first_element = None
    self.__number_verification = None
    self.__status = None
    for e in self.aspect_elements():
        if e is not None:
            self.__first_element = e
        break