def _autocomplete_request(self, view, cache, request, text, offset,
                          included=lambda item: True):
    """Run the completion command for `request` and return the parsed results."""
    # this should not happen, but just in case, do not
    # overload the system with too many requests
    if len(self.current_requests) > self.get_settings(view, "concurrent_request_limit", 4):
        raise AutocompleteRequestError("Request denied: too many concurrent requests.")

    # prevent duplicate requests
    if request in self.current_requests:
        raise AutocompleteRequestError(
            "Request denied: completion for \"{request}\" "
            "already in progress.".format(request=request)
        )

    # start request
    self.current_requests.add(request)
    try:
        # get completion command
        cmd = self.get_completion_cmd(view, text, offset)

        # run completion command and stream its JSON output
        p = Popen(cmd, shell=True, stdout=PIPE, stderr=STDOUT)
        parser = ijson.parse(p.stdout)
        completions = list(self._parse_completions(parser, included=included))
    finally:
        # finish request, even if the command or the parse failed
        self.current_requests.discard(request)
    return completions
def gen_entropy_data(json):
    entropy = []
    for prefix, event, value in ijson.parse(json):
        if prefix.endswith('.entropy'):
            entropy.append(float(value))
    return entropy
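# Hedged usage sketch (added, not from the original source): the gen_*
# helpers in this file only need a binary file object whose JSON tree
# contains the relevant leaves. The filename "samples.json" is hypothetical.
def demo_entropy(path='samples.json'):
    with open(path, 'rb') as f:
        entropies = gen_entropy_data(f)
    # mean entropy across all parsed samples (0.0 for an empty report)
    return sum(entropies) / len(entropies) if entropies else 0.0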
def gen_dylibs_data(json):
    dylibs = Counter()
    total = 0
    for prefix, event, value in ijson.parse(json):
        if prefix.endswith('.filetype'):
            total += 1
        if prefix.endswith('.dylibs.item'):
            dylibs[value] += 1
    # convert raw counts to per-sample frequencies
    for i in dylibs:
        dylibs[i] = dylibs[i] * (1.0 / total)
    return dylibs
def gen_imports_data(json):
    imports = Counter()
    total = 0
    function = False
    for prefix, event, value in ijson.parse(json):
        if prefix.endswith('.filetype'):
            total += 1
        if prefix.endswith('.imports.item'):
            function = True
        if prefix.endswith('.imports.item.item') and function:
            imports[value] += 1
            function = False
    # convert raw counts to per-sample frequencies
    for i in imports:
        imports[i] = imports[i] * (1.0 / total)
    return imports
def gen_lcs_data(json):
    lcs = Counter()
    total = 0
    segment = False
    segment_64 = False
    for prefix, event, value in ijson.parse(json):
        if prefix.endswith('.filetype'):
            total += 1
        if prefix.endswith('.lcs.item.cmd'):
            if value == 'LOAD_DYLIB':
                continue
            elif value == 'SEGMENT':
                segment = True
            elif value == 'SEGMENT_64':
                segment_64 = True
            else:
                lcs[value] += 1
        if prefix.endswith('.lcs.item.name'):
            if segment:
                lcs['SEGMENT (' + value + ')'] += 1
                segment = False
            elif segment_64:
                lcs['SEGMENT_64 (' + value + ')'] += 1
                segment_64 = False
    print('Samples parsed:', total)
    # convert raw counts to per-sample frequencies
    for l in lcs.keys():
        lcs[l] = lcs[l] * (1.0 / total)
    return lcs
def gen_ndylibs_data(json):
    data = []
    for prefix, event, value in ijson.parse(json):
        if prefix.endswith('.ndylibs'):
            data.append(value)
    return data
def gen_slcs_data(json):
    slcs = []
    for prefix, event, value in ijson.parse(json):
        if prefix.endswith('.slcs'):
            slcs.append(value)
    return slcs
def gen_sects_data(json):
    sects = Counter()
    total = 0
    segname = None
    name = None
    for prefix, event, value in ijson.parse(json):
        if prefix.endswith('.filetype'):
            total += 1
        elif prefix.endswith('.sects.item.segname'):
            if name is None:
                segname = value
            else:
                sects[value + ', ' + name] += 1
                name = None
        elif prefix.endswith('.sects.item.name'):
            if segname is None:
                name = value
            else:
                sects[segname + ', ' + value] += 1
                segname = None
    # convert raw counts to per-sample frequencies
    for s in sects.keys():
        sects[s] = sects[s] * (1.0 / total)
    return sects
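# Hedged usage sketch (added): Counter results from the gen_* helpers can
# be ranked directly; "samples.json" is again a hypothetical report file.
def demo_top_sections(path='samples.json', n=10):
    with open(path, 'rb') as f:
        sects = gen_sects_data(f)
    # the n most frequent (segment, section) pairs with their frequencies
    return sects.most_common(n)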
def json2generator(data, arrayKey=None):
    """Convert the given JSON string into a generator.

    This avoids exhausting memory on huge payloads. Can only produce a
    generator for an array (of any nesting depth or complexity).
    arrayKey must point at that array and may be a dotted chain (key1.key2).
    """
    import math
    from itertools import imap
    from ijson import common
    from cStringIO import StringIO
    #! the yajl2 backend is significantly faster, but it could never be
    #! installed on the first server ("Yajl shared object cannot be found")
    try:
        import ijson.backends.yajl2_cffi as ijson
    except ImportError:
        try:
            from ijson.backends import yajl2 as ijson
        except ImportError:
            try:
                from ijson.backends import yajl as ijson
            except ImportError:
                from ijson.backends import python as ijson
    try:
        f = StringIO(data)
    except Exception:
        f = StringIO(data.encode('utf-8'))

    def _fixJSON(event):
        # work around a decoding "feature" that coerces every numeric
        # value to Decimal(): keep floats as floats and integers as ints
        if event[1] == 'number':
            return (event[0], event[1],
                    float(event[2]) if math.modf(event[2])[0] else int(event[2]))
        return event

    events = imap(_fixJSON, ijson.parse(f))
    g = common.items(events, arrayKey + '.item' if arrayKey else 'item')
    return g
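# Hedged usage sketch (added): json2generator yields items lazily, so a
# multi-megabyte payload is never fully materialized. The sample payload
# below is illustrative only.
def demo_json2generator():
    data = '{"result": {"rows": [{"id": 1}, {"id": 2.5}]}}'
    # lazily yields {'id': 1} then {'id': 2.5}; note the int/float fix-up
    return list(json2generator(data, arrayKey='result.rows'))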
def validate(self, filepath):
    with open(filepath, 'rb') as vfile:
        parser = ijson.parse(vfile)
        model = list(self._model_prefixes)
        for prefix, event, value in parser:
            pair = (prefix, event)
            if pair in model:
                model.remove(pair)
        return len(model) == 0
def get_proportions(account_stats, conf, silent=True):
    """
    We have a fixed amount of CREA to give out, specified by total_port_balance.
    This needs to be given out subject to the following constraints:
    - The ratio of vesting : liquid CREA is the same on testnet as on mainnet,
    - Everyone's testnet balance is proportional to their mainnet balance,
    - Everyone has at least min_vesting_per_account.
    """
    total_vests = account_stats["total_vests"]
    total_crea = account_stats["total_crea"]
    account_names = account_stats["account_names"]
    num_accounts = len(account_names)

    with open(conf["snapshot_file"], "rb") as f:
        for prefix, event, value in ijson.parse(f):
            if prefix == "dynamic_global_properties.total_vesting_fund_crea.amount":
                total_vesting_crea = int(value)
                break

    min_vesting_per_account = satoshis(conf["min_vesting_per_account"])
    total_port_balance = satoshis(conf["total_port_balance"])
    avail_port_balance = total_port_balance - min_vesting_per_account * num_accounts
    if avail_port_balance < 0:
        raise RuntimeError(
            "Increase total_port_balance or decrease min_vesting_per_account")
    total_port_vesting = (avail_port_balance * total_vesting_crea) // (
        total_crea + total_vesting_crea)
    total_port_liquid = (avail_port_balance * total_crea) // (
        total_crea + total_vesting_crea)

    if total_vests == 0:
        vest_conversion_factor = 1
    else:
        vest_conversion_factor = (DENOM * total_port_vesting) // total_vests

    if total_crea == 0:
        crea_conversion_factor = 1
    else:
        crea_conversion_factor = (DENOM * total_port_liquid) // total_crea

    if not silent:
        print("total_vests:", total_vests)
        print("total_crea:", total_crea)
        print("total_vesting_crea:", total_vesting_crea)
        print("total_port_balance:", total_port_balance)
        print("total_port_vesting:", total_port_vesting)
        print("total_port_liquid:", total_port_liquid)
        print("vest_conversion_factor:", vest_conversion_factor)
        print("crea_conversion_factor:", crea_conversion_factor)

    return {
        "min_vesting_per_account": min_vesting_per_account,
        "vest_conversion_factor": vest_conversion_factor,
        "crea_conversion_factor": crea_conversion_factor
    }
def do_filter(input_file, output_file, filters, verbose):
    logging.basicConfig(level=logging.INFO if verbose else logging.WARNING)
    logger = logging.getLogger('JSON-FILTER')
    json_filter = JsonEventFilter(ijson.parse(input_file), filters)
    writer = ObjectWriter(output_file)
    start_time = time.time()
    for prefix, event, value in json_filter:
        logger.info('%s(%s): %s' % (prefix, event.upper(), value))
        writer.event(event, value)
    logger.info('Finished in: %s seconds.' % (time.time() - start_time))
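# Hedged usage sketch (added): do_filter streams events from input_file
# through the filters and re-serializes them via ObjectWriter. The paths
# are hypothetical and the filter-list format depends on JsonEventFilter;
# an empty list is assumed to mean "pass everything through".
def demo_do_filter():
    with open('in.json', 'rb') as src, open('out.json', 'w') as dst:
        do_filter(src, dst, filters=[], verbose=True)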
def gprocess(i, fns):
    """Iteratively parse the file object and generate the output.

    Arguments:
    i: the index of the file to process
    fns: a list of filenames
    """
    dbs = list()
    for fn in fns:
        try:
            db = rocksdb.DB(fn + '.db',
                            rocksdb.Options(create_if_missing=False),
                            read_only=True)
            dbs.append(db)
        except Exception:
            raise ValueError(
                "Given DB: {}.db does not exist. Are you sure the name is correct?"
                .format(fn))
    fileobj = open(os.path.join(args.input, fns[i]), 'r')
    ofilename = args.trace + '-out.' + str(i) + '.txt'
    if args.verbose:
        print("\x1b[6;30;43m[i]\x1b[0m opening output file {} for writing...".format(ofilename))
    ofile = open(ofilename, 'a+')
    parser = ijson.common.items(ijson.parse(fileobj, multiple_values=True), '')
    if args.trace == 'camflow':
        if args.verbose:
            print("\x1b[6;30;42m[+]\x1b[0m parsing file {} in CAMFLOW mode...".format(i))
        ptj.gencf(parser, i, dbs, ofile)
    elif args.trace == 'darpa':
        if args.verbose:
            print("\x1b[6;30;42m[+]\x1b[0m parsing file {} in DARPA mode...".format(i))
        ptj.gendp(parser, i, dbs, ofile)
    elif args.trace == 'cadets2' or args.trace == 'fivedirections':
        if args.verbose:
            print("\x1b[6;30;42m[+]\x1b[0m parsing file {} in CADETS2/FIVEDIRECTIONS mode...".format(i))
        ptj.gencd(parser, i, dbs, ofile)
    else:
        raise NotImplementedError("cannot run traces from an unknown system")
    fileobj.close()
    ofile.close()
    return
def process(fn):
    """Iteratively process a file object.

    Arguments:
    fn - file name
    """
    if args.profile:
        if args.verbose:
            print("\x1b[6;30;43m[i]\x1b[0m profiling is on...")
        yappi.clear_stats()
        yappi.set_clock_type('cpu')
        yappi.start(builtins=True)
    db = initdb(fn)
    with open(os.path.join(args.input, fn), 'r') as fileobj:
        parser = ijson.common.items(ijson.parse(fileobj, multiple_values=True), '')
        if args.trace == 'camflow':
            if args.verbose:
                print("\x1b[6;30;42m[+]\x1b[0m parsing file {} in CAMFLOW mode...".format(fn))
            ptj.parsecf(parser, db, fn)
        elif args.trace == 'darpa':
            if args.verbose:
                print("\x1b[6;30;42m[+]\x1b[0m parsing file {} in DARPA mode...".format(fn))
            ptj.parsedp(parser, db, fn)
        elif args.trace == 'cadets2' or args.trace == 'fivedirections':
            if args.verbose:
                print("\x1b[6;30;42m[+]\x1b[0m parsing file {} in CADETS2/FIVEDIRECTIONS mode...".format(fn))
            ptj.parsecd(parser, db, fn)
        else:
            raise NotImplementedError("cannot run traces from an unknown system")
    if args.profile:
        yappi.stop()
        if args.verbose:
            print("\x1b[6;30;43m[i]\x1b[0m profiling is done...")
        stat = yappi.get_func_stats()
        stat.save(fn + '.prof', type='callgrind')
    # the with-statement above already closed fileobj
    return
def yield_obj(path, basepath):
    with gzip.open(path, "r") as fin:
        builder = ijson.common.ObjectBuilder()
        for prefix, event, val in ijson.parse(fin):
            try:
                builder.event(event, val)
            except Exception:
                # dump whatever was built so far to aid debugging
                if hasattr(builder, "value"):
                    print(builder.value)
            if prefix == basepath and event == "end_map":
                if hasattr(builder, "value"):
                    yield builder.value
                builder = ijson.common.ObjectBuilder()
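# Hedged usage sketch (added): yield_obj streams one completed object at a
# time from a gzipped JSON file. The path and basepath below are
# hypothetical and depend on the layout of the input document.
def demo_yield_obj():
    for obj in yield_obj('dump.json.gz', 'records.item'):
        print(obj)  # each map that closes at the 'records.item' prefix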
def serialize(args):
    """Consume raw JSON from the npm registry and spit out CSV for Postgres.
    """
    import ijson.backends.yajl2_cffi as ijson

    path = args.path
    parser = ijson.parse(open(path))
    start = time.time()
    package = None
    nprocessed = 0
    out = csv.writer(sys.stdout)

    def log_stats():
        log("processed {} packages in {:3.0f} seconds".format(
            nprocessed, time.time() - start))

    for prefix, event, value in parser:
        if not prefix and event == b'map_key':
            # Flush the current package. We count on the first package being garbage.
            processed = serialize_one(out, package)
            nprocessed += processed
            if processed and not (nprocessed % 1000):
                log_stats()

            # Start a new package.
            package = {
                'package_manager': b'npm',
                'name': value,
                'description': b'',
                'emails': []
            }

        if package is None:
            continue  # nothing to attribute before the first map_key

        key = lambda k: package['name'] + b'.' + k
        if event == b'string':
            # Who knew? Seems to decode only for `string`.
            assert type(value) is unicode
            value = value.encode('utf8')

        if prefix == key(b'description'):
            package['description'] = value
        elif prefix in (key(b'author.email'), key(b'maintainers.item.email')):
            package['emails'].append(value)

    nprocessed += serialize_one(out, package)  # Don't forget the last one!
    log_stats()
def load_geojson(file_name):
    with open(file_name, 'r') as fd:
        parser = ijson.parse(fd)
        for prefix, event, value in parser:
            if (prefix, event) == ('features.item', 'start_map'):
                feature = Feature()
            elif (prefix, event) == ('features.item', 'end_map'):
                yield feature

            if (prefix, event) == ('features.item.properties', 'start_map'):
                properties = JpCityProperties()
            elif (prefix, event) == ('features.item.properties', 'end_map'):
                feature.properties = properties.__dict__
            elif (prefix, event, value) == ('features.item.properties', 'map_key', 'A27_005'):
                properties.a27_005 = next(parser)[2]
            elif (prefix, event, value) == ('features.item.properties', 'map_key', 'A27_006'):
                properties.a27_006 = next(parser)[2]
            elif (prefix, event, value) == ('features.item.properties', 'map_key', 'A27_007'):
                properties.a27_007 = next(parser)[2]
            elif (prefix, event, value) == ('features.item.properties', 'map_key', 'A27_008'):
                properties.a27_008 = next(parser)[2]
            elif (prefix, event) == ('features.item.geometry', 'start_map'):
                geometry = MultiPolygon()
            elif (prefix, event) == ('features.item.geometry.type', 'string'):
                if value == "MultiPolygon":
                    geometry = MultiPolygon()
                elif value == "Polygon":
                    geometry = Polygon()
                else:
                    raise ValueError("unsupported geometry type: %s" % value)
            elif (prefix, event) == ('features.item.geometry', 'end_map'):
                feature.geometry = geometry
            elif (prefix, event) == ('features.item.geometry.coordinates', 'start_array'):
                coordinates = []
            elif (prefix, event) == ('features.item.geometry.coordinates', 'end_array'):
                geometry.coordinates = coordinates
            elif (prefix, event) == ('features.item.geometry.coordinates.item', 'start_array'):
                coordinates_item = []
            elif (prefix, event) == ('features.item.geometry.coordinates.item', 'end_array'):
                coordinates.append(coordinates_item)
            elif (prefix, event) == ('features.item.geometry.coordinates.item.item', 'start_array'):
                if isinstance(geometry, MultiPolygon):
                    coordinates_item_item = []
                else:
                    # Polygon: the next two events are the coordinate pair
                    coordinates_item.append((next(parser)[2], next(parser)[2]))
            elif (prefix, event) == ('features.item.geometry.coordinates.item.item', 'end_array'):
                if isinstance(geometry, MultiPolygon):
                    coordinates_item.append(coordinates_item_item)
            elif (prefix, event) == ('features.item.geometry.coordinates.item.item.item', 'start_array'):
                if isinstance(geometry, MultiPolygon):
                    coordinates_item_item.append((next(parser)[2], next(parser)[2]))
def objects(file):
    key = '-'
    for prefix, event, value in ijson.parse(file):
        if prefix == '' and event == 'map_key':  # found new object at the root
            key = value  # mark the key value
            builder = ObjectBuilder()
        elif prefix.startswith(key):  # while at this key, build the object
            builder.event(event, value)
            if event == 'end_map':  # found the end of an object at the current key, yield
                value_dict = builder.value
                builder.value = {k: value_dict[k] for k in ['p', 'pct']}
                yield {key: builder.value}
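# Hedged usage sketch (added): objects() expects a root-level JSON map whose
# entries each carry 'p' and 'pct' fields, and yields one
# {key: {'p': ..., 'pct': ...}} dict per top-level entry. The filename is
# hypothetical.
def demo_objects():
    with open('stats.json', 'rb') as f:
        for obj in objects(f):
            print(obj)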
def bracket_objects(file):
    """Parse the saved bracket json to be able to create an html bracket."""
    key = '-'
    for prefix, event, value in ijson.parse(file):
        if prefix == '' and event == 'map_key':  # found new object at the root
            key = value  # mark the key value
            builder = ObjectBuilder()
        elif prefix.startswith(key):  # while at this key, build the object
            builder.event(event, value)
            if event == 'end_array':  # found the end of an array at the current key, yield
                yield {key: builder.value}
def gen_dylibs_count_data(json):
    dylibs = Counter()
    total = 0
    dylib = False
    for prefix, event, value in ijson.parse(json):
        if prefix.endswith('.filetype'):
            total += 1
        elif prefix.endswith('.imports.item.item'):
            # every second '.imports.item.item' value is taken as a dylib name
            if dylib:
                dylibs[value] += 1
                dylib = False
            else:
                dylib = True
    # convert raw counts to per-sample frequencies
    for d in dylibs:
        dylibs[d] = dylibs[d] * (1.0 / total)
    return dylibs
def gen_abnormalities_data(json):
    abnormalities = Counter()
    total = 0
    temp = []
    for prefix, event, value in ijson.parse(json):
        if prefix.endswith('.filetype'):
            total += 1
        elif prefix.endswith('.abnormalities'):
            del temp[:]
        elif prefix.endswith('.abnormalities.item.title'):
            if value not in temp:  # count each abnormality once per sample
                abnormalities[value] += 1
                temp.append(value)
    # convert raw counts to per-sample frequencies
    for a in abnormalities.keys():
        abnormalities[a] = abnormalities[a] * (1.0 / total)
    return abnormalities
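# Hedged note (added): the gen_* helpers above all share the same
# "count, then divide by the number of samples" pattern; a generic version
# might look like this sketch (the name is illustrative, not from the source).
def normalize_counts(counter, total):
    """Turn raw Counter counts into per-sample frequencies, in place."""
    for k in counter:
        counter[k] *= 1.0 / total
    return counter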
def serialize(env, args, db):
    ijson = import_ijson(env)

    path = args.path
    parser = ijson.parse(open(path))
    start = time.time()
    package = None
    nprocessed = 0
    out = csv.writer(sys.stdout)

    def log_stats():
        log("processed {} packages in {:3.0f} seconds"
            .format(nprocessed, time.time() - start))

    for prefix, event, value in parser:
        if not prefix and event == b'map_key':
            # Flush the current package. We count on the first package being garbage.
            processed = serialize_one(out, package)
            nprocessed += processed
            if processed and not (nprocessed % 1000):
                log_stats()

            # Start a new package.
            package = {
                'package_manager': b'npm',
                'name': value,
                'description': b'',
                'emails': []
            }

        if package is None:
            continue  # nothing to attribute before the first map_key

        key = lambda k: package['name'] + b'.' + k
        if event == b'string':
            # Who knew? Seems to decode only for `string`.
            assert type(value) is unicode
            value = value.encode('utf8')

        if prefix == key(b'description'):
            package['description'] = value
        elif prefix in (key(b'author.email'), key(b'maintainers.item.email')):
            package['emails'].append(value)

    nprocessed += serialize_one(out, package)  # Don't forget the last one!
    log_stats()
def gen_nimports_data(json):
    data = []
    execute = None
    temp = None
    for prefix, event, value in ijson.parse(json):
        if prefix.endswith('.macho') or prefix.endswith('.machos.item'):
            # a new binary starts; flush the previous one if it was EXECUTE
            if execute and temp is not None:
                data.append(temp)
            temp = None
            execute = None
        if prefix.endswith('.filetype'):
            if value == 'EXECUTE':
                execute = True
            else:
                execute = False
        if prefix.endswith('.nimps'):
            temp = value
    # flush the last binary
    if temp is not None and execute:
        data.append(temp)
    return data
def main():
    users_comments_dict = collections.defaultdict(list)

    with tqdm(desc="Grouping comments by user", total=12704751) as progress_bar:
        inside_comment = False
        comment_text = None
        comment_username = None
        with open(COMMENTS_DATASET_FILE_PATH, 'rb') as file_:
            # As the JSON file is large (2.5GB) and everything is on one line,
            # it is better to read it as a stream, using a SAX-like approach.
            for prefix, type_, value in ijson.parse(file_):
                if inside_comment:
                    if prefix.endswith('.text'):
                        comment_text = value
                    elif prefix.endswith('.author'):
                        comment_username = value
                    elif type_ == 'end_map':
                        # This assumes there are no nested maps inside the comment maps.
                        if comment_text and comment_username and comment_text != 'nan' \
                                and comment_username != '[deleted]':
                            users_comments_dict[comment_username].append(comment_text)
                        inside_comment = False
                        comment_text = None
                        comment_username = None
                        progress_bar.update()
                elif type_ == 'start_map' and prefix:
                    inside_comment = True

    with open(USER_COMMENTS_FILE_PATH, 'w') as output_file:
        writer = csv.writer(output_file, quoting=csv.QUOTE_ALL)
        writer.writerows(
            (user, " <END> ".join(comments_texts))
            for user, comments_texts in iteritems(users_comments_dict))
files = []
print("Actions:\n1. Process Logs\n2. Analyze Processed Logs")
action = input("Enter number of action to perform: ")
if action == "1":
    print("=" * 50 + "\nLog Files: ")
    i = 1
    for fn in os.listdir("."):
        if fn[-5:] == ".json" and fn[-len("_conversations.json"):] != "_conversations.json":
            files.append(fn)
            print(str(i) + ". " + fn)
            i += 1
    fn = files[int(input("Enter number of file to read logs from: ")) - 1]
    data = ijson.parse(open(fn, "rb"))
    conversations = []
    conversation = False
    message = False
    member = False
    for prefix, event, value in data:
        if prefix == "conversations.item.conversation.conversation_id.id":
            print("Processing chat...")
            if conversation:
                if len(conversation["members"]) == 2 and conversation["name"] is None:
                    # name two-person chats after the other participant
                    conversation["name"] = (lambda a, b: a[0]["name"] if len(a) > 0 else b)(
                        list(filter(lambda m: m["id"] != conversation["self"],
                                    conversation["members"])), "")
                conversations.append(conversation)
def port_snapshot(conf, keydb, silent=True):
    total_vests = 0
    total_steem = 0
    system_account_names = set(get_system_account_names(conf))

    if not silent and not YAJL2_CFFI_AVAILABLE:
        print("Warning: could not load yajl, falling back to default backend for ijson.")

    snapshot_file = open(conf["snapshot_file"], "rb")
    account_names = set()
    num_accounts = 0
    for acc in ijson.items(snapshot_file, "accounts.item"):
        if acc["name"] in system_account_names:
            continue
        account_names.add(acc["name"])
        total_vests += satoshis(acc["vesting_shares"])
        total_steem += satoshis(acc["balance"])
        num_accounts += 1
        if not silent:
            if num_accounts % 100000 == 0:
                print("Accounts read:", num_accounts)

    # We have a fixed amount of STEEM to give out, specified by total_port_balance.
    # This needs to be given out subject to the following constraints:
    # - The ratio of vesting : liquid STEEM is the same on testnet as on mainnet,
    # - Everyone's testnet balance is proportional to their mainnet balance,
    # - Everyone has at least min_vesting_per_account.
    snapshot_file.seek(0)
    for prefix, event, value in ijson.parse(snapshot_file):
        if prefix == "dynamic_global_properties.total_vesting_fund_steem.amount":
            total_vesting_steem = int(value)
            break

    denom = 10**12  # we need stupidly high precision because VESTS
    min_vesting_per_account = satoshis(conf["min_vesting_per_account"])
    total_port_balance = satoshis(conf["total_port_balance"])
    avail_port_balance = total_port_balance - min_vesting_per_account * num_accounts
    if avail_port_balance < 0:
        raise RuntimeError(
            "Increase total_port_balance or decrease min_vesting_per_account")
    total_port_vesting = (avail_port_balance * total_vesting_steem) // (
        total_steem + total_vesting_steem)
    total_port_liquid = (avail_port_balance * total_steem) // (
        total_steem + total_vesting_steem)
    vest_conversion_factor = (denom * total_port_vesting) // total_vests
    steem_conversion_factor = (denom * total_port_liquid) // total_steem

    if not silent:
        print("total_vests:", total_vests)
        print("total_steem:", total_steem)
        print("total_vesting_steem:", total_vesting_steem)
        print("total_port_balance:", total_port_balance)
        print("total_port_vesting:", total_port_vesting)
        print("total_port_liquid:", total_port_liquid)
        print("vest_conversion_factor:", vest_conversion_factor)
        print("steem_conversion_factor:", steem_conversion_factor)

    porter = conf["accounts"]["porter"]["name"]
    tnman = conf["accounts"]["manager"]["name"]

    yield {
        "operations": [{
            "type": "transfer_operation",
            "value": {
                "from": "initminer",
                "to": porter,
                "amount": conf["total_port_balance"],
                "memo": "Fund porting balances",
            }
        }],
        "wif_sigs": [keydb.get_privkey("initminer")]
    }

    porter_wif = keydb.get_privkey("porter")
    create_auth = {
        "account_auths": [["porter", 1]],
        "key_auths": [],
        "weight_threshold": 1
    }

    snapshot_file.seek(0)
    accounts_created = 0
    for a in ijson.items(snapshot_file, "accounts.item"):
        if a["name"] in system_account_names:
            continue
        vesting_amount = (satoshis(a["vesting_shares"]) * vest_conversion_factor) // denom
        transfer_amount = (satoshis(a["balance"]) * steem_conversion_factor) // denom
        name = a["name"]
        ops = [{
            "type": "account_create_operation",
            "value": {
                "fee": amount(max(vesting_amount, min_vesting_per_account)),
                "creator": porter,
                "new_account_name": name,
                "owner": create_auth,
                "active": create_auth,
                "posting": create_auth,
                "memo_key": "TST" + a["memo_key"][3:],
                "json_metadata": "",
            }
        }]
        if transfer_amount > 0:
            ops.append({
                "type": "transfer_operation",
                "value": {
                    "from": porter,
                    "to": name,
                    "amount": amount(transfer_amount),
                    "memo": "Ported balance",
                }
            })
        accounts_created += 1
        if not silent:
            if accounts_created % 100000 == 0:
                print("Accounts created:", accounts_created)
                print("\t", '%.2f%% complete' % (accounts_created / num_accounts * 100.0))
        yield {"operations": ops, "wif_sigs": [porter_wif]}

    if not silent:
        print("Accounts created:", accounts_created)
        print("\t100.00% complete")

    snapshot_file.seek(0)
    accounts_updated = 0
    for a in ijson.items(snapshot_file, "accounts.item"):
        if a["name"] in system_account_names:
            continue
        cur_owner_auth = a["owner"]
        new_owner_auth = cur_owner_auth.copy()
        cur_active_auth = a["active"]
        new_active_auth = cur_active_auth.copy()
        cur_posting_auth = a["posting"]
        new_posting_auth = cur_posting_auth.copy()

        # filter to only include existing accounts; iterate over snapshots of
        # the lists because the shallow copy above shares them
        for aw in list(cur_owner_auth["account_auths"]):
            if (aw[0] not in account_names) or (aw[0] in system_account_names):
                new_owner_auth["account_auths"].remove(aw)
        for aw in list(cur_active_auth["account_auths"]):
            if (aw[0] not in account_names) or (aw[0] in system_account_names):
                new_active_auth["account_auths"].remove(aw)
        for aw in list(cur_posting_auth["account_auths"]):
            if (aw[0] not in account_names) or (aw[0] in system_account_names):
                new_posting_auth["account_auths"].remove(aw)

        # add tnman to account_auths
        new_owner_auth["account_auths"].append([tnman, cur_owner_auth["weight_threshold"]])
        new_active_auth["account_auths"].append([tnman, cur_active_auth["weight_threshold"]])
        new_posting_auth["account_auths"].append([tnman, cur_posting_auth["weight_threshold"]])

        # substitute prefix for key_auths
        new_owner_auth["key_auths"] = [["TST" + k[3:], w]
                                       for k, w in new_owner_auth["key_auths"]]
        new_active_auth["key_auths"] = [["TST" + k[3:], w]
                                        for k, w in new_active_auth["key_auths"]]
        new_posting_auth["key_auths"] = [["TST" + k[3:], w]
                                         for k, w in new_posting_auth["key_auths"]]

        ops = [{
            "type": "account_update_operation",
            "value": {
                "account": a["name"],
                "owner": new_owner_auth,
                "active": new_active_auth,
                "posting": new_posting_auth,
                "memo_key": "TST" + a["memo_key"][3:],
                "json_metadata": a["json_metadata"],
            }
        }]

        accounts_updated += 1
        if not silent:
            if accounts_updated % 100000 == 0:
                print("Accounts updated:", accounts_updated)
                print("\t", '%.2f%% complete' % (accounts_updated / num_accounts * 100.0))
        yield {"operations": ops, "wif_sigs": [porter_wif]}

    if not silent:
        print("Accounts updated:", accounts_updated)
        print("\t100.00% complete")

    snapshot_file.close()
    return
# edgeUUID = set()
if input_format == 'avro':
    raise NotImplementedError('CDM avro format is not supported as of 01-04-09.')
elif input_format == 'json':
    # all the dataset files needed to be parsed together
    files = os.listdir(input_source)

    # Start processing CDM records
    for data_file in files:
        with tf.open(os.path.join(input_source, data_file), 'r:gz') as f:
            names = f.getnames()
            sorted_files = sorted(
                names,
                key=lambda item: (int(item.split('.')[-1])
                                  if item[-1].isdigit() else int(0), item))
            for sorted_file in sorted_files:
                file_obj = f.extractfile(f.getmember(sorted_file))
                parser = ijson.common.items(
                    ijson.parse(file_obj, multiple_values=True), '')
                for cdm_record in parser:
                    if input_format == 'avro':
                        raise ValueError('This is a streaming JSON parser implementation.')
                    elif input_format == 'json':
                        # cdm_record = json.loads(line.strip())
                        cdm_record_type = cdm_record['datum'].keys()[0]
                        cdm_record_value = cdm_record['datum'][cdm_record_type]

                    if cdm_record_type == CDM_TYPE_SRCSINK:
                        uuid = cdm_record_value['uuid']
                        values = process_cdm_srcsink(cdm_record_value, input_format, next_id)

                        if uuid in nodes:
                            logging.debug('CDM_TYPE_SRCSINK: UUID is not unique. UUID: ' + repr(uuid))

                        nodes[uuid] = values
def build_actions(conf, silent=True):
    keydb = prockey.ProceduralKeyDatabase()
    account_stats_start = datetime.datetime.utcnow()
    account_stats = get_account_stats(conf, silent)
    account_stats_elapsed = datetime.datetime.utcnow() - account_stats_start
    account_names = account_stats["account_names"]
    num_accounts = len(account_names)
    transactions_per_block = conf["transactions_per_block"]
    crea_block_interval = conf.get("crea_block_interval", CREA_BLOCK_INTERVAL)
    transaction_witness_setup_pad = conf.get("transaction_witness_setup_pad",
                                             TRANSACTION_WITNESS_SETUP_PAD)
    genesis_time = datetime.datetime.utcfromtimestamp(CREA_GENESIS_TIMESTAMP)

    # Three transactions per account (create, transfer_to_vesting, and update).
    predicted_transaction_count = num_accounts * 3
    # The predicted number of blocks for accounts.
    predicted_block_count = predicted_transaction_count // transactions_per_block
    # The number of seconds required to set up transactions is a multiple of
    # the initial time it takes to do the get_account_stats() call.
    predicted_transaction_setup_seconds = (account_stats_elapsed.seconds * 2)
    # Pad for update witnesses, vote witnesses, clear rounds, and transaction
    # setup processing time.
    predicted_block_count += transaction_witness_setup_pad + (
        predicted_transaction_setup_seconds // crea_block_interval)

    now = datetime.datetime.utcnow()
    start_time = now - datetime.timedelta(
        seconds=predicted_block_count * crea_block_interval)
    miss_blocks = int((start_time - genesis_time).total_seconds()) // crea_block_interval
    miss_blocks = max(miss_blocks - 1, 0)

    origin_api = None
    snapshot_head_block_num = None
    snapshot_semver = None
    has_backfill = False
    metadata = {
        "txgen:semver": __version__,
        "txgen:transactions_per_block": transactions_per_block,
        "epoch:created": str(now),
        "actions:count": predicted_transaction_count,
        "recommend:miss_blocks": miss_blocks
    }

    with open(conf["snapshot_file"], "rb") as f:
        for prefix, event, value in ijson.parse(f):
            if prefix == "metadata.snapshot:origin_api":
                metadata["snapshot:origin_api"] = value
            if prefix == "metadata.snapshot:semver":
                metadata["snapshot:semver"] = value
            if prefix == "dynamic_global_properties.head_block_number":
                metadata["snapshot:head_block_num"] = value
            if not prefix == '' and not prefix.startswith("metadata") \
                    and not prefix.startswith("dynamic_global_properties"):
                break

    semver = metadata.get("snapshot:semver", '0.0')
    major_version, minor_version = semver.split('.')
    major_version = int(major_version)
    minor_version = int(minor_version)
    backfill_file = conf.get("backfill_file", None)

    if major_version == SNAPSHOT_MAJOR_VERSION_SUPPORTED:
        if not silent:
            print("metadata:", metadata)
    else:
        raise RuntimeError("Unsupported snapshot:", metadata)

    if minor_version < SNAPSHOT_MINOR_VERSION_SUPPORTED:
        print("WARNING: Older snapshot encountered.", file=sys.stderr)

    if backfill_file and os.path.exists(backfill_file) and os.path.isfile(backfill_file):
        with open(backfill_file, "r") as f:
            num_lines = sum(1 for line in f)
        if num_lines > 0:
            metadata["backfill_actions:count"] = num_lines
            metadata["actions:count"] += num_lines
            miss_blocks -= max(num_lines // transactions_per_block,
                               CREA_BLOCKS_PER_DAY * 30)
            metadata["recommend:miss_blocks"] = miss_blocks
            has_backfill = True

    yield ["metadata", metadata]
    yield ["wait_blocks", {"count": 1, "miss_blocks": miss_blocks}]
    yield ["submit_transaction", {"tx": build_initminer_tx(conf, keydb)}]

    for b in util.batch(
            build_setup_transactions(account_stats, conf, keydb, silent),
            transactions_per_block):
        for tx in b:
            yield ["submit_transaction", {"tx": tx}]
{"tx": tx}] if has_backfill: with open(backfill_file, "r") as f: for line in f: yield json.loads(line) yield ["metadata", {"post_backfill": True}] for tx in update_witnesses(conf, keydb, "init"): yield ["submit_transaction", {"tx": tx}] for tx in vote_accounts(conf, keydb, "elector", "init"): yield ["submit_transaction", {"tx": tx}] yield [ "wait_blocks", { "count": conf.get("num_blocks_to_clear_witness_round", NUM_BLOCKS_TO_CLEAR_WITNESS_ROUND) } ] return
#! /usr/bin/env python2.7
import sys
import ijson.backends.yajl2_cffi as ijson

p_map = {}
p_count = 0
e_count = 0

with open(sys.argv[1]) as jfile:
    parser = ijson.common.items(ijson.parse(jfile, multiple_values=True), '')
    for evt in parser:
        if evt['subjprocuuid'] not in p_map:
            p_map[evt['subjprocuuid']] = True
            p_count += 1
        if evt['event'] == "audit:event:aue_execve:":
            if p_map[evt['subjprocuuid']]:
                p_map[evt['subjprocuuid']] = False
            else:
                p_count += 1
        elif evt['event'] in ["audit:event:aue_fork:", "audit:event:aue_vfork:"]:
            if evt['ret_objuuid1'] not in p_map:
                p_map[evt['ret_objuuid1']] = True
                p_count += 1
        e_count += 1

print("{} Events Processed".format(e_count))
print("{} Process Nodes Observed".format(p_count))
print("{} Unique UUIDs Observed".format(len(p_map)))
def Iterate(self, inpath, outpath=None, rfrom=1, rto=0):
    if self._backend == 'yajl2_cffi':
        import ijson.backends.yajl2_cffi as ijson
    elif self._backend == 'yajl2':
        import ijson.backends.yajl2 as ijson
    else:
        import ijson
    _recno = 1
    _lp_rec = 0
    if self._mode in [1, 2]:
        _unique = set()
    elif self._mode == 3:
        _unique = None
    else:
        Log.error('Invalid value of key mode (=%d); allowed values [1,2,3]' % self._mode)
        return
    if self._lp_step > 0 and Log.isEnabledFor(logging.INFO):
        _lp_rec = self._lp_step
    try:
        header = [{'_tag_': 'ijson_events', '_bra_': True}]
        if self._flt is not None:
            if hasattr(self._flt, 'setHeader'):
                self._flt.setHeader(header)
        self._wri.writeHeader(header)
        with open(inpath, 'r') as fd:
            parser = ijson.parse(fd)
            for prefix, event, value in parser:
                if rto > 0 and _recno > rto:
                    raise ToLimitBreak
                if prefix in ['item', ''] and event not in ['start_array', 'start_map', 'map_key']:
                    _recno = _recno + 1
                    if _recno == _lp_rec:
                        Log.info('Processed %d records' % _recno)
                        _lp_rec = _lp_rec + self._lp_step
                if _recno < rfrom:
                    continue  # skip records before the requested range
                if self._mode == 1:
                    if prefix in _unique:
                        continue
                    _unique.add(prefix)
                    rec = etree.Element(self._rec_tag)
                    p_xml = etree.SubElement(rec, 'prefix')
                    p_xml.text = str(prefix)
                elif self._mode == 2:
                    if (prefix, event) in _unique:
                        continue
                    _unique.add((prefix, event))
                    rec = etree.Element(self._rec_tag)
                    p_xml = etree.SubElement(rec, 'prefix')
                    p_xml.text = str(prefix)
                    e_xml = etree.SubElement(rec, 'event')
                    e_xml.text = str(event)
                elif self._mode == 3:
                    rec = etree.Element(self._rec_tag)
                    p_xml = etree.SubElement(rec, 'prefix')
                    p_xml.text = str(prefix)
                    e_xml = etree.SubElement(rec, 'event')
                    e_xml.text = str(event)
                    v_xml = etree.SubElement(rec, 'value')
                    v_xml.text = str(value)
                if self._flt is not None:
                    while True:  # OBLIGATORY
                        res = self._flt.filterRecord(rec)
                        if res & WRITE:
                            yield self._wri.writeRecord(rec)
                        if res & REPEAT:
                            continue
                        if res & BREAK:
                            Log.info('Filter caused Process to stop on record %d' % _recno)
                            raise FilterBreak
                        break
                else:  # OBLIGATORY
                    yield self._wri.writeRecord(rec)
    except FilterBreak:
        pass
    except ToLimitBreak:
        pass
    finally:  # OBLIGATORY
        footer = []
        if self._flt is not None:
            if hasattr(self._flt, 'setFooter'):
                self._flt.setFooter(footer)
        self._wri.writeFooter(footer)
def Process(self, inpath, outpath=None, rfrom=1, rto=0):
    """Parameters are usually passed from the YAML file as subkeys of the
    ``Reader:PArg`` key.

    :param inpath: Path to input file.
    :param outpath: Path to output file passed to Writer (fall-back if output connector is not defined).
    :param rfrom-rto: specifies the scope of records to be processed.

    For more detailed descriptions see :ref:`readers_conf_template`.
    """
    if self._backend == 'yajl2_cffi':
        import ijson.backends.yajl2_cffi as ijson
    elif self._backend == 'yajl2':
        import ijson.backends.yajl2 as ijson
    else:
        import ijson
    _recno = 1
    _lp_rec = 0
    if self._mode in [1, 2]:
        _unique = set()
    elif self._mode == 3:
        _unique = None
    else:
        Log.error('Invalid value of key mode (=%d); allowed values [1,2,3]' % self._mode)
        return
    if self._lp_step > 0 and Log.isEnabledFor(logging.INFO):
        _lp_rec = self._lp_step
    try:
        header = [{'_tag_': 'ijson_events', '_bra_': True}]
        if self._flt is not None:
            if hasattr(self._flt, 'setHeader'):
                self._flt.setHeader(header)
        self._wri.writeHeader(header)
        with open(inpath, 'r') as fd:
            parser = ijson.parse(fd)
            for prefix, event, value in parser:
                if rto > 0 and _recno > rto:
                    raise ToLimitBreak
                if prefix in ['item', ''] and event not in ['start_array', 'start_map', 'map_key']:
                    _recno = _recno + 1
                    if _recno == _lp_rec:
                        Log.info('Processed %d records' % _recno)
                        _lp_rec = _lp_rec + self._lp_step
                if _recno < rfrom:
                    continue  # skip records before the requested range
                if self._mode == 1:
                    if prefix in _unique:
                        continue
                    _unique.add(prefix)
                    rec = etree.Element(self._rec_tag)
                    p_xml = etree.SubElement(rec, 'prefix')
                    p_xml.text = str(prefix)
                elif self._mode == 2:
                    if (prefix, event) in _unique:
                        continue
                    _unique.add((prefix, event))
                    rec = etree.Element(self._rec_tag)
                    p_xml = etree.SubElement(rec, 'prefix')
                    p_xml.text = str(prefix)
                    e_xml = etree.SubElement(rec, 'event')
                    e_xml.text = str(event)
                elif self._mode == 3:
                    rec = etree.Element(self._rec_tag)
                    p_xml = etree.SubElement(rec, 'prefix')
                    p_xml.text = str(prefix)
                    e_xml = etree.SubElement(rec, 'event')
                    e_xml.text = str(event)
                    v_xml = etree.SubElement(rec, 'value')
                    v_xml.text = str(value)
                if self._flt is not None:
                    while True:  # OBLIGATORY
                        res = self._flt.filterRecord(rec)
                        if res & WRITE:
                            self._wri.writeRecord(rec)
                        if res & REPEAT:
                            continue
                        if res & BREAK:
                            Log.info('Filter caused Process to stop on record %d' % _recno)
                            raise FilterBreak
                        break
                else:  # OBLIGATORY
                    self._wri.writeRecord(rec)
    except FilterBreak:
        pass
    except ToLimitBreak:
        pass
    finally:  # OBLIGATORY
        footer = []
        if self._flt is not None:
            if hasattr(self._flt, 'setFooter'):
                self._flt.setFooter(footer)
        self._wri.writeFooter(footer)
def cprocess(fileobj, ds, fn, out=None):
    """Iteratively process/scan a file object.

    Arguments:
    fileobj - file object
    ds - a database (for node parsing) or a sanitylog (for scanning)
    fn - file name
    """
    parser = ijson.common.items(ijson.parse(fileobj, multiple_values=True), '')
    if args.trace == 'camflow':
        if args.scan:
            if args.verbose:
                print("\x1b[6;30;42m[+]\x1b[0m scanning file {} in CAMFLOW mode...".format(fn))
            ptc.sanitycheckcf(parser, ds)
        elif out is None:
            if args.verbose:
                print("\x1b[6;30;42m[+]\x1b[0m parsing compressed file {} in CAMFLOW mode...".format(fn))
            ptj.parsecf(parser, ds, fn)
        else:
            if args.verbose:
                print("\x1b[6;30;42m[+]\x1b[0m generating output for file {} in CAMFLOW mode...".format(fn))
                print("\x1b[6;30;43m[i]\x1b[0m initiating logging. Check error.log afterwards...")
            ptj.cgencf(parser, ds, out)
    elif args.trace == 'darpa':
        if args.scan:
            if args.verbose:
                print("\x1b[6;30;42m[+]\x1b[0m scanning file {} in DARPA mode...".format(fn))
            ptc.sanitycheckdp(parser, ds)
        elif out is None:
            if args.verbose:
                print("\x1b[6;30;42m[+]\x1b[0m parsing compressed file {} in DARPA mode...".format(fn))
            ptj.parsedp(parser, ds, fn)
        else:
            if args.verbose:
                print("\x1b[6;30;42m[+]\x1b[0m generating output for file {} in DARPA mode...".format(fn))
                print("\x1b[6;30;43m[i]\x1b[0m initiating logging. Check error.log afterwards...")
            ptj.cgendp(parser, ds, out)
    elif args.trace == 'cadets2' or args.trace == 'fivedirections':
        if args.scan:
            if args.verbose:
                print("\x1b[6;30;42m[+]\x1b[0m scanning file {} in CADETS2/FIVEDIRECTIONS mode...".format(fn))
            ptc.sanitycheckcd(parser, ds)
        elif out is None:
            raise NotImplementedError(
                "no support for processing {} compact files at the moment".format(args.trace))
        else:
            if args.verbose:
                print("\x1b[6;30;42m[+]\x1b[0m generating output for file {} in CADETS2/FIVEDIRECTIONS mode...".format(fn))
                print("\x1b[6;30;43m[i]\x1b[0m initiating logging. Check error.log afterwards...")
            ptj.cgencd(parser, ds, out)
    else:
        raise NotImplementedError("cannot run traces from an unknown system")
    fileobj.close()
    return
state = ParserState.START
dirs = []
key = None
obj = {}

argp = argparse.ArgumentParser()
argp.add_argument("file", type=argparse.FileType("rb"),
                  help="ncdu export filename")
argp.add_argument("--dirs", choices=["array", "string"], default="string",
                  help="directory name format output to flat file")
argp.add_argument("--verbose", action="store_true",
                  help="enable verbose mode (inc. ijson variant)")
options = argp.parse_args()

if options.verbose:
    sys.stderr.write("ijson module variant: {}\n".format(ijson.__name__))

parser = ijson.parse(options.file)
for prefix, event, value in parser:
    if event == "start_array":
        if state != ParserState.START:
            # started a non-header array (directory listing)
            state = ParserState.ARRAY_START
        else:
            # started the header, omit this map
            state = ParserState.HEADER
    elif event == "end_array":
        # an array represents a (sub)directory, so closing one means we were
        # at least past the second entry (the first entry is the directory's
        # meta-data)
        state = ParserState.SUBSEQ_MAP
        if dirs:
            dirs.pop()
    elif state == ParserState.ARRAY_START and event == "start_map":
def main(argv):
    parser = argparse.ArgumentParser(
        prog=argv[0], description="Generate transactions for Steem testnet")
    parser.add_argument("-i", "--infile", default="", dest="infile",
                        metavar="FILE", help="Specify input snapshot, - means stdin")
    parser.add_argument("-o", "--outfile", default="-", dest="outfile",
                        metavar="FILE", help="Specify output snapshot, - means stdout")
    args = parser.parse_args(argv[1:])
    sample_size = 2000

    if args.infile == "-":
        # We do not have random access, so we must load the whole thing in
        # memory. And we cannot output messages.
        infile = sys.stdin
        snapshot = json.load(infile, object_pairs_hook=collections.OrderedDict)
        snapshot["witnesses"] = []
        snapshot["accounts"] = heapq.nlargest(
            sample_size, snapshot["accounts"],
            key=lambda a: int(a["balance"]["amount"]))
    else:
        # We have random access!
        try:
            import ijson.backends.yajl2_cffi as ijson
            from cffi import FFI
            YAJL2_CFFI_AVAILABLE = True
        except ImportError:
            import ijson
            YAJL2_CFFI_AVAILABLE = False
        if not YAJL2_CFFI_AVAILABLE:
            print("Warning: could not load yajl, falling back to default backend for ijson.")

        infile = open(args.infile, "rb")
        account_balances = {}
        snapshot = {
            "dynamic_global_properties": {"total_vesting_fund_steem": {}},
            "accounts": [],
            "witnesses": []
        }
        fund = snapshot["dynamic_global_properties"]["total_vesting_fund_steem"]
        for prefix, event, value in ijson.parse(infile):
            if prefix == "dynamic_global_properties.total_vesting_fund_steem.amount":
                fund["amount"] = value
            elif prefix == "dynamic_global_properties.total_vesting_fund_steem.precision":
                fund["precision"] = value
            elif prefix == "dynamic_global_properties.total_vesting_fund_steem.nai":
                fund["nai"] = value
            if len(fund.keys()) > 2:
                break
        print("Captured:", snapshot["dynamic_global_properties"])

        infile.seek(0)
        for a in ijson.items(infile, "accounts.item"):
            account_balances[a["name"]] = a["balance"]["amount"]
            if len(account_balances) % 100000 == 0:
                print("Balances so far:", len(account_balances))
        top_accounts = heapq.nlargest(sample_size, account_balances,
                                      key=lambda a: int(account_balances[a]))
        print('Found top accounts:', len(top_accounts))

        infile.seek(0)
        for a in ijson.items(infile, "accounts.item"):
            t = len(top_accounts)
            s = len(snapshot["accounts"])
            if s >= t:
                break
            if a["name"] in top_accounts:
                snapshot["accounts"].append(a)
                if s > 0 and s % 100 == 0:
                    print("Samples created:", s)
                    print("\t", '%.2f%% complete' % (s / t * 100.0))
        infile.close()

    if args.outfile == "-":
        outfile = sys.stdout
    else:
        print("Dumping sample ...")
        outfile = open(args.outfile, "w")
    json.dump(snapshot, outfile, separators=(",", ":"))
    if args.outfile != "-":
        outfile.close()
    return
def extract(self, filepath):
    vidjilfile = open(filepath, 'rb')
    parser = ijson.parse(vidjilfile)
    with self.writer() as writer:
        return self._extract(parser, writer)
def initModel(self, model_path):
    with open(model_path, 'rb') as model:
        parser = ijson.parse(model)
        for prefix, event, value in parser:
            if (prefix, event) not in self._model_prefixes:
                self._model_prefixes.append((prefix, event))
def build_dataframe(good, bad):
    df = pd.DataFrame(columns=features)
    current = 0
    row = {}
    symbol = False
    cmd = None
    segment_name = None
    section_name = None

    print('Parsing good json')
    for prefix, event, value in ijson.parse(good):
        if prefix.endswith('.macho') or prefix.endswith('.machos.item'):
            # fires at each binary's start and end; flush the populated row
            if len(row) > 0:
                row['alignment'] = 'good'
                df.loc[current] = build_row(row)
                current += 1
                row = {}
        elif prefix.endswith('.macho.size') or prefix.endswith('.machos.item.size'):
            row['m_size'] = value / 1024.0
        elif prefix.endswith('.strtab.size'):
            row['s_size'] = value / 1024.0
        elif prefix.endswith('.slcs'):
            row['slcs'] = value / 1024.0
        elif prefix.endswith('.signature.size'):
            row['sig_size'] = value / 1024.0
        elif prefix.endswith('.symtab.nsyms'):
            row['nsyms'] = value
        elif prefix.endswith('.nlcs'):
            row['nlcs'] = value
        elif prefix.endswith('.ndylibs'):
            row['ndylibs'] = value
        elif prefix.endswith('.nimps'):
            row['nimports'] = value
        elif prefix.endswith('.entropy'):
            row['entropy'] = value
        elif prefix.endswith('.strtab.strings'):
            row['nstrings'] = 0
        elif prefix.endswith('.strtab.strings.item'):
            row['nstrings'] += 1
        elif prefix.endswith('.macho.flags.item') or prefix.endswith('.machos.item.flags.item'):
            row[value] = 10
        elif prefix.endswith('.filetype'):
            row[value] = 10
        elif prefix.endswith('.lcs.item.cmd'):
            if value in ('SEGMENT', 'SEGMENT_64'):
                if segment_name is None:
                    cmd = value
                else:
                    lc = value + ' (' + segment_name + ')'
                    if lc in load_commands:
                        row[lc] = 10
                    segment_name = None
            else:
                if value in load_commands:
                    row[value] = 10
        elif prefix.endswith('.lcs.item.name'):
            if cmd is None:
                segment_name = value
            else:
                lc = cmd + ' (' + value + ')'
                if lc in load_commands:
                    row[lc] = 10
                cmd = None
        elif prefix.endswith('.sects.item.segname'):
            if section_name is None:
                segment_name = value
            else:
                s = value + ', ' + section_name
                if s in sections:
                    row[s] = 10
                section_name = None
        elif prefix.endswith('.sects.item.name'):
            if segment_name is None:
                section_name = value
            else:
                s = segment_name + ', ' + value
                if s in sections:
                    row[s] = 10
                segment_name = None
        elif prefix.endswith('.imports.item'):
            symbol = True
        elif prefix.endswith('.imports.item.item') and symbol:
            if value in imports:
                row[value] = 10
            symbol = False
        elif prefix.endswith('.imports.item.item') and not symbol:
            if value in dylib_counts:
                if value in row:
                    row[value] += 1
                else:
                    row[value] = 1

    print('Parsing bad json')
    for prefix, event, value in ijson.parse(bad):
        if prefix.endswith('.macho') or prefix.endswith('.machos.item'):
            if len(row) > 0:
                row['alignment'] = 'bad'
                df.loc[current] = build_row(row)
                current += 1
                row = {}
        elif prefix.endswith('.macho.size') or prefix.endswith('.machos.item.size'):
            row['m_size'] = value / 1024.0
        elif prefix.endswith('.strtab.size'):
            row['s_size'] = value / 1024.0
        elif prefix.endswith('.slcs'):
            row['slcs'] = value / 1024.0
        elif prefix.endswith('.signature.size'):
            row['sig_size'] = value / 1024.0
        elif prefix.endswith('.symtab.nsyms'):
            row['nsyms'] = value
        elif prefix.endswith('.nlcs'):
            row['nlcs'] = value
        elif prefix.endswith('.ndylibs'):
            row['ndylibs'] = value
        elif prefix.endswith('.nimps'):
            row['nimports'] = value
        elif prefix.endswith('.entropy'):
            row['entropy'] = value
        elif prefix.endswith('.strtab.strings'):
            row['nstrings'] = 0
        elif prefix.endswith('.strtab.strings.item'):
            row['nstrings'] += 1
        elif prefix.endswith('.macho.flags.item') or prefix.endswith('.machos.item.flags.item'):
            row[value] = 10
        elif prefix.endswith('.filetype'):
            row[value] = 10
        elif prefix.endswith('.lcs.item.cmd'):
            if value in ('SEGMENT', 'SEGMENT_64'):
                if segment_name is None:
                    cmd = value
                else:
                    lc = value + ' (' + segment_name + ')'
                    if lc in load_commands:
                        row[lc] = 10
                    segment_name = None
            else:
                if value in load_commands:
                    row[value] = 10
        elif prefix.endswith('.lcs.item.name'):
            if cmd is None:
                segment_name = value
            else:
                lc = cmd + ' (' + value + ')'
                if lc in load_commands:
                    row[lc] = 10
                cmd = None
        elif prefix.endswith('.sects.item.segname'):
            if section_name is None:
                segment_name = value
            else:
                s = value + ', ' + section_name
                if s in sections:
                    row[s] = 10
                section_name = None
        elif prefix.endswith('.sects.item.name'):
            if segment_name is None:
                section_name = value
            else:
                s = segment_name + ', ' + value
                if s in sections:
                    row[s] = 10
                segment_name = None
        elif prefix.endswith('.imports.item'):
            symbol = True
        elif prefix.endswith('.imports.item.item') and symbol:
            if value in imports:
                row[value] = 10
            symbol = False
        elif prefix.endswith('.imports.item.item') and not symbol:
            if value in dylib_counts:
                if value in row:
                    row[value] += 1
                else:
                    row[value] = 1

    return df