def iter_json_dict(path: str, json_path: Tuple[Union[str, None], ...], **kwargs: Any) -> Iterable[Tuple[str, Dict[str, Any]]]:
    with open(path, 'rb') as jsonfile:
        yield from JsonSlicer(jsonfile, json_path, path_mode='map_keys', **kwargs)
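# A minimal usage sketch for the helper above, assuming a hypothetical
# 'packages.json' shaped like {"packages": {"foo": {"version": "1.0"}}};
# with path_mode='map_keys' each item arrives as a (key, value) pair,
# which matches the helper's declared return type.
for name, pkgdata in iter_json_dict('packages.json', ('packages', None)):
    print(name, pkgdata.get('version'))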
def iter_parse(
    self, path: str, factory: PackageFactory, transformer: PackageTransformer
) -> Generator[PackageMaker, None, None]:
    result: Dict[str, PackageMaker] = {}

    # note that we actually parse database prepared by
    # fetcher, not the file we've downloaded
    with open(path, 'rb') as jsonfile:
        for entry in JsonSlicer(jsonfile, ('releases', None)):
            pkg = factory.begin()

            pkg.set_name(entry['name'])
            pkg.set_version(entry['version'])

            if not pkg.check_sanity(verbose=False):
                continue

            pkg.add_homepages(entry.get('homepage'))
            pkg.set_summary(entry.get('summary'))
            if not pkg.comment:
                pkg.set_summary(entry.get('description'))  # multiline

            #pkg.add_maintainers(entry.get('submitter') + '@freshcode')  # unfiltered garbage
            #pkg.add_downloads(entry.get('download'))  # ignore for now, may contain download page urls instead of file urls
            pkg.add_licenses(entry.get('license'))

            # take latest known versions
            if pkg.name not in result or version_compare(pkg.version, result[pkg.name].version) > 0:
                result[pkg.name] = pkg

    yield from result.values()
def score(self, key, all_questions_scores):
    print(key)
    pivots = self.questions_dict.get(key)
    question_score = {}
    p = np.asarray(pivots)
    tree = KDTree(p, leaf_size=40, metric='euclidean')
    flag = None
    value = 0
    n = 0
    for s in JsonSlicer(self.f, (None, None), path_mode='full'):
        print(len(s[2]))
        if flag is None or s[0] != flag:
            if flag is None:
                flag = s[0]
                z = np.asarray(s[2])
                dist, _ = tree.query([z], k=1)
                value += dist.item(0, 0)
                n += 1
            else:
                # question key changed: store the running average and restart
                question_score[flag] = value / n
                flag = s[0]
                value = 0
                n = 0
                z = np.asarray(s[2])  # use the current entry's vector
                dist, _ = tree.query([z], k=1)
                value += dist.item(0, 0)
                n += 1
        else:
            z = np.asarray(s[2])
            dist, _ = tree.query([z], k=1)
            value += dist.item(0, 0)
            n += 1
    question_score[flag] = value / n
    all_questions_scores[key] = question_score
def run_js(data, path=(), **kwargs):
    if isinstance(data, bytes):
        dataio = io.BytesIO(data)
    else:
        dataio = io.StringIO(data)
    slicer = JsonSlicer(dataio, path, **kwargs)
    return list(slicer)
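# A hedged sketch of how the test helper above might be called; the inline
# JSON document and the result noted in the comment are illustrative
# assumptions rather than cases from an actual test suite.
items = run_js('{"level1": [{"id": 0}, {"id": 1}]}', ('level1', None))
# items == [{'id': 0}, {'id': 1}]: one element per match of the path pattern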
def _iter_packages(path):
    with open(path, 'rb') as jsonfile:
        for summary_key, fmri, _, pkgdata in JsonSlicer(jsonfile, (None, None, None), path_mode='full'):
            if summary_key.startswith('_'):  # e.g. _SIGNATURE
                continue
            # else summary_key is something like "openindiana.org"
            # or "hipster-encumbered"
            yield fmri, pkgdata
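# A small self-contained sketch (made-up data, not the real package catalog)
# of why the loop above unpacks a 4-tuple: with path_mode='full', JsonSlicer
# yields the path elements leading to the match followed by the value itself.
import io
from jsonslicer import JsonSlicer

doc = b'{"openindiana.org": {"pkg:/web/browser": {"0": {"name": "browser"}}}}'
for entry in JsonSlicer(io.BytesIO(doc), (None, None, None), path_mode='full'):
    print(entry)  # ('openindiana.org', 'pkg:/web/browser', '0', {'name': 'browser'})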
def iter_parse(self, path, factory, transformer):
    with open(path, 'rb') as jsonfile:
        for package in JsonSlicer(jsonfile, (None, )):
            pkg = factory.begin()

            pkg.set_name(package['name'].split('@', 1)[0])
            pkg.set_version(package['versions']['stable'])
            pkg.set_summary(package['desc'])
            pkg.add_homepages(package['homepage'])

            yield pkg
def iter_parse(self, path, factory, transformer):
    with open(path, 'rb') as jsonfile:
        for item in JsonSlicer(jsonfile, ('packages', None)):
            with factory.begin() as pkg:
                pkg.set_name(item['name'])
                pkg.set_version(item['ver'])
                pkg.set_summary(item['descs'])
                pkg.set_extra_field('location', item['loc'])

                yield pkg
def iter_parse(
    self, path: str, factory: PackageFactory, transformer: PackageTransformer
) -> Generator[PackageMaker, None, None]:
    normalize_version = VersionStripper().strip_right('+')

    with subprocess.Popen([config['TCLSH'], self.helperpath, path], stdout=subprocess.PIPE) as macportsjson:
        for pkgdata in JsonSlicer(macportsjson.stdout, (None, )):
            pkg = factory.begin()

            pkg.set_name(pkgdata['name'])
            pkg.set_version(pkgdata['version'], normalize_version)

            # drop obsolete ports (see #235)
            if 'replaced_by' in pkgdata:
                continue

            pkg.set_summary(pkgdata.get('description'))
            pkg.add_homepages(pkgdata.get('homepage'))
            pkg.add_categories(pkgdata.get('categories', '').split())
            pkg.add_licenses(pkgdata.get('license'))  # XXX: properly handle braces

            if 'maintainers' in pkgdata:
                for maintainer in pkgdata['maintainers'].replace('{', '').replace('}', '').lower().split():
                    if maintainer.startswith('@'):
                        # @foo means github user foo
                        pkg.add_maintainers(maintainer[1:] + '@github')
                    elif '@' in maintainer:
                        # plain email
                        pkg.add_maintainers(maintainer)
                    elif ':' in maintainer:
                        # foo.com:bar means bar@foo.com
                        # ignore, since it's considered a form of email obfuscation
                        pass
                    elif maintainer == 'openmaintainer':
                        # ignore, this is a flag that minor changes to a port
                        # are allowed without involving the maintainer
                        pass
                    else:
                        # otherwise it's maintainer@macports.org
                        pkg.add_maintainers(maintainer + '@macports.org')

            pkg.set_extra_field('portdir', pkgdata['portdir'])
            pkg.set_extra_field('portname', pkgdata['portdir'].split('/')[1])

            yield pkg
def iter_parse(
    self, path: str, factory: PackageFactory, transformer: PackageTransformer
) -> Generator[PackageMaker, None, None]:
    with open(path, 'rb') as jsonfile:
        for item in JsonSlicer(jsonfile, ('packages', None)):
            with factory.begin() as pkg:
                pkg.set_name(item['name'])
                pkg.set_version(item['ver'])
                pkg.set_summary(item['descs'])
                pkg.set_arch(item['arch'])
                pkg.set_extra_field('location', item['loc'])

                yield pkg
def iter_parse(self, path, factory, transformer):
    with open(path, 'rb') as jsonfile:
        for packagedata in JsonSlicer(jsonfile, ('ravenports', None)):
            pkg = factory.begin()

            pkg.set_name(packagedata['namebase'])
            pkg.set_version(packagedata['version'])
            pkg.add_categories(packagedata['keywords'])
            pkg.add_homepages(packagedata.get('homepage'))
            pkg.add_downloads(packagedata['distfile'])
            pkg.set_summary(packagedata['variants'][0]['sdesc'])
            pkg.add_maintainers(map(lambda contact: contact.get('email'), packagedata.get('contacts', [])))
            pkg.set_extra_field('bucket', packagedata['bucket'])
            pkg.set_extra_field('variant', packagedata['variants'][0]['label'])

            yield pkg
def iter_parse(self, path, factory, transformer):
    with open(path, 'rb') as jsonfile:
        for item in JsonSlicer(jsonfile, ('items', None)):
            pkg = factory.begin()

            pkg.set_basename(item['meta'])
            pkg.set_version(item['ver'])
            pkg.add_maintainers(item['maintainer'])
            pkg.add_licenses(item['license'])
            pkg.add_homepages(item['home'])
            pkg.add_downloads(item.get('src'))

            if pkg.version == 'latest':
                pkg.set_flags(PackageFlags.rolling)

            for subitem in item['pkgs']:
                subpkg = pkg.clone()

                subpkg.add_categories(subitem['cat'])
                subpkg.set_summary(subitem['desc'])
                subpkg.set_name(subitem['name'])
                subpkg.set_version(subitem.get('ver'))

                yield subpkg
def iter_parse(
    self, path: str, factory: PackageFactory, transformer: PackageTransformer
) -> Generator[PackageMaker, None, None]:
    with open(path, 'rb') as jsonfile:
        for key, packagedata in JsonSlicer(jsonfile, ('packages', None), encoding='utf-8', path_mode='map_keys'):
            pkg = factory.begin(key)

            # see how Nix parses 'derivative' names in
            # https://github.com/NixOS src/libexpr/names.cc, DrvName::DrvName
            # it just splits on dash followed by non-letter
            #
            # this doesn't work well in 100% of cases, it's an upstream problem
            match = re.match('(.+?)-([^a-zA-Z].*)$', packagedata['name'])
            if not match:
                factory.log('cannot extract version: {}/{}'.format(key, packagedata['name']), severity=Logger.ERROR)
                continue

            pkg.set_name(match.group(1))
            pkg.set_version(match.group(2))

            # some exceptions
            for prefix in ('75dpi', '100dpi'):
                if pkg.version.startswith(prefix):
                    pkg.set_name(pkg.name + '-' + prefix)
                    pkg.set_version(pkg.version[len(prefix) + 1:])

            merged = pkg.name + '-' + pkg.version
            for pkgname in ['liblqr-1', 'python2.7-3to2', 'python3.6-3to2', 'libretro-4do', 'polkit-qt-1-qt5', 'polkit-qt-1-qt4']:
                if merged.startswith(pkgname):
                    pkg.set_name(pkgname)
                    pkg.set_version(merged[len(pkgname) + 1:])

            keyparts = key.split('.')
            if len(keyparts) > 1:
                pkg.add_categories(keyparts[0])

            if pkg.name.endswith('-git'):
                pkg.set_name(pkg.name[:-4])
                pkg.set_flags(PackageFlags.ignore)

            if re.match('.*20[0-9]{2}-[0-9]{2}-[0-9]{2}', pkg.version):
                pkg.set_flags(PackageFlags.ignore)

            if re.match('[0-9a-f]*[a-f][0-9a-f]*$', pkg.version) and len(pkg.version) >= 7:
                pkg.log('ignoring version which looks like commit hash: {}'.format(pkg.version), severity=Logger.ERROR)
                pkg.set_flags(PackageFlags.ignore)

            meta = packagedata['meta']

            pkg.add_homepages(meta.get('homepage'))

            if 'description' in meta:
                pkg.set_summary(meta['description'].replace('\n', ' '))

            if 'maintainers' in meta:
                if not isinstance(meta['maintainers'], list):
                    pkg.log('maintainers is not a list: {}'.format(meta['maintainers']), severity=Logger.ERROR)
                else:
                    pkg.add_maintainers(extract_nix_maintainers(meta['maintainers']))

            if 'license' in meta:
                pkg.add_licenses(extract_nix_licenses(meta['license']))

            if 'position' in meta:
                posfile, posline = meta['position'].rsplit(':', 1)
                pkg.set_extra_field('posfile', posfile)
                pkg.set_extra_field('posline', posline)

                if posfile.startswith('pkgs/development/haskell-modules'):
                    # XXX: haskell modules are autogenerated in nix:
                    # https://github.com/NixOS/nixpkgs/commits/master/pkgs/development/haskell-modules/hackage-packages.nix
                    pkg.set_flags(PackageFlags.rolling)

            yield pkg
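# A quick illustration of the dash-followed-by-non-letter split used above;
# the derivation names are made up for the example.
import re

for drvname in ('firefox-68.0.2', 'libreoffice-still-6.1.6.3'):
    m = re.match('(.+?)-([^a-zA-Z].*)$', drvname)
    print(m.group(1), m.group(2))
# firefox 68.0.2
# libreoffice-still 6.1.6.3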
def test_constructs(self):
    self.assertIsNotNone(JsonSlicer(io.StringIO('0'), ()))
results = []

with TestCase('json.loads()', 'str', args.json_size, results):
    for n, item in enumerate(json.loads(jsondata)['level1']['level2']):
        assert (item['id'] == n)

with TestCase('json.load(StringIO())', 'str', args.json_size, results):
    gen = io.StringIO(jsondata)
    for n, item in enumerate(json.load(gen)['level1']['level2']):
        assert (item['id'] == n)

with TestCase('**JsonSlicer (no paths, binary input, binary output)**', 'bytes', args.json_size, results):
    gen = io.BytesIO(jsondata.encode('utf-8'))
    parser = JsonSlicer(gen, (b'level1', b'level2', None), binary=True)
    for n, item in enumerate(parser):
        assert (item[b'id'] == n)

with TestCase('**JsonSlicer (no paths, unicode input, binary output)**', 'bytes', args.json_size, results):
    gen = io.StringIO(jsondata)
    parser = JsonSlicer(gen, (b'level1', b'level2', None), binary=True)
    for n, item in enumerate(parser):
        assert (item[b'id'] == n)

with TestCase('**JsonSlicer (no paths, binary input, unicode output)**', 'str', args.json_size, results):
    gen = io.BytesIO(jsondata.encode('utf-8'))
    parser = JsonSlicer(gen, ('level1', 'level2', None))
    for n, item in enumerate(parser):
        assert (item['id'] == n)
def test_allows_list(self):
    self.assertIsNotNone(list(JsonSlicer(io.StringIO('0'), ())))
def check_for_updates(self):
    file_name = 'online_filmlist'
    full_file_name = DirectoryMapper.abspath(self.plugin_id, 'tmpfs', file_name, True)
    try:
        # does the file exist at all already?
        filmlist_time_stamp = DirectoryMapper.getmtime(self.plugin_id, 'tmpfs', file_name)
    except:
        filmlist_time_stamp = 0

    if filmlist_time_stamp < time.time() - 60 * 60 * 48:  # file is older than 48 hours
        '''
        Bootstrap to read the filmlist:
        1. read the list of actual filmlist URLs from https://res.mediathekview.de/akt.xml
        '''
        self.logger.info("Retrieve film list")
        try:
            var_url = urlopen('https://res.mediathekview.de/akt.xml')
            server_list = parse(var_url)
            print(server_list)
            url = None
            prio = 999  # dummy start value
            for item in server_list.iterfind('Server'):
                this_prio = int(item.findtext('Prio'))
                if this_prio < prio:  # filter for the server with the lowest prio
                    prio = this_prio
                    url = item.findtext('URL')
            self.logger.info(f'Mediathek filmlist url {url}')
            if url:
                try:
                    urlretrieve(url, full_file_name + '.pack')
                    self.logger.info("filmlist downloaded")
                except Exception as e:
                    self.logger.warning(f'failed filmlist download {str(e)}')
                try:
                    with DirectoryMapper.open(self.plugin_id, 'tmpfs', file_name, 'wb') as unpack_file_handle:
                        with lzma.open(DirectoryMapper.open(self.plugin_id, 'tmpfs', file_name + '.pack', 'rb')) as archive_file_handle:
                            bytes = archive_file_handle.read(4096)
                            while bytes:
                                unpack_file_handle.write(bytes)
                                bytes = archive_file_handle.read(4096)
                    self.reset_index()  # destroy the existing index
                    self.logger.info('filmlist server list unpacked')
                except Exception as e:
                    print('failed filmlist unpack', str(e))
        except Exception as e:
            print('failed filmlist server list download')
    else:
        if not self.is_empty() and self.providers:
            return  # no need to load, we already have movie data

    loader_remember_data = {'provider': '', 'category': ''}

    try:
        with DirectoryMapper.open(self.plugin_id, 'tmpfs', file_name) as data:
            self.reset_index()
            with self.whoosh_ix.writer() as whoosh_writer:
                count = 0
                self.logger.info(f"loading filmlist...")
                for liste in JsonSlicer(data, ('X',), path_mode='map_keys'):
                    count += 1
                    data_array = liste[1]
                    # "Sender" 0, "Thema" 1, "Titel" 2, "Datum" 3, "Zeit" 4,
                    # "Dauer" 5, "Größe [MB]" 6, "Beschreibung" 7, "Url" 8,
                    # "Website" 9, "Url Untertitel" 10, "Url RTMP" 11,
                    # "Url Klein" 12, "Url RTMP Klein" 13, "Url HD" 14,
                    # "Url RTMP HD" 15, "DatumL" 16, "Url History" 17,
                    # "Geo" 18, "neu" 19
                    provider = data_array[0]
                    category = data_array[1]
                    if provider:
                        loader_remember_data['provider'] = provider
                    else:
                        provider = loader_remember_data['provider']
                    if category:
                        loader_remember_data['category'] = category
                    else:
                        category = loader_remember_data['category']
                    if category == 'Livestream':
                        source_type = defaults.MOVIE_TYPE_STREAM
                        plugin_name = self.plugin_names[1]
                        provider = provider.replace('Livestream', '').strip()
                        #print("Livestream")
                    else:
                        plugin_name = self.plugin_names[0]
                        source_type = defaults.MOVIE_TYPE_RECORD
                    self.providers.add(provider)
                    try:
                        # livestreams do not have a duration
                        timestamp = int(data_array[16])
                        timestamp_datetime = datetime.datetime.fromtimestamp(timestamp)
                    except:
                        timestamp = 1
                        timestamp_datetime = datetime.datetime.fromtimestamp(timestamp)
                    movie_info = MovieInfo(
                        url=data_array[8],
                        mime='video/mp4',
                        title=data_array[2],
                        category=category,
                        source=plugin_name,
                        source_type=source_type,
                        provider=provider,
                        timestamp=timestamp,
                        duration=self.time_string_to_secs(data_array[5]),
                        description=data_array[7],
                    )
                    # fill the search engine
                    whoosh_writer.update_document(
                        source=plugin_name,
                        source_type=source_type,
                        provider=provider,
                        title=data_array[2],
                        category=category,
                        uri=movie_info['uri'],
                        description=data_array[7],
                        timestamp=timestamp_datetime,
                        url=movie_info['url'],
                        mime=movie_info['mime'],
                        duration=movie_info['duration'])
                    if not plugin_name in self.movies:
                        self.movies[plugin_name] = {}
                    # experimental: Do not save the movies in mem anymore, just in Whoosh
                    #self.movies[plugin_name][movie_info['uri']]=movie_info
                self.provider_storage.write('provider_cache', list(self.providers))
                self.logger.info(f"filmlist loaded, {count} entries")
    except Exception as err:
        self.logger.warning(f'failed to read filmlist:{err}')
def test_accepts_bytes(self):
    self.assertIsNotNone(JsonSlicer(io.BytesIO(b'0'), ()))
    self.assertIsNotNone(next(JsonSlicer(io.BytesIO(b'0'), ())))
def wrapper():
    js = JsonSlicer(io.BytesIO(b'0'), (gen_bytes(), gen_bytes()))
    js.__init__(io.BytesIO(b'0'), (gen_bytes(), gen_bytes()))
def _process(self, stream: IO[bytes]) -> bool:
    num_updates = 0

    for cve in JsonSlicer(stream, ('CVE_Items', None)):
        cve_id: str = cve['cve']['CVE_data_meta']['ID']
        published: str = cve['publishedDate']
        last_modified: str = cve['lastModifiedDate']

        usable_matches: Set[CPEMatch] = set()

        for configuration in cve['configurations']['nodes']:
            if configuration['operator'] != 'OR':
                continue  # not supported

            if 'cpe_match' not in configuration:
                continue

            usable_matches.update(filter(_is_good_match, map(CPEMatch, configuration['cpe_match'])))

        matches_for_json = [
            [
                match.cpe.vendor,
                match.cpe.product,
                match.cpe.edition,
                match.cpe.lang,
                match.cpe.sw_edition,
                match.cpe.target_sw,
                match.cpe.target_hw,
                match.cpe.other,

                match.start_version,
                match.end_version,
                match.start_version_excluded,
                match.end_version_excluded,
            ]
            for match in usable_matches
        ]

        with self._db.cursor() as cur:
            cur.execute(
                """
                WITH updated_cves AS (
                    INSERT INTO cves (
                        cve_id,
                        published,
                        last_modified,
                        matches,
                        cpe_pairs
                    )
                    VALUES (
                        %(cve_id)s,
                        %(published)s,
                        %(last_modified)s,
                        %(matches)s,
                        %(cpe_pairs)s
                    )
                    ON CONFLICT(cve_id) DO UPDATE
                    SET
                        published = %(published)s,  -- not expected to change in fact
                        last_modified = %(last_modified)s,
                        matches = %(matches)s,
                        cpe_pairs = %(cpe_pairs)s
                    WHERE %(last_modified)s > cves.last_modified
                    RETURNING cpe_pairs
                ), register_cpe_updates AS (
                    INSERT INTO cpe_updates (
                        cpe_vendor,
                        cpe_product
                    )
                    SELECT
                        split_part(unnest(cpe_pairs), ':', 1) AS cpe_vendor,
                        split_part(unnest(cpe_pairs), ':', 2) AS cpe_product
                    FROM updated_cves
                )
                SELECT 1 FROM updated_cves
                """,
                {
                    'cve_id': cve_id,
                    'published': published,
                    'last_modified': last_modified,
                    'matches': psycopg2.extras.Json(matches_for_json) if matches_for_json else None,
                    'cpe_pairs': list(set(f'{match.cpe.vendor}:{match.cpe.product}' for match in usable_matches)) or None
                }
            )

            num_updates += sum(row[0] for row in cur.fetchall())

    self._num_updates += num_updates

    return num_updates > 0
def test_accepts_unicode(self):
    self.assertIsNotNone(JsonSlicer(io.StringIO('0'), ()))
    self.assertIsNotNone(next(JsonSlicer(io.StringIO('0'), ())))
def wrapper():
    JsonSlicer(io.BytesIO(b'0'), ())
from jsonslicer import JsonSlicer

with open('/media/data/ky/local/claims_test.json', 'r') as data:
    for k in JsonSlicer(data, ('CN105019422A', None), path_mode='full'):
        print(len(k))
#!/usr/bin/env python
import os
import argparse
import dicttoxml
from jsonslicer import JsonSlicer
from datetime import datetime

parser = argparse.ArgumentParser(description='This script will take an input AU json file and convert it into an XML file (1 line per Authorization)')
parser.add_argument('-i', '--input', help='input AU json file', required=True, action='store')
parser.add_argument('-o', '--output', help='output xml file', required=True, action='store')
args = parser.parse_args()

if os.path.exists(args.output):
    os.remove(args.output)

def gettime():
    return datetime.now().strftime("%Y%m%d %H:%M:%S")

print("Converting {} to xml...".format(args.input))

with open(args.input) as data, open(args.output, "a+") as xmlFile:
    auths = 0
    for case_level in JsonSlicer(data, ('CaseLevel', None)):
        xml = dicttoxml.dicttoxml(case_level, custom_root='CaseLevel', attr_type=False)
        xmlFile.write(xml.decode("utf-8") + "\n")
        auths += 1
        if auths % 5000 == 0:
            print("{}|converted {} auths...".format(gettime(), str(auths)))

print("{}|Total Authorizations converted: {}".format(gettime(), str(auths)))
print("{} created".format(args.output))
def wrapper():
    JsonSlicer(io.BytesIO(b'0'), (gen_bytes(), gen_bytes()))
# sudo apt install libyajl-dev
# sudo pip3 install jsonslicer
import sys
from jsonslicer import JsonSlicer

# Iterate over collection(s) by using wildcards in the path:
with open(sys.argv[1]) as data:
    for liste in JsonSlicer(data, ('X',), path_mode='map_keys'):
        print(liste[1][2])
        print(liste)
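# A self-contained sketch of the two output modes used by the filmlist
# snippets here; the inline document (with repeated "X" keys, as in the
# mediathekview filmlist format) is made up for illustration.
import io
from jsonslicer import JsonSlicer

doc = '{"X": ["ARD", "Doku", "Titel 1"], "X": ["ZDF", "Film", "Titel 2"]}'

# default mode: only the matched values are yielded
for entry in JsonSlicer(io.StringIO(doc), ('X',)):
    print(entry)

# map_keys mode: each item is a (key, value) pair, hence liste[1] above
for key, entry in JsonSlicer(io.StringIO(doc), ('X',), path_mode='map_keys'):
    print(key, entry[2])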
def load_filmlist(self, file_name):
    print(os.path.abspath(file_name))
    try:
        # does the file exist at all already?
        filmlist_time_stamp = os.path.getmtime(file_name)
    except:
        filmlist_time_stamp = 0
    print("timestamp", filmlist_time_stamp, time.time())
    if filmlist_time_stamp < time.time() - 60 * 60 * 48:  # file is older than 48 hours
        print("Retrieve film list")
        try:
            var_url = urlopen('https://res.mediathekview.de/akt.xml')
            server_list = parse(var_url)
            print(server_list)
            url = None
            prio = 999  # dummy start value
            for item in server_list.iterfind('Server'):
                this_prio = int(item.findtext('Prio'))
                if this_prio < prio:  # filter for the server with the lowest prio
                    prio = this_prio
                    url = item.findtext('URL')
            print(url)
            print(prio)
            print()
            if url:
                try:
                    urlretrieve(url, file_name + '.pack')
                except Exception as e:
                    print('failed filmlist download', str(e))
                try:
                    with open(file_name, 'wb') as unpack_file_handle:
                        unpack_file_handle.write(lzma.open(file_name + '.pack').read())
                except Exception as e:
                    print('failed filmlist unpack', str(e))
        except Exception as e:
            print('failed filmlist server list download')

    loader_remember_data = {'provider': '', 'category': ''}

    '''
    Bootstrap to read the filmlist:
    1. read the list of actual filmlist URLs from https://res.mediathekview.de/akt.xml
    '''
    #with open('/home/steffen//Desktop/workcopies/schnipsl/Filmliste-akt') as data:
    with open(file_name) as data:
        count = 0
        for liste in JsonSlicer(data, ('X',), path_mode='map_keys'):
            count += 1
            data_array = liste[1]
            # "Sender" 0, "Thema" 1, "Titel" 2, "Datum" 3, "Zeit" 4,
            # "Dauer" 5, "Größe [MB]" 6, "Beschreibung" 7, "Url" 8,
            # "Website" 9, "Url Untertitel" 10, "Url RTMP" 11,
            # "Url Klein" 12, "Url RTMP Klein" 13, "Url HD" 14,
            # "Url RTMP HD" 15, "DatumL" 16, "Url History" 17,
            # "Geo" 18, "neu" 19
            provider = data_array[0]
            category = data_array[1]
            if provider:
                loader_remember_data['provider'] = provider
            else:
                provider = loader_remember_data['provider']
            if category:
                loader_remember_data['category'] = category
            else:
                category = loader_remember_data['category']
            if category == 'Livestream':
                source_type = defaults.MOVIE_TYPE_STREAM
                plugin_name = self.plugin_names[1]
                provider = provider.replace('Livestream', '').strip()
                #print("Livestream")
            else:
                plugin_name = self.plugin_names[0]
                source_type = defaults.MOVIE_TYPE_RECORD
            self.providers.add(provider)
            new_movie = Movie(source=plugin_name,
                              source_type=source_type,
                              provider=provider,
                              category=category,
                              title=data_array[2],
                              timestamp=data_array[16],
                              duration=self.time_string_to_secs(data_array[5]),
                              description=data_array[7],
                              url=data_array[8])
            new_movie.add_stream('mp4', '', data_array[8])
            if not plugin_name in self.movies:
                self.movies[plugin_name] = {}
            self.movies[plugin_name][new_movie.uri()] = new_movie
    print("filmlist loaded, {0} entries".format(count))
def wrapper():
    js = JsonSlicer(io.BytesIO(b'[0,1,2]'), (None, ))
    next(js)
def iter_json_list(path: str, json_path: Tuple[Union[str, None], ...], **kwargs: Any) -> Iterable[Dict[str, Any]]:
    with open(path, 'rb') as jsonfile:
        yield from JsonSlicer(jsonfile, json_path, **kwargs)
def iter_json_list(path: str, json_path: tuple[str | None, ...], **kwargs: Any) -> Iterable[dict[str, Any]]:
    with open(path, 'rb') as jsonfile:
        yield from JsonSlicer(jsonfile, json_path, **kwargs)