def test_load_weblist():
    # test github
    ds = Dataset.load_web('top1kstarred', registry='github', from_type='list',
                          cache_dir=CACHE_DIR, debug=True,
                          github_pat=os.getenv('GITHUB_PAT'))
    ds.trim(10)
    ds.get_projects_meta()
    ds.get_project_versions(historical='latest')
    ds.update(**{'name': 'test', 'version': '1.0'})
    ds.backup('../test.p')
    ds = Dataset.restore('../test.p')
    ds.export_inputset('../test.json')

    # test npm
    ds = Dataset.load_web('allbydependents', registry='npm', from_type='list',
                          cache_dir=CACHE_DIR, debug=True)
    ds.trim(10)
    ds.get_projects_meta()
    ds.get_project_versions(historical='latest')
    ds.update(**{'name': 'test', 'version': '1.0'})
    ds.backup('../test.p')
    ds = Dataset.restore('../test.p')
    ds.export_inputset('../test.json')

    # test pypi
    ds = Dataset.load_web('top4kyear', registry='pypi', from_type='list',
                          cache_dir=CACHE_DIR, debug=True)
    ds.trim(10)
    ds.get_projects_meta()
    ds.get_project_versions(historical='latest')
    ds.update(**{'name': 'test', 'version': '1.0'})
    ds.backup('../test.p')
    ds = Dataset.restore('../test.p')
    ds.export_inputset('../test.json')

    # cleanup files
    os.remove('../test.p')
    os.remove('../test.json')
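# Running this test is assumed to look roughly like the following (a
# hypothetical invocation, assuming pytest is the runner and that CACHE_DIR
# points at a writable cache directory; the github pass needs a personal
# access token in the GITHUB_PAT environment variable, per the os.getenv
# call above):
#
#   GITHUB_PAT=<your-token> pytest -k test_load_weblist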
def get_fedora_packages():
    ds = Dataset.load_web(
        name="fedora",
        from_type="list",
        registry="portingdb",
        cache_dir=R2C_WEB_CACHE,
    )
    names = {project.get_name() for project in ds.projects}
    return names
def get_top_packages(kind="top4kmonth"):
    ds = Dataset.load_web(
        name=kind,
        from_type="list",
        registry="pypi",
        cache_dir=R2C_WEB_CACHE,
    )
    for project in ds.projects:
        yield project.get_name()
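# Example usage (a sketch, assuming R2C_WEB_CACHE is defined and the
# 'top4kmonth' weblist is reachable): get_top_packages is a generator, so
# it can be consumed lazily or materialized into a set for membership tests.
#
#   top = set(get_top_packages(kind="top4kmonth"))
#   print(len(top))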
def get_opensuse_packages(project):
    ds = Dataset.load_web(
        name=project,
        from_type="list",
        registry="opensuse",
        cache_dir=R2C_WEB_CACHE,
    )
    # Avoid dups like python2-cmd2 and python-cmd2; use a distinct loop
    # variable so the 'project' parameter isn't shadowed
    names = {p.get_name() for p in ds.projects}
    return names
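# Example usage (a hedged sketch; 'devel:languages:python' is an openSUSE
# project name used purely for illustration and may need adjusting): the
# helpers above all return package-name sets, so distro coverage can be
# compared with plain set operations.
#
#   fedora = get_fedora_packages()
#   suse = get_opensuse_packages("devel:languages:python")
#   print(sorted(fedora & suse)[:10])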
def load(ctx, registry, from_type, name_or_path, fileargs):
    """Generates a dataset from a weblist name or file path."""
    backup_ds = None

    try:
        backup_ds = deepcopy(ctx.obj.get('dataset', None))

        if registry == 'noreg':
            registry = None

        global TEMP_SETTINGS
        if from_type == 'file':
            # read in a file (fileargs is either a header string for csv
            # or a parser handle for json)
            ds = Dataset.load_file(name_or_path, registry,
                                   fileargs=fileargs, **TEMP_SETTINGS)
        else:
            # download a weblist or organization repo list
            ds = Dataset.load_web(name_or_path, registry,
                                  from_type=from_type, **TEMP_SETTINGS)

        ctx.obj['dataset'] = ds

        # reset the temporary api/metadata dict
        TEMP_SETTINGS = dict()

    except Exception as e:
        print_error(e, DEBUG)

        # silently restore the dataset
        ctx.obj['dataset'] = backup_ds