@classmethod
def load(cls, name: str, **kwargs) -> Dataset:
    # get the request type (weblist vs. user/org)
    from_type = kwargs.pop('from_type')

    # initialize a dataset
    ds = Dataset(**kwargs)

    if from_type == 'list':
        # select the correct weblist loader/parser
        weblists = cls.weblists()
        if name not in weblists:
            raise Exception('Unrecognized github weblist name. '
                            'Valid options are: %s' % list(weblists))

        # load the data
        data = weblists[name]['getter'](api=ds.api, **kwargs)

        # parse the data
        weblists[name]['parser'](ds, data)

    elif from_type in ['user', 'org']:
        # load the data
        data = GithubLoader._get_org_or_user_repos(ds.api, name,
                                                   from_type, **kwargs)

        # parse the data
        GithubLoader._parse_github(ds, data)

    return ds
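# Usage sketch (hedged): the weblist handle and org name below are
# illustrative only; valid weblist handles come from cls.weblists(), and
# any remaining kwargs are forwarded to Dataset.
#
#   ds = GithubLoader.load('top1kstarred', from_type='list')
#   ds = GithubLoader.load('returntocorp', from_type='org')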
@classmethod
def load(cls, name: str, **kwargs) -> Dataset:
    # get the request type (weblist vs. organization)
    from_type = kwargs.pop("from_type")
    if from_type in ["user", "org"]:
        raise Exception(
            "opensuse OBS does not support loading project lists from user/org names."
        )

    # initialize a dataset
    ds = Dataset(**kwargs)

    # select the correct weblist loader/parser
    weblists = cls.weblists()
    if name not in weblists:
        raise Exception(
            "Unrecognized opensuse OBS weblist name. Valid "
            "options are: %s" % list(weblists)
        )

    # load the data
    data = weblists[name]["getter"](api=ds.api, project=name, **kwargs)

    # parse the data
    weblists[name]["parser"](ds, data)

    return ds
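# Usage sketch (hedged): the loader class name and weblist handle are
# assumptions; valid handles come from cls.weblists(). Note the weblist
# name is also forwarded to the getter as project=name.
#
#   ds = ObsLoader.load('factory', from_type='list')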
def load(cls, filepath: str, **kwargs) -> Dataset: """Loads a json file.""" # ensure the user specified which parser to use parser = kwargs.pop('parser', None) if not parser: raise Exception('Please provide the handle to a json parser. ' 'Valid options are: %s' % list(cls.parsers())) # check if the parsing schema exists if not parser in cls.parsers(): raise Exception('Unrecognized json parser name. Review the docs ' 'to ensure any custom json parsers have been ' 'properly registered.') # initialize a dataset ds = Dataset(**kwargs) # load the file data = json.load(open(filepath)) # run the appropriate parser cls.parsers()[parser](ds, data) return ds
@classmethod
def load(cls, name: str, **kwargs) -> Dataset:
    # get the request type (weblist vs. organization)
    from_type = kwargs.pop('from_type')
    if from_type in ['user', 'org']:
        raise Exception(
            'Pypi does not support loading project lists from user/org names.'
        )

    # initialize a dataset
    ds = Dataset(**kwargs)

    # select the correct weblist loader/parser
    weblists = cls.weblists()
    if name not in weblists:
        raise Exception('Unrecognized pypi weblist name. Valid '
                        'options are: %s' % list(weblists))

    # load the data
    data = weblists[name]['getter'](api=ds.api, **kwargs)

    # parse the data
    weblists[name]['parser'](ds, data)

    return ds
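# Usage sketch (hedged): the loader class name and weblist handle are
# assumptions; valid handles come from cls.weblists().
#
#   ds = PypiLoader.load('top4kmonth', from_type='list')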
def load(cls, filepath: str, **kwargs) -> Dataset: """Loads an r2c input set json file.""" # initialize the dataset ds = Dataset(**kwargs) # load the file data = json.load(open(filepath)) # remove any existing projects ds.projects = [] # don't overwrite previously set metadata ds.name = ds.name or data['name'] ds.version = ds.version or data['version'] # grab any optional metadata ds.description = ds.description or data.get('description', None) ds.readme = ds.readme or data.get('readme', None) ds.author = ds.author or data.get('author', None) ds.email = ds.email or data.get('email', None) # generate the projects and versions for input_ in tqdm(data['inputs'], desc=' Importing', unit=' inputs', leave=False): # split out project- vs. version-level information p_data, v_data = {}, {} p_keys = ['repo_url', 'url', 'package_name'] v_keys = ['commit_hash', 'version'] for k, val in input_.items(): # add the attribute to the project or version if k in v_keys: v_data[k] = val elif k in p_keys: p_data[k] = val # get or create the new project project = ds.find_project(**p_data) if project: # update the existing project project.update(**p_data) else: # map json headers to project keywords, as applicable uuids = {} if 'package_name' in p_data: uuids['name'] = lambda p: p.package_name if 'repo_url' in p_data: uuids['url'] = lambda p: p.repo_url if 'url' in p_data: uuids['url'] = lambda p: p.url # create the new project & add it to the dataset p_class = project_map.get(ds.registry, DefaultProject) project = p_class(uuids_=uuids, **p_data) ds.projects.append(project) # create the new version, if it doesn't already exist if v_data: version = project.find_version(**v_data) if version: # update the existing version version.update(**v_data) else: # map csv headers to version keywords, as applicable uuids = {} if 'version' in v_data: uuids['version'] = lambda v: v.version if 'commit_hash' in v_data: uuids['commit'] = lambda v: v.commit_hash # create the new version & add it to the project v_class = version_map.get(ds.registry, DefaultVersion) project.versions.append(v_class(uuids_=uuids, **v_data)) return ds
def load(cls, filepath: str, **kwargs) -> Dataset: """Loads a csv file.""" # user-defined headers override default headers headers = kwargs.pop('fileargs', None) if headers: user_defined = True headers = headers.split() else: user_defined = False # default headers are name and version string headers = ['name', 'v.version'] # initialize a dataset ds = Dataset(**kwargs) # load the file with open(filepath, mode='r', encoding='utf-8-sig') as file: csv_file = csv.reader(file, delimiter=',') for row in csv_file: if row[0].startswith('!'): # read in a header row if not user_defined: # in-file headers override defaults # (but not user-defined headers from the cli) headers = [h[1:] for h in row] else: # ensure we have as many headers as cells in the row if len(row) > len(headers): raise Exception('A column is missing a header. Review ' "the input file's column headers.") # read in a data row p_data, v_data = {}, {} for i, val in enumerate(row): attr = headers[i] # add the data to the project or version if attr.startswith('v.'): v_data[attr[2:]] = val else: p_data[attr] = val # get or create the new project project = ds.find_project(**p_data) if project: # update the existing project project.update(**p_data) else: # map csv headers to project keywords, as applicable uuids, meta = {}, {} if 'name' in p_data: uuids['name'] = lambda p: p.name if 'org' in p_data: meta['org'] = lambda p: p.org if 'url' in p_data: uuids['url'] = lambda p: p.url # create the new project & add it to the dataset p_class = project_map.get(ds.registry, DefaultProject) project = p_class(uuids_=uuids, meta_=meta, **p_data) ds.projects.append(project) # create the new version, if it doesn't already exist if v_data: version = project.find_version(**v_data) if version: # update the existing version version.update(**v_data) else: # map csv headers to version keywords, as applicable uuids = {} if 'version' in v_data: uuids['version'] = lambda v: v.version if 'commit' in v_data: uuids['commit'] = lambda v: v.commit # create the new version & add it to the project v_class = version_map.get(ds.registry, DefaultVersion) project.versions.append( v_class(uuids_=uuids, **v_data)) return ds