def sample(ds: 'Dataset', n: int, on_versions: bool = True, seed: 'str | None' = None) -> None:
    """Randomly samples n versions or projects in place.

    Args:
        ds: dataset to sample (mutated in place).
        n: max number of versions per project (or projects) to keep.
        on_versions: if True, sample versions within each project;
            otherwise sample the project list itself.
        seed: optional seed for reproducible sampling.

    Raises:
        Exception: if the dataset contains no projects when sampling
            projects.
    """
    # seed random, if a seed was provided
    if seed:
        random.seed(seed)

    # select a sample of versions in each project
    if on_versions:
        dropped = 0
        for project in ds.projects:
            dropped += len(project.versions)
            if len(project.versions) > n:
                project.versions = random.sample(project.versions, n)
            # projects with <= n versions contribute 0 to the dropped count
            dropped -= len(project.versions)
        print(' Sampled {:,} versions from each of {:,} projects ({:,} '
              'total versions dropped).'.format(n, len(ds.projects), dropped))

    # select a sample of projects
    elif len(ds.projects) > n:
        orig_count = len(ds.projects)
        ds.projects = random.sample(ds.projects, n)
        print(' Sampled {:,} projects from {:,} (dropped {:,}).'
              .format(n, orig_count, max(orig_count - n, 0)))

    # an empty dataset genuinely cannot be sampled
    elif not ds.projects:
        raise Exception('Dataset has no projects; cannot sample.')

    # NOTE: previously this branch raised "Dataset has no projects" even
    # when projects existed but len(ds.projects) <= n; a sample size at
    # least as large as the dataset is now a no-op instead.
def _parse_project(ds: Dataset, data: list) -> None:
    """Converts raw pypi project dicts into PypiProject objects on ds."""
    # import locally, consistent with the sibling parser functions
    # (previously this parser alone relied on a module-level import)
    from r2c_isg.structures.projects import PypiProject

    # map data keys to project keywords: the 'name' uuid resolves to the
    # 'project' attribute loaded from the raw data
    uuids = {"name": lambda p: p.project}

    # create the projects
    ds.projects = [
        PypiProject(uuids_=uuids, **d) for d in tqdm(
            data, desc=" Loading", unit="project", leave=False)
    ]
def _parse_hugovk(ds: Dataset, data: list) -> None:
    """Builds PypiProject objects from hugovk-format records on ds."""
    from r2c_isg.structures.projects import PypiProject

    # the 'name' uuid resolves to the record's 'project' attribute
    id_getters = {'name': lambda p: p.project}

    # build one PypiProject per raw record, with a progress bar
    projects = []
    for record in tqdm(data, desc=' Loading', unit='project', leave=False):
        projects.append(PypiProject(uuids_=id_getters, **record))
    ds.projects = projects
def _parse_github(ds: Dataset, data: list) -> None:
    """Builds GithubRepo objects from raw github records on ds."""
    from r2c_isg.structures.projects import GithubRepo

    # uuid getters: repo name and its html url
    id_getters = {
        'name': lambda p: p.name,
        'url': lambda p: p.html_url,
    }
    # metadata getters: owning org, taken from the api url path
    meta_getters = {
        'org': lambda p: p.url.split('/')[-2],
    }

    # wrap the records in a progress bar and build the repos
    progress = tqdm(data, desc=' Loading', unit='project', leave=False)
    ds.projects = [
        GithubRepo(uuids_=id_getters, meta_=meta_getters, **record)
        for record in progress
    ]
def _parse_niceregistry(ds: Dataset, data: list) -> None:
    """Converts an ordered list of npm package names into NpmPackage objects.

    Args:
        ds: dataset to populate (mutated in place).
        data: package names, ordered from most dependents to fewest.
    """
    from r2c_isg.structures.projects import NpmPackage

    # map data keys to package keywords
    uuids = {'name': lambda p: p.name}

    # create the projects; dependents_rank is the package's 1-based
    # position in the most-to-fewest-dependents ordering
    ds.projects = [
        NpmPackage(uuids_=uuids, name=name, dependents_rank=rank)
        for rank, name in enumerate(data, start=1)
    ]
def trim(ds: Dataset, n: int, on_versions: bool = False) -> None:
    """Truncates the dataset in place to its first n versions or projects."""
    if on_versions:
        # cut each project's version list down to its first n entries,
        # tallying how many versions were removed overall
        removed = 0
        for proj in ds.projects:
            before = len(proj.versions)
            proj.versions = proj.versions[:n]
            removed += before - len(proj.versions)
        print(' Trimmed to first {:,} versions in each project '
              '({:,} total versions dropped).'.format(n, removed))
    else:
        # keep only the first n projects
        total = len(ds.projects)
        ds.projects = ds.projects[:n]
        print(' Trimmed to first {:,} projects ({:,} dropped).'.format(
            n, max(total - n, 0)))
def load(cls, filepath: str, **kwargs) -> Dataset:
    """Loads an r2c input set json file.

    Args:
        filepath: path to the input set json file.
        **kwargs: forwarded to the Dataset constructor.

    Returns:
        The populated Dataset.
    """
    # initialize the dataset
    ds = Dataset(**kwargs)

    # load the file; close the handle promptly instead of leaking it
    # (was json.load(open(filepath)), which never closed the file)
    with open(filepath) as f:
        data = json.load(f)

    # remove any existing projects
    ds.projects = []

    # don't overwrite previously set metadata
    ds.name = ds.name or data['name']
    ds.version = ds.version or data['version']

    # grab any optional metadata
    ds.description = ds.description or data.get('description', None)
    ds.readme = ds.readme or data.get('readme', None)
    ds.author = ds.author or data.get('author', None)
    ds.email = ds.email or data.get('email', None)

    # generate the projects and versions
    for input_ in tqdm(data['inputs'], desc=' Importing',
                       unit=' inputs', leave=False):

        # split out project- vs. version-level information
        p_data, v_data = {}, {}
        p_keys = ['repo_url', 'url', 'package_name']
        v_keys = ['commit_hash', 'version']
        for k, val in input_.items():
            # add the attribute to the project or version
            if k in v_keys:
                v_data[k] = val
            elif k in p_keys:
                p_data[k] = val

        # get or create the new project
        project = ds.find_project(**p_data)
        if project:
            # update the existing project
            project.update(**p_data)
        else:
            # map json headers to project keywords, as applicable;
            # 'url' intentionally overrides 'repo_url' when both exist
            uuids = {}
            if 'package_name' in p_data:
                uuids['name'] = lambda p: p.package_name
            if 'repo_url' in p_data:
                uuids['url'] = lambda p: p.repo_url
            if 'url' in p_data:
                uuids['url'] = lambda p: p.url

            # create the new project & add it to the dataset
            p_class = project_map.get(ds.registry, DefaultProject)
            project = p_class(uuids_=uuids, **p_data)
            ds.projects.append(project)

        # create the new version, if it doesn't already exist
        if v_data:
            version = project.find_version(**v_data)
            if version:
                # update the existing version
                version.update(**v_data)
            else:
                # map csv headers to version keywords, as applicable
                uuids = {}
                if 'version' in v_data:
                    uuids['version'] = lambda v: v.version
                if 'commit_hash' in v_data:
                    uuids['commit'] = lambda v: v.commit_hash

                # create the new version & add it to the project
                v_class = version_map.get(ds.registry, DefaultVersion)
                project.versions.append(v_class(uuids_=uuids, **v_data))

    return ds