def sample(ds: Dataset, n: int,
           on_versions: bool = True, seed: str = None) -> None:
    """Randomly sample the dataset in place.

    With ``on_versions`` set (the default), every project keeps a random
    selection of at most ``n`` of its versions; projects that already have
    ``n`` or fewer versions are left untouched. Otherwise the dataset keeps
    a random selection of at most ``n`` projects; a dataset that already
    has ``n`` or fewer projects is left untouched.

    Args:
        ds: Dataset whose ``projects`` (or their ``versions``) are sampled.
        n: Maximum number of versions per project, or of projects, to keep.
        on_versions: Sample versions within each project rather than
            sampling the projects themselves.
        seed: Optional seed passed to ``random.seed``; a falsy seed
            (``None`` or an empty string) leaves the generator state alone.

    Raises:
        Exception: If projects are being sampled and the dataset has none.
    """

    # seed random, if a seed was provided
    if seed:
        random.seed(seed)

    # select a sample of versions in each project
    if on_versions:
        dropped = 0
        for project in ds.projects:
            before = len(project.versions)
            if before > n:
                project.versions = random.sample(project.versions, n)
            dropped += before - len(project.versions)

        print('         Sampled {:,} versions from each of {:,} projects ({:,} '
              'total versions dropped).'.format(n, len(ds.projects), dropped))
        return

    # select a sample of projects
    orig_count = len(ds.projects)
    if not orig_count:
        # there is nothing to sample from
        raise Exception('Dataset has no projects; cannot sample.')

    # only draw a sample when there are more projects than requested;
    # a dataset that is already small enough is kept whole instead of
    # being reported as empty
    if orig_count > n:
        ds.projects = random.sample(ds.projects, n)

    kept = len(ds.projects)
    print('         Sampled {:,} projects from {:,} (dropped {:,}).'
          .format(kept, orig_count, orig_count - kept))
Exemple #2
0
    def _parse_project(ds: Dataset, data: list) -> None:
        """Replace ds.projects with one PypiProject per entry in data."""
        # the 'name' uuid is looked up from each project's `project` attribute
        uuids = {"name": lambda proj: proj.project}

        # iterate the raw entries behind a progress bar
        entries = tqdm(data, desc="         Loading", unit="project",
                       leave=False)

        # every entry's keys become keyword arguments of the new project
        ds.projects = [PypiProject(uuids_=uuids, **entry)
                       for entry in entries]
Exemple #3
0
    def _parse_hugovk(ds: Dataset, data: list) -> None:
        """Replace ds.projects with one PypiProject per entry in data."""
        from r2c_isg.structures.projects import PypiProject

        # the 'name' uuid is looked up from each project's `project` attribute
        uuids = {'name': lambda proj: proj.project}

        # iterate the raw entries behind a progress bar
        entries = tqdm(data, desc='         Loading', unit='project',
                       leave=False)

        # every entry's keys become keyword arguments of the new project
        ds.projects = [PypiProject(uuids_=uuids, **entry)
                       for entry in entries]
Exemple #4
0
    def _parse_github(ds: Dataset, data: list) -> None:
        """Replace ds.projects with one GithubRepo per entry in data."""
        from r2c_isg.structures.projects import GithubRepo

        # uuid lookups: name comes from `name`, url comes from `html_url`
        uuids = {
            'name': lambda repo: repo.name,
            'url': lambda repo: repo.html_url,
        }

        # meta lookups: org is the second-to-last '/'-separated piece of `url`
        meta = {'org': lambda repo: repo.url.split('/')[-2]}

        # iterate the raw entries behind a progress bar
        entries = tqdm(data, desc='         Loading', unit='project',
                       leave=False)

        # every entry's keys become keyword arguments of the new repo
        ds.projects = [GithubRepo(uuids_=uuids, meta_=meta, **entry)
                       for entry in entries]
    def _parse_niceregistry(ds: Dataset, data: list):
        """Replace ds.projects with one NpmPackage per name in data."""
        from r2c_isg.structures.projects import NpmPackage

        # the 'name' uuid is looked up from each package's `name` attribute
        uuids = {'name': lambda pkg: pkg.name}

        # Note: data list is ordered from most dependents to fewest, so the
        # 1-based position of a name is used as its dependents rank.
        ds.projects = []
        for rank, pkg_name in enumerate(data, start=1):
            npm_pkg = NpmPackage(uuids_=uuids, name=pkg_name,
                                 dependents_rank=rank)
            ds.projects.append(npm_pkg)
def trim(ds: Dataset, n: int, on_versions: bool = False) -> None:
    """Truncate the dataset in place to its leading n items.

    By default the dataset keeps only its first ``n`` projects. With
    ``on_versions`` set, every project instead keeps only its first ``n``
    versions. A one-line summary is printed in either mode.
    """

    if not on_versions:
        # keep the leading n projects
        total = len(ds.projects)
        ds.projects = ds.projects[:n]
        print('         Trimmed to first {:,} projects ({:,} dropped).'.format(
            n, max(total - n, 0)))
        return

    # keep the leading n versions of every project, tallying what is cut
    dropped = 0
    for proj in ds.projects:
        before = len(proj.versions)
        proj.versions = proj.versions[:n]
        dropped += before - len(proj.versions)

    print('         Trimmed to first {:,} versions in each project '
          '({:,} total versions dropped).'.format(n, dropped))
    def load(cls, filepath: str, **kwargs) -> Dataset:
        """Loads an r2c input set json file.

        Args:
            filepath: Path of the json file to read.
            **kwargs: Forwarded unchanged to the ``Dataset`` constructor.
                Metadata that is already set on the resulting dataset is
                kept; only unset fields are filled in from the file.

        Returns:
            A new Dataset holding the projects and versions listed under
            the file's 'inputs' key.

        Raises:
            KeyError: If the json has no 'inputs' key, or lacks 'name' or
                'version' while the dataset has no value of its own.
        """

        # initialize the dataset
        ds = Dataset(**kwargs)

        # load the file; the context manager closes the handle even when
        # parsing fails
        with open(filepath) as file:
            data = json.load(file)

        # remove any existing projects
        ds.projects = []

        # don't overwrite previously set metadata
        ds.name = ds.name or data['name']
        ds.version = ds.version or data['version']

        # grab any optional metadata
        ds.description = ds.description or data.get('description', None)
        ds.readme = ds.readme or data.get('readme', None)
        ds.author = ds.author or data.get('author', None)
        ds.email = ds.email or data.get('email', None)

        # keys that describe a project vs. a single version of it; any
        # other key on an input is ignored
        p_keys = ['repo_url', 'url', 'package_name']
        v_keys = ['commit_hash', 'version']

        # generate the projects and versions
        for input_ in tqdm(data['inputs'], desc='         Importing',
                           unit=' inputs', leave=False):
            # split out project- vs. version-level information
            p_data = {k: val for k, val in input_.items() if k in p_keys}
            v_data = {k: val for k, val in input_.items() if k in v_keys}

            # get or create the new project
            project = ds.find_project(**p_data)
            if project:
                # update the existing project
                project.update(**p_data)

            else:
                # map json headers to project keywords, as applicable
                uuids = {}
                if 'package_name' in p_data:
                    uuids['name'] = lambda p: p.package_name
                if 'repo_url' in p_data:
                    uuids['url'] = lambda p: p.repo_url
                if 'url' in p_data:
                    # when both are present, 'url' wins over 'repo_url'
                    uuids['url'] = lambda p: p.url

                # create the new project & add it to the dataset
                p_class = project_map.get(ds.registry, DefaultProject)
                project = p_class(uuids_=uuids, **p_data)
                ds.projects.append(project)

            # create the new version, if it doesn't already exist
            if v_data:
                version = project.find_version(**v_data)
                if version:
                    # update the existing version
                    version.update(**v_data)

                else:
                    # map json headers to version keywords, as applicable
                    uuids = {}
                    if 'version' in v_data:
                        uuids['version'] = lambda v: v.version
                    if 'commit_hash' in v_data:
                        uuids['commit'] = lambda v: v.commit_hash

                    # create the new version & add it to the project
                    v_class = version_map.get(ds.registry, DefaultVersion)
                    project.versions.append(v_class(uuids_=uuids, **v_data))

        return ds