Example #1
0
def main(args):
    """
    Render tSNE images for the target projects and/or upload them to S3.

    Both stages run unless exactly one of `args.render_only` and
    `args.upload_only` is set, in which case the other stage is skipped.
    """
    render_enabled = args.render_only or not args.upload_only
    upload_enabled = args.upload_only or not args.render_only

    client = boto3.client('s3')

    for directory in (output_dir, cache_dir):
        os.makedirs(directory, exist_ok=True)

    for project_dir in get_target_project_dirs():
        accession = project_dir.name
        tsne = TSNE(accession, args.perplexity)

        if render_enabled:
            try:
                tsne.load()
            except HTTPError:
                # Loading failed, so skip both rendering and uploading for
                # this project and move on to the next one
                log.info(f'Failed to retrieve tSNE data from SCXA for project'
                         f' {accession}')
                continue
            tsne.make_image(args.colormap, args.image_format, args.dpi)

        if upload_enabled:
            try:
                tsne.upload(client, args.bucket)
            except ValueError:
                log.info(f'Nothing to upload for project {accession}')
Example #2
0
    def get_cached_cell_counts(cls) -> MutableMapping[str, int]:
        """
        Build a dictionary of cached cell counts with one entry per target
        project directory, keyed by the directory name (the accession).
        """
        counts = {}
        for project_dir in get_target_project_dirs():
            counts[project_dir.name] = cls.get_cached_cell_count(project_dir)
        return counts
Example #3
0
    def run(self):
        """
        Count the contents of every target project directory using the
        counting methods selected on the command line, and write the cell
        and gene counts for each project when `self.args.write` is set.
        """
        if self.args.verbose:
            logging.getLogger().setLevel(logging.DEBUG)

        # Selected methods are always listed in slow, medium, fast order
        count_method = [
            method
            for method in ('slow', 'medium', 'fast')
            if getattr(self.args, method)
        ]

        for project_dir in get_target_project_dirs():
            counts = self.get_project_contents_count(project_dir, count_method)
            cell_count, gene_count = counts
            if not self.args.write:
                continue
            self.write_cell_gene_count(project_dir, cell_count, gene_count)
Example #4
0
def main():
    """
    Upload the `bundle` directory of every target project through the DSS client.

    Project directories are requested with `follow_links=True`. The name of
    each project directory is used as the bundle UUID and must be a
    canonically formatted UUID. Projects without a `bundle` subdirectory are
    skipped with a warning.
    """
    logging.basicConfig(level=logging.INFO)
    hca_config = HCAConfig()
    hca_config["DSSClient"].swagger_url = "https://dss.dev.data.humancellatlas.org/v1/swagger.json"
    dss = DSSClient(config=hca_config)

    for project in get_target_project_dirs(follow_links=True):
        log.info('Uploading %s', project)
        bundle_uuid = project.name
        # Reject names that are not UUIDs in canonical string form
        assert str(UUID(bundle_uuid)) == bundle_uuid
        bundle = project / 'bundle'

        if not bundle.is_dir():
            log.warning('Skipping %s because metadata is missing', project)
            continue

        # NOTE(review): the callback closes over this iteration's
        # `bundle_uuid`; this assumes `dss.upload` only invokes it before
        # returning - confirm.
        def file_uuid_callback(file_path: str):
            """
            Return the UUID allocated to the given bundle file, checking that
            the provenance of JSON metadata documents matches the expected ID.
            """
            file_path = Path(file_path)
            file_name = file_path.name
            file_uuid = generate_file_uuid(bundle_uuid, file_name)
            log.info('Allocated UUID %s for file %s', file_uuid, file_path)
            if file_name.endswith('.json'):
                # Only keep the file open for as long as it takes to parse it
                with file_path.open('rt') as f:
                    document = json.load(f)
                # links.json is parsed but its contents are not checked
                if file_name != 'links.json':
                    # The project document carries the bundle UUID, every
                    # other document carries its own file UUID
                    expected_id = bundle_uuid if file_name == 'project_0.json' else file_uuid
                    assert document['provenance']['document_id'] == expected_id
            return file_uuid

        response = dss.upload(src_dir=str(bundle),
                              replica='aws',
                              staging_bucket='lon-test-data',
                              bundle_uuid=bundle_uuid,
                              file_uuid_callback=file_uuid_callback)
        print(
            f'Successful upload.  Bundle information is:\n{json.dumps(response, indent=4)}'
        )
Example #5
0
def main():
    """Run `extract_recursive` on each target project directory."""
    for target in get_target_project_dirs():
        extract_recursive(target)
 def run(self):
     """
     Pass every target project directory (requested with
     `follow_links=True`) to `main.upload_files_to_bucket`.
     """
     targets = get_target_project_dirs(follow_links=True)
     for target in targets:
         main.upload_files_to_bucket(target)
Example #7
0
def populate_all_static_projects(file_pattern):
    """
    Call `populate_static_project` with the given file pattern for each
    target project directory.
    """
    project_dirs = get_target_project_dirs()
    for directory in project_dirs:
        populate_static_project(directory, file_pattern)
def overview_report() -> Mapping[UUID, ProjectReport]:
    """
    Generate a report that reconciles the presence of certain resources associated with each project

    The report is assembled in consecutive passes: project UUIDs found in the
    projects path, accession directories (expected to be symlinks to
    directories named by UUID), spreadsheets, cached cell and gene counts and
    finally per-project file counts and zipped matrices. A pass that comes
    across a project not yet in the report adds a new entry for it.

    :return: An overview report in the form of a dict mapping UUIDs to objects containing project details
    """

    report = {}

    logging.debug('Searching for project uuids in the projects path ...')
    for uuid in get_project_uuids():
        report[uuid] = ProjectReport(uuid=uuid,
                                     project_path=projects_path / str(uuid))

    logging.debug('Searching for accession ids in the projects path ...')
    for project_dir in get_target_project_dirs():
        accession_id = project_dir.name
        # accession ids are symlinks to folders named by uuid
        accession_symlink = projects_path / accession_id
        assert accession_symlink.is_symlink()
        expanded_path = accession_symlink.resolve()

        # Stays None if the symlink target is missing or not named by a UUID
        uuid_from_symlink = None
        if not expanded_path.exists():
            logging.debug('Error: Symlink %s has invalid target %s', accession_id, expanded_path)
        else:
            try:
                uuid_from_symlink = UUID(expanded_path.name)
            except ValueError:
                logging.debug('Error: Symlink %s target %s is invalid UUID', accession_id, expanded_path.name)

        if uuid_from_symlink is not None:
            if uuid_from_symlink in report:  # Update existing project in report with accession info
                project = report[uuid_from_symlink]
                project.accession = accession_id
                project.symlink = accession_symlink
            else:  # Create a new project in the report using the uuid from symlink's target
                assert not (projects_path / str(uuid_from_symlink)).exists(), \
                    f'get_project_uuids() failed to find {str(uuid_from_symlink)}'
                logging.debug('New accession %s found as accession with symlink, adding uuid %s',
                              accession_id, str(uuid_from_symlink))
                report[uuid_from_symlink] = ProjectReport(uuid=uuid_from_symlink,
                                                          accession=accession_id,
                                                          symlink=accession_symlink)
        else:
            # Without a usable symlink target, fall back to the UUID generated from the accession
            uuid_from_accession_id = UUID(generate_project_uuid(accession_id))
            if uuid_from_accession_id in report:  # update existing project in report
                report[uuid_from_accession_id].accession = accession_id
            else:
                logging.debug('New accession %s found as accession without symlink, adding uuid %s',
                              accession_id, str(uuid_from_accession_id))
                report[uuid_from_accession_id] = ProjectReport(uuid=uuid_from_accession_id,
                                                               accession=accession_id)

    logging.debug('Searching for spreadsheets ...')
    for accession_id, file in get_target_spreadsheets().items():
        logging.debug('Checking: %s', file)
        uuid = UUID(generate_project_uuid(accession_id))
        try:
            report[uuid].spreadsheet = file
        except KeyError:
            logging.debug('New accession %s found in spreadsheets, adding uuid %s',
                          accession_id, str(uuid))
            report[uuid] = ProjectReport(uuid=uuid,
                                         accession=accession_id,
                                         spreadsheet=file)

    logging.debug('Fetching cell count ...')
    for accession_id, cell_count in CountCells.get_cached_cell_counts().items():
        uuid = UUID(generate_project_uuid(accession_id))
        try:
            report[uuid].cell_count = cell_count
        except KeyError:
            logging.debug('New accession %s found, adding uuid %s',
                          accession_id, str(uuid))
            report[uuid] = ProjectReport(uuid=uuid,
                                         accession=accession_id,
                                         cell_count=cell_count)

    logging.debug('Fetching gene count ...')
    for accession_id, gene_count in CountCells.get_cached_gene_counts().items():
        uuid = UUID(generate_project_uuid(accession_id))
        try:
            report[uuid].gene_count = gene_count
        except KeyError:
            logging.debug('New accession %s found, adding uuid %s',
                          accession_id, str(uuid))
            report[uuid] = ProjectReport(uuid=uuid,
                                         accession=accession_id,
                                         gene_count=gene_count)

    # The remaining passes only update existing entries, so iterating over
    # the report while assigning attributes is safe

    logging.debug('Counting geo files ...')
    for uuid, project in report.items():
        path = projects_path / str(uuid) / 'geo'
        project.geo_files = get_file_count(path, glob='**/*')

    logging.debug('Counting matrices ...')
    for uuid, project in report.items():
        path = projects_path / str(uuid) / 'matrices'
        project.num_matrices = get_file_count(path, glob='**/matrix.mtx.gz')

    logging.debug('Checking for zipped_matrix ...')
    for uuid, project in report.items():
        zipped_matrix = projects_path / str(uuid) / 'bundle' / 'matrix.mtx.zip'
        if zipped_matrix.is_file():
            project.zipped_matrix = zipped_matrix

    logging.debug('Checking for metadata_json_count ...')
    for uuid, project in report.items():
        path = projects_path / str(uuid) / 'bundle'
        project.num_metadata_files = get_file_count(path, glob='*.json')

    logging.debug('Checking for num_hca_metadata_files ...')
    for uuid, project in report.items():
        path = projects_path / str(uuid) / 'hca'
        project.num_hca_metadata_files = get_file_count(path, glob='*.json')

    logging.debug('Checking for zipped_hca_matrix ...')
    for uuid, project in report.items():
        zipped_matrix = projects_path / str(uuid) / 'hca' / 'matrix.mtx.zip'
        if zipped_matrix.is_file():
            project.zipped_hca_matrix = zipped_matrix

    return report
Example #9
0
 def run(self):
     """Clean each target project directory via `self.clean_project`."""
     for project_dir in get_target_project_dirs():
         self.clean_project(project_dir)