def main(args):
    do_render = args.render_only or not args.upload_only
    do_upload = args.upload_only or not args.render_only
    client = boto3.client('s3')
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(cache_dir, exist_ok=True)
    for project_dir in get_target_project_dirs():
        tsne = TSNE(project_dir.name, args.perplexity)
        if do_render:
            try:
                tsne.load()
            except HTTPError:
                log.info(f'Failed to retrieve tSNE data from SCXA for project {project_dir.name}')
                continue
            tsne.make_image(args.colormap, args.image_format, args.dpi)
        if do_upload:
            try:
                tsne.upload(client, args.bucket)
            except ValueError:
                log.info(f'Nothing to upload for project {project_dir.name}')
                continue
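# A minimal sketch of the argument parser that main(args) above expects; the
# flag names mirror the attributes accessed on `args`, but the defaults and
# description here are illustrative assumptions, not the original CLI.
import argparse

def parse_args(argv=None):
    parser = argparse.ArgumentParser(description='Render tSNE images and/or upload them to S3')
    parser.add_argument('--render-only', action='store_true')
    parser.add_argument('--upload-only', action='store_true')
    parser.add_argument('--perplexity', type=int, default=30)  # assumed default
    parser.add_argument('--colormap', default='viridis')  # assumed default
    parser.add_argument('--image-format', default='png')  # assumed default
    parser.add_argument('--dpi', type=int, default=100)  # assumed default
    parser.add_argument('--bucket')
    return parser.parse_args(argv)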
@classmethod
def get_cached_cell_counts(cls) -> MutableMapping[str, int]:
    """
    Return a mapping from accession IDs to cell counts.
    """
    return {
        p.name: cls.get_cached_cell_count(p)
        for p in get_target_project_dirs()
    }
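# The per-project lookup used above is not shown here. A plausible sketch,
# assuming each count is cached as JSON inside the project directory (the
# file name and keys are hypothetical, not the original on-disk format; in
# the original this is a classmethod on CountCells):
import json
from pathlib import Path

def get_cached_cell_count(project_dir: Path) -> int:
    with (project_dir / 'cell_gene_counts.json').open('rt') as f:
        return json.load(f)['cell_count']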
def run(self):
    if self.args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)
    count_method = []
    if self.args.slow:
        count_method.append('slow')
    if self.args.medium:
        count_method.append('medium')
    if self.args.fast:
        count_method.append('fast')
    for project_dir in get_target_project_dirs():
        cell_count, gene_count = self.get_project_contents_count(project_dir, count_method)
        if self.args.write:
            self.write_cell_gene_count(project_dir, cell_count, gene_count)
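# A writer counterpart matching the hypothetical cache layout sketched after
# get_cached_cell_counts above; the file name and keys are assumptions, not
# the original storage format used by write_cell_gene_count.
import json
from pathlib import Path

def write_cell_gene_count(project_dir: Path, cell_count: int, gene_count: int) -> None:
    with (project_dir / 'cell_gene_counts.json').open('wt') as f:
        json.dump({'cell_count': cell_count, 'gene_count': gene_count}, f)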
def main():
    logging.basicConfig(level=logging.INFO)
    hca_config = HCAConfig()
    hca_config["DSSClient"].swagger_url = "https://dss.dev.data.humancellatlas.org/v1/swagger.json"
    dss = DSSClient(config=hca_config)
    projects = get_target_project_dirs(follow_links=True)
    for project in projects:
        log.info('Uploading %s', project)
        bundle_uuid = project.name
        assert str(UUID(bundle_uuid)) == bundle_uuid
        bundle = project / 'bundle'

        def file_uuid_callback(file_path: str):
            file_path = Path(file_path)
            file_name = file_path.name
            file_uuid = generate_file_uuid(bundle_uuid, file_name)
            log.info('Allocated UUID %s for file %s', file_uuid, file_path)
            if file_name.endswith('.json'):
                with file_path.open('rt') as f:
                    document = json.load(f)
                if file_name == 'links.json':
                    pass  # links.json has no single provenance document ID to check
                elif file_name == 'project_0.json':
                    assert document['provenance']['document_id'] == bundle_uuid
                else:
                    assert document['provenance']['document_id'] == file_uuid
            return file_uuid

        if bundle.is_dir():
            response = dss.upload(src_dir=str(bundle),
                                  replica='aws',
                                  staging_bucket='lon-test-data',
                                  bundle_uuid=bundle_uuid,
                                  file_uuid_callback=file_uuid_callback)
            print(f'Successful upload. Bundle information is:\n{json.dumps(response, indent=4)}')
        else:
            log.warning('Skipping %s because metadata is missing', project)
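# file_uuid_callback above depends on generate_file_uuid() being
# deterministic, so that re-running an upload allocates the same UUID for
# the same file of the same bundle. A minimal sketch, assuming a UUIDv5
# derivation from the bundle UUID and file name (the original helper's
# exact derivation is not shown here):
from uuid import UUID, uuid5

def generate_file_uuid(bundle_uuid: str, file_name: str) -> str:
    return str(uuid5(UUID(bundle_uuid), file_name))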
def main():
    project_dirs = get_target_project_dirs()
    for project_dir in project_dirs:
        extract_recursive(project_dir)
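# A minimal sketch of what extract_recursive() might do, assuming it walks a
# project directory and unpacks any archives it finds; the original may
# support more formats or skip already-extracted archives.
import shutil
from pathlib import Path

def extract_recursive(directory: Path) -> None:
    for path in sorted(directory.rglob('*')):
        if path.suffix in ('.zip', '.tar', '.gz'):
            try:
                shutil.unpack_archive(str(path), extract_dir=str(path.parent))
            except shutil.ReadError:
                pass  # not an archive format that shutil recognizes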
def run(self):
    for project_dir in get_target_project_dirs(follow_links=True):
        main.upload_files_to_bucket(project_dir)
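# A minimal sketch of what main.upload_files_to_bucket() might look like,
# assuming it mirrors a project directory into S3 with boto3; the bucket
# name and key layout here are placeholders, not the original's.
import boto3
from pathlib import Path

def upload_files_to_bucket(project_dir: Path, bucket: str = 'example-bucket') -> None:
    s3 = boto3.client('s3')
    for path in project_dir.rglob('*'):
        if path.is_file():
            key = f'{project_dir.name}/{path.relative_to(project_dir)}'
            s3.upload_file(str(path), bucket, key)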
def populate_all_static_projects(file_pattern):
    for project_dir in get_target_project_dirs():
        populate_static_project(project_dir, file_pattern)
def overview_report() -> Mapping[UUID, ProjectReport]:
    """
    Generate a report that reconciles the presence of certain resources
    associated with each project.

    :return: An overview report in the form of a dict mapping UUIDs to
             objects containing project details
    """
    report = {}

    logging.debug('Searching for project UUIDs in the projects path ...')
    for uuid in get_project_uuids():
        project_path = projects_path / str(uuid)
        report[uuid] = ProjectReport(uuid=uuid, project_path=project_path)

    logging.debug('Searching for accession IDs in the projects path ...')
    for accession_id in [p.name for p in get_target_project_dirs()]:
        # Accession IDs are symlinks to folders named by UUID
        accession_symlink = projects_path / accession_id
        assert accession_symlink.is_symlink()
        expanded_path = accession_symlink.resolve()
        if not expanded_path.exists():
            logging.debug('Error: Symlink %s has invalid target %s', accession_id, expanded_path)
            expanded_path = None
        try:
            uuid_from_symlink = UUID(expanded_path.name)
        except AttributeError:
            uuid_from_symlink = None  # Symlink has an invalid target
        except ValueError:
            logging.debug('Error: Symlink %s target %s is not a valid UUID', accession_id, expanded_path.name)
            uuid_from_symlink = None  # Value from symlink target wasn't a valid UUID
        if uuid_from_symlink:
            if uuid_from_symlink in report:
                # Update existing project in report with accession info
                report[uuid_from_symlink].accession = accession_id
                report[uuid_from_symlink].symlink = accession_symlink
            else:
                # Create a new project in the report using the UUID from the symlink's target
                assert not (projects_path / str(uuid_from_symlink)).exists(), \
                    f'get_project_uuids() failed to find {str(uuid_from_symlink)}'
                logging.debug('New accession %s found as accession with symlink, adding uuid %s',
                              accession_id, str(uuid_from_symlink))
                report[uuid_from_symlink] = ProjectReport(uuid=uuid_from_symlink,
                                                          accession=accession_id,
                                                          symlink=accession_symlink)
        else:
            uuid_from_accession_id = UUID(generate_project_uuid(accession_id))
            if uuid_from_accession_id in report:
                # Update existing project in report
                report[uuid_from_accession_id].accession = accession_id
            else:
                logging.debug('New accession %s found as accession without symlink, adding uuid %s',
                              accession_id, str(uuid_from_accession_id))
                report[uuid_from_accession_id] = ProjectReport(uuid=uuid_from_accession_id,
                                                               accession=accession_id)

    logging.debug('Searching for spreadsheets ...')
    for accession_id, file in get_target_spreadsheets().items():
        logging.debug('Checking: %s', file)
        uuid = UUID(generate_project_uuid(accession_id))
        try:
            report[uuid].spreadsheet = file
        except KeyError:
            logging.debug('New accession %s found in spreadsheets, adding uuid %s', accession_id, str(uuid))
            report[uuid] = ProjectReport(uuid=uuid, accession=accession_id, spreadsheet=file)

    logging.debug('Fetching cell counts ...')
    for accession_id, cell_count in CountCells.get_cached_cell_counts().items():
        uuid = UUID(generate_project_uuid(accession_id))
        try:
            report[uuid].cell_count = cell_count
        except KeyError:
            logging.debug('New accession %s found, adding uuid %s', accession_id, str(uuid))
            report[uuid] = ProjectReport(uuid=uuid, accession=accession_id, cell_count=cell_count)

    logging.debug('Fetching gene counts ...')
    for accession_id, gene_count in CountCells.get_cached_gene_counts().items():
        uuid = UUID(generate_project_uuid(accession_id))
        try:
            report[uuid].gene_count = gene_count
        except KeyError:
            logging.debug('New accession %s found, adding uuid %s', accession_id, str(uuid))
            report[uuid] = ProjectReport(uuid=uuid, accession=accession_id, gene_count=gene_count)

    logging.debug('Counting GEO files ...')
    for uuid in report:
        path = projects_path / str(uuid) / 'geo'
        report[uuid].geo_files = get_file_count(path, glob='**/*')

    logging.debug('Counting matrices ...')
    for uuid in report:
        path = projects_path / str(uuid) / 'matrices'
        report[uuid].num_matrices = get_file_count(path, glob='**/matrix.mtx.gz')

    logging.debug('Checking for zipped matrix ...')
    for uuid in report:
        zipped_matrix = projects_path / str(uuid) / 'bundle' / 'matrix.mtx.zip'
        if zipped_matrix.is_file():
            report[uuid].zipped_matrix = zipped_matrix

    logging.debug('Counting metadata JSON files ...')
    for uuid in report:
        path = projects_path / str(uuid) / 'bundle'
        report[uuid].num_metadata_files = get_file_count(path, glob='*.json')

    logging.debug('Counting HCA metadata files ...')
    for uuid in report:
        path = projects_path / str(uuid) / 'hca'
        report[uuid].num_hca_metadata_files = get_file_count(path, glob='*.json')

    logging.debug('Checking for zipped HCA matrix ...')
    for uuid in report:
        zipped_matrix = projects_path / str(uuid) / 'hca' / 'matrix.mtx.zip'
        if zipped_matrix.is_file():
            report[uuid].zipped_hca_matrix = zipped_matrix

    return report
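# A minimal sketch of the get_file_count() helper used throughout
# overview_report(), assuming it counts glob matches under a directory and
# treats a missing directory as zero; the original may differ in detail.
from pathlib import Path

def get_file_count(path: Path, glob: str) -> int:
    if not path.is_dir():
        return 0
    return sum(1 for p in path.glob(glob) if p.is_file())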
def run(self):
    projects = get_target_project_dirs()
    for project in projects:
        self.clean_project(project)