def get_target_project_dirs(follow_links: bool = False) -> List[Path]:
    """
    Return all or a subset of the project directories, if that subset is
    configured.

    :param follow_links: If True, follow the symbolic accession links and
           return Path instances referring to the physical, UUID-based
           project directories. Otherwise, Path instances referring to the
           symbolic accession links will be returned.
    """
    projects_dir = Path('projects')
    accessions = get_skunk_accessions()
    symlinks = [
        path
        for path in projects_dir.iterdir()
        if path.is_dir() and path.is_symlink() and (
            accessions is None or path.name in accessions)
    ]
    # Validate the links even though, strictly speaking, it's only necessary
    # to follow them when follow_links is on.
    project_dirs = []
    for symlink in symlinks:
        project_dir = symlink.follow()
        assert project_dir.is_dir()
        assert not project_dir.is_symlink()
        assert not project_dir.is_absolute()
        assert project_dir.parent == projects_dir
        accession = symlink.name
        project_uuid = generate_project_uuid([accession])
        assert project_dir.name == project_uuid
        project_dirs.append(project_dir)
    return project_dirs if follow_links else symlinks
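

# Usage sketch (hypothetical, not part of the original module): list the
# physical project directories targeted by the current configuration. Assumes
# get_target_project_dirs() and its helpers are importable from this module
# and that projects/ contains the accession symlinks described above.
def _example_list_target_projects() -> None:
    for project_dir in get_target_project_dirs(follow_links=True):
        print(project_dir.name)  # the UUID-based directory name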
def download_supplementary_files(accession):
    """
    Scrape web page for given accession id and download all supplementary
    files
    """
    logging.info('---')
    project_uuid = generate_project_uuid([accession])
    logging.info('Checking project with accession %s and UUID %s for files to download ...',
                 accession, project_uuid)
    projects_path = Path('projects')
    geo_path = projects_path / project_uuid / 'geo'
    if not geo_path.exists():
        geo_path.mkdir(parents=True)
    create_or_update_symlink(projects_path / accession, Path(project_uuid))
    source_url = source_url_template + accession
    page = requests.get(source_url)
    links = supplementary_file_download_links(accession, page.text)
    if links:
        for file_name, url in links:
            file_path = geo_path / file_name
            if file_path.is_file():
                logging.info('Skipping existing file: %s', file_path)
            else:
                logging.info('Downloading to: %s', file_path)
                download_file(url, file_path)
    else:
        logging.info('No supplementary files found on %s', source_url)
def get_target_spreadsheets() -> MutableMapping[str, Path]:
    accessions = get_skunk_accessions()
    paths_by_accession = {}
    ext = '.0.xlsx'

    def get_accession_from_path(path):
        assert path.name.endswith(ext)
        return path.name[:-len(ext)]

    for sub_dir in ('existing', 'new'):
        src_dir = Path('spreadsheets') / sub_dir
        paths = list(src_dir.iterdir())
        paths = [
            path
            for path in paths
            if path.is_file() and path.name.endswith(ext)
        ]
        subdir_paths_by_accession = {
            get_accession_from_path(path): path
            for path in paths
        }
        assert len(paths) == len(subdir_paths_by_accession)
        subdir_paths_by_accession = {
            accession: path
            for accession, path in subdir_paths_by_accession.items()
            if accessions is None or accession in accessions
        }
        paths_by_accession.update(subdir_paths_by_accession)
    return paths_by_accession
def extract_file(src_path: Path, dest_path: Path, compression='tar'):
    """
    Extract a compressed file and put a completion file in the destination
    folder once complete. Skips extraction if a completion file is found in
    the destination folder.

    :param src_path: Path to a compressed file
    :param dest_path: Path to put extracted contents
    :param compression: Either 'tar' or 'zip'
    """
    completion_file = dest_path / '.complete'
    if completion_file.exists():
        logging.info('Expansion of %s already complete', src_path)
    else:
        if compression == 'tar':
            openmode = 'r:gz' if src_path.name.endswith('.tar.gz') else 'r'
            extractor = tarfile.TarFile.open(str(src_path), mode=openmode)
            assert completion_file.name not in extractor.getnames()
        elif compression == 'zip':
            extractor = zipfile.ZipFile(str(src_path), 'r')
        else:
            raise ValueError('Unsupported compression')
        with extractor:
            if dest_path.exists():
                logging.info('Removing partially expanded %s', dest_path)
                shutil.rmtree(str(dest_path))
            logging.info('Expanding %s', dest_path)
            dest_path.mkdir()
            extractor.extractall(str(dest_path))
            completion_file.touch()
            logging.info('Expansion of %s is complete', dest_path)
def extract_recursive(compressed_path: Path):
    """
    Recursively extract tar files into a folder located at the same path as
    the tar file.

    :param compressed_path: Path to a compressed file or folder containing
           one or more tar files
    """
    logging.debug('Running extract_recursive(%s)', compressed_path)
    if compressed_path.is_dir():
        logging.debug('Descending into directory %s', compressed_path)
        # Iterate over directory contents sorted by type with files first,
        # then directories. This is done to avoid wasting time processing a
        # directory that could itself be deleted and re-extracted from a tar
        # file.
        for file in sorted(compressed_path.iterdir(), key=methodcaller('is_dir')):
            extract_recursive(file)
    else:
        is_zip = compressed_path.name in ('experiment-metadata.zip',
                                          'marker-genes.zip',
                                          'normalised.zip',
                                          'quantification-raw.zip')
        is_tar = compressed_path.name.endswith(('.tar', '.tar.gz'))
        if is_tar or is_zip:
            logging.debug('Examining file %s', compressed_path)
            base_file_name = get_base_file_name(compressed_path.name)
            assert 0 < len(base_file_name) < len(compressed_path.name)
            dest_path = compressed_path.parent / base_file_name
            assert compressed_path.is_file()
            if is_tar:
                # Extract the tar file to a subfolder
                extract_file(compressed_path, dest_path, compression='tar')
                # Check subfolder for tar files and extract them
                extract_recursive(dest_path)
            elif is_zip:
                # This is a zip download from SCXA
                extract_file(compressed_path, dest_path, compression='zip')
            else:
                assert False
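

# Usage sketch (hypothetical, not part of the original module): expand every
# tar or known zip file below a single project's download directory. Assumes
# the projects/{uuid}/geo layout created by download_supplementary_files().
def _example_extract_project_downloads(project_dir: Path) -> None:
    extract_recursive(project_dir / 'geo')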
def convert_h5_to_mtx(input_file: Path, output_dir: Path) -> None:
    with h5py.File(str(input_file), mode='r') as h5:
        group = one(h5.values())
        m = Matrix.from_group(input_file, group)
        output_dir.mkdir(parents=True, exist_ok=True)  # FIXME: move to convert_matrices.py
        m.to_mtx(output_dir)
def get_file_count(path: Path, glob: str) -> int:
    """
    Return the count of files in a folder.

    :param path: A path to a folder to check
    :param glob: The glob pattern
    :return: Number of files counted
    """
    if path.exists() and path.is_dir():
        return sum(1 for f in path.glob(glob) if f.is_file())
    else:
        return 0
def download_file(url: str, path: Path):
    """
    Stream download the file from url and save it to path.
    """
    with requests.get(url, stream=True) as request:
        request.raise_for_status()
        with tempfile.NamedTemporaryFile(dir=str(path.parent), delete=False) as f:
            try:
                for chunk in request.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)
            except BaseException:
                Path(f.name).unlink()
                raise
            else:
                Path(f.name).rename(path)
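

# Usage sketch (hypothetical, not part of the original module): download a
# file to a destination directory. Because download_file() writes to a
# temporary file and renames it only on success, an interrupted download
# never leaves a partial file at the final path. The URL is a placeholder.
def _example_download(dest_dir: Path) -> None:
    dest_dir.mkdir(parents=True, exist_ok=True)
    download_file('https://example.com/some-file.tar.gz',
                  dest_dir / 'some-file.tar.gz')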
def write_gzip_file(output_file: Path, lines: Iterable):
    """
    Create/overwrite a gzipped text file.

    :param output_file: File to create
    :param lines: List/Iterator of strings to write to the file (a '\n' is
           added to each line)
    """
    temp_output_file = output_file.with_suffix(output_file.suffix + '.tmp')
    log.info('Writing %s ...', temp_output_file)
    try:
        # Using gzip.open(temp) directly creates an archive that causes
        # `gunzip -N` to extract the file under the name of the temporary
        # file even if the archive name is different. Therefore we must set
        # the internal file name manually and pass in an already open file
        # object for writing.
        with open(str(temp_output_file), 'wb') as f:
            with gzip.GzipFile(filename=output_file, fileobj=f) as z:
                with io.TextIOWrapper(z) as w:
                    for line in lines:
                        w.write(line + '\n')
    except BaseException:
        try:
            temp_output_file.unlink()
        except FileNotFoundError:
            pass
        raise
    else:
        log.info('Renaming %s to %s ...', temp_output_file, output_file)
        temp_output_file.rename(output_file)
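

# Usage sketch (hypothetical, not part of the original module): write a small
# gzipped TSV. The internal gzip file name is set to the final output name,
# so `gunzip -N` restores the intended name even though the data is first
# written to a .tmp file.
def _example_write_barcodes(output_dir: Path) -> None:
    barcodes = ['AAACCTG', 'AAACGGG']  # placeholder data
    write_gzip_file(output_dir / 'barcodes.tsv.gz', ['barcodes'] + barcodes)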
def write_mtx_file(rows_cols_count_line: str,
                   mtx_body_file: Path,
                   output_file: Path):
    """
    Write the final mtx file with a comment header line, the rows_cols_count
    line, and the mtx body from the previously written temp file.

    :param rows_cols_count_line: String containing
           "{num_genes} {num_cells} {total_values}"
    :param mtx_body_file: Path of the temp file containing data to be written
           to the body of the mtx file
    :param output_file: Path of the mtx file to be written
    """
    temp_output_file = output_file.with_suffix(output_file.suffix + '.tmp')
    log.info('Writing %s ...', temp_output_file)
    try:
        with gzip.open(temp_output_file, 'wb') as f:
            header_line = '%%MatrixMarket matrix coordinate integer general\n'
            f.write(header_line.encode())
            f.write((rows_cols_count_line + '\n').encode())
            with open_maybe_gz(mtx_body_file, 'rb') as temp_data:
                # Using a 1 MiB buffer should be faster than the default of 16 KiB
                copyfileobj(temp_data, f, length=2 ** 20)
    except BaseException:
        log.warning('Error writing %s ...', temp_output_file)
        try:
            temp_output_file.unlink()
        except FileNotFoundError:
            pass
        raise
    else:
        log.info('Renaming %s to %s ...', temp_output_file, output_file)
        temp_output_file.rename(output_file)
def main(argv):
    """
    Support for command line execution of convert_csv_to_mtx()
    """
    logging.basicConfig(format='%(asctime)s %(levelname)s:%(threadName)s:%(message)s',
                        level=logging.INFO)
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('csv_file', help='Input csv file')
    parser.add_argument('output_dir', help='Path to write output files')
    parser.add_argument('delimiter',
                        help='Delimiter character or keyword "comma", "space", "tab"')
    parser.add_argument('rows_are_genes',
                        help='"y" if rows are genes or "n" if columns are genes')
    args = parser.parse_args(argv)

    if not os.path.isfile(args.csv_file):
        log.error('File not found: %s', args.csv_file)
        parser.print_help()
        exit()

    if args.delimiter == 'comma':
        args.delimiter = ','
    elif args.delimiter == 'space':
        args.delimiter = ' '
    elif args.delimiter == 'tab':
        args.delimiter = '\t'
    if len(args.delimiter) != 1:
        log.error('Delimiter must be 1 char in length')

    if args.rows_are_genes not in ('y', 'n'):
        log.error('rows_are_genes must be "y" or "n"')
    args.rows_are_genes = args.rows_are_genes == 'y'

    converter = CSVConverter(Path(args.csv_file),
                             delimiter=args.delimiter,
                             rows_are_genes=args.rows_are_genes)
    converter.convert(Path(args.output_dir))
def convert(self, output_dir: Path):
    output_dir.mkdir(parents=True, exist_ok=True)  # FIXME: move to convert_matrices.py
    mtx_body_file = output_dir / 'matrix.mtx.body.gz'
    mtx_file = output_dir / 'matrix.mtx.gz'

    # Fully consume the iterator by writing the body of the mtx file to a
    # temp file
    write_gzip_file(mtx_body_file, self)

    # Write the completed mtx file using correct header information and the
    # body we wrote to the temp file
    rows_cols_count_line = f'{len(self.genes)} {len(self.barcodes)} {self.num_values}'
    write_mtx_file(rows_cols_count_line, mtx_body_file, mtx_file)
    mtx_body_file.unlink()

    # Write the two remaining files using the properties from the fully
    # consumed iterator
    write_gzip_file(output_dir / 'barcodes.tsv.gz', ['barcodes'] + self.barcodes)
    write_gzip_file(output_dir / 'genes.tsv.gz', ['genes'] + self.genes)

    print('Done.')
def update_project_stats(project_dir: Path):
    """
    Read a project's stats.json and yield its contents as a dict that will
    then be written back to the stats.json file.
    """
    stats_file = project_dir / 'stats.json'
    try:
        with open(str(stats_file), 'r') as f:
            stats = json.load(f)
    except FileNotFoundError:
        stats = {'project_uuid': generate_project_uuid(project_dir.name)}
    yield stats
    temporary_file = stats_file.with_suffix(stats_file.suffix + '.tmp')
    try:
        with open(str(temporary_file), 'w') as f:
            json.dump(stats, f, sort_keys=True, indent=4)
    except BaseException:
        try:
            temporary_file.unlink()
        except FileNotFoundError:
            pass
        raise
    else:
        logging.info('Writing to %s', stats_file)
        temporary_file.rename(stats_file)
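

# Usage sketch (hypothetical): the yield-based structure above suggests that
# update_project_stats() is used as a context manager (e.g. wrapped with
# contextlib.contextmanager elsewhere). A caller would mutate the yielded
# dict, and the updated stats would be written back on exit:
#
#     with update_project_stats(project_dir) as stats:
#         stats['geo_files'] = get_file_count(project_dir / 'geo', '*')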
def create_or_update_symlink(symlink: Path, target: Path):
    if symlink.is_symlink():
        # noinspection PyUnresolvedReferences
        current_target = symlink.readlink()
        if current_target == target:
            return
        logging.warning('Removing stale symlink from %s to %s.',
                        symlink, current_target)
        symlink.unlink()
    elif symlink.exists():
        raise RuntimeError(f'Will not overwrite {symlink} with link to {target}')
    logging.info('Linking %s to %s', symlink, target)
    symlink.symlink_to(target, target_is_directory=True)
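

# Usage sketch (hypothetical, not part of the original module): link an
# accession to its UUID-based project directory, mirroring the call in
# download_supplementary_files() above. The target is given relative to the
# symlink's parent directory, i.e. projects/.
def _example_link_accession(accession: str, project_uuid: str) -> None:
    create_or_update_symlink(Path('projects') / accession, Path(project_uuid))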
def file_uuid_callback(file_path: str):
    file_path = Path(file_path)
    file_name = file_path.name
    file_uuid = generate_file_uuid(bundle_uuid, file_name)
    log.info('Allocated UUID %s for file %s', file_uuid, file_path)
    if file_name.endswith('.json'):
        with file_path.open('rt') as f:
            document = json.load(f)
            if file_name == 'links.json':
                pass
            elif file_name == 'project_0.json':
                assert document['provenance']['document_id'] == bundle_uuid
            else:
                assert document['provenance']['document_id'] == file_uuid
    return file_uuid
def clean_project(self, project_dir: Path):
    log.info('Looking for artifacts to clean in project %s ...', project_dir)
    for glob in self.args.artifacts:
        for artifact in project_dir.glob(glob):
            if artifact.is_dir():
                if self.args.dry_run:
                    log.info(' Would recursively remove directory %s', artifact)
                else:
                    log.info(' Recursively removing directory %s', artifact)
                    shutil.rmtree(artifact)
            else:
                if self.args.dry_run:
                    log.info(' Would remove file %s', artifact)
                else:
                    log.info(' Removing file %s', artifact)
                    artifact.unlink()
def run(xlsx, output_dir=None, clear=True):
    wb = load_workbook(xlsx)
    project_data = parse_project_data_from_xlsx(wb)
    project_json, project_uuid = create_project_json(project_data, version=timestamp())
    root = f'projects/{project_uuid}'
    matrix_file = f'{root}/bundle/matrix.mtx.zip'
    output_dir = f'{root}/bundle' if not output_dir else output_dir
    if clear and os.path.exists(output_dir):
        remove_previous_metadata(output_dir=output_dir)
    write_project_json(project_json, output_dir)
    bundle_uuid = copy.deepcopy(project_uuid)
    if os.path.exists(matrix_file):
        generate_analysis_json(bundle_uuid=bundle_uuid, output_dir=output_dir)
    cell_count = CountCells.get_cached_cell_count(Path(output_dir))
    generate_cell_suspension_json(wb=wb,
                                  output_dir=output_dir,
                                  cell_count=cell_count,
                                  bundle_uuid=bundle_uuid)
    generate_specimen_from_organism_jsons(wb=wb,
                                          output_dir=output_dir,
                                          bundle_uuid=bundle_uuid)
    generate_donor_organism_jsons(wb=wb,
                                  output_dir=output_dir,
                                  bundle_uuid=bundle_uuid)
    generate_library_preparation_protocol_json(wb=wb,
                                               output_dir=output_dir,
                                               bundle_uuid=bundle_uuid)
    generate_sequencing_protocol_json(wb=wb,
                                      output_dir=output_dir,
                                      bundle_uuid=bundle_uuid)
    # generate_analysis_protocol_json(output_dir=output_dir,
    #                                 bundle_uuid=bundle_uuid)
    generate_links_json(output_dir)
import os

from _pathlib import Path

cwd = Path(os.getcwd())
child_links = [x for x in cwd.iterdir() if x.is_symlink()]
ids = [(os.readlink(str(link)), link) for link in child_links]
ids = sorted(ids)

for uuid, geo in ids:
    print(f'''
class {geo.name}(Converter):
    """
    {uuid}
    """

    def _convert(self):
        raise NotImplementedError()
''')
file_type: str
zipped: bool

def to_url(self):
    base_url = 'https://www.ebi.ac.uk/gxa/sc/experiment'
    return f"{base_url}/{self.accession}/download{'/zip' if self.zipped else ''}?fileType={self.file_type}"

def idempotent_download(self, path) -> bool:
    name = self.file_type + ('.zip' if self.zipped else '')
    file_path = path / name
    if not file_path.exists():
        log.info('Downloading new file `%s` from URL `%s`', file_path, self.to_url())
        try:
            download_file(self.to_url(), file_path)
            return True
        except Exception:
            log.warning('Failed to download file `%s` from URL `%s`',
                        file_path, self.to_url(), exc_info=True)
            return False
    else:
        log.info('Skipping download of file `%s`', file_path)
        return True


if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s',
                        level=logging.INFO)
    download_projects_parallel(Path('projects'))
    Sequence,
)
from uuid import UUID

from count_cells import CountCells
from create_project import (
    generate_project_uuid,
)
from util import (
    get_target_project_dirs,
    get_target_spreadsheets,
)

logging.basicConfig(level=logging.INFO)

projects_path = Path('projects')


@dataclass
class ProjectReport:
    uuid: UUID = None
    accession: str = None
    project_path: Path = None  # projects/{uuid}
    symlink: Path = None  # projects/{accession} symlink to project_path
    spreadsheet: Path = None  # spreadsheets/(new|existing)/{accession}.0.xlsx
    geo_files: int = 0  # number of downloaded geo files in projects/{uuid}/geo
    num_matrices: int = 0  # number of matrices in projects/{uuid}/matrices
    zipped_matrix: Path = None  # projects/{uuid}/bundle/matrix.mtx.zip
    cell_count: int = 0  # number of cells counted
    gene_count: int = 0  # number of genes counted
    num_metadata_files: int = 0  # number of metadata JSON files in projects/{uuid}/bundle