Example #1
def get_target_project_dirs(follow_links: bool = False) -> List[Path]:
    """
    Return all or a subset of the project directories, if that subset is
    configured.

    :param follow_links: If True, follow the symbolic accession links and return
                         Path instances referring to the physical, UUID-based
                         project directories. Otherwise Path instances referring
                         to the symbolic accession links will be returned.
    """
    projects_dir = Path('projects')
    accessions = get_skunk_accessions()
    symlinks = [
        path for path in projects_dir.iterdir()
        if path.is_dir() and path.is_symlink() and (
            accessions is None or path.name in accessions)
    ]

    # Validate the links even though, strictly speaking, it's only necessary
    # to follow them when follow_links is on.
    project_dirs = []
    for symlink in symlinks:
        project_dir = symlink.follow()
        assert project_dir.is_dir()
        assert not project_dir.is_symlink()
        assert not project_dir.is_absolute()
        assert project_dir.parent == projects_dir
        accession = symlink.name
        project_uuid = generate_project_uuid([accession])
        assert project_dir.name == project_uuid
        project_dirs.append(project_dir)

    return project_dirs if follow_links else symlinks
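A minimal usage sketch of this helper; the `util` module it is imported from here is an assumption, though it matches the imports shown in the last fragment of this listing:

from util import get_target_project_dirs

# Accession-named symlinks, e.g. projects/GSE131685 (accession is hypothetical)
links = get_target_project_dirs()
# The same projects as physical, UUID-named directories, e.g. projects/<uuid>
dirs = get_target_project_dirs(follow_links=True)
for link, project_dir in zip(links, dirs):
    print(link.name, '->', project_dir.name)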
Example #2
def download_supplementary_files(accession):
    """
    Scrape web page for given accession id and download all supplementary files
    """
    logging.info('---')
    project_uuid = generate_project_uuid([accession])
    logging.info(
        'Checking project with accession %s and UUID %s for files to download ...',
        accession, project_uuid)

    projects_path = Path('projects')
    geo_path = projects_path / project_uuid / 'geo'
    if not geo_path.exists():
        geo_path.mkdir(parents=True)
    create_or_update_symlink(projects_path / accession, Path(project_uuid))

    source_url = source_url_template + accession
    page = requests.get(source_url)
    links = supplementary_file_download_links(accession, page.text)
    if links:
        for file_name, url in links:
            file_path = geo_path / file_name
            if file_path.is_file():
                logging.info('Skipping existing file: %s', file_path)
            else:
                logging.info('Downloading to: %s', file_path)
                download_file(url, file_path)
    else:
        logging.info('No supplementary files found on %s', source_url)
Example #3
def get_target_spreadsheets() -> MutableMapping[str, Path]:
    accessions = get_skunk_accessions()
    paths_by_accession = {}
    ext = '.0.xlsx'

    def get_accession_from_path(path):
        assert path.name.endswith(ext)
        return path.name[:-len(ext)]

    for sub_dir in ('existing', 'new'):
        src_dir = Path('spreadsheets') / sub_dir
        paths = list(src_dir.iterdir())
        paths = [
            path for path in paths
            if path.is_file() and path.name.endswith(ext)
        ]
        subdir_paths_by_accession = {
            get_accession_from_path(path): path
            for path in paths
        }
        assert len(paths) == len(subdir_paths_by_accession)
        subdir_paths_by_accession = {
            accession: path
            for accession, path in subdir_paths_by_accession.items()
            if accessions is None or accession in accessions
        }
        paths_by_accession.update(subdir_paths_by_accession)
    return paths_by_accession
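To make the accession extraction concrete, a tiny self-contained sketch with a hypothetical spreadsheet name:

ext = '.0.xlsx'
name = 'GSE131685.0.xlsx'   # hypothetical file name
assert name.endswith(ext)
print(name[:-len(ext)])     # prints: GSE131685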
Example #4
def extract_file(src_path: Path, dest_path: Path, compression='tar'):
    """
    Extract a compressed file and place a completion file in the destination
    folder once complete. Extraction is skipped if a completion file is found
    in the destination folder.

    :param src_path: Path to a compressed file
    :param dest_path: Path to put extracted contents
    :param compression: Either 'tar' or 'zip'
    """
    completion_file = dest_path / '.complete'
    if completion_file.exists():
        logging.info('Expansion of %s already complete', src_path)
    else:
        if compression == 'tar':
            openmode = 'r:gz' if src_path.name.endswith('.tar.gz') else 'r'
            extractor = tarfile.TarFile.open(str(src_path), mode=openmode)
            assert completion_file.name not in extractor.getnames()
        elif compression == 'zip':
            extractor = zipfile.ZipFile(str(src_path), 'r')
        else:
            raise ValueError('Unsupported compression')
        with extractor:
            if dest_path.exists():
                logging.info('Removing partially expanded %s', dest_path)
                shutil.rmtree(str(dest_path))
            logging.info('Expanding %s', dest_path)
            dest_path.mkdir()
            extractor.extractall(str(dest_path))
            completion_file.touch()
            logging.info('Expansion of %s is complete', dest_path)
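A short usage sketch of the completion-marker pattern above; the paths are hypothetical and Path/extract_file are assumed to be in scope:

src = Path('projects/GSE131685/geo/GSE131685_RAW.tar')   # hypothetical archive
dest = src.parent / 'GSE131685_RAW'
extract_file(src, dest, compression='tar')   # extracts and touches dest/.complete
extract_file(src, dest, compression='tar')   # now a no-op: logs "already complete"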
Example #5
def extract_recursive(compressed_path: Path):
    """
    Recursively extract tar files into a folder located at the same path as the tar file

    :param compressed_path: Path to a compressed file or folder containing one or more tar files
    """
    logging.debug('Running extract_recursive(%s)', compressed_path)
    if compressed_path.is_dir():
        logging.debug('Descending into directory %s', compressed_path)
        # Iterate over directory contents sorted by type with files first, then
        # directories. This is done to avoid wasting time processing a directory
        # that could itself be deleted and re-extracted from a tar file.
        for file in sorted(compressed_path.iterdir(), key=methodcaller('is_dir')):
            extract_recursive(file)
    else:
        is_zip = compressed_path.name in (
            'experiment-metadata.zip',
            'marker-genes.zip',
            'normalised.zip',
            'quantification-raw.zip')
        is_tar = compressed_path.name.endswith(('.tar', '.tar.gz'))
        if is_tar or is_zip:
            logging.debug('Examining file %s', compressed_path)
            base_file_name = get_base_file_name(compressed_path.name)
            assert 0 < len(base_file_name) < len(compressed_path.name)
            dest_path = compressed_path.parent / base_file_name
            assert compressed_path.is_file()
            if is_tar:
                extract_file(compressed_path, dest_path, compression='tar')  # Extract the tar file to a subfolder
                extract_recursive(dest_path)  # Check subfolder for tar files and extract them
            elif is_zip:
                # This is a zip download from SCXA
                extract_file(compressed_path, dest_path, compression='zip')
            else:
                assert False
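The sort key used above is worth a standalone illustration: is_dir() returns a bool, and False sorts before True, so plain files are visited before directories:

from operator import methodcaller
from pathlib import Path

entries = sorted(Path('.').iterdir(), key=methodcaller('is_dir'))
# Every non-directory entry now precedes every directory entry.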
def convert_h5_to_mtx(input_file: Path, output_dir: Path) -> None:
    with h5py.File(str(input_file), mode='r') as h5:
        group = one(h5.values())
        m = Matrix.from_group(input_file, group)
        output_dir.mkdir(parents=True,
                         exist_ok=True)  # FIXME: move to convert_matrices.py
        m.to_mtx(output_dir)
def get_file_count(path: Path, glob: str) -> int:
    """
    Return the count of files in a folder

    :param path: A path to a folder to check
    :param glob: The glob pattern

    :return: Number of files counted
    """
    if path.exists() and path.is_dir():
        return sum(1 for f in path.glob(glob) if f.is_file())
    else:
        return 0
Example #8
def download_file(url: str, path: Path):
    """
    Stream-download the file from url and save it to path
    """
    with requests.get(url, stream=True) as request:
        request.raise_for_status()
        with tempfile.NamedTemporaryFile(dir=str(path.parent),
                                         delete=False) as f:
            try:
                for chunk in request.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)
            except BaseException:
                Path(f.name).unlink()
                raise
            else:
                Path(f.name).rename(path)
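A usage sketch with a hypothetical URL and path; because the response is streamed into a temporary file in the destination directory and only renamed on success, `path` never holds a partial download:

download_file('https://example.org/suppl/GSE131685_RAW.tar',
              Path('projects/GSE131685/geo/GSE131685_RAW.tar'))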
Example #9
def write_gzip_file(output_file: Path, lines: Iterable):
    """
    Create/overwrite a gzipped text file

    :param output_file: File to create
    :param lines: List/Iterator of strings to write to file (a '\n' is added to each line)
    """
    temp_output_file = output_file.with_suffix(output_file.suffix + '.tmp')
    log.info('Writing %s ...', temp_output_file)
    try:
        # Using gzip.open(temp) directly creates an archive that causes
        # `gunzip -N` to extract the file under the name of the temporary file
        # even if the archive name is different. Therefore we must set the
        # internal file name manually and pass in an already open file object
        # for writing.
        with open(str(temp_output_file), 'wb') as f:
            with gzip.GzipFile(filename=output_file, fileobj=f) as z:
                with io.TextIOWrapper(z) as w:
                    for line in lines:
                        w.write(line + '\n')
    except BaseException:
        try:
            temp_output_file.unlink()
        except FileNotFoundError:
            pass
        raise
    else:
        log.info('Renaming %s to %s ...', temp_output_file, output_file)
        temp_output_file.rename(output_file)
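The `gunzip -N` remark deserves a self-contained sketch (file names are hypothetical): the gzip header records the name passed via filename=, not the name of the temporary file actually being written:

import gzip

with open('data.txt.gz.tmp', 'wb') as f:
    with gzip.GzipFile(filename='data.txt.gz', fileobj=f) as z:
        z.write(b'hello\n')
# After renaming data.txt.gz.tmp to data.txt.gz, `gunzip -N data.txt.gz`
# restores the file as data.txt rather than data.txt.gz.tmp.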
Example #10
def write_mtx_file(rows_cols_count_line: str, mtx_body_file: Path,
                   output_file: Path):
    """
    Write the final mtx file with comment header line, the rows_cols_count line, and
    the mtx body from previously written temp file

    :param rows_cols_count_line: String containing "{num_genes} {num_cells} {total_values}"
    :param mtx_body_file: Path of the temp file containing data to be written to the body mtx file
    :param output_file: Path of the mtx file to be written
    """
    temp_output_file = output_file.with_suffix(output_file.suffix + '.tmp')
    log.info('Writing %s ...', temp_output_file)
    try:
        with gzip.open(temp_output_file, 'wb') as f:
            header_line = '%%MatrixMarket matrix coordinate integer general\n'
            f.write(header_line.encode())
            f.write((rows_cols_count_line + '\n').encode())
            with open_maybe_gz(mtx_body_file, 'rb') as temp_data:
                # Using 1MiB buffer should be faster than the default of 16KiB
                copyfileobj(temp_data, f, length=2**20)
    except BaseException:
        log.warning('Error writing %s ...', temp_output_file)
        try:
            temp_output_file.unlink()
        except FileNotFoundError:
            pass
        raise
    else:
        log.info('Renaming %s to %s ...', temp_output_file, output_file)
        temp_output_file.rename(output_file)
Example #11
def main(argv):
    """
    Support for command line execution of convert_csv_to_mtx()
    """
    logging.basicConfig(
        format='%(asctime)s %(levelname)s:%(threadName)s:%(message)s',
        level=logging.INFO)

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('csv_file', help='Input csv file')
    parser.add_argument('output_dir', help='Path to write output files')
    parser.add_argument(
        'delimiter',
        help='Delimiter character or keyword "comma", "space", "tab"')
    parser.add_argument(
        'rows_are_genes',
        help='"y" if rows are genes or "n" if columns are genes')
    args = parser.parse_args(argv)

    if not os.path.isfile(args.csv_file):
        log.error('File not found: %s', args.csv_file)
        parser.print_help()
        exit()

    if args.delimiter == 'comma':
        args.delimiter = ','
    elif args.delimiter == 'space':
        args.delimiter = ' '
    elif args.delimiter == 'tab':
        args.delimiter = '\t'

    if len(args.delimiter) < 1:
        log.error('Delimiter must be 1 char in length')

    if args.rows_are_genes not in ('y', 'n'):
        log.error('rows_are_genes must be "y" or "n"')

    args.rows_are_genes = args.rows_are_genes == 'y'

    converter = CSVConverter(Path(args.csv_file),
                             delimiter=args.delimiter,
                             rows_are_genes=args.rows_are_genes)
    converter.convert(Path(args.output_dir))
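A hypothetical invocation of this entry point, converting a comma-delimited CSV whose rows are genes:

main(['expression.csv', 'out_dir', 'comma', 'y'])   # file and directory names are hypothetical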
Example #12
    def convert(self, output_dir: Path):
        output_dir.mkdir(parents=True,
                         exist_ok=True)  # FIXME: move to convert_matrices.py

        mtx_body_file = output_dir / 'matrix.mtx.body.gz'
        mtx_file = output_dir / 'matrix.mtx.gz'

        # Fully consume the iterator by writing the body of the mtx file to a temp file
        write_gzip_file(mtx_body_file, self)

        # Write the completed mtx file using correct header information and the body we wrote to the temp file
        rows_cols_count_line = f'{len(self.genes)} {len(self.barcodes)} {self.num_values}'
        write_mtx_file(rows_cols_count_line, mtx_body_file, mtx_file)
        mtx_body_file.unlink()

        # Write the two remaining files using the properties from the fully consumed iterator
        write_gzip_file(output_dir / 'barcodes.tsv.gz',
                        ['barcodes'] + self.barcodes)
        write_gzip_file(output_dir / 'genes.tsv.gz', ['genes'] + self.genes)

        print('Done.')
Example #13
def update_project_stats(project_dir: Path):
    """
    Read a project's stats.json and yield contents as a dict
    that will then be written back to the stats.json file.
    """
    stats_file = project_dir / 'stats.json'
    try:
        with open(str(stats_file), 'r') as f:
            stats = json.load(f)
    except FileNotFoundError:
        stats = {'project_uuid': generate_project_uuid(project_dir.name)}

    yield stats

    temporary_file = stats_file.with_suffix(stats_file.suffix + '.tmp')
    try:
        with open(str(temporary_file), 'w') as f:
            json.dump(stats, f, sort_keys=True, indent=4)
    except BaseException:
        Path(f.name).unlink()
        raise
    else:
        logging.info('Writing to %s', stats_file)
        Path(f.name).rename(stats_file)
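A usage sketch; this assumes the generator is wrapped with contextlib.contextmanager in the full source, which this excerpt does not show:

with update_project_stats(Path('projects') / project_uuid) as stats:  # project_uuid as in Example #2
    stats['cell_count'] = 4096   # hypothetical entry
# On exit, the updated dict is written to stats.json.tmp and renamed over stats.json.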
Example #14
def create_or_update_symlink(symlink: Path, target: Path):
    if symlink.is_symlink():
        # noinspection PyUnresolvedReferences
        current_target = symlink.readlink()
        if current_target == target:
            return
        logging.warning('Removing stale symlink from %s to %s.', symlink,
                        current_target)
        symlink.unlink()
    elif symlink.exists():
        raise RuntimeError(
            f'Will not overwrite {symlink} with link to {target}')
    logging.info('Linking %s to %s', symlink, target)
    symlink.symlink_to(target, target_is_directory=True)
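A hypothetical call mirroring Example #2, linking an accession name to its UUID-named directory with a relative target:

create_or_update_symlink(Path('projects') / accession, Path(project_uuid))
# Re-running with the same target returns immediately; a different target replaces the stale link.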
Example #15
 def file_uuid_callback(file_path: str):
     file_path = Path(file_path)
     file_name = file_path.name
     file_uuid = generate_file_uuid(bundle_uuid, file_name)
     log.info('Allocated UUID %s for file %s', file_uuid, file_path)
     if file_name.endswith('.json'):
         with file_path.open('rt') as f:
             document = json.load(f)
             if file_name == 'links.json':
                 pass
             elif file_name == 'project_0.json':
                 assert document['provenance'][
                     'document_id'] == bundle_uuid
             else:
                 assert document['provenance'][
                     'document_id'] == file_uuid
     return file_uuid
Example #16
 def clean_project(self, project_dir: Path):
     log.info('Looking for artifacts to clean in project %s. ...',
              project_dir)
     for glob in self.args.artifacts:
         for artifact in project_dir.glob(glob):
             if artifact.is_dir():
                 if self.args.dry_run:
                     log.info('    Would recursively remove directory %s',
                              artifact)
                 else:
                     log.info('    Recursively removing directory %s',
                              artifact)
                     shutil.rmtree(artifact)
             else:
                 if self.args.dry_run:
                     log.info('    Would remove file %s', artifact)
                 else:
                     log.info('    Removing file %s', artifact)
                     artifact.unlink()
def run(xlsx, output_dir=None, clear=True):
    wb = load_workbook(xlsx)

    project_data = parse_project_data_from_xlsx(wb)
    project_json, project_uuid = create_project_json(project_data, version=timestamp())

    root = f'projects/{project_uuid}'
    matrix_file = f'{root}/bundle/matrix.mtx.zip'
    output_dir = f'{root}/bundle' if not output_dir else output_dir

    if clear and os.path.exists(output_dir):
        remove_previous_metadata(output_dir=output_dir)

    write_project_json(project_json, output_dir)
    bundle_uuid = copy.deepcopy(project_uuid)

    if os.path.exists(matrix_file):
        generate_analysis_json(bundle_uuid=bundle_uuid, output_dir=output_dir)

    cell_count = CountCells.get_cached_cell_count(Path(output_dir))

    generate_cell_suspension_json(wb=wb,
                                  output_dir=output_dir,
                                  cell_count=cell_count,
                                  bundle_uuid=bundle_uuid)
    generate_specimen_from_organism_jsons(wb=wb,
                                          output_dir=output_dir,
                                          bundle_uuid=bundle_uuid)
    generate_donor_organism_jsons(wb=wb,
                                  output_dir=output_dir,
                                  bundle_uuid=bundle_uuid)
    generate_library_preparation_protocol_json(wb=wb,
                                               output_dir=output_dir,
                                               bundle_uuid=bundle_uuid)
    generate_sequencing_protocol_json(wb=wb,
                                      output_dir=output_dir,
                                      bundle_uuid=bundle_uuid)
    # generate_analysis_protocol_json(output_dir=output_dir,
    #                                 bundle_uuid=bundle_uuid)
    generate_links_json(output_dir)
import os
from _pathlib import Path


cwd = Path(os.getcwd())
child_links = [x for x in cwd.iterdir() if x.is_symlink()]
ids = [(os.readlink(str(link)), link) for link in child_links]
ids = sorted(ids)

for uuid, geo in ids:
    print(f'''
class {geo.name}(Converter):
    """
    {uuid}
    """
    
    def _convert(self):
        raise NotImplementedError()

''')
    file_type: str
    zipped: bool

    def to_url(self):
        base_url = 'https://www.ebi.ac.uk/gxa/sc/experiment'
        return f"{base_url}/{self.accession}/download{'/zip' if self.zipped else ''}?fileType={self.file_type}"

    def idempotent_download(self, path) -> bool:
        name = self.file_type + ('.zip' if self.zipped else '')
        file_path = path / name
        if not file_path.exists():
            log.info('Downloading new file `%s` from URL `%s`', file_path,
                     self.to_url())
            try:
                download_file(self.to_url(), file_path)
                return True
            except Exception:
                log.warning('Failed to download file `%s` from URL `%s`',
                            file_path, self.to_url(), exc_info=True)
                return False
        else:
            log.info('Skipping download of file `%s`', file_path)
            return True


if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s',
                        level=logging.INFO)

    download_projects_parallel(Path('projects'))
from typing import (
    Sequence,
)
from uuid import UUID

from count_cells import CountCells
from create_project import (
    generate_project_uuid,
)
from util import (
    get_target_project_dirs,
    get_target_spreadsheets,
)

logging.basicConfig(level=logging.INFO)

projects_path = Path('projects')


@dataclass
class ProjectReport:
    uuid: UUID = None
    accession: str = None
    project_path: Path = None  # projects/{uuid}
    symlink: Path = None  # projects/{accession} symlink to project_path
    spreadsheet: Path = None  # spreadsheets/(new|existing)/{accession}.0.xlsx
    geo_files: int = 0  # number of downloaded geo files in projects/{uuid}/geo
    num_matrices: int = 0  # number of matrices in projects/{uuid}/matrices
    zipped_matrix: Path = None  # projects/{uuid}/bundle/matrix.mtx.zip
    cell_count: int = 0  # number of cells counted
    gene_count: int = 0  # number of genes counted
    num_metadata_files: int = 0  # number of metadata JSON files in projects/{uuid}/bundle