Example #1
0
def _file_gunzipper(id_list: List[str], genomes_metadata: pd.DataFrame,
                    genomes_directory: str) -> None:
    """Unzips the .gz file corresponding to each id in id_list"""

    for id_ in id_list:
        filename = get_zipped_fasta_name(id_, genomes_metadata)
        fna_gz_filename = os.path.join(genomes_directory, id_, filename)
        fna_filename = gz_stripper(fna_gz_filename)
        _gunzip(fna_gz_filename, fna_filename)
Example #2
0
def _ensure_all_data(id_list: List[str], genomes_metadata: pd.DataFrame,
                     output_directory: str) -> List[str]:
    """Downloads and unzips whatever is missing for the requested ids and
    returns the path of each genome

    Parameters
    ----------
    id_list
        A list of assembly accession id's to ensure from NCBI
    genomes_metadata
        Table containing metadata containing NCBI ID's and ftp links
    output_directory
        Directory to look for and save data into

    Returns
    -------

    List[str]
        One path per requested id, in the order of `id_list`, of the form
        ``<output_directory>/<id>/<fasta name>``

    Raises
    ------

    ValueError
        Not raised here directly; may propagate from the helpers that are
        called (e.g. `_get_ids_not_downloaded` when asked to unzip an id
        whose archive is not present)

    """

    # TODO flat directory structure (applies to every helper call below)

    # step 1: fetch the .fna.gz of every id that has neither a .fna nor a
    # .fna.gz on disk yet
    missing_ids = _get_ids_not_downloaded(id_list, genomes_metadata,
                                          output_directory)
    _ncbi_ftp_downloader(missing_ids, genomes_metadata, output_directory)

    # step 2: unzip every archive that does not have its .fna next to it
    zipped_ids = _get_ids_not_downloaded(id_list, genomes_metadata,
                                         output_directory, to_unzip=True)
    _file_gunzipper(zipped_ids, genomes_metadata, output_directory)

    # step 3: build the path of the unzipped file for each requested id
    genome_paths = []
    for accession in id_list:
        zipped_name = get_zipped_fasta_name(accession, genomes_metadata)
        fasta_name = gz_stripper(zipped_name)
        genome_paths.append(
            os.path.join(output_directory, accession, fasta_name))

    return genome_paths
Example #3
0
def _ncbi_ftp_downloader(id_list: List[str], genomes_metadata: pd.DataFrame,
                         genomes_directory: str) -> bool:
    """
    Opens an FTP and downloads the genomes of all ids in `id_list`
    """

    ftp = FTP('ftp.ncbi.nih.gov')
    ftp.login(user='******', passwd='*****@*****.**')
    for id_ in id_list:
        abspath = _ftp_path(id_, genomes_metadata)
        ftp_dir = get_ftp_dir(abspath)
        ftp.cwd(ftp_dir)
        filename = get_zipped_fasta_name(id_, genomes_metadata)
        local_dir = os.path.join(genomes_directory, id_, filename)
        ftp.retrbinary("RETR " + filename, open(local_dir, 'wb').write)

    return True
Example #4
0
 def test_get_zipped_fasta_name(self):
     """The zipped fasta name of a known id matches the expected file name

     Relies on `self.metadata`, which is prepared outside this method.
     """
     accession = 'GCF_000001765.3'
     expected_name = 'GCF_000001765.3_Dpse_3.0_genomic.fna.gz'
     observed_name = get_zipped_fasta_name(accession, self.metadata)
     self.assertEqual(observed_name, expected_name)
Example #5
0
def _get_ids_not_downloaded(id_list: List[str],
                            genomes_metadata: pd.DataFrame,
                            genomes_directory: Optional[str],
                            to_unzip: Optional[bool] = False) -> List[str]:
    """Returns ID's from id_list that need to be downloaded/unzipped (do not
    already exist in `genomes_directory`)

    Parameters
    ----------
    id_list
        List of Assembly accession id's being requested from
    genomes_metadata
        Table containing metadata containing NCBI ID's and ftp links
    genomes_directory
        Directory to look for and save data into to
    to_unzip
        False if downloading, True if unzipping

    Returns
    -------

    List[str]
        Assembly accession id's that need to be downloaded from NCBI

    Raises
    ------

    ValueError
        If a supplied id is not contained in the requested metadata table

    """

    # if fasta_only, we are checking for ids to gunzip
    # else, we are checking to see if we need to download the .gz file
    # results in the following truth table:
    # to_download | .fna exists | .fna.gz exists | fasta_only
    #       T     |      F      |        F       |    F
    #       F     |      F      |        T       |    F
    #       F     |      T      |        T       |    F
    #       F     |      T      |        F       |    F
    #  err<-F     |      F      |        F       |    T
    #       T     |      F      |        T       |    T
    #       F     |      T      |        T       |    T
    #       F     |      T      |        F       |    T
    # throws value error if a requested id does not have a .fna or .fna.gz
    # and is asked whether it should gunzip the .fna.gz, likely indicates a
    # failed download earlier on

    # default save data in current directory
    if genomes_directory is None:
        genomes_directory = os.path.curdir()

    ids_to_download = []
    for id_ in id_list:
        # TODO make expected local_dir just genomes_directory...
        expected_local_dir = os.path.join(genomes_directory, id_)
        fasta_gz_name = get_zipped_fasta_name(id_, genomes_metadata)
        fasta_name = gz_stripper(fasta_gz_name)

        # will fail if the directory is not found
        try:
            existing_files = set(os.listdir(expected_local_dir))
            # TODO play with logic for readability
            fasta_present = fasta_name in existing_files
            fasta_gz_present = fasta_gz_name in existing_files
            # the logic here is a little tricky, see note above
            # TODO maybe flip clauses
            #  if need to download?
            if not to_unzip:
                if not fasta_present and not fasta_gz_present:
                    ids_to_download.append(id_)
            # if we need to unzip?
            else:
                if not fasta_present and fasta_gz_present:
                    ids_to_download.append(id_)
                elif not fasta_present:
                    raise ValueError('Cannot gunzip when .gz file is not '
                                     'present, ID: {}'.format(id_))

        # if the directory does not exist, then create a directory for it
        # and add the id to the download list
        # TODO: more specific error catching? or if statement for os.isdir?
        except FileNotFoundError:
            os.makedirs(expected_local_dir, exist_ok=True)
            ids_to_download.append(id_)

    return ids_to_download