def _file_gunzipper(id_list: List[str], genomes_metadata: pd.DataFrame, genomes_directory: str) -> None: """Unzips the .gz file corresponding to each id in id_list""" for id_ in id_list: filename = get_zipped_fasta_name(id_, genomes_metadata) fna_gz_filename = os.path.join(genomes_directory, id_, filename) fna_filename = gz_stripper(fna_gz_filename) _gunzip(fna_gz_filename, fna_filename)
def _ensure_all_data(id_list: List[str], genomes_metadata: pd.DataFrame,
                     output_directory: str) -> List[str]:
    """Download and unzip whatever is missing, then report the genome paths

    Parameters
    ----------
    id_list
        A list of assembly accession id's to ensure from NCBI
    genomes_metadata
        Table containing metadata containing NCBI ID's and ftp links
    output_directory
        Directory to look for and save data into

    Returns
    -------
    List[str]
        One path per requested id, in the order of `id_list`, of the form
        `<output_directory>/<id>/<unzipped fasta name>`

    Raises
    ------
    ValueError
        Propagated from `_get_ids_not_downloaded` when an id has no .gz
        file to unzip. The original documentation also lists ids missing
        from the metadata table; that lookup happens in helpers defined
        elsewhere.
    """
    # Step 1: fetch the .fna.gz of every id that has neither a .fna nor a
    # .fna.gz on disk yet
    # TODO flat directory structure
    missing_ids = _get_ids_not_downloaded(id_list, genomes_metadata,
                                          output_directory)
    _ncbi_ftp_downloader(missing_ids, genomes_metadata, output_directory)

    # Step 2: decompress every archive that has not been unzipped yet
    # TODO flat directory
    zipped_ids = _get_ids_not_downloaded(id_list, genomes_metadata,
                                         output_directory, to_unzip=True)
    _file_gunzipper(zipped_ids, genomes_metadata, output_directory)

    # Step 3: build the expected location of each unzipped genome
    # TODO flat directory structure for `get_zipped_fasta_name`
    fasta_names = [gz_stripper(get_zipped_fasta_name(id_, genomes_metadata))
                   for id_ in id_list]
    return [os.path.join(output_directory, id_, fasta_name)
            for id_, fasta_name in zip(id_list, fasta_names)]
def _ncbi_ftp_downloader(id_list: List[str], genomes_metadata: pd.DataFrame, genomes_directory: str) -> bool: """ Opens an FTP and downloads the genomes of all ids in `id_list` """ ftp = FTP('ftp.ncbi.nih.gov') ftp.login(user='******', passwd='*****@*****.**') for id_ in id_list: abspath = _ftp_path(id_, genomes_metadata) ftp_dir = get_ftp_dir(abspath) ftp.cwd(ftp_dir) filename = get_zipped_fasta_name(id_, genomes_metadata) local_dir = os.path.join(genomes_directory, id_, filename) ftp.retrbinary("RETR " + filename, open(local_dir, 'wb').write) return True
def test_get_zipped_fasta_name(self):
    """The zipped fasta name for a known id matches the expected string"""
    accession = 'GCF_000001765.3'
    expected = 'GCF_000001765.3_Dpse_3.0_genomic.fna.gz'
    observed = get_zipped_fasta_name(accession, self.metadata)
    self.assertEqual(observed, expected)
def _get_ids_not_downloaded(id_list: List[str], genomes_metadata: pd.DataFrame, genomes_directory: Optional[str], to_unzip: Optional[bool] = False) -> List[str]: """Returns ID's from id_list that need to be downloaded/unzipped (do not already exist in `genomes_directory`) Parameters ---------- id_list List of Assembly accession id's being requested from genomes_metadata Table containing metadata containing NCBI ID's and ftp links genomes_directory Directory to look for and save data into to to_unzip False if downloading, True if unzipping Returns ------- List[str] Assembly accession id's that need to be downloaded from NCBI Raises ------ ValueError If a supplied id is not contained in the requested metadata table """ # if fasta_only, we are checking for ids to gunzip # else, we are checking to see if we need to download the .gz file # results in the following truth table: # to_download | .fna exists | .fna.gz exists | fasta_only # T | F | F | F # F | F | T | F # F | T | T | F # F | T | F | F # err<-F | F | F | T # T | F | T | T # F | T | T | T # F | T | F | T # throws value error if a requested id does not have a .fna or .fna.gz # and is asked whether it should gunzip the .fna.gz, likely indicates a # failed download earlier on # default save data in current directory if genomes_directory is None: genomes_directory = os.path.curdir() ids_to_download = [] for id_ in id_list: # TODO make expected local_dir just genomes_directory... expected_local_dir = os.path.join(genomes_directory, id_) fasta_gz_name = get_zipped_fasta_name(id_, genomes_metadata) fasta_name = gz_stripper(fasta_gz_name) # will fail if the directory is not found try: existing_files = set(os.listdir(expected_local_dir)) # TODO play with logic for readability fasta_present = fasta_name in existing_files fasta_gz_present = fasta_gz_name in existing_files # the logic here is a little tricky, see note above # TODO maybe flip clauses # if need to download? 
if not to_unzip: if not fasta_present and not fasta_gz_present: ids_to_download.append(id_) # if we need to unzip? else: if not fasta_present and fasta_gz_present: ids_to_download.append(id_) elif not fasta_present: raise ValueError('Cannot gunzip when .gz file is not ' 'present, ID: {}'.format(id_)) # if the directory does not exist, then create a directory for it # and add the id to the download list # TODO: more specific error catching? or if statement for os.isdir? except FileNotFoundError: os.makedirs(expected_local_dir, exist_ok=True) ids_to_download.append(id_) return ids_to_download