Code example #1
# Standard-library imports used in this example; the project-specific helpers
# (initialize_logger, read_ncbi_hash, read_uniprot_metadata, Output, Download,
# TaxonGraph, Accession, WriteCustomDB, DatabaseCleaner, TestFile, __version__)
# are assumed to be defined elsewhere in the module.
import argparse
import pickle
import shutil
from datetime import date
from pathlib import Path


def main():
    parser = argparse.ArgumentParser(description='Find all protein database entries of specified taxon IDs and their descendants.' \
          ' One taxID or a taxID input file must be provided. Peptide databases from NCBI or UniProt can be used. User-defined databases' \
          ' can be used if headers contain taxon IDs (e.g. OX=1111) or NCBI/UniProt accession IDs.')
    parser.add_argument(
        '-i',
        '--input',
        dest='input',
        default=None,
        help='TaxID input file: tabular file containing a column of NCBI'
        ' taxon IDs. Columns are tab-separated.')
    parser.add_argument('-c',
                        '--column',
                        dest='column',
                        type=positive_integer,
                        default=0,
                        help='The column (zero-based) in the tabular '
                        'file that contains Taxon IDs. Default = 0.')
    parser.add_argument(
        '-t',
        '--taxon',
        dest='taxon',
        type=positive_integer,
        nargs='+',
        action='append',
        help=
        'NCBI taxon ID(s) for database extraction. Multiple taxon IDs separated by spaces.'
    )
    parser.add_argument(
        '-d',
        '--database',
        dest='database',
        choices=['ncbi', 'uniprot', 'swissprot', 'trembl'],
        default='uniprot',
        help=
        'Database choice for analysis or for download. Choices: ncbi, uniprot, trembl, swissprot. '
        'No download occurs if databases with their original names are stored in the folder given by option --path. '
    )
    parser.add_argument(
        '-p',
        '--path',
        dest='path',
        default=None,
        help='Path to folder with all needed '
        'databases: taxdump.tar.gz (for all databases), prot.accession2taxid or prot.accession2taxid.gz and '
        'pdb.accession2taxid.gz (for ncbi databases). Optional: peptide_database named: nr/nr.gz, '
        'uniprot_trembl.fasta/uniprot_trembl.fasta.gz or uniprot_sprot.fasta/uniprot_sprot.fasta.gz'
        ' or uniprot.fasta/uniprot.fasta.gz')
    parser.add_argument(
        '-o',
        '--out',
        dest='out',
        default=None,
        help=
        "File name and direction of the result taxon specified peptide database. "
        "Default = /taxon_specified_db_DATE/taxon_specific_database.fasta")
    parser.add_argument(
        '-n',
        '--dbname',
        dest='dbname',
        default=None,
        help=
        "Database name and direction. If database is in other folder than --path or name deviates from standard names."
    )
    parser.add_argument(
        '-l',
        '--level',
        dest='level',
        choices=[
            'species', 'section', 'genus', 'tribe', 'subfamily', 'family',
            'superfamily', 'order', 'superorder', 'class', 'phylum', 'kingdom',
            'superkingdom'
        ],
        default=None,
        help=
        'Hierarchy level up in the ancestral tree. Choices: species, section, genus, tribe, '
        'subfamily, family, superfamily, order, superorder, class, phylum, kingdom, superkingdom'
    )
    parser.add_argument(
        '-z',
        '--no_descendants',
        dest='no_descendants',
        action='store_true',
        default=False,
        help=
        'Select peptide database only by the given taxon IDs; descendant taxa are excluded.'
    )
    parser.add_argument(
        '-s',
        '--species',
        dest='species',
        action='store_true',
        default=False,
        help=
        'Select peptide database only down to taxonomic level "species"; descendants below species level are excluded.'
    )
    parser.add_argument(
        '-r',
        '--non_redundant',
        dest='non_redundant',
        action='store_true',
        default=False,
        help=
        'Makes the final database non-redundant with regard to sequences; headers are concatenated.'
    )
    parser.add_argument(
        '-u',
        '--threads',
        dest='threads',
        type=positive_integer,
        action="store",
        help=
        'Number of threads used for multiprocessing. Default = number of cores.'
    )
    parser.add_argument(
        '-x',
        '--reduce_header',
        dest='reduce_header',
        action='store_true',
        default=False,
        help=
        'Reduce the long headers of NCBI entries to accession IDs. Use only for NCBI databases.'
    )
    parser.add_argument('--version',
                        action='version',
                        version=('version ' + __version__))
    parser.add_argument(
        '-v',
        '--verbose',
        dest='verbose',
        action='store_true',
        default=False,
        help=
        'Verbose mode shows details about program progress and additional information.')

    options = parser.parse_args()
    # URL addresses for download:
    url_protaccession2taxID = 'https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gz'
    url_protaccession2taxID_md5 = 'https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gz.md5'
    url_pdbaccession2taxID = 'https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/pdb.accession2taxid.gz'
    url_pdbaccession2taxID_md5 = 'https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/pdb.accession2taxid.gz.md5'
    url_taxdump = 'ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz'
    url_taxdump_md5 = 'ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz.md5'
    url_database_ncbi = 'ftp://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz'
    url_database_md5_ncbi = 'ftp://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz.md5'
    url_database_swissprot = 'ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz'
    url_database_trembl = 'ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz'
    url_uniprot_metadata = 'ftp://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/RELEASE.metalink'
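    # map each database choice to the expected local file name of the peptide database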
    db_dict_name = {
        'ncbi': url_database_ncbi.split('/')[-1],
        'uniprot': 'uniprot.fasta.gz',
        'swissprot': url_database_swissprot.split('/')[-1],
        'trembl': url_database_trembl.split('/')[-1]
    }

    # if option --out is not given, a new folder (named with taxon_database and the current date) is created for the result database and log file
    if options.out:
        output_path = Path.cwd() / options.out
    else:
        output_path = Output.createDir(Path.cwd())

    logger = initialize_logger(output_path, options.verbose)

    # log all parsed arguments for debugging/reproducibility
    for arg, value in sorted(vars(options).items()):
        logger.debug("Argument %s: %r", arg, value)
    logger.debug("Result database and log file are saved in directory %s" %
                 output_path)

    # set path_to_db and database_folder for all user input variants

    # if options.path is given: folder containing all databases (may lack the protein DB if options.dbname is set)
    # if the folder does not exist, it is created under the user-defined name from option --path
    skip_check = False
    if options.path:
        database_folder = Path.cwd() / options.path
        path_to_db = database_folder / db_dict_name[options.database]

    # if no path option is given, try to open the config file and read the path to the database folder
    # without a config file, a new database folder is created
    else:
        try:
            path_to_main = Path(__file__, '..').resolve()
            with open(str(path_to_main / "tax2proteome.config"),
                      'r') as config:
                database_folder = Path(config.readline().strip())
                path_to_db = database_folder / db_dict_name[options.database]
        except FileNotFoundError:
            database_folder = Path.cwd() / ('databases_' + str(date.today()))
            path_to_db = database_folder / db_dict_name[options.database]
            try:
                database_folder.mkdir()
                prot_gz_b = prot_b = pdb_b = taxdump_b = db_gz_b = db_b = False
                skip_check = True
                logger.info("Downloaded databases are saved in direction %s" %
                            database_folder)
            except FileExistsError:
                logger.debug(
                    "Database folder %s already exists. Checking for content."
                    % database_folder)
            except OSError:
                logger.exception(
                    "No permission to create new database folder.",
                    exc_info=True)
                exit(1)
    if not database_folder.exists():
        try:
            database_folder.mkdir()
            logger.info(
                "New folder %s created. All needed database files will be downloaded and stored in this "
                "directory." % database_folder)
            prot_gz_b = prot_b = pdb_b = taxdump_b = db_gz_b = db_b = False
            skip_check = True
        except OSError:
            logger.exception(
                "Database folder %s does not exist and can not be created." %
                database_folder,
                exc_info=True)
            exit(1)
    # user-given path to database
    # the given database path is checked; if it does not exist, quit. Check whether the DB is in uniprot or ncbi format
    if options.dbname:
        path_to_db = Path.cwd() / options.dbname
        db_b = Output.check_files_exist([path_to_db])[0]
        if not db_b:
            logger.error(
                "Given database %s does not exist. Enter correct path under option --dbname. Program quits."
                % path_to_db)
            exit(1)
        if not TestFile.test_uniprot(options.dbname):
            options.database = 'ncbi'

    # check database folder for content
    # check whether all needed files are in the database folder: bool flags *_b: True = file exists and is not downloaded again
    if not skip_check:
        taxdump_b, prot_gz_b, prot_b, pdb_b, db_gz_b, db_b = Output.check_files_exist(
            [
                database_folder / url_taxdump.split('/')[-1],
                database_folder / url_protaccession2taxID.split('/')[-1],
                database_folder / 'prot.accession2taxid',
                database_folder / url_pdbaccession2taxID.split('/')[-1],
                path_to_db, path_to_db.parents[0] / path_to_db.stem
            ])
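        # path_to_db.stem drops a trailing .gz, so the last entry above checks for an uncompressed copy of the database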
        if db_b:
            path_to_db = path_to_db.parents[0] / path_to_db.stem
        if not taxdump_b:
            logger.warning(
                "File taxdump.tar.gz does not exist under the path %s and will be downloaded."
                % str(database_folder))
        if not pdb_b and options.database == 'ncbi':
            logger.warning(
                "File pdb.accession2taxid.gz does not exist under the path %s and will be"
                " downloaded." % str(database_folder))
        if not prot_gz_b and not prot_b and options.database == 'ncbi':
            logger.warning(
                "File prot.accession2taxid.gz does not exist under the path %s and will be"
                " downloaded." % str(database_folder))
        if options.dbname is None and not db_b and not db_gz_b:
            logger.warning(
                "Database file %s does not exist under the path %s and will be downloaded."
                % (db_dict_name[options.database], str(database_folder)))

    # download taxdump file (best downloaded on the same day)
    if not taxdump_b:
        taxdump_md5 = read_ncbi_hash(url_taxdump_md5, logger)
        dwl_taxdb = Download(url_taxdump,
                             database_folder / url_taxdump.split('/')[-1],
                             taxdump_md5)
        dwl_taxdb.download()
        logger.debug('End download of taxdump.tar.gz')
    # download prot.accession2taxid.gz (only for ncbi) and check md5 hash
    if not prot_gz_b and not prot_b and options.database == 'ncbi':
        md5_hash = read_ncbi_hash(url_protaccession2taxID_md5, logger)
        dwl_protaccession = Download(url_protaccession2taxID,
                                     database_folder /
                                     url_protaccession2taxID.split('/')[-1],
                                     md5=md5_hash)
        dwl_protaccession.download()
        logger.debug(
            'End download from %s to location %s.' %
            (url_protaccession2taxID,
             str(database_folder / url_protaccession2taxID.split('/')[-1])))
    # download pdb.accession2taxid.gz (only for ncbi) and check md5 hash
    if not pdb_b and options.database == 'ncbi':
        md5_hash = read_ncbi_hash(url_pdbaccession2taxID_md5, logger)
        dwl_pdbaccession = Download(url_pdbaccession2taxID,
                                    database_folder /
                                    url_pdbaccession2taxID.split('/')[-1],
                                    md5=md5_hash)
        dwl_pdbaccession.download()
        logger.debug(
            'End download from %s to location %s.' %
            (url_pdbaccession2taxID,
             str(database_folder / url_pdbaccession2taxID.split('/')[-1])))
    # download peptide database and check md5 hash
    if not db_b and not db_gz_b:
        if options.database == 'ncbi':
            # version stamp: ncbi database with today's date
            database_version_ncbi = 'ncbi ' + str(date.today())
            md5_hash = read_ncbi_hash(url_database_md5_ncbi, logger)
            dwl_db = Download(url_database_ncbi,
                              database_folder / db_dict_name['ncbi'],
                              md5=md5_hash)
            dwl_db.download()
            logger.debug("Databaseversion: %s" % database_version_ncbi)
            path_to_db = database_folder / db_dict_name['ncbi']
        else:
            if options.database in ('swissprot', 'uniprot'):
                database_version_swissprot, hash_swissprot = read_uniprot_metadata(
                    url_uniprot_metadata, db_dict_name['swissprot'], logger)
                logger.debug("Database version swissprot: %s " %
                             database_version_swissprot)
                dwl_db_swiss = Download(url_database_swissprot,
                                        database_folder /
                                        db_dict_name['swissprot'],
                                        md5=hash_swissprot)
                dwl_db_swiss.download()
                path_to_db = database_folder / db_dict_name['swissprot']
            if options.database in ('trembl', 'uniprot'):
                database_version_trembl, hash_trembl = read_uniprot_metadata(
                    url_uniprot_metadata, db_dict_name['trembl'], logger)
                logger.debug("Databaseversion trembl: %s." %
                             database_version_trembl)
                dwl_db_trembl = Download(url_database_trembl,
                                         database_folder /
                                         db_dict_name['trembl'],
                                         md5=hash_trembl)
                dwl_db_trembl.download()
                path_to_db = database_folder / db_dict_name['trembl']
            # concatenate swissprot and trembl into the uniprot file
            if options.database == 'uniprot':
                try:
                    logger.debug(
                        "Concatenating swissprot and trembl into uniprot database named uniprot.fasta"
                    )
                    with open(str(database_folder / db_dict_name['trembl']),
                              'ab') as trembl:
                        with open(
                                str(database_folder /
                                    db_dict_name['swissprot']),
                                'rb') as swissprot:
                            shutil.copyfileobj(swissprot, trembl)
                    # rename trembl to uniprot:
                    Path(database_folder / db_dict_name['trembl']).rename(
                        database_folder / db_dict_name['uniprot'])
                    logger.debug("Uniprot database is now ready.")
                    path_to_db = database_folder / db_dict_name['uniprot']
                except FileNotFoundError:
                    logger.exception(
                        "Creation of uniprot database file out of trembl and swissprot file failed.",
                        exc_info=True)
                    exit(1)

    # create config file that remembers the database folder for future runs
    try:
        path_to_main = Path(__file__, '..').resolve()
        with open(str(path_to_main / "tax2proteome.config"), 'w') as config:
            config.write(str(database_folder) + '\n')
    except OSError:
        logger.debug('Cannot create config file.')

    # Read taxIDs from option -t and option -i
    if options.taxon:
        # options.taxon is a list of lists (action='append' with nargs='+'), so flatten it
        taxIDs = {taxID for taxonlist in options.taxon for taxID in taxonlist}
    else:
        taxIDs = set()
    if options.input:
        try:
            with open(options.input, 'r') as inputFile:
                for i, line in enumerate(inputFile):
                    fields = line.rstrip('\r\n').split('\t')
                    # column is zero-based, so a line needs at least column + 1 fields
                    if len(fields) > options.column:
                        taxID = fields[options.column].strip()
                        if taxID.isdigit():
                            taxIDs.add(int(taxID))
                        else:
                            logger.error(
                                'Value %s in line %i of taxon input file is not a number. '
                                'Right column number specified?' %
                                (taxID, i + 1))
                            continue
                    else:
                        logger.error(
                            'Column number is larger than the number of columns in the taxon ID input file. '
                            'Program continues without taxon IDs from the input file.'
                        )
        except FileNotFoundError:
            logger.exception(
                'Taxon ID input file does not exist under specified path.',
                exc_info=True)

    if not taxIDs:
        logger.error(
            'No taxon ID given. Please check your input. Program quits.')
        exit(1)

    logger.debug('Given Tax-IDs: %s' % ' '.join(str(it) for it in taxIDs))

    # Try to load the pre-built taxonomy graph, or build it now
    if not (database_folder / 'taxon_graph').is_file():
        taxon_graph = TaxonGraph()
        logger.debug("Start building taxon graph.")
        taxon_graph.create_graph(database_folder / url_taxdump.split('/')[-1])
        logger.debug("Taxon graph successfully build.")
        # save TaxonGraph to hard drive:
        with open(str(database_folder / 'taxon_graph'), 'wb') as handle:
            pickle.dump(taxon_graph, handle, protocol=pickle.HIGHEST_PROTOCOL)
            logger.debug('Saved taxon graph to location: %s' %
                         str(database_folder / 'taxon_graph'))
    # load Taxon Graph
    else:
        try:
            logger.debug('Load taxon graph.')
            with open(str(database_folder / 'taxon_graph'), 'rb') as handle:
                taxon_graph = pickle.load(handle)
        except (UnicodeDecodeError, EOFError):
            logger.exception(
                "Failed to open the taxon graph, or the taxon_graph file is corrupted. Delete the file %s."
                % str(database_folder / 'taxon_graph'))
            exit(1)

    # adjust the hierarchy level; if the level does not exist, take the next smaller level
    if options.level:
        logger.debug(
            "Start selection of next ancestor of level %s for all given taxIDs"
            % options.level)
        taxIDs = {
            taxon_graph.find_level_up(taxID, options.level)
            for taxID in taxIDs
        }
        logger.info(
            "All taxon IDs are moved up to level %s in the ancestral tree. Taxon IDs of level %s: %s"
            %
            (options.level, options.level, ' '.join(str(it) for it in taxIDs)))

    final_taxIDs = set()
    # find all descendants
    if not options.no_descendants:
        logger.debug("Start searching for all child taxon IDs.")
        for taxID in taxIDs:
            final_taxIDs.update(taxon_graph.find_taxIDs(
                taxID, options.species))
        logger.debug("End searching for all child taxon IDs.")
        logger.debug('Number of final taxon IDs: %s' % str(len(final_taxIDs)))
    else:
        final_taxIDs = taxIDs
        logger.debug('Number of taxon IDs for database search: %s' %
                     str(len(final_taxIDs)))

    # generate accession_taxID dict for ncbi db search and write custom specified db to --out
    with_taxon_ID = TestFile.test_uniprot(path_to_db)
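    # uniprot-style headers carry taxon IDs (e.g. OX=1111) directly; ncbi-style databases first need the accession-to-taxID mapping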
    if not with_taxon_ID:
        accession = Accession(final_taxIDs)
        logger.debug('Read accession files.')
        if prot_b:
            accession.read_accessions(
                database_folder / 'prot.accession2taxid',
                database_folder / url_pdbaccession2taxID.split('/')[-1],
                options.threads)
        else:
            accession.read_accessions(
                database_folder / url_protaccession2taxID.split('/')[-1],
                database_folder / url_pdbaccession2taxID.split('/')[-1],
                options.threads)
        logger.debug('All accession IDs collected.')
        logger.info('Start writing taxon selected peptide database to %s.' %
                    output_path)
        wc = WriteCustomDB(path_to_db, output_path)
        wc.read_database(False,
                         gzipped=TestFile.test_gzipped(path_to_db),
                         accessions=accession.accessionIDs,
                         threads=options.threads)
        logger.debug('End writing taxon selected peptide database.')

    # uniprot: write custom specified db to --out
    else:
        logger.info('Start writing taxon selected peptide database to %s.' %
                    output_path)
        wc = WriteCustomDB(path_to_db, output_path, final_taxIDs)
        wc.read_database(True,
                         threads=options.threads,
                         gzipped=TestFile.test_gzipped(path_to_db))
        logger.debug('End writing taxon selected peptide database.')

    # non redundant database
    if options.non_redundant:
        DatabaseCleaner.non_redundant(output_path, with_taxon_ID)
        # remove redundant database:
        output_path.unlink()

    if options.reduce_header and not with_taxon_ID:
        # reduce headers of NCBI database
        DatabaseCleaner.reduce_header(output_path)
        output_path.unlink()

    logger.info('Program finished.')
    exit(0)
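

# A minimal sketch of pieces the excerpt assumes but does not show. The real
# module presumably defines the `positive_integer` argparse type itself; this
# validator is only an illustrative assumption (note it must accept 0, since
# --column defaults to 0). The entry-point guard is likewise assumed.
def positive_integer(value):
    """argparse type: accept a non-negative integer string, reject anything else."""
    try:
        int_value = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError('%s is not an integer' % value)
    if int_value < 0:
        raise argparse.ArgumentTypeError('%s is negative' % value)
    return int_value


if __name__ == '__main__':
    main()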