Esempio n. 1
0
def main():
    """Run the NCBI download when this file is called as a single script.

    Expected command line: ``<script> ID_file folder [Debug]``

    * ``ID_file``: file read with ``readList_fromFile`` to get the entries
      to download.
    * ``folder``: folder handed over to ``NCBI_download_list``.
    * ``Debug``: optional; debug mode is only enabled when this argument is
      the literal string ``True``.

    The help message is printed, and the script exits, when the two
    mandatory arguments are not both provided.
    """
    ## control if options provided or help: ID_file and folder are both
    ## mandatory, otherwise reading sys.argv[2] would raise IndexError
    if len(sys.argv) < 3:
        help_options()
        sys.exit()
    print("")

    ## get arguments provided
    ID_file = os.path.abspath(sys.argv[1])
    folder = os.path.abspath(sys.argv[2])

    ## get file information
    strains2get = HCGB.functions.main_functions.readList_fromFile(ID_file)

    ## debug messages: the third argument is optional
    Debug = len(sys.argv) > 3 and sys.argv[3] == "True"
    if Debug:
        print('*******************************')
        debug_message("Mode ON")
        print('*******************************')

    NCBI_download_list(strains2get, folder, Debug)
    print("+ Data has been retrieved.\n")
Esempio n. 2
0
def parse_taxid(tax_id, ncbi, option, debug):
    """Parse a taxonomy identifier according to the option provided.

    :param tax_id: Taxonomy ID made of digits (string or integer) or a
        taxon name. Names are converted using ``name2taxid``.
    :param ncbi: Taxonomy database object, forwarded to the helper functions.
    :param option: ``"info"`` prints and returns the taxon information;
        ``"unravel"`` returns the result of ``unravel_taxid``.
    :param debug: True/false for debugging messages

    :returns: ``(tax_name, taxid, rank, lineage)`` for ``"info"``, the value
        returned by ``unravel_taxid`` for ``"unravel"`` and ``None`` for any
        other option.
    """
    ## integers have no isdigit() method: decide once using the text form
    is_numeric = str(tax_id).isdigit()

    ############
    ## debug messages
    ############
    if debug:
        debug_message('parse_taxid:', "yellow")
        debug_message('tax_id: ' + str(tax_id), "yellow")
        if not is_numeric:
            debug_message('conversion needed: ', "yellow")

        debug_message('option: ' + option, "yellow")

    ############
    ## convert to tax ID
    ############
    if not is_numeric:
        ## convert name to taxid integer
        print("+ Convert to NCBI taxonomy ID")
        print("\tSource: " + str(tax_id))
        tax_id = name2taxid([tax_id], ncbi)
        (tax_name, taxid, rank, lineage) = taxon_info(tax_id, ncbi, debug)
        print("\tRank: " + rank)
        print("\tID: " + str(taxid))

    ############
    ## parse accordingly
    ############
    if (option == "info"):
        print()
        (tax_name, taxid, rank, lineage) = taxon_info(tax_id, ncbi, debug)

        print("----------------------------------------------")
        print("Result:")
        print("Name: " + tax_name)
        print("Rank: " + rank)
        print("Taxid: " + str(taxid))
        print("Lineage:")

        ## each lineage entry is split on ':' and printed as two columns
        for tax in lineage.split(";"):
            tax_split = tax.split(":")
            print("\t" + '{}\t{}'.format(tax_split[0], tax_split[1]))

        print("----------------------------------------------")
        print()

        ## return info
        return (tax_name, taxid, rank, lineage)

    ############
    ### call unravel taxid information
    ############
    elif (option == "unravel"):
        return (unravel_taxid(tax_id, ncbi, debug))

    ## any other option: nothing to do
    return None
Esempio n. 3
0
def retrieve_genes_ids_sequences(profile, gene_ID, debug):
    """Retrieve the first assembled gene whose description matches gene_ID.

    Files named ``assembled_genes.fa`` are searched within ``profile``
    using ``HCGB_main.retrieve_matching_files``; paths containing
    ``ariba.tmp`` are discarded and only the first remaining file is parsed.

    :param profile: Profile folder to search in.
    :param gene_ID: Pattern searched with ``re.search`` in each record
        description.
    :param debug: True/false for debugging messages

    :returns: ``(record.id, sequence as string)`` for the first matching
        record; ``('', '')`` when nothing matches or no file is available.
    """
    ## given a profile folder
    if debug:
        HCGB_aes.debug_message('profile: ', 'yellow')
        print (profile)
        HCGB_aes.debug_message('gene_id: ', 'yellow')
        print (gene_ID)

    ##
    assembled_genes_list = [
        s for s in HCGB_main.retrieve_matching_files(
            profile, "assembled_genes.fa", debug)
        if 'ariba.tmp' not in s
    ]

    if debug:
        HCGB_aes.debug_message('assembled_genes_list: ', 'yellow')
        print(assembled_genes_list)

    ## nothing to parse: an empty list would raise IndexError below and a
    ## missing file used to return None instead of the empty result
    if not assembled_genes_list or not os.path.isfile(assembled_genes_list[0]):
        return ('', '')

    for record in SeqIO.parse(assembled_genes_list[0], "fasta"):
        if debug:
            HCGB_aes.debug_message('record.description: ', 'yellow')
            print(record.description)

        if re.search(gene_ID, record.description):
            return (record.id, str(record.seq))

    return ('', '')
Esempio n. 4
0
def NCBI_get_info_GenbankID(data_folder,
                            acc_ID,
                            debug,
                            assembly_level_given='complete'):
    '''Find the ncbi_genome_download group that contains a given accession.

    A dry run is configured for each group in turn ('bacteria', then
    'archaea') and the first group for which candidates are selected is
    returned.

    :param data_folder: Folder given as output to the dry run configuration.
    :param acc_ID: Assembly accession to look for.
    :param debug: True/false for debugging messages
    :param assembly_level_given: Not used by this function.

    :returns: Name of the group for which the accession was found.
    :raises RuntimeError: If no group returned any candidate.
    '''
    section_given = get_section(acc_ID, debug)

    if debug:
        debug_message("-----------------------------------------")
        debug_message("NCBI_get_info_GenbankID function call", color="yellow")

    ## import module and class
    import ncbi_genome_download
    from ncbi_genome_download.config import NgdConfig

    for entry_tried in ('bacteria', 'archaea'):
        if debug:
            debug_message("Trying with: " + entry_tried, color="yellow")

        ngd_config = NgdConfig.from_kwargs(section=section_given,
                                           file_formats='genbank',
                                           assembly_accessions=acc_ID,
                                           output=data_folder,
                                           dry_run=True,
                                           groups=entry_tried)
        info = ncbi_genome_download.core.select_candidates(ngd_config)
        if info:
            if debug:
                debug_message("It worked!", color="yellow")
            return (entry_tried)

    ## raising a plain string is a TypeError in Python 3: use an exception
    raise RuntimeError(
        "**** ERROR: Something happen while connecting to NCBI... ***")
Esempio n. 5
0
def desc_taxa(taxid, ncbi, debug):
    """Collect the descendant taxa of a taxid together with their names.

    Created by Joe R. J. Healey; Nick Youngblut
    Slightly modified.

    :param taxid: Taxonomy ID whose descendants are requested.
    :param ncbi: Object providing ``get_descendant_taxa`` and
        ``translate_to_names``.
    :param debug: True/false for debugging messages

    :returns: Dictionary with descendant taxid as key and name as value.
    """
    ## debug messages
    if debug:
        debug_message(
            "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
        debug_message('desc_taxa: ' + str(taxid), "yellow")

    # Main feature of the script is to get all taxa within a given group.
    children = ncbi.get_descendant_taxa(taxid)
    children_names = ncbi.translate_to_names(children)

    ## pair each descendant id with its name
    dict_Descent = dict(zip(children, children_names))

    ## debug messages
    if debug:
        debug_message('dict_Descent: ', "yellow")
        print(dict_Descent)
        debug_message(
            "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

    return dict_Descent
Esempio n. 6
0
def get_superKingdom(tax_id, ncbi, debug):
    """Return the superkingdom name, in lower case, for a given tax_id.

    The lineage returned by ``taxon_info`` is split on ``;`` and the first
    field of each entry (split on ``:``) is looked up again until an entry
    of rank ``superkingdom`` is found.

    :returns: Lower-cased name of the superkingdom, or ``None`` when no
        lineage entry has that rank.
    """
    if debug:
        debug_message("get_superKingdom: ", 'yellow')
        debug_message("tax_id: " + str(tax_id), 'yellow')

    _, _, _, full_lineage = taxon_info(tax_id, ncbi, debug)

    for lineage_entry in full_lineage.split(";"):
        member_id = lineage_entry.split(":")[0]
        ## check the rank of each lineage member
        member_name, _, member_rank, _ = taxon_info(member_id, ncbi, debug)
        if member_rank == "superkingdom":
            return member_name.lower()

    return None
Esempio n. 7
0
def check_annot_table(annot_table, file, format, debug):
    '''Check that an annotation table matches the sequence file provided.

    :param annot_table: CSV file; its first column is used as index.
    :param file: BLAST or protein fasta file to compare against.
    :param format: "fasta" compares the table index with the fasta sequence
        identifiers; "blast" is not implemented yet.
    :param debug: True/false for debugging messages

    :returns: False when the table columns differ from
        BacDup_functions.columns_annot_table(); for "fasta", True/False
        depending on whether the table index equals the list of sequence
        identifiers (same order); None for any other format.
    '''
    ## read annotation
    annotation_table = pd.read_csv(annot_table, sep=",", index_col=0)

    ## debug messages
    if debug:
        debug_message('check_annot_table function:', 'yellow')
        debug_message("list(annotation_table.columns)", 'yellow')
        print(list(annotation_table.columns))
        debug_message("BacDup_functions.columns_annot_table()", 'yellow')
        print(BacDup_functions.columns_annot_table())

    ## skip if not OK
    columns_match = (list(annotation_table.columns) ==
                     BacDup_functions.columns_annot_table())
    if not columns_match:
        print('ERROR: Annotation table does NOT match desired input format')
        return False

    ## read BLAST sequence results
    if format == "blast":
        ## TODO
        print()
        return None

    ## only protein fasta files are handled from here on
    if format != "fasta":
        return None

    with open(file) as fasta_handle:
        fasta_records = SeqIO.to_dict(SeqIO.parse(fasta_handle, "fasta"))

    protein_IDs = list(fasta_records.keys())  ## get sequence headers
    index_annot = list(annotation_table.index)  ## get annotation labels

    ## debug messages
    if debug:
        debug_message("list(annotation_table.index).sort()", 'yellow')
        print(index_annot)
        debug_message("list(ref_recs.keys()).sort()", 'yellow')
        print(protein_IDs)

    ## order-sensitive comparison of both lists
    return index_annot == protein_IDs
Esempio n. 8
0
def update_db(ncbi_db, db_folder, debug):
    """Update the taxonomy database and write a time stamp file.

    Created by Joe R. J. Healey; Nick Youngblut
    Original code.

    :param ncbi_db: Database object; ``update_taxonomy_database`` is called
        on it and its ``dbfile`` attribute is shown in debug mode.
    :param db_folder: Folder where ``timestamp_db.txt`` is written.
    :param debug: True/false for debugging messages

    :returns: The same ``ncbi_db`` object received.
    """
    ## debug messages
    if debug:
        debug_text = 'Update database at {}\n'.format(ncbi_db.dbfile)
        debug_message(debug_text, "yellow")

    print('Updating the taxonomy database. This may take several minutes...\n')
    ncbi_db.update_taxonomy_database()

    ## print timestamp
    stamp_path = os.path.abspath(db_folder + '/timestamp_db.txt')
    time_functions.print_time_stamp(stamp_path)

    return ncbi_db
Esempio n. 9
0
def get_data(sample, file_data, format, out_folder, debug):
    '''Function to get BLAST results

    :param sample: Sample name, forwarded to create_blast_results.
    :param file_data: Input file: raw BLAST table or fasta file.
    :param format: 'blast_raw' reads file_data directly as a tab separated
        table; 'fasta' first calls create_blast_results and reads the file
        it returns.
    :param out_folder: Output folder, forwarded to create_blast_results.
    :param debug: True/false for debugging messages

    :returns: Dataframe with the columns named by
        BacDup_functions.columns_rawBLAST_table().
    :raises ValueError: If format is neither 'blast_raw' nor 'fasta'.
    '''
    file_data = os.path.abspath(file_data)

    ## debug messages
    if (debug):
        debug_message('dup_searcher.get_data:', 'yellow')
        debug_message('file_data:' + file_data, 'yellow')
        debug_message('format:' + format, 'yellow')

    ## fail early: any other value used to leave the result undefined
    if format not in ('blast_raw', 'fasta'):
        raise ValueError(
            "Unsupported format '%s': expected 'blast_raw' or 'fasta'" % format)

    ## check file is readable
    BacDup_functions.file_readable_check(file_data)

    ## parse accordingly
    if (format == 'fasta'):
        blast_file = create_blast_results(sample, file_data, out_folder, debug)
    else:
        ## FIXME (kept from the original code, no details given)
        blast_file = file_data

    return pd.read_csv(blast_file, sep="\t", header=None,
                       names=BacDup_functions.columns_rawBLAST_table())
Esempio n. 10
0
def assembly_stats_caller(fasta_file, out_file, debug):
    """Compute contig and scaffold statistics and save them to files.

    :param fasta_file: File given to ``assembly_stats.read_genome``.
    :param out_file: Prefix for the output files: ``-contigs.csv``,
        ``-scaffolds.csv`` and ``_stats.xlsx`` are appended to it.
    :param debug: True/false for debugging messages

    :returns: Tuple with the dictionary of statistics (keys 'Contig Stats'
        and 'Scaffold Stats') and the name of the excel file.
    """
    contig_lens, scaffold_lens, gc_cont = assembly_stats.read_genome(
        fasta_file)

    ## debug messages
    if debug:
        for label, value in (("contig_lens", contig_lens),
                             ("scaffold_lens", scaffold_lens),
                             ("gc_cont", gc_cont)):
            HCGB_aes.debug_message(label, "yellow")
            print(value)

    ## get stats
    stat_output = {
        'Contig Stats': assembly_stats.calculate_stats(contig_lens, gc_cont),
        'Scaffold Stats': assembly_stats.calculate_stats(scaffold_lens,
                                                         gc_cont),
    }

    ## debug messages
    if debug:
        for label, key in (("contig_stats", 'Contig Stats'),
                           ("scaffold_stats", 'Scaffold Stats')):
            HCGB_aes.debug_message(label, "yellow")
            print(stat_output[key])

    ## save results in file
    for suffix, key in (('-contigs.csv', 'Contig Stats'),
                        ('-scaffolds.csv', 'Scaffold Stats')):
        HCGB_main.printDict2file(out_file + suffix, stat_output[key], ",")

    ## create stats in excel file
    assembly_stats_file = out_file + '_stats.xlsx'
    parse_stats(stat_output, assembly_stats_file, debug)

    return (stat_output, assembly_stats_file)
Esempio n. 11
0
def agrvate_caller(dict_assemblies, dict_folders, debug=False):
    """Create agrvate call and control for parameters

    :param dict_assemblies: Dictionary with sample name as key and assembly
        file as value.
    :param dict_folders: Dictionary with sample name as key and the folder
        to use for that sample as value.
    :param debug: True/false for debugging messages

    :returns: Dataframe with the concatenated results of ``agrvate_call``
        for every sample processed in this call. Samples with a previous
        ``.success`` time stamp are skipped and not included.
    """
    ## ATTENTION: agrvate needs to chdir to output folder
    path_here = os.getcwd()

    print ("+ Checking agr genes for each sample retrieved...")

    agrvate_results = pd.DataFrame()

    ## No need to optimize. There is a problem with the working dir of agrvate and we
    ## need to change every time.
    try:
        for name, assembly_file in dict_assemblies.items():
            sample_folder = HCGB_files.create_folder(dict_folders[name])
            ## check if previously done and succeeded
            filename_stamp = sample_folder + '/.success'
            if os.path.isfile(filename_stamp):
                stamp = HCGB_time.read_time_stamp(filename_stamp)
                print (colored("\tA previous command generated results on: %s [%s]" %(stamp, name), 'yellow'))
                continue

            os.chdir(sample_folder)
            info_sample = agrvate_call(name, assembly_file, sample_folder, debug)
            agrvate_results = pd.concat([agrvate_results, info_sample], join='outer')

            if (info_sample.shape[0] == 0):
                print("+ Some error occurred with sample %s. Please re-run analysis or check log files." %name)
            else:
                ## success
                HCGB_time.print_time_stamp(filename_stamp)
    finally:
        ## always return to the starting folder, even if a sample failed
        os.chdir(path_here)

    ## the original message carried a stray, never formatted, '%s'
    print ("+ Jobs finished\n+ Collecting information for all samples...")

    ## debug messages
    if debug:
        HCGB_aes.debug_message('agrvate_results', 'yellow')
        HCGB_main.print_all_pandaDF(agrvate_results)

    return(agrvate_results)
Esempio n. 12
0
def NCBI_get_info(section_given,
                  data_folder,
                  tax_ID_list,
                  debug,
                  assembly_level_given='complete',
                  group_given='bacteria'):
    '''This function uses ncbi_genome_download to
    create a dry run and return information of each entry provided

    :param section_given: Value passed as ``section`` to the configuration.
    :param data_folder: Value passed as ``output`` to the configuration.
    :param tax_ID_list: Value passed as ``taxids`` to the configuration.
    :param debug: True/false for debugging messages
    :param assembly_level_given: Value passed as ``assembly_levels``.
    :param group_given: Value passed as ``groups``.

    :returns: Dictionary with the assembly accession as key and a tuple
        (organism name, strain name) as value.
    :raises RuntimeError: If configuring the dry run or selecting the
        candidates fails; the original error is chained as cause.

    The process exits when no entries are selected.
    '''
    ## import module and class
    import ncbi_genome_download
    from ncbi_genome_download.config import NgdConfig

    try:
        ngd_config = NgdConfig.from_kwargs(
            section=section_given,
            file_formats='genbank',
            taxids=tax_ID_list,
            output=data_folder,
            dry_run=True,
            assembly_levels=assembly_level_given,
            groups=group_given)
        info = ncbi_genome_download.core.select_candidates(ngd_config)

    except Exception as err:
        ## raising a plain string is a TypeError in Python 3
        raise RuntimeError(
            "**** ERROR: Something happen while connecting to NCBI... ***"
        ) from err

    ####
    if (len(info)) < 1:
        print(
            colored(
                "No entries matched your filter. Please check the input options provided",
                'yellow'))
        exit()

    ## fill dictionary to simplify
    dict_entries = {}
    for entry, _ in info:
        strain_name = ncbi_genome_download.core.get_strain(entry)
        ## debug messagess
        if debug:
            debug_message("", 'yellow')
            print(entry)
            string = entry['assembly_accession'] + '\t' + entry[
                'organism_name'] + '\t' + strain_name
            debug_message(string, 'yellow')
            debug_message(
                ".....................................................................\n",
                'yellow')

        ## fill dictionary
        dict_entries[entry['assembly_accession']] = (entry['organism_name'],
                                                     strain_name)

    ## return
    return (dict_entries)
Esempio n. 13
0
def unravel_taxid(tax_id, ncbi, debug):
    """Unravel a taxid into the list of taxids included within it.

    If the rank of the taxid is species, serotype or strain, no further
    processing is done and only that taxid is returned. For any other rank
    (genus, family, order...), every descendant reported by ``desc_taxa``
    is added, together with every member of each descendant's lineage whose
    rank is species or serotype.

    :returns: List of unique taxids included within the tax_id provided.
    """
    ## check the rank provided
    (tax_name, taxid, rank, lineage) = taxon_info(tax_id, ncbi, debug)

    ## debug messages
    if debug:
        debug_message('tax_name: ' + tax_name, "yellow")
        debug_message('taxid:' + str(taxid), "yellow")
        debug_message('rank: ' + rank, "yellow")
        debug_message('lineage: ' + lineage, "yellow")

    ## terminal rank provided: directly to retrieve
    if rank in ("species", "serotype", "strain"):
        return list({taxid})

    collected = []

    ## get descendant
    for child_id in desc_taxa(taxid, ncbi, debug):

        ## add taxa retrieved
        collected.append(child_id)

        ## check the rank provided and decompose
        _, _, _, child_lineage = taxon_info(child_id, ncbi, debug)
        for lineage_entry in child_lineage.split(";"):
            ## check the rank provided: add also species or serotype
            _, member_id, member_rank, _ = taxon_info(
                lineage_entry.split(":")[0], ncbi, debug)
            if member_rank in ("species", "serotype"):
                collected.append(member_id)

    ## return uniq list of ids
    return list(set(collected))
Esempio n. 14
0
def create_blast_results(sample, fasta_file, outdir, debug):
    '''Creates BLAST results for each fasta vs. itself

    A protein BLAST database named ``<sample>_db`` is created in outdir
    (unless the ``.db_success`` time stamp is available) and the fasta file
    is searched against it. Nothing is executed when the ``.blast_success``
    time stamp is already available.

    :param sample: Sample name, used for the database name and messages.
    :param fasta_file: Protein fasta file used as database and as query.
    :param outdir: Folder for the database, time stamps and results.
    :param debug: True/false for debugging messages

    :returns: Absolute path of ``BLAST_raw_results.tsv`` within outdir.
    '''

    #phr is the header file, pin is the index file, psq is the sequence file

    ## debug messages
    if debug:
        debug_message('create_blast_results function call:', 'yellow')
        debug_message('sample: ' + sample, 'yellow')
        debug_message('fasta_file: ' + fasta_file, 'yellow')
        debug_message('outdir: ' + outdir, 'yellow')

    ## output file
    raw_blast = os.path.abspath(os.path.join(outdir, "BLAST_raw_results.tsv"))

    ## timestamps
    db_timestamp = os.path.join(outdir, '.db_success')
    search_timestamp = os.path.join(outdir, '.blast_success')

    if (not HCGB.functions.files_functions.is_non_zero_file(search_timestamp)):

        ## get binaries: the configured executables are used; the previously
        ## hard-coded paths remain only as a fallback when none is returned
        (makeblastdb_exe, blastp_exe) = BacDup.modules.config.get_exe('BLAST', debug)
        makeblastdb_exe = makeblastdb_exe or "/usr/bin/makeblastdb"
        blastp_exe = blastp_exe or "/usr/bin/blastp"

        ## check if db is indexed already
        db_path_name = os.path.join(os.path.abspath(outdir), sample + '_db')
        if (not HCGB.functions.files_functions.is_non_zero_file(db_timestamp)):
            ## generate blastdb for genome
            HCGB.functions.blast_functions.makeblastdb(db_path_name, fasta_file, makeblastdb_exe, 'prot') # HCGB function

            ## print time stamp
            HCGB_time.print_time_stamp(db_timestamp)

        else:
            ## read the database time stamp: it was used before assignment
            db_time = HCGB_time.read_time_stamp(db_timestamp)
            print (colored("\t+ BLAST database already available for sample %s [%s]" %(sample, db_time), 'green'))

        ## create blastp outfile
        HCGB.functions.blast_functions.blastp(blastp_exe, raw_blast, db_path_name, fasta_file, 1) # HCGB function

        ## print time stamp
        HCGB_time.print_time_stamp(search_timestamp)
    else:
        read_time = HCGB_time.read_time_stamp(search_timestamp)
        print (colored("\t+ Duplicate search already available for sample %s [%s]" %(sample, read_time), 'green'))

    return (raw_blast)
Esempio n. 15
0
def get_genes_profile(samples_info, gene_names, debug, option):
    """Retrieve the profile status of the selected genes for each sample.

    :param samples_info: Dataframe with, at least, the columns ``name``,
        ``tag``, ``ext`` and ``sample``. Rows with tag ``profile`` point
        (column ``sample``) to the profile files to read.
    :param gene_names: Genes to search for.
    :param debug: True/false for debugging messages
    :param option: Search option forwarded to ``retrieve_genes_ids_profile``.

    :returns: Dataframe indexed by sample name with one column per gene
        retrieved, holding the ``Status`` value found, or ``no`` for the
        genes without results. The group named ``report`` is skipped.
    """
    ## search by group id or gene name
    print ('\n+ Retrieve selected genes profile for each sample.')
    results_profileIDs = pd.DataFrame()

    ## group by the column label and not by a list of one element:
    ## pandas >= 2.0 returns tuple keys for lists, which would break the
    ## comparison with 'report' and the row labels of the results
    sample_frame = samples_info.groupby("name")
    for g in gene_names:
        #print ("\t+", g)
        for name, cluster_df in sample_frame:
            my_list_profiles = cluster_df.loc[cluster_df['tag'] == 'profile']['ext'].to_list()

            if debug:
                HCGB_aes.debug_message('name: ' + str(name), 'yellow')
                HCGB_aes.debug_message('my_list_profiles: ', 'yellow')
                print (my_list_profiles)
                HCGB_aes.debug_message('cluster_df: ', 'yellow')
                print (cluster_df)

            ## skip files
            if name == 'report':
                continue

            fill = False
            for p in my_list_profiles:
                profile_csv = cluster_df.loc[cluster_df['ext'] == p]['sample'].to_list()[0]

                ## skip files
                if profile_csv.endswith('report_summary.csv'):
                    continue

                if debug:
                    HCGB_aes.debug_message('profile_csv: ' + profile_csv, 'yellow')

                value = retrieve_genes_ids_profile(profile_csv, g, debug, option)

                ## save results
                if (not value.empty):
                    for Name, Data in value.iterrows():
                        results_profileIDs.loc[name, Name] = Data['Status']
                    fill = True

            ## gene not found in any profile of this sample
            if not fill:
                results_profileIDs.loc[name, g] = 'no'

    return (results_profileIDs)
Esempio n. 16
0
def get_gbk_information(gbk, debug):
    """Read a GenBank file and return taxonomy and organism of its first record.

    https://biopython.org/wiki/SeqRecord

    :param gbk: GenBank file, parsed with ``SeqIO.parse(gbk, "genbank")``.
    :param debug: True/false for debugging messages

    :returns: Tuple (taxonomy, organism) taken from the ``taxonomy`` and
        ``source`` annotations of the first record.
    :raises ValueError: If the file contains no records.
    """
    ## read Genbank file to retrieve information for each sample
    taxonomy = None
    organism = None
    found = False
    for index, record in enumerate(SeqIO.parse(gbk, "genbank")):

        if (index == 0):  ## only for first entry == Main chromosome
            if debug:
                debug_message("******************************************")
                debug_message("SeqIO.read(gbk, 'genbank') info:",
                              color="yellow")
                debug_message("record", color="yellow")
                debug_message(record, color="yellow")

            organism = record.annotations['source']
            taxonomy = record.annotations['taxonomy']
            found = True

    ## TODO: get plasmid information

    ## an empty file used to fail with an UnboundLocalError at the return
    if not found:
        raise ValueError("No GenBank records found in file: %s" % gbk)

    return (taxonomy, organism)
Esempio n. 17
0
def parse_annot_file(name,
                     folder_out_input,
                     annot_file,
                     output_path,
                     Debug,
                     ref_file=""):
    """
    Check the type of the annotation file provided and call the
    appropriate parser: gbf_parser or gff_parser.

    :param name: Sample name, only shown in debugging messages.
    :param folder_out_input: Not used by this function.
    :param annot_file: Annotation file to check and parse.
    :param output_path: Output path forwarded to the parser.
    :param Debug: True/false for debugging messages
    :param ref_file: Genome reference file, required for GFF files.

    :returns: Value returned by the parser called, or False when the
        annotation file does not pass ``file_readable_check``.

    The process exits when the format is neither 'gbk' nor 'gff', or when
    a GFF file is provided without a non-empty reference file.
    """
    ## debug messages
    if Debug:
        debug_message('+++++++++++++++++++++++++++++++')
        debug_message('check_annot_file function call:', 'yellow')
        debug_message('name: ' + name, 'yellow')
        debug_message('annot_file: ' + annot_file, 'yellow')

    ## check file integrity: not accessible for this sample
    if not BacDup_functions.file_readable_check(annot_file):
        return False

    ## check format; call parser
    file_format = format_checker.is_format(annot_file, Debug)

    ## debug messages
    if Debug:
        debug_message('\nformat_checker.is_format function call:',
                      'yellow')
        debug_message('format: ' + file_format, 'yellow')

    ## GenBank file
    if file_format == 'gbk':
        print(colored('\t* GenBank format file:........[OK]', 'green'))

        ## TODO: print details available within GenBank:
        # Accession, Bioproject,
        # Reference, Authors, Title, Journal,
        # Comment

        return gbf_parser.gbf_parser_caller(annot_file, output_path, Debug)

    ## GFF file: a reference genome is mandatory
    if file_format == 'gff':
        print(colored('\t* GFF format file:.......[OK]', 'green'))
        if not HCGB_files.is_non_zero_file(ref_file):
            print(
                colored(
                    "ERROR: No genome reference file provided for this GFF annotation. Check input options provided.",
                    "red"))
            exit()

        return gff_parser.gff_parser_caller(annot_file, ref_file,
                                            output_path, Debug)

    ## not valid via this option
    print(colored("ERROR: not valid via this option", "red"))
    exit()
Esempio n. 18
0
def ngd_download(section_given,
                 acc_ID,
                 data_folder,
                 debug,
                 section='genbank',
                 assembly_level='complete',
                 group_given='bacteria'):
    '''
    Function that calls and retrieves data from NCBI using python package ngd.

    :param section_given: Section used for the download and for the path
        where data is expected.
    :param acc_ID: Assembly accession to download.
    :param data_folder: Folder to store data.
    :param debug: True/false for debugging messages
    :param section: Not used by this function.
    :param assembly_level: Value passed as ``assembly_levels`` to ngd.
    :param group_given: Group used for the download and for the path where
        data is expected.

    :returns: Path ``data_folder/section_given/group_given/acc_ID`` where
        data is, or False if that folder does not exist after downloading.
    :raises RuntimeError: If the ngd download call fails; the original
        error is chained as cause.

    :attention Module ngd requires to download data in bacteria/archaea subfolder under genbank or refseq folder.
    '''
    ##################################
    ## check if necessary to download
    ##################################

    ## get path
    print('+ Check data for ID: ', acc_ID)
    dir_path = os.path.join(data_folder, section_given, group_given, acc_ID)

    ## check if previously download
    download = True
    if os.path.exists(dir_path):
        print('+ Folder already exists: ', dir_path)
        ## get files download
        (genome, prot, gff,
         gbk) = BacDup.scripts.functions.get_files_annotation(dir_path, debug)
        if (gbk):  ## Only genbank format file is required
            download = False
        else:
            print('+ Not all necessary data is available. Download it again.')

    ## skip
    if not download:
        print('\t+ Data is already available, no need to download it again')
        print()
        return (dir_path)

    ## download data
    print("\n+ Downloading data for: " + colored(acc_ID, 'green'))

    ## download in data folder provided
    if debug:
        debug_message("ngd.download call", color="yellow")
        debug_message("dir_path: " + dir_path, color="yellow")
        debug_message("section_given: " + section_given, color="yellow")
        debug_message(
            "section='%s', file_formats='genbank', assembly_level=%s, assembly_accessions=%s, output=%s, groups=%s"
            % (section_given, assembly_level, acc_ID, data_folder,
               group_given),
            color="yellow")

    try:
        ngd.download(section=section_given,
                     file_formats='genbank',
                     assembly_levels=assembly_level,
                     assembly_accessions=acc_ID,
                     output=data_folder,
                     groups=group_given)
    except Exception as err:
        ## raising a plain string is a TypeError in Python 3
        raise RuntimeError(
            "A problem occurred when contacting NCBI for downloading id (%s) from %s"
            % (acc_ID, section_given)) from err

    ## return empty
    if not os.path.isdir(dir_path):
        return False

    ## check if files are gunzip
    for f in os.listdir(dir_path):
        if f.endswith('gz'):
            print("\t- Extracting files: ", f)
            HCGB.functions.files_functions.extract(
                os.path.join(dir_path, f), dir_path)

    print()
    ## return path where data is
    return (dir_path)
Esempio n. 19
0
def retrieve_genes_ids_profile(profile, gene_ID, debug, option):
    """Retrieve the rows of a profile file matching a gene name or ID.

    :param profile: CSV profile file, read with ``HCGB_main.get_data``.
    :param gene_ID: Gene to search for. With option ``name`` it is used as
        a regular expression anchored at the start of the gene name.
    :param debug: True/false for debugging messages
    :param option: ``name`` searches in column ``Genes`` (prefix match);
        ``ID`` searches in column ``ID`` (exact match).

    :returns: Dataframe, indexed by the searched column, with the rows
        matching. For option ``ID`` an empty dataframe is returned when
        the gene is not found.
    :raises ValueError: If option is neither ``name`` nor ``ID``.
    """
    ## validate before reading: any other option used to fail later on
    ## with an undefined variable
    if option == 'name':
        column = 'Genes'
    elif option == 'ID':
        column = 'ID'
    else:
        raise ValueError(
            "Unsupported option '%s': expected 'name' or 'ID'" % option)

    ## read data
    get_csv_data = HCGB_main.get_data(profile, ',', '')
    list_Genes = get_csv_data[column].to_list()
    get_csv_data.index = get_csv_data[column]

    ## debug messages
    if debug:
        HCGB_aes.debug_message('profile: ' + profile, 'yellow')
        HCGB_aes.debug_message('gene_id: ' + str(gene_ID), 'yellow')
        HCGB_aes.debug_message('data: ', 'yellow')
        print(get_csv_data)
        HCGB_aes.debug_message('Option: ' + option, 'yellow')
        HCGB_aes.debug_message('Genes: ', 'yellow')
        print (list_Genes)

    ## search accordingly
    if option == 'name':
        regex_search = re.compile("^" + gene_ID + ".*")
        filtered_genes = list(filter(regex_search.match, list_Genes))
        matches = get_csv_data.loc[filtered_genes]

        ## debug messages
        if debug:
            HCGB_aes.debug_message('filtered_genes: ', 'yellow')
            print (filtered_genes)
            HCGB_aes.debug_message('filtered_genes.loc[filtered_genes]: ', 'yellow')
            print (matches)

        return (matches)

    ## option ID: exact match
    if gene_ID not in list_Genes:
        return (pd.DataFrame())

    matches = get_csv_data.loc[gene_ID].to_frame().transpose()

    ## debug messages
    if debug:
        ## str(): the ID is not necessarily a string
        HCGB_aes.debug_message('gene_id: ' + str(gene_ID), 'yellow')
        print (matches)

    return (matches)
Esempio n. 20
0
def run_assembly(options):
    """Main function of the assemble module.

    It assembles each sample using SPADES_ and checks quality using BUSCO_ software and database.

    Steps, in the order implemented below:

    1. Set the module-level ``Debug`` flag from ``options.debug``.
    2. Print the requested help message and exit, if any help option is set.
    3. Retrieve the trimmed input files and create the output folders.
    4. Submit ``check_sample_assembly`` for each sample name to a thread pool.
    5. Call ``get_assembly_stats_all`` when ``assembly_stats`` is not empty.
    6. Call ``qc.BUSCO_check`` unless ``options.no_BUSCO`` is set.

    :param options: Object with the options read here: ``debug``,
        ``help_format``, ``help_BUSCO``, ``help_project``, ``help_multiqc``,
        ``single_end``, ``input``, ``detached``, ``output_folder``,
        ``threads`` and ``no_BUSCO``. Attributes ``pair`` and ``project``
        are set by this function.

    :returns: An empty tuple.

    .. seealso:: This function depends on other BacterialTyper and HCGB functions called:

        - :func:`BacterialTyper.scripts.BUSCO_caller.print_help_BUSCO`

        - :func:`BacterialTyper.scripts.multiQC_report.multiqc_help`

        - :func:`BacterialTyper.modules.qc.BUSCO_check`

        - :func:`HCGB.sampleParser`

        - :func:`HCGB.functions.aesthetics_functions`

        - :func:`HCGB.functions.time_functions`

        - :func:`HCGB.functions.main_functions`

        - :func:`HCGB.functions.file_functions`

    .. include:: ../../links.inc

    """

    ## init time
    start_time_total = time.time()

    ## debugging messages
    ## Debug is a module-level flag: it is rebound here on every call
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    ##################################
    ### show help messages if desired
    ##################################
    ## only the first help option set is shown, then the process exits
    if (options.help_format):
        ## help_format option
        help_info.help_fastq_format()
        exit()
    elif (options.help_BUSCO):
        ## information for BUSCO
        BUSCO_caller.print_help_BUSCO()
        exit()
    elif (options.help_project):
        ## information for project
        help_info.project_help()
        exit()
    elif (options.help_multiqc):
        ## information for Multiqc
        multiQC_report.multiqc_help()
        exit()

    ### set as default paired_end mode
    if (options.single_end):
        options.pair = False
    else:
        options.pair = True

    ## message header
    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Assembly module")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## Project mode as default
    ## detached mode: results go to options.output_folder;
    ## project mode: results go to the input folder
    ## NOTE: project_mode is assigned but not read again in this function
    project_mode = True
    if (options.detached):
        options.project = False
        project_mode = False
        outdir = os.path.abspath(options.output_folder)
    else:
        options.project = True
        outdir = input_dir

    ## get files
    pd_samples_retrieved = sampleParser.files.get_files(
        options, input_dir, "trim", ['_trim'], options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        HCGB_files.create_folder(outdir)
    ## outdir_dict is indexed below by sample name
    outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                            pd_samples_retrieved, "assemble",
                                            options.debug)

    ### call assemble using spades
    start_time_partial = start_time_total
    start_time_partial_assembly = start_time_partial

    ## optimize threads
    ## threads_job is given to each job; the number of workers is the
    ## total number of threads divided by it, truncated to an integer
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = HCGB_main.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        HCGB_aes.debug_message("options.threads: " + str(options.threads),
                               "yellow")
        HCGB_aes.debug_message("max_workers: " + str(max_workers_int),
                               "yellow")
        HCGB_aes.debug_message("cpu_here: " + str(threads_job), "yellow")

    # Group dataframe by sample name
    # NOTE(review): grouping by a list of one element returns tuple keys
    # when iterating on pandas >= 2.0, which would affect the
    # outdir_dict[name] lookup below -- confirm the pandas version in use
    sample_frame = pd_samples_retrieved.groupby(["name"])

    # We can use a with statement to ensure threads are cleaned up promptly
    print('+ Running modules SPADES...')
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        ## send for each sample
        ## dictionary: future -> sample name, used to report failures
        commandsSent = {
            executor.submit(check_sample_assembly, name, outdir_dict[name],
                            sorted(cluster["sample"].tolist()), threads_job):
            name
            for name, cluster in sample_frame
        }

        ## exceptions raised by a job are printed and the loop continues
        ## with the remaining jobs; the returned value is not used here
        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                data = cmd2.result()
            except Exception as exc:
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    ## functions.timestamp
    print("\n+ Assembly of all samples finished: ")
    start_time_partial = HCGB_time.timestamp(start_time_partial_assembly)

    ##
    ## NOTE(review): assembly_stats is not assigned within this function;
    ## presumably a module-level container filled while assembling each
    ## sample -- confirm where it is defined
    if (assembly_stats):
        ###################
        if Debug:
            HCGB_aes.debug_message("assembly_stats dictionary", "yellow")
            print(assembly_stats)

        ## create single file
        get_assembly_stats_all(assembly_stats, outdir, Debug)

    ### symbolic links
    ## only the message is printed here: no further call follows
    print("+ Retrieve all genomes assembled...")

    ### BUSCO check assembly
    if (options.no_BUSCO):
        print()
    else:
        ## the value returned is not used afterwards in this function
        results = qc.BUSCO_check(outdir, outdir, options, start_time_partial,
                                 "genome")

    ## print to file results
    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting Assembly module.")
    return ()
Esempio n. 21
0
def get_files_annotation(folder, debug):
    '''
    Locate the annotation-related files stored directly inside ``folder``.

    Code retrieved from the BacterialTyper database_generator.py script.
    Every entry returned by ``os.listdir`` is classified by its file-name
    suffix; when several entries fall in the same category, the one listed
    last is kept.

    :param folder: Directory to scan (sub-directories are not visited).
    :param debug: If true, report the paths found through ``debug_message``.
    :return: Tuple ``(genome, prot, gff, gbk)`` with the full path of each
        file, or an empty string for a category with no matching file.
    '''
    ## suffix -> category; GenBank files may end either in .gbk or .gbff
    category_by_suffix = {
        '.fna': 'genome',
        '.gff': 'gff',
        '.gbk': 'gbk',
        '.gbff': 'gbk',
        '.faa': 'prot',
    }
    located = {'genome': "", 'prot': "", 'gff': "", 'gbk': ""}

    for entry in os.listdir(folder):
        for suffix, category in category_by_suffix.items():
            if entry.endswith(suffix):
                located[category] = os.path.join(folder, entry)
                break

    ## debug messages
    if debug:
        debug_message("-----------------------------------------")
        debug_message("Return info get_files_download", color="yellow")
        for label in ('genome', 'prot', 'gff', 'gbk'):
            debug_message(label + ": " + located[label], color="yellow")

    return (located['genome'], located['prot'], located['gff'],
            located['gbk'])
Esempio n. 22
0
def get_assembly_stats_all(assembly_stats_dict, outdir, debug):
    """Merge the per-sample assembly statistics into one Excel summary.

    Creates ``<outdir>/report/assembly_stats/samples`` (through
    ``HCGB_files.create_subfolder``), copies each sample's individual Excel
    file into the ``samples`` folder and writes a combined
    ``summary_stats.xlsx`` (sheet ``all_data``) in ``assembly_stats``.

    :param assembly_stats_dict: Mapping ``sample_name -> entry`` where
        ``entry[0]`` is a dictionary holding the keys ``'Contig Stats'`` and
        ``'Scaffold Stats'`` and ``entry[1]`` is the path of the sample's
        Excel file.
    :param outdir: Project output directory.
    :param debug: If true, print debugging messages.
    :type assembly_stats_dict: dict
    :type outdir: string
    :type debug: bool

    :return: None. If ``assembly_stats_dict`` is empty only the folders are
        created and no summary file is written.
    """
    ## get all assembly stats
    outdir_report = HCGB_files.create_subfolder("report", outdir)
    final_dir = HCGB_files.create_subfolder("assembly_stats", outdir_report)
    final_sub_dir = HCGB_files.create_subfolder("samples", final_dir)

    #### summary and information
    column_names = [
        "Type", "Sample", "Total Sequences", "GC% Content",
        "Longest sequence", "Shortest sequence", "Median length",
        "Mean length", "Total Length (bp)", "L10", "N10", "L20", "N20",
        "L30", "N30", "L40", "N40", "L50", "N50"
    ]

    ## debugging messages
    if debug:
        HCGB_aes.debug_message("Create assembly statistic for all samples")

    ## nothing to summarise: relabelling an empty table would fail below
    if not assembly_stats_dict:
        return

    ## read from the argument, not from a module-level dictionary
    stats_frames = []
    for sample_name in assembly_stats_dict:
        sample_stats = assembly_stats_dict[sample_name][0]
        excel_file_stats = assembly_stats_dict[sample_name][1]

        if debug:
            HCGB_aes.debug_message("sample_name: " + sample_name, 'yellow')
            HCGB_aes.debug_message("excel: " + excel_file_stats, 'yellow')
            HCGB_aes.debug_message("contig stats dictionary: ", 'yellow')
            print(sample_stats['Contig Stats'])
            HCGB_aes.debug_message("scaffold stats dictionary: ", 'yellow')
            print(sample_stats['Scaffold Stats'])

        ## one frame for contigs and one for scaffolds, each tagged with
        ## its type and the sample it belongs to
        for stats_type, stats_key in (('contigs', 'Contig Stats'),
                                      ('scaffolds', 'Scaffold Stats')):
            frame = pd.DataFrame.from_dict(sample_stats[stats_key],
                                           orient='index').transpose()
            frame['type'] = stats_type
            frame['sample_name'] = sample_name
            stats_frames.append(frame)

        ## copy individual excel file
        shutil.copy(excel_file_stats, final_sub_dir)

    ## add all data: a single concat instead of growing the table per sample
    results_summary_toPrint_all = pd.concat(stats_frames, ignore_index=True)

    ## reorder columns: bring the trailing 'type' and 'sample_name' columns
    ## to the front, in that order
    cols = results_summary_toPrint_all.columns.tolist()
    cols = cols[-2:] + cols[:-2]
    results_summary_toPrint_all = results_summary_toPrint_all[cols]

    ## relabel columns; axis is given by keyword because pandas >= 2.0 no
    ## longer accepts it positionally
    results_summary_toPrint_all = results_summary_toPrint_all.set_axis(
        column_names, axis=1)

    ## write to excel; the context manager saves and closes the handle
    ## (ExcelWriter.save() was removed in pandas 2.0)
    name_excel_summary = final_dir + '/summary_stats.xlsx'
    with pd.ExcelWriter(name_excel_summary,
                        engine='xlsxwriter') as writer_summary:
        results_summary_toPrint_all.to_excel(writer_summary,
                                             sheet_name="all_data")
Esempio n. 23
0
def check_sample_assembly(name, sample_folder, files, threads):
    """Assemble a sample unless a successful assembly is already recorded.

    The presence of the file *sample_folder/.success_all* marks a finished
    assembly. When it exists, the contig and scaffold statistics are read
    back from the CSV files found in ``sample_folder``. Otherwise
    :func:`BacterialTyper.scripts.spades_assembler.run_module_assembly` is
    called and, unless it returns ``'FAIL'``, the success stamp is written.

    :param name: Sample name or tag to identify sample.
    :param sample_folder: Directory holding the assembly output.
    :param files: List containing files (fastq R1 & R2) for the sample.
    :param threads: Number of CPUs to use.
    :type name: string
    :type sample_folder: string
    :type files: list
    :type threads: integer

    :return: None. The result is stored under ``assembly_stats[name]``;
        nothing is stored when the assembly fails.

    .. note:: ``assembly_stats`` and ``Debug`` are not defined inside this
        function; they must be available in the enclosing scope.

    .. seealso:: This function depends on other BacterialTyper and HCGB functions called:

        - :func:`BacterialTyper.scripts.spades_assembler.run_module_assembly`

    """
    success_stamp = sample_folder + '/.success_all'

    ## no previous successful run: assemble now
    if not os.path.isfile(success_stamp):

        ## debug message
        if (Debug):
            HCGB_aes.debug_message(
                "spades_assembler.run_module_assembly call:", "yellow")
            call_fields = "\t".join(
                [name, sample_folder, files[0], files[1],
                 str(threads)])
            print("spades_assembler.run_module_assembly " + call_fields +
                  "\n")

        # Call spades_assembler
        outcome = spades_assembler.run_module_assembly(
            name, sample_folder, files[0], files[1], threads)

        if (outcome == 'FAIL'):
            print(
                "Some error occurred for sample %s while generating the assembly. "
                % name)
            return

        ## success stamp, then keep the list with stats dictionary and excel
        HCGB_time.print_time_stamp(success_stamp)
        assembly_stats[name] = outcome
        return

    ## previously assembled and succeeded: report when it happened
    previous_run = HCGB_time.read_time_stamp(success_stamp)
    print(
        colored(
            "\tA previous command generated results on: %s [%s]" %
            (previous_run, name), 'yellow'))

    ## recover the statistics written by that run
    sample_prefix = sample_folder + '/' + name
    contig_stats = HCGB_main.file2dictionary(
        sample_prefix + '_assembly-contigs.csv', ',')
    scaffold_stats = HCGB_main.file2dictionary(
        sample_prefix + '_assembly-scaffolds.csv', ',')

    ## populate main dictionary
    assembly_stats[name] = [{
        'Contig Stats': contig_stats,
        'Scaffold Stats': scaffold_stats
    }, sample_prefix + '_assembly_stats.xlsx']
Esempio n. 24
0
def parse_search_options(arg_dict):
    """Build the table of samples the search module will work on.

    The input is handled according to ``arg_dict.project`` or
    ``arg_dict.detached`` and, in detached mode, according to which of
    ``text_file``, ``annot_file`` or ``fasta_prot`` is set.

    :param arg_dict: Parsed options object. Attributes read here:
        ``input_folder``, ``project``, ``detached``, ``text_file``,
        ``annot_file``, ``fasta_prot``, ``annot_table``, ``batch`` and
        ``debug``. ``pair``, ``include_all`` and ``include_lane`` are
        overwritten in the project and annotation-file branches.

    :return: ``pd_samples_retrieved`` re-indexed by its ``new_name`` column.

    NOTE(review): only the project branch and the detached ``annot_file``
    branch assign ``pd_samples_retrieved``. Every other path that reaches the
    end of the function (detached BLAST results, detached batch FASTA, the
    final ``else``, or neither ``project`` nor ``detached`` set) fails at the
    ``set_index`` call with an UnboundLocalError. The single FASTA + annotation
    branch calls ``exit()`` even when the check succeeds. These branches look
    unfinished ("fill dataframe" placeholders) -- confirm the intended
    behaviour before relying on them.
    """

    ## absolute path of the folder given by the user
    outdir = os.path.abspath(arg_dict.input_folder)

    ## --------------------------------------- ##
    ## Project containing data
    ## --------------------------------------- ##
    if (arg_dict.project):
        print(colored('\t* BacDup project folder:.......[OK]', 'green'))

        ## set missing options
        ## presumably these are read by sampleParser.files.get_files -- TODO confirm
        arg_dict.pair = False
        arg_dict.include_all = True
        arg_dict.include_lane = True

        ## find samples previously parsed and prepared within a BacDup project structure
        ## protein fasta files: the 'sample' column becomes 'file_data'
        pd_proteins = sampleParser.files.get_files(arg_dict, outdir, "parse",
                                                   ["fa"], arg_dict.debug)
        pd_proteins = pd_proteins.drop(["dirname", "name", "ext", "tag"],
                                       axis=1)
        pd_proteins = pd_proteins.rename(index=str,
                                         columns={'sample': 'file_data'})
        pd_proteins['format'] = 'fasta'

        ## annotation tables: the 'sample' column becomes 'annot_table'
        pd_annot = sampleParser.files.get_files(arg_dict, outdir, "parse",
                                                ["annot_df.csv"],
                                                arg_dict.debug)
        pd_annot = pd_annot.drop(["dirname", "name", "ext", "tag"], axis=1)
        pd_annot = pd_annot.rename(index=str,
                                   columns={'sample': 'annot_table'})

        ## merge into pd_samples_retrieved (pd.merge with default arguments,
        ## i.e. joined on the columns both tables still share)
        pd_samples_retrieved = pd.merge(pd_proteins, pd_annot)

        ## debug messages
        if (arg_dict.debug):
            debug_message('pd_proteins:', 'yellow')
            HCGB_main.print_all_pandaDF(pd_proteins)

            debug_message('pd_annot:', 'yellow')
            HCGB_main.print_all_pandaDF(pd_annot)

            debug_message('pd_samples_retrieved:', 'yellow')
            HCGB_main.print_all_pandaDF(pd_samples_retrieved)

    ## --------------------------------------- ##
    ## data on multiple sources
    ## --------------------------------------- ##
    elif (arg_dict.detached):
        print(colored('\t* Detached mode:.......[OK]', 'green'))

        ## parse samples provided
        print()

        #########################################################
        ## BLAST raw results provided: either batch or single
        #########################################################
        ## NOTE(review): this whole sub-branch only validates the input; it
        ## never assigns pd_samples_retrieved.
        if (arg_dict.text_file):
            print(
                colored('\t* BLAST raw results provided:.......[OK]', 'green'))
            print()

            # *************************** ##
            ## Batch file provided
            # *************************** ##
            if (arg_dict.batch):
                ## debug messages
                if (arg_dict.debug):
                    debug_message('+++++++++++++++++++++++++++++++')
                    debug_message(
                        'Multiple BLAST results file provided option:',
                        'yellow')
                    debug_message('arg_dict.text_file: ' + arg_dict.text_file,
                                  'yellow')

                ## check if ok
                BacDup_functions.file_readable_check(arg_dict.text_file)

                print(
                    colored(
                        '\t* Multiple BLAST results files provided .......[OK]',
                        'green'))
                ## comma-separated batch file read into a dictionary
                ## (iterated below as sample -> file)
                dict_entries = HCGB_main.file2dictionary(
                    arg_dict.text_file, ',')

                ## check file is readable
                BacDup_functions.file_readable_check(arg_dict.annot_table)
                dict_entries_annot = HCGB_main.file2dictionary(
                    arg_dict.annot_table, ',')

                ## Check dictionaries contain same information
                ## (if the keys differ nothing is done and nothing is reported)
                if (dict_entries.keys() == dict_entries_annot.keys()):
                    for sample, files in dict_entries.items():
                        ## check annot_table and fasta_file headers are the same ##
                        return_code = dup_searcher.check_annot_table(
                            dict_entries_annot[sample], files, 'BLAST',
                            arg_dict.debug)
                        if not (return_code):
                            print(
                                'Process will continue but sample %s would be discarded'
                                % sample)
                        else:
                            print()
                            ## fill dataframe pd_samples_retrieved
                            ## (placeholder: not implemented yet)

            # *************************** ##
            ## single file provided
            # *************************** ##
            else:
                ## check annot_table and fasta_file headers are the same ##
                return_code = dup_searcher.check_annot_table(
                    arg_dict.annot_table, arg_dict.text_file, 'BLAST',
                    arg_dict.debug)
                if not (return_code):
                    print('Process will stop here. Please check input files')
                    exit()
                else:
                    print()
                    ## fill dataframe pd_samples_retrieved
                    ## (placeholder: not implemented yet)

        #########################################################
        ## annotations file provided: either batch or single
        #########################################################
        elif (arg_dict.annot_file):
            ## debug messages
            ## NOTE(review): the first message below mentions BLAST results
            ## although this is the annotation-file branch -- likely copy/paste.
            if (arg_dict.debug):
                debug_message('Multiple BLAST results file provided option:',
                              'yellow')
                debug_message('arg_dict.annot_file: ' + arg_dict.annot_file,
                              'yellow')

            ## get input info
            df_accID = input_parser.parse_options(arg_dict)
            if (arg_dict.debug):
                debug_message('df_accID', 'yellow')
                print(df_accID)

            ## parse info
            input_parser.parse_information(arg_dict, df_accID, outdir)

            ## set missing options
            arg_dict.pair = False
            arg_dict.include_all = True
            arg_dict.include_lane = True

            ## find samples previously parsed and prepared within a BacDup project structure
            pd_proteins = sampleParser.files.get_files(arg_dict, outdir,
                                                       "parse", ["fa"],
                                                       arg_dict.debug)
            pd_annot = sampleParser.files.get_files(arg_dict, outdir, "parse",
                                                    ["annot_df.csv"],
                                                    arg_dict.debug)

            ## merge into pd_samples_retrieved
            ## NOTE(review): unlike the project branch, the two tables are
            ## stacked with pd.concat (not merged), the 'sample' column is not
            ## renamed and no 'format' column is added -- confirm this is intended.
            frames = [pd_proteins, pd_annot]
            pd_samples_retrieved = pd.concat(frames, sort=True, join='outer')

            if (arg_dict.debug):
                debug_message('pd_samples_retrieved', 'yellow')
                print(pd_samples_retrieved)

        #########################################################
        ## CDS fasta and annotations provided: either batch or single
        #########################################################
        ## NOTE(review): this sub-branch never assigns pd_samples_retrieved.
        elif arg_dict.fasta_prot:

            # *************************** ##
            ## Batch file provided
            # *************************** ##
            if (arg_dict.batch):
                print(
                    colored('\t* Multiple FASTA files provided .......[OK]',
                            'green'))

                ## debug messages
                if (arg_dict.debug):
                    debug_message('+++++++++++++++++++++++++++++++')
                    debug_message(
                        'Multiple Protein FASTA files provided option:',
                        'yellow')
                    debug_message(
                        'arg_dict.fasta_prot: ' + arg_dict.fasta_prot,
                        'yellow')

                ## check if ok
                BacDup_functions.file_readable_check(arg_dict.fasta_prot)
                dict_entries = HCGB_main.file2dictionary(
                    arg_dict.fasta_prot, ',')

                ## check file is readable
                BacDup_functions.file_readable_check(arg_dict.annot_table)
                print(
                    colored(
                        '\t* Multiple annotation tables provided .......[OK]',
                        'green'))
                dict_entries_annot = HCGB_main.file2dictionary(
                    arg_dict.annot_table, ',')

                ## Check dictionaries contain right information
                ## (if the keys differ nothing is done and nothing is reported)
                if (dict_entries.keys() == dict_entries_annot.keys()):
                    for sample, files in dict_entries.items():
                        ## check annot_table and fasta_file headers are the same ##
                        return_code = dup_searcher.check_annot_table(
                            dict_entries_annot[sample], files, 'fasta',
                            arg_dict.debug)
                        if not (return_code):
                            print(
                                'Process will continue but sample %s would be discarded'
                                % sample)
                        else:
                            print()
                            ## fill dataframe pd_samples_retrieved
                            ## (placeholder: not implemented yet)

            # *************************** ##
            ## single file provided
            # *************************** ##
            else:
                print(
                    colored('\t* Protein FASTA file provided .......[OK]',
                            'green'))
                BacDup_functions.file_readable_check(arg_dict.fasta_prot)

                ## check file is readable
                print(
                    colored('\t* An annotation table provided .......[OK]',
                            'green'))
                BacDup_functions.file_readable_check(arg_dict.annot_table)

                ## check annot_table and fasta_file headers are the same ##
                return_code = dup_searcher.check_annot_table(
                    arg_dict.annot_table, arg_dict.fasta_prot, 'fasta',
                    arg_dict.debug)
                if not (return_code):
                    print('Process will stop here. Please check input files')
                    exit()
                else:
                    print()
                    ## fill dataframe pd_samples_retrieved
                    ## NOTE(review): the process also exits on success here;
                    ## looks like a placeholder until the dataframe is filled.
                    exit()

        ### Detached mode without any recognised input option
        else:
            ## nothing is done here; pd_samples_retrieved stays unassigned
            print()

    ## return information
    ## NOTE(review): assumes a 'new_name' column exists (presumably provided
    ## by sampleParser.files.get_files -- TODO confirm); raises
    ## UnboundLocalError when no branch above assigned pd_samples_retrieved.
    pd_samples_retrieved = pd_samples_retrieved.set_index('new_name')
    return (pd_samples_retrieved)
Esempio n. 25
0
def run_search(arg_dict):
    """Main function of the search module in BacDup package.

    This module searches and create gene duplication analysis.

    It allows the user to provide either a previous parsed data project (NCBI Genbank IDs, taxonomy or user
    annotation data) or a single or multiple samples.

    For every sample returned by :func:`parse_search_options` the filtered
    results and the duplicate annotation are generated (or re-used when the
    ``.annot_success`` time stamp is already present), and a per-sample
    summary is collected into ``<outdir>/report/dups/info_annot.csv``.

    :param arg_dict: Parsed options object. Besides the help flags, this
        function reads ``input_folder``, ``detached``, ``debug``, ``pident``,
        ``evalue``, ``percentage`` and ``bitscore``, and sets ``project``.

    :return: Empty tuple.
    """

    ## help messages: print the one requested and stop
    if (arg_dict.input_help):
        help_input()
        exit()

    if (arg_dict.blast_help):
        info.blast_help()
        exit()

    if (arg_dict.project_help):
        info.project_help()
        exit()

    if (arg_dict.detached_mode_help):
        info.detached_mode()
        exit()

    ### Start the analysis
    BacDup_functions.pipeline_header('BacDup')
    HCGB_aes.boxymcboxface("Search module")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## init time
    start_time_total = time.time()

    ## absolute path for in & out
    outdir = os.path.abspath(arg_dict.input_folder)

    ## project or detached?
    if arg_dict.detached:
        arg_dict.project = False
        ## output folder
        print("\n+ Create output folder(s):")
        HCGB.functions.files_functions.create_folder(outdir)
    else:
        arg_dict.project = True

    ## debug messages
    if (arg_dict.debug):
        debug_message('+++++++++++++++++++++++++++++++')
        debug_message('Project/Detached option:', 'yellow')
        debug_message('arg_dict.detached: ' + str(arg_dict.detached), 'yellow')
        debug_message('arg_dict.project: ' + str(arg_dict.project), 'yellow')
        debug_message('outdir:' + outdir, 'yellow')
        debug_message('+++++++++++++++++++++++++++++++')

    ## get files
    print()
    HCGB_aes.print_sepLine("-", 50, False)
    print('+ Getting information provided... ')
    print('+ Several options available:')
    print('\t* BacDup project folder with initiated data')
    print('\t* Single/Multiple Annotation file:')
    print('\t  |-- GenBank format files')
    print('\t  |-- GFF files +  Reference fasta files required')
    print('\t* Single/Multiple raw BLAST results files')
    print('\t* Single/Multiple fasta proteins + annotation table')

    print("""\n\n**** NOTE: **** 
    For additional options (e.g. Single/Multiple NCBI GenBank or taxonomy IDs)
    use the input module to accommodate accordingly """)
    time.sleep(1)

    print()

    ## parse options
    pd_samples_retrieved = parse_search_options(arg_dict)

    ## time stamp (the returned value is not needed here)
    HCGB_time.timestamp(start_time_total)

    ## folders for each sample
    dict_search_folders = HCGB.functions.files_functions.outdir_project(
        outdir, arg_dict.project, pd_samples_retrieved, "search",
        arg_dict.debug)

    dict_dup_folders = HCGB.functions.files_functions.outdir_project(
        outdir, arg_dict.project, pd_samples_retrieved, "dups", arg_dict.debug)

    dict_parse_folders = HCGB.functions.files_functions.outdir_project(
        outdir, arg_dict.project, pd_samples_retrieved, "parse",
        arg_dict.debug)

    ## create results: start from an empty table with the expected columns
    ## and gather one entry per sample; everything is concatenated once at
    ## the end (DataFrame.append no longer exists in pandas >= 2.0)
    summary_entries = [
        pd.DataFrame(columns=BacDup_functions.columns_dup_table())
    ]
    for sample, folder in dict_search_folders.items():

        annot_timestamp = os.path.join(dict_dup_folders[sample],
                                       '.annot_success')
        dup_annot_file = os.path.join(dict_dup_folders[sample],
                                      'dup_annot.csv')

        ## annotation
        annot_table_file = pd_samples_retrieved.loc[sample, 'annot_table']

        if (not HCGB.functions.files_functions.is_non_zero_file(
                annot_timestamp)):

            ## get results
            file_data = pd_samples_retrieved.loc[sample, 'file_data']
            file_format = pd_samples_retrieved.loc[sample, 'format']
            filtered_data = dup_searcher.filter_data(
                sample, file_data, file_format, arg_dict.pident,
                arg_dict.evalue, arg_dict.percentage, arg_dict.bitscore,
                folder, arg_dict.debug)

            ## timestamps
            filter_timestamp = os.path.join(dict_dup_folders[sample],
                                            '.filter_success')
            if (not HCGB.functions.files_functions.is_non_zero_file(
                    filter_timestamp)):
                # save results as a .csv file
                sort_csv = os.path.abspath(
                    os.path.join(dict_dup_folders[sample],
                                 'filtered_results.csv'))
                filtered_data.to_csv(sort_csv, header=True, index=False)

                ## print time stamp
                HCGB_time.print_time_stamp(filter_timestamp)
            else:
                read_time = HCGB_time.read_time_stamp(filter_timestamp)
                print(
                    colored(
                        "\t+ Filter results already available for sample %s [%s]"
                        % (sample, read_time), 'green'))

            ## get annotation
            (dup_annot_df, data2add_entry) = dup_searcher.get_dupannot(
                sample, filtered_data, annot_table_file, arg_dict.debug)

            ## per-sample summary
            info_dup_file = os.path.join(dict_dup_folders[sample],
                                         'info_dup.csv')
            data2add_entry.to_csv(info_dup_file, header=True, index=False)

            ## save into file
            dup_annot_df.to_csv(dup_annot_file, header=True)

            ## print time stamp
            HCGB_time.print_time_stamp(annot_timestamp)

        else:
            read_time = HCGB_time.read_time_stamp(annot_timestamp)
            print(
                colored(
                    "\t+ Duplicate annotation already available for sample %s [%s]"
                    % (sample, read_time), 'green'))

            ## add info for each
            dup_annot_df = HCGB_main.get_data(dup_annot_file, ',',
                                              "index_col=0")
            annot_table = HCGB_main.get_data(annot_table_file, ',',
                                             "index_col=0")
            data2add_entry = dup_searcher.get_dup_stats(
                sample, dup_annot_df, annot_table, arg_dict.debug)

        ## add genome length data: empty unless length_df.csv is available
        data2add_entry['genome_len'] = ''
        len_df_file = os.path.join(dict_parse_folders[sample], 'length_df.csv')
        if os.path.isfile(len_df_file):
            len_data = HCGB_main.get_data(len_df_file, ',', "header=None")
            data2add_entry['genome_len'] = len_data[1].sum()

        ## DataFrame.append used to turn a Series into a single row; keep
        ## that behaviour in case the entry is a Series
        if isinstance(data2add_entry, pd.Series):
            data2add_entry = data2add_entry.to_frame().T

        summary_entries.append(data2add_entry)

    ## merge data, keeping the index of every entry
    data2add = pd.concat(summary_entries, ignore_index=False)

    ### report generation
    HCGB_aes.boxymcboxface("Summarizing duplicated search")
    outdir_report = HCGB.functions.files_functions.create_subfolder(
        "report", outdir)
    dups_report = HCGB.functions.files_functions.create_subfolder(
        "dups", outdir_report)

    ## add data2add
    data2add.to_csv(os.path.join(dups_report, 'info_annot.csv'),
                    index=True,
                    header=True)

    ## maybe add a summary of the files?

    print("\n*************** Finish *******************")
    HCGB_time.timestamp(start_time_total)

    print("+ Exiting search module.")
    return ()
Esempio n. 26
0
def agrvate_call(sample, assembly_file, folder, debug=False):
    """Run AgrVATE on an assembly and collect the results it generates.

    The executable returned by ``set_config.get_exe('agrvate')`` is run with
    mummer (``-m``) and forcing the results folder (``-f``); standard output
    and error are redirected to ``agrvate_cmd.log`` and ``agrvate_cmd.err``
    in ``folder``. If the folder ``<assembly name>-results`` shows up inside
    ``folder`` it is renamed to ``agrvate_results``, the error report is
    moved into it, and the summary, agr group and error report tables are
    written to ``<sample>_agr_results.xlsx``.

    See https://github.com/VishnuRaghuram94/AgrVATE#results for additional
    details on the files generated.

    :param sample: Sample name; added to the tables and used to name the
        Excel file.
    :param assembly_file: Assembly fasta file; the text before ``.fna`` in
        its base name is used to locate the results.
    :param folder: Folder where logs and results are stored.
    :param debug: If true, print debugging messages.

    :return: Dataframe with the summary and error report columns plus
        ``operon_fna`` and ``agr_operon_xlsx``; an empty dataframe if no
        results folder was found.
    """

    ## prepare call
    log_call = os.path.join(folder, "agrvate_cmd.log")
    err_call = os.path.join(folder, "agrvate_cmd.err")
    agrvate_bin = set_config.get_exe('agrvate')

    ## system call: use mummer (-m) and force results folder (-f)
    ## The returned status is not inspected: success is decided below by
    ## the presence of the results folder.
    cmd_call = "%s -i %s -m -f >  %s 2> %s " % (agrvate_bin, assembly_file,
                                                log_call, err_call)
    HCGB_sys.system_call(cmd_call)

    ## check results
    results = pd.DataFrame()

    ## check folder is created
    assembly_file_name = os.path.basename(assembly_file).split('.fna')[0]
    original_results_folder = os.path.join(folder,
                                           assembly_file_name + '-results')
    results_folder = os.path.join(folder, 'agrvate_results')

    if os.path.isdir(original_results_folder):
        print("+ Results folder generated OK")
        print("+ Check results generated:")

        ## rename folder and move the error report inside it
        os.rename(original_results_folder, results_folder)
        os.rename(
            os.path.join(folder, assembly_file_name + '.fna-error-report.tab'),
            os.path.join(results_folder, 'error_report.tab'))

        ## get all files
        list_files = HCGB_main.get_fullpath_list(results_folder)

        ## write to excel: the context manager saves and closes the handle
        ## even if a step fails (ExcelWriter.save() was removed in pandas 2.0)
        file_name_Excel = os.path.join(folder, sample + '_agr_results.xlsx')
        with pd.ExcelWriter(file_name_Excel,
                            engine='xlsxwriter') as writer_Excel:

            ## summary tab
            summary_tab_file = [
                s for s in list_files if s.endswith("summary.tab")
            ][0]
            summary_tab = HCGB_main.get_data(summary_tab_file, '\t',
                                             options="")
            summary_tab['sample'] = sample

            ## columns
            # agr_group: gp1/gp2/gp3/gp4. 'u' means unknown.
            ##           If multiple agr groups were found (col 5 = m),
            ##           the displayed agr group is the majority/highest confidence.
            # match_score: maximum 15; 0 means untypeable; < 5 means low confidence.
            # canonical_agrD: 1 means canonical; 0 means non-canonical; u means unknown.
            # multiple_agr:  s means single, m means multiple, u means unknown
            ##               Multiple groups are found likely due to multiple S. aureus isolates in sequence
            # frameshifts: Number found in CDS of extracted agr operon ('u' if agr operon not extracted)

            ## debug messages
            if debug:
                HCGB_aes.debug_message("agrvate results: Summary tab file",
                                       'yellow')
                print(summary_tab_file)
                print(summary_tab)

            ## add summary results to all results
            del summary_tab['#filename']
            results = summary_tab.copy()

            ## save summary_tab into excel: tab summary
            summary_tab.to_excel(writer_Excel, sheet_name='summary')

            ## agr_gp tab
            agr_gp_tab_file = [
                s for s in list_files if s.endswith("agr_gp.tab")
            ][0]
            if HCGB_files.is_non_zero_file(agr_gp_tab_file):
                agr_gp_tab = HCGB_main.get_data(agr_gp_tab_file, '\t',
                                                options='header=None')
                agr_gp_tab.columns = [
                    'contig', 'agr', 'evalue', 'identity', 'start', 'end'
                ]
                agr_gp_tab['sample'] = sample

                ## columns
                ## Assembly Contig ID
                ## ID of matched agr group kmer
                ## evalue
                ## Percentage identity of match
                ## Start position of kmer alignment on input sequence
                ## End position of kmer alignment on input sequence

                ## debug messages
                if debug:
                    HCGB_aes.debug_message("agrvate results: agr_gp file",
                                           'yellow')
                    print(agr_gp_tab_file)
                    print(agr_gp_tab)

                ## save agr_gp_tab file into excel: tab operon
                agr_gp_tab.to_excel(writer_Excel, sheet_name='operon')

            ## agr_operon fna: may be missing, in which case an empty
            ## string is stored. Only the "file not found" case is handled;
            ## any other error is allowed to surface.
            operon_files = [
                s for s in list_files if s.endswith("agr_operon.fna")
            ]
            if operon_files:
                agr_operon_fna_file = operon_files[0]
                ## debug messages
                if debug:
                    HCGB_aes.debug_message(
                        "agrvate results: agr_operon file", 'yellow')
                    print(agr_operon_fna_file)

                results['operon_fna'] = agr_operon_fna_file
            else:
                results['operon_fna'] = ''

            ## error report
            error_report_file = [
                s for s in list_files if s.endswith("error_report.tab")
            ][0]
            error_report = HCGB_main.get_data(error_report_file, '\t',
                                              options="")
            del error_report['#input_name']

            ## debug messages
            if debug:
                HCGB_aes.debug_message(
                    "agrvate results: error_report.tab file", 'yellow')
                print(error_report_file)
                print(error_report)

            ## save error_report file into excel: tab steps
            error_report.to_excel(writer_Excel, sheet_name='steps')

            ## merge results: error report columns side by side with summary
            results = pd.concat([results, error_report], axis=1)

        ## add to pandas dataframe
        results['agr_operon_xlsx'] = file_name_Excel

    ## debug messages
    if debug:
        HCGB_aes.debug_message("agrvate results", 'yellow')
        HCGB_main.print_all_pandaDF(results)

    return (results)
Esempio n. 27
0
def gbf_parser(gbf_file, list_out_files, debug=False):
    """Yield one protein record per CDS feature found in a GenBank file.

    This is a generator. While it is consumed, two tables are filled: an
    annotation table with one row per CDS and a table with the length of
    every GenBank record. Only after the last record has been consumed are
    both tables written to disk as CSV.

    :param gbf_file: GenBank file given to ``SeqIO.parse``.
    :param list_out_files: Indexable collection of output paths; position 1
        receives the annotation table and position 2 the record lengths.
    :param debug: When True, verbose debugging information is printed.
    :returns: Generator of ``SeqRecord`` objects whose id is the CDS
        identifier built from feature type, record id, coordinates and strand.
    """

    ## tables filled while parsing
    ## annotation columns are shared through columns_annot_table()
    columns = columns_annot_table()
    annot_df = pd.DataFrame(data=None, columns=columns)
    genome_length = pd.DataFrame(data=None, columns=["length"])

    for rec in SeqIO.parse(gbf_file, "genbank"):
        ## keep the record length (needed for BioCircos plotting)
        rec_id = rec.id
        genome_length.loc[rec_id, ["length"]] = [len(rec.seq)]

        ## debug messages
        if debug:
            debug_message('GenBank record', 'yellow')
            print(rec)

        ## loop through features
        for feature in rec.features:

            ## duplicate genes analysis only needs coding regions
            if feature.type != "CDS":
                continue

            cds_start = feature.location.nofuzzy_start
            cds_end = feature.location.nofuzzy_end
            genome_seq = rec.seq[cds_start:cds_end]

            ## features on the reverse strand are read as reverse complement
            on_forward = int(feature.strand) > 0
            strand = "pos" if on_forward else "neg"
            if not on_forward:
                genome_seq = genome_seq.reverse_complement()

            ## unique identifier for each entry
            protID = "_".join(
                (feature.type, rec_id, str(cds_start), str(cds_end), strand))
            annot_df.loc[protID, ["rec_id", "start", "end", "strand"]] = [
                rec_id, cds_start, cds_end, strand
            ]
            qualif = feature.qualifiers
            pseudo = False

            ## debug messages
            if debug:
                debug_message('protID: ' + protID, 'yellow')
                for label, item in (('qualif: ', qualif),
                                    ('feature: ', feature),
                                    ('genome_seq: ', genome_seq)):
                    debug_message(label, 'yellow')
                    print(item)

            pseudo_seq = ""
            ## fill dataframe with the qualifiers that have a column
            for key, values in qualif.items():
                if key not in columns:
                    continue

                ## only the first value of each qualifier is stored
                annot_df.loc[protID, [key]] = [values[0]]

                if key != "pseudo":
                    continue

                ####################################
                ## Pseudogenes: translate the genomic sequence ourselves
                ####################################
                pseudo = True
                annot_df.loc[protID, ["pseudo"]] = ["True"]
                table_code = qualif["transl_table"][0]
                pseudo_seq = genome_seq.translate(table=table_code,
                                                  to_stop=False)
                ## drop a terminal stop symbol, keep internal ones
                if pseudo_seq.endswith("*"):
                    pseudo_seq = pseudo_seq[:-1]

                ## debug messages
                if debug:
                    print("***************************************")
                    debug_message('Pseudogene: ', 'yellow')
                    print("***************************************")
                    for label, item in (
                        ('feature.location.nofuzzy_start: ', cds_start),
                        ('feature.location.nofuzzy_end: ', cds_end),
                        ('Translation table code: ', table_code),
                        ('genome_seq: ', genome_seq),
                        ('pseudo_seq: ', pseudo_seq),
                    ):
                        debug_message(label, 'yellow')
                        print(item)

            ## choose the sequence of the fasta entry
            if not pseudo:
                ## translation provided by GenBank
                gene_seq = Seq.Seq(qualif["translation"][0])
            elif len(pseudo_seq) != 0:
                ## pseudogenes have no translation item: use our own
                gene_seq = pseudo_seq
            else:
                ## placeholder when no translation could be produced
                gene_seq = Seq.Seq('***')

            yield (SeqRecord(gene_seq, protID, "", ""))

    ## print tables to file once every record has been processed
    annot_df.to_csv(list_out_files[1], header=True)
    genome_length.to_csv(list_out_files[2], header=False)

    ## debug messages
    if debug:
        debug_message('annot_df: ', 'yellow')
        print(annot_df)

    return ()
Esempio n. 28
0
def run_input(arg_dict):
    """Entry point of the input_parser module in the BacDup package.

    Prepares the data required by the later gene duplication analysis. The
    user may provide a single sample, multiple samples, NCBI GenBank IDs or
    NCBI taxonomy IDs; the annotation data is retrieved, parsed and a summary
    table is written into the report folder.

    :param arg_dict: Options object accessed by attribute. This function
        reads ``input_help``, ``output_folder``, ``assembly_level``,
        ``section``, ``detached`` and ``debug``; it sets ``project`` and
        fills in defaults for ``assembly_level`` and ``section``.
    :returns: An empty tuple.
    """

    ## help message requested: show it and stop
    if arg_dict.input_help:
        help_input()
        exit()

    BacDup_functions.pipeline_header('BacDup')
    HCGB_aes.boxymcboxface("Preparing input files")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## init time
    start_time_total = time.time()

    ## absolute path for output
    outdir = os.path.abspath(arg_dict.output_folder)

    ## output folder
    print("\n+ Create output folder(s):")
    HCGB_files.create_folder(outdir)

    ## set defaults for options left empty
    for option_name, default_value in (('assembly_level', 'complete'),
                                       ('section', 'genbank')):
        if not getattr(arg_dict, option_name):
            setattr(arg_dict, option_name, default_value)

    ## project or detached? project mode is simply "not detached"
    arg_dict.project = not arg_dict.detached
    if arg_dict.project:
        print(
            "+ Generate a directory containing information within the project folder provided"
        )
        final_dir = HCGB_files.create_subfolder("info", outdir)
    else:
        final_dir = data_dir = outdir

    ## debug messages
    if arg_dict.debug:
        separator = '+++++++++++++++++++++++++++++++'
        debug_message(separator)
        debug_message('Project/Detached option:', 'yellow')
        debug_message('arg_dict.detached: ' + str(arg_dict.detached), 'yellow')
        debug_message('arg_dict.project: ' + str(arg_dict.project), 'yellow')
        debug_message('outdir:' + outdir, 'yellow')
        debug_message('final_dir:' + final_dir, 'yellow')
        debug_message(separator)

    ## list the kinds of input accepted
    print()
    HCGB_aes.print_sepLine("-", 50, False)
    menu_lines = (
        '+ Getting input information provided... ',
        '+ Several options available:',
        '\t* Single/Multiple Annotation file:',
        '\t  |-- GenBank format files',
        '\t  |-- GFF files +  Reference fasta files required',
        '\n\t* Single/Multiple NCBI GenBank IDs',
        '\n\t* Single/Multiple NCBI taxonomy IDs + Options',
        '\n\t* A previous BacDup project folder',
        '\n+ Check the option provided...',
    )
    for menu_line in menu_lines:
        print(menu_line)
    time.sleep(1)

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    #################################################
    ## Parse and obtain the type of input information provided
    #################################################
    ## pd.DataFrame: 'new_name','folder','genus',
    ##               'species','taxonomy','genome',
    ##               'annot_file','format_annot_file', 'proteins',
    ##               'plasmids_number','plasmids_ID'))
    df_accID = parse_options(arg_dict)

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_partial)

    ## parse information accordingly
    parse_information(arg_dict, df_accID, outdir)

    ### report generation
    HCGB_aes.boxymcboxface("Summarizing input files")
    outdir_report = HCGB_files.create_subfolder("report", outdir)
    input_report = HCGB_files.create_subfolder("input", outdir_report)

    ## store the whole sample table as csv in the report folder
    info_csv = os.path.join(input_report, 'info.csv')
    df_accID.to_csv(info_csv, index=True, header=True)

    ## maybe add a summary of the files?

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting Input module.")
    return ()
Esempio n. 29
0
def parse_options(arg_dict):
    """Identify the kind of input provided and build the sample table.

    Exactly one input type is handled per call, checked in this order:
    annotation file(s), NCBI GenBank ID(s), NCBI taxonomy ID(s) and, last,
    a previous BacDup project folder (not implemented yet).

    :param arg_dict: Options object accessed by attribute. Depending on the
        input type this function reads ``output_folder``, ``annot_file``,
        ``batch``, ``sample_name``, ``ref_file``, ``GenBank_id``, ``tax_id``,
        ``db_folder``, ``assembly_level``, ``section``, ``k_random``,
        ``dry_run``, ``project`` and ``debug``. ``annot_file``, ``ref_file``
        and ``GenBank_id`` (batch mode) are replaced by their absolute path.
    :returns: pandas DataFrame indexed by ``new_name`` describing each
        sample. When no annotation/GenBank/taxonomy option is given the
        table is empty but still carries the expected columns.

    The process is terminated with ``exit()`` after listing the available
    entries when taxonomy IDs are provided together with ``dry_run``.
    """

    outdir = os.path.abspath(arg_dict.output_folder)

    ## TODO: Now set as mutually_exclusive group. It might be Set to multiple options
    ## ATTENTION: df_accID merge generated dataframe

    ## --------------------------------------- ##
    ## GFF or GBF file
    ## --------------------------------------- ##
    if arg_dict.annot_file:
        df_accID = _accID_from_annot_files(arg_dict)

    ## --------------------------------------- ##
    ## NCBI RefSeq/Genbank IDs: GCA_XXXXXXXX.1; GCF_XXXXXXXXX.1
    ## --------------------------------------- ##
    elif arg_dict.GenBank_id:
        df_accID = _accID_from_GenBank_ids(arg_dict, outdir)

    ## --------------------------------------- ##
    ## NCBI Taxonomy ID:
    ## --------------------------------------- ##
    elif arg_dict.tax_id:
        df_accID = _accID_from_tax_ids(arg_dict, outdir)

    ## --------------------------------------- ##
    ## Previous BacDup analysis folder (or nothing provided)
    ## --------------------------------------- ##
    else:
        if arg_dict.project:
            print(
                colored(
                    '\t* A previous BacDup analysis project folder:.......[OK]',
                    'green'))
            ## TODO: fill df_accID with the data stored in the project folder

        ## Always bind df_accID: without this the set_index() call below
        ## failed with UnboundLocalError for this branch.
        df_accID = pd.DataFrame(
            columns=(BacDup_functions.columns_accID_table()))

    ## Returns dataframe with information
    df_accID = df_accID.set_index('new_name')
    return (df_accID)


def _get_db_folder(arg_dict, outdir):
    """Create and return the database folder: user supplied or <outdir>/db."""
    if arg_dict.db_folder:
        return HCGB_files.create_folder(os.path.abspath(arg_dict.db_folder))
    return HCGB_files.create_subfolder("db", outdir)


def _accID_from_annot_files(arg_dict):
    """Build the sample table from GenBank/GFF annotation file(s) provided."""
    arg_dict.annot_file = os.path.abspath(arg_dict.annot_file)

    # *************************** ##
    ## multiple files provided
    # *************************** ##
    if arg_dict.batch:
        ## debug messages
        if arg_dict.debug:
            debug_message('+++++++++++++++++++++++++++++++')
            debug_message('Multiple annotation file provided option:',
                          'yellow')
            debug_message('arg_dict.annot_file: ' + arg_dict.annot_file,
                          'yellow')

        ## check if ok
        BacDup_functions.file_readable_check(arg_dict.annot_file)

        print(
            colored('\t* Multiple annotation files provided .......[OK]',
                    'green'))
        ## comma separated file: sample name, annotation file
        dict_entries = HCGB_main.file2dictionary(arg_dict.annot_file, ',')

        ## debug messages
        if arg_dict.debug:
            debug_message('dict_entries: ', 'yellow')
            debug_message(dict_entries, 'yellow')
            debug_message('+++++++++++++++++++++++++++++++\n\n')

    # *************************** ##
    ## single file provided
    # *************************** ##
    else:
        print(colored('\t* Annotation file:.......[OK]', 'green'))
        sample_name = arg_dict.sample_name if arg_dict.sample_name else "sample"
        dict_entries = {sample_name: arg_dict.annot_file}

    ## create dataframe df_accID to match other formats
    df_accID = pd.DataFrame(columns=(BacDup_functions.columns_accID_table()))

    for name, file_annot in dict_entries.items():
        file_annot = os.path.abspath(file_annot)

        ## fields that cannot be obtained from a plain annotation file
        genome = ""
        prot = ""
        plasmid_count = ""
        plasmid_id = ""

        ## debug messages
        if arg_dict.debug:
            debug_message('+++++++++++++++++++++++++++++++')
            debug_message(
                'dict_entries check annotation files provided option:',
                'yellow')
            debug_message('name: ' + name, 'yellow')
            debug_message('file_annot: ' + file_annot, 'yellow')

        ## check file is valid
        BacDup_functions.file_readable_check(file_annot)

        ## get format (local name avoids shadowing the builtin format())
        annot_format = format_checker.is_format(file_annot, arg_dict.debug)

        if arg_dict.debug:
            debug_message('format: ' + annot_format, 'yellow')

        ## parse accordingly
        taxonomy = ""
        organism = ""
        taxonomy_string = ""
        genus = ""
        if annot_format == 'gbk':
            ## get information from each sample
            ## plasmid_count, plasmid_id not available
            (taxonomy,
             organism) = BacDup.scripts.functions.get_gbk_information(
                 file_annot, arg_dict.debug)

        elif annot_format == 'gff' and arg_dict.ref_file:
            ## GFF files need the reference fasta file
            arg_dict.ref_file = os.path.abspath(arg_dict.ref_file)
            BacDup_functions.file_readable_check(arg_dict.ref_file)

            if arg_dict.batch:
                ## comma separated file: sample name, reference file
                ref_entries = HCGB_main.file2dictionary(
                    arg_dict.ref_file, ',')
                genome = ref_entries[name]
            else:
                genome = arg_dict.ref_file

        ## save into dataframe
        if len(taxonomy) > 1:
            genus = taxonomy[-1]
            taxonomy_string = ";".join(taxonomy)

        dir_path = os.path.abspath(os.path.dirname(file_annot))
        df_accID.loc[len(df_accID)] = (name, dir_path, genus, organism,
                                       taxonomy_string, genome, file_annot,
                                       annot_format, prot, plasmid_count,
                                       ";".join(plasmid_id))

    return df_accID


def _accID_from_GenBank_ids(arg_dict, outdir):
    """Download the NCBI GenBank/RefSeq ID(s) provided; return the sample table."""
    ## get database path
    db_folder = _get_db_folder(arg_dict, outdir)

    ## debug messages
    if arg_dict.debug:
        debug_message('+++++++++++++++++++++++++++++++')
        debug_message('GenBank ID option:', 'yellow')
        debug_message('db_folder: ' + db_folder, 'yellow')

    # *************************** ##
    ## batch file
    # *************************** ##
    if arg_dict.batch:
        arg_dict.GenBank_id = os.path.abspath(arg_dict.GenBank_id)

        ## debug messages
        if arg_dict.debug:
            debug_message('GenBank ID batch file provided:', 'yellow')
            debug_message('arg_dict.GenBank_id: ' + arg_dict.GenBank_id,
                          'yellow')

        ## check is a file and readable
        BacDup_functions.file_readable_check(arg_dict.GenBank_id)

        print(
            colored('\t* Multiple NCBI GenBank IDs in a file .......[OK]',
                    'green'))
        print()

        ## read IDs into a list, discarding empty lines
        strains2get = HCGB_main.readList_fromFile(arg_dict.GenBank_id)
        strains2get = list(filter(None, strains2get))

        ## debug messages
        if arg_dict.debug:
            debug_message('strains2get: ' + str(strains2get), 'yellow')

        ## call NCBI_downloader
        return BacDup.scripts.NCBI_downloader.NCBI_download_list(
            strains2get, db_folder, arg_dict.debug, arg_dict.assembly_level)

    # *************************** ##
    ## single GenBank ID
    # *************************** ##
    ## debug messages
    if arg_dict.debug:
        debug_message('+++++++++++++++++++++++++++++++')
        debug_message('Single NCBI GenBank IDs provided option:', 'yellow')
        debug_message('arg_dict.GenBank_id: ' + arg_dict.GenBank_id, 'yellow')
        debug_message('db_folder: ' + db_folder, 'yellow')
        debug_message('+++++++++++++++++++++++++++++++')

    ## download
    print(colored('\t* A NCBI GenBank ID:.......[OK]', 'green'))
    print()
    HCGB_aes.print_sepLine("+", 75, False)
    return BacDup.scripts.NCBI_downloader.NCBIdownload(
        arg_dict.GenBank_id, db_folder, arg_dict.debug)


def _accID_from_tax_ids(arg_dict, outdir):
    """Select and download GenBank entries for the NCBI taxonomy ID(s) provided."""
    #################
    ## get tax ids
    #################
    if arg_dict.batch:
        print(
            colored('\t* Multiple NCBI Taxonomy IDs in a file .......[OK]',
                    'green'))

        ## debug messages
        if arg_dict.debug:
            debug_message('+++++++++++++++++++++++++++++++')
            debug_message('Multiple NCBI Taxonomy IDs provided option:',
                          'yellow')

        ## check is a file and readable
        BacDup_functions.file_readable_check(arg_dict.tax_id)

        ## get IDs into a list; empty lines are discarded, as done for
        ## GenBank ID batch files
        taxIDs2get = HCGB_main.readList_fromFile(arg_dict.tax_id)
        taxIDs2get = list(filter(None, taxIDs2get))

    else:
        print(colored('\t* A NCBI Taxonomy ID:.......[OK]', 'green'))
        taxIDs2get = [arg_dict.tax_id]

    print()

    ##################################
    ## init ete NCBI taxonomy database
    ##################################
    print('+ Initiate NCBI taxonomy database...')
    ncbi = taxonomy_retrieval.init_db_object(arg_dict.debug)

    string_info_total = []
    for taxid in taxIDs2get:
        ## parse
        info = taxonomy_retrieval.parse_taxid(taxid, ncbi, 'unravel',
                                              arg_dict.debug)
        print()

        ## debug messages
        if arg_dict.debug:
            debug_message(
                "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
            )
            debug_message('info\n', "yellow")
            print(info)

        ## append if more
        string_info_total.extend(info)

    ## convert to list of strings
    string_info_total = [str(entry) for entry in string_info_total]

    ## assume all belong to same superkingdom if children of same tax_id
    group_obtained = taxonomy_retrieval.get_superKingdom(
        string_info_total[0], ncbi, arg_dict.debug)

    #################
    ## get database path
    #################
    db_folder = _get_db_folder(arg_dict, outdir)

    ## debug messages
    if arg_dict.debug:
        debug_message(
            "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
        )
        debug_message('group_obtained: ' + group_obtained, "yellow")
        debug_message('db_folder: ' + db_folder, "yellow")
        debug_message('arg_dict.assembly_level: ' + arg_dict.assembly_level,
                      "yellow")
        debug_message('arg_dict.section: ' + arg_dict.section, "yellow")

    ##################################
    ## get GenBank entries selected
    ##################################
    (strains2get,
     allstrains_available) = taxonomy_retrieval.get_GenBank_ids(
         db_folder,
         string_info_total,
         int(arg_dict.k_random),
         arg_dict.debug,
         assembly_level_given=arg_dict.assembly_level,
         group_given=group_obtained,
         section_given=arg_dict.section)

    ## print list and dictionary of possible and selected taxIDs
    info_dir = HCGB_files.create_subfolder("info", outdir)
    input_info_dir = HCGB_files.create_subfolder("input", info_dir)
    downloaded_file = os.path.join(input_info_dir, 'Downloaded.txt')
    all_entries_file = os.path.join(input_info_dir, 'all_entries.txt')
    HCGB_main.printList2file(downloaded_file, strains2get)
    HCGB_main.printList2file(all_entries_file, allstrains_available)

    ## stop here if dry_run
    if arg_dict.dry_run:
        print()
        HCGB_aes.print_sepLine("*", 75, False)
        print("ATTENTION: Dry run mode selected. Stopping the process here.")
        HCGB_aes.print_sepLine("*", 75, False)
        print("+ All available entries listed and printed in file:\n\t" +
              all_entries_file)
        print("+ Subset of entries generated and printed in file:\n\t" +
              downloaded_file)
        print(
            "\n\nIf random numbers selected, take into account re-running this process might produce different results.\n"
        )
        HCGB_aes.print_sepLine("*", 75, False)
        print()
        exit()

    #################
    ## call NCBI_downloader
    #################
    return BacDup.scripts.NCBI_downloader.NCBI_download_list(
        strains2get, db_folder, arg_dict.debug, arg_dict.assembly_level)
Esempio n. 30
0
def parse_information(arg_dict, df_accID, outdir):
    """Parse the annotation file of every sample listed in df_accID.

    For each sample an "input" and a "parse" folder are obtained from
    ``HCGB_files.outdir_project``. A sample is processed only when neither
    folder holds a non-empty ``.success`` file; otherwise it is reported as
    already available. After processing, ``.success`` or ``.fail`` time
    stamps are written into both folders.

    :param arg_dict: Options object; ``project`` and ``debug`` are read here.
    :param df_accID: Sample table indexed by sample name; the columns
        ``annot_file`` and ``genome`` are read for each sample.
    :param outdir: Output folder handed to ``HCGB_files.outdir_project``.
    """

    ### Parse df_accID
    ## both results are used below as mappings keyed by sample name
    dict_input_folders = HCGB_files.outdir_project(outdir, arg_dict.project,
                                                   df_accID, "input",
                                                   arg_dict.debug)
    dict_parse_folders = HCGB_files.outdir_project(outdir, arg_dict.project,
                                                   df_accID, "parse",
                                                   arg_dict.debug)

    ## debug messages
    if (arg_dict.debug):
        debug_message('+++++++++++++++++++++++++++++++')
        print("dict_input_folders")
        print(dict_input_folders)
        print("dict_parse_folders")
        print(dict_parse_folders)

    ## parse each sample retrieved
    for sample, folder_input in dict_input_folders.items():

        if (arg_dict.debug):
            debug_message('sample: ' + sample, 'yellow')
            debug_message('folder_input: ' + folder_input, 'yellow')
            debug_message('folder_parse: ' + dict_parse_folders[sample],
                          'yellow')
            debug_message('annot_file: ' + df_accID.loc[sample, 'annot_file'],
                          'yellow')
            debug_message('genome' + df_accID.loc[sample, 'genome'], 'yellow')

        ## timestamps: a non-empty '.success' file marks a finished step
        input_timestamp = os.path.join(folder_input, '.success')
        parse_timestamp = os.path.join(dict_parse_folders[sample], '.success')

        print()
        print("\t+ Parsing sample: " + sample)

        ## process the sample only when neither step has a success stamp
        if (not HCGB_files.is_non_zero_file(parse_timestamp)
                and not HCGB_files.is_non_zero_file(input_timestamp)):

            ## TODO: Set threads to use in parallel
            process_OK = parse_annot_file(sample, folder_input,
                                          df_accID.loc[sample, 'annot_file'],
                                          dict_parse_folders[sample],
                                          arg_dict.debug,
                                          df_accID.loc[sample, 'genome'])

            if (process_OK):

                ## link or copy annotation file into folder_input
                HCGB_files.get_symbolic_link_file(
                    df_accID.loc[sample, 'annot_file'], folder_input)

                ## add df_accID.loc[sample,] information as csv into input folder
                df_accID.loc[sample, ].to_csv(os.path.join(
                    folder_input, 'info.csv'),
                                              index=True,
                                              header=True)

                ## print time stamp for the input step
                HCGB_time.print_time_stamp(input_timestamp)

                ## print time stamp for the parse step
                HCGB_time.print_time_stamp(parse_timestamp)
            else:
                print(
                    colored(
                        "\t+ Some error occurred for sample %s while parsing input options"
                        % sample, 'red'))

                ## leave a '.fail' time stamp in the input folder
                HCGB_time.print_time_stamp(os.path.join(folder_input, '.fail'))

                ## leave a '.fail' time stamp in the parse folder
                HCGB_time.print_time_stamp(
                    os.path.join(dict_parse_folders[sample], '.fail'))
        else:
            ## NOTE(review): this branch is also reached when only the input
            ## stamp exists; the parse stamp is read regardless -- confirm
            ## read_time_stamp copes with a missing file.
            read_time = HCGB_time.read_time_stamp(parse_timestamp)
            print(
                colored(
                    "\t+ Input parsing already available for sample %s [%s]" %
                    (sample, read_time), 'green'))
            print()