Ejemplo n.º 1
0
def plasmidID_call(plasmidID_bin, file1, file2, plasmid_contigs, database,
                   threads, outfolder):
    """Launch plasmidID for one paired-end sample.

    The sample tag handed to ``--sample`` is the basename of ``file1`` up to
    the first ``_trim_R1.fastq`` occurrence. Nothing is returned: the value
    produced by ``functions.system_call`` is discarded.

    :param plasmidID_bin: plasmidID executable, placed first in the command.
    :param file1: Value passed to ``--R1``; also used to derive the sample tag.
    :param file2: Value passed to ``--R2``.
    :param plasmid_contigs: Value passed to ``--contigs``.
    :param database: Value passed to ``--database``.
    :param threads: Value passed to ``--threads``.
    :param outfolder: Value passed to ``--group``.
    """
    # everything in front of the "_trim_R1.fastq" marker is the sample tag
    sample_tag = os.path.basename(file1).split("_trim_R1.fastq")[0]

    # listed in the same order as the placeholders of the template below
    cli_values = (plasmidID_bin, file1, file2, database, sample_tag,
                  plasmid_contigs, threads, outfolder)
    functions.system_call(
        "%s --R1 %s --R2 %s --database %s --sample %s --no-trim --contigs %s --threads %s --group %s"
        % cli_values)
Ejemplo n.º 2
0
def run_doMLST(profile_folder, seq_folder, name, rscript, path, fileGiven,
               threads):
    """Generate the MLST profile of a sample through the MLSTar R script.

    A ``.success`` stamp inside ``path`` marks a finished run; when it is
    present the R script is not called again.

    :param profile_folder: Value passed to ``--dir_profile``.
    :param seq_folder: Value passed to ``--dir_seq``.
    :param name: Sample tag; passed to ``--name`` and used to build the
        results file name.
    :param rscript: Rscript executable used to start the command.
    :param path: Output folder, passed to ``--dir``.
    :param fileGiven: Value passed to ``--file``.
    :param threads: Value passed to ``--threads``.

    :return: ``<path>/<name>_MLST_results.csv`` when results exist or the
        command reported ``'OK'``; the string ``'FAIL'`` otherwise.
    """
    print('+ Generating profile for sample...')

    alleles_dir = os.path.join(path, name + '_alleles')
    success_stamp = path + '/.success'
    results_csv = path + '/' + name + "_MLST_results.csv"

    ## already done: report when and hand back the existing results file
    if os.path.isfile(success_stamp):
        previous_run = functions.read_time_stamp(success_stamp)
        notice = "\tA previous command generated results on: %s [%s]" % (
            previous_run, name)
        print(colored(notice, 'yellow'))
        return results_csv

    ## leftovers without a success stamp are discarded before starting over
    if os.path.exists(alleles_dir):
        shutil.rmtree(alleles_dir)

    ## stderr goes to a file named after `path` (next to it, not inside it)
    stderr_log = path + '_logFile.txt'
    profiler_values = (rscript, MLSTarR_script, profile_folder, seq_folder,
                       fileGiven, path, name, threads,
                       get_MLSTar_package_installed(), stderr_log)
    cmd_profiler = "%s %s --dir_profile %s --dir_seq %s --file %s --dir %s --name %s --threads %s --lib.loc %s 2> %s" % profiler_values

    if functions.system_call(cmd_profiler) != 'OK':
        return 'FAIL'

    ## leave the stamp so that the next call can skip the computation
    functions.print_time_stamp(success_stamp)
    return results_csv
Ejemplo n.º 3
0
def getPUBMLST(species, rscript, out_name):
    """
    Ask PubMLST_, through `MLSTar software`_, which schemes exist for `species`.

    The R helper script receives `out_name` as its output file (csv format).

    See example in :file:`/devel/results/getPubMLST_example.csv`

    .. include:: ../../devel/results/getPubMLST_example.csv
        :literal:

    :param species: Species to query, forwarded to ``--species`` (a comma separated string).
    :param rscript: Path to Rscript to generate system call.
    :param out_name: Output file to generate profile information.

    :type species: string
    :type rscript: string
    :type out_name: string

    :return: OK/FAIL, as handed back by the system call.
    :warnings: Returns **FAIL** if process stopped.

    .. include:: ../../links.inc

    """
    ## R helper script that performs the actual query
    getpubmlst_script = tools.R_scripts('MLSTar_getpubmlst')

    ## where R has to look for the MLSTar package
    lib_location = get_MLSTar_package_installed()

    ## stderr of the R process is thrown away
    call_values = (rscript, getpubmlst_script, species, out_name,
                   lib_location)
    cmd_getPUBMLST = "%s %s --species %s --output %s --lib.loc %s 2> /dev/null" % call_values
    return functions.system_call(cmd_getPUBMLST)
Ejemplo n.º 4
0
def get_MLSTar_package_installed(debug=False):
    """Find where the MLSTar R package is available, installing it if needed.

    Three attempts are made, in order, each through an Rscript system call:

    1. check the package with the "system" check script;
    2. check the package under the configured install path
       (``set_config.R_package_path_installed()``);
    3. run the GitHub install script for ``iferres/MLSTar`` into that path.

    :param debug: When true, print a header line before checking.
    :type debug: bool

    :return: The string ``'system'`` when the first check succeeds,
        otherwise the configured install path.
    :raises SystemExit: After printing ``ERROR`` when the installation
        attempt does not report ``'OK'`` either (exit status 1).
    """
    install_path = set_config.R_package_path_installed()
    (check_install_system, check_install_path) = set_config.get_check_R_files()
    R_script_exe = set_config.get_exe('Rscript')

    if debug:
        print('\n+ Check package: MLSTar')

    ## ATTENTION: optparse library missing, no installation within conda

    ## 1) is the package available system-wide?
    cmd_check = R_script_exe + ' ' + check_install_system + ' -l MLSTar'
    code = functions.system_call(cmd_check, message=False, returned=False)
    if code == 'OK':
        return 'system'

    ## 2) is it installed under the configured path?
    cmd_check_path = R_script_exe + ' ' + check_install_path + ' -l MLSTar -p ' + install_path
    code2 = functions.system_call(cmd_check_path,
                                  message=False,
                                  returned=False)
    if code2 == 'OK':
        return install_path

    ## 3) not found anywhere: try to install it from GitHub
    (_, install_github_package) = install_dependencies.get_install_R_files()
    cmd_R = '%s %s -l iferres/MLSTar -p %s' % (
        R_script_exe, install_github_package, install_path)
    code3 = functions.system_call(cmd_R, message=False, returned=False)

    ## Compare against 'OK' like the two checks above: a plain truthiness
    ## test would also accept a non-empty failure string such as 'FAIL'.
    if code3 == 'OK':
        return install_path

    print('ERROR')
    ## Raised directly (no dependency on the `site` helper `exit`) and with
    ## a non-zero status, so that callers can tell the run failed.
    raise SystemExit(1)
Ejemplo n.º 5
0
def filter_and_cluster_database(database_path, threads):
    """Cluster putative duplicated entries of the plasmid database.

    Steps, all performed inside ``<database_path>/blast_search``:

    1. build a BLAST database from ``<database_path>/plasmid_database.fna``;
    2. split that fasta file with the perl split script and run one
       ``functions.blastn`` job per split file in a thread pool;
    3. concatenate the BLAST outputs and parse them with
       ``HCGB_blast.parse`` (e-value 1e-20, alignment 90, length 2000 are
       handed to the parser as thresholds), writing a readable report;
    4. write the ids retained as cluster representatives to a file and
       extract their sequences with the perl retrieve script;
    5. copy the result to ``<database_path>/plasmids_clustered.fna``.

    If that final file already exists nothing is computed.

    :param database_path: Folder containing ``plasmid_database.fna``.
    :param threads: Number of pieces requested from the split script and
        ``max_workers`` of the thread pool.

    :return: Path to ``<database_path>/plasmids_clustered.fna``.
    """
    original_file = database_path + '/plasmid_database.fna'
    clustering_file = database_path + '/plasmids_clustered.fna'

    ## nothing to do when a previous run already left the clustered fasta
    if os.path.isfile(clustering_file):
        print("+ Plasmid database was previously clustered...")
        return (clustering_file)

    print('+ Cluster putative duplicated entries on database...')

    folder = functions.create_subfolder('blast_search', database_path)

    ## makeblastDB
    dbName = folder + '/plasmids_DB'
    functions.makeblastdb(dbName, original_file)

    ## split plasmid seqs ids
    tmp_folder = functions.create_subfolder("tmp", folder)
    cmd_split = 'perl %s %s %s %s ' % (fasta_split_script, original_file,
                                       threads, tmp_folder)

    print("\n+ Splitting database file to speed up the computation...")
    functions.system_call(cmd_split)

    ## get files in folder
    split_files = os.listdir(tmp_folder)
    num_threads = 1  ## each blastn job gets a single thread
    print("\n+ Sending blastn commands: ")

    ## the with statement makes sure the pool is shut down once all jobs end
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=threads) as executor:
        ## map every submitted job to the split file it works on
        commandsSent = {
            executor.submit(functions.blastn,
                            tmp_folder + '/' + fil + '-blast-out.txt', dbName,
                            tmp_folder + '/' + fil, num_threads): fil
            for fil in split_files
        }
        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                cmd2.result()
            except Exception as exc:
                ## best effort: report the failing piece and keep going
                print('***ERROR:')
                print('%r generated an exception: %s' % (details, exc))

    ########################
    ## parseBlast results
    ########################
    print("+ Concatenating all results into one file...")
    outFile_merged = folder + '/blastn_output_concat.txt'
    cmd_cat = 'cat ' + tmp_folder + '/*-blast-out.txt > ' + outFile_merged
    functions.system_call(cmd_cat)

    ## thresholds
    eval_thresh_float = float(1e-20)
    aln_thresh_given = 90
    min_length = 2000

    outFile_parsed = folder + '/blastn_output_parsed.txt'

    print('+ Parsing BLAST results generated...\n')
    my_cluster_list = []
    ## only used for membership tests: a set keeps each lookup O(1)
    sequences2discard = set()

    ## get number lines
    lines = functions.get_number_lines(outFile_merged)
    n = 0

    ## both files are closed even if the parser raises half way through
    with open(outFile_merged) as fh, open(outFile_parsed,
                                          'w') as output_file:
        for blast_record in HCGB_blast.parse(fh,
                                             eval_thresh=eval_thresh_float,
                                             aln_thresh=aln_thresh_given,
                                             length_thresh=min_length):
            ### show progress bar
            ## NOTE(review): the counter advances 10 per record while the
            ## total is a line count -- confirm this ratio is intended
            n += 10
            functions.progbar(n, lines, 100)

            qseq = format(blast_record.qid)
            for hit in blast_record.hits:
                for hsp in hit:
                    output_file.write('****Alignment****\n')
                    output_file.write('query id: {}\n'.format(
                        blast_record.qid))
                    sequences2discard.add(hsp.sid)  ## add subject
                    output_file.write('sequence: %s\n' % hsp.sid)
                    output_file.write('e value: %s\n' % hsp.evalue)
                    output_file.write('aln_length: %s\n' % hsp.length)
                    output_file.write('qlen: %s [>%s]\n' %
                                      (hsp.qlen, min_length))
                    aln_perc = (int(hsp.length) / int(hsp.qlen)) * 100
                    output_file.write('aln_perc: %s [> %s]\n\n' %
                                      (aln_perc, aln_thresh_given))

                ## add query, unless it was already reported as subject
                ## (checked once per hit, after that hit's subjects are in)
                if qseq not in sequences2discard:
                    my_cluster_list.append(qseq)

    print("\n\n")

    ## get unique ids & print in file
    print("+ Obtaining clustered sequences...")
    my_cluster_list = set(my_cluster_list)
    list_cluster_file = folder + '/plasmids_clustered_ids.txt'
    functions.printList2file(list_cluster_file, my_cluster_list)

    ## retrieve ids
    cluster_fasta_file = folder + '/plasmids_clustered.fna'
    cmd = 'perl %s %s %s > %s' % (retrieve_seqs_script, list_cluster_file,
                                  original_file, cluster_fasta_file)
    functions.system_call(cmd)

    ## publish the result where the early-return check above looks for it
    shutil.copy(cluster_fasta_file, clustering_file)

    ## maybe add a step to build bowtie index from database to speed up later process

    return (clustering_file)
Ejemplo n.º 6
0
def download_PubMLST(profile_folder, scheme, seq_folder, rscript, species):
    """Download the PubMLST profile and sequences of a scheme when missing.

    Each of the two downloads leaves a ``.success`` stamp in its folder
    (``profile_folder`` / ``seq_folder``) once the R script reports ``'OK'``.

    * Profile: downloaded only when
      ``<profile_folder>/profile_scheme<scheme>.tab`` does not exist.
    * Sequences: downloaded only when ``seq_folder`` exists and holds no
      ``.success`` stamp.

    :param profile_folder: Folder for the profile, passed to ``--dir_profile``.
    :param scheme: Scheme identifier, passed to ``--scheme``.
    :param seq_folder: Folder for the sequences, passed to ``--dir_seq``.
    :param rscript: Rscript executable used to start the commands.
    :param species: Species tag, passed to ``--species``.

    :return: None.
    """
    ######################
    ## download profile ##
    ######################
    profile_table = profile_folder + '/profile_scheme' + str(scheme) + '.tab'
    profile_stamp = profile_folder + '/.success'

    if not os.path.exists(profile_table):
        print('+ Profile file for scheme will be downloaded...')
        print('+ Downloading profile...')
        ## stderr of the R process is kept next to the profile folder
        profile_log = profile_folder + '_download.log'
        profile_values = (rscript, MLSTarR_download_prf, species, scheme,
                          profile_folder, get_MLSTar_package_installed(),
                          profile_log)
        cmd_profile = "%s %s --species %s --scheme %s --dir_profile %s --lib.loc %s 2> %s" % profile_values

        if functions.system_call(cmd_profile) == 'OK':
            ## success timestamp
            functions.print_time_stamp(profile_stamp)
    else:
        print('+ Profile file for scheme exists...')

        ## tell the user when the previous download finished, if stamped
        if os.path.isfile(profile_stamp):
            profile_date = functions.read_time_stamp(profile_stamp)
            profile_notice = "\tA previous command generated results on: %s [%s -- %s]" % (
                profile_date, species, 'profile')
            print(colored(profile_notice, 'yellow'))
        # [TODO: Check time passed and download again if >?? days passed]

    #######################
    ## download sequence ##
    #######################
    ## NOTE(review): when seq_folder does not exist nothing is downloaded;
    ## presumably the caller creates it beforehand -- confirm.
    if not os.path.exists(seq_folder):
        return

    print('+ Sequence folder exists...')
    seq_stamp = seq_folder + '/.success'

    if os.path.isfile(seq_stamp):
        ## previously downloaded and succeeded
        seq_date = functions.read_time_stamp(seq_stamp)
        seq_notice = "\tA previous command generated results on: %s [%s -- %s]" % (
            seq_date, species, 'sequence')
        print(colored(seq_notice, 'yellow'))
        # [TODO: Check time passed and download again if >?? days passed;
        #  an earlier idea was to count the .fas files found in the folder]
        return

    print('+ Sequence files for scheme will be downloaded...')
    print('+ Downloading sequences...')
    seq_log = seq_folder + '_download.log'
    seq_values = (rscript, MLSTarR_download_seq, species, scheme, seq_folder,
                  get_MLSTar_package_installed(), seq_log)
    cmd_seq = "%s %s --species %s --scheme %s --dir_seq %s --lib.loc %s 2> %s" % seq_values

    if functions.system_call(cmd_seq) == 'OK':
        ## success timestamp
        functions.print_time_stamp(seq_stamp)
Ejemplo n.º 7
0
def plot_MLST(results, profile, rscript):
    """Call the MLSTar plotting R script for a results file.

    :param results: Results file, passed to ``--file_result``; the folder
        that contains it is passed to ``--output``.
    :param profile: Value passed to ``--folder_profile``.
    :param rscript: Rscript executable used to start the command.

    :return: Whatever ``functions.system_call`` returns for the command.
    """
    ## where R has to look for the MLSTar package
    lib_location = get_MLSTar_package_installed()

    ## output goes to the folder holding the results file
    output_dir = os.path.dirname(results)

    plotter_values = (rscript, MLSTarR_plot, output_dir, profile, results,
                      lib_location)
    cmd_plotter = "%s %s --output %s --folder_profile %s --file_result %s --lib.loc %s" % plotter_values
    return functions.system_call(cmd_plotter)
Ejemplo n.º 8
0
def GI_module(genbank_file, name, outdir, Debug, cutoff_dinuc_bias=8, min_length=1000):
    """Identify genomic islands (GI) within the genbank file provided. They are calculated
    based on gene annotation and dinucleotide bias region using the software `IslandPath-DIMOB`_.

    A ``.Dimob`` time stamp inside ``outdir`` marks a finished run: when it is
    present the perl script is not called again. The stamp is written only when
    the system call reports ``'OK'``.

    :param genbank_file: Absolute path to annotation file in Genbank format.
    :param name: Sample identifier.
    :param outdir: Absolute path to output folder.
    :param Debug: If true, print debugging messages; it is also forwarded to ``set_config.get_exe``.
    :param cutoff_dinuc_bias: Dinucleotide bias cutoff
    :param min_length: Minimum length for the regions to be reported

    :type name: string
    :type genbank_file: string
    :type outdir: string
    :type Debug: bool
    :type cutoff_dinuc_bias: int
    :type min_length: int

    :return: ``outdir`` when results are available (just generated or from a previous run),
        ``False`` when the Dimob call failed.

    The Dimob.pl perl script has two mandatory arguments, which are the input :file:`genbank_file` and an output name.

    .. code-block:: sh

        Usage:
        perl Dimob.pl <genome.gbk> <output_name> [cutoff_dinuc_bias] [min_length]

        Default values:
            cutoff_dinuc_bias = 8
            min_length = 8000

        Example:
            perl Dimob.pl example/NC_003210.gbk NC_003210_GIs
            perl Dimob.pl example/NC_003210.gbk NC_003210_GIs 6 10000
            perl Dimob.pl example/NC_000913.embl NC_000913_GIs 6 10000


    During the development of BacterialTyper, we generated a modification of the original `IslandPath-DIMOB`_ to analyze
    contig sequence data and generate a different output format for better clarification and interpretation of results.
    We forked the original code into a new git repository and updated the code accordingly. See details here: https://github.com/JFsanchezherrero/islandpath.

     .. include:: ../../links.inc

    """

    ## filename stamp of the process
    filename_stamp = outdir + '/.Dimob'

    # check if previously done
    if os.path.isfile(filename_stamp):
        stamp = functions.read_time_stamp(filename_stamp)
        print (colored("\tA previous command generated results on: %s [%s -- Dimob]" %(stamp, name), 'yellow'))
        return (outdir)

    ## debug message
    if (Debug):
        print (colored("**DEBUG: Call Dimob for sample %s " %name + "**", 'yellow'))
        print ("genbank_file", genbank_file)
        print ("outdir: ", outdir)

    ## Call IslandPath Dimob executable perl file.
    dimob_pl = set_config.get_exe("dimob", Debug)
    perl_exe = set_config.get_exe("perl", Debug)

    ## command: results are prefixed with <outdir>/<name>, stdout goes to <outdir>/<name>.log
    outdir_sample = os.path.join(outdir, name)
    log_file = outdir_sample + '.log'
    perl_cmd = '%s %s %s %s %s %s > %s' %(perl_exe, dimob_pl, genbank_file, outdir_sample, cutoff_dinuc_bias, min_length, log_file)

    code = functions.system_call(perl_cmd)

    ## The call reports 'OK' or 'FAIL'; both are non-empty strings, so a plain
    ## truthiness test would stamp a failed run as finished.
    if code != 'OK':
        return False

    ## when finished print time stamp in output + '/.Dimob'
    functions.print_time_stamp(filename_stamp)
    return (outdir)