Esempio n. 1
0
def BUSCO_plot(outfolder):
    """Run the BUSCO plot generation executable on a results folder.

    The process working directory is changed to ``outfolder`` and is not
    restored afterwards.

    :param outfolder: Folder handed to the plot executable through ``-wd``.
    :type outfolder: string

    :returns: Always an empty tuple.
    """
    plot_exe = set_config.get_exe('generate_plot')

    # The plot script is launched from inside the results folder.
    os.chdir(outfolder)

    HCGB_sys.system_call('{} -wd {}'.format(plot_exe, outfolder))
    return ()
def get_symbolic_link(sample_list, directory):
    """Link each entry of ``sample_list`` into ``directory`` using ``ln -s``.

    One system call is issued per entry.

    :param sample_list: Paths to link.
    :param directory: Destination given as second argument to ``ln -s``.

    :type sample_list: list
    :type directory: string

    :returns: Content of ``directory`` as reported by ``os.listdir`` once all
        the calls have been issued.
    """
    link_template = 'ln -s {} {}'
    for source_path in sample_list:
        link_cmd = link_template.format(source_path, directory)
        system_call_functions.system_call(link_cmd, returned=False)

    return os.listdir(directory)
Esempio n. 3
0
def prokka_call(prokka_bin, sequence_fasta, kingdom, genus, outdir_name, name,
                threads):
    """Build and run the Prokka_ command line that annotates a genome.

    The call always uses ``--force``, ``--cdsrnaolap``, ``--addgenes`` and
    ``--addmrna``. ``--usegenus --genus <genus>`` is appended unless ``genus``
    is the literal string ``Other``. The standard error of Prokka is
    redirected to ``<outdir_name>/run.log``.

    :param prokka_bin: Path to the prokka binary file.
    :param sequence_fasta: Assembled sequences in fasta file format.
    :param kingdom: Kingdom mode for Prokka: Archaea|Bacteria|Mitochondria|Viruses
    :param genus: Genus option for Prokka; ``Other`` disables genus usage.
    :param outdir_name: Absolute path to the output folder to include results.
    :param name: Sample name; used both as file prefix and as locus tag.
    :param threads: Number of CPUs to use.

    :type prokka_bin: string
    :type sequence_fasta: string
    :type kingdom: string
    :type genus: string
    :type outdir_name: string
    :type name: string
    :type threads: integer

    :returns: ``outdir_name``, unchanged. The status of the system call is
        not inspected.

    .. seealso:: Check description of output files generated in:

        - :ref:`Prokka-output-files`

    .. include:: ../../links.inc
    """
    print("\n+ Starting annotation for: {}\n".format(name))

    ## annotation options: fixed flags plus the optional genus selection
    option_chunks = ["--cdsrnaolap --addgenes --addmrna --kingdom " + kingdom]
    if genus != "Other":
        option_chunks.append("--usegenus --genus " + genus)
    prokka_options = " ".join(option_chunks)

    ## stderr of prokka is stored within the output folder
    stderr_log = outdir_name + '/run.log'

    prokka_cmd = (
        "{exe} --force --outdir {out} --prefix {tag} --locustag {tag} "
        "{opts} --cpus {cpus} {fasta} 2> {log}"
    ).format(exe=prokka_bin,
             out=outdir_name,
             tag=name,
             opts=prokka_options,
             cpus=threads,
             fasta=sequence_fasta,
             log=stderr_log)

    HCGB_sys.system_call(prokka_cmd)
    return outdir_name
Esempio n. 4
0
def check_perl_package_version(package, Debug):
    """Return the version of an installed perl package.

    A perl one-liner loads the package (``-M<package>``) and prints its
    ``$<package>::VERSION`` variable; the captured output is decoded with
    ``HCGB_main.decode`` before being returned.

    :param package: package name
    :param Debug: True/False for debugging messages

    :type package: string
    :type Debug: boolean

    :returns: Version retrieved
    """
    one_liner = "".join([
        get_exe('perl'),
        ' -M', package,
        " -e 'print $", package, "::VERSION';",
    ])

    if Debug:
        print("** DEBUG: perl command:\n")
        print(one_liner)

    ## run the one-liner silently and capture what it prints
    raw_version = HCGB_sys.system_call(one_liner, returned=True, message=False)
    return HCGB_main.decode(raw_version)
Esempio n. 5
0
def ml_tree(folder, name, threads, output, Debug):
    """
    Reconstruct a Maximum Likelihood tree with IQ-Tree.

    The alignment ``<folder>/<name>.aln`` is used as input, 1000 bootstrap
    replicates (``-B``) are requested and results are written with the prefix
    ``<output>/iqtree_<name>``. Standard error is stored in
    ``<output>/iqtree.error.log``.

    :param folder: Snippy-core folder containing results.
    :param name: Name of the analysis.
    :param threads: Value given to ``--threads-max``.
    :param output: Folder where IQ-Tree files and the error log are written.
    :param Debug: True/false for debugging messages

    :type folder: string
    :type name: string
    :type output: string
    :type Debug: bool

    :returns: Always an empty tuple; a failure is only reported on screen.
    """
    iqtree_exe = set_config.get_exe('iqtree', Debug)

    alignment = os.path.join(folder, name + '.aln')
    error_log = os.path.join(output, 'iqtree.error.log')
    out_prefix = os.path.join(output, 'iqtree_' + name)
    bootstraps = '1000'

    iqtree_cmd = '{} -s {} -redo --threads-max {} --prefix {} -B {} 2> {}'.format(
        iqtree_exe, alignment, threads, out_prefix, bootstraps, error_log)

    if HCGB_sys.system_call(iqtree_cmd) != 'OK':
        print("Some error occurred...")

    return ()
Esempio n. 6
0
def multiQC_call(pathFile, name, folder, option):
    """
    Generate a multiQC_ report through a system call.

    ``--force`` is always given, so a report previously generated in the
    output folder is overwritten by the new one. The ``multiqc`` executable
    is expected to be available in ``$PATH`` (e.g. environment activated).

    :param pathFile: File containing list of files to include in report.
    :param name: Name to include in the html report.
    :param folder: Absolute path for the output folder.
    :param option: Options to provide to multiQC call.

    :type pathFile: string
    :type name: string
    :type folder: string
    :type option: string

    :returns: Value returned by ``HCGB_sys.system_call`` for the command.

    .. seealso:: This function depends on other BacterialTyper functions called:

        - :func:`HCGB.functions.system_call_functions`

    """
    multiqc_cmd = " ".join([
        "multiqc",
        "--force",
        "-o {}".format(folder),
        "-n {}".format(name),
        "-l {}".format(pathFile),
        "-p",
        "-i 'MultiQC report'",
        "-b 'HTML report generated for multiple samples and steps'",
        "{}".format(option),
    ])

    return HCGB_sys.system_call(multiqc_cmd)
Esempio n. 7
0
def generate_seq_search_call(db,
                             query,
                             outfile,
                             revcomp,
                             start=0,
                             end=-1,
                             format='fasta'):
    """Retrieve a sequence (or a range of it) with ``efetch`` into a file.

    Sequence range options used:

    - ``-seq_start``: first sequence position to retrieve
    - ``-seq_stop``: last sequence position to retrieve
    - ``-revcomp``: shortcut for strand 2 (reverse complement)

    :param db: Value for ``-db``.
    :param query: Value for ``-id``.
    :param outfile: File receiving the standard output of ``efetch``.
    :param revcomp: If true, ``-revcomp`` is added to the call.
    :param start: Value for ``-seq_start``.
    :param end: Value for ``-seq_stop``.
    :param format: Value for ``-format``.

    :returns: Value returned by ``HCGB_sys.system_call`` for the command.
    """
    efetch_exe = set_config.get_exe("efetch")

    pieces = [
        "{} -db {} -id {} -seq_start {} -seq_stop {} -format {}".format(
            efetch_exe, db, query, start, end, format)
    ]

    ## reverse complement requested
    if revcomp:
        pieces.append(' -revcomp')

    ## redirect output to file
    pieces.append(' > {}'.format(outfile))

    return HCGB_sys.system_call("".join(pieces))
def snippy_core_call(list_folder, options, name, output_dir, output_format, Debug):
    """
    Run snippy-core to create a core alignment for samples aligned to the
    same reference.

    ATTENTION: Requires sample names to be different within the first 10 characters.

    :param list_folder: Folders given to snippy-core as positional arguments.
    :param options: Not used when building the command.
    :param name: Analysis name; ``<output_dir>/<name>`` is used as ``--prefix``.
    :param output_dir: Folder for results and for ``snippy_cmd.log`` (stderr).
    :param output_format: Value for ``-aformat``.
    :param Debug: Forwarded to ``set_config.get_exe``.

    :type list_folder: list
    :type options: string
    :type name: string

    :returns: Value returned by ``HCGB_sys.system_call`` for the command.
    """
    core_exe = set_config.get_exe('snippy_core', Debug)

    folders_arg = " ".join(list_folder)
    stderr_log = os.path.join(output_dir, "snippy_cmd.log")
    out_prefix = os.path.join(output_dir, name)

    ## a single reference is used: it must be the same for all comparisons
    ## NOTE(review): the second folder (index 1) provides ref.fa, so a list
    ## with only one folder raises IndexError -- confirm index 0 was not intended.
    reference = list_folder[1] + "/ref.fa"

    core_cmd = '{} -aformat {} --ref {} --prefix {} {} 2> {}'.format(
        core_exe, output_format, reference, out_prefix, folders_arg, stderr_log)

    return HCGB_sys.system_call(core_cmd)
def SPADES_systemCall(sample_folder, file1, file2, name, SPADES_bin, options, threads, debug=False):
    """Generate SPADES system call.

    It calls system for SPADES and generates time stamp file in the folder
    provided (sample_folder + '/.success_assembly') for later analysis.

    Steps:

    - If the time stamp file already exists, nothing is run.

    - It generates system call for SPADES assembly.

    - It generates timestamp file if the call succeeded.

    :param sample_folder: Absolute path to store results. It must exists.
    :param file1: Absolute path to fastq reads (R1).
    :param file2: Absolute path to fastq reads (R2).
    :param name: Sample name or tag to identify sample.
    :param SPADES_bin: Binary executable for SPADES assembly software.
    :param options: Extra options (e.g. ``--plasmid``). Leading/trailing
        blanks are ignored; empty or None adds nothing to the call.
    :param threads: Number of CPUs to use.
    :param debug: Currently unused.

    :type name: string
    :type sample_folder: string
    :type file1: string
    :type file2: string
    :type SPADES_bin: string
    :type options: string
    :type threads: integer

    :return: Returns **OK** if the system call succeeded or a previous
        successful run is found.
    :rtype: string.
    :warnings: Returns **FAIL** if assembly process stopped.
    """
    ## check if previously assembled and succeeded
    filename_stamp = sample_folder + '/.success_assembly'
    if os.path.isfile(filename_stamp):
        stamp = HCGB_time.read_time_stamp(filename_stamp)
        print (colored("\tA previous command generated results on: %s [%s]" %(stamp, name), 'yellow'))
        return 'OK'

    ## log file for SPADES sample given
    logFile = sample_folder + '/' + name + '.log'

    ## Normalize the options so a separating blank always precedes '-t',
    ## whether or not the caller provided a trailing space.
    extra_options = options.strip() if options else ''
    options_chunk = extra_options + ' ' if extra_options else ''

    ## stdout and stderr share the log file: '2>&1' duplicates the descriptor
    ## instead of opening (and truncating) the same file twice.
    cmd_SPADES = '%s %s-t %s -o %s -1 %s -2 %s > %s 2>&1' %(
        SPADES_bin, options_chunk, threads, sample_folder, file1, file2, logFile)
    code = HCGB_sys.system_call(cmd_SPADES)

    if code == 'OK':
        ## success stamp
        HCGB_time.print_time_stamp(filename_stamp)
        return 'OK'

    return "FAIL"
Esempio n. 10
0
def load_Genome(folder, STAR_exe, genomeDir, num_threads):
    """Load a STAR genome index in shared memory (``--genomeLoad LoadAndExit``).

    A ``LoadMem`` subfolder is created within ``folder`` and used as
    ``--outFileNamePrefix`` for the files STAR writes.

    :param folder: Parent folder for the ``LoadMem`` subfolder.
    :param STAR_exe: STAR executable.
    :param genomeDir: Value for ``--genomeDir``.
    :param num_threads: Value for ``--runThreadN``.

    :returns: Value returned by the system call helper.
    """
    mem_folder = files_functions.create_subfolder('LoadMem', folder)

    star_cmd = " ".join([
        "{} --genomeDir {}".format(STAR_exe, genomeDir),
        "--runThreadN {}".format(num_threads),
        "--outFileNamePrefix {}".format(mem_folder),
        "--genomeLoad LoadAndExit",
    ])

    print('\t+ Loading memory for STAR mapping')
    return system_call_functions.system_call(star_cmd, False, True)
Esempio n. 11
0
def remove_Genome(STAR_exe, genomeDir, folder, num_threads):
    """Remove a STAR genome index from shared memory (``--genomeLoad Remove``).

    A ``RemoveMem`` subfolder is created within ``folder`` and used as
    ``--outFileNamePrefix`` for the files STAR writes.

    :param STAR_exe: STAR executable.
    :param genomeDir: Value for ``--genomeDir``.
    :param folder: Parent folder for the ``RemoveMem`` subfolder.
    :param num_threads: Value for ``--runThreadN``.

    :returns: Value returned by the system call helper.
    """
    mem_folder = files_functions.create_subfolder('RemoveMem', folder)

    star_cmd = " ".join([
        "{} --genomeDir {}".format(STAR_exe, genomeDir),
        "--outFileNamePrefix {}".format(mem_folder),
        "--runThreadN {}".format(num_threads),
        "--genomeLoad Remove",
    ])

    print('\t+ Removing memory loaded for STAR mapping')
    return system_call_functions.system_call(star_cmd, False, True)
Esempio n. 12
0
def print_list_prokka():
    """
    Run ``prokka --listdb`` so the Prokka_ databases installed are listed.

    The call issued is:

    .. code-block:: sh

        prokka --listdb

    .. include:: ../../devel/results/print_list_prokka.txt
        :literal:

    Nothing is returned by this function.

    .. seealso:: This function depends on other BacterialTyper functions called:

        - :func:`BacterialTyper.scripts.set_config.get_exe`

    .. include:: ../../links.inc
    """
    list_cmd = "{} --listdb".format(set_config.get_exe('prokka'))
    HCGB_sys.system_call(list_cmd)
Esempio n. 13
0
def check_R_packages(install, install_path, Debug):
    """Check every required R package, optionally installing missing ones.

    For each package listed by ``get_R_packages``:

    1. The system-wide check script is run; if it reports ``OK`` the package
       is marked as available and the next package is processed.
    2. Otherwise the check script for ``install_path`` is run.
    3. If still missing, the package is installed when ``install`` is true,
       or a message asking for manual installation is printed.

    :param install: True/False to install missing packages.
    :param install_path: Folder with R packages. If empty, the value returned
        by ``R_package_path_installed`` is used.
    :param Debug: True/False for debugging messages.
    """
    packages = get_R_packages()
    (system_check_script, path_check_script) = get_check_R_files()
    rscript = get_exe('Rscript')

    ## no path given: rely on information stored by a previous installation
    if not install_path:
        install_path = R_package_path_installed()

    for pkg_name, pkg_info in packages.iterrows():
        if Debug:
            print('\n+ Check package: ', pkg_name)
            print('+ Source: ', pkg_info['source'])

        ## 1) available system-wide?
        system_cmd = " ".join([rscript, system_check_script, '-l', pkg_name])
        system_code = HCGB_sys.system_call(system_cmd,
                                           message=False,
                                           returned=False)
        if system_code == 'OK':
            check_install_module('1', pkg_name, '0', 'package')
            continue

        check_install_module('0', pkg_name, '1', 'System package')

        ## 2) available in the installation path?
        path_cmd = " ".join(
            [rscript, path_check_script, '-l', pkg_name, '-p', install_path])
        path_code = HCGB_sys.system_call(path_cmd,
                                         message=False,
                                         returned=False)
        if path_code == 'OK':
            check_install_module('1', pkg_name, '0', 'Install path package')
            continue

        check_install_module('0', pkg_name, '1', 'Install path package')

        ## 3) missing everywhere: install or ask the user to do it
        if install:
            install_dependencies.install_R_packages(pkg_name,
                                                    pkg_info['source'],
                                                    install_path,
                                                    pkg_info['extra'])
        else:
            print(
                "Please install module %s manually to continue with BacterialTyper"
                % pkg_name)
Esempio n. 14
0
def ariba_expandflag(input_file, output_file):
    """Create a description table for the flags found in an ARIBA report.

    ``ariba expandflag infile.tsv outfile.tsv`` expands the flag column of a
    report from a number to a comma-separated list of flag bits. Running it
    on the whole report distorted the file (spaces converted to tabs), so
    only the distinct values of the ``flag`` column are written to a
    temporary two-column file and that file is expanded instead.

    :param input_file: ARIBA report (tab separated, with a ``flag`` column).
    :param output_file: Not used; the output name is derived from
        ``input_file``.

    :type input_file: string
    :type output_file: string

    :returns: Path of the description file generated
        (``<input_file without extension>-description.tsv``).
    """
    data = pd.read_csv(input_file, header=0, sep='\t')
    unique_flags = set(data['flag'])
    tmp_name = os.path.splitext(input_file)[0]

    ## temporary file containing only the flags; the context manager closes
    ## the handle even if a write fails
    flag_file = tmp_name + '-tmp.tsv'
    with open(flag_file, "w") as flag_file_hd:
        flag_file_hd.write('#flag_name\tflag\n')
        for flag in unique_flags:
            flag_file_hd.write('flag_{0}\t{0}\n'.format(flag))

    ## generate description; both paths are quoted so blanks do not split them
    flag_file_out = tmp_name + '-description.tsv'
    cmd = 'ariba expandflag "%s" "%s"' %(flag_file, flag_file_out)
    HCGB_sys.system_call(cmd)

    os.remove(flag_file)
    return flag_file_out
Esempio n. 15
0
def blastp(blastpexe, outFile, DBname, fasta, threads):
    """Run blastp of ``fasta`` against ``DBname`` and stop on failure.

    Results are written in tabular format (``6 std qlen slen``) using an
    e-value threshold of 1e-20. If the system call reports ``FAIL`` the
    command is printed and the interpreter exits.

    :param blastpexe: blastp executable.
    :param outFile: Value for ``-out``.
    :param DBname: Value for ``-db``.
    :param fasta: Value for ``-query``.
    :param threads: Value for ``-num_threads``.
    """
    blast_cmd = " ".join([
        "{} -db {}".format(blastpexe, DBname),
        "-query {}".format(fasta),
        "-out {}".format(outFile),
        "-evalue 1e-20",
        "-outfmt '6 std qlen slen'",
        "-num_threads {}".format(threads),
    ])

    if system_call_functions.system_call(blast_cmd) != 'FAIL':
        return

    error_text = '****ERROR: Some error happened during the blastp command'
    print(colored(error_text, 'red'))
    print(blast_cmd)
    exit()
Esempio n. 16
0
def BUSCO_run(sample_name, fasta, threads, output_name, dataset_name, mode, busco_db):
    """Run BUSCO for a sample and dataset unless a success stamp exists.

    Results are expected in ``<output_name>/<dataset_name>/run_<dataset_name>``;
    a ``.success`` time stamp is written there when ``short_summary.txt`` is
    found after the run. The working directory is changed to ``output_name``
    when BUSCO is executed and it is not restored.

    :param sample_name: Sample name (only used in messages).
    :param fasta: Input file given to ``-i``.
    :param threads: Value for ``-c``.
    :param output_name: Folder where BUSCO is executed.
    :param dataset_name: Lineage dataset, or ``auto-lineage``.
    :param mode: Value for ``--mode``.
    :param busco_db: Value for ``--download_path``.

    :returns: ``'FAIL'`` if no summary is found after the run; an empty tuple
        otherwise.
    """
    run_folder = os.path.join(output_name, dataset_name + '/run_' + dataset_name)
    stamp_file = run_folder + '/.success'

    print (colored("\tBUSCO Dataset [%s]; Sample [%s]" %(dataset_name, sample_name), 'yellow'))

    ## previous successful run: report and leave
    if os.path.isfile(stamp_file):
        run_date = HCGB_time.read_time_stamp(stamp_file)
        print (colored("\tSuccessfully run on date: %s"  %run_date, 'green'))
        return ()

    busco_exe = set_config.get_exe('busco')
    os.chdir(output_name)

    ## common part of the command
    busco_cmd = '{} -f -i {} -c {} --mode {} --download_path {} '.format(
        busco_exe, fasta, threads, mode, busco_db)

    ## lineage selection: automatic or given dataset
    if dataset_name == "auto-lineage":
        busco_cmd += '--auto-lineage -o {} > {}'.format(dataset_name, 'auto_lineage.log')
    else:
        busco_cmd += '-l {0} -o {0} > {1}'.format(dataset_name, dataset_name + '.log')

    HCGB_sys.system_call(busco_cmd)

    ## the summary file tells whether BUSCO finished
    if not os.path.isfile(run_folder + '/short_summary.txt'):
        print (colored("BUSCO failed: Dataset [%s]; Sample [%s]" %(dataset_name, fasta), 'red'))
        return 'FAIL'

    HCGB_time.print_time_stamp(stamp_file)
    return ()
Esempio n. 17
0
def print_available_BUSCO():
    """Print on screen the datasets reported by ``busco --list-datasets``.

    The listing is written to a file named ``tmp`` in the current working
    directory, printed, and then ``tmp`` and the ``./busco_downloads`` folder
    that is left behind are removed.
    """
    HCGB_aes.print_sepLine("-", 100, False)
    busco_bin = set_config.get_exe('busco')

    ## get datasets
    busco_bin_call = busco_bin + ' --list-datasets > tmp'
    HCGB_sys.system_call(busco_bin_call, message=False)

    ## dump in screen
    with open("./tmp", 'r') as f:
        print(f.read())

    ## clean: downloaded files plus the temporary listing. The concatenation
    ## has to be stored, otherwise 'tmp' is never deleted.
    files2remove = HCGB_main.get_fullpath_list("./busco_downloads", False) + ['tmp']
    for file_path in files2remove:
        os.remove(file_path)
    os.rmdir("./busco_downloads/information")
    os.rmdir("./busco_downloads/")

    HCGB_aes.print_sepLine("-", 100, False)
    print ("\n")
def kma_ident_call(out_file, files, sample_name, index_name, kma_bin, option, threads):
    """Create kma system call for kmer identification.

    Paired-end (two files, ``-ipe``) or single end (``-i``, first file only)
    fastq files accepted. Standard error is stored in ``<out_file>.log``. It
    generates a time stamp if succeeds: ``.success_<basename of out_file>``
    within the folder containing ``out_file``.

    :param out_file: Absolute path and basename for the output files generated with results.
    :param files: List of absolute paths for fastq files to search againts the database.
    :param sample_name: Not used by this function.
    :param index_name: Database name
    :param kma_bin: Binary executable for KMA software.
    :param option: Additional options to pass to the system call.
    :param threads: Number of CPUs to use.

    :type out_file: string
    :type files: list
    :type sample_name: string
    :type index_name: string
    :type kma_bin: string
    :type option: string
    :type threads: integer

    :returns: ``'OK'`` if the system call succeeded, ``'FAIL'`` otherwise.
    """
    out_file_log = out_file + '.log'
    if len(files) == 2:
        cmd_kma_search = "%s -ipe %s %s -o %s -t_db %s -t %s %s 2> %s" %(
            kma_bin, files[0], files[1], out_file, index_name, threads, option, out_file_log)
    else:
        ## TODO: test Single End
        cmd_kma_search = "%s -i %s -o %s -t_db %s -t %s %s 2> %s" %(
            kma_bin, files[0], out_file, index_name, threads, option, out_file_log)

    code = HCGB_sys.system_call(cmd_kma_search)
    if code != 'OK':
        return 'FAIL'

    ## success stamp: joined with the folder so the file is created inside
    ## it (plain concatenation glued the name to the folder name instead)
    folder, basename_tag = os.path.split(out_file)
    filename_stamp = os.path.join(folder, '.success_' + basename_tag)
    HCGB_time.print_time_stamp(filename_stamp)
    return 'OK'
Esempio n. 19
0
def install_R_packages(package, source, install_path, extra):
    """Install an R package in ``install_path`` using the R helper scripts.

    Packages whose ``source`` is ``github`` are installed with the github
    script and named ``<extra>/<package>``; any other source uses the regular
    installation script.

    :param package: Package name.
    :param source: Package source; ``github`` selects the github installer.
    :param install_path: Folder where packages are installed (created here).
    :param extra: Prefix added to the package name for github packages.
    """
    (regular_script, github_script) = get_install_R_files()

    HCGB_files.create_folder(install_path)
    rscript = set_config.get_exe('Rscript')
    print("+ Installing %s package..." %package)

    if source == 'github':
        script2use = github_script
        package = extra + '/' + package
    else:
        script2use = regular_script

    HCGB_sys.system_call('{} {} -l {} -p {}'.format(rscript, script2use, package, install_path))

    ## NOTE(review): the check below looks for 'MLSTar' whatever package was
    ## requested -- confirm this is intended.
    if not os.path.exists(os.path.join(install_path, 'MLSTar')):
        print_error_message(package, "No R package found", 'package')
        print ('Please install manually to proceed...')
        return

    ## store the installation path for later use
    script_dir = os.path.dirname(os.path.realpath(__file__))
    info_file = os.path.join(script_dir, 'R', 'R_package.info.txt')
    HCGB_main.printList2file(info_file, [install_path])
Esempio n. 20
0
def ariba_pubmlstget(species, outdir):
    """Download a PubMLST typing scheme with ``ariba pubmlstget``.

    Usage of the ARIBA module::

        ariba pubmlstget [options] <"species in quotes"> <output_directory>

    It downloads the typing scheme for a given species from PubMLST and
    makes an ARIBA db.

    :param species: Species to download; it is quoted in the command.
    :param outdir: Output directory; it is created before the call.

    :returns: Value returned by ``HCGB_sys.system_call`` for the command.
    """
    print ("+ Call ariba module 'pubmlstget' to retrieve MLST information.")
    HCGB_files.create_folder(outdir)

    pubmlst_cmd = 'ariba pubmlstget "{}" {}'.format(species, outdir)
    return HCGB_sys.system_call(pubmlst_cmd)
Esempio n. 21
0
def install_git_repo(git_repo, folder_sofware, install_path, option, Debug):
    """Clone (or update) a git repository and optionally compile it.

    The work is done from ``install_path``. If ``folder_sofware`` already
    exists there, ``git pull`` is run inside it; otherwise ``git clone`` is
    run. When ``option`` is ``make``, ``make`` is executed inside
    ``folder_sofware``. The original working directory is always restored,
    even if one of the steps raises.

    :param git_repo: Repository to clone.
    :param folder_sofware: Folder of the repository (absolute, or relative
        to ``install_path``).
    :param install_path: Folder where the repository is stored.
    :param option: ``make`` to compile the software after getting the code.
    :param Debug: Forwarded to ``set_config.get_exe``.

    :returns: Always True. The status of the system calls is not inspected.
    """
    ## current path
    current_path = os.getcwd()
    os.chdir(install_path)

    try:
        print ('+ Using git to get code...')
        git_exe = set_config.get_exe('git', Debug)

        ## Resolve the folder once while in install_path: a relative name
        ## stays valid after moving into the repository for 'git pull'.
        repo_path = os.path.abspath(folder_sofware)

        if os.path.exists(repo_path):
            print ('+ Update repository (git pull)...')
            os.chdir(repo_path)
            cmd = git_exe + ' pull'
        else:
            print ('+ Clone repository...')
            cmd = git_exe + ' clone ' + git_repo

        ## call git
        HCGB_sys.system_call(cmd)

        ## compile if necessary
        if option == 'make':
            print ('+ Compile software...')
            os.chdir(repo_path)
            HCGB_sys.system_call('make')
    finally:
        ## chdir to previous path
        os.chdir(current_path)

    return True
def load_db(kma_bin, db2use):
    """
    Load the given database in memory (``kma shm ... -shmLvl 1``).

    :param kma_bin: Absolute path to KMA binary.
    :param db2use: Database to load in memory

    :type kma_bin: string
    :type db2use: string

    :returns: System call status.
    """
    shm_load_cmd = "{} shm -t_db {} -shmLvl 1".format(kma_bin, db2use)
    return HCGB_sys.system_call(shm_load_cmd)
Esempio n. 23
0
def makeblastdb(DBname, fasta, makeblastDBexe, dbtype='nucl'):
    """Generate a BLAST database unless it has already been generated.

    The database is considered available when its header index file exists:
    ``<DBname>.phr`` for protein databases (``dbtype='prot'``) and
    ``<DBname>.nhr`` otherwise. If the system call reports ``FAIL`` the
    command is printed and the interpreter exits.

    :param DBname: Database name (``-out``).
    :param fasta: Fasta file with the sequences (``-in``).
    :param makeblastDBexe: makeblastdb executable.
    :param dbtype: Database type (``-dbtype``), ``nucl`` by default.
    """
    ## protein databases store headers in .phr, nucleotide ones in .nhr;
    ## checking only .nhr rebuilt protein databases on every call
    header_ext = '.phr' if dbtype == 'prot' else '.nhr'

    if (os.path.isfile(DBname + header_ext)):
        print("+ BLAST database is already generated...")
    else:
        cmd_makeblast = "%s -in %s -input_type fasta -dbtype %s -out %s" % (
            makeblastDBexe, fasta, dbtype, DBname)
        code = system_call_functions.system_call(cmd_makeblast)

        if (code == 'FAIL'):
            print(
                colored(
                    '****ERROR: Some error happened during the makeblastDB command',
                    'red'))
            print(cmd_makeblast)
            exit()
Esempio n. 24
0
def ariba_summary(out, infileList, options):
    """Create ARIBA summary

    Calls ARIBA_ summary to summarise multiple ARIBA report files; it also
    creates Phandango_ files. Standard error is stored in ``<out>.log``.

    :param out: Prefix of output files
    :param infileList: Files to be summarised
    :param options: Additional options for ariba summary. See details below. Provide them within quotes.

    :type out: string
    :type infileList: list
    :type options: string

    :returns: functions.system_call message

    .. note:: Additional options for ariba_summary (as specified by ARIBA_)

        --cluster_cols col_ids			Comma separated list of cluster columns to include. Choose from: assembled, match, ref_seq, pct_id, ctg_cov, known_var, novel_var [match]

        --no_tree             Do not make phandango tree

        --min_id FLOAT        Minimum percent identity cutoff to count as assembled [90]

        --only_clusters Cluster_names			Only report data for the given comma-separated list of cluster names, eg: cluster1,cluster2,cluster42

        --v_groups      Show a group column for each group of variants

        --known_variants      Report all known variants

        --novel_variants      Report all novel variants

        --col_filter string			Choose whether columns where all values are "no" or "NA" are removed [yes/no]

        --row_filter string			Choose whether rows where all values are "no" or "NA" are removed [yes/no]


    .. seealso:: This function depends on other BacterialTyper functions called:

        - :func:`HCGB.functions.system_call_functions.system_call`

    """
    reports_arg = " ".join(infileList)
    stderr_log = out + '.log'

    summary_cmd = 'ariba summary {} {} {} 2> {}'.format(
        options, out, reports_arg, stderr_log)

    return HCGB_sys.system_call(summary_cmd)
def remove_db(kma_bin, db2use):
    """
    Remove the given database from memory (``kma shm ... -shmLvl 1 -destroy``).

    :param kma_bin: Absolute path to KMA binary.
    :param db2use: Database to remove from memory

    :type kma_bin: string
    :type db2use: string

    :returns: System call status.
    """
    shm_destroy_cmd = "{} shm -t_db {} -shmLvl 1 -destroy".format(kma_bin, db2use)
    return HCGB_sys.system_call(shm_destroy_cmd)
Esempio n. 26
0
def create_genomeDir(folder, STAR_exe, num_threads, fasta_file,
                     limitGenomeGenerateRAM):
    """Generate the STAR genome index (``--runMode genomeGenerate``).

    The index is created in a ``STAR_index`` subfolder of ``folder``. If the
    system call fails an error is printed and the interpreter exits.

    :param folder: Parent folder for the ``STAR_index`` subfolder.
    :param STAR_exe: STAR executable.
    :param num_threads: Value for ``--runThreadN``.
    :param fasta_file: Value for ``--genomeFastaFiles``.
    :param limitGenomeGenerateRAM: Value for ``--limitGenomeGenerateRAM``.

    :returns: Path of the genome index folder as returned by
        ``files_functions.create_subfolder``.
    """
    genomeDir = files_functions.create_subfolder("STAR_index", folder)

    cmd_create = "%s --runMode genomeGenerate --limitGenomeGenerateRAM %s --runThreadN %s --genomeDir %s --genomeFastaFiles %s" % (
        STAR_exe, limitGenomeGenerateRAM, num_threads, genomeDir, fasta_file)

    print('\t+ genomeDir generation for STAR mapping')
    create_code = system_call_functions.system_call(cmd_create, False, True)

    ## Other callers of this helper compare its result with the strings
    ## 'OK'/'FAIL'; a non-empty string is always truthy, so 'FAIL' has to be
    ## tested explicitly. Falsy values are still treated as failure.
    if not create_code or create_code == 'FAIL':
        print("** ERROR: Some error ocurred during genomeDir creation... **")
        exit()

    return (genomeDir)
Esempio n. 27
0
def trimmo_call(java_path, sample_folder, sample_name, files, trimmomatic_jar, threads, trimmomatic_adapters, Debug):
    """Call trimmomatic for single-end or paired-end files.

    ``sample_folder`` must exist before calling this function. It can be
    called from main or from a module.

    See http://www.usadellab.org/cms/?page=trimmomatic. Steps applied::

        ILLUMINACLIP:fasta_file.fa:2:30:10 LEADING:11 TRAILING:11 SLIDINGWINDOW:4:20 MINLEN:24

    - Remove adapters (ILLUMINACLIP:fasta_file.fa:2:30:10)
    - Remove leading low quality or N bases (below quality 11) (LEADING:11)
    - Remove trailing low quality or N bases (below quality 11) (TRAILING:11)
    - Scan the read with a 4-base wide sliding window, cutting when the
      average quality per base drops below 20 (SLIDINGWINDOW:4:20)
    - Drop reads below the 24 bases long (MINLEN:24)

    :param java_path: Java executable; only used if ``trimmomatic_jar`` ends with ``jar``.
    :param sample_folder: Folder for trimmed reads, logs and success stamp.
    :param sample_name: Sample name, prefix of every file generated.
    :param files: Two files for paired-end mode; otherwise the first one is
        processed as single-end.
    :param trimmomatic_jar: Trimmomatic jar file, or a wrapper executable
        (e.g. conda installation) that requires no java call.
    :param threads: Value for ``-threads``.
    :param trimmomatic_adapters: Adapters fasta file for ILLUMINACLIP.
    :param Debug: True/False for debugging messages.

    :returns: ``'OK'`` if the system call succeeded (a ``.success`` stamp is
        written in ``sample_folder``), ``'FAIL'`` otherwise.
    """
    ## debug message
    if Debug:
        print (colored("+ Cutting adapters for sample: " + sample_name, 'yellow'))

    ## common prefix of every file generated for this sample
    sample_prefix = sample_folder + '/' + sample_name
    trimlog_file = sample_prefix + '_call.log'
    stderr_file = sample_prefix + '.log'

    ## jar file requires java; a wrapper is called directly
    if trimmomatic_jar.endswith('jar'):
        launcher = "{} -jar {}".format(java_path, trimmomatic_jar)
    else:
        launcher = "{}".format(trimmomatic_jar)

    ## input and output reads according to the library type
    if len(files) == 2:
        mode = 'PE'
        reads = [
            files[0],
            files[1],
            sample_prefix + '_trim_R1.fastq',
            sample_prefix + '_orphan_R1.fastq',
            sample_prefix + '_trim_R2.fastq',
            sample_prefix + '_orphan_R2.fastq',
        ]
    else:
        mode = 'SE'
        reads = [files[0], sample_prefix + '_trim.fastq']

    trimming_steps = "ILLUMINACLIP:{}:2:30:10 LEADING:11 TRAILING:11 SLIDINGWINDOW:4:20 MINLEN:24".format(
        trimmomatic_adapters)

    trimmo_cmd = launcher + " {} -threads {} -trimlog {} {} {} 2> {}".format(
        mode, threads, trimlog_file,
        " ".join(str(read) for read in reads),
        trimming_steps, stderr_file)

    ## system call & return
    if HCGB_sys.system_call(trimmo_cmd) != 'OK':
        return 'FAIL'

    ## success stamp
    HCGB_time.print_time_stamp(sample_folder + '/.success')
    return 'OK'
Esempio n. 28
0
def biotype_all(featureCount_exe, path, gtf_file, bam_file, name, threads, Debug, allow_multimap, stranded):
    """Count reads per transcript biotype with featureCounts and parse results.

    Steps:

    - If ``<path>/.success_all`` exists nothing is done.
    - featureCounts is run (unless ``<path>/.success_featureCounts`` exists)
      and a time stamp is written. If the call fails an error is printed and
      the interpreter exits.
    - ``parse_featureCount`` is called on the featureCounts output.

    :param featureCount_exe: featureCounts executable.
    :param path: Folder for results; created if missing.
    :param gtf_file: Annotation file (``-a``).
    :param bam_file: Alignment file to count.
    :param name: Sample name.
    :param threads: Value for ``-T``.
    :param Debug: True/False for debugging messages.
    :param allow_multimap: If true ``-M -O`` is used, otherwise ``--largestOverlap``.
    :param stranded: Value for ``-s``.

    :returns: Always an empty tuple.
    """
    ## folder for results
    if not os.path.isdir(path):
        files_functions.create_folder(path)

    out_file = os.path.join(path, 'featureCount.out')
    logfile = os.path.join(path, name + '_RNAbiotype.log')

    ## whole analysis previously done
    filename_stamp_all = path + '/.success_all'
    if os.path.isfile(filename_stamp_all):
        stamp = time_functions.read_time_stamp(filename_stamp_all)
        print (colored("\tA previous command generated results on: %s [%s -- %s]" %(stamp, name, 'RNAbiotype'), 'yellow'))
        return ()

    filename_stamp_featureCounts = path + '/.success_featureCounts'
    if os.path.isfile(filename_stamp_featureCounts):
        stamp = time_functions.read_time_stamp(filename_stamp_featureCounts)
        print (colored("\tA previous command generated results on: %s [%s -- %s]" %(stamp, name, 'featureCounts'), 'yellow'))
    else:
        ## debugging messages
        if Debug:
            print ("** DEBUG:")
            print ("featureCounts system call for sample: " + name)
            print ("out_file: " + out_file)
            print ("logfile: " + logfile)

        ## allow multimapping or keep the largest overlap
        overlap_option = '-M -O' if allow_multimap else '--largestOverlap'
        cmd_featureCount = '%s -s %s %s -T %s -p -t exon -g transcript_biotype -a %s -o %s %s 2> %s' %(
            featureCount_exe, stranded, overlap_option, threads, gtf_file, out_file, bam_file, logfile)

        ## system call
        cmd_featureCount_code = system_call_functions.system_call(cmd_featureCount, False, True)

        ## Other callers of this helper compare its result with the strings
        ## 'OK'/'FAIL'; 'FAIL' is truthy, so it has to be tested explicitly.
        ## Falsy values are still treated as failure.
        if not cmd_featureCount_code or cmd_featureCount_code == 'FAIL':
            print("** ERROR: featureCount failed for sample " + name)
            exit()

        ## print time stamp
        time_functions.print_time_stamp(filename_stamp_featureCounts)

    ## parse results
    (extended_Stats_file, RNAbiotypes_stats_file) = parse_featureCount(out_file, path, name, bam_file, Debug)

    ## debugging messages
    if Debug:
        print ("** DEBUG:")
        print ("extended_Stats: " + extended_Stats_file)
        print (main_functions.get_data(extended_Stats_file, '\t', 'header=None'))
        print ("RNAbiotypes_stats: " + RNAbiotypes_stats_file)
        print (main_functions.get_data(RNAbiotypes_stats_file, '\t', 'header=None'))

    return ()
Esempio n. 29
0
def mapReads(option, reads, folder, name, STAR_exe, genomeDir, limitRAM_option,
             num_threads, Debug):
    """
    Map reads using STAR software. Some parameters are set for small RNA Seq.

    Parameters set according to ENCODE Project directives for small RNAs
    https://www.encodeproject.org/rna-seq/small-rnas/

    Standard output and error of STAR are stored in ``STAR.log`` and
    ``STAR.err`` within ``folder``.

    :param option: ``LoadAndKeep`` to use a genome loaded in shared memory;
        anything else uses ``NoSharedMemory``.
    :param reads: List containing absolute path to reads (SE or PE)
    :param folder: Path for output results; created if missing.
    :param name: Sample name
    :param STAR_exe: Executable path for STAR binary
    :param genomeDir: Value for ``--genomeDir``.
    :param limitRAM_option: Value for ``--limitBAMsortRAM``.
    :param num_threads: Value for ``--runThreadN``.
    :param Debug: Not used by this function.

    :type option: string
    :type reads: list
    :type folder: string
    :type name: string
    :type STAR_exe: string
    :type genomeDir: string
    :type limitRAM_option: int
    :type num_threads: int

    :returns: Value returned by the system call helper.
    """
    print("\t+ Mapping sample %s using STAR" % name)

    if not os.path.isdir(folder):
        folder = files_functions.create_folder(folder)

    ## sorted BAM file name within the output folder (not used afterwards)
    bam_file_name = os.path.join(folder, 'Aligned.sortedByCoord.out.bam')

    ## one or two fastq files
    reads_arg = " ".join(reads)

    ## shared memory usage: multiple samples or just one
    if option == 'LoadAndKeep':
        genome_load = "--genomeLoad LoadAndKeep"
    else:
        genome_load = "--genomeLoad NoSharedMemory"

    ## every chunk but the genome load one ends with a blank
    chunks = [
        "{} --genomeDir {} --runThreadN {} ".format(STAR_exe, genomeDir,
                                                    num_threads),
        "--limitBAMsortRAM {} --outFileNamePrefix {} ".format(
            limitRAM_option, folder + '/'),
        ## some common options
        "--alignSJDBoverhangMin 1000 --outFilterMultimapNmax 1 --outFilterMismatchNoverLmax 0.03 ",
        "--outFilterScoreMinOverLread 0 --outFilterMatchNminOverLread 0 --outFilterMatchNmin 16 ",
        "--alignIntronMax 1 --outSAMheaderHD @HD VN:1.4 SO:coordinate --outSAMtype BAM SortedByCoordinate ",
        genome_load,
        " --readFilesIn {} ".format(reads_arg),
    ]

    ## logfile & errfile
    stdout_file = os.path.join(folder, 'STAR.log')
    stderr_file = os.path.join(folder, 'STAR.err')
    chunks.extend([' > ', stdout_file, ' 2> ', stderr_file])

    ## send command
    return system_call_functions.system_call("".join(chunks), False, True)
def get_symbolic_link_file(file2link, newfile):
    """Link ``file2link`` as ``newfile`` using an ``ln -s`` system call.

    :param file2link: Existing file, first argument of ``ln -s``.
    :param newfile: Name of the link, second argument of ``ln -s``.
    """
    link_cmd = 'ln -s {} {}'.format(file2link, newfile)
    system_call_functions.system_call(link_cmd, returned=False)