def BUSCO_plot(outfolder):
    """Generate the BUSCO plot for the results stored in a folder.

    Moves the process into ``outfolder`` and runs the ``generate_plot``
    executable with ``-wd`` pointing at that same folder.

    :param outfolder: Folder passed to the plotting executable as working directory.
    :type outfolder: string

    :returns: Empty tuple.
    """
    plot_exe = set_config.get_exe('generate_plot')

    ## NOTE: the working directory is changed here and it is not restored.
    os.chdir(outfolder)

    plot_cmd = '%s -wd %s' % (plot_exe, outfolder)
    HCGB_sys.system_call(plot_cmd)
    return ()
def get_symbolic_link(sample_list, directory):
    """Link every entry of a list into a directory using ``ln -s`` system calls.

    :param sample_list: Paths to be linked, one ``ln -s`` call each.
    :param directory: Target given as second argument of ``ln -s``.
    :type sample_list: list
    :type directory: string

    :returns: Content of ``directory`` (``os.listdir``) once all calls were sent.
    """
    link_template = 'ln -s %s %s'
    for source_path in sample_list:
        system_call_functions.system_call(
            link_template % (source_path, directory), returned=False)

    return os.listdir(directory)
def prokka_call(prokka_bin, sequence_fasta, kingdom, genus, outdir_name, name, threads):
    """Run Prokka_ to annotate an assembled genome.

    Builds the Prokka command line, sends it as a system call and redirects
    the standard error of the process to ``run.log`` in the output folder.

    :param prokka_bin: Path to the prokka binary file.
    :param sequence_fasta: Assembled sequences in fasta file format.
    :param kingdom: Kingdom mode for Prokka: Archaea|Bacteria|Mitochondria|Viruses
    :param genus: Genus to use; ``Other`` skips the ``--usegenus --genus`` options.
    :param outdir_name: Absolute path to the output folder to include results.
    :param name: Sample name, used both as file prefix and as locus tag.
    :param threads: Number of CPUs to use.

    :type prokka_bin: string
    :type sequence_fasta: string
    :type kingdom: string
    :type genus: string
    :type outdir_name: string
    :type name: string
    :type threads: integer

    :returns: ``outdir_name`` as provided.

    .. seealso:: Check description of output files generated in:

        - :ref:`Prokka-output-files`

    .. include:: ../../links.inc
    """
    print("\n+ Starting annotation for: %s\n" % name)

    stderr_log = outdir_name + '/run.log'

    ## annotation options; genus databases are only requested for a given genus
    prokka_options = "--cdsrnaolap --addgenes --addmrna --kingdom " + kingdom
    if genus != "Other":
        prokka_options += " --usegenus --genus " + genus

    cmd_fields = (prokka_bin, outdir_name, name, name, prokka_options,
                  threads, sequence_fasta, stderr_log)
    prokka_cmd = "%s --force --outdir %s --prefix %s --locustag %s %s --cpus %s %s 2> %s" % cmd_fields
    HCGB_sys.system_call(prokka_cmd)

    return outdir_name
def check_perl_package_version(package, Debug):
    """Retrieve the version of an installed perl package.

    A perl one-liner loads the package (``-M``) and prints its ``$VERSION``
    variable; the captured output is decoded and returned.

    :param package: package name
    :param Debug: True/False for debugging messages

    :type package: string
    :type Debug: boolean

    :returns: Decoded output of the perl one-liner.
    """
    perl_exe = get_exe('perl')
    perl_one_liner = "".join(
        (perl_exe, ' -M', package, " -e 'print $", package, "::VERSION';"))

    if Debug:
        print("** DEBUG: perl command:\n")
        print(perl_one_liner)

    ## run it capturing the output, without echoing the command
    raw_version = HCGB_sys.system_call(perl_one_liner, returned=True, message=False)
    return HCGB_main.decode(raw_version)
def ml_tree(folder, name, threads, output, Debug):
    """Reconstruct a Maximum Likelihood tree with IQ-Tree.

    The alignment ``<folder>/<name>.aln`` is used as input, results are
    written with the prefix ``<output>/iqtree_<name>`` and the standard error
    of the call goes to ``<output>/iqtree.error.log``. A message is printed
    when the system call does not report ``OK``.

    :param folder: Folder containing the alignment file.
    :param name: Name of the analysis.
    :param threads: Value given to ``--threads-max``.
    :param output: Folder for IQ-Tree results and log.
    :param Debug: True/false for debugging messages

    :type folder: string
    :type name: string
    :type output: string
    :type Debug: bool

    :returns: Empty tuple, whatever the status of the call.
    """
    iqtree_exe = set_config.get_exe('iqtree', Debug)
    bootstrap_replicates = '1000'

    alignment = os.path.join(folder, name + '.aln')
    error_log = os.path.join(output, 'iqtree.error.log')
    out_prefix = os.path.join(output, 'iqtree_' + name)

    iqtree_cmd = '%s -s %s -redo --threads-max %s --prefix %s -B %s 2> %s' % (
        iqtree_exe, alignment, threads, out_prefix, bootstrap_replicates, error_log)

    if HCGB_sys.system_call(iqtree_cmd) != 'OK':
        print("Some error occurred...")

    return ()
def multiQC_call(pathFile, name, folder, option):
    """Generate a multiQC_ report through a system call.

    ``--force`` is always set, so a report previously generated in the
    folder is replaced by the new one.

    :param pathFile: File containing list of files to include in report.
    :param name: Name to include in the html report.
    :param folder: Absolute path for the output folder.
    :param option: Extra options appended at the end of the multiQC call.

    :type pathFile: string
    :type name: string
    :type folder: string
    :type option: string

    :returns: Whatever ``HCGB_sys.system_call`` returns for the command.
    """
    ## the executable is expected to be available in $PATH
    multiqc_exe = "multiqc"

    call_fields = (multiqc_exe, folder, name, pathFile, option)
    multiqc_cmd = "%s --force -o %s -n %s -l %s -p -i 'MultiQC report' -b 'HTML report generated for multiple samples and steps' %s" % call_fields

    return HCGB_sys.system_call(multiqc_cmd)
def generate_seq_search_call(db, query, outfile, revcomp, start=0, end=-1, format='fasta'):
    """Retrieve a sequence, or a range of it, using ``efetch``.

    The standard output of the call is redirected to ``outfile``.

    :param db: Value for ``-db``.
    :param query: Value for ``-id``.
    :param outfile: File receiving the output of the command.
    :param revcomp: When true, ``-revcomp`` is added to the call.
    :param start: Value for ``-seq_start`` (first position to retrieve).
    :param end: Value for ``-seq_stop`` (last position to retrieve).
    :param format: Value for ``-format``.

    :returns: Whatever ``HCGB_sys.system_call`` returns for the command.
    """
    efetch_exe = set_config.get_exe("efetch")

    pieces = ["%s -db %s -id %s -seq_start %s -seq_stop %s -format %s" % (
        efetch_exe, db, query, start, end, format)]

    ## reverse complement on demand
    if revcomp:
        pieces.append(' -revcomp')

    ## output file
    pieces.append(' > %s' % outfile)

    return HCGB_sys.system_call("".join(pieces))
def snippy_core_call(list_folder, options, name, output_dir, output_format, Debug):
    """Create the core alignment of samples aligned to the same reference.

    ATTENTION: Requires sample names to be different within the first 10 characters.

    The standard error of the call is saved in ``<output_dir>/snippy_cmd.log``
    and ``<output_dir>/<name>`` is used as output prefix.

    :param list_folder: Folders given to snippy-core, one per sample.
    :param options: Not used in the command built here.
    :param name: Name appended to ``output_dir`` to build the prefix.
    :param output_dir: Folder for results and log file.
    :param output_format: Value for ``-aformat``.
    :param Debug: Passed along when retrieving the executable.

    :type list_folder: list
    :type options: string
    :type name: string

    :returns: Whatever ``HCGB_sys.system_call`` returns for the command.
    """
    snippy_core_exe = set_config.get_exe('snippy_core', Debug)

    folders_arg = " ".join(list_folder)
    stderr_log = os.path.join(output_dir, "snippy_cmd.log")
    out_prefix = os.path.join(output_dir, name)

    ## one reference only: it must be the same for all comparisons
    ## NOTE(review): the second folder is used, so at least two entries are
    ## required in list_folder -- confirm this is intended.
    reference_fasta = list_folder[1] + "/ref.fa"

    call_fields = (snippy_core_exe, output_format, reference_fasta,
                   out_prefix, folders_arg, stderr_log)
    return HCGB_sys.system_call('%s -aformat %s --ref %s --prefix %s %s 2> %s' % call_fields)
def SPADES_systemCall(sample_folder, file1, file2, name, SPADES_bin, options, threads, debug=False):
    """Generate SPADES system call.

    It calls system for SPADES and generates time stamp file in the folder
    provided (sample_folder + '/.success_assembly') for later analysis.

    Steps:

    - If the time stamp file already exists, nothing is run and **OK** is returned.
    - It generates system call for SPADES assembly.
    - It generates timestamp file when the call succeeds.

    :param sample_folder: Absolute path to store results. It must exists.
    :param file1: Absolute path to fastq reads (R1).
    :param file2: Absolute path to fastq reads (R2).
    :param name: Sample name or tag to identify sample.
    :param SPADES_bin: Binary executable for SPADES assembly software.
    :param options: Additional options for SPADES (e.g. --plasmid).
    :param threads: Number of CPUs to use.
    :param debug: Not used within this function.

    :type name: string
    :type sample_folder: string
    :type file1: string
    :type file2: string
    :type SPADES_bin: string
    :type options: string
    :type threads: integer

    :return: **OK** if a previous run was stamped or the system call succeeded.
    :rtype: string.
    :warnings: Returns **FAIL** if the system call did not report OK.
    """
    ## check if previously assembled and succeeded
    filename_stamp = sample_folder + '/.success_assembly'
    if os.path.isfile(filename_stamp):
        stamp = HCGB_time.read_time_stamp(filename_stamp)
        print (colored("\tA previous command generated results on: %s [%s]" %(stamp, name), 'yellow'))
        return 'OK'

    ## call system for SPADES sample given
    logFile = sample_folder + '/' + name + '.log'

    ## command
    ## - a space separates options from '-t', so options without a trailing
    ##   blank do not get glued to the next flag.
    ## - stderr is duplicated onto stdout ('2>&1'): redirecting both streams
    ##   separately to the same file opens it twice and the streams overwrite
    ##   each other.
    cmd_SPADES = '%s %s -t %s -o %s -1 %s -2 %s > %s 2>&1' % (
        SPADES_bin, options, threads, sample_folder, file1, file2, logFile)
    code = HCGB_sys.system_call(cmd_SPADES)

    if code == 'OK':
        ## success stamps
        HCGB_time.print_time_stamp(filename_stamp)
        return 'OK'

    return "FAIL"
def load_Genome(folder, STAR_exe, genomeDir, num_threads):
    """Load a STAR genome using ``--genomeLoad LoadAndExit``.

    A ``LoadMem`` subfolder is created in ``folder`` and used as output
    prefix of the STAR call.

    :param folder: Folder where the ``LoadMem`` subfolder is created.
    :param STAR_exe: STAR executable.
    :param genomeDir: Value for ``--genomeDir``.
    :param num_threads: Value for ``--runThreadN``.

    :returns: Whatever the system call helper returns for the command.
    """
    prefix_folder = files_functions.create_subfolder('LoadMem', folder)

    star_fields = (STAR_exe, genomeDir, num_threads, prefix_folder)
    load_cmd = "%s --genomeDir %s --runThreadN %s --outFileNamePrefix %s --genomeLoad LoadAndExit" % star_fields

    print('\t+ Loading memory for STAR mapping')
    return system_call_functions.system_call(load_cmd, False, True)
def remove_Genome(STAR_exe, genomeDir, folder, num_threads):
    """Remove a loaded STAR genome using ``--genomeLoad Remove``.

    A ``RemoveMem`` subfolder is created in ``folder`` and used as output
    prefix of the STAR call.

    :param STAR_exe: STAR executable.
    :param genomeDir: Value for ``--genomeDir``.
    :param folder: Folder where the ``RemoveMem`` subfolder is created.
    :param num_threads: Value for ``--runThreadN``.

    :returns: Whatever the system call helper returns for the command.
    """
    prefix_folder = files_functions.create_subfolder('RemoveMem', folder)

    star_fields = (STAR_exe, genomeDir, prefix_folder, num_threads)
    remove_cmd = "%s --genomeDir %s --outFileNamePrefix %s --runThreadN %s --genomeLoad Remove" % star_fields

    ## send command
    print('\t+ Removing memory loaded for STAR mapping')
    return system_call_functions.system_call(remove_cmd, False, True)
def print_list_prokka():
    """
    Lists the Prokka_ databases available, by sending the call:

    .. code-block:: sh

        prokka --listdb

    .. include:: ../../devel/results/print_list_prokka.txt
        :literal:

    .. seealso:: This function depends on other BacterialTyper functions called:

        - :func:`BacterialTyper.scripts.set_config.get_exe`

    .. include:: ../../links.inc
    """
    listdb_cmd = set_config.get_exe('prokka') + " --listdb"
    HCGB_sys.system_call(listdb_cmd)
def check_R_packages(install, install_path, Debug):
    """Check every required R package and optionally install missing ones.

    For each package listed by ``get_R_packages()`` an Rscript check is sent
    for the system installation; if that does not report ``OK`` a second
    check is sent for ``install_path``. The result of each check is reported
    with ``check_install_module``.

    NOTE(review): the indentation of this function was reconstructed; the
    install-path check and the install step are assumed to be fallbacks that
    only run when the previous check fails -- confirm against the original.

    :param install: If true, packages not found are installed via
        ``install_dependencies.install_R_packages``; otherwise a message
        asking for manual installation is printed.
    :param install_path: Path to check/install packages in. If empty, the
        value returned by ``R_package_path_installed()`` is used.
    :param Debug: True/False for debugging messages.
    """
    packages = get_R_packages()
    (check_install_system, check_install_path) = get_check_R_files()
    R_script_exe = get_exe('Rscript')

    ## if no install path was given, fall back to the one returned by
    ## R_package_path_installed()
    if not install_path:
        install_path = R_package_path_installed()

    ## the index of each row is used as the package name in the calls below
    for index, row in packages.iterrows():
        ## debugging messages
        if Debug:
            print('\n+ Check package: ', index)
            print('+ Source: ', row['source'])

        ## first try to check if package available in system
        cmd_check = R_script_exe + ' ' + check_install_system + ' -l ' + index
        code = HCGB_sys.system_call(cmd_check, message=False, returned=False)
        if (code == 'OK'):
            check_install_module('1', index, '0', 'package')
        else:
            check_install_module('0', index, '1', 'System package')

            ## check if installed in path
            cmd_check_path = R_script_exe + ' ' + check_install_path + ' -l ' + index + ' -p ' + install_path
            code2 = HCGB_sys.system_call(cmd_check_path, message=False, returned=False)
            if (code2 == 'OK'):
                check_install_module('1', index, '0', 'Install path package')
            else:
                check_install_module('0', index, '1', 'Install path package')

                ## not available anywhere: install it or ask the user to do so
                if (install):
                    install_dependencies.install_R_packages(
                        index, row['source'], install_path, row['extra'])
                else:
                    print(
                        "Please install module %s manually to continue with BacterialTyper" % index)
def ariba_expandflag(input_file, output_file):
    """Describe the flags found in an ARIBA report using ``ariba expandflag``.

    usage: ariba expandflag infile.tsv outfile.tsv

    ``ariba expandflag`` expands the flag column of a report from a number
    to a comma-separated list of flag bits. It is not run on the report
    itself (doing so distorted the file: spaces were converted to tabs).
    Instead, the unique values of the ``flag`` column are written to a
    temporary two-column file (``flag_<value>``, ``<value>``) and that file
    is expanded.

    :param input_file: ARIBA report in TSV format containing a ``flag`` column.
    :param output_file: Not used; the output name is derived from ``input_file``.

    :type input_file: string
    :type output_file: string

    :returns: Path of the description file: ``input_file`` without extension
        plus ``-description.tsv``.
    """
    data = pd.read_csv(input_file, header=0, sep='\t')
    unique_flags = set(data['flag'])

    tmp_name = os.path.splitext(input_file)[0]

    ## print only flag
    ## the context manager closes the handle even if a write fails
    flag_file = tmp_name + '-tmp.tsv'
    with open(flag_file, "w") as flag_file_hd:
        flag_file_hd.write('#flag_name\tflag\n')
        for flag in unique_flags:
            flag_file_hd.write('flag_{0}\t{0}\n'.format(flag))

    ## generate description
    flag_file_out = tmp_name + '-description.tsv'
    cmd = 'ariba expandflag "%s" %s' % (flag_file, flag_file_out)
    HCGB_sys.system_call(cmd)

    ## the temporary file is no longer needed
    os.remove(flag_file)

    return flag_file_out
def blastp(blastpexe, outFile, DBname, fasta, threads):
    """Run a blastp search and stop the program if the call fails.

    Results are written to ``outFile`` in tabular format
    (``6 std qlen slen``) using an e-value cutoff of 1e-20.

    :param blastpexe: blastp executable.
    :param outFile: Value for ``-out``.
    :param DBname: Value for ``-db``.
    :param fasta: Value for ``-query``.
    :param threads: Value for ``-num_threads``.
    """
    search_fields = (blastpexe, DBname, fasta, outFile, threads)
    cmd_blastp = "%s -db %s -query %s -out %s -evalue 1e-20 -outfmt '6 std qlen slen' -num_threads %s" % search_fields

    status = system_call_functions.system_call(cmd_blastp)
    if status != 'FAIL':
        return

    ## report the command that failed and quit
    print(colored('****ERROR: Some error happened during the blastp command', 'red'))
    print(cmd_blastp)
    exit()
def BUSCO_run(sample_name, fasta, threads, output_name, dataset_name, mode, busco_db):
    """Run BUSCO for a sample and dataset unless a success stamp exists.

    Results are expected in ``<output_name>/<dataset_name>/run_<dataset_name>``;
    a ``.success`` stamp is written there when ``short_summary.txt`` is found
    after the call.

    :param sample_name: Sample name, only used in messages.
    :param fasta: Input file given to BUSCO with ``-i``.
    :param threads: Value for ``-c``.
    :param output_name: Folder where BUSCO is executed.
    :param dataset_name: Dataset (``-l``) or ``auto-lineage``.
    :param mode: Value for ``--mode``.
    :param busco_db: Value for ``--download_path``.

    :returns: ``'FAIL'`` if no summary file is found after the call, an
        empty tuple otherwise.
    """
    run_folder = os.path.join(output_name, dataset_name + '/run_' + dataset_name)

    ## timestamp
    success_stamp = run_folder + '/.success'

    print (colored("\tBUSCO Dataset [%s]; Sample [%s]" %(dataset_name, sample_name), 'yellow'))

    ## previous run: nothing else to do
    if os.path.isfile(success_stamp):
        timestamp = HCGB_time.read_time_stamp(success_stamp)
        print (colored("\tSuccessfully run on date: %s" %timestamp, 'green'))
        return ()

    busco_bin = set_config.get_exe('busco')

    ## NOTE: the working directory is changed and it is not restored.
    os.chdir(output_name)

    ## common part of the command
    busco_cmd = '%s -f -i %s -c %s --mode %s --download_path %s ' %(busco_bin, fasta, threads, mode, busco_db)

    ## autolineage or given dataset
    if dataset_name == "auto-lineage":
        busco_cmd += '--auto-lineage -o %s > %s' %(dataset_name, 'auto_lineage.log')
    else:
        busco_cmd += '-l %s -o %s > %s' %(dataset_name, dataset_name, dataset_name + '.log')

    ## system call
    HCGB_sys.system_call(busco_cmd)

    if not os.path.isfile(run_folder + '/short_summary.txt'):
        print (colored("BUSCO failed: Dataset [%s]; Sample [%s]" %(dataset_name, fasta), 'red'))
        return 'FAIL'

    ## timestamp
    HCGB_time.print_time_stamp(success_stamp)
    return ()
def print_available_BUSCO():
    """Print the datasets reported by ``busco --list-datasets``.

    The output of the call is redirected to a file named ``tmp`` in the
    current working directory, dumped on screen and removed afterwards
    together with the ``./busco_downloads`` folder created by the call.
    """
    HCGB_aes.print_sepLine("-", 100, False)

    busco_bin = set_config.get_exe('busco')

    ## get datasets
    busco_bin_call = busco_bin + ' --list-datasets > tmp'
    HCGB_sys.system_call(busco_bin_call, message=False)

    ## dump in screen
    with open("./tmp", 'r') as f:
        print(f.read())

    ## clean
    ## 'tmp' has to be part of the list that is iterated: concatenating
    ## without keeping the result left the file behind.
    list_files = HCGB_main.get_fullpath_list("./busco_downloads", False)
    for i in list_files + ['tmp']:
        os.remove(i)

    os.rmdir("./busco_downloads/information")
    os.rmdir("./busco_downloads/")

    HCGB_aes.print_sepLine("-", 100, False)
    print ("\n")
def kma_ident_call(out_file, files, sample_name, index_name, kma_bin, option, threads):
    """Create kma system call for kmer identification.

    Paired-end (two files) or single end fastq files accepted. The standard
    error of the call is saved in ``<out_file>.log`` and a time stamp is
    written if the call succeeds.

    :param out_file: Absolute path and basename for the output files generated with results.
    :param files: List of absolute paths for fastq files to search against the database.
    :param sample_name: Not used in the command built here.
    :param index_name: Database name
    :param kma_bin: Binary executable for KMA software.
    :param option: Additional options to pass to the system call.
    :param threads: Number of CPUs to use.

    :type out_file: string
    :type files: list
    :type sample_name: string
    :type index_name: string
    :type kma_bin: string
    :type option: string
    :type threads: integer

    :returns: ``OK`` if the system call reported OK, ``FAIL`` otherwise.
    """
    stderr_log = out_file + '.log'

    if len(files) == 2:
        search_cmd = "%s -ipe %s %s -o %s -t_db %s -t %s %s 2> %s" %(
            kma_bin, files[0], files[1], out_file, index_name, threads, option, stderr_log)
    else:
        ## TODO: test Single End
        search_cmd = "%s -i %s -o %s -t_db %s -t %s %s 2> %s" %(
            kma_bin, files[0], out_file, index_name, threads, option, stderr_log)

    if HCGB_sys.system_call(search_cmd) != 'OK':
        return 'FAIL'

    ## success stamps
    ## NOTE(review): no path separator is added between the folder and
    ## '.success_', so the stamp is written next to the folder rather than
    ## inside it -- confirm where the stamp is looked for.
    stamp_file = os.path.dirname(out_file) + '.success_' + os.path.basename(out_file)
    HCGB_time.print_time_stamp(stamp_file)
    return 'OK'
def install_R_packages(package, source, install_path, extra):
    """Install an R package in a given path using the Rscript installers.

    Packages from ``github`` use the github installer script and are
    referred as ``<extra>/<package>``; any other source uses the default
    installer script.

    After the call, the presence of ``<install_path>/MLSTar`` decides the
    outcome: if found, ``install_path`` is saved in ``R/R_package.info.txt``
    next to this file; otherwise an error message is printed.

    :param package: Package name.
    :param source: Origin of the package; ``github`` is treated separately.
    :param install_path: Folder to install the package in. Created if needed.
    :param extra: Prefix added to the package name for github packages.
    """
    (default_installer, github_installer) = get_install_R_files()

    HCGB_files.create_folder(install_path)
    Rscript_exe = set_config.get_exe('Rscript')

    print("+ Installing %s package..." %package)

    if source == 'github':
        installer_script = github_installer
        package = extra + '/' + package
    else:
        installer_script = default_installer

    install_cmd = '%s %s -l %s -p %s' %(Rscript_exe, installer_script, package, install_path)
    HCGB_sys.system_call(install_cmd)

    ## check if exists or try to install
    if os.path.exists(os.path.join(install_path, 'MLSTar')):
        script_folder = os.path.dirname(os.path.realpath(__file__))
        info_file = os.path.join(script_folder, 'R', 'R_package.info.txt')
        HCGB_main.printList2file(info_file, [install_path])
    else:
        print_error_message(package, "No R package found", 'package')
        print ('Please install manually to proceed...')
def ariba_pubmlstget(species, outdir):
    """Download a PubMLST typing scheme using ``ariba pubmlstget``.

    usage: ariba pubmlstget [options] <"species in quotes"> <output_directory>

    :param species: Species to download; it is quoted in the command.
    :param outdir: Output directory given to ariba.

    :type species: string
    :type outdir: string

    :returns: Whatever ``HCGB_sys.system_call`` returns for the command.
    """
    print ("+ Call ariba module 'pubmlstget' to retrieve MLST information.")

    ## NOTE(review): the ariba usage text says the output directory must not
    ## already exist, yet it is created here before the call -- confirm.
    HCGB_files.create_folder(outdir)

    pubmlst_cmd = 'ariba pubmlstget "%s" %s' %(species, outdir)
    return HCGB_sys.system_call(pubmlst_cmd)
def install_git_repo(git_repo, folder_sofware, install_path, option, Debug):
    """Clone (or update) a git repository and optionally compile it.

    The work is done from ``install_path``: if ``folder_sofware`` already
    exists a ``git pull`` is sent from within it, otherwise the repository
    is cloned. When ``option`` is ``make``, ``make`` is called inside
    ``folder_sofware``. The initial working directory is always restored.

    :param git_repo: Repository to clone.
    :param folder_sofware: Folder of the software; relative paths are
        resolved against ``install_path``.
    :param install_path: Folder where the repository is cloned.
    :param option: ``make`` to compile the software after getting the code.
    :param Debug: Passed along when retrieving the git executable.

    :returns: True
    """
    ## current path
    current_path = os.getcwd()
    os.chdir(install_path)

    try:
        ## resolve the folder once: a relative path would otherwise be
        ## entered twice (pull + make) and fail the second time
        software_path = os.path.abspath(folder_sofware)

        ## git clone repo
        print ('+ Using git to get code...')
        git_exe = set_config.get_exe('git', Debug)

        if os.path.exists(software_path):
            print ('+ Pull repository...')
            ## pull
            os.chdir(software_path)
            cmd = git_exe + ' pull'
        else:
            print ('+ Clone repository...')
            ## clone
            cmd = git_exe + ' clone ' + git_repo

        ## call git
        HCGB_sys.system_call(cmd)

        ## compile if necessary
        if option == 'make':
            print ('+ Compile software...')
            os.chdir(software_path)
            HCGB_sys.system_call('make')
    finally:
        ## chdir to previous path, also when any step above raises
        os.chdir(current_path)

    return True
def load_db(kma_bin, db2use):
    """
    Load the given database in memory through ``kma shm``.

    :param kma_bin: Absolute path to KMA binary.
    :param db2use: Database to load in memory

    :type kma_bin: string
    :type db2use: string

    :returns: System call status.
    """
    shm_load_cmd = "%s shm -t_db %s -shmLvl 1" %(kma_bin, db2use)
    return HCGB_sys.system_call(shm_load_cmd)
def makeblastdb(DBname, fasta, makeblastDBexe, dbtype='nucl'):
    """Generate a BLAST database unless it is already available.

    The database is considered available when its header file exists:
    ``<DBname>.phr`` for protein databases (``dbtype='prot'``) and
    ``<DBname>.nhr`` otherwise. If the ``makeblastdb`` call fails, the
    command is printed and the program stops.

    :param DBname: Path and basename of the database (``-out``).
    :param fasta: Fasta file used as input (``-in``).
    :param makeblastDBexe: makeblastdb executable.
    :param dbtype: Database type given to ``-dbtype``.

    :type DBname: string
    :type fasta: string
    :type makeblastDBexe: string
    :type dbtype: string
    """
    ## protein databases use a 'p' prefix in their file extensions; checking
    ## '.nhr' only made them to be regenerated on every call
    header_extension = '.phr' if dbtype == 'prot' else '.nhr'

    if os.path.isfile(DBname + header_extension):
        print("+ BLAST database is already generated...")
        return

    cmd_makeblast = "%s -in %s -input_type fasta -dbtype %s -out %s" % (
        makeblastDBexe, fasta, dbtype, DBname)
    code = system_call_functions.system_call(cmd_makeblast)

    if code == 'FAIL':
        print(
            colored(
                '****ERROR: Some error happened during the makeblastDB command',
                'red'))
        print(cmd_makeblast)
        exit()
def ariba_summary(out, infileList, options):
    """Create ARIBA summary

    Calls ARIBA_ summary to summarise multiple ARIBA report files. The
    standard error of the call is saved in ``<out>.log``.

    :param out: Prefix of output files
    :param infileList: Files to be summarised
    :param options: Additional options for ariba summary, placed right after
        ``ariba summary`` in the command. Provide them within quotes.

    :type out: string
    :type infileList: list
    :type options: string

    :returns: Whatever ``HCGB_sys.system_call`` returns for the command.

    .. note:: Additional options for ariba_summary (as specified by ARIBA_)

        --cluster_cols col_ids Comma separated list of cluster columns to include. Choose from: assembled, match, ref_seq, pct_id, ctg_cov, known_var, novel_var [match]

        --no_tree Do not make phandango tree

        --min_id FLOAT Minimum percent identity cutoff to count as assembled [90]

        --only_clusters Cluster_names Only report data for the given comma-separated list of cluster names, eg: cluster1,cluster2,cluster42

        --v_groups Show a group column for each group of variants

        --known_variants Report all known variants

        --novel_variants Report all novel variants

        --col_filter string Choose whether columns where all values are "no" or "NA" are removed [yes/no]

        --row_filter string Choose whether rows where all values are "no" or "NA" are removed [yes/no]
    """
    stderr_log = out + '.log'
    reports_arg = " ".join(infileList)

    summary_cmd = 'ariba summary %s %s %s 2> %s' %(options, out, reports_arg, stderr_log)
    return HCGB_sys.system_call(summary_cmd)
def remove_db(kma_bin, db2use):
    """
    Remove the given database from memory through ``kma shm -destroy``.

    :param kma_bin: Absolute path to KMA binary.
    :param db2use: Database to remove from memory

    :type kma_bin: string
    :type db2use: string

    :returns: System call status.
    """
    shm_destroy_cmd = "%s shm -t_db %s -shmLvl 1 -destroy" %(kma_bin, db2use)
    return HCGB_sys.system_call(shm_destroy_cmd)
def create_genomeDir(folder, STAR_exe, num_threads, fasta_file, limitGenomeGenerateRAM):
    """Generate the STAR genome index in a ``STAR_index`` subfolder.

    The program stops if the ``genomeGenerate`` call fails.

    :param folder: Folder where the ``STAR_index`` subfolder is created.
    :param STAR_exe: STAR executable.
    :param num_threads: Value for ``--runThreadN``.
    :param fasta_file: Value for ``--genomeFastaFiles``.
    :param limitGenomeGenerateRAM: Value for ``--limitGenomeGenerateRAM``.

    :returns: Path of the genome directory as returned by ``create_subfolder``.
    """
    genomeDir = files_functions.create_subfolder("STAR_index", folder)

    cmd_create = "%s --runMode genomeGenerate --limitGenomeGenerateRAM %s --runThreadN %s --genomeDir %s --genomeFastaFiles %s" % (
        STAR_exe, limitGenomeGenerateRAM, num_threads, genomeDir, fasta_file)

    print('\t+ genomeDir generation for STAR mapping')
    create_code = system_call_functions.system_call(cmd_create, False, True)

    ## other callers in this file compare the status against the 'FAIL'
    ## string, which is truthy: test for it explicitly besides falsy values
    if not create_code or create_code == 'FAIL':
        print("** ERROR: Some error ocurred during genomeDir creation... **")
        exit()

    return genomeDir
def trimmo_call(java_path, sample_folder, sample_name, files, trimmomatic_jar, threads, trimmomatic_adapters, Debug):
    """Call trimmomatic for single-end or paired-end files.

    ``sample_folder`` must exist before calling this function. A ``.success``
    time stamp is written in it when the call succeeds.

    http://www.usadellab.org/cms/?page=trimmomatic

    Steps requested to trimmomatic:

    - Remove adapters (ILLUMINACLIP:fasta_file.fa:2:30:10)
    - Remove leading low quality or N bases, below quality 11 (LEADING:11)
    - Remove trailing low quality or N bases, below quality 11 (TRAILING:11)
    - Scan the read with a 4-base wide sliding window, cutting when the
      average quality per base drops below 20 (SLIDINGWINDOW:4:20)
    - Drop reads below the 24 bases long (MINLEN:24)

    :param java_path: Java executable, used when a jar file is provided.
    :param sample_folder: Folder for trimmed reads and log files.
    :param sample_name: Sample name, used as prefix of the files generated.
    :param files: List of fastq files: two for paired-end, otherwise the
        first one is used as single-end.
    :param trimmomatic_jar: Trimmomatic jar file or wrapper executable.
    :param threads: Value for ``-threads``.
    :param trimmomatic_adapters: Adapters file for ILLUMINACLIP.
    :param Debug: True/False for debugging messages.

    :returns: ``OK`` if the system call reported OK, ``FAIL`` otherwise.
    """
    ## debug message
    if Debug:
        print (colored("+ Cutting adapters for sample: " + sample_name, 'yellow'))

    sample_prefix = sample_folder + '/' + sample_name

    ## log files
    trimlog_file = sample_prefix + '_call.log'
    stderr_log = sample_prefix + '.log'

    ## conda installation includes a wrapper and no java jar call is required
    if trimmomatic_jar.endswith('jar'):
        base_cmd = "%s -jar %s" %(java_path, trimmomatic_jar)
    else:
        base_cmd = "%s" %(trimmomatic_jar)

    trimming_steps = "ILLUMINACLIP:%s:2:30:10 LEADING:11 TRAILING:11 SLIDINGWINDOW:4:20 MINLEN:24 2> %s"

    if len(files) == 2:
        ## paired-end
        pe_template = " PE -threads %s -trimlog %s %s %s %s %s %s %s " + trimming_steps
        trimmo_cmd = base_cmd + pe_template % (
            threads, trimlog_file, files[0], files[1],
            sample_prefix + '_trim_R1.fastq', sample_prefix + '_orphan_R1.fastq',
            sample_prefix + '_trim_R2.fastq', sample_prefix + '_orphan_R2.fastq',
            trimmomatic_adapters, stderr_log)
    else:
        ## single end
        se_template = " SE -threads %s -trimlog %s %s %s " + trimming_steps
        trimmo_cmd = base_cmd + se_template % (
            threads, trimlog_file, files[0], sample_prefix + '_trim.fastq',
            trimmomatic_adapters, stderr_log)

    ## system call & return
    if HCGB_sys.system_call(trimmo_cmd) != 'OK':
        return 'FAIL'

    ## success stamps
    HCGB_time.print_time_stamp(sample_folder + '/.success')
    return 'OK'
def biotype_all(featureCount_exe, path, gtf_file, bam_file, name, threads, Debug, allow_multimap, stranded):
    """Count reads per transcript biotype with featureCounts and parse results.

    Nothing is done if ``<path>/.success_all`` exists. featureCounts is
    skipped if ``<path>/.success_featureCounts`` exists; otherwise it is
    called and that stamp is written. The program stops if the call fails.
    Results are then parsed with ``parse_featureCount``.

    NOTE(review): the indentation of this function was reconstructed; the
    parsing step is assumed to run whether featureCounts was just executed
    or found from a previous run -- confirm against the original.

    :param featureCount_exe: featureCounts executable.
    :param path: Folder for results; created if missing.
    :param gtf_file: Annotation file (``-a``).
    :param bam_file: Mapping file to count.
    :param name: Sample name.
    :param threads: Value for ``-T``.
    :param Debug: True/False for debugging messages.
    :param allow_multimap: If true ``-M -O`` is used, ``--largestOverlap`` otherwise.
    :param stranded: Value for ``-s``.

    :returns: Empty tuple.
    """
    ## folder for results
    if not os.path.isdir(path):
        files_functions.create_folder(path)

    out_file = os.path.join(path, 'featureCount.out')
    logfile = os.path.join(path, name + '_RNAbiotype.log')

    ## everything was already done for this sample
    filename_stamp_all = path + '/.success_all'
    if os.path.isfile(filename_stamp_all):
        stamp = time_functions.read_time_stamp(filename_stamp_all)
        print (colored("\tA previous command generated results on: %s [%s -- %s]" %(stamp, name, 'RNAbiotype'), 'yellow'))
        return ()

    filename_stamp_featureCounts = path + '/.success_featureCounts'
    if os.path.isfile(filename_stamp_featureCounts):
        stamp = time_functions.read_time_stamp(filename_stamp_featureCounts)
        print (colored("\tA previous command generated results on: %s [%s -- %s]" %(stamp, name, 'featureCounts'), 'yellow'))
    else:
        ## debugging messages
        if Debug:
            print ("** DEBUG:")
            print ("featureCounts system call for sample: " + name)
            print ("out_file: " + out_file)
            print ("logfile: " + logfile)

        ## send command for feature count
        ## Allow multimapping
        if allow_multimap:
            cmd_featureCount = ('%s -s %s -M -O -T %s -p -t exon -g transcript_biotype -a %s -o %s %s 2> %s' %(
                featureCount_exe, stranded, threads, gtf_file, out_file, bam_file, logfile))
        else:
            cmd_featureCount = ('%s -s %s --largestOverlap -T %s -p -t exon -g transcript_biotype -a %s -o %s %s 2> %s' %(
                featureCount_exe, stranded, threads, gtf_file, out_file, bam_file, logfile))

        ## system call
        cmd_featureCount_code = system_call_functions.system_call(cmd_featureCount, False, True)

        ## other callers in this file compare the status against the 'FAIL'
        ## string, which is truthy: test for it explicitly besides falsy values
        if not cmd_featureCount_code or cmd_featureCount_code == 'FAIL':
            print("** ERROR: featureCount failed for sample " + name)
            exit()

        ## print time stamp
        time_functions.print_time_stamp(filename_stamp_featureCounts)

    ## parse results
    (extended_Stats_file, RNAbiotypes_stats_file) = parse_featureCount(out_file, path, name, bam_file, Debug)

    ## debugging messages
    if Debug:
        print ("** DEBUG:")
        print ("extended_Stats: " + extended_Stats_file)
        print (main_functions.get_data(extended_Stats_file, '\t', 'header=None'))
        print ("RNAbiotypes_stats: " + RNAbiotypes_stats_file)
        print (main_functions.get_data(RNAbiotypes_stats_file, '\t', 'header=None'))

    return ()
def mapReads(option, reads, folder, name, STAR_exe, genomeDir, limitRAM_option, num_threads, Debug):
    """
    Map reads using STAR software. Some parameters are set for small RNA Seq.

    Parameters set according to ENCODE Project directives for small RNAs
    https://www.encodeproject.org/rna-seq/small-rnas/

    Standard output and error of the call are saved as ``STAR.log`` and
    ``STAR.err`` in the output folder.

    :param option: ``LoadAndKeep`` to use a genome loaded in shared memory;
        anything else maps with ``NoSharedMemory``.
    :param reads: List containing absolute path to reads (SE or PE)
    :param folder: Path for output results; created if missing.
    :param name: Sample name
    :param STAR_exe: Executable path for STAR binary
    :param genomeDir: Value for ``--genomeDir``.
    :param limitRAM_option: Value for ``--limitBAMsortRAM``.
    :param num_threads: Value for ``--runThreadN``.
    :param Debug: Not used within this function.

    :type option: string
    :type reads: list
    :type folder: string
    :type name: string
    :type STAR_exe: string
    :type genomeDir: string
    :type limitRAM_option: int
    :type num_threads: int

    :returns: Whatever the system call helper returns for the command.
    """
    print("\t+ Mapping sample %s using STAR" % name)

    if not os.path.isdir(folder):
        folder = files_functions.create_folder(folder)

    ## reads is a list with 1 or 2 read fastq files
    reads_arg = " ".join(reads)

    ## Multiple samples or just one?
    if option == 'LoadAndKeep':
        genome_load = "--genomeLoad LoadAndKeep"
    else:
        genome_load = "--genomeLoad NoSharedMemory"

    ## logfile & errfile
    stdout_log = os.path.join(folder, 'STAR.log')
    stderr_log = os.path.join(folder, 'STAR.err')

    cmd_parts = [
        "%s --genomeDir %s --runThreadN %s " % (STAR_exe, genomeDir, num_threads),
        "--limitBAMsortRAM %s --outFileNamePrefix %s " % (limitRAM_option, folder + '/'),
        ## some common options
        "--alignSJDBoverhangMin 1000 --outFilterMultimapNmax 1 --outFilterMismatchNoverLmax 0.03 ",
        "--outFilterScoreMinOverLread 0 --outFilterMatchNminOverLread 0 --outFilterMatchNmin 16 ",
        "--alignIntronMax 1 --outSAMheaderHD @HD VN:1.4 SO:coordinate --outSAMtype BAM SortedByCoordinate ",
        genome_load,
        ## ReadFiles
        " --readFilesIn %s " % reads_arg,
        ' > ' + stdout_log + ' 2> ' + stderr_log,
    ]

    ## sent command
    return system_call_functions.system_call("".join(cmd_parts), False, True)
def get_symbolic_link_file(file2link, newfile):
    """Link a file under a new name using an ``ln -s`` system call.

    :param file2link: File to be linked.
    :param newfile: Name of the link to create.
    """
    link_cmd = 'ln -s %s %s' % (file2link, newfile)
    system_call_functions.system_call(link_cmd, returned=False)