def filter_variants(vcf_file):
    """
    extract annotated fields from a VCF file into a tab-delimited table with SnpSift

    :param vcf_file: <str> annotated VCF file
    :return: <str> path to the SnpSift table
    """
    # locate the executable
    snpsift = find_executable(['SnpSift'])

    sample = os.path.basename(vcf_file).rsplit(".", 2)[0]
    snpsift_file = os.path.join(os.path.dirname(vcf_file),
                                sample + '.snpSift.table.txt')
    if os.path.exists(snpsift_file):
        logging.critical("SnpSift file {} exists!".format(snpsift_file))
    else:
        call = [
            '{} extractFields -s "," -e "." {} CHROM POS REF ALT "ANN[*].GENE" "ANN[*].GENEID" "ANN[*].IMPACT" '
            '"ANN[*].EFFECT" "ANN[*].FEATURE" "ANN[*].FEATUREID" "ANN[*].BIOTYPE" "ANN[*].RANK" "ANN[*].HGVS_C" '
            '"ANN[*].HGVS_P" "ANN[*].CDNA_POS" "ANN[*].CDNA_LEN" "ANN[*].CDS_POS" "ANN[*].CDS_LEN" "ANN[*].AA_POS" '
            '"ANN[*].AA_LEN" "ANN[*].DISTANCE" "EFF[*].EFFECT" "EFF[*].FUNCLASS" "EFF[*].CODON" "EFF[*].AA" '
            '"EFF[*].AA_LEN" > {}'.format(snpsift, vcf_file, snpsift_file)
        ]
        cmd = " ".join(call)
        # filter
        run_shell_command(cmd=cmd, raise_errors=False, extra_env=None)
    return snpsift_file
def build_standard(db, task='standard', n_threads=4, extra_args=""):
    """
    build standard kraken2 database

    :param db: <str> path/location to the database
    :param task: <str> operation to be performed
    :param n_threads: <int> number of cpus/threads
    :param extra_args: <str> extra arguments passed to the executable
    :return: <str> path to the database directory
    """
    # locate the executable
    kraken2_build = find_executable(["kraken2-build"])

    # create db directory
    db = os.path.join(os.path.realpath(db), '')
    mkdir(db)

    lib_dir = os.path.join(db, 'library')
    if os.path.exists(lib_dir) and len(os.listdir(lib_dir)) > 0:
        logging.critical('library already downloaded -> \n\t{}'.format(lib_dir))
    else:
        # run process
        call = [
            "{} --{} --threads {} --use-ftp --db {} {}".format(
                kraken2_build, task, n_threads, db, extra_args)
        ]
        cmd = " ".join(call)

        # run the shell command
        logging.info("building standard database")
        run_shell_command(cmd=cmd, raise_errors=False, extra_env=None)
    return db
def backup_db():
    db_params = {'db': DB['pg_dbname'],
                 'user': DB['pg_user'],
                 'host': DB['pg_host'],
                 'port': DB['pg_port'],
                 'dt': datetime.datetime.now().strftime('%Y%m%d_%H%M%S'),
                 'arch_file_tail': DB_ARCNAME_TAIL,
                 'dest': DESTINATION_DIR}
    command = ('pg_dump -d {db} -U {user} -w -h {host} -p {port} '
               '-F tar -f {dest}/{dt}{arch_file_tail}').format(**db_params)
    run_shell_command(command=command)
def index_vcf(vcf_file):
    """
    index VCF file with tabix

    :param vcf_file: <str> VCF file to be indexed
    :return: <str> the indexed VCF file
    """
    call = ["tabix -p vcf -f {}".format(vcf_file)]
    cmd = " ".join(call)

    logging.info("indexing VCF file")
    run_shell_command(cmd=cmd, raise_errors=False, extra_env=None)
    return vcf_file
def run(self):
    '''
    :return:
    '''
    # command example
    # rtg-tools-3.8.4-bdba5ea_install/rtg vcfeval --baseline truth.vcf.gz \
    # --calls compare1.vcf.gz -o vcfeval_split_snp -t ref.sdf --output-mode=annotate --sample xx --squash-ploidy --regions ??
    cmd = ['java', utils.JAVA_XMX, '-jar', utils.RTGJAR, 'vcfeval',
           '-o', self.prefix,
           '--baseline', self.true_vcf,
           '-t', self.reference,
           ]
    if not self.exclude_filtered:
        cmd.append('--all-records')
    if not self.match_geno:
        cmd.append('--squash-ploidy')
    if self.sample:
        cmd.append('--sample')
        cmd.append(self.sample)
    if self.regions:
        cmd.append('--bed-regions')
        cmd.append(self.regions)
    if self.opts:
        cmd.append(self.opts)
    if len(self.vcfs) != 1:
        raise ValueError('vcfeval only takes 1 prediction VCF and 1 truth VCF: {0}'.format(self.vcfs))
    cmd.append('--calls')
    cmd.append(self.vcfs[0])

    tp = os.path.join(self.prefix, 'tp-baseline.vcf.gz')
    tp_predict = os.path.join(self.prefix, 'tp.vcf.gz')
    fn = os.path.join(self.prefix, 'fn.vcf.gz')
    fp = os.path.join(self.prefix, 'fp.vcf.gz')
    if utils.count_variants(self.true_vcf) == 0 and utils.count_variants(self.vcfs[0]) == 0:
        # both truth and prediction are empty, do nothing
        utils.makedirs([self.prefix])
        shutil.copyfile(self.true_vcf, tp)
        shutil.copyfile(self.true_vcf, fn)
        shutil.copyfile(self.vcfs[0], tp_predict)
        shutil.copyfile(self.vcfs[0], fp)
    else:
        if self.log_to_file:
            with utils.versatile_open(self.log_to_file, 'a') as logout:
                utils.run_shell_command(cmd, sys.stderr, logout)
        else:
            utils.run_shell_command(cmd, sys.stderr, sys.stderr)
        for i in (tp, tp_predict, fn, fp):
            if not os.path.exists(i):
                raise Exception('{0} was not generated by vcfeval. Please check and rerun.'.format(i))
    self.tp, self.tp_predict, self.fn, self.fp = tp, tp_predict, fn, fp
def run(self):
    """Generate the model adapted to the synthesis for the CMP part

    :returns: None
    :rtype:
    """
    self.mk_unseen_script()

    self.logger.info("CMP unseen model building")
    cmd = '%s -A -B -C %s -D -T 1 -p -i -H %s -w %s %s %s' % \
        (self.conf.HHEd, self.conf.TRAIN_CONFIG,
         self.cmp_model_fpath, self.conf.TMP_CMP_MMF,
         self.conf.TYPE_HED_UNSEEN_BASE + '_cmp.hed',
         self.full_list_fpath)
    run_shell_command(cmd, self.logger)
def download_taxonomy(db, task="download-taxonomy", n_threads=4, extra_args=""):
    """
    download the accession number to taxon maps, as well as the taxonomic
    name and tree information from NCBI.

    :param db: <str> path/location to the database
    :param task: <str> operation to be performed
    :param n_threads: <int> number of cpus/threads
    :param extra_args: <str> extra arguments passed to the executable
    :return: <str> path to the database directory
    """
    # locate the executable
    kraken2 = find_executable(["kraken2-build"])

    # create the database directory if not existing
    mkdir(db)

    taxonomy_dir = os.path.join(db, 'taxonomy')
    if os.path.exists(taxonomy_dir) and len(os.listdir(taxonomy_dir)) > 0:
        logging.critical('taxonomy already downloaded \n\t{}'.format(taxonomy_dir))
    else:
        call = [
            "{} --{} --use-ftp --threads {} --db {} {}".format(
                kraken2, task, n_threads, db, extra_args)
        ]
        cmd = " ".join(call)

        # run the shell command
        logging.info("downloading taxonomy")
        run_shell_command(cmd=cmd, raise_errors=False, extra_env=None)
    return db
def run(self):
    '''
    :return:
    '''
    cmd = [self.java, utils.JAVA_XMX, '-jar', utils.VARSIMJAR, 'vcfcompare',
           '-prefix', self.prefix,
           '-true_vcf', self.true_vcf,
           '-reference', self.reference,
           ]
    if self.exclude_filtered:
        cmd.append('-exclude_filtered')
    if self.match_geno:
        cmd.append('-match_geno')
    if self.sample:
        cmd.append('-sample')
        cmd.append(self.sample)
    if self.regions:
        cmd.append('-bed')
        cmd.append(self.regions)
    if self.disallow_partial_fp:
        cmd.append('-disallow_partial_fp')
    if self.sv_length is not None:
        # pass the flag and its value as separate tokens
        cmd.append('-sv_length')
        cmd.append(str(self.sv_length))
    if self.opts:
        cmd.append(self.opts)
    cmd.extend(self.vcfs)

    if self.log_to_file:
        with utils.versatile_open(self.log_to_file, 'a') as logout:
            utils.run_shell_command(cmd, sys.stdout, logout)
    else:
        utils.run_shell_command(cmd, sys.stdout, sys.stderr)

    tp = self.prefix + '_TP.vcf'
    fn = self.prefix + '_FN.vcf'
    fp = self.prefix + '_FP.vcf'
    for i in (tp, fn, fp):
        if not os.path.exists(i):
            raise Exception('{0} was not generated by VarSim vcfcompare. Please check and rerun.'.format(i))
    self.tp, self.fn, self.fp = tp, fn, fp
def download_library(db, library, task="download-library", n_threads=4, extra_args=""):
    """
    download sets of standard genomes/proteins while providing a reference.

    :param db: <str> path/location to the database
    :param library: <str> name of the reference library whose sequences will be downloaded
    :param task: <str> operation to be performed
    :param n_threads: <int> number of cpus/threads
    :param extra_args: <str> extra arguments passed to the executable
    :return: <str> path to the database directory
    """
    # locate the executable
    kraken2 = find_executable(["kraken2-build"])

    # create db directory
    db = os.path.join(os.path.realpath(db), '')
    mkdir(db)

    lib_dir = os.path.join(db, 'library', library)
    fnames = [
        'assembly_summary.txt', 'manifest.txt', 'prelim_map.txt',
        'library.fna.masked', 'library.fna'
    ]
    ind = list(os.path.join(lib_dir, f) for f in fnames)
    i_bool = [os.path.isfile(filename) for filename in ind]

    if all(i_bool):
        logging.critical("library files exist \n\t{}".format('\n\t'.join(
            [filename for filename in ind if os.path.isfile(filename)])))
    else:
        call = [
            "{} --{} {} --threads {} --use-ftp --db {} {}".format(
                kraken2, task, library, n_threads, db, extra_args)
        ]
        cmd = " ".join(call)

        # run the shell command
        logging.info("downloading library")
        run_shell_command(cmd=cmd, raise_errors=False, extra_env=None)
    return db
def run(self):
    """Generate the model adapted to the synthesis for the global variance part

    :returns: None
    :rtype:
    """
    self.mk_unseen_script()

    self.logger.info("Global variance unseen model building")
    cmd = '%s -A -B -C %s -D -T 1 -p -i -H %s -w %s %s %s' % \
        (self.conf.HHEd, self.conf.TRAIN_CONFIG,
         self.gv_path + '/clustered.mmf', self.conf.TMP_GV_MMF,
         self.conf.GV_HED_UNSEEN_BASE + '.hed',
         self.gv_path + '/gv.list')
    run_shell_command(cmd, self.logger)
def build_snpeff_db(reference, gff, snpeff_config, snpeff_db):
    """
    build SnpEff database for a reference genome

    :param reference: <str> reference genome in FASTA format
    :param gff: <str> genome annotation in GFF3 format
    :param snpeff_config: <str> SnpEff configuration file
    :param snpeff_db: <str> path to the SnpEff database directory
    :return: <tuple> the configuration file and the database data directory
    """
    # locate the executable
    snpeff = find_executable(['snpEff'])

    snpeff_db = os.path.abspath(snpeff_db)

    # create SnpEff database
    prefix = os.path.join(os.path.abspath(os.path.dirname(reference)),
                          os.path.splitext(os.path.basename(reference))[0])
    index_base = os.path.basename(prefix)
    snpeff_data_dir = os.path.join(snpeff_db, 'data')
    snpeff_genes_dir = os.path.join(snpeff_data_dir, index_base)
    mkdir(snpeff_data_dir)
    mkdir(snpeff_genes_dir)

    # copy the files
    copy_file(src=gff, dest=os.path.join(snpeff_genes_dir, 'genes.gff'))
    copy_file(src=reference, dest=os.path.join(snpeff_genes_dir, 'sequences.fa'))

    # add the genome to the configuration file
    snpeff_config = os.path.join(snpeff_db, 'snpeff.config')
    with open(snpeff_config, 'w') as f_obj:
        f_obj.write('{}.genome : {}\n'.format(index_base, index_base))

    # check if db exists and build if not
    db_bin = os.path.join(snpeff_genes_dir, 'snpEffectPredictor.bin')
    if os.path.exists(db_bin):
        logging.critical("SnpEff database exists for {}".format(index_base))
    else:
        # build db
        call = [
            "{} build -config {} -dataDir {} -gff3 -v {}".format(
                snpeff, snpeff_config, snpeff_data_dir, index_base)
        ]
        cmd = " ".join(call)
        logging.info("building SnpEff database: {}".format(gff))
        run_shell_command(cmd=cmd, raise_errors=False, extra_env=None)
    return snpeff_config, snpeff_data_dir
def generate(self, in_path, out_path, gen_labfile_base_lst, use_gv):
    """Parameter generation method.

    :param in_path: the path containing the label files
    :param out_path: the path where to store the parameters.
    :param gen_labfile_base_lst: the list of utt. to generate
    :param use_gv: switch to use the global variance
    :returns: None
    :rtype:
    """
    # Configuration part
    self.configuration_generator.generateTrainingConfiguration()
    self.configuration_generator.generateSynthesisConfiguration(use_gv)

    # Model part
    self.composition(use_gv)

    # Generate directory set
    dir_dict = {}
    for f in gen_labfile_base_lst:
        f = "%s.lab" % f
        parent = os.path.dirname(f)
        if parent not in dir_dict:
            dir_dict[parent] = []
        dir_dict[parent].append(f)

    # Parameter generation (per directory!)
    for k, v in dir_dict.items():
        os.makedirs("%s/%s" % (out_path, k), exist_ok=True)
        with open(self.conf.TMP_GEN_LABFILE_LIST_FNAME, "w") as f_lab:
            for f in v:
                f_lab.write("%s/%s\n" % (in_path, f))

        self.logger.info("Parameter generation")
        cmd = "%s " % self.conf.HMGenS
        if self.conf.imposed_duration:
            cmd += "-m "
        cmd += '-A -B -C %s -D -T 1 -S %s -t %s -c %d -H %s -N %s -M %s %s %s' % \
            (self.conf.SYNTH_CONFIG, self.conf.TMP_GEN_LABFILE_LIST_FNAME,
             self.conf.MODELLING["beam"], int(self.conf.pg_type),
             self.conf.TMP_CMP_MMF, self.conf.TMP_DUR_MMF,
             "%s/%s" % (out_path, k),
             self.conf.TYPE_TIED_LIST_BASE + '_cmp',
             self.conf.TYPE_TIED_LIST_BASE + '_dur')
        run_shell_command(cmd, self.logger)
def test_wait_for_ip(self, target_platforms):
    '''
    Wait for the VMs to boot. A VM can be a fresh one that is still running
    the setup; wait for the setup to complete.
    '''
    vbox = virtualbox_shell.VirtualBox()
    uuids_macs = {}
    for target_platform in target_platforms:
        os_name, architecture = target_platform["os_name"], target_platform["architecture"]
        presents, machine = self.__vm_presents(os_name, architecture)
        mac = vbox.get_mac_address(machine.uuid)
        target_platform["mac"] = mac
        # keep the owning platform record so the discovered address is
        # stored on the right entry below
        uuids_macs[machine.uuid] = (mac, machine.name, target_platform)

    print(f"Waiting for ping from {uuids_macs}")
    time_end = datetime.datetime.now() + datetime.timedelta(minutes=20)
    while len(uuids_macs) and (datetime.datetime.now() < time_end):
        uuid = next(iter(uuids_macs))
        mac, machine_name, target_platform = uuids_macs[uuid]
        res, hostname, ipaddress = utils.find_ip_by_mac(mac)
        if res:
            res = utils.run_shell_command(f"ping -c 1 {ipaddress}", None, None, True)
            if res:
                print(f"Got a response from {ipaddress} {hostname} for {mac} {machine_name} {uuid}")
                target_platform["ipaddress"] = ipaddress
                del uuids_macs[uuid]
        time.sleep(5.0)

    assert len(uuids_macs) == 0, "Failed to get a response to ping for " + str(uuids_macs)
def get_stats(vcf_file, stats_file):
    """
    calculate variant call stats with bcftools

    :param vcf_file: <str> VCF file
    :param stats_file: <str> file to write stats
    :return: <str> the stats file
    """
    # locate the executable
    bcftools = find_executable(['bcftools'])

    call = ["{} stats {} > {}".format(bcftools, vcf_file, stats_file)]
    cmd = " ".join(call)

    logging.info("calculating stats")
    run_shell_command(cmd=cmd, raise_errors=False, extra_env=None)
    return stats_file
def index(request):
    pip = os.path.join(sys.exec_prefix, 'bin', 'pip')
    if not os.path.isfile(pip):
        pip = 'pip'
    SHELL_COMMANDS = (
        ('Hostname', 'hostname'),
        ('hg version', 'hg id'),
        ('git version', "git log --pretty=format:'%h' -n 1"),
        ('hg branch', 'hg branch'),
        ('git branch', 'git rev-parse --abbrev-ref HEAD'),
        ('MySQL version', 'mysql --version'),
        ('Local Packages', '%s freeze -l' % pip)
    )
    SD = OrderedDict()
    for k, v in sorted(settings_list(), key=lambda x: x[0]):
        SD[k] = v
    context = RequestContext(request, {
        'args': sys.argv,
        'exe': sys.executable,
        'settings': SD,
    })
    context['versions'] = OrderedDict()
    # get versions
    curr_dir = os.path.realpath(os.path.dirname(__file__))
    for name, shell_command in SHELL_COMMANDS:
        try:
            result = utils.run_shell_command(shell_command, curr_dir)
            if result:
                if isinstance(result, list):
                    result = '<br>'.join(result)
                context['versions'][name] = result
        except Exception:
            pass
    # machine status
    context['machine'] = OrderedDict()
    if sys.platform == 'darwin':
        context['machine']['Uptime'] = 'not done yet on MacOS'
        context['machine']['Disk Space'] = 'not done yet on MacOS'
    elif sys.platform == 'win32':
        context['machine']['Uptime'] = 'not done yet on Windows'
        context['machine']['Disk Space'] = 'not done yet on Windows'
    else:
        context['machine']['Uptime'] = utils.server_uptime()
        context['machine']['Disk Space'] = utils.disk_usage('/')._asdict()
    if os.path.exists(settings.MEDIA_ROOT):
        context['machine']['Media Folder'] = utils.sizeof_fmt(utils.folder_size(settings.MEDIA_ROOT))
    context['stats'] = utils.get_available_stats()
    context['apps'] = [(app.__name__, ', '.join([model.__name__ for model in models]))
                       for app, models in all_concrete_models()]
    context['relations'] = [[(model.__name__,
                              ', '.join(['%s (%s) through %s' % (relation.__name__,
                                                                 relation.__module__,
                                                                 field.__class__.__name__)
                                         for field, relation in relations]),
                              app.__name__)
                             for model, relations in rel_info]
                            for app, rel_info in all_relations()]
    context['config_warnings'] = utils.get_configuration_warnings()
    return render_to_response('dashboard/index.html', context)
def summarize_results(prefix, tp, fn, fp, t, var_types, sv_length=100, regions=None, bed_either=False):
    '''
    count variants by type and tabulate

    :param prefix: output prefix
    :param tp: true positive VCF
    :param fn: false negative VCF
    :param fp: false positive VCF
    :param t: truth VCF
    :return: the sorted and compressed tp, fn, fp and t VCFs
    '''
    cmd = ['java', utils.JAVA_XMX, '-jar', utils.VARSIMJAR, 'vcfcompareresultsparser',
           '-prefix', prefix,
           '-tp', tp,
           '-fn', fn,
           '-fp', fp,
           '-t', t,
           '-sv_length', str(sv_length),
           ]
    if regions:
        cmd = cmd + ['-bed', regions]
    if bed_either:
        cmd = cmd + ['-bed_either']
    utils.run_shell_command(cmd, cmd_stdout=sys.stdout, cmd_stderr=sys.stderr)

    tp = prefix + "_tp.vcf"
    fn = prefix + "_fn.vcf"
    fp = prefix + "_fp.vcf"
    t = prefix + "_t.vcf"
    tp = utils.sort_and_compress(tp)
    fn = utils.sort_and_compress(fn)
    fp = utils.sort_and_compress(fp)
    t = utils.sort_and_compress(t)

    jsonfile = "{0}_report.json".format(prefix)
    metrics = ['tp', 'fp', 't', 'fn']

    stats = {k: {ii: 0 for ii in metrics} for k in var_types}
    parse_jsons(jsonfile, stats)
    print("Non-SV stats")
    print_stats(stats)

    sv_stats = {k: {ii: 0 for ii in metrics} for k in var_types}
    parse_jsons(jsonfile, sv_stats, count_sv=True)
    print("SV stats")
    print_stats(sv_stats)

    all_stats = {k: {ii: 0 for ii in metrics} for k in var_types}
    parse_jsons(jsonfile, all_stats, count_all=True)
    print("Overall stats")
    print_stats(all_stats)
    return tp, fn, fp, t
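# Usage sketch (an assumption, not from the source): summarize_results
# expects the per-category VCFs produced by one of the vcfcompare runners
# above; the paths and var_types keys here are illustrative only.
tp, fn, fp, t = summarize_results(
    prefix="eval/sample",
    tp="eval/sample_TP.vcf",
    fn="eval/sample_FN.vcf",
    fp="eval/sample_FP.vcf",
    t="truth.vcf",
    var_types=["SNP", "Insertion", "Deletion"],  # hypothetical type keys
    sv_length=100)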
def run_scan_alignments(record):
    work, reference, scan_alignments_binary, split_region_file, \
        input_bam, window_size, maf, min_mapq, max_dp, filter_duplicate, calc_qual = record
    if filter_duplicate:
        filter_duplicate_str = "--filter_duplicate"
    else:
        filter_duplicate_str = ""
    thread_logger = logging.getLogger("{} ({})".format(
        run_scan_alignments.__name__, multiprocessing.current_process().name))
    try:
        if not os.path.exists(scan_alignments_binary):
            raise IOError("File not found: {}".format(scan_alignments_binary))
        if not os.path.exists(work):
            os.mkdir(work)
        if len(pybedtools.BedTool(split_region_file)) > 0:
            cmd = "{} --ref {} -b {} -L {} --out_vcf_file {}/candidates.vcf --out_count_file {}/count.bed \
            --window_size {} --min_af {} --min_mapq {} --max_depth {} {}".format(
                scan_alignments_binary, reference, input_bam, split_region_file,
                work, work, window_size, maf, min_mapq,
                max_dp * window_size / 100.0, filter_duplicate_str)
            if calc_qual:
                cmd += " --calculate_qual_stat"
            run_shell_command(cmd, stdout=os.path.join(work, "scan.out"),
                              stderr=os.path.join(work, "scan.err"),
                              run_logger=thread_logger)
        else:
            pybedtools.BedTool([]).saveas(os.path.join(work, "candidates.vcf"))
            pybedtools.BedTool([]).saveas(os.path.join(work, "count.bed"))
            pysam.tabix_index(os.path.join(work, "count.bed"), preset="bed")
        concatenate_files([split_region_file], os.path.join(work, "region.bed"))
        return os.path.join(work, "candidates.vcf"), os.path.join(
            work, "count.bed.gz"), os.path.join(work, "region.bed")
    except Exception as ex:
        thread_logger.error(traceback.format_exc())
        thread_logger.error(ex)
        stderr_file = os.path.join(work, "scan.err")
        if os.path.exists(stderr_file) and os.path.getsize(stderr_file):
            thread_logger.error("Please check error log at {}".format(stderr_file))
        return None
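# Usage sketch (an assumption, not from the source): run_scan_alignments
# takes its arguments packed into one tuple, which makes it easy to fan
# out over region splits with multiprocessing.Pool. Every path and
# parameter value below is illustrative.
import multiprocessing

records = [
    (work, "ref.fa", "bin/scan_alignments", bed, "sample.bam",
     500, 0.01, 10, 1000, True, True)
    for work, bed in [("work/region_0", "splits/region_0.bed"),
                      ("work/region_1", "splits/region_1.bed")]
]
pool = multiprocessing.Pool(4)
# each worker returns (candidates.vcf, count.bed.gz, region.bed) or None
outputs = pool.map(run_scan_alignments, records)
pool.close()
pool.join()
if None in outputs:
    raise RuntimeError("scan_alignments failed on at least one region split")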
def __run_command(self, arguments, copy_to_output=False):
    lines = []
    command = "VBoxManage " + arguments
    log_prompt = None
    if copy_to_output:
        log_prompt = ""
    res = utils.run_shell_command(command, log_prompt, lines)
    assert res, f"Failed to run {command}"
    return lines
def test_clean_arp(self, target_platforms):
    '''
    Refresh the machines' entries in the ARP cache: ping each created
    machine at the IP address currently cached by ARP.
    '''
    vbox = virtualbox_shell.VirtualBox()
    for target_platform in target_platforms:
        os_name, architecture = target_platform["os_name"], target_platform["architecture"]
        presents, machine = self.__vm_presents(os_name, architecture)
        if not presents:
            continue
        mac = vbox.get_mac_address(machine.uuid)
        res, hostname, ipaddress = utils.find_ip_by_mac(mac)
        if not res:
            continue
        utils.run_shell_command(f"ping -c 1 {ipaddress}")
def generate_sdf(reference, log):
    '''
    take reference and generate SDF

    :param reference: reference genome in FASTA format
    :param log: optional log file to append rtg output to
    :return: path to the SDF directory
    '''
    sdf = reference + '.sdf'
    if os.path.exists(sdf):
        LOGGER.info('{0} exists, doing nothing'.format(sdf))
        LOGGER.info('to rerun SDF generation, please remove or rename {0}'.format(sdf))
        return sdf
    cmd = ['java', utils.JAVA_XMX, '-jar', utils.RTGJAR, 'format', '-o', sdf, reference]
    if log:
        with utils.versatile_open(log, 'a') as logout:
            utils.run_shell_command(cmd, logout, logout)
    else:
        utils.run_shell_command(cmd, sys.stdout, sys.stderr)
    return sdf
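# Usage sketch (an assumption, not from the source): the SDF directory
# produced here is what the vcfeval runners above receive as their
# reference ('-t'). The file names are illustrative.
sdf = generate_sdf('hs37d5.fa', log='sdf_format.log')   # -> 'hs37d5.fa.sdf'
# a second call is a cheap no-op while 'hs37d5.fa.sdf' exists on disk
assert sdf == generate_sdf('hs37d5.fa', log=None)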
def run_scan_alignments(record):
    # unpack the single work-item tuple (Python 2 style tuple parameters
    # are not valid syntax in Python 3)
    (work, reference, scan_alignments_binary, split_region_file,
     input_bam, window_size, maf, min_mapq, calc_qual, num_threads) = record

    if not os.path.exists(work):
        os.mkdir(work)
    if len(pybedtools.BedTool(split_region_file)) > 0:
        cmd = "{} --ref {} -b {} -L {} --out_vcf_file {}/candidates.vcf --out_count_file {}/count.bed \
        --window_size {} --min_af {} --min_mapq {} --num_thread {}".format(
            scan_alignments_binary, reference, input_bam, split_region_file,
            work, work, window_size, maf, min_mapq, num_threads)
        if calc_qual:
            cmd += " --calculate_qual_stat"
        run_shell_command(cmd, stdout=os.path.join(work, "scan.out"),
                          stderr=os.path.join(work, "scan.err"))
    else:
        pybedtools.BedTool([]).saveas(os.path.join(work, "candidates.vcf"))
        pybedtools.BedTool([]).saveas(os.path.join(work, "count.bed"))
        pysam.tabix_index(os.path.join(work, "count.bed"), preset="bed")
    concatenate_files([split_region_file], os.path.join(work, "region.bed"))
    return os.path.join(work, "candidates.vcf"), \
        os.path.join(work, "count.bed.gz"), \
        os.path.join(work, "region.bed")
def build(db, task="build", n_threads=4, extra_args=""):
    """
    build the database using 'kraken2-build --build' once the library has been installed

    :param db: <str> path/location to the database
    :param task: <str> operation to be performed
    :param n_threads: <int> number of cpus/threads
    :param extra_args: <str> extra arguments passed to the executable
    :return: <str> path to the database directory
    """
    # locate the executable
    kraken2 = find_executable(["kraken2-build"])

    # create db directory
    db = os.path.join(os.path.realpath(db), '')
    mkdir(db)

    # check if indices exist
    indices = ['opts.k2d', 'hash.k2d', 'taxo.k2d', 'seqid2taxid.map']
    ind = list(os.path.join(db, ext) for ext in indices)
    i_bool = [os.path.isfile(filename) for filename in ind]

    if all(i_bool):
        logging.critical("indices exist \n\t{}".format('\n\t'.join(
            [filename for filename in ind if os.path.isfile(filename)])))
    else:
        # run process
        call = [
            "{} --{} --threads {} --db {} {}".format(
                kraken2, task, n_threads, db, extra_args)
        ]
        cmd = " ".join(call)

        # run the shell command
        logging.info("building database")
        run_shell_command(cmd=cmd, raise_errors=False, extra_env=None)
    return db
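# Usage sketch (an assumption, not from the source): the kraken2-build
# helpers above compose into the usual custom-database workflow -- fetch
# the taxonomy, fetch one or more libraries, then build the indices.
# The database path and library names are illustrative.
db = "kraken2_db"
download_taxonomy(db, n_threads=8)
for library in ("archaea", "bacteria", "viral"):
    download_library(db, library, n_threads=8)
build(db, n_threads=8)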
def dbcan_cazymes(input_file, seq_type, tools, db_dir, out_dir, dbcan_args=""):
    """
    predict CAZymes with run_dbcan

    :param input_file: <str> input file in FASTA format
    :param seq_type: <str> sequence type of the input
    :param tools: <str> comma-separated tools for cazyme annotation
    :param db_dir: <str> path to the database directory
    :param out_dir: <str> path to output directory
    :param dbcan_args: <str> extra arguments passed on to the executable
    :return: <str> path to the output directory
    """
    # locate the executable
    dbcan = find_executable(['run_dbcan.py'])

    tools = list(map(str, tools.split(',')))
    diamond_out = os.path.join(out_dir, 'diamond.out')
    hmmer_out = os.path.join(out_dir, 'hmmer.out')
    hotpep_out = os.path.join(out_dir, 'Hotpep.out')
    if os.path.exists(diamond_out) and os.path.exists(hmmer_out) and os.path.exists(hotpep_out):
        logging.critical("CAZyme predicted outputs \n{}\n{}\n{} exist!".format(
            diamond_out, hmmer_out, hotpep_out))
    else:
        call = [
            "{} {} {} --tools {} --db_dir {} --out_dir {} {}".format(
                dbcan, input_file, seq_type, ",".join(tools),
                db_dir, out_dir, dbcan_args)
        ]
        cmd = " ".join(call)
        logging.info("CAZyme prediction on {}".format(os.path.basename(input_file)))
        run_shell_command(cmd=cmd, raise_errors=False, extra_env=None)
    return out_dir
def run(self):
    '''
    :return:
    '''
    cmd = ['java', utils.JAVA_XMX, '-jar', utils.VARSIMJAR, 'vcfcompare',
           '-prefix', self.prefix,
           '-true_vcf', self.true_vcf,
           '-reference', self.reference,
           ]
    if self.exclude_filtered:
        cmd.append('-exclude_filtered')
    if self.match_geno:
        cmd.append('-match_geno')
    if self.sample:
        cmd.append('-sample')
        cmd.append(self.sample)
    if self.regions:
        cmd.append('-bed')
        cmd.append(self.regions)
    if self.disallow_partial_fp:
        cmd.append('-disallow_partial_fp')
    if self.opts:
        cmd.append(self.opts)
    cmd.extend(self.vcfs)

    if self.log_to_file:
        with utils.versatile_open(self.log_to_file, 'a') as logout:
            utils.run_shell_command(cmd, sys.stdout, logout)
    else:
        utils.run_shell_command(cmd, sys.stdout, sys.stderr)

    tp = self.prefix + '_TP.vcf'
    fn = self.prefix + '_FN.vcf'
    fp = self.prefix + '_FP.vcf'
    for i in (tp, fn, fp):
        if not os.path.exists(i):
            raise Exception('{0} was not generated by VarSim vcfcompare. Please check and rerun.'.format(i))
    self.tp, self.fn, self.fp = tp, fn, fp
def count_reads(infile, outfile):
    """
    Perform a read count on a FASTQ file with reformat.sh, write a summary table

    :param infile: <str> path to the input file in fastq format
    :param outfile: <file> file object to write the summary to
    :return: <file> the summary file object
    """
    # locate the executable
    reformat = find_executable(["reformat.sh"])

    summary_dict = dict()
    call = ["{} in={} -Xmx4G".format(reformat, infile)]
    cmd = " ".join(call)

    # reformat.sh reports the read count in its log output; capture it in a
    # log file so it can be parsed below (the log file name is an assumption)
    logfile = outfile.name + ".log"
    try:
        logging.info(
            "[counting reads]\n\t" + cmd + "\n\tBrian Bushnell (2017)."
            "\n\tBBTools: a suite of fast, multithreaded bioinformatics tools designed for "
            "analysis of DNA and RNA sequence data. "
            "\n\thttps://jgi.doe.gov/data-and-tools/bbtools//\n ")
        with open(logfile, 'w') as f_out:
            process = run_shell_command(cmd=cmd, logfile=f_out, raise_errors=True)
        if process:
            with open(logfile, 'r') as log_obj:
                for line in log_obj:
                    m1 = re.match(r"^java.*$", line.strip())
                    m2 = re.match(r"^Input:.*$", line.strip())
                    if m1:
                        read = os.path.basename(m1.group(0).split()[-2]).strip("in=")
                        summary_dict[read] = ''
                    if m2:
                        read_count = int(m2.group(0).split()[1])
                        if read in summary_dict:
                            summary_dict[read] = read_count
    except Exception:
        raise Exception("ERROR: COUNTING READS FAILED")

    df = pd.DataFrame(summary_dict.items(), columns=['sample', 'reads'])
    df.to_csv(outfile.name, index=False, sep="\t")
    return outfile
def run(self):
    """Achieve the conversion

    :returns: None
    :rtype:
    """
    while True:
        base = self.queue.get()
        if base is None:
            break

        for cur_stream in self.conf.STREAMS:
            if cur_stream["kind"] == "lf0":
                # lf0 => f0
                f0_fn = '%s/%s.f0' % (self.out_path, base)
                cmd = '%s -magic -1.0E+10 -EXP -MAGIC 0.0 %s/%s.lf0 > %s' % \
                    (self.SOPR, self.out_path, base, f0_fn)
                utils.run_shell_command(cmd, self.logger)
            elif cur_stream["kind"] == "bap":
                # bap => aperiodicity
                ap_fn = '%s/%s.ap' % (self.out_path, base)
                if not self.keep_bap:
                    cmd = '%s -a %f -g 0 -m %d -l 2048 -o 2 %s/%s.bap | %s -d 32768.0 -P > %s' % \
                        (self.MGC2SP, self.conf.FREQWARPING, cur_stream["order"],
                         self.out_path, base, self.SOPR, ap_fn)
                else:
                    cmd = 'cat %s/%s.bap > %s' % (self.out_path, base, ap_fn)
                utils.run_shell_command(cmd, self.logger)
            elif cur_stream["kind"] == "mgc":
                # mgc => spectrum
                sp_fn = '%s/%s.sp' % (self.out_path, base)
                cmd = '%s -a %f -g %f -m %d -l 2048 -o 2 %s/%s.mgc | %s -d 32768.0 -P > %s' % \
                    (self.MGC2SP, self.conf.FREQWARPING,
                     cur_stream['parameters']['gamma'], cur_stream["order"],
                     self.out_path, base, self.SOPR, sp_fn)
                utils.run_shell_command(cmd, self.logger)

        if not self.preserve:
            try:
                for cur_stream in self.conf.STREAMS:
                    os.remove('%s/%s.%s' % (self.out_path, base, cur_stream["kind"]))
                os.remove('%s/%s.dur' % (self.out_path, base))
            except FileNotFoundError:
                pass

        self.queue.task_done()
def annotate_snps(index_base, config, vcf_file, db, snpeff_csv, snpeff_vcf):
    """
    annotate and predict the effect of variants

    :param index_base: <str> base name of the genome in the SnpEff config
    :param config: <str> SnpEff configuration file
    :param vcf_file: <str> VCF file with variants to annotate
    :param db: <str> SnpEff data directory
    :param snpeff_csv: <str> CSV file to write annotation stats
    :param snpeff_vcf: <str> annotated output VCF
    :return: <str> the annotated VCF
    """
    # locate the executable
    snpeff = find_executable(['snpEff'])

    if os.path.exists(snpeff_vcf):
        logging.critical("variant annotation file {} exists".format(snpeff_vcf))
    else:
        call = [
            "{} {} -config {} -dataDir {} {} -csvStats {} | bgzip -c > {}".format(
                snpeff, index_base, config, db, vcf_file, snpeff_csv, snpeff_vcf)
        ]
        cmd = " ".join(call)
        logging.info("annotating variants and predicting effects: {}".format(vcf_file))
        p = run_shell_command(cmd=cmd, raise_errors=False, extra_env=None)
        if p:
            # index the vcf file
            index_vcf(vcf_file=snpeff_vcf)
            sample = os.path.basename(snpeff_vcf).rsplit(".", 2)[0]
            out_dir = os.path.dirname(snpeff_vcf)
            html = os.path.join(os.getcwd(), 'snpEff_summary.html')
            snpeff_html = os.path.join(out_dir, sample + ".snpEff.summary.html")
            copy_file(src=html, dest=snpeff_html)
            os.remove(html)
    return snpeff_vcf
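# Usage sketch (an assumption, not from the source): build_snpeff_db,
# annotate_snps and filter_variants chain into a small annotation
# pipeline. Every file name below is illustrative.
config, data_dir = build_snpeff_db(reference="ref.fa", gff="ref.gff",
                                   snpeff_config="", snpeff_db="snpeff_db")
annotated = annotate_snps(index_base="ref", config=config,
                          vcf_file="sample.vcf.gz", db=data_dir,
                          snpeff_csv="sample.snpEff.csv",
                          snpeff_vcf="sample.snpEff.vcf.gz")
table = filter_variants(annotated)   # -> sample.snpEff.snpSift.table.txt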
import utils

if __name__ == '__main__':
    arguments = docopt(__doc__, version='0.1')
    image_filename = arguments['--outfile']
    image_filename = os.path.realpath(image_filename)
    image_folder = os.path.dirname(image_filename)
    image_basename = os.path.basename(image_filename)
    image_mount_tmp = tempfile.mkdtemp(None, "mount-iso-", image_folder)
    mount_point = os.path.join(image_folder, image_mount_tmp)
    size = 1440 * 1024

    # single-pass loop: break out on the first failed step
    while True:
        res = utils.run_shell_command(f"fallocate -l {size} {image_filename}", None, None, True)
        msg = f"Failed to allocate {size} bytes for {image_filename}"
        if not res:
            break

        res = utils.run_shell_command(f"mkfs.vfat {image_filename}", None, None, True)
        msg = f"Failed to create file system in the {image_filename}"
        if not res:
            break

        res = utils.run_shell_command(f"mount -o loop {image_filename} {mount_point}", None, None, True)
        msg = f"Failed to mount {image_filename} using mount point {mount_point}"
        if not res:
            break

        image_content = arguments['--infile']
def straight_part(self, in_path, out_path, gen_labfile_base_lst):
    """Achieve the STRAIGHT generation

    :param in_path: the input directory path
    :param out_path: the output directory path
    :param gen_labfile_base_lst: the list of utterance basenames
    :returns: None
    :rtype:
    """
    # Generate STRAIGHT script
    with open(self.conf.STRAIGHT_SCRIPT, 'w') as f:
        # Header
        f.write("path(path, '%s');\n" % self.conf.STRAIGHT_PATH)
        f.write("prm.spectralUpdateInterval = %f;\n" % self.conf.SIGNAL['frameshift'])
        f.write("prm.levelNormalizationIndicator = 0;\n\n")

        # Now some parameters
        f.write("out_path = '%s';\n" % out_path)
        f.write("fft_len = %d;\n" % 1025)  # FIXME: hardcoded
        f.write("samplerate = %d;\n" % self.conf.SIGNAL["samplerate"])
        f.write("basenames = {};\n")
        for i in range(1, len(gen_labfile_base_lst) + 1):
            base = gen_labfile_base_lst[i - 1]
            f.write("basenames{%d} = '%s';\n" % (i, base))
        f.write("\n")
        f.write("nb_frames = [];\n")
        for i in range(1, len(gen_labfile_base_lst) + 1):
            base = gen_labfile_base_lst[i - 1]
            nb_frames = os.path.getsize('%s/%s.f0' % (out_path, base)) / 4
            f.write("nb_frames(%d) = %d;\n" % (i, nb_frames))
        f.write("\n")

        # Read STRAIGHT params
        nb_elts = len(gen_labfile_base_lst)
        if self.nb_proc != 1:
            f.write("parfor i=1:%d\n" % nb_elts)
        else:
            f.write("for i=1:%d\n" % nb_elts)
        f.write("\ttry\n")
        f.write("\t\tfid_sp = fopen(sprintf('%s/%s.sp', out_path, basenames{i}), 'r', 'ieee-le');\n")
        f.write("\t\tfid_ap = fopen(sprintf('%s/%s.ap', out_path, basenames{i}), 'r', 'ieee-le');\n")
        f.write("\t\tfid_f0 = fopen(sprintf('%s/%s.f0', out_path, basenames{i}), 'r', 'ieee-le');\n")
        f.write("\t\tsp = fread(fid_sp, [fft_len nb_frames(i)], 'float');\n")
        f.write("\t\tap = fread(fid_ap, [fft_len nb_frames(i)], 'float');\n")
        f.write("\t\tf0 = fread(fid_f0, [1 nb_frames(i)], 'float');\n")
        f.write("\t\tfclose(fid_sp);\n")
        f.write("\t\tfclose(fid_ap);\n")
        f.write("\t\tfclose(fid_f0);\n")

        # Synthesis process part 2
        f.write("\t\t[sy] = exstraightsynth(f0, sp, ap, samplerate, prm);\n")
        f.write("\t\taudiowrite(sprintf('%s/%s.wav', out_path, basenames{i}), sy, samplerate);\n")
        f.write("\tcatch me\n")
        f.write("\t\twarning(sprintf('cannot render %s: %s', basenames{i}, me.message));\n")
        f.write("\tend;\n")
        f.write("end;\n")

        # Ending
        f.write("quit;\n")

    # Synthesis!
    cmd = '%s -nojvm -nosplash -nodisplay < %s' % (self.MATLAB, self.conf.STRAIGHT_SCRIPT)
    run_shell_command(cmd, self.logger)

    if not self.preserve:
        os.remove(self.conf.STRAIGHT_SCRIPT)
from settings import REPOS
from utils import run_shell_command, notify
import os

msg_subj = "Repositories' refreshment"
msg_body = ""

for rep_path in REPOS:
    spl_path = rep_path.rsplit('/', 1)
    print('===> Updating {}: '.format(spl_path[-1]))
    msg_body += '\n===> Updating {}: '.format(spl_path[-1])

    print('Making cd {}...'.format(rep_path))
    msg_body += '\nMaking cd {}...'.format(rep_path)
    os.chdir(rep_path)

    msg_body = run_shell_command('git config credential.helper store',
                                 notification_body=msg_body)
    msg_body = run_shell_command('git pull --all', notification_body=msg_body)

if notify:
    notify(subject=msg_subj, message=msg_body)
async def exec(self, message):
    cmd = message.content.replace("!exec ", "")
    r = utils.run_shell_command(cmd)
    for line in r:
        await self.client.send_message(message.channel, line)
def run(self):
    '''
    :return:
    '''
    # command example
    # rtg-tools-3.8.4-bdba5ea_install/rtg vcfeval --baseline truth.vcf.gz \
    # --calls compare1.vcf.gz -o vcfeval_split_snp -t ref.sdf --output-mode=annotate --sample xx --squash-ploidy --regions ??
    cmd = [self.java, utils.JAVA_XMX, '-jar', utils.RTGJAR, 'vcfeval',
           '-o', self.prefix,
           '--baseline', self.true_vcf,
           '-t', self.reference,
           ]
    if not self.exclude_filtered:
        cmd.append('--all-records')
    if not self.match_geno:
        cmd.append('--squash-ploidy')
    if self.sample:
        cmd.append('--sample')
        cmd.append(self.sample)
    if self.regions:
        cmd.append('--bed-regions')
        cmd.append(self.regions)
    if self.opts:
        cmd.append(self.opts)
    if len(self.vcfs) != 1:
        raise ValueError('vcfeval only takes 1 prediction VCF and 1 truth VCF: {0}'.format(self.vcfs))
    cmd.append('--calls')
    cmd.append(self.vcfs[0])

    tp = os.path.join(self.prefix, 'tp-baseline.vcf.gz')
    tp_predict = os.path.join(self.prefix, 'tp.vcf.gz')
    fn = os.path.join(self.prefix, 'fn.vcf.gz')
    fp = os.path.join(self.prefix, 'fp.vcf.gz')

    # vcfeval refuses to run if true_vcf contains 0 variants
    if utils.count_variants(self.true_vcf) == 0:
        utils.makedirs([self.prefix])
        # because there are 0 ground truth variants, TP and FN will be empty
        shutil.copyfile(self.true_vcf, tp)
        shutil.copyfile(self.true_vcf, fn)
        if utils.count_variants(self.vcfs[0]) == 0:
            # if calls are empty, then TP_PREDICT and FP will for sure be empty
            shutil.copyfile(self.vcfs[0], tp_predict)
            shutil.copyfile(self.vcfs[0], fp)
        else:
            # if calls are not empty, then all calls will be FP due to 0 ground
            # truth, and TP_PREDICT will contain only the header
            shutil.copyfile(self.vcfs[0], fp)
            with utils.versatile_open(tp_predict, "w") as output, \
                    utils.versatile_open(self.vcfs[0], "r") as input:
                for i in input:
                    if i.startswith('#'):
                        output.write(i)
                    else:
                        break
    else:
        if self.log_to_file:
            with utils.versatile_open(self.log_to_file, 'a') as logout:
                utils.run_shell_command(cmd, sys.stderr, logout)
        else:
            utils.run_shell_command(cmd, sys.stderr, sys.stderr)
        for i in (tp, tp_predict, fn, fp):
            if not os.path.exists(i):
                raise Exception('{0} was not generated by vcfeval. Please check and rerun.'.format(i))
    self.tp, self.tp_predict, self.fn, self.fp = tp, tp_predict, fn, fp