def main():
    """Compute per-site variant frequencies from a bam (via sam2tsv.jar) or
    from an existing sam2tsv TSV file.

    Pipeline:
      1. obtain a TSV line generator (``_tsv_gen``) — either by streaming
         ``sam2tsv.jar`` output from a bam, or by opening ``--file``;
      2. split the stream into chunks consumed by worker processes that
         write per-chunk frequency files into a temp directory;
      3. merge the chunk files with dask and emit forward (and, for
         genome mode, reverse) strand per-site CSVs;
      4. remove the temp chunk files.

    Fixes vs. the original: single-argument ``sys.stderr.write`` calls,
    ``samtools index`` run on the bam (not on ``<bam>.bai``), sam2tsv.jar
    validated on the bam route, ``Popen.poll()`` instead of reading
    ``returncode`` right after spawn, previously-undefined ``res``,
    missing ``exit()`` after fatal messages, unknown ``--type`` reported
    instead of crashing, and the ``ps.aemon`` daemon-flag typo.
    """
    parser = argparse.ArgumentParser()
    required_args = parser.add_argument_group('Required Arguments')
    required_args.add_argument(
        '-R', '--reference',
        help='samtools faidx indexed reference file')
    required_args.add_argument(
        '-b', '--bam', type=str,
        help='bam file; if given; no need to offer reads file; mapping will be skipped')
    required_args.add_argument(
        '-s', '--sam2tsv', type=str, default='',
        help='/path/to/sam2tsv.jar; needed unless a sam2tsv.jar produced file is already given')
    parser.add_argument(
        '-f', '--file', type=str,
        help='tsv file generated by sam2tsv.jar; if given, reads mapping and sam2tsv conversion will be skipped')
    parser.add_argument('-n', '--number_cpus', type=int, default=4,
                        help='number of CPUs')
    parser.add_argument(
        '-T', '--type', type=str, default="t",
        help="reference types, which is either g(enome) or t(ranscriptome);")
    args = parser.parse_args()

    #~~~~~~~~~~~~~~~~~~~~~~~ prepare for analysis ~~~~~~~~~~~~~~
    def _tsv_gen():
        """Return ``(tsv_generator, output_prefix)``.

        Bam route (``--file`` absent): validate reference/indexes, stream
        sam2tsv.jar output. File route: sanity-check the header of the
        existing TSV and open it.
        """
        if not args.file:
            # --- bam route: reference plus .dict and .fai indexes required ---
            if args.reference:
                if not file_exist(args.reference):
                    sys.stderr.write(args.reference + ' does not exist\n')
                    exit()
                dict_fn = args.reference + '.dict'
                if not file_exist(dict_fn):
                    sys.stderr.write(
                        dict_fn +
                        ' needs to be created using picard.jar CreateSequenceDictionary\n')
                    exit()
                ref_faidx = args.reference + '.fai'
                if not file_exist(ref_faidx):
                    sys.stderr.write(
                        ref_faidx + ' needs to be created with samtools faidx\n')
                    exit()
            if not args.bam:
                # neither --file nor --bam: nothing to analyse
                sys.stderr.write(
                    'either --bam or --file must be given; please double check!\n')
                exit()
            bam_file = args.bam
            if not file_exist(bam_file):
                sys.stderr.write(bam_file + ' does not exist; please double check!\n')
                exit()
            # sam2tsv.jar is required to convert the bam (original checked it
            # only when --bam was absent, then referenced bam_file undefined)
            if not file_exist(args.sam2tsv):
                sys.stderr.write(
                    'Please offer correctly path to sam2tsv.jar: {}\n'.format(
                        args.sam2tsv))
                exit()
            if not os.path.exists(bam_file + '.bai'):
                sys.stderr.write('bam file not indexed!\nstarting indexing it ...')
                # index the bam itself; original wrongly appended '.bai'
                os.system('samtools index ' + bam_file)
            if not args.reference:
                sys.stderr.write(
                    'requires reference file that was used for reads mapping\n')
                exit()  # original fell through with no reference
            prefix = bam_file.replace('.bam', '')
            cmds = java_bam_to_tsv(bam_file, args.reference, args.sam2tsv,
                                   args.type)
            if args.type[0].lower() == 't':
                # mapping to transcriptome; only one sam2tsv.jar command
                cmd = subprocess.Popen(cmds[0], stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE, shell=True)
                # poll() instead of reading .returncode right after spawn
                # (which is always None); only dead processes report errors
                if cmd.poll():
                    res = cmd.communicate()
                    print(res[1], file=sys.stderr)
                    exit()
                tsv_gen = stdin_stdout_gen(cmd.stdout)
            elif args.type[0].lower() == 'g':
                # mapping to genome; sam2tsv.jar called twice, + and - strands
                cmd1 = subprocess.Popen(cmds[0], stdout=subprocess.PIPE,
                                        stderr=subprocess.PIPE, shell=True)
                cmd2 = subprocess.Popen(cmds[1], stdout=subprocess.PIPE,
                                        stderr=subprocess.PIPE, shell=True)
                if any([cmd1.poll(), cmd2.poll()]):
                    res1 = cmd1.communicate()
                    res2 = cmd2.communicate()
                    print(res1[1], res2[1], file=sys.stderr)
                    exit()
                tsv_gen = itertools.chain(stdin_stdout_gen(cmd1.stdout),
                                          stdin_stdout_gen(cmd2.stdout))
            else:
                # original left tsv_gen unbound here -> UnboundLocalError
                sys.stderr.write(
                    '--type must be either g(enome) or t(ranscriptome)\n')
                exit()
            return tsv_gen, prefix
        else:
            # --- file route: use an existing sam2tsv-produced TSV ---
            tsv_file = args.file
            prefix = tsv_file.replace('.tsv', '')
            if os.path.exists(tsv_file):
                fh = openfile(tsv_file)
                firstline = fh.readline()
                fh.close()
                if len(firstline.rstrip().split()) != 10:
                    sys.stderr.write('tsv file is not in right format!')
                    sys.stderr.write(
                        'tsv files should contain these columns {}\n'.format(
                            "#READ_NAME FLAG CHROM READ_POS BASE QUAL REF_POS REF OP STARAND"))
                    exit()  # original continued with a malformed file
                sys.stderr.write(
                    tsv_file +
                    ' already exists; will skip reads mapping and sam2tsv conversion \n')
                tsv_gen = openfile(tsv_file)
            else:
                sys.stderr.write(tsv_file + ' does not exist; please double check \n')
                exit()
            return tsv_gen, prefix

    #~~~~~~~~~~~~~~~~ split tsv ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    tsv_gen, prefix = _tsv_gen()
    tmp_dir = prefix + '.tmp_splitted'
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)
        sys.stderr.write("{} already exists, will overwrite it\n".format(tmp_dir))
    if not os.path.exists(tmp_dir):
        os.mkdir(tmp_dir)
    number_threads = args.number_cpus
    manager = Manager()
    q = manager.Queue(args.number_cpus)

    #~~~~~~~~~~~~~~~~ compute per site variants frequencies ~~~~~~~~~~~~~~~~
    # 1. one producer splits the tsv stream into chunks of 2500 reads;
    #    number_threads consumers turn each chunk into a small *freq file
    processes = []
    ps = Process(target=split_tsv_for_per_site_var_freq,
                 args=(tsv_gen, q, number_threads, 2500))
    processes.append(ps)
    for _ in range(number_threads):
        ps = Process(target=tsv_to_freq_multiprocessing_with_manager,
                     args=(q, tmp_dir))
        processes.append(ps)
    for ps in processes:
        ps.daemon = True
        ps.start()
    for ps in processes:
        ps.join()

    # 2. combine small files and produce variant frequencies per ref-position
    df = dd.read_csv("{}/small_*freq".format(tmp_dir))
    df = df.compute()
    df_fr = df[df['strand'] == "+"]
    out = prefix + '.per.site.fwd.csv'
    if args.type.lower() == 't':
        df_proc(df_fr, out)
    elif args.type.lower() == 'g':
        df_rev = df[df['strand'] == "-"]
        out_rev = prefix + '.per.site.rev.csv'
        if args.number_cpus > 1:
            # forward and reverse strands written in parallel
            processes = [Process(target=df_proc, args=(df_fr, out)),
                         Process(target=df_proc, args=(df_rev, out_rev))]
            for ps in processes:
                ps.daemon = True  # original 'ps.aemon' typo never set the flag
                ps.start()
            for ps in processes:
                ps.join()
        else:
            df_proc(df_fr, out)
            df_proc(df_rev, out_rev)

    # remove temporary chunk files, then the temp directory itself
    if os.path.exists(tmp_dir):
        pool = mp.Pool(args.number_cpus)
        tmp_files = glob.glob("{}/small*".format(tmp_dir))
        pool.map(_rm, tmp_files)
        shutil.rmtree(tmp_dir)