Example #1
0
def main():
    """Command-line entry point: compute per-site variant frequencies.

    Input is either a BAM file (``-b``; converted on the fly by running the
    shell commands returned by ``java_bam_to_tsv``) or an existing sam2tsv
    TSV file (``-f``).  The TSV lines are handed to one
    ``split_tsv_for_per_site_var_freq`` process and ``--number_cpus``
    ``tsv_to_freq_multiprocessing_with_manager`` processes that write
    ``small_*freq`` files into ``<prefix>.tmp_splitted``.  Those files are
    then loaded, split by the ``strand`` column and passed to ``df_proc`` to
    produce ``<prefix>.per.site.fwd.csv`` (and ``<prefix>.per.site.rev.csv``
    for genome references).  The temporary directory is removed at the end.

    Exits with status 1 (message on stderr) when a required input is missing
    or invalid, and with status 2 (argparse usage error) when ``--type`` does
    not start with ``g`` or ``t``.
    """
    parser = argparse.ArgumentParser()
    required_args = parser.add_argument_group('Required Arguments')
    required_args.add_argument('-R',
                               '--reference',
                               help='samtools faidx indexed reference file')
    required_args.add_argument(
        '-b',
        '--bam',
        type=str,
        help=
        'bam file; if given; no need to offer reads file; mapping will be skipped'
    )
    required_args.add_argument(
        '-s',
        '--sam2tsv',
        type=str,
        default='',
        help=
        '/path/to/sam2tsv.jar; needed unless a sam2tsv.jar produced file is already given'
    )
    parser.add_argument(
        '-f',
        '--file',
        type=str,
        help=
        'tsv file generated by sam2tsv.jar; if given, reads mapping and sam2tsv conversion will be skipped'
    )
    parser.add_argument('-n',
                        '--number_cpus',
                        type=int,
                        default=4,
                        help='number of CPUs')
    parser.add_argument(
        '-T',
        '--type',
        type=str,
        default="t",
        help="reference types, which is either g(enome) or t(ranscriptome);")
    args = parser.parse_args()

    # Only the first letter decides the reference type, so "t", "T" and
    # "transcriptome" are all accepted.  Normalising once keeps the conversion
    # step and the output step in agreement.
    ref_type = args.type[:1].lower()
    if ref_type not in ('t', 'g'):
        parser.error(
            "--type must be either g(enome) or t(ranscriptome), got {!r}".
            format(args.type))

    def _die(message):
        """Write *message* to stderr and terminate with a non-zero status."""
        sys.stderr.write(message)
        sys.exit(1)

    def _as_text(data):
        """Return subprocess output as str (pipes opened in binary mode yield bytes)."""
        if isinstance(data, bytes):
            return data.decode(errors='replace')
        return data

    #~~~~~~~~~~~~~~~~~~~~~~~ prepare for analysis ~~~~~~~~~~~~~~
    def _tsv_gen():
        """Validate the inputs and return ``(tsv_lines, prefix)``.

        ``tsv_lines`` is an iterable over sam2tsv output (an opened TSV file,
        or the stdout of the running sam2tsv command(s)); ``prefix`` is the
        input path with ``.tsv`` / ``.bam`` removed and is used to name the
        outputs.
        """
        if args.file:
            # An existing sam2tsv file: skip mapping and conversion entirely.
            tsv_file = args.file
            if not os.path.exists(tsv_file):
                _die(tsv_file + ' does not exist; please double check \n')
            fh = openfile(tsv_file)
            try:
                firstline = fh.readline()
            finally:
                fh.close()
            if len(firstline.rstrip().split()) != 10:
                # Kept as a warning only (no exit), as before.
                sys.stderr.write('tsv file is not in right format!')
                sys.stderr.write(
                    'tsv files should contain these columns {}\n'.format(
                        "#READ_NAME     FLAG    CHROM   READ_POS        BASE    QUAL    REF_POS REF     OP      STARAND"
                    ))
            sys.stderr.write(
                tsv_file +
                ' already exists; will skip reads mapping and sam2tsv conversion \n'
            )
            return openfile(tsv_file), tsv_file.replace('.tsv', '')

        if args.reference:
            if not file_exist(args.reference):
                _die(args.reference + ' does not exist\n')
            dict_fn = args.reference + '.dict'
            if not file_exist(dict_fn):
                _die(
                    dict_fn +
                    ' needs to be created using picard.jar CreateSequenceDictionary\n'
                )
            ref_faidx = args.reference + '.fai'
            if not file_exist(ref_faidx):
                _die(ref_faidx + ' needs to be created with samtools faidx\n')

        if not args.bam:
            _die('no input given; please offer either a bam file (-b) '
                 'or a sam2tsv tsv file (-f)\n')

        bam_file = args.bam
        if not file_exist(bam_file):
            _die(bam_file + ' does not exist; please double check!\n')
        if not file_exist(args.sam2tsv):
            _die("Please offer correctly path to sam2tsv.jar\n")
        if not args.reference:
            _die('requires reference file that was used for reads mapping\n')
        if not os.path.exists(bam_file + '.bai'):
            sys.stderr.write(
                'bam file not indexed!\nstarting indexing it ...')
            # samtools index takes the BAM itself and writes <bam>.bai.
            if subprocess.call(['samtools', 'index', bam_file]) != 0:
                sys.stderr.write('\nsamtools index failed for ' + bam_file +
                                 '\n')

        prefix = bam_file.replace('.bam', '')
        cmds = java_bam_to_tsv(bam_file, args.reference, args.sam2tsv,
                               args.type)
        # Transcriptome: a single sam2tsv command.  Genome: two commands,
        # one per strand (+ and -).
        n_cmds = 1 if ref_type == 't' else 2
        procs = [
            subprocess.Popen(cmds[i],
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             shell=True) for i in range(n_cmds)
        ]
        # poll() refreshes returncode; this only catches commands that failed
        # right at start-up, since the output is consumed lazily afterwards.
        # NOTE(review): stderr is piped but not drained while streaming; a
        # very chatty command could block on a full pipe -- confirm.
        if any(proc.poll() for proc in procs):
            errors = [_as_text(proc.communicate()[1]) for proc in procs]
            print(*errors, file=sys.stderr)
            sys.exit(1)
        streams = [stdin_stdout_gen(proc.stdout) for proc in procs]
        if len(streams) == 1:
            return streams[0], prefix
        return itertools.chain(*streams), prefix

    #~~~~~~~~~~~~~~~~ SAM2TSV / split tsv ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    tsv_gen, prefix = _tsv_gen()

    # Always start from an empty scratch directory.
    tmp_dir = prefix + '.tmp_splitted'
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)
        sys.stderr.write(
            "{} already exists, will overwrite it\n".format(tmp_dir))
    os.mkdir(tmp_dir)

    number_threads = args.number_cpus

    manager = Manager()
    q = manager.Queue(args.number_cpus)
    #~~~~~~~~~~~~~~~~ compute per site variants frequencies ~~~~~~~~~~~~~~~
    #1 calculate variants frequency for each small splitted file:
    #  one process feeds the queue, number_threads processes consume it.
    processes = [
        Process(target=split_tsv_for_per_site_var_freq,
                args=(tsv_gen, q, number_threads, 2500))
    ]
    for _ in range(number_threads):
        processes.append(
            Process(target=tsv_to_freq_multiprocessing_with_manager,
                    args=(q, tmp_dir)))
    for ps in processes:
        ps.daemon = True
        ps.start()
    for ps in processes:
        ps.join()

    #2 combine small files and produce variants frequencies per ref-position
    df = dd.read_csv("{}/small_*freq".format(tmp_dir))
    df = df.compute()
    df_fr = df[df['strand'] == "+"]
    out = prefix + '.per.site.fwd.csv'

    if ref_type == 't':
        df_proc(df_fr, out)
    else:  # ref_type == 'g': process both strands
        df_rev = df[df['strand'] == "-"]
        out_rev = prefix + '.per.site.rev.csv'
        if args.number_cpus > 1:
            strand_procs = [
                Process(target=df_proc, args=(df_fr, out)),
                Process(target=df_proc, args=(df_rev, out_rev)),
            ]
            for ps in strand_procs:
                ps.daemon = True
                ps.start()
            for ps in strand_procs:
                ps.join()
        else:
            df_proc(df_fr, out)
            df_proc(df_rev, out_rev)

    # Remove the intermediate files in parallel, then the directory itself.
    if os.path.exists(tmp_dir):
        pool = mp.Pool(args.number_cpus)
        try:
            tmp_files = glob.glob("{}/small*".format(tmp_dir))
            pool.map(_rm, tmp_files)
        finally:
            pool.close()
            pool.join()
        shutil.rmtree(tmp_dir)