Example #1
0
def whichTAD(args):
    """
    %prog gene.bed tad.bed Options
    
    find gene location in tads
    """
    # NOTE: the docstring above doubles as the optparse usage text, so the
    # implementation notes are kept in comments instead.
    #
    # Intersects the gene BED (`-a`) with the TAD BED (`-b`) through
    # `bedtools intersect -wao -f <fraction>`, keeps columns 4-8 of every
    # record and prints them to stdout, one record per line.
    #
    # args: list of command-line tokens (two positionals plus optional `-f`).
    # Exits through `sys.exit` on a wrong argument count or a bad fraction.
    p = OptionParser(whichTAD.__doc__)
    p.add_option('-f',
                 dest='fraction',
                 default='0.7',
                 help='the fraction of gene overlap of tads')

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(p.print_help())

    genes, tads = args
    fraction = opts.fraction
    check_file_exists(tads)
    check_file_exists(genes)
    # A chained `0 > x > 1` can never be true; test the valid range directly.
    if not 0 <= float(fraction) <= 1:
        logging.error('The option `-f` must set in '
                      'range [0, 1], and you set {}'.format(fraction))
        sys.exit(1)

    bedtools_cmd = "bedtools intersect -a {} -b {} -wao -f {} | cut -f 4-8 ".format(
        genes, tads, fraction)
    # Use the pipe as a context manager so it is closed once fully consumed.
    with os.popen(bedtools_cmd) as stream:
        for line in stream:
            print(line.strip())
Example #2
0
def getTSSbw(args):
    """
    %prog <tss.gff/bed> <chrom.sizes> <out_prefix> [options]
        To obtain a bedgraph file of tss sites 
            density of per windows
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is left untouched; implementation notes are kept in comments.
    #
    # Writes a four-step shell script `<outdir>/run_<sample>_tss.sh`
    # (getTSS -> bedtools makewindows -> bedtools intersect -c | sort ->
    # bedGraphToBigWig) and then launches it with `sh` or, with `--qsub`,
    # with `qsub`.

    p = OptionParser(getTSSbw.__doc__)
    p.add_option('-w',
                 '--window',
                 type=int,
                 default=1000,
                 help='the window of tss density calculation')
    p.add_option('-o',
                 '--out',
                 default='./',
                 help='output [default: %default]')
    p.add_option('--qsub',
                 default=False,
                 action='store_true',
                 help='if qsub to sge [default: %default]')
    opts, args = p.parse_args(args)
    if len(args) != 3:
        sys.exit(p.print_help())

    gff, chrom_sizes, sample = args
    window = opts.window
    check_file_exists(chrom_sizes)
    # Launcher used for the generated script: a qsub submission or plain `sh`.
    command = 'qsub -pe mpi 1 -cwd -j y -S /bin/bash' if opts.qsub \
            else 'sh'

    # NOTE(review): outdir is not created here; presumably it must already
    # exist, otherwise the open() below fails -- confirm with callers.
    outdir = op.abspath(opts.out)
    # Step 1: extract TSS positions into <outdir>/<sample>.gene.tss.bed.
    # The backslash-newline inside each literal is a Python line continuation,
    # so the next line's leading spaces become part of the command text.
    cmd = 'python -m TDGP.analysis.genome getTSS {} > \
        {}/{}.gene.tss.bed\n'.format(gff, outdir, sample)
    # Step 2: tile the genome into fixed-size windows.
    cmd += 'bedtools makewindows -g {1} -w {2} > {0}/{3}.{2}.window\n'.format(
        outdir,
        chrom_sizes,
        window,
        sample,
    )
    # Step 3: count TSS records per window (-c) and sort into a bedGraph.
    cmd += 'bedtools intersect -a {outdir}/{sample}.{window}.window -b \
            {outdir}/{sample}.gene.tss.bed -c | sort -k1,1 -k2,2n > \
                {outdir}/{sample}.gene.tss.{window}.bg\n'.format(sample=sample,
                                                                 window=window,
                                                                 outdir=outdir)
    # Step 4: convert the bedGraph into a bigWig.
    cmd += 'bedGraphToBigWig {outdir}/{sample}.gene.tss.{window}.bg {sizes} \
            {outdir}/{sample}.gene.tss.{window}.bw\n'.format(sample=sample,
                                                             window=window,
                                                             sizes=chrom_sizes,
                                                             outdir=outdir)
    with open('{}/run_{}_tss.sh'.format(outdir, sample), 'w') as out:
        out.write(cmd)

    # NOTE(review): the exit status of os.system is ignored, so 'Successful'
    # is logged even when the script (or the submission) fails.
    os.system('{} {}/run_{}_tss.sh'.format(command, outdir, sample))
    logging.debug('Successful')
Example #3
0
def getTSS(args):
    """
    %prog infile [Options]
        To get TSS from gff or bed.
    """
    # NOTE: the docstring above is the optparse usage text and is kept as is.
    #
    # For every selected record a three-column line
    # `chrom <TAB> start <TAB> start+1` is written to the output:
    #   * gff: records whose third column equals `--type`
    #   * bed: every non-comment, non-blank record
    # NOTE(review): the gff branch always uses the `start` column, also for
    # features on the '-' strand -- confirm that this is intended.
    #
    # args: list of command-line tokens (one positional input file).

    p = OptionParser(getTSS.__doc__)
    p.add_option('--type',
                 default='gene',
                 help='the type of sequence [default: %default]')
    p.add_option('-o',
                 '--out',
                 default=sys.stdout,
                 help='output file. [default: stdout]')
    opts, args = p.parse_args(args)

    # Exactly one positional is required; extra ones used to crash the
    # 1-tuple unpacking below with a ValueError.
    if len(args) != 1:
        sys.exit(p.print_help())

    infile, = args
    check_file_exists(infile)
    filetype = guess_filetype(infile)
    if not filetype:
        logging.error('Input filetype must a gff or bed')
        sys.exit()

    # `-o PATH` arrives as a string and has to be opened; the default is the
    # already-open sys.stdout, which must not be closed afterwards.
    to_path = isinstance(opts.out, str)
    out = open(opts.out, 'w') if to_path else opts.out
    try:
        if filetype == "gff":
            with must_open(infile, 'r') as fp:
                for line in fp:
                    if line.startswith("#") or not line.strip():
                        continue
                    line_list = line.strip().split('\t')
                    # Unpack all nine GFF columns so short records still fail
                    # loudly; only chrom, type and start are used.
                    (chrom, _, type_, start, _, _, _, _,
                     _) = line_list[:9]

                    if type_ == opts.type:
                        print('\t'.join(map(str,
                                            (chrom, start, int(start) + 1))),
                              file=out)

        elif filetype == 'bed':
            with must_open(infile, 'r') as fp:
                for line in fp:
                    # Blank lines are skipped like in the gff branch instead
                    # of crashing the unpacking.
                    if line.startswith("#") or not line.strip():
                        continue
                    chrom, start, _ = line.strip().split()[:3]
                    print('\t'.join(map(str, (chrom, start, int(start) + 1))),
                          file=out)
    finally:
        if to_path:
            out.close()

    destination = opts.out if to_path else 'stdout'
    logging.debug('Done, output is in `{}`'.format(destination))
Example #4
0
def create_bed_dict(bedfile):
    """
    Index the first three columns of a BED file by the fourth column.

    Params:
    --------
    bedfile: `str` path of a whitespace-separated file with at least
                four columns (chrom, start, end, name)

    Returns:
    --------
    out: `dict` mapping name -> (chrom, start, end), all kept as the
            strings read from the file; a repeated name keeps its
            last record

    Blank lines and lines starting with `#` are ignored. Any other line
    with fewer than four columns raises ValueError.
    """
    check_file_exists(bedfile)
    bed_dict = {}
    with open(bedfile) as fp:
        for line in fp:
            line_list = line.split()
            # A trailing blank line or a comment header is not a record.
            if not line_list or line.startswith('#'):
                continue
            chrom, start, end, gene = line_list[:4]
            bed_dict[gene] = (chrom, start, end)

    return bed_dict
Example #5
0
File: qc.py Project: wangyibin/TDGP
def statFrag(args):
    """
    %(prog)s allValidParis [Options]
        stat the Ratio of theoretically digested genomic 
        fragments covered by valid paired Hi-C reads.
    """
    # NOTE: the docstring above is the argparse description and is kept as is.
    #
    # Counts the fragment names in the enzyme BED (theoretical fragments) and
    # the fragments returned by ValidPairs.getRealFrags() (observed
    # fragments), then prints both counts and their ratio to `--output`.
    # With `--unmap` the enzyme records that were never observed are also
    # written to `<output name>.unmap`.
    p = argparse.ArgumentParser(prog=statFrag.__name__,
                        description=statFrag.__doc__,
                        conflict_handler='resolve')
    pReq = p.add_argument_group('Required arguments')
    pOpt = p.add_argument_group('Optional arguments')
    pReq.add_argument('validpairs',  help='Validparis file')
    pReq.add_argument('enzyme', help='restriction enzyme site bed file')
    pOpt.add_argument('--unmap', default=False, action='store_true',
            help='output the unmap fragments bed [default: %(default)s]')
    pOpt.add_argument('-o', '--output', type=argparse.FileType('w'), default=sys.stdout,
            help='output file [default: stdout]')
    pOpt.add_argument('-h', '--help', action='help',
            help='show help message and exit.')

    args = p.parse_args(args)

    check_file_exists(args.enzyme)

    columns = ['chrom', 'start', 'end', 'name', 'flag', 'strand']
    enzyme_df = pd.read_csv(args.enzyme, sep='\t',
                    header=None, index_col='name',
                    names=columns)
    theoFrags = set(enzyme_df.index)
    theo_num = len(theoFrags)
    vp = ValidPairs(args.validpairs)
    realFrags = vp.getRealFrags()
    reality_num = len(realFrags)

    if args.unmap:
        # pandas no longer accepts a set as a `.loc` indexer, so hand it a
        # list; the rows are re-sorted below, so the list order is irrelevant.
        unmapFrags = list(theoFrags - realFrags)
        unmap_df = enzyme_df.loc[unmapFrags].reset_index()
        df = unmap_df[columns].sort_values(by=['chrom', 'start'])
        # NOTE(review): with the default stdout this writes a file literally
        # named `<stdout>.unmap` -- confirm whether that is acceptable.
        df.to_csv(args.output.name + ".unmap", sep='\t', header=None, index=None)

    # An empty enzyme file would otherwise raise ZeroDivisionError.
    ratio = reality_num * 1.0 / theo_num if theo_num else 0.0
    print("Theoretical Fragments\t{}".format(theo_num), file=args.output)
    print("Reality Fragments\t{}".format(reality_num), file=args.output)
    print("Reality Fragments Ratio (%)\t{:.2%}".format(ratio),
            file=args.output)
Example #6
0
    def __init__(self, filename, exclude=None,
        exclude_contig=['tig', 'Un', 'Sy', 'scaffold', 'ctg', 'Pt', 'Mt'],
        mem_cache='.'):
        """
        Validate `filename`, store the exclusion settings, build the
        chromosome index tables and set up the on-disk cache.

        Params:
        --------
        filename: `str` input file, checked with `check_file_exists`
        exclude: value passed through `listify` and kept as `self.exclude`
        exclude_contig: value passed through `listify` and kept as
                    `self.exclude_contig`
        mem_cache: location handed to `Memory` [default: '.']
        """
        check_file_exists(filename)

        self.filename = filename
        self.exclude = listify(exclude)
        self.exclude_contig = listify(exclude_contig)

        # Presumably this call fills `self.chromLabels`, which the two lookup
        # tables below rely on -- verify in getChrSizes.
        self.getChrSizes()
        self.idx2label = dict(enumerate(self.chromLabels))
        self.label2idx = {
            label: position
            for position, label in enumerate(self.chromLabels)
        }

        # Wrap the GC-bin computation so calls go through the Memory cache.
        self.mem_cache = mem_cache
        self.memory = Memory(mem_cache, verbose=0)
        self.getGCBin = self.memory.cache(self._getGCBin)
Example #7
0
def convertAnchorsToLink(args):
    """
    %(prog)s bed1 bed2 anchor [Options]
        To convert anchors file to link bed file, which is generate
            from jcvi
    """
    # NOTE: the docstring above is the argparse description and is kept as is.
    #
    # For every gene pair of the anchor file whose two genes are present in
    # the respective BED files, writes
    #   chrom1 start1 end1 chrom2 start2 end2 gene1 gene2
    # (tab separated) to `--out`. Lines starting with `#` and blank lines are
    # skipped.
    p = argparse.ArgumentParser(prog=convertAnchorsToLink.__name__,
                        description=convertAnchorsToLink.__doc__,
                        conflict_handler='resolve')
    pReq = p.add_argument_group('Required arguments')
    pOpt = p.add_argument_group('Optional arguments')
    pReq.add_argument('bed1', help='gene bed file of species1')
    pReq.add_argument('bed2', help='gene bed file of species2')
    pReq.add_argument('anchor', help='anchor file of synteny gene pairs')
    pOpt.add_argument('-o', '--out', type=argparse.FileType('w'),
            default=sys.stdout, help='output file [default: %(default)s]')
    pOpt.add_argument('-h', '--help', action='help',
            help='show help message and exit.')

    args = p.parse_args(args)

    anchor = args.anchor
    check_file_exists(anchor)

    bed1 = create_bed_dict(args.bed1)
    bed2 = create_bed_dict(args.bed2)

    # The previous version also spawned a `sort -V` subprocess here that was
    # never written to, read from or waited on; it only leaked a process and
    # two pipes, so it has been dropped. The output order is unchanged.
    with open(anchor) as fp:
        for line in fp:
            if line.startswith("#"):
                continue

            line_list = line.strip().split()
            if not line_list:
                # A blank line used to crash the two-way unpacking below.
                continue
            gene1, gene2 = line_list[:2]

            if gene1 not in bed1 or gene2 not in bed2:
                continue
            # NOTE(review): the explicit "\n" plus print()'s own newline
            # leaves a blank line after every record; kept for backward
            # compatibility -- confirm whether consumers rely on it.
            print("{}\n".format('\t'.join(['\t'.join(bed1[gene1]),
                    '\t'.join(bed2[gene2]), gene1, gene2])), file=args.out)
Example #8
0
    def genePairTAD(genes, tads, fraction=0.7):
        """
        Map every gene to the TAD that contains it and return as dict.

        Runs ``bedtools intersect -a genes -b tads -wao -f fraction`` and
        reads columns 4-7 of the output as (gene, chrom, start, end).
        Records whose TAD chromosome is "." are dropped.

        Params:
        --------
        genes: `str` bed4 file of gene
        tads: `str` bed3 file of tad
        fraction: `str` or `float` fraction of gene 
                    overlap with tads [default: 0.7]
        
        Returns:
        --------
        out: `OrderedDict` gene name -> TAD ID built by `chrRangeID`;
                a gene reported for several TADs keeps the last one

        Examples:
        --------
        >>> db = TADConserved().genePairTAD("gene.bed", "tad.bed")

        """

        check_file_exists(tads)
        check_file_exists(genes)
        # A chained `0 > x > 1` can never be true; test the valid range
        # directly.
        if not 0 <= float(fraction) <= 1:
            logging.error('The option `-f` must set in '
                          'range [0, 1], and you set {}'.format(fraction))
            sys.exit(1)

        bedtools_cmd = "bedtools intersect -a {} -b {} -wao -f {} | cut -f 4-7 ".format(
            genes, tads, fraction)
        db = OrderedDict()
        # Use the pipe as a context manager so it is closed once consumed.
        with os.popen(bedtools_cmd) as stream:
            for line in stream:
                gene, chrom, start, end = line.strip().split()
                if chrom == ".":
                    continue

                db[gene] = chrRangeID([chrom, start, end])

        return db
Example #9
0
def getBoundaryBed(args):
    """
    %prog <tad.bed> <chrom.sizes> [Options]

    get a bed file of the tad boundary.
    """
    # NOTE: the docstring above is the optparse usage text and is kept as is.
    #
    # Loads the TAD file through TADFile, asks it for the boundary records
    # (with `--up` / `--down` and the chromosome sizes) and prints the first
    # three fields of every record, sorted, to stdout.
    p = OptionParser(getBoundaryBed.__doc__)
    p.add_option('-a',
                 '--up',
                 type=int,
                 default=0,
                 help='the upstrean distance of boundary '
                 '[default: %default]')
    p.add_option('-b',
                 '--down',
                 type=int,
                 default=1,
                 help='the downstream distance of boundary '
                 '[default: %default]')

    opts, args = p.parse_args(args)
    # Exactly two positionals are required; extra ones used to crash the
    # unpacking below with a ValueError.
    if len(args) != 2:
        sys.exit(p.print_help())

    tadFile, chromSize = args
    check_file_exists(chromSize)
    up, down = opts.up, opts.down
    if not op.exists(tadFile) or not op.exists(chromSize):
        logging.error('The input file is not exists')
        # Previously execution continued and failed later with a less
        # helpful error.
        sys.exit(1)

    # chrom.sizes: column 1 is the name, column 2 the size. Extra columns are
    # ignored and the size is kept as the string read from the file.
    with open(chromSize) as handle:
        chrom_dict = dict(line.split()[:2]
                          for line in handle if line.strip())

    tf = TADFile(tadFile)
    tf.getBoundaryDict()
    boundaryBed = tf.getBoundaryBed(tf.boundaryDict, chrom_dict, up, down)

    for item in sorted(boundaryBed):
        print("\t".join(map(str, item[:3])))
    logging.debug('Successful output boundary bed')
Example #10
0
File: qc.py Project: wangyibin/TDGP
def getRealityBed(args):
    """
    %(prog)s <allValidParis> <theoretical.bed> [Options] > <reality.bed> 
        get reality fragment bed file from valid pairs file and 
            theoretical bed.
    """
    # NOTE: the docstring above is the argparse description and is kept as is.
    #
    # Selects the rows of the enzyme BED whose name is among the fragments
    # returned by ValidPairs.getRealFrags(), sorts them by chrom and start
    # and writes them as a six-column BED to `--output`.
    p = argparse.ArgumentParser(prog=getRealityBed.__name__,
                        description=getRealityBed.__doc__,
                        conflict_handler='resolve')
    pReq = p.add_argument_group('Required arguments')
    pOpt = p.add_argument_group('Optional arguments')
    pReq.add_argument('validpairs',  help='Validparis file')
    pReq.add_argument('enzyme', help='restriction enzyme site bed file')
    pOpt.add_argument('-o', '--output', type=argparse.FileType('w'), default=sys.stdout,
            help='output file [default: stdout]')
    pOpt.add_argument('-h', '--help', action='help',
            help='show help message and exit.')

    args = p.parse_args(args)

    check_file_exists(args.enzyme)

    columns = ['chrom', 'start', 'end', 'name', 'flag', 'strand']
    enzyme_df = pd.read_csv(args.enzyme, sep='\t',
                    header=None, index_col='name',
                    names=columns)

    vp = ValidPairs(args.validpairs)
    realFrags = vp.getRealFrags()
    # pandas no longer accepts a set as a `.loc` indexer, so hand it a list;
    # the rows are re-sorted below, so the list order is irrelevant.
    # NOTE(review): a fragment name missing from the enzyme file still raises
    # KeyError here -- confirm whether that should be tolerated.
    df = enzyme_df.loc[list(realFrags)].reset_index()

    df = df[columns].sort_values(by=['chrom', 'start'])
    df.to_csv(args.output, sep='\t', header=None, index=None)
Example #11
0
def statFrag(args):
    """
    %(prog)s allValidParis [Options]
        stat the Ratio of theoretically digested genomic 
        fragments covered by valid paired Hi-C reads.
    """
    # NOTE: the docstring above is the argparse description and is kept as is.
    #
    # Theoretical fragments = number of non-blank lines in the enzyme BED;
    # observed fragments = size of ValidPairs.getRealFrags(). Both counts and
    # their ratio are printed to `--out`.
    p = argparse.ArgumentParser(prog=statFrag.__name__,
                        description=statFrag.__doc__,
                        conflict_handler='resolve')
    pReq = p.add_argument_group('Required arguments')
    pOpt = p.add_argument_group('Optional arguments')
    pReq.add_argument('validpairs',  help='Validparis file')
    pReq.add_argument('enzyme', help='restriction enzyme site bed file')
    pOpt.add_argument('-o', '--out', type=argparse.FileType('w'), default=sys.stdout,
            help='output file [default: stdout]')
    pOpt.add_argument('-h', '--help', action='help',
            help='show help message and exit.')

    args = p.parse_args(args)

    check_file_exists(args.enzyme)

    with open(args.enzyme, 'r') as fp:
        theo_num = sum(1 for line in fp if line.strip())
    vp = ValidPairs(args.validpairs)
    realFrags = vp.getRealFrags()
    reality_num = len(realFrags)

    # An empty enzyme file would otherwise raise ZeroDivisionError.
    ratio = reality_num * 1.0 / theo_num if theo_num else 0.0
    print("Theoretical Fragments\t{}".format(theo_num), file=args.out)
    print("Reality Fragments\t{}".format(reality_num), file=args.out)
    print("Reality Fragments Ratio (%)\t{:.2%}".format(ratio),
            file=args.out)
Example #12
0
File: qc.py Project: wangyibin/TDGP
def plotIDEMultiv1(args):
    """
    %(prog)s 1.ValidPairs 2.ValidPairs ... [Options]
        To multi sample IDE in a picture.
    """
    # NOTE: the docstring above is the argparse description; argparse
    # %-formats it, so the placeholder must be the complete `%(prog)s`.
    #
    # For every ValidPairs file the cis distances are binned to `--scale`,
    # restricted to [xmin, xmax], counted per bin and drawn as one log-log
    # curve; the legend label carries the slope of a linear regression of
    # log10(count) on log10(distance). The figure is saved to `--out` and to
    # a `.png` with the same stem.
    p = argparse.ArgumentParser(prog=plotIDEMultiv1.__name__,
                        description=plotIDEMultiv1.__doc__,
                        conflict_handler='resolve')
    pReq = p.add_argument_group('Required arguments')
    pOpt = p.add_argument_group('Optional arguments')
    pReq.add_argument('validpairs', nargs="+",
            help='validpairs file')
    pReq.add_argument('--labels', nargs='+', required=True,
            help='lable for legend')
    pReq.add_argument('-o', '--out', required=True,
            help='output file')
    pOpt.add_argument('--chrom', default=None, help='plot chrom list')
    # `%(default)` without the trailing `s` is an invalid format specifier
    # and made the help output raise ValueError.
    pOpt.add_argument('--scale', default=100000, type=int, metavar='int',
            help='the scale of data [default: %(default)s]')
    p.add_argument('--xmin', default=1e5, type=float, metavar='float',
            help='min value of xtick [default: %(default)s]')
    p.add_argument('--xmax', default=2e7, type=float, metavar='float',
            help='max value of xtick [default: %(default)s]')
    pOpt.add_argument('-h', '--help', action='help',
            help='show help message and exit.')

    args = p.parse_args(args)

    from scipy.stats import linregress
    scale = args.scale
    xmin = args.xmin
    xmax = args.xmax
    out = args.out
    fig, ax = plt.subplots(figsize=(5, 5))

    # `--chrom` is either a file (first column = chromosome name) or a
    # comma-separated list; None is passed through to getCisDistance.
    if args.chrom:
        if op.exists(args.chrom):
            with open(args.chrom) as handle:
                chrom = [i.strip().split()[0]
                            for i in handle if i.strip()]
        else:
            chrom = args.chrom.split(',')
    else:
        chrom = args.chrom
    for i in args.validpairs:
        check_file_exists(i)
    assert len(args.validpairs) == len(args.labels), \
        'input validpair file must equal to labels'

    for validpair, label in zip(args.validpairs, args.labels):
        vp = ValidPairs(validpair)
        distance_db = vp.getCisDistance(chrom=chrom)
        data = np.array(list(chain(*list(distance_db.values()))))
        # Snap every distance down to the start of its `scale`-sized bin.
        data = data // scale * scale
        data = data[(data >= xmin) & (data <= xmax)]

        unique, counts = np.unique(data, return_counts=True)
        if unique.size < 2:
            # A regression needs at least two bins; skip the sample instead
            # of crashing inside linregress.
            logging.warning('Skip `{}`: fewer than two distance bins in '
                            '[{}, {}]'.format(validpair, xmin, xmax))
            continue
        slope = linregress(np.log10(unique), np.log10(counts)).slope
        label = "{} ({:.2f})".format(label, slope)
        plt.plot(unique, counts, label=label)
    plt.legend(loc='best', fontsize=13)
    plt.ylabel('Contact probability', dict(size=14))
    plt.xlabel('Distance (bp)', dict(size=14))
    plt.yscale('log')
    plt.xscale('log')
    plt.savefig(out, dpi=300, bbox_inches='tight')
    plt.savefig(out.rsplit(".", 1)[0] + '.png',
                    dpi=300, bbox_inches='tight')
    logging.debug('Successful, picture is in `{}`'.format(out))
Example #13
0
def plotBoundary(args):
    """
    %prog boundary.bed data.bw samplelabel [options]
        To plot omics data density in tads boundary.
    """
    # NOTE: the docstring above is the optparse usage text and is kept as is.
    #
    # Writes `run_<prefix>_<label>.sh` containing a computeMatrix command and
    # a plotProfile command, then runs it with `sh`.
    parser = OptionParser(plotBoundary.__doc__)
    parser.add_option('-b',
                      dest='up',
                      default=50000,
                      type=int,
                      help='upstream distance of boundary [default: %default]')
    parser.add_option('-a',
                      dest='down',
                      default=50000,
                      type=int,
                      help='downstream distance of boundary [default: %default]')
    parser.add_option('--binSize',
                      default=1000,
                      type=int,
                      help='calculate binSize [default: %default]')
    parser.add_option('-p',
                      '--process',
                      default=8,
                      type=int,
                      help='process of program [default:%default]')
    parser.add_option('-o',
                      '--output',
                      default=None,
                      help='the plot output prefix [default: tadprefix_label]')
    opts, positional = parser.parse_args(args)
    if len(positional) != 3:
        sys.exit(parser.print_help())

    boundary, data, label = positional
    check_file_exists(boundary)
    check_file_exists(data)

    # Output prefix: the explicit --output, else the boundary file name with
    # every '.bed' removed.
    if opts.output:
        prefix = opts.output
    else:
        prefix = op.basename(boundary).replace('.bed', '')

    compute_template = """
    computeMatrix reference-point -S {data} -R {boundary} \\
        --referencePoint center -b {up} -a {down} --binSize {binSize} \\
            --samplesLabel {label} -p {process} --missingDataAsZero \\
                --skipZeros -o {prefix}_{label}.matrix.gz\\
                    --outFileSortedRegions {prefix}_{label}.bed
    """
    compute_cmd = compute_template.format(data=data,
                                          boundary=boundary,
                                          up=opts.up,
                                          down=opts.down,
                                          binSize=opts.binSize,
                                          label=label,
                                          process=opts.process,
                                          prefix=prefix)

    plot_template = """
     plotProfile -m {prefix}_{label}.matrix.gz --refPointLabel Boundary \\
         -out {prefix}_{label}.pdf --plotHeight 10 --plotWidth 12 
    """
    plot_cmd = plot_template.format(prefix=prefix, label=label)

    script = 'run_{}_{}.sh'.format(prefix, label)
    with open(script, 'w') as handle:
        handle.write(compute_cmd + "\n")
        handle.write(plot_cmd)

    logging.debug('Starting plot {} density in boundary'.format(label))
    os.system('sh run_{}_{}.sh'.format(prefix, label))
    logging.debug('Done, picture is `{prefix}_{label}.pdf`'.format(
        prefix=prefix, label=label))
Example #14
0
    def getConserved(tad1,
                     tad2,
                     syngene1,
                     syngene2,
                     gene1,
                     gene2,
                     anchors,
                     fraction=0.7,
                     threshold=0,
                     gene_num=0,
                     synthre=0):
        """
        Print the TAD pairs of two species that share syntenic gene pairs.

        Every anchor line (three whitespace-separated fields, `#` lines
        skipped) whose two genes both map to a TAD contributes one gene
        pair to that (tad1, tad2) combination. A combination is printed
        to stdout when both gene fractions reach `threshold`, both total
        gene counts reach `gene_num` and both syntenic gene counts reach
        `synthre`.

        out: tad1 tad2 geneNum1 geneNum2 synNum1 synNum2 \
            genePer1 genePer2 synPer1 synPer2 geneList1 geneList2

        >>> tc = TADConserved
        >>> tc.getConserved(tad1, tad2, syngene1, syngene2, gene1, gene2, anchor)
        ...
        """
        logging.debug('Start ...')
        check_file_exists(anchors)

        # Per-TAD gene counts (syntenic genes, then all genes) and the
        # gene -> TAD lookup tables for both species.
        helper = TADConserved()
        syn_count1 = helper.getGene(tad1, syngene1, fraction, isnum=True)
        syn_count2 = helper.getGene(tad2, syngene2, fraction, isnum=True)
        all_count1 = helper.getGene(tad1, gene1, fraction, isnum=True)
        all_count2 = helper.getGene(tad2, gene2, fraction, isnum=True)
        tad_of_gene1 = helper.genePairTAD(syngene1, tad1, fraction)
        tad_of_gene2 = helper.genePairTAD(syngene2, tad2, fraction)

        # tad1 ID -> tad2 ID -> list of (gene1, gene2), in first-seen order.
        linked = OrderedDict()
        with open(anchors, 'r') as handle:
            for record in handle:
                if record[0] == "#":
                    continue
                left_gene, right_gene, _third = record.strip().split()

                try:
                    left_tad = tad_of_gene1[left_gene]
                    right_tad = tad_of_gene2[right_gene]
                except KeyError:
                    # At least one gene of the pair lies in no TAD.
                    continue
                partners = linked.setdefault(left_tad, OrderedDict())
                partners.setdefault(right_tad, []).append(
                    (left_gene, right_gene))

        columns = ('#tad1', 'tad2', 'total_gene_num1', 'total_gene_num2',
                   'syn_gene_num1', 'syn_gene_num2', 'gene_per1', 'gene_per2',
                   'syngene_per1', 'syngene_per2', 'gene_list1', 'gene_list2')
        print("\t".join(columns), file=sys.stdout)

        for left_tad, partners in linked.items():
            for right_tad, pairs in partners.items():
                shared = len(pairs)
                total1 = all_count1[left_tad]
                total2 = all_count2[right_tad]
                syn1 = syn_count1[left_tad]
                syn2 = syn_count2[right_tad]
                per1 = shared * 1.0 / total1
                per2 = shared * 1.0 / total2
                syn_per1 = shared * 1.0 / syn1
                syn_per2 = shared * 1.0 / syn2

                if per1 < threshold or per2 < threshold:
                    continue
                if total1 < gene_num or total2 < gene_num:
                    continue
                if syn1 < synthre or syn2 < synthre:
                    continue

                left_names = ",".join(pair[0] for pair in pairs)
                right_names = ",".join(pair[1] for pair in pairs)
                fields = (left_tad, right_tad, total1, total2, syn1, syn2,
                          per1, per2, syn_per1, syn_per2,
                          left_names, right_names)
                print("\t".join(str(field) for field in fields),
                      file=sys.stdout)
        logging.debug('Done')
Example #15
0
    def getGene(tads, genes, fraction=0.7, isnum=False, isPlot=False):
        """
        Annotate tads with genes and return as dict.

        Runs ``bedtools intersect -a tads -b genes -wao -F fraction`` and
        groups column 7 of the output by the TAD ID that `chrRangeID`
        builds from columns 1-3.

        Params:
        --------
        tads: `str` bed3 file of tad
        genes: `str` bed4 file of gene
        fraction: `str` or `float` fraction of gene 
                    overlap with tads [default: 0.7]
        isnum: `bool` if set output the gene number instead 
                    of gene list. [default: False]
        isPlot: `bool` if plot the gene number per TADs 
                    distribution. [default: False]

        Returns:
        --------
        out: `OrderedDict` TAD ID -> set of column-7 values, or their
                number when `isnum` is set

        Examples:
        --------
        >>> db = TADConserved().getGene("tad.bed", "gene.bed")
        """
        check_file_exists(tads)
        check_file_exists(genes)
        # A chained `0 > x > 1` can never be true; test the valid range
        # directly.
        if not 0 <= float(fraction) <= 1:
            logging.error('The option `-F` must set in '
                          'range [0, 1], and you set {}'.format(fraction))
            sys.exit(1)

        bedtools_cmd = "bedtools intersect -a {} -b {} -wao -F {} | \
                         cut -f 1-3,7 ".format(tads, genes, fraction)
        db = OrderedDict()

        # NOTE(review): `-wao` also reports TADs without any overlapping
        # gene; their column-7 placeholder is stored (and counted) like a
        # gene name -- confirm that this is intended.
        with os.popen(bedtools_cmd) as stream:
            for line in stream:
                line_list = line.strip().split()
                ID = chrRangeID(line_list[:3])
                db.setdefault(ID, set()).add(line_list[3])

        if isnum:
            for ID in db:
                db[ID] = len(db[ID])

        if isPlot:
            assert isnum, 'isnum must specify as True'
            gene_numbers = list(db.values())
            fig, ax = plt.subplots(figsize=(5, 5))
            # Pass a real list: a dict view is not converted to a 1-D array.
            sns.distplot(gene_numbers, hist=False, kde=True, ax=ax)
            ax.set_xticks(range(0, 41, 5))
            ax.set_xlim(0, 40)
            ax.set_xlabel('Gene number')
            ax.set_ylabel('Frequence')
            ax.set_title('Gene Number Distribution ({:,})'.format(
                sum(gene_numbers)))
            plot_prefix = genes.rsplit('.', 1)[0]
            plt.savefig('{}.gene_num_dist.pdf'.format(plot_prefix),
                        dpi=300)
            logging.debug('Successful to plot gene number distribution '
                          '`{}.gene_num_dist.pdf`.'.format(plot_prefix))

        return db
Example #16
0
def getSyntenyBlock(args):
    """
    %(prog)s bed1 bed2 anchor [Options]
        To get synteny block from anchor file, which is generate
            from jcvi
    
    """
    # NOTE: the docstring above is the argparse description and is kept as is.
    #
    # Pass 1 writes one line per usable gene pair
    #   chrom1 start1 end1 chrom2 start2 end2 gene1 gene2 blockN
    # to a side file; every `#` line of the anchor file starts a new block.
    # Pass 2 reads the side file back and, per block, extends the span on
    # both genomes over all pairs that lie on the block's first chromosomes.
    # The resulting `block chrom1 start1 end1 chrom2 start2 end2` lines go to
    # `--out`.

    p = argparse.ArgumentParser(prog=getSyntenyBlock.__name__,
                        description=getSyntenyBlock.__doc__,
                        conflict_handler='resolve')
    pReq = p.add_argument_group('Required arguments')
    pOpt = p.add_argument_group('Optional arguments')
    pReq.add_argument('bed1', help='gene bed file of species1')
    pReq.add_argument('bed2', help='gene bed file of species2')
    pReq.add_argument('anchor', help='anchor file of synteny gene pairs')
    pOpt.add_argument('-o', '--out', type=argparse.FileType('w'),
            default=sys.stdout, help='output file [default: %(default)s]')
    pOpt.add_argument('-h', '--help', action='help',
            help='show help message and exit.')

    args = p.parse_args(args)

    anchor = args.anchor
    check_file_exists(anchor)

    bed1 = create_bed_dict(args.bed1)
    bed2 = create_bed_dict(args.bed2)

    tmp_outfile = anchor.replace('anchor', 'bed')
    if tmp_outfile == anchor:
        # No 'anchor' in the name: never open the input itself for writing.
        tmp_outfile = anchor + '.bed'

    block_num = 0
    # Pairs listed before the first `#` header used to hit an unbound
    # `block`; they now fall into "block0".
    block = "block{}".format(block_num)
    # Both files are closed (and the side file flushed) before it is read
    # back below.
    with open(anchor) as fp, open(tmp_outfile, 'w') as tmp_out:
        for line in fp:
            if line.startswith("#"):
                block_num += 1
                block = "block{}".format(block_num)
                continue

            line_list = line.strip().split()
            if len(line_list) < 2:
                continue
            gene1, gene2 = line_list[:2]

            if gene1 not in bed1 or gene2 not in bed2:
                continue
            tmp_out.write('\t'.join(['\t'.join(bed1[gene1]),
                    '\t'.join(bed2[gene2]), gene1, gene2, block]) + "\n")

    block_db = OrderedDict()
    with open(tmp_outfile) as fp:
        for line in fp:
            line_list = line.strip().split()
            chr1, start1, end1, chr2, start2, end2 = line_list[:6]
            block = line_list[8]
            # Compare coordinates as numbers: as strings "9" sorts after "10".
            start1, end1 = int(start1), int(end1)
            start2, end2 = int(start2), int(end2)
            if block not in block_db:
                block_db[block] = [chr1, start1, end1, chr2, start2, end2]
                continue

            span = block_db[block]
            # Pairs on another chromosome than the block's first pair do not
            # extend the block.
            if span[0] != chr1 or span[3] != chr2:
                continue

            span[1] = min(span[1], start1)
            span[2] = max(span[2], end1)
            span[4] = min(span[4], start2)
            span[5] = max(span[5], end2)

    for block, span in block_db.items():
        print("\t".join([block] + [str(value) for value in span]),
              file=args.out)

    logging.debug('Successful ... result is in `{}`'.format(args.out.name))