def whichTAD(args):
    """
    %prog gene.bed tad.bed Options

    find gene location in tads
    """
    # `args` is the list of command-line tokens for this sub-command.
    # The docstring above doubles as the optparse usage text.
    p = OptionParser(whichTAD.__doc__)
    p.add_option('-f', dest='fraction', default='0.7',
                 help='the fraction of gene overlap of tads')
    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(p.print_help())

    genes, tads = args
    fraction = opts.fraction

    # Validate the cheap option first.  The previous test was written as
    # `0 > x > 1`, which Python reads as `0 > x and x > 1` and is therefore
    # never true, so out-of-range values were silently accepted.
    if not 0 <= float(fraction) <= 1:
        logging.error('The option `-f` must set in '
                      'range [0, 1], and you set {}'.format(fraction))
        sys.exit()

    check_file_exists(tads)
    check_file_exists(genes)

    bedtools_cmd = "bedtools intersect -a {} -b {} -wao -f {} | cut -f 4-8 ".format(
        genes, tads, fraction)
    # Close the pipe deterministically once all output has been echoed.
    with os.popen(bedtools_cmd) as pipe:
        for line in pipe:
            print(line.strip())
def getTSSbw(args):
    """
    %prog <tss.gff/bed> <chrom.sizes> <out_prefix> [options]

    To obtain a bedgraph file of tss sites density of per windows
    """
    # `args` is the list of command-line tokens for this sub-command.
    # A shell script with four steps (getTSS, makewindows, intersect,
    # bedGraphToBigWig) is written into the output directory and then run
    # with `sh`, or submitted through `qsub` when --qsub is given.
    p = OptionParser(getTSSbw.__doc__)
    p.add_option('-w', '--window', type=int, default=1000,
                 help='the window of tss density calculation')
    p.add_option('-o', '--out', default='./',
                 help='output [default: %default]')
    p.add_option('--qsub', default=False, action='store_true',
                 help='if qsub to sge [default: %default]')
    opts, args = p.parse_args(args)
    if len(args) != 3:
        sys.exit(p.print_help())

    gff, chrom_sizes, sample = args
    window = opts.window
    check_file_exists(chrom_sizes)

    command = 'qsub -pe mpi 1 -cwd -j y -S /bin/bash' if opts.qsub else 'sh'
    outdir = op.abspath(opts.out)

    fields = dict(gff=gff, sizes=chrom_sizes, sample=sample,
                  window=window, outdir=outdir)
    steps = (
        'python -m TDGP.analysis.genome getTSS {gff} > '
        '{outdir}/{sample}.gene.tss.bed',
        'bedtools makewindows -g {sizes} -w {window} > '
        '{outdir}/{sample}.{window}.window',
        'bedtools intersect -a {outdir}/{sample}.{window}.window -b '
        '{outdir}/{sample}.gene.tss.bed -c | sort -k1,1 -k2,2n > '
        '{outdir}/{sample}.gene.tss.{window}.bg',
        'bedGraphToBigWig {outdir}/{sample}.gene.tss.{window}.bg {sizes} '
        '{outdir}/{sample}.gene.tss.{window}.bw',
    )
    cmd = ''.join(step.format(**fields) + '\n' for step in steps)

    script = '{}/run_{}_tss.sh'.format(outdir, sample)
    with open(script, 'w') as out:
        out.write(cmd)

    # Report a failing launch instead of unconditionally logging success.
    status = os.system('{} {}'.format(command, script))
    if status != 0:
        logging.error('Failed to run `{}` (exit status {})'.format(
            script, status))
        return
    logging.debug('Successful')
def getTSS(args):
    """
    %prog infile [Options]

    To get TSS from gff or bed.
    """
    # `args` is the list of command-line tokens for this sub-command.
    # For every selected record a 3-column line `chrom, start, start + 1`
    # is written to the output.
    p = OptionParser(getTSS.__doc__)
    p.add_option('--type', default='gene',
                 help='the type of sequence [default: %default]')
    p.add_option('-o', '--out', default=sys.stdout,
                 help='output file. [default: stdout]')
    opts, args = p.parse_args(args)
    # Exactly one input is accepted; the unpack below cannot take more.
    if len(args) != 1:
        sys.exit(p.print_help())

    infile, = args
    check_file_exists(infile)

    filetype = guess_filetype(infile)
    if not filetype:
        logging.error('Input filetype must a gff or bed')
        sys.exit()

    # `-o PATH` arrives as a string and must be opened before printing to
    # it; the default is the already-open sys.stdout.
    to_file = isinstance(opts.out, str)
    out = open(opts.out, 'w') if to_file else opts.out
    try:
        # NOTE(review): the reported TSS is always the feature start,
        # regardless of strand -- confirm whether minus-strand features
        # should use `end` instead.
        if filetype == "gff":
            with must_open(infile, 'r') as fp:
                for line in fp:
                    if line.startswith("#") or not line.strip():
                        continue
                    line_list = line.strip().split('\t')
                    (chrom, _, type_, start, _end, _, _strand, _,
                     _info) = line_list[:9]
                    if type_ == opts.type:
                        print('\t'.join(map(str, (chrom, start,
                                                  int(start) + 1))),
                              file=out)
        elif filetype == 'bed':
            with must_open(infile, 'r') as fp:
                for line in fp:
                    if line.startswith("#") or not line.strip():
                        continue
                    chrom, start, _end = line.strip().split()[:3]
                    print('\t'.join(map(str, (chrom, start,
                                              int(start) + 1))),
                          file=out)
    finally:
        if to_file:
            out.close()

    destination = opts.out if to_file else 'stdout'
    logging.debug('Done, output is in `{}`'.format(destination))
def create_bed_dict(bedfile):
    """
    Load a BED file into a dictionary keyed by the 4th column.

    Params:
    --------
    bedfile: `str` path of a bed file with at least four columns

    Returns:
    --------
    out: `dict` maps name (column 4) to a `(chrom, start, end)` tuple of
        strings; when a name occurs more than once the last record wins.
    """
    check_file_exists(bedfile)
    bed_dict = {}
    with open(bedfile) as fp:
        for line in fp:
            fields = line.split()
            # Blank lines and `#` header/comment lines carry no record and
            # previously broke the 4-way unpack below.
            if not fields or line.startswith('#'):
                continue
            chrom, start, end, gene = fields[:4]
            bed_dict[gene] = (chrom, start, end)
    return bed_dict
def statFrag(args):
    """
    %(prog)s allValidParis [Options]

    stat the Ratio of theoretically digested genomic fragments
    covered by valid paired Hi-C reads.
    """
    # `args` is the list of command-line tokens for this sub-command.
    # NOTE: a second `statFrag` (without --unmap) is defined later in this
    # source; at import time the later definition replaces this one.
    p = argparse.ArgumentParser(prog=statFrag.__name__,
                                description=statFrag.__doc__,
                                conflict_handler='resolve')
    pReq = p.add_argument_group('Required arguments')
    pOpt = p.add_argument_group('Optional arguments')
    pReq.add_argument('validpairs', help='Validparis file')
    pReq.add_argument('enzyme', help='restriction enzyme site bed file')
    pOpt.add_argument('--unmap', default=False, action='store_true',
                      help='output the unmap fragments bed [default: %(default)s]')
    pOpt.add_argument('-o', '--output', type=argparse.FileType('w'),
                      default=sys.stdout, help='output file [default: stdout]')
    pOpt.add_argument('-h', '--help', action='help',
                      help='show help message and exit.')
    args = p.parse_args(args)

    check_file_exists(args.enzyme)
    columns = ['chrom', 'start', 'end', 'name', 'flag', 'strand']
    enzyme_df = pd.read_csv(args.enzyme, sep='\t', header=None,
                            index_col='name', names=columns)
    theoFrags = set(enzyme_df.index)
    theo_num = len(theoFrags)

    vp = ValidPairs(args.validpairs)
    realFrags = vp.getRealFrags()
    reality_num = len(realFrags)

    if args.unmap:
        unmapFrags = theoFrags - realFrags
        # Recent pandas releases reject a `set` as a .loc indexer, so hand
        # over a list of labels instead.
        unmap_df = enzyme_df.loc[list(unmapFrags)].reset_index()
        unmap_df = unmap_df[columns].sort_values(by=['chrom', 'start'])
        # NOTE(review): with the default stdout the file name below becomes
        # `<stdout>.unmap` -- confirm this is the intended destination.
        unmap_df.to_csv(args.output.name + ".unmap", sep='\t',
                        header=False, index=False)

    # An empty enzyme file has no theoretical fragments; report a zero
    # ratio rather than failing with ZeroDivisionError.
    ratio = reality_num * 1.0 / theo_num if theo_num else 0.0
    print("Theoretical Fragments\t{}".format(theo_num), file=args.output)
    print("Reality Fragments\t{}".format(reality_num), file=args.output)
    print("Reality Fragments Ratio (%)\t{:.2%}".format(ratio),
          file=args.output)
def __init__(self, filename, exclude=None,
             exclude_contig=['tig', 'Un', 'Sy', 'scaffold', 'ctg', 'Pt', 'Mt'],
             mem_cache='.'):
    """
    Set up the object from `filename`.

    Params:
    --------
    filename: `str` input file; checked with `check_file_exists`
    exclude: value passed through `listify` and stored as `self.exclude`
    exclude_contig: value passed through `listify` and stored as
        `self.exclude_contig`
    mem_cache: location handed to `Memory` for caching `_getGCBin`
        [default: '.']
    """
    check_file_exists(filename)
    self.filename = filename
    self.exclude = listify(exclude)
    self.exclude_contig = listify(exclude_contig)

    # Must run before the lookup tables below, which read
    # `self.chromLabels`.
    self.getChrSizes()

    # Two-way mapping between positional index and chromosome label.
    self.idx2label = dict(enumerate(self.chromLabels))
    self.label2idx = {
        label: idx for idx, label in enumerate(self.chromLabels)
    }

    # `getGCBin` is the cached front-end of `_getGCBin`.
    self.mem_cache = mem_cache
    self.memory = Memory(mem_cache, verbose=0)
    self.getGCBin = self.memory.cache(self._getGCBin)
def convertAnchorsToLink(args):
    """
    %(prog)s bed1 bed2 anchor [Options]

    To convert anchors file to link bed file, which is generate from jcvi
    """
    # `args` is the list of command-line tokens for this sub-command.
    p = argparse.ArgumentParser(prog=convertAnchorsToLink.__name__,
                                description=convertAnchorsToLink.__doc__,
                                conflict_handler='resolve')
    pReq = p.add_argument_group('Required arguments')
    pOpt = p.add_argument_group('Optional arguments')
    pReq.add_argument('bed1', help='gene bed file of species1')
    pReq.add_argument('bed2', help='gene bed file of species2')
    pReq.add_argument('anchor', help='anchor file of synteny gene pairs')
    pOpt.add_argument('-o', '--out', type=argparse.FileType('w'),
                      default=sys.stdout,
                      help='output file [default: %(default)s]')
    pOpt.add_argument('-h', '--help', action='help',
                      help='show help message and exit.')
    args = p.parse_args(args)

    anchor = args.anchor
    check_file_exists(anchor)
    bed1 = create_bed_dict(args.bed1)
    bed2 = create_bed_dict(args.bed2)

    # The former `sort -V` subprocess was spawned but never written to,
    # read from or waited on, so it only leaked a process; it is gone.
    with open(anchor) as fp:
        for line in fp:
            if line.startswith("#"):
                continue
            line_list = line.split()
            # Blank or truncated lines carry no gene pair.
            if len(line_list) < 2:
                continue
            gene1, gene2 = line_list[:2]
            if gene1 not in bed1 or gene2 not in bed2:
                continue
            record = '\t'.join(
                list(bed1[gene1]) + list(bed2[gene2]) + [gene1, gene2])
            # NOTE(review): the explicit "\n" plus print's own newline
            # leaves an empty line after every record -- kept as-is,
            # confirm it is intended.
            print("{}\n".format(record), file=args.out)
def genePairTAD(genes, tads, fraction=0.7):
    """
    Map every gene to the TAD it overlaps and return as dict.

    Params:
    --------
    genes: `str` bed4 file of gene
    tads: `str` bed3 file of tad
    fraction: `str` or `float` fraction of gene overlap with tads
        [default: 0.7]

    Returns:
    --------
    out: `OrderedDict` gene name -> TAD identifier built by `chrRangeID`;
        genes without an overlapping TAD are left out.

    Examples:
    --------
    >>> db = TADConserved().genePairTAD("gene.bed", "tad.bed")  # doctest: +SKIP
    """
    # The previous test `0 > x > 1` means `0 > x and x > 1`, which is never
    # true, so invalid fractions slipped through.
    if not 0 <= float(fraction) <= 1:
        logging.error('The option `-f` must set in '
                      'range [0, 1], and you set {}'.format(fraction))
        sys.exit()

    check_file_exists(tads)
    check_file_exists(genes)

    bedtools_cmd = "bedtools intersect -a {} -b {} -wao -f {} | cut -f 4-7 ".format(
        genes, tads, fraction)
    db = OrderedDict()
    with os.popen(bedtools_cmd) as pipe:
        for line in pipe:
            line_list = line.split()
            if len(line_list) < 4:
                continue
            gene, chrom, start, end = line_list[:4]
            # With `-wao`, a gene without overlap is reported with "." in
            # place of the TAD chromosome.
            if chrom == ".":
                continue
            db[gene] = chrRangeID([chrom, start, end])
    return db
def getBoundaryBed(args):
    """
    %prog <tad.bed> <chrom.sizes> [Options]

    get a bed file of the tad boundary.
    """
    # `args` is the list of command-line tokens for this sub-command.
    p = OptionParser(getBoundaryBed.__doc__)
    p.add_option('-a', '--up', type=int, default=0,
                 help='the upstrean distance of boundary '
                      '[default: %default]')
    p.add_option('-b', '--down', type=int, default=1,
                 help='the downstream distance of boundary '
                      '[default: %default]')
    opts, args = p.parse_args(args)
    # Exactly two inputs are accepted; the unpack below cannot take more.
    if len(args) != 2:
        sys.exit(p.print_help())

    tadFile, chromSize = args
    check_file_exists(chromSize)
    up, down = opts.up, opts.down

    if not op.exists(tadFile) or not op.exists(chromSize):
        logging.error('The input file is not exists')
        # Stop here: continuing would only fail later on the missing file.
        sys.exit()

    # chrom name -> size, both kept as the strings read from the file.
    with open(chromSize) as fp:
        chrom_dict = dict(i.strip().split() for i in fp if i.strip())

    tf = TADFile(tadFile)
    tf.getBoundaryDict()
    boundaryBed = tf.getBoundaryBed(tf.boundaryDict, chrom_dict, up, down)
    for item in sorted(boundaryBed):
        print("\t".join(map(str, item[:3])))
    logging.debug('Successful output boundary bed')
def getRealityBed(args):
    """
    %(prog)s <allValidParis> <theoretical.bed> [Options] > <reality.bed>

    get reality fragment bed file from valid pairs file and theoretical bed.
    """
    # `args` is the list of command-line tokens for this sub-command.
    p = argparse.ArgumentParser(prog=getRealityBed.__name__,
                                description=getRealityBed.__doc__,
                                conflict_handler='resolve')
    pReq = p.add_argument_group('Required arguments')
    pOpt = p.add_argument_group('Optional arguments')
    pReq.add_argument('validpairs', help='Validparis file')
    pReq.add_argument('enzyme', help='restriction enzyme site bed file')
    pOpt.add_argument('-o', '--output', type=argparse.FileType('w'),
                      default=sys.stdout, help='output file [default: stdout]')
    pOpt.add_argument('-h', '--help', action='help',
                      help='show help message and exit.')
    args = p.parse_args(args)

    check_file_exists(args.enzyme)
    columns = ['chrom', 'start', 'end', 'name', 'flag', 'strand']
    enzyme_df = pd.read_csv(args.enzyme, sep='\t', header=None,
                            index_col='name', names=columns)

    vp = ValidPairs(args.validpairs)
    realFrags = vp.getRealFrags()

    # Recent pandas releases reject a `set` as a .loc indexer, so hand over
    # a list of fragment names instead.
    df = enzyme_df.loc[list(realFrags)].reset_index()
    df = df[columns].sort_values(by=['chrom', 'start'])
    df.to_csv(args.output, sep='\t', header=False, index=False)
def statFrag(args):
    """
    %(prog)s allValidParis [Options]

    stat the Ratio of theoretically digested genomic fragments
    covered by valid paired Hi-C reads.
    """
    # `args` is the list of command-line tokens for this sub-command.
    p = argparse.ArgumentParser(prog=statFrag.__name__,
                                description=statFrag.__doc__,
                                conflict_handler='resolve')
    pReq = p.add_argument_group('Required arguments')
    pOpt = p.add_argument_group('Optional arguments')
    pReq.add_argument('validpairs', help='Validparis file')
    pReq.add_argument('enzyme', help='restriction enzyme site bed file')
    pOpt.add_argument('-o', '--out', type=argparse.FileType('w'),
                      default=sys.stdout, help='output file [default: stdout]')
    pOpt.add_argument('-h', '--help', action='help',
                      help='show help message and exit.')
    args = p.parse_args(args)

    check_file_exists(args.enzyme)
    # Every non-blank line of the enzyme bed counts as one theoretical
    # fragment.
    with open(args.enzyme, 'r') as fp:
        theo_num = sum(1 for line in fp if line.strip())

    vp = ValidPairs(args.validpairs)
    realFrags = vp.getRealFrags()
    reality_num = len(realFrags)

    # An empty enzyme file has no theoretical fragments; report a zero
    # ratio rather than failing with ZeroDivisionError.
    ratio = reality_num * 1.0 / theo_num if theo_num else 0.0
    print("Theoretical Fragments\t{}".format(theo_num), file=args.out)
    print("Reality Fragments\t{}".format(reality_num), file=args.out)
    print("Reality Fragments Ratio (%)\t{:.2%}".format(ratio),
          file=args.out)
def plotIDEMultiv1(args):
    """
    %(prog)s 1.ValidPairs 2.ValidPairs ... [Options]

    To multi sample IDE in a picture.
    """
    # `args` is the list of command-line tokens for this sub-command.
    # One curve of contact counts against binned cis distance is drawn per
    # validpairs file on log-log axes; the legend shows the log-log
    # regression slope next to each label.
    #
    # The parser is built from this function's own name and docstring, and
    # every `%(default)s` / `%(prog)s` specifier carries its `s` conversion;
    # without it argparse cannot format the help text.
    p = argparse.ArgumentParser(prog=plotIDEMultiv1.__name__,
                                description=plotIDEMultiv1.__doc__,
                                conflict_handler='resolve')
    pReq = p.add_argument_group('Required arguments')
    pOpt = p.add_argument_group('Optional arguments')
    pReq.add_argument('validpairs', nargs="+", help='validpairs file')
    pReq.add_argument('--labels', nargs='+', required=True,
                      help='lable for legend')
    pReq.add_argument('-o', '--out', required=True, help='output file')
    pOpt.add_argument('--chrom', default=None, help='plot chrom list')
    pOpt.add_argument('--scale', default=100000, type=int, metavar='int',
                      help='the scale of data [default: %(default)s]')
    p.add_argument('--xmin', default=1e5, type=float, metavar='float',
                   help='min value of xtick [default: %(default)s]')
    p.add_argument('--xmax', default=2e7, type=float, metavar='float',
                   help='max value of xtick [default: %(default)s]')
    pOpt.add_argument('-h', '--help', action='help',
                      help='show help message and exit.')
    args = p.parse_args(args)

    from scipy.stats import linregress

    scale = args.scale
    xmin = args.xmin
    xmax = args.xmax
    out = args.out

    plt.subplots(figsize=(5, 5))

    # --chrom is either a file whose first column lists chromosomes, or a
    # comma-separated list given directly on the command line.
    if args.chrom:
        if op.exists(args.chrom):
            with open(args.chrom) as fp:
                chrom = [i.strip().split()[0] for i in fp if i.strip()]
        else:
            chrom = args.chrom.split(',')
    else:
        chrom = args.chrom

    for validpair in args.validpairs:
        check_file_exists(validpair)
    assert len(args.validpairs) == len(args.labels), \
        'input validpair file must equal to labels'

    for validpair, label in zip(args.validpairs, args.labels):
        vp = ValidPairs(validpair)
        distance_db = vp.getCisDistance(chrom=chrom)
        data = np.array(list(chain.from_iterable(distance_db.values())))
        # Snap every distance down to a multiple of `scale`, then keep the
        # requested range only.
        data = data // scale * scale
        data = data[(data >= xmin) & (data <= xmax)]
        unique, counts = np.unique(data, return_counts=True)
        slope = linregress(np.log10(unique), np.log10(counts)).slope
        label = "{} ({:.2f})".format(label, slope)
        plt.plot(list(unique), list(counts), label=label)

    plt.legend(loc='best', fontsize=13)
    plt.ylabel('Contact probability', dict(size=14))
    plt.xlabel('Distance (bp)', dict(size=14))
    plt.yscale('log')
    plt.xscale('log')
    plt.savefig(out, dpi=300, bbox_inches='tight')
    # A PNG copy is always written next to the requested output.
    plt.savefig(out.rsplit(".", 1)[0] + '.png',
                dpi=300, bbox_inches='tight')
    logging.debug('Successful, picture is in `{}`'.format(out))
def plotBoundary(args):
    """
    %prog boundary.bed data.bw samplelabel [options]

    To plot omics data density in tads boundary.
    """
    # `args` is the list of command-line tokens for this sub-command.
    # A shell script with a `computeMatrix` and a `plotProfile` command is
    # written to the working directory and run with `sh`.
    p = OptionParser(plotBoundary.__doc__)
    p.add_option('-b', dest='up', default=50000, type=int,
                 help='upstream distance of boundary [default: %default]')
    p.add_option('-a', dest='down', default=50000, type=int,
                 help='downstream distance of boundary [default: %default]')
    p.add_option('--binSize', default=1000, type=int,
                 help='calculate binSize [default: %default]')
    p.add_option('-p', '--process', default=8, type=int,
                 help='process of program [default:%default]')
    p.add_option('-o', '--output', default=None,
                 help='the plot output prefix [default: tadprefix_label]')
    opts, args = p.parse_args(args)
    if len(args) != 3:
        sys.exit(p.print_help())

    boundary, data, label = args
    check_file_exists(boundary)
    check_file_exists(data)

    up = opts.up
    down = opts.down
    binSize = opts.binSize
    process = opts.process
    prefix = opts.output if opts.output \
        else op.basename(boundary).replace('.bed', '')

    # Every continuation backslash is preceded by a space so that adjacent
    # arguments can never be glued together by the shell.
    compute_cmd = """
computeMatrix reference-point -S {data} -R {boundary} \\
    --referencePoint center -b {up} -a {down} --binSize {binSize} \\
    --samplesLabel {label} -p {process} --missingDataAsZero \\
    --skipZeros -o {prefix}_{label}.matrix.gz \\
    --outFileSortedRegions {prefix}_{label}.bed
""".format(data=data, boundary=boundary, binSize=binSize, up=up,
           down=down, label=label, prefix=prefix, process=process)
    plot_cmd = """
plotProfile -m {prefix}_{label}.matrix.gz --refPointLabel Boundary \\
    -out {prefix}_{label}.pdf --plotHeight 10 --plotWidth 12
""".format(prefix=prefix, label=label)

    script = 'run_{}_{}.sh'.format(prefix, label)
    with open(script, 'w') as out:
        out.write(compute_cmd + "\n")
        out.write(plot_cmd)

    logging.debug('Starting plot {} density in boundary'.format(label))
    # Report a failing run instead of unconditionally claiming success.
    status = os.system('sh {}'.format(script))
    if status != 0:
        logging.error('Failed to run `{}` (exit status {})'.format(
            script, status))
        return
    logging.debug('Done, picture is `{prefix}_{label}.pdf`'.format(
        prefix=prefix, label=label))
def getConserved(tad1, tad2, syngene1, syngene2, gene1, gene2, anchors,
                 fraction=0.7, threshold=0, gene_num=0, synthre=0):
    """
    Get all syntenic TADs between two species.

    Writes a tab-separated table to stdout with the columns:
        tad1 tad2 geneNum1 geneNum2 synNum1 synNum2
        genePer1 genePer2 synPer1 synPer2 geneList1 geneList2

    Params:
    --------
    tad1, tad2: `str` tad bed file of species 1 / 2
    syngene1, syngene2: `str` bed file of syntenic genes of species 1 / 2
    gene1, gene2: `str` bed file of all genes of species 1 / 2
    anchors: `str` anchors file; the first two columns are the gene pair
        and lines starting with `#` are skipped
    fraction: `str` or `float` overlap fraction passed on to
        `getGene` / `genePairTAD` [default: 0.7]
    threshold: minimum of both gene percentages [default: 0]
    gene_num: minimum gene number of both TADs [default: 0]
    synthre: minimum syntenic gene number of both TADs [default: 0]

    Examples:
    --------
    >>> tc = TADConserved()  # doctest: +SKIP
    >>> tc.getConserved(tad1, tad2, syngene1, syngene2,
    ...                 gene1, gene2, anchor)  # doctest: +SKIP
    """
    logging.debug('Start ...')
    check_file_exists(anchors)
    tc = TADConserved()
    tadSynGeneNum1 = tc.getGene(tad1, syngene1, fraction, isnum=True)
    tadSynGeneNum2 = tc.getGene(tad2, syngene2, fraction, isnum=True)
    tadGeneNum1 = tc.getGene(tad1, gene1, fraction, isnum=True)
    tadGeneNum2 = tc.getGene(tad2, gene2, fraction, isnum=True)
    geneTAD1 = tc.genePairTAD(syngene1, tad1, fraction)
    geneTAD2 = tc.genePairTAD(syngene2, tad2, fraction)

    # db[tad of species1][tad of species2] -> list of (geneA, geneB) pairs.
    db = OrderedDict()
    with open(anchors, 'r') as fp:
        for line in fp:
            if line.startswith("#"):
                continue
            fields = line.split()
            # Blank or truncated lines carry no gene pair.
            if len(fields) < 2:
                continue
            # Separate names so the `gene1` / `gene2` parameters are not
            # overwritten by the loop.
            pair_gene1, pair_gene2 = fields[:2]
            try:
                anchor1 = geneTAD1[pair_gene1]
                anchor2 = geneTAD2[pair_gene2]
            except KeyError:
                continue
            partners = db.setdefault(anchor1, OrderedDict())
            partners.setdefault(anchor2, []).append((pair_gene1, pair_gene2))

    header = ('#tad1', 'tad2', 'total_gene_num1', 'total_gene_num2',
              'syn_gene_num1', 'syn_gene_num2', 'gene_per1', 'gene_per2',
              'syngene_per1', 'syngene_per2', 'gene_list1', 'gene_list2')
    print("\t".join(header), file=sys.stdout)

    for anchor1, partners in db.items():
        for anchor2, pairs in partners.items():
            genes1 = [pair[0] for pair in pairs]
            genes2 = [pair[1] for pair in pairs]
            geneNum1 = tadGeneNum1[anchor1]
            geneNum2 = tadGeneNum2[anchor2]
            synGeneNum1 = tadSynGeneNum1[anchor1]
            synGeneNum2 = tadSynGeneNum2[anchor2]
            genePer1 = len(genes1) * 1.0 / geneNum1
            genePer2 = len(genes2) * 1.0 / geneNum2
            synGenePer1 = len(genes1) * 1.0 / synGeneNum1
            synGenePer2 = len(genes2) * 1.0 / synGeneNum2

            if genePer1 >= threshold and genePer2 >= threshold and \
                    geneNum1 >= gene_num and geneNum2 >= gene_num and \
                    synGeneNum1 >= synthre and synGeneNum2 >= synthre:
                row = (anchor1, anchor2, geneNum1, geneNum2,
                       synGeneNum1, synGeneNum2, genePer1, genePer2,
                       synGenePer1, synGenePer2,
                       ",".join(genes1), ",".join(genes2))
                print("\t".join(map(str, row)), file=sys.stdout)
    logging.debug('Done')
def getGene(tads, genes, fraction=0.7, isnum=False, isPlot=False):
    """
    Annotate tads with genes and return as dict.

    Params:
    --------
    tads: `str` bed3 file of tad
    genes: `str` bed4 file of gene
    fraction: `str` or `float` fraction of gene overlap with tads
        [default: 0.7]
    isnum: `bool` if set output the gene number instead of gene list.
        [default: False]
    isPlot: `bool` if plot the gene number per TADs distribution;
        requires `isnum=True`. [default: False]

    Returns:
    --------
    out: `OrderedDict` TAD identifier -> set of gene names, or -> gene
        count when `isnum` is set

    Examples:
    --------
    >>> db = TADConserved().getGene("tad.bed", "gene.bed")  # doctest: +SKIP
    """
    # The previous test `0 > x > 1` means `0 > x and x > 1`, which is never
    # true, so invalid fractions slipped through.
    if not 0 <= float(fraction) <= 1:
        logging.error('The option `-F` must set in '
                      'range [0, 1], and you set {}'.format(fraction))
        sys.exit()

    check_file_exists(tads)
    check_file_exists(genes)

    bedtools_cmd = ("bedtools intersect -a {} -b {} -wao -F {} | "
                    "cut -f 1-3,7 ").format(tads, genes, fraction)
    db = OrderedDict()
    with os.popen(bedtools_cmd) as pipe:
        for line in pipe:
            line_list = line.split()
            if len(line_list) < 4:
                continue
            ID = chrRangeID(line_list[:3])
            db.setdefault(ID, set()).add(line_list[3])

    if isnum:
        db = OrderedDict((ID, len(names)) for ID, names in db.items())

    if isPlot:
        assert isnum, 'isnum must specify as True'
        prefix = genes.rsplit('.', 1)[0]
        # A concrete list is needed here: a dict view is not array-like.
        gene_numbers = list(db.values())
        fig, ax = plt.subplots(figsize=(5, 5))
        # KDE-only curve; replaces the deprecated
        # `distplot(hist=False, kde=True)` call.
        sns.kdeplot(gene_numbers, ax=ax)
        ax.set_xticks(range(0, 41, 5))
        ax.set_xlim(0, 40)
        ax.set_xlabel('Gene number')
        ax.set_ylabel('Frequence')
        ax.set_title('Gene Number Distribution ({:,})'.format(
            sum(gene_numbers)))
        plt.savefig('{}.gene_num_dist.pdf'.format(prefix), dpi=300)
        logging.debug('Successful to plot gene number distribution '
                      '`{}.gene_num_dist.pdf`.'.format(prefix))

    return db
def getSyntenyBlock(args):
    """
    %(prog)s bed1 bed2 anchor [Options]

    To get synteny block from anchor file, which is generate from jcvi
    """
    # `args` is the list of command-line tokens for this sub-command.
    # Two outputs are produced:
    #   * a per-gene-pair link file written next to the anchor file, and
    #   * one line per block on --out giving the span covered on each
    #     genome: block, chr1, start1, end1, chr2, start2, end2.
    p = argparse.ArgumentParser(prog=getSyntenyBlock.__name__,
                                description=getSyntenyBlock.__doc__,
                                conflict_handler='resolve')
    pReq = p.add_argument_group('Required arguments')
    pOpt = p.add_argument_group('Optional arguments')
    pReq.add_argument('bed1', help='gene bed file of species1')
    pReq.add_argument('bed2', help='gene bed file of species2')
    pReq.add_argument('anchor', help='anchor file of synteny gene pairs')
    pOpt.add_argument('-o', '--out', type=argparse.FileType('w'),
                      default=sys.stdout,
                      help='output file [default: %(default)s]')
    pOpt.add_argument('-h', '--help', action='help',
                      help='show help message and exit.')
    args = p.parse_args(args)

    anchor = args.anchor
    check_file_exists(anchor)
    bed1 = create_bed_dict(args.bed1)
    bed2 = create_bed_dict(args.bed2)

    tmp_outfile = anchor.replace('anchor', 'bed')
    # If the name does not contain 'anchor' the replacement is a no-op and
    # writing would truncate the input file itself.
    if tmp_outfile == anchor:
        tmp_outfile = anchor + '.bed'

    block_num = 0
    # Pairs listed before the first `#` header fall into "block0".
    block = "block{}".format(block_num)
    block_db = OrderedDict()
    # The spans are accumulated while the link file is written, so nothing
    # depends on re-reading a file that has not been flushed yet.
    with open(anchor) as fp, open(tmp_outfile, 'w') as tmp_out:
        for line in fp:
            if line.startswith("#"):
                block_num += 1
                block = "block{}".format(block_num)
                continue
            line_list = line.split()
            # Blank or truncated lines carry no gene pair.
            if len(line_list) < 2:
                continue
            gene1, gene2 = line_list[:2]
            if gene1 not in bed1 or gene2 not in bed2:
                continue

            chr1, start1, end1 = bed1[gene1]
            chr2, start2, end2 = bed2[gene2]
            tmp_out.write('\t'.join([chr1, start1, end1,
                                     chr2, start2, end2,
                                     gene1, gene2, block]) + "\n")

            # Coordinates are compared as integers; as strings "9" would
            # sort after "10".
            start1, end1 = int(start1), int(end1)
            start2, end2 = int(start2), int(end2)
            if block not in block_db:
                block_db[block] = [chr1, start1, end1, chr2, start2, end2]
                continue
            span = block_db[block]
            # Pairs on another chromosome than the block's first pair do
            # not extend the block.
            if span[0] != chr1 or span[3] != chr2:
                continue
            span[1] = min(span[1], start1)
            span[2] = max(span[2], end1)
            span[4] = min(span[4], start2)
            span[5] = max(span[5], end2)

    for block, span in block_db.items():
        print("\t".join([block] + [str(value) for value in span]),
              file=args.out)
    logging.debug('Successful ... result is in `{}`'.format(args.out.name))