Example #1
0
def make_overlap_graph(mgene_id, merged_g2t, merged_tid_lines):
    """Build a graph connecting transcripts of gene 'mgene_id' whose exons overlap.

    merged_g2t: dict mapping gene id -> iterable of transcript ids.
    merged_tid_lines: dict mapping transcript id -> GTF lines for that transcript.
    Returns a networkx Graph with an edge for each pair of distinct
    transcripts with a same-strand exon overlap.
    """
    # make temporary gff file for gene (exon features only)
    tmp_out = open('%s.gff' % mgene_id, 'w')
    for tid in merged_g2t[mgene_id]:
        for line in merged_tid_lines[tid]:
            a = line.split('\t')
            if a[2] == 'exon':
                # trailing comma: 'line' presumably keeps its newline -- TODO confirm
                print >> tmp_out, line,
    tmp_out.close()

    tid_overlap_graph = nx.Graph()

    # intersect with self (stranded); each overlapping pair appears on one -wo line
    proc = subprocess.Popen('intersectBed -wo -s -a %s.gff -b %s.gff' % (mgene_id,mgene_id), shell=True, stdout=subprocess.PIPE)
    line = proc.stdout.readline()
    while line:
        a = line.split('\t')

        # columns 8 and 17 are the GTF attribute strings of the two features
        tid1 = gff.gtf_kv(a[8])['transcript_id']
        tid2 = gff.gtf_kv(a[17])['transcript_id']

        # ignore same and ignore different ref genes
        if tid1 != tid2:
            tid_overlap_graph.add_edge(tid1,tid2)

        line = proc.stdout.readline()
    proc.communicate()

    # clean up the temporary per-gene GFF
    os.remove('%s.gff' % mgene_id)

    return tid_overlap_graph
Example #2
0
def hash_genes_repeats_nt(gtf_file, repeats_gff, gene_key='gene_id', add_star=True):
    """Hash nucleotide overlap between genes and repeats.

    Intersects gtf_file with repeats_gff via intersectBed and returns
    {gene_id: {(repeat, family): overlapping bp}}.  When add_star is True,
    ('*', family) and ('*', '*') aggregate totals are also accumulated.

    gene_key selects which GTF attribute identifies the gene.
    """
    gene_repeat_nt = {}

    p = subprocess.Popen('intersectBed -wo -a %s -b %s' % (gtf_file, repeats_gff), shell=True, stdout=subprocess.PIPE)
    line = p.stdout.readline()
    while line:
        a = line.split('\t')

        # get names (a[8] is the gene GTF attributes; a[17] the repeat's)
        # bug fix: honor the gene_key parameter instead of hard-coding 'gene_id'
        gene_id = gtf_kv(a[8])[gene_key]
        rep_kv = gtf_kv(a[17])
        rep = rep_kv['repeat']
        fam = rep_kv['family']

        # get overlap (final intersectBed -wo column: bp of overlap)
        nt_overlap = int(a[18])

        if not gene_id in gene_repeat_nt:
            gene_repeat_nt[gene_id] = {}

        gene_repeat_nt[gene_id][(rep,fam)] = gene_repeat_nt[gene_id].get((rep,fam),0) + nt_overlap
        if add_star:
            gene_repeat_nt[gene_id][('*',fam)] = gene_repeat_nt[gene_id].get(('*',fam),0) + nt_overlap
            gene_repeat_nt[gene_id][('*','*')] = gene_repeat_nt[gene_id].get(('*','*'),0) + nt_overlap

        line = p.stdout.readline()
    p.communicate()

    return gene_repeat_nt
Example #3
0
def hash_repeats_genes(gtf_file, repeats_gff, gene_key='gene_id', add_star=True, stranded=False):
    """Hash the set of genes overlapped by each repeat.

    Returns {(repeat, family): set(gene_ids)}; keys gain a '+'/'-' relative
    orientation element when stranded is True.  When add_star is True,
    ('*', family) and ('*', '*') aggregate keys are also filled.
    """
    repeat_genes = {}
    # bug fix: create the ('*','*') aggregate keys to match the 'stranded'
    # flag.  Previously the stranded star keys were created whenever
    # add_star was set, so an unstranded run with add_star=True had no
    # ('*','*') entry and raised KeyError on the first overlap.
    if add_star:
        if stranded:
            repeat_genes[('*','*','+')] = set()
            repeat_genes[('*','*','-')] = set()
        else:
            repeat_genes[('*','*')] = set()

    # pre-create empty sets so repeats with no overlapping genes still appear
    for line in open(repeats_gff):
        a = line.split('\t')
        kv = gtf_kv(a[8])

        if stranded:
            repeat_genes[(kv['repeat'],kv['family'],'+')] = set()
            repeat_genes[(kv['repeat'],kv['family'],'-')] = set()
            if add_star:
                repeat_genes[('*',kv['family'],'+')] = set()
                repeat_genes[('*',kv['family'],'-')] = set()
        else:
            repeat_genes[(kv['repeat'],kv['family'])] = set()
            if add_star:
                repeat_genes[('*',kv['family'])] = set()

    p = subprocess.Popen('intersectBed -wo -a %s -b %s' % (gtf_file, repeats_gff), shell=True, stdout=subprocess.PIPE)
    line = p.stdout.readline()
    while line:
        a = line.split('\t')

        # get names (a[8]: gene attributes; a[17]: repeat attributes)
        gene_id = gtf_kv(a[8])[gene_key]
        rep_kv = gtf_kv(a[17])
        rep = rep_kv['repeat']
        fam = rep_kv['family']

        # get strands
        gene_strand = a[6]
        te_strand = a[15]

        if stranded:
            # orientation of the repeat relative to the gene
            if gene_strand == te_strand:
                orient = '+'
            else:
                orient = '-'

            repeat_genes[(rep,fam,orient)].add(gene_id)
            if add_star:
                repeat_genes[('*',fam,orient)].add(gene_id)
                repeat_genes[('*','*',orient)].add(gene_id)

        else:
            repeat_genes[(rep,fam)].add(gene_id)
            if add_star:
                repeat_genes[('*',fam)].add(gene_id)
                repeat_genes[('*','*')].add(gene_id)

        line = p.stdout.readline()
    p.communicate()

    return repeat_genes
Example #4
0
def filter_single(ref_gtf):
    """Write a GTF copy containing only genes overlapping no other gene.

    Overlap is computed stranded by intersecting ref_gtf with itself via
    intersectBed.  Returns (fd, path) from tempfile.mkstemp(); the caller
    owns both.
    """
    # intersect with self and compute overlap sets
    #p = subprocess.Popen('intersectBed -sorted -wo -s -a %s -b %s' % (ref_gtf, ref_gtf), shell=True, stdout=subprocess.PIPE)
    p = subprocess.Popen('intersectBed -wo -s -a %s -b %s' % (ref_gtf, ref_gtf), shell=True, stdout=subprocess.PIPE)

    # compute overlaps: gene_id -> set of distinct overlapping gene ids
    gene_overlaps = {}
    for line in p.stdout:
        a = line.split('\t')

        # columns 8 and 17 hold the two features' GTF attribute strings
        gid1 = gff.gtf_kv(a[8])['gene_id']
        gid2 = gff.gtf_kv(a[17])['gene_id']

        if gid1 != gid2:
            gene_overlaps.setdefault(gid1,set()).add(gid2)
            gene_overlaps.setdefault(gid2,set()).add(gid1)

    p.communicate()

    # filter overlapping genes out
    single_gtf_fd, single_gtf_file = tempfile.mkstemp()
    single_gtf_out = open(single_gtf_file, 'w')
    for line in open(ref_gtf):
        a = line.split('\t')
        gene_id = gff.gtf_kv(a[8])['gene_id']
        if gene_id not in gene_overlaps:
            # trailing comma: 'line' already carries its newline
            print >> single_gtf_out, line,
    single_gtf_out.close()

    return single_gtf_fd, single_gtf_file
def filter_single(ref_gtf):
    """Remove genes that overlap another gene on the same strand.

    Copies the lines of ref_gtf whose gene overlaps no other gene into a
    tempfile and returns (fd, path) from tempfile.mkstemp().
    """
    # self-intersection (stranded); every overlapping pair appears once per line
    cmd = 'intersectBed -wo -s -a %s -b %s' % (ref_gtf, ref_gtf)
    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)

    # collect, for each gene, the other genes it overlaps
    gene_overlaps = {}
    for line in p.stdout:
        cols = line.split('\t')
        gid1 = gff.gtf_kv(cols[8])['gene_id']
        gid2 = gff.gtf_kv(cols[17])['gene_id']
        if gid1 != gid2:
            gene_overlaps.setdefault(gid1, set()).add(gid2)
            gene_overlaps.setdefault(gid2, set()).add(gid1)
    p.communicate()

    # copy through only the genes with no recorded overlap
    single_gtf_fd, single_gtf_file = tempfile.mkstemp()
    single_gtf_out = open(single_gtf_file, 'w')
    for line in open(ref_gtf):
        gene_id = gff.gtf_kv(line.split('\t')[8])['gene_id']
        if gene_id not in gene_overlaps:
            print >> single_gtf_out, line,
    single_gtf_out.close()

    return single_gtf_fd, single_gtf_file
Example #6
0
def preprocess_anchors(anchor_gff, mode, max_anchors, anchor_is_gtf, min_length, window):
    """Filter, sample, and reshape anchors from a GFF into a tempfile GFF.

    mode 'span' copies chosen anchor lines through unchanged; mode 'mid'
    re-centers each feature on its midpoint with a fixed 'window' width.
    Anchors shorter than min_length are dropped; at most max_anchors are
    kept (sampled at random).  Returns (fd, path) from tempfile.mkstemp().

    NOTE(review): Python 2 semantics are relied upon here -- integer '/'
    floors, and dict.keys() yields a list, making delete-during-iteration safe.
    """
    # get lengths (an anchor may span multiple lines, e.g. a transcript's exons)
    anchor_lengths = {}
    for line in open(anchor_gff):
        a = line.split('\t')

        # GTF anchors are keyed by transcript_id; plain GFF by coordinates
        if anchor_is_gtf:
            anchor_id = gff.gtf_kv(a[8])['transcript_id']
        else:
            anchor_id = (a[0], int(a[3]), int(a[4]))

        anchor_lengths[anchor_id] = anchor_lengths.get(anchor_id,0) + int(a[4])-int(a[3])+1

    # filter small
    if min_length != None:
        for anchor_id in anchor_lengths.keys():
            if anchor_lengths[anchor_id] < min_length:
                del anchor_lengths[anchor_id]

    # sample down to max_anchors without replacement
    if max_anchors < len(anchor_lengths):
        anchors_chosen = set(random.sample(anchor_lengths.keys(), max_anchors))
    else:
        anchors_chosen = set(anchor_lengths.keys())

    # make new GFF
    prep_anchor_fd, prep_anchor_gff = tempfile.mkstemp()
    print >> sys.stderr, 'Opening tempfile %s for preprocessed anchors.' % prep_anchor_gff
    prep_anchor_out = open(prep_anchor_gff, 'w')

    for line in open(anchor_gff):
        a = line.split('\t')

        if anchor_is_gtf:
            anchor_id = gff.gtf_kv(a[8])['transcript_id']
        else:
            anchor_id = (a[0], int(a[3]), int(a[4]))

        if anchor_id in anchors_chosen:
            if mode == 'span':
                print >> prep_anchor_out, line,
            elif mode == 'mid':
                # standardize size: 'window' bp centered on the feature midpoint
                start = int(a[3])
                end = int(a[4])
                mid = start + (end-start)/2
                a[3] = str(mid - window/2)
                a[4] = str(mid + window/2)
                a[-1] = a[-1].rstrip()

                # drop anchors whose window would run off the chromosome start
                if int(a[3]) > 0:
                    print >> prep_anchor_out, '\t'.join(a)
            else:
                print >> sys.stderr, 'Unknown mode %s' % mode
                exit(1)

    prep_anchor_out.close()

    return prep_anchor_fd, prep_anchor_gff
Example #7
0
def hash_genes_repeats(gtf_file,
                       repeats_gff,
                       gene_key='gene_id',
                       add_star=True,
                       stranded=False):
    """Map each gene to the set of (repeat, family[, orient]) tuples it overlaps.

    Every gene in gtf_file appears in the result, even with no overlaps.
    When stranded is True the tuples carry a '+'/'-' relative orientation;
    when add_star is True, ('*', family) and ('*', '*') aggregates are added.
    """
    # pre-seed every gene with an empty set
    gene_repeats = {}
    for gtf_line in open(gtf_file):
        attrs = gtf_line.split('\t')[8]
        gene_repeats[gtf_kv(attrs)[gene_key]] = set()

    cmd = 'intersectBed -wo -a %s -b %s' % (gtf_file, repeats_gff)
    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
    for line in p.stdout:
        cols = line.split('\t')

        # gene attributes in column 8; repeat attributes in column 17
        gene_id = gtf_kv(cols[8])[gene_key]
        rep_kv = gtf_kv(cols[17])
        rep, fam = rep_kv['repeat'], rep_kv['family']

        if stranded:
            # '+' when the repeat lies on the gene's strand, '-' otherwise
            orient = '+' if cols[6] == cols[15] else '-'
            additions = [(rep, fam, orient)]
            if add_star:
                additions += [('*', fam, orient), ('*', '*', orient)]
        else:
            additions = [(rep, fam)]
            if add_star:
                additions += [('*', fam), ('*', '*')]

        for key in additions:
            gene_repeats[gene_id].add(key)

    p.communicate()

    return gene_repeats
Example #8
0
def initialize_coverage(anchor_gff, mode, anchor_is_gtf, bins):
    """Return {anchor_id: zeroed coverage list} for every anchor in anchor_gff.

    mode 'span' allocates 'bins' entries per anchor; mode 'mid' allocates one
    entry per bp of the feature.  Any other mode aborts the program.
    """
    print >> sys.stderr, 'Initializing coverage using anchor gff %s' % anchor_gff

    coverage = {}
    for line in open(anchor_gff):
        a = line.split('\t')

        chrom = a[0]
        start = int(a[3])
        end = int(a[4])

        # GTF anchors are keyed by transcript_id; plain GFF by position string
        if anchor_is_gtf:
            anchor_id = gff.gtf_kv(a[8])['transcript_id']
        else:
            anchor_id = '%s:%d-%d' % (chrom, start, end)

        # only the first line seen for an anchor allocates its vector
        if not anchor_id in coverage:
            if mode == 'span':
                coverage[anchor_id] = [0] * bins
            elif mode == 'mid':
                coverage[anchor_id] = [0] * (end - start + 1)
            else:
                print >> sys.stderr, 'Unknown mode %s' % mode
                exit(1)

    print >> sys.stderr, '%d anchors found.' % len(coverage)

    return coverage
Example #9
0
def main():
    """Collapse a GTF file to one 'gene' line per gene, printed to stdout.

    The gene span is the min start / max end over the gene's lines; the
    attribute column keeps gene_id plus a comma-joined list of its
    transcript_ids.
    """
    usage = "usage: %prog [options] <gtf_file>"
    parser = OptionParser(usage)
    # parser.add_option()
    (options, args) = parser.parse_args()

    gtf_file = args[0]

    # gene_id -> list of raw GTF lines
    genes = {}

    for line in open(gtf_file):
        a = line.split()
        # NOTE(review): assumes token 10 is the quoted gene_id ('"id";');
        # [1:-2] strips the quote and '";' -- verify against the GTF layout
        gene_id = a[9][1:-2]
        genes.setdefault(gene_id, []).append(line)

    for gene_id in genes:
        start = min([int(line.split()[3]) for line in genes[gene_id]])
        end = max([int(line.split()[4]) for line in genes[gene_id]])

        # chromosome and strand come from the gene's first line
        a = genes[gene_id][0].split("\t")
        kv = gff.gtf_kv(a[8])
        succinct_kv = {"gene_id": kv["gene_id"]}
        succinct_kv["transcript_id"] = ",".join(list(set([line.split()[11][1:-2] for line in genes[gene_id]])))

        d = [a[0], "gtf", "gene", str(start), str(end), ".", a[6], ".", gff.kv_gtf(succinct_kv)]
        print "\t".join(d)
Example #10
0
def main():
    """Filter a GTF down to each gene's single dominant isoform.

    map_genes chooses one transcript per gene from the FPKM tracking file;
    only GTF lines belonging to that transcript are printed.
    """
    usage = 'usage: %prog [options] <gtf> <fpkm tracking | diff>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='all_isoforms', default=False, action='store_true', help='Consider all isoforms. Default is to ignore bs ones')
    parser.add_option('-p', dest='pseudocount', default=0.125)
    parser.add_option('-r', dest='random_zeros', default=False, action='store_true', help='Randomly choose an isoform for zero FPKM genes [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide gtf file and fpkm tracking')
    else:
        gtf_file = args[0]
        fpkm_file = args[1]

    # gene_id -> chosen transcript_id
    gene_max_iso = map_genes(gtf_file, fpkm_file, options.pseudocount, options.all_isoforms, options.random_zeros)

    # filter gtf file: keep only lines of each gene's chosen isoform
    for line in open(gtf_file):
        a = line.split('\t')
        kv = gff.gtf_kv(a[8])
        gene_id = kv['gene_id']
        tid = kv['transcript_id']

        if gene_max_iso.get(gene_id,None) == tid:
            print line,
Example #11
0
def hash_repeat_family():
    """Map repeat name -> family using the hg19 RepeatMasker GFF in $MASK."""
    rm_gff = '%s/hg19.fa.out.tp.gff' % os.environ['MASK']
    repeat_family = {}
    for rm_line in open(rm_gff):
        attrs = gtf_kv(rm_line.split('\t')[8])
        repeat_family[attrs['repeat']] = attrs['family']
    return repeat_family
Example #12
0
def main():
    """Print GTF attribute values for the -k key(s), one row per record."""
    usage = 'usage: %prog [options] -k <key> <gtf file>'
    parser = OptionParser(usage)
    parser.add_option('-k', dest='key', help='Key to extract')
    parser.add_option('-l', dest='line_too', action='store_true', default=False, help='Print the line too [Default: %default]')
    (options,args) = parser.parse_args()

    # '-' means read the GTF from stdin
    if len(args) != 1:
        parser.error(usage)
    gtf_open = sys.stdin if args[0] == '-' else open(args[0])

    if not options.key:
        parser.error('Must provide key')
    keys = options.key.split(',')

    for line in gtf_open:
        # skip GFF header lines
        if line.startswith('##'):
            continue
        kv = gff.gtf_kv(line.split('\t')[8])
        key_str = '\t'.join([kv.get(k,'-') for k in keys])
        if options.line_too:
            print('%s\t%s' % (key_str,line))
        else:
            print(key_str)
Example #13
0
def main():
    """Map features in a transcript-coordinate GFF onto genome coordinates."""
    usage = 'usage: %prog [options] <transcript .gff>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='cgff_file', default='/Users/dk/research/common/data/lncrna/lnc_catalog.gtf', help='Gtf file mapping transcripts to chromosomes [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide gff file mapping features to transcripts')
    tgff_file = args[0]

    # build Transcript objects with their exon structure from the catalog GTF
    transcripts = {}
    for line in open(options.cgff_file):
        cols = line.split('\t')
        if cols[2] != 'exon':
            continue
        tid = gff.gtf_kv(cols[8])['transcript_id']
        if tid not in transcripts:
            transcripts[tid] = Transcript(tid, cols[0], cols[6])
        transcripts[tid].add_exon(int(cols[3]), int(cols[4]))

    # map each feature through its transcript
    for line in open(tgff_file):
        feat = Feature(line)
        map_feature(transcripts[feat.trans_id], feat)
Example #14
0
def main():
    """Print GTF lines whose gene is expressed above a threshold in a cell type."""
    usage = 'usage: %prog [options] <gtf file> <cell type>'
    parser = OptionParser(usage)
    parser.add_option('-t', dest='expr_t', type='float', default=.1, help='Minimum allowed fpkm value')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide gtf file and cell type')
    else:
        gtf_file = args[0]
        cell_type = args[1]

    # get expression data (default fpkm_tracking location)
    cuff = cufflinks.fpkm_tracking()

    # find cell type experiment index (first match wins)
    cell_indexes = [i for i in range(len(cuff.experiments)) if cuff.experiments[i]==cell_type]
    if len(cell_indexes) == 0:
        parser.error('Cell type %s does not match any quantified experiments' % cell_type)
    else:
        cell_i = cell_indexes[0]

    # parse gtf file, printing lines whose gene passes the FPKM threshold
    for line in open(gtf_file):
        a = line.split('\t')
        gene_id = gff.gtf_kv(a[8])['gene_id']
        expr_vec = cuff.gene_expr(gene_id)
        if expr_vec[cell_i] > options.expr_t:
            print line,
Example #15
0
def header_gff(header, seq, gff_file, options):
    """Print FASTA of GFF feature sequences on chromosome 'header'.

    Features sharing a header id are concatenated in file order ('+' strand
    appends; '-' strand prepends the reverse complement).  'seq' is the full
    chromosome sequence; GFF coordinates are 1-based inclusive.
    """
    header_seqs = {}
    for line in open(gff_file):
        a = line.split('\t')
        a[-1] = a[-1].rstrip()
        # optionally restrict to exon features; always restrict to this chrom
        if (not options.exon or a[2] == 'exon') and a[0] == header:
            kv = gff.gtf_kv(a[8])
            #head_id = kv.get(options.header_key,a[8]+'_'+a[0]+':'+a[3]+'-'+a[4])
            # fall back to the raw attribute string when header_key is absent
            head_id = kv.get(options.header_key,a[8])
            if options.gene_too:
                head_id += ' gene=%s' % kv.get('gene_id','')

            feat_start = int(a[3])
            feat_end = int(a[4])

            # GFF is 1-based inclusive; Python slicing is 0-based half-open
            feat_seq = seq[feat_start-1:feat_end]
            if a[6] == '+':
                header_seqs[head_id] = header_seqs.get(head_id,'') + feat_seq
            else:
                header_seqs[head_id] = dna.rc(feat_seq) + header_seqs.get(head_id,'')

    for header in header_seqs:
        print '>%s' % header
        if options.split_lines:
            # wrap sequence at 60 characters per FASTA line
            i = 0
            while i < len(header_seqs[header]):
                print header_seqs[header][i:i+60]
                i += 60
        else:
            print header_seqs[header]
Example #16
0
def hash_repeat_family():
    """Map repeat name -> repeat family from the hg19 RepeatMasker GFF.

    Reads $MASK/hg19.fa.out.tp.gff; requires the MASK environment variable.
    """
    repeat_family = {}
    for line in open('%s/hg19.fa.out.tp.gff' % os.environ['MASK']):
        a = line.split('\t')
        kv = gtf_kv(a[8])
        repeat_family[kv['repeat']] = kv['family']
    return repeat_family
Example #17
0
def hash_te(te_gff_in):
    """Sum bp covered by each TE, keyed by (repeat, family) plus aggregates.

    te_gff_in is an iterable of GFF lines.  Besides the exact (repeat, family)
    key, ('*', family) and ('*', '*') totals are kept, plus extra buckets for
    several repeat-name prefixes of interest.
    """
    te_bp = {}
    for te_line in te_gff_in:
        cols = te_line.split('\t')
        kv = gff.gtf_kv(cols[8])
        rep, family = kv['repeat'], kv['family']
        length = int(cols[4]) - int(cols[3]) + 1

        # always count the repeat itself plus the '*' aggregates
        keys = [(rep, family), ('*', family), ('*', '*')]

        # extra aggregate buckets keyed by repeat-name prefix
        if rep.startswith('LTR'):
            keys.append(('LTR*', family))
        if rep.startswith('LTR12'):
            keys.append(('LTR12*', family))
        if rep.startswith('LTR7') and (len(rep) < 5 or rep[4].isalpha()):
            keys.append(('LTR7*', family))
        if rep.startswith('THE1') and len(rep) == 5:
            keys.append(('THE1*', family))
        if rep.startswith('MER61') and len(rep) == 6:
            keys.append(('MER61*', family))
        if rep.startswith('L1PA'):
            keys.append(('L1PA*', family))

        for k in keys:
            te_bp[k] = te_bp.get(k, 0) + length

    return te_bp
Example #18
0
def main():
    """Collapse a GTF file to one 'gene' line per gene, printed to stdout.

    Span is the min start / max end across the gene's lines; attributes keep
    gene_id plus a comma-joined set of its transcript_ids.
    """
    usage = 'usage: %prog [options] <gtf_file>'
    parser = OptionParser(usage)
    #parser.add_option()
    (options, args) = parser.parse_args()

    gtf_file = args[0]

    # gene_id -> list of raw GTF lines
    genes = {}

    for line in open(gtf_file):
        a = line.split()
        # NOTE(review): assumes token 10 is the quoted gene_id ('"id";');
        # [1:-2] strips the quote and '";' -- verify against the GTF layout
        gene_id = a[9][1:-2]
        genes.setdefault(gene_id, []).append(line)

    for gene_id in genes:
        start = min([int(line.split()[3]) for line in genes[gene_id]])
        end = max([int(line.split()[4]) for line in genes[gene_id]])

        # chromosome and strand come from the gene's first line
        a = genes[gene_id][0].split('\t')
        kv = gff.gtf_kv(a[8])
        succinct_kv = {'gene_id': kv['gene_id']}
        succinct_kv['transcript_id'] = ','.join(
            list(set([line.split()[11][1:-2] for line in genes[gene_id]])))

        d = [
            a[0], 'gtf', 'gene',
            str(start),
            str(end), '.', a[6], '.',
            gff.kv_gtf(succinct_kv)
        ]
        print '\t'.join(d)
Example #19
0
def main():
    """Print selected GTF attribute values (-k key[,key...]) per record."""
    usage = 'usage: %prog [options] -k <key> <gtf file>'
    parser = OptionParser(usage)
    parser.add_option('-k', dest='key', help='Key to extract')
    parser.add_option('-l', dest='line_too', action='store_true', default=False,
                      help='Print the line too [Default: %default]')
    options, args = parser.parse_args()

    # a single '-' argument means read from stdin
    if len(args) != 1:
        parser.error(usage)
    gtf_open = sys.stdin if args[0] == '-' else open(args[0])

    if not options.key:
        parser.error('Must provide key')
    keys = options.key.split(',')

    for line in gtf_open:
        # '##' lines are GFF headers
        if line.startswith('##'):
            continue
        kv = gff.gtf_kv(line.split('\t')[8])
        values = '\t'.join([kv.get(k, '-') for k in keys])
        if options.line_too:
            print('%s\t%s' % (values, line))
        else:
            print(values)
Example #20
0
def gff_df(gff_file, gene_index):
    """Read GFF w/ keys into DataFrame.

    Parses gff_file into a DataFrame with chr/start/end/strand columns plus
    one column per GTF attribute key, indexed by the attribute named by
    'gene_index'.
    """
    chrms = []
    starts = []
    ends = []
    strands = []
    gtf_lists = {}
    for line in open(gff_file):
        a = line.split('\t')
        chrms.append(a[0])
        starts.append(int(a[3]))
        # bug fix: the GFF end coordinate is column 5 (a[4]); previously a[3]
        # was appended again, making end == start for every row
        ends.append(int(a[4]))
        # bug fix: strand is column 7 (a[6]); a[5] is the score column
        strands.append(a[6])
        for kv in gff.gtf_kv(a[-1]).items():
            gtf_lists.setdefault(kv[0], []).append(kv[1])

    df = pd.DataFrame({
        'chr': chrms,
        'start': starts,
        'end': ends,
        'strand': strands
    })

    # one column per GTF attribute key
    for k, kl in gtf_lists.items():
        df[k] = kl

    df.set_index(gene_index, inplace=True)

    return df
Example #21
0
def initialize_coverage(anchor_gff, mode, anchor_is_gtf, bins):
    """Allocate a zeroed coverage vector for every anchor in anchor_gff."""
    print >> sys.stderr, 'Initializing coverage using anchor gff %s' % anchor_gff

    coverage = {}
    for gff_line in open(anchor_gff):
        cols = gff_line.split('\t')
        chrom, start, end = cols[0], int(cols[3]), int(cols[4])

        # GTF anchors are identified by transcript; others by position
        if anchor_is_gtf:
            anchor_id = gff.gtf_kv(cols[8])['transcript_id']
        else:
            anchor_id = '%s:%d-%d' % (chrom, start, end)

        # only the first line for an anchor allocates its vector
        if anchor_id in coverage:
            continue
        if mode == 'span':
            # fixed number of bins across the anchor
            coverage[anchor_id] = [0]*bins
        elif mode == 'mid':
            # one counter per bp of the feature
            coverage[anchor_id] = [0]*(end-start+1)
        else:
            print >> sys.stderr, 'Unknown mode %s' % mode
            exit(1)

    print >> sys.stderr, '%d anchors found.' % len(coverage)

    return coverage
Example #22
0
def header_gff(header, seq, gff_file, options):
    """Print FASTA of GFF feature sequences on chromosome 'header'.

    Features sharing a header id are concatenated in file order ('+' strand
    appends; '-' strand prepends the reverse complement).  'seq' is the full
    chromosome sequence; GFF coordinates are 1-based inclusive.
    """
    header_seqs = {}
    for line in open(gff_file):
        a = line.split('\t')
        a[-1] = a[-1].rstrip()
        # optionally restrict to exons; always restrict to this chromosome
        if (not options.exon or a[2] == 'exon') and a[0] == header:
            # tolerate attribute columns that aren't GTF key-value formatted
            try:
                kv = gff.gtf_kv(a[8])
            except:
                kv = {}

            # fall back to a positional id when header_key is absent
            head_id = kv.get(options.header_key,a[0]+':'+a[3]+'-'+a[4])
            #head_id = kv.get(options.header_key,a[8])

            if options.gene_too:
                head_id += ' gene=%s' % kv.get('gene_id','')

            feat_start = int(a[3])
            feat_end = int(a[4])

            # GFF is 1-based inclusive; Python slicing is 0-based half-open
            feat_seq = seq[feat_start-1:feat_end]
            if a[6] == '+':
                header_seqs[head_id] = header_seqs.get(head_id,'') + feat_seq
            else:
                header_seqs[head_id] = dna.rc(feat_seq) + header_seqs.get(head_id,'')

    for header in header_seqs:
        print '>%s' % header
        if options.split_lines:
            # wrap at 60 characters per FASTA line
            i = 0
            while i < len(header_seqs[header]):
                print header_seqs[header][i:i+60]
                i += 60
        else:
            print header_seqs[header]
Example #23
0
def main():
    """Print Cufflinks FPKM expression for a gene or transcript id.

    With -t, reports isoform expression; an XLOC gene id is expanded to all
    of its transcripts via the catalog GTF.  Otherwise reports gene
    expression, translating a transcript id to its gene via gff.t2g.
    """
    usage = 'usage: %prog [options] <gene/transcript id>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='cuff_dir', default='%s/research/common/data/lncrna'%os.environ['HOME'], help='Cufflinks output directory with .fpkm_tracking files [Default: %default]')
    parser.add_option('-l', dest='lnc_gtf', default='%s/research/common/data/lncrna/lnc_catalog.gtf'%os.environ['HOME'], help='lncRNA catalog gtf file [Default: %default]')
    parser.add_option('-t', dest='transcript_expr', default=False, action='store_true', help='Return transcript expression rather than gene [Default: %default]')
    (options,args) = parser.parse_args()

    if options.transcript_expr:
        cuff = cufflinks.fpkm_tracking('%s/isoforms.fpkm_tracking' % options.cuff_dir)

        # XLOC ids are genes: gather all of the gene's transcript ids
        if args[0].find('XLOC') != -1:
            trans_ids = set()
            for line in open(options.lnc_gtf):
                a = line.split('\t')
                kv = gff.gtf_kv(a[8])
                if kv['gene_id'] == args[0]:
                    trans_ids.add(kv['transcript_id'])
        else:
            trans_ids = [args[0]]

        for trans_id in trans_ids:
            print '%s:' % trans_id
            cuff.gene_expr_print(trans_id)

    else:
        cuff = cufflinks.fpkm_tracking('%s/genes.fpkm_tracking' % options.cuff_dir)

        # translate a transcript id to its gene id when necessary
        if args[0].find('XLOC') != -1:
            gene_id = args[0]
        else:
            t2g = gff.t2g(options.lnc_gtf)
            gene_id = t2g[args[0]]

        cuff.gene_expr_print(gene_id)
Example #24
0
def main():
    """Map features in transcript coordinates onto the genome.

    Builds Transcript objects (with exon structure) from the catalog GTF,
    then maps each feature line of the input GFF through its transcript.
    """
    usage = 'usage: %prog [options] <transcript .gff>'
    parser = OptionParser(usage)
    parser.add_option(
        '-c',
        dest='cgff_file',
        default='/Users/dk/research/common/data/lncrna/lnc_catalog.gtf',
        help='Gtf file mapping transcripts to chromosomes [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide gff file mapping features to transcripts')
    else:
        tgff_file = args[0]

    # get transcript information: id -> Transcript with its exons
    transcripts = {}
    for line in open(options.cgff_file):
        a = line.split('\t')
        if a[2] == 'exon':
            trans_id = gff.gtf_kv(a[8])['transcript_id']
            if not trans_id in transcripts:
                transcripts[trans_id] = Transcript(trans_id, a[0], a[6])
            transcripts[trans_id].add_exon(int(a[3]), int(a[4]))

    # process transcript features
    for line in open(tgff_file):
        feat = Feature(line)
        map_feature(transcripts[feat.trans_id], feat)
Example #25
0
def main():
    """Convert a TransMap BED12 file to GTF exon lines on stdout.

    Block coordinates become exon features; exons closer than merge_dist are
    merged.  Transcripts mapping multiple times get _v2, _v3... suffixes on
    both transcript and gene ids.  Gene ids come from the original GTF (-g)
    when provided, otherwise the transcript id is reused.
    """
    usage = 'usage: %prog [options] <bed file>'
    parser = OptionParser(usage)
    parser.add_option('-g', dest='orig_gtf', help='The original gtf file of the TransMap\'d genes to be used to transfer gene id\'s')
    parser.add_option('-m', dest='merge_dist', type='int', default=30, help='Minimum distance two exons can be apart for them to be merged [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide bed file')
    else:
        bed_file = args[0]

    # map transcript id's to gene id's if possible
    t2g = {}
    if options.orig_gtf:
        for line in open(options.orig_gtf):
            a = line.split('\t')
            kv = gff.gtf_kv(a[8])
            t2g[kv['transcript_id']] = kv['gene_id']

    # hash to disambiguate multi-mapping transcripts
    transcript_maps = {}

    for line in open(bed_file):
        a = line.split('\t')
        a[-1] = a[-1].rstrip()

        tid = a[3]
        gid = t2g.get(a[3],a[3])

        # suffix repeated mappings so ids stay unique
        transcript_maps[tid] = transcript_maps.get(tid,0) + 1
        if transcript_maps[tid] > 1:
            gid += '_v%d' % transcript_maps[tid]
            tid += '_v%d' % transcript_maps[tid]

        gene_start = int(a[1])
        gene_end = int(a[2])

        # BED12 block sizes/starts (starts are relative to gene_start)
        block_sizes = [int(x) for x in a[10].split(',') if x]
        block_starts = [int(x) for x in a[11].split(',') if x]

        exon_cols = []
        last_end = None
        exon_num = 1
        for i in range(len(block_starts)):
            # convert 0-based BED block coords to 1-based GTF coords
            exon_start = gene_start+1+block_starts[i]
            exon_end = gene_start+1+block_starts[i]+block_sizes[i]-1

            if last_end and last_end+options.merge_dist >= exon_start:
                # merge w/ last
                exon_cols[-1][4] = str(exon_end)
            else:
                exon_cols.append([a[0], 'TransMap', 'exon', str(exon_start), str(exon_end), '.', a[5], '.', 'gene_id "%s"; transcript_id "%s"; exon_number "%d"' % (gid,tid,exon_num)])
                exon_num += 1

            last_end = exon_end

        for cols in exon_cols:
            print '\t'.join(cols)
Example #26
0
def te_target_size_bed(te_gff, ref_bed, read_len):
    """Compute the mappable target size of each TE class within BED regions.

    Intersects ref_bed with te_gff, then for each BED region merges the TE
    intervals (each extended read_len-1 bp upstream, clipped at the region's
    start) and sums merged interval lengths per (repeat, family) key,
    including ('*', family) and ('*', '*') aggregates.
    Returns {te_key: total bp}.
    """
    # hash TE intervals by BED region
    bed_te_intervals = {}
    p = subprocess.Popen('intersectBed -wo -a %s -b %s' % (ref_bed, te_gff),
                         shell=True,
                         stdout=subprocess.PIPE)
    for line in p.stdout:
        a = line.split('\t')

        # BED region; (chrom, start) identifies it
        bchrom = a[0]
        bstart = int(a[1])
        bend = int(a[2])
        bid = (bchrom, bstart)

        # presumably the TE's GFF attribute column in the -wo output --
        # TODO confirm the BED's column count matches this offset
        rep_kv = gff.gtf_kv(a[11])
        rep = rep_kv['repeat']
        fam = rep_kv['family']

        tstart = int(a[6])
        tend = int(a[7])

        # clip the TE interval to the BED region
        ostart = max(bstart, tstart)
        oend = min(bend, tend)

        if not bid in bed_te_intervals:
            bed_te_intervals[bid] = {}
        bed_te_intervals[bid].setdefault((rep, fam), []).append((ostart, oend))
        bed_te_intervals[bid].setdefault(('*', fam), []).append((ostart, oend))
        bed_te_intervals[bid].setdefault(('*', '*'), []).append((ostart, oend))

    p.communicate()

    target_size = {}
    for bid in bed_te_intervals:
        bchrom, bstart = bid

        for te in bed_te_intervals[bid]:
            bt_intervals = bed_te_intervals[bid][te]
            bt_intervals.sort()

            # merge intervals, limited at the start by the BED region's start;
            # each interval is extended read_len-1 bp left so any read
            # starting there still overlaps the TE
            merged_intervals = [(max(bstart,
                                     bt_intervals[0][0] - read_len + 1),
                                 bt_intervals[0][1])]
            for i in range(1, len(bt_intervals)):
                start1, end1 = merged_intervals[-1]
                start2, end2 = bt_intervals[i]

                if end1 + 1 < start2 - read_len + 1:
                    merged_intervals.append((start2 - read_len + 1, end2))
                else:
                    merged_intervals[-1] = (start1, end2)

            # sum merged interval lengths into the per-TE total
            target_size[te] = target_size.get(te, 0) + sum(
                [e - s + 1 for (s, e) in merged_intervals])

    return target_size
Example #27
0
def hash_genes_repeats_num(gtf_file, repeats_gff, gene_key='gene_id', add_star=True, stranded=False):
    """Count gene/repeat overlap events.

    Returns {gene_id: {(repeat, family[, orient]): count}}.  When stranded is
    True, keys carry a '+'/'-' relative orientation; when add_star is True,
    ('*', family) and ('*', '*') aggregate counts are kept as well.
    """
    gene_repeat_num = {}

    cmd = 'intersectBed -wo -a %s -b %s' % (gtf_file, repeats_gff)
    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
    for line in p.stdout:
        cols = line.split('\t')

        # gene attributes in column 8; repeat attributes in column 17
        gene_id = gtf_kv(cols[8])[gene_key]
        rep_kv = gtf_kv(cols[17])
        rep, fam = rep_kv['repeat'], rep_kv['family']

        if gene_id not in gene_repeat_num:
            gene_repeat_num[gene_id] = {}
        counts = gene_repeat_num[gene_id]

        if stranded:
            # '+' when the repeat matches the gene's strand
            orient = '+' if cols[6] == cols[15] else '-'
            keys = [(rep, fam, orient)]
            if add_star:
                keys += [('*', fam, orient), ('*', '*', orient)]
        else:
            keys = [(rep, fam)]
            if add_star:
                keys += [('*', fam), ('*', '*')]

        for k in keys:
            counts[k] = counts.get(k, 0) + 1

    p.communicate()

    return gene_repeat_num
Example #28
0
def count_te_fragments(bam_file, te_gff, strand_split=False):
    '''Count BAM fragments overlapping TE annotations.

    Returns (num_fragments, te_fragments): the total weighted fragment count
    in the BAM, and {(repeat, family): weighted count}.  With strand_split,
    repeat keys carry a '+'/'-' suffix for read/TE strand agreement.
    '''
    # count fragments and hash multi-mappers
    num_fragments = 0
    multi_maps = {}
    paired_poll = {False:0, True:0}
    for aligned_read in pysam.Samfile(bam_file, 'rb'):
        # weight by 1/NH so a multi-mapped read sums to one; mates count half each
        if aligned_read.is_paired:
            num_fragments += 0.5/aligned_read.opt('NH')
        else:
            num_fragments += 1.0/aligned_read.opt('NH')

        if aligned_read.opt('NH') > 1:
            multi_maps[aligned_read.qname] = aligned_read.opt('NH')

        paired_poll[aligned_read.is_paired] += 1

    # guess paired-ness by majority vote
    if paired_poll[True] > 0 and paired_poll[False] > 0:
        print >> sys.stderr, 'Paired-ness of the reads is ambiguous'
    if paired_poll[True] > paired_poll[False]:
        is_paired = True
    else:
        is_paired = False

    # hash read counts by TE family
    te_fragments = {}
    proc = subprocess.Popen('intersectBed -split -wo -bed -abam %s -b %s' % (bam_file, te_gff), shell=True, stdout=subprocess.PIPE)
    for line in proc.stdout:
        a = line.split('\t')
        # a[20] is presumably the TE GFF attribute column after the BED12 read columns -- confirm
        te_kv = gff.gtf_kv(a[20])

        rep = te_kv['repeat']
        fam = te_kv['family']

        # same 1/NH (and pair) weighting as the totals above; a[3] is the read name
        if is_paired:
            read_inc = 0.5/multi_maps.get(a[3],1.0)
        else:
            read_inc = 1.0/multi_maps.get(a[3],1.0)

        rep_star = '*'
        if strand_split:
            # suffix '+' when read and TE strands agree, '-' otherwise
            rstrand = a[5]
            tstrand = a[18]
            if rstrand == tstrand:
                rep += '+'
                rep_star += '+'
            else:
                rep += '-'
                rep_star += '-'

        # accumulate for the repeat, its family aggregate, and the overall aggregate
        te_fragments[(rep,fam)] = te_fragments.get((rep,fam),0.0) + read_inc
        te_fragments[(rep_star,fam)] = te_fragments.get((rep_star,fam),0.0) + read_inc
        te_fragments[(rep_star,'*')] = te_fragments.get((rep_star,'*'),0.0) + read_inc

    proc.communicate()

    return num_fragments, te_fragments
Example #29
0
def te_target_size(te_gff, read_len):
    '''Compute the mappable target size in bp for each TE (repeat, family).

    Streams te_gff (assumed position-sorted -- confirm), merging intervals
    that fall within read_len of each other, padding each merged interval by
    read_len for reads overlapping its edge.  The ('*', family) and
    ('*', '*') keys aggregate by family and overall.
    '''
    te_bp = {}
    active_te_intervals = {}

    for line in open(te_gff):
        a = line.split('\t')

        kv = gff.gtf_kv(a[8])
        rep = kv['repeat']
        fam = kv['family']

        chrom = a[0]
        start = int(a[3])
        end = int(a[4])

        # close intervals that this line can no longer extend;
        # list() makes deletion during iteration safe (required on Python 3)
        for arep, afam in list(active_te_intervals.keys()):
            achrom, astart, aend = active_te_intervals[(arep, afam)]

            if achrom != chrom or aend + read_len < start:
                # add, padded by read_len for edge-overlapping reads
                te_bp[(arep, afam)] = te_bp.get(
                    (arep, afam), 0) + aend - astart + 1 + read_len
                # close
                del active_te_intervals[(arep, afam)]

        # update/add the TE itself plus its family and overall aggregates
        for te_key in [(rep, fam), ('*', fam), ('*', '*')]:
            if te_key in active_te_intervals:
                achrom, astart, aend = active_te_intervals[te_key]
                active_te_intervals[te_key] = (chrom, min(astart, start),
                                               max(aend, end))
            else:
                active_te_intervals[te_key] = (chrom, start, end)

    # close remaining
    for arep, afam in active_te_intervals.keys():
        achrom, astart, aend = active_te_intervals[(arep, afam)]

        # add
        te_bp[(arep, afam)] = te_bp.get(
            (arep, afam), 0) + aend - astart + 1 + read_len

    return te_bp
Example #30
0
def count_te_fragments(bam_file, te_gff, strand_split):
    '''Count BAM fragments per TE instance.

    Returns (num_fragments, te_fragments): total weighted fragments in the
    BAM, and {(te_chrom, te_start[, orient]): weighted count} keyed per TE
    instance.  With strand_split, orient is '+' when the read and TE strands
    agree, '-' otherwise.
    '''
    # count fragments and hash multi-mappers
    num_fragments = 0
    multi_maps = {}
    paired_poll = {False: 0, True: 0}
    for aligned_read in pysam.Samfile(bam_file, 'rb'):
        # weight by 1/NH so a multi-mapped read sums to one; mates count half each
        if aligned_read.is_paired:
            num_fragments += 0.5 / aligned_read.opt('NH')
        else:
            num_fragments += 1.0 / aligned_read.opt('NH')

        if aligned_read.opt('NH') > 1:
            multi_maps[aligned_read.qname] = aligned_read.opt('NH')

        paired_poll[aligned_read.is_paired] += 1

    # guess paired-ness by majority vote
    if paired_poll[True] > 0 and paired_poll[False] > 0:
        print >> sys.stderr, 'Paired-ness of the reads is ambiguous'
    if paired_poll[True] > paired_poll[False]:
        is_paired = True
    else:
        is_paired = False

    # hash read counts by TE family
    te_fragments = {}
    proc = subprocess.Popen('intersectBed -split -wo -bed -abam %s -b %s' %
                            (bam_file, te_gff),
                            shell=True,
                            stdout=subprocess.PIPE)
    for line in proc.stdout:
        a = line.split('\t')

        # same 1/NH (and pair) weighting as the totals above; a[3] is the read name
        if is_paired:
            read_inc = 0.5 / multi_maps.get(a[3], 1.0)
        else:
            read_inc = 1.0 / multi_maps.get(a[3], 1.0)

        # TE columns presumably follow the BED12 read columns -- confirm
        te_chrom = a[12]
        te_start = int(a[15])
        # NOTE(review): te_kv is parsed but never used in this function
        te_kv = gff.gtf_kv(a[20])

        if strand_split:
            rstrand = a[5]
            tstrand = a[18]
            if rstrand == tstrand:
                te_key = (te_chrom, te_start, '+')
            else:
                te_key = (te_chrom, te_start, '-')
        else:
            te_key = (te_chrom, te_start)

        te_fragments[te_key] = te_fragments.get(te_key, 0.0) + read_inc

    proc.communicate()

    return num_fragments, te_fragments
Example #31
0
def main():
    '''Print a gene homology table between two genomes.

    Transmaps gtf_from onto the target genome via chain/net files, then
    intersects (same strand) with gtf_to; prints each gene_id in gtf_from
    with its space-separated homologous gene_id(s) in gtf_to, or '-'.
    '''
    usage = 'usage: %prog [options] <chain_file> <net_file> <gtf_from> <gtf_to>'
    parser = OptionParser(usage)
    #parser.add_option()
    (options,args) = parser.parse_args()

    if len(args) != 4:
        parser.error('Must provide chain file and two GTF files')
    else:
        chain_file = args[0]
        net_file = args[1]
        gtf_from = args[2]
        gtf_to = args[3]

    # transmap to new genome
    from_map_gtf_fd, from_map_gtf_file = tempfile.mkstemp()
    subprocess.call('chain_map.py -k gene_id -n %s %s %s > %s' % (net_file,chain_file,gtf_from,from_map_gtf_file), shell=True)

    # intersect w/ gtf_to, requiring same strand (-s)
    homologues = {}
    p = subprocess.Popen('intersectBed -wo -s -a %s -b %s' % (from_map_gtf_file,gtf_to), shell=True, stdout=subprocess.PIPE)
    for line in p.stdout:
        a = line.split('\t')

        kv_to = gff.gtf_kv(a[17])

        # mapped gene id taken as the 2nd ';'-separated field of the a-file
        # attributes (presumably chain_map.py's output format -- confirm)
        gid_from = a[8].split(';')[1].strip()
        gid_to = kv_to['gene_id']

        homologues.setdefault(gid_from,set()).add(gid_to)
    p.communicate()

    # find all genes
    genes = set()
    for line in open(gtf_from):
        a = line.split('\t')
        genes.add(gff.gtf_kv(a[8])['gene_id'])

    # print table: gene_id <tab> homologues (or '-')
    for g in genes:
        print '%s\t%s' % (g,' '.join(homologues.get(g,['-'])))

    os.close(from_map_gtf_fd)
    os.remove(from_map_gtf_file)
Example #32
0
def intersect_gene_te(gtf_file, upstream, downstream):
    """Hash overlapping transposon nt per transcript promoter.

    Builds promoter regions with gff.promoters, intersects them with the
    hg19 repeats GFF, and returns {transcript_id: {(repeat, family): nt}};
    ('*', family) and ('*', '*') keys aggregate by family and overall.
    Transcripts with no repeat overlap get the sentinel key ('n', 'n').
    """
    import os  # local import: needed for the temp-file cleanup below

    # focus on promoter
    tmp_fd, tmp_file = tempfile.mkstemp()
    gff.promoters(gtf_file, upstream, downstream, tmp_file)

    # intersect genes w/ repeats
    # hash transposon nt by gene
    gene_trans = {}
    p = subprocess.Popen('intersectBed -wo -a %s -b %s' %
                         (tmp_file, hg19_reps_gff),
                         shell=True,
                         stdout=subprocess.PIPE)
    for line in p.stdout:
        a = line.split('\t')

        # get names
        gene = gff.gtf_kv(a[8])['transcript_id']
        rep_kv = gff.gtf_kv(a[17])
        rep = rep_kv['repeat']
        fam = rep_kv['family']

        # add overlap nt (final intersectBed -wo column) for the repeat
        # and for its family/overall aggregates
        if gene not in gene_trans:
            gene_trans[gene] = {}
        nt = int(a[18])
        for te_key in [(rep, fam), ('*', fam), ('*', '*')]:
            gene_trans[gene][te_key] = gene_trans[gene].get(te_key, 0) + nt

    p.communicate()

    # clean up the temporary promoter file (previously leaked)
    os.close(tmp_fd)
    os.remove(tmp_file)

    # create a fake family for dTE-lncRNAs
    for line in open(gtf_file):
        a = line.split('\t')
        tid = gff.gtf_kv(a[8])['transcript_id']
        if tid not in gene_trans:
            gene_trans[tid] = {('n', 'n'): 1}

    return gene_trans
Example #33
0
def hash_genes_repeats(gtf_file, repeats_gff, gene_key='gene_id', add_star=True, stranded=False):
    '''Collect the set of repeats overlapping each gene.

    Returns {gene_id: set of (repeat, family[, orient])} with every gene in
    gtf_file present (possibly with an empty set).  With add_star, the
    ('*', family) and ('*', '*') keys aggregate by family and overall.
    With stranded, keys carry '+' when gene and TE strands agree, else '-'.
    '''
    # pre-initialize every gene so non-overlapped genes map to an empty set
    gene_repeats = {}
    for line in open(gtf_file):
        a = line.split('\t')
        gene_repeats[gtf_kv(a[8])[gene_key]] = set()

    p = subprocess.Popen('intersectBed -wo -a %s -b %s' % (gtf_file, repeats_gff), shell=True, stdout=subprocess.PIPE)
    for line in p.stdout:
        a = line.split('\t')

        # get names from the two GTF attribute columns
        gene_id = gtf_kv(a[8])[gene_key]
        rep_kv = gtf_kv(a[17])
        rep = rep_kv['repeat']
        fam = rep_kv['family']

        # keys to record: the repeat itself plus optional aggregates
        te_keys = [(rep,fam)]
        if add_star:
            te_keys += [('*',fam), ('*','*')]

        if stranded:
            # relative orientation: '+' if gene strand matches TE strand
            orient = '+' if a[6] == a[15] else '-'
            te_keys = [k + (orient,) for k in te_keys]

        for k in te_keys:
            gene_repeats[gene_id].add(k)

    p.communicate()

    return gene_repeats
Example #34
0
def make_te_read_fastas(te_gff, bam_file, read_tes, out_dir, stranded, max_reads):
    '''Write reads overlapping each Dfam TE to fwd/rev fasta files in out_dir.

    read_tes maps read name -> {dfam_te: (read_strand, te_strand)}, with the
    value set to None once that read/TE pair has been printed.  Returns
    {(dfam_te, orient): renormalization factor} for TEs with >10 reads,
    accounting for the max_reads output cap.
    '''
    # open TE read fasta files (fwd/rev pair per Dfam TE)
    te_fastas = {}
    for line in open(te_gff):
        a = line.split('\t')
        dfam_te = gff.gtf_kv(a[8])['dfam']
        if not (dfam_te,'fwd') in te_fastas:
            te_fastas[(dfam_te,'fwd')] = open('%s/%s_fwd.fa' % (out_dir,dfam_te), 'w')
            te_fastas[(dfam_te,'rev')] = open('%s/%s_rev.fa' % (out_dir,dfam_te), 'w')

    # initialize counters for total reads
    te_totals = {}
    for dfam_te, orient in te_fastas:
        te_totals[dfam_te, orient] = 0

    # print reads to fasta files
    for aligned_read in pysam.Samfile(bam_file, 'rb'):
        this_read_tes = read_tes.get(aligned_read.qname,{})

        for dfam_te in this_read_tes.keys():
            # None marks a read/TE pair that was already printed
            if this_read_tes[dfam_te] != None:
                (rstrand, tstrand) = this_read_tes[dfam_te]

                # only print if we match the read strand
                if (aligned_read.is_reverse and rstrand == '-') or (not aligned_read.is_reverse and rstrand == '+'):
                    # TE determines reversal
                    if tstrand == '+':
                        rseq = aligned_read.seq
                    else:
                        rseq = dna.rc(aligned_read.seq)

                    # count, and print; 'fwd' holds sense (or unstranded) reads
                    if not stranded or rstrand == tstrand:
                        te_totals[(dfam_te,'fwd')] += 1
                        if te_totals[(dfam_te,'fwd')] < max_reads:
                            print >> te_fastas[(dfam_te,'fwd')], '>%s\n%s' % (aligned_read.qname,rseq)
                    else:
                        te_totals[(dfam_te,'rev')] += 1
                        if te_totals[(dfam_te,'rev')] < max_reads:
                            print >> te_fastas[(dfam_te,'rev')], '>%s\n%s' % (aligned_read.qname,rseq)

                    # specify printed
                    this_read_tes[dfam_te] = None

    # post-process fasta files
    te_renorm = {}
    for dfam_te, orient in te_fastas:
        # close
        te_fastas[(dfam_te, orient)].close()

        # return renormalization factors (only for TEs with enough reads)
        if te_totals[(dfam_te,orient)] > 10:
            te_renorm[(dfam_te,orient)] = max(1.0, te_totals[(dfam_te,orient)]/float(max_reads))

    return te_renorm
Example #35
0
    def __init__(self, exon_gtf, promoter_length):
        # parse a single exon GTF line
        a = exon_gtf.split('\t')
        a[-1] = a[-1].rstrip()

        # attributes (column 9), chromosome, and strand of the exon
        self.gtf_kv = gff.gtf_kv(a[8])
        self.chr = a[0]
        self.strand = a[6]

        # promoter start upstream of the exon's 5' end:
        # '+' strand: promoter_length before the GTF start, clamped at 0;
        # '-' strand: the GTF end (presumably extended elsewhere -- confirm)
        if self.strand == '+':
            self.start = max(0, int(a[3]) - promoter_length)
        else:
            self.start = int(a[4])
Example #36
0
def te_target_size_bed(te_gff, ref_bed, read_len):
    '''Compute mappable target size per TE key within BED regions.

    Intersects ref_bed with te_gff, merges each BED region's TE intervals
    (extending starts by read_len-1 for reads overlapping the left edge,
    clamped to the region start), and returns {(repeat, family): total bp};
    ('*', family) and ('*', '*') keys aggregate by family and overall.
    '''
    # hash TE intervals by BED region
    bed_te_intervals = {}
    p = subprocess.Popen('intersectBed -wo -a %s -b %s' % (ref_bed, te_gff), shell=True, stdout=subprocess.PIPE)
    for line in p.stdout:
        a = line.split('\t')

        # BED region identified by (chrom, start)
        bchrom = a[0]
        bstart = int(a[1])
        bend = int(a[2])
        bid = (bchrom,bstart)

        # a[11] is presumably the TE GFF attribute column after the BED columns -- confirm
        rep_kv = gff.gtf_kv(a[11])
        rep = rep_kv['repeat']
        fam = rep_kv['family']

        tstart = int(a[6])
        tend = int(a[7])

        # clip the TE interval to the BED region
        ostart = max(bstart, tstart)
        oend = min(bend, tend)

        if not bid in bed_te_intervals:
            bed_te_intervals[bid] = {}
        bed_te_intervals[bid].setdefault((rep,fam),[]).append((ostart,oend))
        bed_te_intervals[bid].setdefault(('*',fam),[]).append((ostart,oend))
        bed_te_intervals[bid].setdefault(('*','*'),[]).append((ostart,oend))

    p.communicate()

    target_size = {}
    for bid in bed_te_intervals:
        bchrom, bstart = bid

        for te in bed_te_intervals[bid]:
            bt_intervals = bed_te_intervals[bid][te]
            bt_intervals.sort()

            # merge intervals, limited at the start by the BED region's start
            merged_intervals = [(max(bstart, bt_intervals[0][0]-read_len+1), bt_intervals[0][1])]
            for i in range(1,len(bt_intervals)):
                start1, end1 = merged_intervals[-1]
                start2, end2 = bt_intervals[i]

                # merge when the read_len-extended intervals touch or overlap
                if end1+1 < start2-read_len+1:
                    merged_intervals.append((start2-read_len+1,end2))
                else:
                    merged_intervals[-1] = (start1, end2)

            # sum
            target_size[te] = target_size.get(te,0) + sum([e-s+1 for (s,e) in merged_intervals])

    return target_size
Example #37
0
    def __init__(self, exon_gtf, promoter_length):
        # split one exon GTF line into its tab-delimited fields
        a = exon_gtf.split('\t')
        a[-1] = a[-1].rstrip()

        # store the parsed attribute column, chromosome, and strand
        self.gtf_kv = gff.gtf_kv(a[8])
        self.chr = a[0]
        self.strand = a[6]

        # start of the promoter region relative to the exon's 5' end;
        # on '-' this is just the GTF end -- presumably the downstream
        # extent is computed elsewhere in the class (confirm)
        if self.strand == '+':
            self.start = max(0, int(a[3]) - promoter_length)
        else:
            self.start = int(a[4])
Example #38
0
def te_target_size(te_gff, read_len):
    '''Compute the mappable target size in bp per TE (repeat, family).

    Streams te_gff (assumed position-sorted -- confirm), merging intervals
    within read_len of each other and padding each merged interval by
    read_len for reads overlapping its edge.  ('*', family) and ('*', '*')
    keys aggregate by family and overall.
    '''
    te_bp = {}
    active_te_intervals = {}

    for line in open(te_gff):
        a = line.split('\t')

        kv = gff.gtf_kv(a[8])
        rep = kv['repeat']
        fam = kv['family']

        chrom = a[0]
        start = int(a[3])
        end = int(a[4])

        # close intervals that this line can no longer extend;
        # list() makes deletion during iteration safe (required on Python 3)
        for arep, afam in list(active_te_intervals.keys()):
            achrom, astart, aend = active_te_intervals[(arep,afam)]

            if achrom != chrom or aend + read_len < start:
                # add, padded by read_len for edge-overlapping reads
                te_bp[(arep,afam)] = te_bp.get((arep,afam),0) + aend - astart + 1 + read_len
                # close
                del active_te_intervals[(arep,afam)]

        # update/add the TE itself plus its family and overall aggregates
        for te_key in [(rep,fam), ('*',fam), ('*','*')]:
            if te_key in active_te_intervals:
                achrom, astart, aend = active_te_intervals[te_key]
                active_te_intervals[te_key] = (chrom, min(astart,start), max(aend, end))
            else:
                active_te_intervals[te_key] = (chrom, start, end)

    # close remaining
    for arep, afam in active_te_intervals.keys():
        achrom, astart, aend = active_te_intervals[(arep,afam)]

        # add
        te_bp[(arep,afam)] = te_bp.get((arep,afam),0) + aend - astart + 1 + read_len

    return te_bp
Example #39
0
def read_genes(gtf_file, key_id='transcript_id'):
    '''Parse a GTF file into {key_id value: Gene}, attaching exon coordinates.'''
    genes = {}
    for gtf_line in open(gtf_file):
        cols = gtf_line.split('\t')
        kv = gff.gtf_kv(cols[8])
        gid = kv[key_id]

        # create the Gene on first sight of this id
        if gid not in genes:
            genes[gid] = Gene(cols[0], cols[6], kv)

        # only exon features contribute coordinates
        if cols[2] == 'exon':
            genes[gid].add_exon(int(cols[3]), int(cols[4]))

    return genes
Example #40
0
def map_dfam_family():
    '''Map each Dfam TE name to its RepeatMasker repeat family.'''
    # repeat -> family, from the RepeatMasker hg19 GFF
    rm_gff = '%s/hg19.fa.out.tp.gff' % os.environ['MASK']
    repeat_family = {}
    for rm_line in open(rm_gff):
        kv = gtf_kv(rm_line.split('\t')[8])
        repeat_family[kv['repeat']] = kv['family']

    # expand each repeat to its Dfam TEs, carrying the family along
    dfam_family = {}
    for repeat in repeat_family:
        for dfam_te in map_rm_dfam(repeat, quiet=True):
            dfam_family[dfam_te] = repeat_family[repeat]

    return dfam_family
Example #41
0
def map_dfam_repeat():
    '''Map each Dfam TE name to the set of RepeatMasker repeats matching it.'''
    # collect the distinct repeat names from the RepeatMasker hg19 GFF
    rm_gff = '%s/hg19.fa.out.tp.gff' % os.environ['MASK']
    repeats = set()
    for rm_line in open(rm_gff):
        repeats.add(gtf_kv(rm_line.split('\t')[8])['repeat'])

    # invert the repeat -> Dfam TE mapping
    dfam_repeat = {}
    for repeat in repeats:
        for dfam_te in map_rm_dfam(repeat, quiet=True):
            dfam_repeat.setdefault(dfam_te, set()).add(repeat)

    return dfam_repeat
Example #42
0
def main():
    '''Plot a heatmap of per-gene log2 FPKM across samples.

    Optionally restricts to genes in a GTF file and/or significantly
    differentially expressed genes, sampling down to a displayable count.
    '''
    usage = "usage: %prog [options] <fpkm_tracking>"
    parser = OptionParser(usage)
    parser.add_option("-d", dest="diff_file", help="Limit to significantly differentially expressed genes")
    parser.add_option("-g", dest="gtf", help="GTF file of genes to display")
    # type= so command-line values parse as numbers; without it optparse
    # returns strings, breaking the comparisons and math below
    parser.add_option("-m", dest="min_fpkm", type="float", default=0.125, help="Minimum FPKM (for logs) [Default: %default]")
    parser.add_option("-o", dest="out_pdf", default="cuff_heat.pdf", help="Output PDF [Default: %default]")
    parser.add_option("-s", dest="sample", type="int", default=1000, help="Sample genes rather than use all [Default: %default]")
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error("Must provide fpkm_tracking")
    else:
        fpkm_tracking = args[0]

    # load expression data
    cuff = cufflinks.fpkm_tracking(fpkm_file=fpkm_tracking)

    # determine genes
    all_genes = set(cuff.genes)
    if options.gtf:
        all_genes = set()
        for line in open(options.gtf):
            a = line.split("\t")
            all_genes.add(gff.gtf_kv(a[8])["gene_id"])

    if options.diff_file:
        # limit to differentially expressed genes
        diff_genes = find_diff(options.diff_file)
        all_genes &= diff_genes

    # sample genes to display (random.sample needs a sequence, not a set)
    if len(all_genes) <= options.sample:
        display_genes = all_genes
    else:
        display_genes = random.sample(list(all_genes), options.sample)

    # build data frame of log2 FPKM per gene and sample
    df = {"Gene": [], "FPKM": [], "Sample": []}

    for gene_id in display_genes:
        ge = cuff.gene_expr(gene_id)
        if not math.isnan(ge[0]):
            for i in range(len(cuff.experiments)):
                df["Gene"].append(gene_id)
                df["Sample"].append(cuff.experiments[i])
                df["FPKM"].append(math.log(ge[i] + options.min_fpkm, 2))

    # plot
    ggplot.plot("%s/cuff_heat.r" % os.environ["RDIR"], df, [options.out_pdf])
Example #43
0
def map_dfam_family():
    '''Return {dfam_te: family} derived from the RepeatMasker hg19 GFF.'''
    mask_gff = '%s/hg19.fa.out.tp.gff' % os.environ['MASK']

    # collect repeat -> family assignments
    repeat_family = {}
    for line in open(mask_gff):
        attrs = gtf_kv(line.split('\t')[8])
        repeat_family[attrs['repeat']] = attrs['family']

    # translate each repeat to its Dfam TE names
    dfam_family = {}
    for rep in repeat_family:
        for te in map_rm_dfam(rep, quiet=True):
            dfam_family[te] = repeat_family[rep]

    return dfam_family
Example #44
0
def map_dfam_repeat():
    '''Return {dfam_te: repeat} from the RepeatMasker hg19 GFF.

    When multiple repeats map to one Dfam TE, the repeat seen last wins.
    '''
    mask_gff = '%s/hg19.fa.out.tp.gff' % os.environ['MASK']

    # distinct repeat names in the annotation
    repeats = set()
    for line in open(mask_gff):
        repeats.add(gtf_kv(line.split('\t')[8])['repeat'])

    # invert to dfam_te -> repeat
    dfam_repeat = {}
    for rep in repeats:
        for te in map_rm_dfam(rep, quiet=True):
            dfam_repeat[te] = rep

    return dfam_repeat
Example #45
0
def intersect_gene_te(gtf_file, upstream, downstream):
    '''Hash overlapping transposon nt per transcript promoter.

    Builds promoter regions with gff.promoters, intersects with the hg19
    repeats GFF, and returns {transcript_id: {(repeat, family): nt}};
    ('*', family) and ('*', '*') keys aggregate by family and overall.
    Transcripts with no repeat overlap get the sentinel key ('n', 'n').
    '''
    import os  # local import: needed for the temp-file cleanup below

    # focus on promoter
    tmp_fd, tmp_file = tempfile.mkstemp()
    gff.promoters(gtf_file, upstream, downstream, tmp_file)

    # intersect genes w/ repeats
    # hash transposon nt by gene
    gene_trans = {}
    p = subprocess.Popen('intersectBed -wo -a %s -b %s' % (tmp_file,hg19_reps_gff), shell=True, stdout=subprocess.PIPE)
    for line in p.stdout:
        a = line.split('\t')

        # get names
        gene = gff.gtf_kv(a[8])['transcript_id']
        rep_kv = gff.gtf_kv(a[17])
        rep = rep_kv['repeat']
        fam = rep_kv['family']

        # add overlap nt (final intersectBed -wo column)
        if gene not in gene_trans:
            gene_trans[gene] = {}
        nt = int(a[18])
        for te_key in [(rep,fam), ('*',fam), ('*','*')]:
            gene_trans[gene][te_key] = gene_trans[gene].get(te_key,0) + nt

    p.communicate()

    # clean up the temporary promoter file (previously leaked)
    os.close(tmp_fd)
    os.remove(tmp_file)

    # create a fake family for dTE-lncRNAs
    for line in open(gtf_file):
        a = line.split('\t')
        tid = gff.gtf_kv(a[8])['transcript_id']
        if tid not in gene_trans:
            gene_trans[tid] = {('n','n'):1}

    return gene_trans
Example #46
0
def measure_te(rm_file):
    '''Sum annotated bp per repeat from a RepeatMasker GFF.

    Returns {(repeat, family): bp}; ('*', family) and ('*', '*') keys
    aggregate by family and overall.
    '''
    repeat_bp = {}
    for gff_line in open(rm_file):
        fields = gff_line.split('\t')

        kv = gff.gtf_kv(fields[8])

        # GFF coordinates are inclusive on both ends
        length = int(fields[4]) - int(fields[3]) + 1

        for key in [(kv['repeat'], kv['family']), ('*', kv['family']), ('*', '*')]:
            repeat_bp[key] = repeat_bp.get(key, 0) + length

    return repeat_bp
Example #47
0
def measure_te(rm_file):
    '''Total annotated bp per (repeat, family) in a RepeatMasker GFF,
    with '*' aggregates by family and overall.'''
    repeat_bp = {}

    def add_bp(key, bp):
        # accumulate bp under the given repeat key
        repeat_bp[key] = repeat_bp.get(key, 0) + bp

    for line in open(rm_file):
        fields = line.split('\t')
        kv = gff.gtf_kv(fields[8])
        rep = kv['repeat']
        family = kv['family']

        # inclusive GFF coordinates
        bp = int(fields[4]) - int(fields[3]) + 1

        add_bp((rep, family), bp)
        add_bp(('*', family), bp)
        add_bp(('*', '*'), bp)

    return repeat_bp
Example #48
0
def main():
    '''Report per-gene read counts and multi-mapping rates.

    Intersects BAM alignments with GTF genes (unless -i says it is already
    done), then prints for each gene: id, read count, mean alignments per
    read, and the fraction of reads with multiple alignments.
    '''
    usage = 'usage: %prog [options] <gtf file> <bam file>'
    parser = OptionParser(usage)
    parser.add_option('-i', dest='intersect_done', default=False, action='store_true', help='intersectBed is already done [Default: %default]')
    parser.add_option('-o', dest='output_prefix', help='Prefix for the intersectBed intermediate file [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide gtf file and bam file')
    else:
        gtf_file = args[0]
        bam_file = args[1]

    if options.output_prefix:
        ib_file = '%s_reads_genes.gff' % options.output_prefix
    else:
        ib_file = 'reads_genes.gff'

    if not options.intersect_done:
        # overlap genes w/ aligned reads
        p = subprocess.Popen('intersectBed -s -wo -abam -bed -a %s -b %s > %s' % (bam_file,gtf_file,ib_file), shell=True)
        os.waitpid(p.pid,0)

    # count transcriptome alignments per read:
    # a read's distinct (chrom, start) positions = its alignment count
    read_aligns = {}
    for line in open(ib_file):
        a = line.split('\t')
        chrom = a[0]
        start = int(a[1])
        read_id = a[3]

        read_aligns.setdefault(read_id,set()).add((chrom,start))

    # hash reads by gene
    gene_reads = {}
    for line in open(ib_file):
        a = line.split('\t')
        read_id = a[3]
        # a[14] is presumably the GTF attribute column after the BED read columns -- confirm
        gene_id = gff.gtf_kv(a[14])['transcript_id']
        gene_reads.setdefault(gene_id,[]).append(read_id)

    # print gene stats: id, #reads, mean alignments/read, multi-aligned fraction
    for gene_id in gene_reads:
        align_counts = [len(read_aligns[read_id]) for read_id in gene_reads[gene_id]]
        multi_count = float(len([ac for ac in align_counts if ac > 1]))
        cols = (gene_id, len(align_counts), util.mean(align_counts), multi_count/float(len(align_counts)))
        print '%-15s %7d %7.2f %7.2f' % cols
Example #49
0
def map_genes(gtf_file,
              fpkm_file,
              pseudocount=0.125,
              all_isoforms=False,
              random_zeros=False):
    '''Map each gene_id to the transcript_id of its most expressed isoform.

    fpkm_file may be a cuffdiff .diff file or an FPKM tracking file.
    Genes whose isoforms never exceed the minimum (pseudocount) expression
    map to None, unless random_zeros requests a random isoform instead.
    '''
    # get expression data; .diff files parse differently
    if fpkm_file[-5:] == '.diff':
        transcript_fpkm = diff_fpkm(fpkm_file, pseudocount)
    else:
        transcript_fpkm = cuff_fpkm(fpkm_file, pseudocount)

    # get genes
    if all_isoforms:
        g2t = gff.g2t(gtf_file)
    else:
        # skip biologically-suspect isoform classes
        skip_types = [
            'intron', 'prerna', 'nonsense_mediated_decay',
            'retained_intron', 'non_stop_decay'
        ]
        g2t = {}
        for line in open(gtf_file):
            a = line.split('\t')
            kv = gff.gtf_kv(a[8])

            if kv['transcript_type'] not in skip_types:
                g2t.setdefault(kv['gene_id'], set()).add(kv['transcript_id'])

    # map gene_id's to max expression isoform
    gene_max_iso = {}
    min_fpkm = math.log(pseudocount, 2)
    for gid in g2t:
        max_fpkm_tid = None
        max_fpkm = min_fpkm

        for tid in g2t[gid]:
            if transcript_fpkm.get(tid, min_fpkm) > max_fpkm:
                max_fpkm_tid = tid
                max_fpkm = transcript_fpkm[tid]

        gene_max_iso[gid] = max_fpkm_tid

    # choose isoforms for None
    if random_zeros:
        for gid in g2t:
            if gene_max_iso[gid] is None:
                # bug fix: random.choice requires a sequence, but g2t values
                # are sets; sorted() also makes the draw order-independent
                gene_max_iso[gid] = random.choice(sorted(g2t[gid]))

    return gene_max_iso
Example #50
0
def main():
    '''Print multiz46way MSA alignment edges over a lncRNA gene's exons.'''
    usage = 'usage: %prog [options] <gene id>'
    parser = OptionParser(usage)
    parser.add_option(
        '-l',
        dest='lncrna_gtf',
        default='/Users/dk/research/common/data/lncrna/lnc_catalog.gtf',
        help='lncRNA gtf file [Default: %default]')
    # NOTE(review): the -s/span option is parsed but never used below
    parser.add_option(
        '-s',
        dest='span',
        action='store_true',
        default=False,
        help='Map the gene\'s entire span, i.e. introns too [Default: %default]'
    )
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide gene id')
    else:
        gene_id = args[0]

    # get human genome
    hg19 = worldbase.Bio.Seq.Genome.HUMAN.hg19()

    # get gene exon intervals as genome slices
    gene_ivals = []
    for line in open(options.lncrna_gtf):
        a = line.split('\t')
        if gff.gtf_kv(a[8])['gene_id'] == gene_id:
            chrom = a[0]
            start = int(a[3])
            end = int(a[4])
            # ignoring orientation at the moment

            gene_ivals.append(hg19[chrom][start:end])

    # get hg19 msa
    msa = worldbase.Bio.MSA.UCSC.hg19_multiz46way()

    # map returned sequences back to genome name
    idDict = ~(msa.seqDict)

    # print alignments: interval, source, destination, genome, edge length
    for gi in gene_ivals:
        for src, dest, edg in msa[gi].edges():
            print repr(gi), repr(src), repr(dest), idDict[dest], edg.length()
Example #51
0
def process_chrom(transcripts_gtf, chrom, seq, transcript_seqs, transcript_genes):
    '''Accumulate exon sequence for transcripts on one chromosome.

    Mutates transcript_seqs (transcript_id -> spliced sequence) and
    transcript_genes (transcript_id -> gene_id) in place.
    '''
    for gtf_line in open(transcripts_gtf):
        cols = gtf_line.split('\t')
        if cols[0] != chrom:
            continue

        kv = gff.gtf_kv(cols[8])
        tid = kv['transcript_id']

        # GFF coordinates are 1-based, end-inclusive
        exon_seq = seq[int(cols[3])-1:int(cols[4])]

        if cols[6] == '+':
            # forward strand: exons accumulate left to right
            transcript_seqs[tid] = transcript_seqs.get(tid,'') + exon_seq
        else:
            # reverse strand: prepend the reverse complement
            transcript_seqs[tid] = dna.rc(exon_seq) + transcript_seqs.get(tid,'')

        transcript_genes[tid] = kv['gene_id']
Example #52
0
def main():
    '''Print the gene_id owning the transcript id given on the command line.'''
    usage = 'usage: %prog [options] <trans id>'
    parser = OptionParser(usage)
    parser.add_option(
        '-l',
        dest='lnc_file',
        default='/Users/dk/research/common/data/lncrna/lnc_catalog.gtf',
        help='lncRNA catalog file [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide transcript id')
    else:
        trans_id = args[0]

    # scan the catalog for the first line of the requested transcript
    for line in open(options.lnc_file):
        a = line.split('\t')
        kv = gff.gtf_kv(a[8])
        if kv['transcript_id'] == trans_id:
            print kv['gene_id']
            break
Example #53
0
def get_promoters(gtf_file, promoter_length):
    '''Build a promoter per gene from a GTF file grouped by gene_id.

    Collects consecutive lines sharing a gene_id and passes each group to
    find_promoter.  Assumes lines of a gene are contiguous in the file.
    '''
    promoters = []

    gene_id = ''
    exons = []
    for line in open(gtf_file):
        a = line.split('\t')
        a[-1] = a[-1].rstrip()

        this_gene_id = gff.gtf_kv(a[8])['gene_id']
        if this_gene_id != gene_id:
            # flush the previous gene's accumulated lines
            if gene_id:
                promoters.append(find_promoter(gene_id, exons,
                                               promoter_length))
            gene_id = this_gene_id
            exons = [line]
        else:
            exons.append(line)

    # flush the final gene; guard against an empty GTF, where the original
    # referenced an undefined 'exons' and appended a bogus promoter
    if gene_id:
        promoters.append(find_promoter(gene_id, exons, promoter_length))

    return promoters
Example #54
0
def main():
    '''Plot a histogram of per-gene max log2 FPKM for genes in a GTF file.

    Writes <gtf_prefix>_fpkmhist.pdf via rpy2/ggplot2.
    '''
    usage = 'usage: %prog [options] <gtf file> <fpkm tracking>'
    parser = OptionParser(usage)
    #parser.add_option('-m', dest='fpkm_min', type='float', default=0.25, help='Minimum FPKM [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error(usage)
    else:
        gtf_file = args[0]
        fpkm_tracking_file = args[1]

    # get genes
    genes = set()
    for line in open(gtf_file):
        a = line.split('\t')
        genes.add(gff.gtf_kv(a[8])['gene_id'])

    # get expression: log2 of each gene's max FPKM across experiments
    cuff = cufflinks.fpkm_tracking(fpkm_tracking_file)
    log_fpkms = []
    for gene_id in genes:
        max_fpkm = max(cuff.gene_expr(gene_id))
        if max_fpkm > 0:
            log_fpkms.append(math.log(max_fpkm,2))

    # construct R data objects
    fpkms_r = ro.FloatVector(log_fpkms)
    df = ro.DataFrame({'fpkm':fpkms_r})

    # construct plot
    gp = ggplot2.ggplot(df) + \
        ggplot2.aes_string(x='fpkm') + \
        ggplot2.geom_histogram(binwidth=0.2)

    # save to file next to the input GTF
    gtf_pre = os.path.splitext(gtf_file)[0]
    grdevices.pdf(file='%s_fpkmhist.pdf' % gtf_pre)
    gp.plot()
    grdevices.dev_off()
Example #55
0
def main():
    usage = 'usage: %prog [options] <gtf file>'
    parser = OptionParser(usage)
    parser.add_option(
        '-g',
        dest='greater',
        action='store_true',
        default=False,
        help=
        'Keep genes w/ CSF value greater than the one given [Default: %default]'
    )
    parser.add_option(
        '-l',
        dest='less',
        action='store_true',
        default=True,
        help=
        'Keep genes w/ CSF value less than the one given [Default: %default]')
    parser.add_option('-t',
                      dest='csf_t',
                      type='float',
                      default=100.0,
                      help='CSF threshold [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) == 1:
        gtf_open = open(args[0])
    else:
        gtf_open = sys.stdin

    line = gtf_open.readline()
    while line:
        a = line.split('\t')
        csf = float(gff.gtf_kv(a[8])['csf'])
        if (options.less
                and csf <= options.csf_t) or (options.greater
                                              and csf >= options.csf_t):
            print line,
        line = gtf_open.readline()
Example #56
0
def gff_intervals(gff_file, gtf_key):
    '''Index GFF features into per-chromosome interval trees.

    Returns (chr_features, interval_map): chromosome -> IntervalTree of
    feature spans, and chromosome -> {(start, end): [(feature_id, strand)]}.
    '''
    chr_features = {}
    interval_map = {}

    for gff_line in open(gff_file):
        fields = gff_line.split('\t')
        fields[-1] = fields[-1].rstrip()

        chrom = fields[0]
        start = int(fields[3])
        end = int(fields[4])
        strand = fields[6]

        # feature id: GTF attribute value when a key is given, raw column otherwise
        if gtf_key:
            feature_id = gff.gtf_kv(fields[8]).get(gtf_key, fields[8])
        else:
            feature_id = fields[8]

        if chrom not in chr_features:
            chr_features[chrom] = IntervalTree()
        chr_features[chrom].insert_interval(Interval(start, end))

        if chrom not in interval_map:
            interval_map[chrom] = {}
        interval_map[chrom].setdefault((start, end), []).append((feature_id, strand))

    return chr_features, interval_map
Example #57
0
def main():
    usage = 'usage: %prog [options] <gtf> <fpkm tracking | diff>'
    parser = OptionParser(usage)
    parser.add_option(
        '-a',
        dest='all_isoforms',
        default=False,
        action='store_true',
        help='Consider all isoforms. Default is to ignore bs ones')
    parser.add_option('-p', dest='pseudocount', default=0.125)
    parser.add_option(
        '-r',
        dest='random_zeros',
        default=False,
        action='store_true',
        help=
        'Randomly choose an isoform for zero FPKM genes [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide gtf file and fpkm tracking')
    else:
        gtf_file = args[0]
        fpkm_file = args[1]

    gene_max_iso = map_genes(gtf_file, fpkm_file, options.pseudocount,
                             options.all_isoforms, options.random_zeros)

    # filter gtf file
    for line in open(gtf_file):
        a = line.split('\t')
        kv = gff.gtf_kv(a[8])
        gene_id = kv['gene_id']
        tid = kv['transcript_id']

        if gene_max_iso.get(gene_id, None) == tid:
            print line,