# NOTE(review): Python 2 script fragment, recovered from a whitespace-mangled
# source line; indentation below is reconstructed -- confirm against the
# original file. The fragment is cut off mid-branch at the end. Names
# feature_coords_fn, load, fastafn, gtffn, parse_lines, terminal_exon,
# region_to_GTF, GTFrow_to_5p3pcoords, offset, use_chromnames, get_introns
# and get_transcripts are defined elsewhere in the original script.
if feature_coords_fn:
    # feature_coords maps an exon/feature key to its coordinates; `load` is
    # presumably pickle.load or similar -- TODO confirm.
    feature_coords = load(open(feature_coords_fn))
print >> stderr, "Note: using FASTA file at %s" % fastafn
f = open(gtffn)
junctions = set()
for row in f:
    # for each row in the input GTF
    # we need to get rid of first and last exons
    # GTF column 3 is the feature type; CDS/start/stop rows are skipped so
    # only exon-like rows are processed.
    if row.strip().split('\t')[2] in ['start_codon', 'stop_codon', 'CDS']:
        continue
    if feature_coords_fn:
        # if we have the resource to check whether this is a terminal exon or not
        # print >> stderr,row.strip()
        strand = row.strip().split('\t')[6]  # GTF column 7: strand
        feature, blank = parse_lines([row.strip()], strand, get_transcripts)
        # print >> stderr,feature
        is_terminal_feature, up_exon, down_exon = terminal_exon(
            list(feature)[0], feature_coords)
        if is_terminal_feature:
            # if this is terminal ignore it
            continue
        # now we know that this is not a terminal exon so there must be neighbouring exons
        up_exon_gtf = region_to_GTF(up_exon, feature_coords[up_exon], get_transcripts)
        down_exon_gtf = region_to_GTF(down_exon, feature_coords[down_exon], get_transcripts)
        if get_introns:
            # 5'/3' coordinates of the upstream exon and of the current row.
            # NOTE(review): fragment ends here mid-branch; the remainder of
            # this `if get_introns:` block is outside the visible source.
            up_exon_five, up_exon_three = GTFrow_to_5p3pcoords(
                up_exon_gtf, offset, use_chromnames)
            five, three = GTFrow_to_5p3pcoords(row, offset, use_chromnames)
#!/home/paulk/software/bin/python from __future__ import division from sys import argv,exit,stderr from subprocess import Popen,PIPE from key_functions import parse_lines import pysam f = open(argv[1]) t = pysam.Tabixfile("resources/Homo_sapiens.GRCh37.66.gtf.gz") for row in f: if row[0] == 'u': continue l = row.strip().split('\t') regions = [l[0],l[1],l[5],l[6]] the_exons = "" for region in regions: result = t.fetch(region[3:-2]) strand = region[-1] exons,no_lines = parse_lines(result,strand,get_transcripts=True) the_exons += ",".join(exons) + "\t" print row.strip()+"\t"+the_exons f.close()
# NOTE(review): Python 2 script fragment, recovered from a whitespace-mangled
# source line; indentation below is reconstructed -- confirm against the
# original file. Names args, feature_coords_fn, load, fastafn, gtffn,
# parse_lines, terminal_exon, region_to_GTF, GTFrow_to_5p3pcoords, offset,
# use_chromnames and get_transcripts come from earlier in the script.
get_introns = args.introns
if feature_coords_fn:
    # feature_coords maps a feature key to its coordinates; `load` is
    # presumably pickle.load or similar -- TODO confirm.
    feature_coords = load(open(feature_coords_fn))
print >> stderr,"Note: using FASTA file at %s" % fastafn
f = open(gtffn)
junctions = set()
for row in f:
    # for each row in the input GTF
    # we need to get rid of first and last exons
    # GTF column 3 is the feature type; CDS/start/stop rows are skipped.
    if row.strip().split('\t')[2] in ['start_codon','stop_codon','CDS']:
        continue
    if feature_coords_fn:
        # if we have the resource to check whether this is a terminal exon or not
        # print >> stderr,row.strip()
        strand = row.strip().split('\t')[6]  # GTF column 7: strand
        feature,blank = parse_lines([row.strip()],strand,get_transcripts)
        # print >> stderr,feature
        is_terminal_feature,up_exon,down_exon = terminal_exon(list(feature)[0],feature_coords)
        if is_terminal_feature:
            # if this is terminal ignore it
            continue
        # now we know that this is not a terminal exon so there must be neighbouring exons
        up_exon_gtf = region_to_GTF(up_exon,feature_coords[up_exon],get_transcripts)
        down_exon_gtf = region_to_GTF(down_exon,feature_coords[down_exon],get_transcripts)
        if get_introns:
            # record the junction as (upstream-exon 5', current 3',
            # current 5', downstream-exon 3') -- i.e. the coordinates
            # flanking the two introns around this exon.
            up_exon_five,up_exon_three = GTFrow_to_5p3pcoords(up_exon_gtf,offset,use_chromnames)
            five,three = GTFrow_to_5p3pcoords(row,offset,use_chromnames)
            down_exon_five,down_exon_three = GTFrow_to_5p3pcoords(down_exon_gtf,offset,use_chromnames)
            junctions.add((up_exon_five,three,five,down_exon_three))
        else:
            # exon-only mode: just the current row's 5'/3' coordinates.
            # NOTE(review): the fragment ends here; the loop body continues
            # outside the visible source.
            five,three = GTFrow_to_5p3pcoords(row,offset,use_chromnames)
# NOTE(review): Python 2 loop-body fragment (contains bare `continue`),
# recovered from a whitespace-mangled source line; indentation is
# reconstructed and the enclosing `for` loop is outside the visible source.
# Names chrom, st_sp, chrom_names, tabix_file_ptr, l, colno, sd, ps_missing,
# ps_detail, parse_lines, get_transcripts, report, h, g and random are
# defined elsewhere -- confirm against the full file.
if chrom == '---':
    # sentinel value for "no chromosome" rows -- skip them
    continue
st,sp = st_sp.split('-')  # start/stop coordinates, e.g. "100-200"
if not chrom_names:
    # if tabix needs 'chr' removed
    chrom = chrom[3:]
else:
    # if tabix can work with 'chr'
    pass
# mitochondrial naming differs between sources: M/chrM -> MT/chrMT
if chrom == "chrM": chrom = "chrMT"
if chrom == "M": chrom = "MT"
try:
    lines = tabix_file_ptr.fetch(region="%s:%s-%s" % (chrom,st,sp))
except ValueError:
    # tabix raises ValueError for a region absent from the index;
    # record the probe as missing and move on.
    ps_missing.append((int(l[0]),ps_detail[int(l[colno])]))
    # print l[0]
    continue
gene_exons,no_lines = parse_lines(lines,sd,get_transcripts)
if gene_exons == []:
    # region was indexed but yielded no usable exons -- also missing
    ps_missing.append((int(l[0]),ps_detail[int(l[colno])]))
    #print l[0]
    continue
if report:
    print >> h,l[colno]+"\t"+",".join(gene_exons)
    # for ge in gene_exons:
    #     print >> h,l[colno]+"\t"+ge
# """
# """
#if len(gene_exons) > 0: print >> g,"\t".join([",".join(gene_exons)]+l[colno+1:])
if len(gene_exons) > 0:
    # pick one exon at random; presumably for downstream sampling --
    # NOTE(review): fragment ends here, the rest of the block is not visible.
    ge = random.choice(list(gene_exons))