def parse():
    """Print the GFF3 in ``ifile`` keeping one coding isoform per gene.

    The file is read twice.

    Pass 1 builds a ``classGene.LongestCodingIsoform`` for every mRNA line
    and feeds it the exon and CDS lines that follow it.  For each parent
    gene the transcript with the largest CDS length is selected; ties are
    broken by the larger exon length, and on a full tie the transcript seen
    first is kept.

    Pass 2 prints every gene line, the selected mRNA lines, and every other
    line that follows a selected mRNA up to the next mRNA line.

    Reads the module-level ``ifile`` and ``classGene``.
    """
    transcripts = []
    with open(ifile, 'r') as handle:
        for raw in handle:
            line = raw.strip()
            if not line or line.startswith('#'):
                continue
            obj = classGene.GFF3(line)
            feature = obj.types()
            if feature == 'mRNA':
                obj_gene = classGene.LongestCodingIsoform(line, obj)
                transcripts.append(obj_gene)
            elif feature == 'exon':
                obj_gene.addExon(line, obj)
            elif feature == 'CDS':
                obj_gene.addCDS(line, obj)

    # parent gene -> ((cds_length, exon_length), transcript ID) of the best
    # isoform seen so far.  Tuple comparison gives "longer CDS first, then
    # longer exon length"; the strict '>' keeps the earliest on a tie.
    best = {}
    for transcript in transcripts:
        t_id, parent, exon_length, cds_length = transcript.getLongestIsoform()
        score = (cds_length, exon_length)
        if parent not in best or score > best[parent][0]:
            best[parent] = (score, t_id)
    selected = set(t_id for _, t_id in best.values())

    ### process the GFF3 file
    print_flag = False
    with open(ifile, 'r') as handle:
        for raw in handle:
            line = raw.strip()
            if not line or line.startswith('#'):
                continue
            obj = classGene.GFF3(line)
            feature = obj.types()
            if feature == 'gene':
                print(line)
            elif feature == 'mRNA':
                print_flag = str(obj) in selected
                if print_flag:
                    print(line)
            elif print_flag:
                print(line)
def hash_combined(hash_combine_ID):
    """Collect the GFF3 lines of selected transcripts from ``combined_gff3``.

    Args:
        hash_combine_ID: container of transcript IDs to keep (only tested
            with ``in``).

    Returns:
        dict mapping each kept transcript ID to a dict with the keys
        ``'mRNA'`` (the mRNA line), ``'exon'`` and ``'CDS'``.  The latter
        two are strings in which every collected line is prefixed by a
        comma, so ``value.split(',')[1:]`` yields the individual lines.

    Only lines containing ``ID=`` that are not comments are parsed.  Reads
    the module-level ``combined_gff3`` and ``classGene``.
    """
    hash_combine = {}
    # ID of the wanted transcript whose children are being collected.  The
    # original flag was unset until the first mRNA line, so an exon/CDS
    # line appearing earlier raised a NameError; such lines are now skipped.
    current_id = None
    with open(combined_gff3, 'r') as handle:
        for count, raw in enumerate(handle, 1):
            line = raw.strip()
            if 'ID=' not in line or line.startswith('#'):
                continue
            if count % 100000 == 0:
                print(' '.join(['Hashing combined: ',
                                '{:9,.0f}'.format(count)]))
            obj = classGene.GFF3(line)
            feature = obj.types()
            if feature == 'mRNA':
                transcript_id = str(obj)
                if transcript_id in hash_combine_ID:
                    hash_combine[transcript_id] = {
                        'mRNA': line,
                        'exon': '',
                        'CDS': '',
                    }
                    current_id = transcript_id
                else:
                    current_id = None
            elif feature in ('exon', 'CDS') and current_id is not None:
                hash_combine[current_id][feature] += ',' + line
    return hash_combine
def hash_last_gff3_CDS(chromosome, start_min, start_max):
    """Index exon intervals and CDS length of transcripts in a window.

    Args:
        chromosome: sequence ID a feature must carry to be considered.
        start_min: inclusive lower bound for the mRNA start coordinate.
        start_max: exclusive upper bound for the mRNA start coordinate.

    Returns:
        ``(exons, CDS)`` where ``exons`` maps transcript ID to a list of
        ``(start, end)`` tuples and ``CDS`` maps transcript ID to the summed
        CDS length (``end - start + 1`` per CDS line).

    Exon and CDS lines are credited to the most recent mRNA line, and only
    when that mRNA passed the chromosome/window test.  Reads the
    module-level ``last_gff3`` and ``classGene``.
    """
    CDS = {}
    exons = {}
    count = 0
    current_id = None  # transcript accepted by the window test, if any
    with open(last_gff3, 'r') as handle:
        for raw in handle:
            line = raw.strip()
            if not line or line.startswith('#'):
                continue
            ### print the lines processed
            count += 1
            if count % 100000 == 0:
                print(' '.join(['Hashing CDS: ', '{:9,.0f}'.format(count)]))
            ### check if it is gff3 format line
            if 'ID=' not in line:
                continue
            obj = classGene.GFF3(line)
            feature = obj.types()
            if feature == 'mRNA':
                if (obj.seqids() == chromosome
                        and start_min <= int(obj.starts()) < start_max):
                    current_id = str(obj)
                    CDS[current_id] = 0
                    exons[current_id] = []
                else:
                    current_id = None
            elif current_id is not None and obj.seqids() == chromosome:
                if feature == 'exon':
                    exons[current_id].append(
                        (int(obj.starts()), int(obj.ends())))
                elif feature == 'CDS':
                    CDS[current_id] += int(obj.ends()) - int(obj.starts()) + 1
    return exons, CDS
def hash_last_gff3_CDS(chromosome):
    """Index exon intervals and CDS length of every transcript on a sequence.

    Args:
        chromosome: sequence ID a feature must carry to be considered.

    Returns:
        ``(exons, CDS)`` where ``exons`` maps transcript ID to a list of
        ``(start, end)`` tuples and ``CDS`` maps transcript ID to the summed
        CDS length (``end - start + 1`` per CDS line).

    The original implementation shelled out to ``grep``/``rm`` through
    ``os.system`` with a concatenated command string and a ``.temp`` file.
    That breaks on paths or names containing shell metacharacters and
    leaves the temp file behind on errors.  The same pre-filter (the line
    must contain ``chromosome`` followed by a tab) is now applied while
    reading ``last_gff3`` directly, so no temp file is created.

    Exon/CDS lines seen before the first mRNA of the sequence are skipped.
    Reads the module-level ``last_gff3`` and ``classGene``.
    """
    CDS = {}
    exons = {}
    count = 0
    current_id = None
    marker = chromosome + '\t'
    with open(last_gff3, 'r') as handle:
        for raw in handle:
            if marker not in raw:
                continue
            line = raw.strip()
            if not line or line.startswith('#'):
                continue
            ### print the lines processed
            count += 1
            if count % 100000 == 0:
                print(' '.join(['Hashing CDS: ', '{:9,.0f}'.format(count)]))
            ### check if it is gff3 format line
            if 'ID=' not in line:
                continue
            obj = classGene.GFF3(line)
            # The marker test is only a substring filter; confirm the column.
            if obj.seqids() != chromosome:
                continue
            feature = obj.types()
            if feature == 'mRNA':
                current_id = str(obj)
                CDS[current_id] = 0
                exons[current_id] = []
            elif current_id is None:
                continue
            elif feature == 'exon':
                exons[current_id].append((int(obj.starts()), int(obj.ends())))
            elif feature == 'CDS':
                CDS[current_id] += int(obj.ends()) - int(obj.starts()) + 1
    return exons, CDS
def printOut(Hash):
    """Print features of type ``FeatureType`` whose sequence ID is in ``Hash``.

    Args:
        Hash: container of sequence IDs (only tested with ``in``).

    Each matching feature is printed through ``print(obj)``, i.e. via the
    string conversion of the ``classGene.GFF3`` object.  Reads the
    module-level ``gff3``, ``FeatureType`` and ``classGene``.
    """
    with open(gff3, 'r') as handle:
        for line in handle:
            # Lines keep their newline here, so the original
            # ``len(line) > 0`` test never rejected a blank line; test the
            # stripped text instead.  The parser still receives the raw
            # line, exactly as before.
            if not line.strip() or line.startswith('#'):
                continue
            obj = classGene.GFF3(line)
            if obj.types() == FeatureType and obj.seqids() in Hash:
                print(obj)
def parseGFF3(EC, Function, Name, GeneComment, ProductType, GO, SYNONYM,
              header, pf_out, g):
    """Write one annotation record per mRNA of sequence ``header``.

    Args:
        EC: mapping transcript ID -> sequence of EC number strings.
        Function: mapping transcript ID -> function text.
        Name: mapping transcript ID -> name; only membership is tested.
        GeneComment: mapping transcript ID -> comment text.
        ProductType: mapping transcript ID -> product type; ``'P'`` is
            written when the transcript is absent.
        GO: mapping transcript ID -> sequence of GO strings; the second
            ``'|'``-separated field is written as the ``GO:`` cross
            reference.
        SYNONYM: mapping transcript ID -> synonym text.
        header: sequence ID whose mRNAs are exported.
        pf_out: writable file object receiving the per-transcript records.
            It is closed before returning.
        g: writable file object receiving the genetic-element block, which
            is written once, on the first matching mRNA.

    Field labels come from the module-level ``attributes`` list.
    Presumably laid out as
    ``['ID', 'NAME', 'STARTBASE', 'ENDBASE', 'PRODUCT-TYPE', 'SYNONYM',
    'GENE-COMMENT', 'FUNCTION', 'EC', 'GO', 'DBLINK', '//']`` according to
    the commented-out literal in the original source; confirm against the
    module.  Also reads the module-level ``gff3`` and ``classGene``.
    """
    genetic_elem_written = False
    with open(gff3, 'r') as handle:
        for raw in handle:
            line = raw.strip()
            if not line or line.startswith('#'):
                continue
            obj = classGene.GFF3(line)
            if obj.types() != 'mRNA' or obj.seqids() != header:
                continue
            t_id = str(obj)

            if not genetic_elem_written:
                g.write('ID\t' + str(header) + '\n')
                g.write('Name\t' + str(header) + '\n')
                g.write('TYPE\t:CHRSM\n')
                g.write('CIRCULAR?\tN\n')
                g.write('ANNOT-FILE\t' + header + '.pf\n')
                g.write('SEQ-FILE\t' + header + '.fa\n')
                g.write('//' + '\n')
                genetic_elem_written = True

            pf_out.write('ID\t' + str(t_id) + '\n')
            # The name is the transcript ID without its last '.'-suffix.
            pf_out.write(attributes[1] + '\t'
                         + '.'.join(t_id.split('.')[:-1]) + '\n')
            pf_out.write(attributes[2] + '\t' + str(obj.starts()) + '\n')
            pf_out.write(attributes[3] + '\t' + str(obj.ends()) + '\n')

            if t_id in ProductType:
                product = str(ProductType[t_id])
            else:
                product = str('P')
            pf_out.write(attributes[4] + '\t' + product + '\n')

            if t_id in SYNONYM:
                pf_out.write(attributes[5] + '\t' + str(SYNONYM[t_id]) + '\n')

            # NOTE(review): the source was flattened onto one line; both the
            # comment and the function test are assumed to be nested under
            # the ``Name`` test.  Confirm against the unflattened script.
            if t_id in Name:
                if t_id in GeneComment:
                    pf_out.write(attributes[6] + '\t'
                                 + str(GeneComment[t_id]) + '\n')
                if t_id in Function:
                    pf_out.write(attributes[7] + '\t'
                                 + str(Function[t_id]) + '\n')

            if t_id in EC:
                for ec_entry in EC[t_id]:
                    ec_no = ec_entry.replace('EC-', '')
                    # Pad three-level EC numbers to four levels.
                    if len(ec_no.split('.')) == 3:
                        ec_no += '.-'
                    pf_out.write(attributes[8] + '\t' + str(ec_no + '\n'))

            if t_id in GO:
                for go_entry in GO[t_id]:
                    pf_out.write(attributes[9] + '\t' + str(go_entry) + '\n')
                    pf_out.write(attributes[10] + '\tGO:'
                                 + str(go_entry).split('|')[1] + '\n')

            pf_out.write(attributes[11] + '\n')
    pf_out.close()
def hashGFF3(chrom):
    """Return every coordinate covered by a CDS feature on ``chrom``.

    Args:
        chrom: sequence ID to select.

    Returns:
        dict whose keys are the covered coordinates (start and end
        inclusive); every value is the empty string.

    Reads the module-level ``ifile`` and ``classGene``.
    """
    cds_coords = {}
    with open(ifile, 'r') as handle:
        for raw in handle:
            line = raw.strip()
            # Cheap prefix test first, so most lines are never parsed.
            if len(line) <= 1 or not line.startswith(chrom):
                continue
            obj = classGene.GFF3(line)
            # The prefix test alone also accepts longer names that merely
            # start with ``chrom`` (e.g. 'chr10' for 'chr1'); require an
            # exact match on the sequence column.
            if obj.seqids() != chrom or obj.types() != 'CDS':
                continue
            for pos in range(int(obj.starts()), int(obj.ends()) + 1):
                cds_coords[pos] = ''
    return cds_coords
def parse(hash_anno):
    """Print ``ifile`` with an ``Annotation="..."`` attribute on mRNA lines.

    Args:
        hash_anno: mapping from the string form of an mRNA's
            ``classGene.GFF3`` object to its annotation text.  Commas are
            removed from the text before printing.

    Raises:
        KeyError: if an mRNA has no entry in ``hash_anno``.

    Blank lines and lines starting with ``#`` are not printed.  All other
    lines are printed unchanged.  Reads the module-level ``ifile`` and
    ``classGene``.
    """
    with open(ifile, 'r') as handle:
        for raw in handle:
            line = raw.strip()
            if not line or line.startswith('#'):
                continue
            obj = classGene.GFF3(line)
            if obj.types() == 'mRNA':
                annotation = hash_anno[str(obj)].replace(',', '')
                print(line + ';' + 'Annotation="' + annotation + '"')
            else:
                print(line)
def parse():
    """Print the gene models of ``infile`` whose gene line has ``Name=CUFF``.

    Each gene line switches printing on when it contains ``Name=CUFF`` and
    off otherwise.  Every line read while printing is on is printed, the
    gene line included.

    Every line, including blank and comment lines, is handed to
    ``classGene.GFF3`` exactly as in the original.  Reads the module-level
    ``infile`` and ``classGene``.
    """
    print_flag = False
    with open(infile, 'r') as handle:
        for raw in handle:
            line = raw.strip()
            obj = classGene.GFF3(line)
            if obj.types() == 'gene':
                print_flag = re.search("Name=CUFF", line) is not None
            if print_flag:
                print(line)
def parse():
    """Print the gene models of ``infile`` whose source is ``CUFFLINKS``.

    Each gene line switches printing on when its second tab-separated
    column equals ``CUFFLINKS`` and off otherwise.  Every line read while
    printing is on is printed, the gene line included.

    Every line, including blank and comment lines, is handed to
    ``classGene.GFF3`` exactly as in the original.  Reads the module-level
    ``infile`` and ``classGene``.
    """
    print_flag = False
    with open(infile, 'r') as handle:
        for raw in handle:
            line = raw.strip()
            obj = classGene.GFF3(line)
            if obj.types() == 'gene':
                print_flag = line.split('\t')[1] == "CUFFLINKS"
            if print_flag:
                print(line)
def hash_evidences(chromosome, exons, CDS, start_min, start_max):
    """Index evidence transcripts in a window and search for overlaps.

    Args:
        chromosome: sequence ID a feature must carry to be considered.
        exons: forwarded unchanged to ``find_gene_overlaps``.
        CDS: forwarded unchanged to ``find_gene_overlaps``.
        start_min: inclusive lower bound for the mRNA start coordinate.
        start_max: exclusive upper bound for the mRNA start coordinate.

    For every accepted mRNA of ``evidences_gff3`` the function records:

    * ``evidences[source][coordinate] = transcript ID`` for every
      coordinate within ``gene_size_difference`` of the mRNA start or end
      (one extra base on the low side, as in the original ranges);
    * the exon intervals and summed CDS length of the transcript.

    It then calls ``find_gene_overlaps`` with these tables.  Returns
    ``None``.  Reads the module-level ``evidences_gff3``,
    ``gene_size_difference`` and ``classGene``.
    """
    evidences = {}  ### hash exonic co-ordinates by evidence
    CDS_transcript = {}
    exons_transcript = {}
    slack = int(gene_size_difference)
    current_id = None  # transcript accepted by the window test, if any
    with open(evidences_gff3, 'r') as handle:
        for count, raw in enumerate(handle, 1):
            line = raw.strip()
            if not line or line.startswith('#'):
                continue
            if count % 100000 == 0:
                print(' '.join(['Hashing evidence: ',
                                '{:9,.0f}'.format(count)]))
            ### check if it is gff3 format line
            if 'ID=' not in line:
                continue
            obj = classGene.GFF3(line)
            feature = obj.types()
            if feature == 'mRNA':
                if (obj.seqids() == chromosome
                        and start_min <= int(obj.starts()) < start_max):
                    current_id = str(obj)
                    CDS_transcript[current_id] = 0
                    exons_transcript[current_id] = []
                    by_coord = evidences.setdefault(obj.sources(), {})
                    for anchor in (int(obj.starts()), int(obj.ends())):
                        for pos in range(anchor - slack - 1,
                                         anchor + slack + 1):
                            by_coord[pos] = current_id
                else:
                    current_id = None
            elif current_id is not None and obj.seqids() == chromosome:
                if feature == 'exon':
                    exons_transcript[current_id].append(
                        (int(obj.starts()), int(obj.ends())))
                elif feature == 'CDS':
                    CDS_transcript[current_id] += (
                        int(obj.ends()) - int(obj.starts()) + 1)
    find_gene_overlaps(evidences, exons_transcript, CDS_transcript, exons,
                       CDS, chromosome, start_min, start_max)
def print_intron(file):
    """Print the gap between consecutive exons that share a parent.

    Args:
        file: path of the GFF3 file to read.

    For every exon whose parent (as returned by ``get_PARENT``) equals the
    parent of the previous exon, prints
    ``abs(previous_exon_end - current_exon_start)``.

    NOTE(review): with inclusive GFF3 coordinates this value is one larger
    than the number of intronic bases.  The output is left unchanged;
    confirm what downstream consumers expect.

    Uses the module-level ``get_PARENT`` and ``classGene``.
    """
    last_parent_ID = ''
    last_end = None
    with open(file, 'r') as handle:
        for raw in handle:
            line = raw.strip()
            if len(line) <= 1 or line.startswith('#'):
                continue
            obj = classGene.GFF3(line)
            if obj.types() != 'exon':
                continue
            parent_id = get_PARENT(line)
            if last_parent_ID == parent_id:
                print(abs(last_end - int(obj.starts())))
            last_parent_ID = parent_id
            last_end = int(obj.ends())
def hash_gff3():
    """Map parent IDs to the fifth ``|``-field of each mRNA annotation.

    Returns:
        dict whose key is ``get_PARENT(line)`` with ``'clover_'`` replaced
        by ``'occidentale_'`` and whose value is the fifth
        ``'|'``-separated field of the text inside ``Annotation="..."``.

    mRNA lines without an ``Annotation="..."`` attribute, with fewer than
    five annotation fields, or for which ``get_PARENT`` fails are skipped.
    The original skipped them through a bare ``except:``, which also
    swallowed ``KeyboardInterrupt`` and ``SystemExit``.

    Reads the module-level ``GFF3`` path, ``get_PARENT`` and ``classGene``.
    """
    HASH_GFF3 = {}
    with open(GFF3, 'r') as handle:
        for raw in handle:
            line = raw.strip()
            obj = classGene.GFF3(line)
            if obj.types() != 'mRNA':
                continue
            match = re.search(r'Annotation=".+"', line)
            if match is None:
                continue
            fields = match.group(0).split('"')[1].split('|')
            if len(fields) < 5:
                continue
            try:
                parent = get_PARENT(line)
            except Exception:
                # get_PARENT is defined elsewhere; keep the original
                # best-effort behaviour of skipping lines it cannot handle.
                continue
            g_id = parent.replace('clover_', 'occidentale_')
            HASH_GFF3[g_id] = fields[4]
    return HASH_GFF3
def hash_gff3(chromosome):
    """Map every base covered by an annotated mRNA to its annotation text.

    Args:
        chromosome: sequence ID a feature must carry to be considered.

    Returns:
        dict mapping coordinate -> the text inside ``Annotation="..."`` of
        the first mRNA (in file order) that covers the coordinate.

    Changes from the original:

    * the end coordinate is now included.  The original used
      ``range(start, end)``, which drops the last base of every mRNA,
      while GFF3 coordinates are inclusive;
    * the bare ``except:`` is replaced by explicit checks.  Lines without
      an annotation or with non-numeric coordinates are still skipped.

    Reads the module-level ``GFF3`` path and ``classGene``.
    """
    HASH_GFF3 = {}
    with open(GFF3, 'r') as handle:
        for raw in handle:
            line = raw.strip()
            obj = classGene.GFF3(line)
            if obj.types() != 'mRNA' or obj.seqids() != chromosome:
                continue
            match = re.search(r'Annotation=".+"', line)
            if match is None:
                continue
            anno = match.group(0).split('"')[1]
            try:
                start = int(obj.starts())
                end = int(obj.ends())
            except (TypeError, ValueError):
                continue
            for pos in range(start, end + 1):
                # First annotation wins for overlapping transcripts.
                if pos not in HASH_GFF3:
                    HASH_GFF3[pos] = anno
    return HASH_GFF3
def change_gff3(prior_n):
    """Print ``infile`` with the source column replaced by ``prior_n``.

    Args:
        prior_n: value written (via ``str``) as the second GFF3 column of
            every feature line.

    Blank lines and lines starting with ``#`` are dropped.  The remaining
    eight columns are taken from the ``classGene.GFF3`` accessors.  Reads
    the module-level ``infile`` and ``classGene``.
    """
    with open(infile, 'r') as handle:
        for raw in handle:
            line = raw.strip()
            if not line or line.startswith('#'):
                continue
            obj = classGene.GFF3(line)
            columns = [
                obj.seqids(),
                str(prior_n),
                obj.types(),
                obj.starts(),
                obj.ends(),
                obj.scores(),
                obj.strands(),
                obj.phases(),
                obj.attributes(),
            ]
            print('\t'.join(columns))
def parse(gff3):
    """Group the lines of a GFF3 file by gene and print each group.

    Args:
        gff3: path of the GFF3 file to read.

    A ``classGene.LongestIsoform`` is created for every gene line.  mRNA
    lines are passed to its ``add_mRNA`` and every other line to its
    ``add_feature``.  After reading, the string form of each gene object
    and its ``features`` attribute are printed.

    Every line, including blank and comment lines, is handed to
    ``classGene.GFF3`` exactly as in the original.
    """
    gene_objs = []
    with open(gff3, 'r') as handle:
        for raw in handle:
            line = raw.strip()
            obj = classGene.GFF3(line)
            feature = obj.types()
            if feature == "gene":
                LIobj = classGene.LongestIsoform(obj)
                gene_objs.append(LIobj)
            elif feature == "mRNA":
                LIobj.add_mRNA(line, obj)
            else:
                LIobj.add_feature(line)
    for gene in gene_objs:
        print(str(gene))
        print(gene.features)
def get_exon_fraction(chrom, hash_call):
    """Print exon/CDS lengths and their callable overlap per transcript.

    Args:
        chrom: not used by this implementation; kept for the callers.
        hash_call: passed to ``GetExonicOverlap`` and ``GetCDSOverlap`` of
            every ``classmRNA.mRNA`` object.

    One ``classmRNA.mRNA`` is created per mRNA line of ``ifile`` and
    receives that line plus all following exon and CDS lines through
    ``AddData``.  A tab-separated header is printed first.  The data rows
    are separated by single spaces, as produced by the original
    comma-separated ``print`` statement.

    Reads the module-level ``ifile``, ``classGene`` and ``classmRNA``.
    """
    HEADER = 'Lj30_ID\tExonLength\tCDSLength\tCallableExon\tCallableCDS'
    obj_list = []
    with open(ifile, 'r') as handle:
        for raw in handle:
            if len(raw) <= 1 or raw.startswith('#'):
                continue
            line = raw.strip()
            obj = classGene.GFF3(line)
            feature = obj.types()
            if feature == "mRNA":
                obj_mRNA = classmRNA.mRNA(line, obj)
                obj_list.append(obj_mRNA)
            if feature in ("mRNA", "exon", "CDS"):
                obj_mRNA.AddData(line, obj)
    print(HEADER)
    for transcript in obj_list:
        row = [
            str(transcript),
            str(transcript.GetExonLength()),
            str(transcript.GetCDSLength()),
            str(transcript.GetExonicOverlap(hash_call)),
            str(transcript.GetCDSOverlap(hash_call)),
        ]
        print(' '.join(row))
# find_high_gene_density(chromosome, count, avg_gd)
#
# Scans `infile` for 'gene' features on `chromosome`, groups genes whose
# start lies within `max_dist` of the previous gene's start into blocks, and
# writes the IDs of qualifying blocks to
# `infile + '.' + chromosome + '.frags.temp'`. For a qualifying block it also
# prints `chromosome` and the block start. Genes longer than `max_size` are
# not added to a block's ID list. Scanning stops at the first feature of
# another sequence once `chromosome` has been seen.
#
# Reads the module-level names `infile`, `max_dist`, `max_size` and
# `classGene`. The `count` parameter is not used in the visible code.
#
# NOTE(review): the whole function was flattened onto one line, so the
# statement nesting cannot be recovered with certainty. The code is left
# untouched on purpose.
# NOTE(review): `gene_count * 1000 / float( gene_count )` evaluates to 1000.0
# for any non-zero gene_count, and `region` is accumulated but never read.
# The divisor was presumably meant to be `region` -- TODO confirm.
# NOTE(review): `region`, `gene_count` and `gene_id` are first assigned in
# the `else` branch. If the first gene starts closer than `max_dist` to the
# initial `last_start` of -100000 they would be used before assignment.
# NOTE(review): no block is written after the loop ends, so the last block of
# the chromosome appears never to be reported -- TODO confirm intent.
def find_high_gene_density(chromosome, count, avg_gd): o_frag = open(infile + '.' + chromosome + '.frags.temp', 'w') last_start = -100000 new_block = True correct_chro = False for line in open(infile, 'r'): line = line.strip() if len(line) > 0 and not line.startswith('#'): obj = classGene.GFF3(line) if obj.seqids() == chromosome: correct_chro = True if obj.types() == 'gene': start = int(obj.starts()) if start - last_start < max_dist: if max_size > int(obj.ends()) - int(obj.starts()): region += int(obj.ends()) - int(obj.starts()) gene_count += 1 gene_id.append(str(obj)) new_block = False else: if new_block == False: if gene_count * 1000 / float( gene_count ) >= 5 * avg_gd and gene_count > 2: print chromosome, block_start for g_id in gene_id: o_frag.write(g_id + '\n') gene_count = 0 region = int(obj.ends()) - int(obj.starts()) new_block = True gene_id = [] if max_size > int(obj.ends()) - int(obj.starts()): gene_id.append(str(obj)) gene_count += 1 block_start = int(obj.starts()) last_start = start if correct_chro == True and obj.seqids() != chromosome: break o_frag.close()
def hash_evidences(chromosome, exons, CDS):
    """Index evidence transcripts of a sequence and search for overlaps.

    Args:
        chromosome: sequence ID a feature must carry to be considered.
        exons: forwarded unchanged to ``find_gene_overlaps``.
        CDS: forwarded unchanged to ``find_gene_overlaps``.

    For every mRNA of ``evidences_gff3`` on ``chromosome`` the function
    records:

    * ``evidences[source][coordinate] = transcript ID`` for every
      coordinate within ``gene_size_difference`` of the mRNA start or end
      (one extra base on the low side, as in the original ranges);
    * the exon intervals and summed CDS length of the transcript.

    It then calls ``find_gene_overlaps`` with these tables.  Returns
    ``None``.

    The original implementation shelled out to ``grep``/``rm`` through
    ``os.system`` with a concatenated command string and a ``.temp`` file.
    The same pre-filter (the line must contain ``chromosome`` followed by
    a tab) is now applied while reading ``evidences_gff3`` directly, so no
    shell is involved and no temp file is created.  Exon/CDS lines seen
    before the first mRNA of the sequence are skipped.

    Reads the module-level ``evidences_gff3``, ``gene_size_difference``
    and ``classGene``.
    """
    evidences = {}  ### hash exonic co-ordinates by evidence
    CDS_transcript = {}
    exons_transcript = {}
    count = 0
    slack = int(gene_size_difference)
    current_id = None
    marker = chromosome + '\t'
    with open(evidences_gff3, 'r') as handle:
        for raw in handle:
            if marker not in raw:
                continue
            line = raw.strip()
            count += 1
            if not line or line.startswith('#'):
                continue
            if count % 100000 == 0:
                print(' '.join(['Hashing evidence: ',
                                '{:9,.0f}'.format(count)]))
            ### check if it is gff3 format line
            if 'ID=' not in line:
                continue
            obj = classGene.GFF3(line)
            # The marker test is only a substring filter; confirm the column.
            if obj.seqids() != chromosome:
                continue
            feature = obj.types()
            if feature == 'mRNA':
                current_id = str(obj)
                CDS_transcript[current_id] = 0
                exons_transcript[current_id] = []
                by_coord = evidences.setdefault(obj.sources(), {})
                for anchor in (int(obj.starts()), int(obj.ends())):
                    for pos in range(anchor - slack - 1, anchor + slack + 1):
                        by_coord[pos] = current_id
            elif current_id is None:
                continue
            elif feature == 'exon':
                exons_transcript[current_id].append(
                    (int(obj.starts()), int(obj.ends())))
            elif feature == 'CDS':
                CDS_transcript[current_id] += (
                    int(obj.ends()) - int(obj.starts()) + 1)
    find_gene_overlaps(evidences, exons_transcript, CDS_transcript, exons,
                       CDS, chromosome)
def hash_GFF3(chromosome):
    """Label every base of ``chromosome`` covered by an mRNA, exon or CDS.

    Args:
        chromosome: sequence ID a feature must carry to be considered.

    Returns:
        dict mapping coordinate -> ``'mRNA'``, ``'exon'`` or ``'CDS'``.
        Later lines overwrite earlier ones, so the label is that of the
        last feature in file order covering the base.

    Reads the module-level ``gff3`` and ``classGene``.
    """
    count = 0
    coords = {}
    with open(gff3, 'r') as handle:
        for raw in handle:
            line = raw.strip()
            if not line or line.startswith('#'):
                continue
            obj = classGene.GFF3(line)
            if obj.seqids() != chromosome:
                continue
            count += 1
            if count % 1000 == 0:
                print(' '.join(['Number of lines processed: ',
                                str(chromosome),
                                '{:9,.0f}'.format(count)]))
            # The feature type is constant per line; look it up once
            # instead of three times per base.
            feature = obj.types()
            if feature in ('mRNA', 'exon', 'CDS'):
                for pos in range(int(obj.starts()), int(obj.ends()) + 1):
                    coords[pos] = feature
    return coords
def find_avg_gene_density(chromosome, count, gene_count_hash):
    """Count the gene features of ``chromosome`` in ``infile``.

    Args:
        chromosome: sequence ID to count genes for.
        count: starting value of the progress counter.
        gene_count_hash: dict updated in place;
            ``gene_count_hash[chromosome]`` receives the gene count.

    Reading stops at the first feature of another sequence once
    ``chromosome`` has been seen.  The count is therefore complete only if
    all features of a sequence are contiguous in the file.  Returns
    ``None``.  Reads the module-level ``infile`` and ``classGene``.
    """
    seen_chromosome = False
    gene_count = 0
    with open(infile, 'r') as handle:
        for raw in handle:
            line = raw.strip()
            if not line or line.startswith('#'):
                continue
            obj = classGene.GFF3(line)
            count += 1
            if count % 10000 == 0:
                print(' '.join(['Number of lines processed: ',
                                str(chromosome),
                                '{:9,.0f}'.format(count)]))
            if obj.seqids() == chromosome:
                seen_chromosome = True
                if obj.types() == 'gene':
                    gene_count += 1
            elif seen_chromosome:
                break
    gene_count_hash[chromosome] = gene_count
# parseGFF3(EC, Function, Name, GeneComment, ProductType, GO, SYNONYM, pf_out, g)
#
# Visible part: iterates over the non-blank, non-comment lines of the
# module-level `gff3` file. The first time each sequence ID is encountered it
# writes a genetic-element block to `g` (ID, Name, TYPE, CIRCULAR?,
# ANNOT-FILE `<seqid>.pf`, SEQ-FILE `<seqid>.fsa`, terminator `//`).
#
# NOTE(review): this definition is truncated in the visible source. It ends
# in an unterminated ''' and the rest of the body is not shown. The other
# parameters are not used in the visible part. The code is left untouched.
def parseGFF3(EC, Function, Name, GeneComment, ProductType, GO, SYNONYM, pf_out, g): ### Attributes #attributes = ['ID', 'NAME', 'STARTBASE', 'ENDBASE', 'PRODUCT-TYPE','SYNONYM','GENE-COMMENT','FUNCTION','EC','GO','DBLINK','//'] seq = {} for line in open(gff3, 'r'): line = line.strip() if len(line)>0 and not line.startswith('#'): obj = classGene.GFF3(line) if obj.seqids() not in seq: header = obj.seqids() seq[obj.seqids()] = '' t_id = str(obj) g.write('ID\t'+str(header)+'\n') g.write('Name\t'+str(header)+'\n') g.write('TYPE\t:CHRSM\n') g.write('CIRCULAR?\tN\n') g.write('ANNOT-FILE\t'+header+'.pf\n') g.write('SEQ-FILE\t'+header+'.fsa\n') g.write('//'+'\n') '''
def hashGFF3(chrom):
    """Collect the CDS coordinates of ``candidate`` and hand them on.

    Args:
        chrom: sequence ID to select.

    Every CDS on ``chrom`` whose ``get_parent()`` equals the module-level
    ``candidate`` contributes its coordinates (inclusive).  If any were
    found, ``hashAlignment(chrom, cds_coords, start)`` is called.
    ``start`` is the smallest CDS boundary on the ``+`` strand and the
    largest one otherwise.  Returns ``None``.

    Changes from the original:

    * the strand is taken from the candidate's own CDS lines.  The
      original read it from whichever line of the chromosome was parsed
      last, which may belong to a different gene;
    * the sequence ID is compared exactly.  ``line.startswith(chrom)``
      alone also accepts longer names such as 'chr10' for 'chr1'.

    Reads the module-level ``ifile``, ``candidate``, ``hashAlignment`` and
    ``classGene``.
    """
    cds_coords = {}
    boundaries = set()
    strand = None
    with open(ifile, 'r') as handle:
        for raw in handle:
            line = raw.strip()
            if len(line) <= 1 or not line.startswith(chrom):
                continue
            obj = classGene.GFF3(line)
            if obj.seqids() != chrom:
                continue
            if obj.types() == 'CDS' and obj.get_parent() == candidate:
                first, last = int(obj.starts()), int(obj.ends())
                boundaries.add(first)
                boundaries.add(last)
                strand = obj.strands()
                for pos in range(first, last + 1):
                    cds_coords[pos] = ''
    if cds_coords:
        start = min(boundaries) if strand == '+' else max(boundaries)
        hashAlignment(chrom, cds_coords, start)
def _print_exon_fraction_row(t_id, exon_positions, cds_positions, hash_call):
    """Print one tab-separated result row, in the column order of HEADER."""
    exon_call_len = sum(1 for pos in exon_positions if pos in hash_call)
    cds_call_len = sum(1 for pos in cds_positions if pos in hash_call)
    print('\t'.join([
        t_id,
        str(len(exon_positions)),
        str(len(cds_positions)),
        str(exon_call_len),
        str(cds_call_len),
    ]))


def get_exon_fraction(chrom, hash_call):
    """Print exon/CDS lengths and callable lengths per transcript of ``chrom``.

    Args:
        chrom: sequence ID a feature must carry to be considered.
        hash_call: container of callable coordinates (only tested with
            ``in``).

    Output is a tab-separated table with the columns
    ``Lj30_ID, ExonLength, CDSLength, CallableExon, CallableCDS``.  Lengths
    count distinct covered coordinates.

    Changes from the original:

    * the last transcript gets its own statistics.  The original printed
      the previous transcript's numbers under the last ID, and raised a
      NameError when fewer than two transcripts were present;
    * rows follow the header's column order.  The original emitted
      ExonLength, CallableExon, CDSLength, CallableCDS under that header;
    * feature ends are inclusive.  The original ``range(start, end)``
      dropped the last base of every exon and CDS.

    Reads the module-level ``ifile`` and ``classGene``.
    """
    HEADER = 'Lj30_ID\tExonLength\tCDSLength\tCallableExon\tCallableCDS'
    print(HEADER)
    t_id = None
    exon_positions = set()
    cds_positions = set()
    with open(ifile, 'r') as handle:
        for raw in handle:
            if len(raw) <= 1 or raw.startswith('#'):
                continue
            obj = classGene.GFF3(raw.strip())
            if obj.seqids() != chrom:
                continue
            feature = obj.types()
            if feature == "mRNA":
                if t_id is not None:
                    _print_exon_fraction_row(t_id, exon_positions,
                                             cds_positions, hash_call)
                t_id = str(obj)
                exon_positions = set()
                cds_positions = set()
            elif feature == "exon":
                exon_positions.update(
                    range(int(obj.starts()), int(obj.ends()) + 1))
            elif feature == "CDS":
                cds_positions.update(
                    range(int(obj.starts()), int(obj.ends()) + 1))
    if t_id is not None:
        _print_exon_fraction_row(t_id, exon_positions, cds_positions,
                                 hash_call)
def hash_coords(file, chr):
    """Feed the exon boundaries of every mRNA on ``chr`` to ``hash_coords_mRNA``.

    Args:
        file: path of the GFF3 file to read.
        chr: sequence ID a feature must carry to be considered.

    Returns:
        ``(coords_dis, coords_mRNA_len)`` as accumulated by successive
        calls to
        ``hash_coords_mRNA(coords_dis, exons, strand, coords_mRNA_len)``.
        ``exons`` is the flat list ``[start1, end1, start2, end2, ...]``
        of one transcript.

    Changes from the original:

    * each transcript is processed with its own strand.  The original
      overwrote ``strand`` with the new mRNA's strand before flushing the
      previous transcript;
    * when no mRNA on ``chr`` exists, two empty dicts are returned instead
      of raising a NameError on ``strand``.

    Uses the module-level ``hash_coords_mRNA`` and ``classGene``.
    """
    coords_dis = {}
    coords_mRNA_len = {}
    exons = []
    strand = None
    seen_mRNA = False
    with open(file, 'r') as handle:
        for line in handle:
            if len(line) <= 1 or line.startswith('#'):
                continue
            obj = classGene.GFF3(line)
            if obj.seqids() != chr:
                continue
            feature = obj.types()
            if feature == "mRNA":
                if seen_mRNA:
                    # Flush the previous transcript with its own strand.
                    coords_dis, coords_mRNA_len = hash_coords_mRNA(
                        coords_dis, exons, strand, coords_mRNA_len)
                strand = obj.strands()
                exons = []
                seen_mRNA = True
            elif feature == "exon":
                exons.append(int(obj.starts()))
                exons.append(int(obj.ends()))
    ### for last mRNA
    if seen_mRNA:
        coords_dis, coords_mRNA_len = hash_coords_mRNA(
            coords_dis, exons, strand, coords_mRNA_len)
    return coords_dis, coords_mRNA_len
def hashGFF3(chrom, align_hash):
    """Call ``processTranscript`` once per mRNA on ``chrom``.

    Args:
        chrom: sequence ID to select.
        align_hash: forwarded unchanged to ``processTranscript``.

    CDS coordinates (inclusive) are accumulated in a dict.  Each time the
    next mRNA line is reached, ``processTranscript(chrom, cds_coords, tid,
    align_hash)`` is called with the coordinates collected since the
    previous mRNA line and the previous transcript's ID.  Returns ``None``.

    Changes from the original:

    * the last transcript is always processed.  The original only flushed
      it when the final line of the file belonged to ``chrom``;
    * the sequence ID is compared exactly.  ``line.startswith(chrom)``
      alone also accepts longer names such as 'chr10' for 'chr1';
    * the unused ``cds_bound`` dict is removed.

    Reads the module-level ``ifile``, ``processTranscript`` and
    ``classGene``.
    """
    cds_coords = {}
    tid = None
    with open(ifile, 'r') as handle:
        for raw in handle:
            line = raw.strip()
            if len(line) <= 1 or not line.startswith(chrom):
                continue
            obj = classGene.GFF3(line)
            if obj.seqids() != chrom:
                continue
            feature = obj.types()
            if feature == 'CDS':
                for pos in range(int(obj.starts()), int(obj.ends()) + 1):
                    cds_coords[pos] = ''
            elif feature == 'mRNA':
                if tid is not None:
                    processTranscript(chrom, cds_coords, tid, align_hash)
                tid = str(obj)
                # Rebind rather than clear: the previous dict was handed to
                # processTranscript and may still be referenced there.
                cds_coords = {}
    if tid is not None:
        processTranscript(chrom, cds_coords, tid, align_hash)
def hash_annotations(gff3, chro):
    """Build coordinate tables for every mRNA on ``chro``.

    Args:
        gff3: path of the GFF3 file to read.
        chro: sequence ID a feature must carry to be considered.

    Returns:
        ``(exons, cds, utr, intron, mRNA_type)`` as accumulated by
        ``addCoords(gs, exons, cds, utr, intron, mRNA_type)``.  It is
        called once per transcript with the
        ``classGeneStructure.GeneStructure`` built from the mRNA line and
        its following exon and CDS lines.

    Changes from the original:

    * five empty dicts are returned when ``chro`` has no mRNA.  The
      original raised a NameError on ``gs``;
    * exon/CDS lines preceding the first mRNA of ``chro`` are skipped;
    * unused locals are removed.

    Uses the module-level ``addCoords``, ``classGene`` and
    ``classGeneStructure``.
    """
    exons = {}
    cds = {}
    utr = {}
    intron = {}
    mRNA_type = {}
    gs = None  # gene structure of the transcript being collected
    with open(gff3, 'r') as handle:
        for raw in handle:
            line = raw.strip()
            if len(line) <= 1 or line.startswith('#'):
                continue
            obj = classGene.GFF3(line)
            if obj.seqids() != chro:
                continue
            feature = obj.types()
            if feature == 'mRNA':
                if gs is not None:
                    exons, cds, utr, intron, mRNA_type = addCoords(
                        gs, exons, cds, utr, intron, mRNA_type)
                gs = classGeneStructure.GeneStructure(obj)
                gs.addmRNA(obj)
            elif gs is None:
                continue
            elif feature == 'exon':
                gs.addexon(obj)
            elif feature == 'CDS':
                gs.addcds(obj)
    if gs is not None:
        exons, cds, utr, intron, mRNA_type = addCoords(
            gs, exons, cds, utr, intron, mRNA_type)
    return exons, cds, utr, intron, mRNA_type
def _covered_positions(intervals):
    """Return the set of coordinates covered by inclusive (start, end) pairs."""
    positions = set()
    for start, end in intervals:
        positions.update(range(start, end + 1))
    return positions


def find_gene_overlaps(evidences, exons_transcript, CDS_transcript, exons,
                       CDS, chromosome, start_min, start_max):
    """Report, per transcript, the evidence transcript with more coding sequence.

    Args:
        evidences: ``{source: {coordinate: evidence transcript ID}}``.
        exons_transcript: evidence transcript ID -> list of ``(start, end)``.
        CDS_transcript: evidence transcript ID -> CDS length.
        exons: transcript ID -> list of ``(start, end)``.
        CDS: transcript ID -> CDS length.
        chromosome: sequence ID a transcript must carry to be considered.
        start_min: inclusive lower bound for the mRNA start coordinate.
        start_max: exclusive upper bound for the mRNA start coordinate.

    For every mRNA of ``last_gff3`` in the window, each evidence source is
    consulted.  An evidence transcript is a candidate when all of these
    hold:

    * both the mRNA start and end are keys of that source's table;
    * the transcript IDs found at the two coordinates both differ from
      the mRNA's ID;
    * its CDS is longer than ``CDS[ID] * (1 + min_coding_differences)``;
    * the two IDs differ after removing the last ``'.'``-suffix;
    * it covers fewer than ``max_exonic_differences`` exonic bases that
      the mRNA does not cover.

    The candidate with the smallest CDS gain is written to ``outfile``,
    the first one on ties.  The tab-separated row holds: ID, evidence ID,
    source, sequence ID, start, CDS gain, extra exonic bases.  Returns
    ``None``.

    Reads the module-level ``last_gff3``, ``min_coding_differences``,
    ``max_exonic_differences``, ``outfile`` and ``classGene``.
    """
    count = 0
    with open(last_gff3, 'r') as handle:
        for raw in handle:
            line = raw.strip()
            if not line or line.startswith('#'):
                continue
            count += 1
            if count % 10000 == 0:
                print(' '.join(['find_gene_overlaps: ',
                                '{:9,.0f}'.format(count)]))
            obj = classGene.GFF3(line)
            if not (obj.types() == 'mRNA' and obj.seqids() == chromosome
                    and start_min <= int(obj.starts()) < start_max):
                continue

            ID = str(obj)
            start = int(obj.starts())
            end = int(obj.ends())
            base_id = '.'.join(ID.split('.')[:-1])
            own_positions = None  # built lazily, at most once per mRNA

            ### find the best replacement
            candidates = []  # (CDS gain, report row), in discovery order
            for source in evidences:
                by_coord = evidences[source]
                if start not in by_coord or end not in by_coord:
                    continue
                if ID == by_coord[start] or ID == by_coord[end]:
                    continue
                evidence_id = by_coord[start]
                if not (CDS[ID] + float(min_coding_differences) * CDS[ID]
                        < CDS_transcript[evidence_id] and CDS[ID] >= 0):
                    continue
                ### skip evidence that shares the ID prefix of this mRNA
                if base_id == '.'.join(evidence_id.split('.')[:-1]):
                    continue
                ### check the exonic difference
                if own_positions is None:
                    own_positions = _covered_positions(exons[ID])
                extra = len(
                    _covered_positions(exons_transcript[evidence_id])
                    - own_positions)
                if extra < int(max_exonic_differences):
                    gain = CDS_transcript[evidence_id] - CDS[ID]
                    row = '\t'.join([ID, evidence_id, source,
                                     str(obj.seqids()), str(obj.starts()),
                                     str(gain), str(extra)])
                    candidates.append((gain, row))

            if candidates:
                # min() returns the first minimal element, matching the
                # original ``list.index(min(...))`` selection.
                best = min(candidates, key=lambda item: item[0])
                outfile.write(best[1] + '\n')
def _gff3_row(obj, start, end):
    """Rebuild a GFF3 line from ``obj`` with the given start and end columns."""
    return '\t'.join([
        obj.seqids(),
        obj.sources(),
        obj.types(),
        str(start),
        str(end),
        obj.scores(),
        obj.strands(),
        str(obj.phases()),
        obj.attributes(),
    ])


def _print_gene_block(gene_lines, gene_start, gene_end):
    """Print a gene line with corrected bounds, then its child lines."""
    print(_gff3_row(classGene.GFF3(gene_lines[0]), gene_start, gene_end))
    for child in gene_lines[1:]:
        print(child)


def FixBoundries():
    """Print ``infile`` with gene and child-feature boundaries made consistent.

    * A gene's start/end are widened to include all of its mRNAs.
    * For every other feature, a start or end coordinate that lies outside
      the most recent mRNA's span is replaced by the mRNA's start or end
      respectively.  mRNA lines themselves are printed unchanged.

    Output is grouped per gene: the corrected gene line followed by its
    mRNA and child lines in input order.  Blank and comment lines are
    dropped.

    Changes from the original:

    * when both coordinates of a child lie outside the mRNA, both are
      corrected.  The original's second rewrite started again from the
      unmodified start, so the start correction was lost;
    * input without any gene no longer raises IndexError at the end;
    * the span test compares against the mRNA bounds instead of building a
      per-base dict for every mRNA.

    Reads the module-level ``infile`` and ``classGene``.
    """
    gene = []
    with open(infile, 'r') as handle:
        for raw in handle:
            line = raw.strip()
            if not line or line.startswith('#'):
                continue
            obj = classGene.GFF3(line)
            feature = obj.types()
            if feature == 'gene':
                if gene:
                    _print_gene_block(gene, gene_start, gene_end)
                gene = [line]
                gene_start = int(obj.starts())
                gene_end = int(obj.ends())
            elif feature == 'mRNA':
                gene.append(line)
                mRNA_start = int(obj.starts())
                mRNA_end = int(obj.ends())
                gene_start = min(gene_start, mRNA_start)
                gene_end = max(gene_end, mRNA_end)
            else:
                start_ok = mRNA_start <= int(obj.starts()) <= mRNA_end
                end_ok = mRNA_start <= int(obj.ends()) <= mRNA_end
                if not (start_ok and end_ok):
                    new_start = obj.starts() if start_ok else mRNA_start
                    new_end = obj.ends() if end_ok else mRNA_end
                    line = _gff3_row(obj, new_start, new_end)
                gene.append(line)
    if gene:
        _print_gene_block(gene, gene_start, gene_end)
def _write_replacement(out, record, new_id, ID, g_id, source):
    """Write the replacement mRNA, exon and CDS lines under the old ``ID``."""
    cols = record['mRNA'].split('\t')
    ### print new mRNA
    attrs = (cols[8].split("Parent=")[0].replace(new_id, ID)
             + "Parent=" + g_id + ";Name=" + ID)
    out.write('\t'.join([cols[0], source] + cols[2:8] + [attrs]) + '\n')

    ### print exon lines of the lines
    for row in record['exon'].split(',')[1:]:
        cols = row.replace(new_id, ID).split('\t')
        out.write(cols[0] + '\t' + source + '\t'
                  + '\t'.join(cols[2:]) + '\n')

    ### print CDS lines of the lines
    for cds_count, row in enumerate(record['CDS'].split(',')[1:], 1):
        cols = row.replace(new_id, ID).split('\t')
        # The second replace mirrors the original and is kept so the
        # output stays byte-identical.
        attrs = (cols[8].replace(new_id, ID).split("ID=")[0]
                 + "ID=" + ID + '.CDS.' + str(cds_count)
                 + ";Parent=" + ID)
        out.write('\t'.join([cols[0], source] + cols[2:8] + [attrs]) + '\n')


def make_gff3(reaplcement_IDs, hash_combine):
    """Write ``last_gff3 + '.replaced'`` with selected transcripts swapped.

    Args:
        reaplcement_IDs: mapping old transcript ID -> replacement
            transcript ID.
        hash_combine: mapping replacement transcript ID -> dict with the
            keys ``'mRNA'`` (one line) and ``'exon'`` / ``'CDS'``.  The
            latter two are strings whose lines are each prefixed by a
            comma.

    Gene lines are copied.  For an mRNA listed in ``reaplcement_IDs`` whose
    replacement lies on the same sequence, the replacement's mRNA, exon
    and CDS lines are written instead.  They carry the current gene's
    source column, the old transcript ID substituted for the replacement
    ID, ``Parent=<gene ID>;Name=<old ID>`` on the mRNA, and
    ``ID=<old ID>.CDS.<n>;Parent=<old ID>`` on each CDS.  The old
    transcript's own child lines are then dropped.  All other transcripts
    and their children are copied unchanged.  Blank and comment lines are
    dropped.  Returns ``None``.

    Changes from the original:

    * a transcript whose replacement is on a different sequence is kept
      as-is.  The original wrote nothing for it and left ``print_flag`` at
      its previous value, so its children were kept or dropped depending
      on the preceding transcript;
    * ``print_flag`` is initialised.  Non-gene lines preceding the first
      mRNA used to raise a NameError and are now copied through;
    * both files are closed even when an error occurs.

    Reads the module-level ``last_gff3`` and ``classGene``.
    """
    print_flag = True
    with open(last_gff3 + '.replaced', 'w') as out, \
            open(last_gff3, 'r') as handle:
        for count, raw in enumerate(handle, 1):
            line = raw.strip()
            if not line or line.startswith('#'):
                continue
            if count % 100000 == 0:
                print(' '.join(['Printing final GFF3: ',
                                '{:9,.0f}'.format(count)]))
            obj = classGene.GFF3(line)
            feature = obj.types()

            if feature == "gene":
                out.write(line + '\n')
                source = obj.sources()
                g_id = str(obj)
                continue

            if feature != "mRNA":
                if print_flag:
                    out.write(line + '\n')
                continue

            ID = str(obj)
            if ID in reaplcement_IDs:
                new_id = reaplcement_IDs[ID]
                record = hash_combine[new_id]
                if obj.seqids() == record['mRNA'].split('\t')[0]:
                    _write_replacement(out, record, new_id, ID, g_id, source)
                    print_flag = False
                    continue

            print_flag = True
            out.write(line + '\n')