def Augustus(tool_pred, genome):
    """Collect the CDS predictions from an Augustus output file.

    Every whitespace-separated row with exactly 12 tokens whose third token
    contains ``CDS`` is turned into an entry keyed by ``"start,stop"``.

    Args:
        tool_pred: Path of the Augustus prediction file to read.
        genome: Sliceable genome sequence (presumably a nucleotide string).
            File coordinates are treated as 1-based and inclusive.

    Returns:
        Whatever ``sortORFs`` returns for an ``OrderedDict`` mapping
        ``"start,stop"`` to ``[strand, start_codon, stop_codon]``.
    """
    augustus_ORFs = collections.OrderedDict()
    genome_size = len(genome)
    # Presumably the reverse complement of the genome; used for '-' strand rows.
    genome_rev = revCompIterative(genome)
    with open(tool_pred, 'r') as Augustus_input:
        for line in Augustus_input:
            fields = line.split()
            if len(fields) != 12 or "CDS" not in fields[2]:
                continue
            start = int(fields[3])
            stop = int(fields[4])
            strand = fields[6]
            if '-' in strand:
                # Translate forward coordinates into offsets on the reversed
                # sequence: the forward 'stop' is where the gene begins there.
                r_start = genome_size - stop
                r_stop = genome_size - start
                startCodon = genome_rev[r_start:r_start + 3]
                stopCodon = genome_rev[r_stop - 2:r_stop + 1]
            elif '+' in strand:
                startCodon = genome[start - 1:start + 2]
                stopCodon = genome[stop - 3:stop]
            else:
                # No usable strand: the codons cannot be determined, and
                # falling through would reuse the previous row's codons.
                continue
            po = '{},{}'.format(start, stop)
            augustus_ORFs[po] = [strand, startCodon, stopCodon]
    return sortORFs(augustus_ORFs)
def FGENESB(tool_pred, genome):
    """Collect the gene predictions from an FGENESB output file.

    Only lines containing ``>GENE`` are considered. After whitespace
    splitting, a line must have exactly 10 tokens and its first token must
    contain ``>GENE``. Start, stop and strand are read from tokens 2, 4
    and 9 respectively.

    Args:
        tool_pred: Path of the FGENESB prediction file to read.
        genome: Sliceable genome sequence (presumably a nucleotide string).
            File coordinates are treated as 1-based and inclusive.

    Returns:
        Whatever ``sortORFs`` returns for an ``OrderedDict`` mapping
        ``"start,stop"`` to ``[strand, start_codon, stop_codon]``.
    """
    FGENESB_ORFs = collections.OrderedDict()
    genome_size = len(genome)
    # Presumably the reverse complement of the genome; used for '-' strand rows.
    genome_rev = revCompIterative(genome)
    with open(tool_pred, 'r') as FGENESB_input:
        for line in FGENESB_input:
            if '>GENE' not in line:
                continue
            fields = line.split()
            if len(fields) != 10 or ">GENE" not in fields[0]:
                continue
            start = int(fields[2])
            stop = int(fields[4])
            strand = fields[9]
            if '-' in strand:
                # Translate forward coordinates into offsets on the reversed
                # sequence: the forward 'stop' is where the gene begins there.
                r_start = genome_size - stop
                r_stop = genome_size - start
                startCodon = genome_rev[r_start:r_start + 3]
                stopCodon = genome_rev[r_stop - 2:r_stop + 1]
            elif '+' in strand:
                startCodon = genome[start - 1:start + 2]
                stopCodon = genome[stop - 3:stop]
            else:
                # No usable strand: skip rather than reuse the codons left
                # over from the previous row.
                continue
            po = '{},{}'.format(start, stop)
            FGENESB_ORFs[po] = [strand, startCodon, stopCodon]
    return sortORFs(FGENESB_ORFs)
def GeneMark_HA(tool_pred, genome):
    """Collect the CDS predictions from a GeneMark_HA output file.

    A whitespace-separated row is used when its sixth token contains ``CDS``.
    Start, stop and strand are read from tokens 6, 7 and 9 respectively.

    Args:
        tool_pred: Path of the GeneMark_HA prediction file to read.
        genome: Sliceable genome sequence (presumably a nucleotide string).
            File coordinates are treated as 1-based and inclusive.

    Returns:
        Whatever ``sortORFs`` returns for an ``OrderedDict`` mapping
        ``"start,stop"`` to ``[strand, start_codon, stop_codon]``.
    """
    geneMark_HA_ORFs = collections.OrderedDict()
    genome_size = len(genome)
    # Presumably the reverse complement of the genome; used for '-' strand rows.
    genome_rev = revCompIterative(genome)
    with open(tool_pred, 'r') as GeneMark_HA_input:
        for line in GeneMark_HA_input:
            fields = line.split()
            # The strand lives at index 9, so ten tokens are the real minimum;
            # a nine-token row would otherwise raise IndexError below.
            if len(fields) < 10 or "CDS" not in fields[5]:
                continue
            start = int(fields[6])
            stop = int(fields[7])
            strand = fields[9]
            if '-' in strand:
                # Translate forward coordinates into offsets on the reversed
                # sequence: the forward 'stop' is where the gene begins there.
                r_start = genome_size - stop
                r_stop = genome_size - start
                startCodon = genome_rev[r_start:r_start + 3]
                stopCodon = genome_rev[r_stop - 2:r_stop + 1]
            elif '+' in strand:
                startCodon = genome[start - 1:start + 2]
                stopCodon = genome[stop - 3:stop]
            else:
                # No usable strand: skip rather than reuse the codons left
                # over from the previous row.
                continue
            po = '{},{}'.format(start, stop)
            geneMark_HA_ORFs[po] = [strand, startCodon, stopCodon]
    return sortORFs(geneMark_HA_ORFs)
def GLIMMER_3(tool_pred, genome):
    """Collect the ORF predictions from a GLIMMER 3 output file.

    Lines containing ``>`` are ignored (so this will not work with multiple
    contigs). Any other line is used when it splits into exactly 5 tokens
    and its first token contains ``orf``. GLIMMER 3 lists the start and stop
    of negative-strand ORFs in reverse order, so they are swapped here to
    give forward-strand coordinates with ``start < stop``.

    Args:
        tool_pred: Path of the GLIMMER 3 prediction file to read.
        genome: Sliceable genome sequence (presumably a nucleotide string).
            File coordinates are treated as 1-based and inclusive.

    Returns:
        Whatever ``sortORFs`` returns for an ``OrderedDict`` mapping
        ``"start,stop"`` to ``[strand, start_codon, stop_codon]``, where
        strand is normalised to ``'+'`` or ``'-'``.
    """
    GLIMMER_ORFs = collections.OrderedDict()
    genome_size = len(genome)
    # Presumably the reverse complement of the genome; used for '-' strand rows.
    genome_rev = revCompIterative(genome)
    with open(tool_pred, 'r') as glimmer_input:
        for line in glimmer_input:
            if '>' in line:  # Header line - this will not work with multiple contigs
                continue
            fields = line.split()
            if len(fields) != 5 or "orf" not in fields[0]:
                continue
            frame = fields[3]
            if '-' in frame:
                # Columns are swapped so coordinates match the sense strand.
                start = int(fields[2])
                stop = int(fields[1])
                strand = '-'
                r_start = genome_size - stop
                r_stop = genome_size - start
                startCodon = genome_rev[r_start:r_start + 3]
                stopCodon = genome_rev[r_stop - 2:r_stop + 1]
            elif '+' in frame:
                start = int(fields[1])
                stop = int(fields[2])
                strand = '+'
                # Exactly three bases: [start - 1, start + 2) in 0-based terms.
                startCodon = genome[start - 1:start + 2]
                stopCodon = genome[stop - 3:stop]
            else:
                # No usable strand: skip rather than reuse the coordinates
                # and codons left over from the previous row.
                continue
            po = '{},{}'.format(start, stop)
            GLIMMER_ORFs[po] = [strand, startCodon, stopCodon]
    return sortORFs(GLIMMER_ORFs)
def GFF(tool_pred, genome):
    """Collect the CDS features from a tab-separated GFF file.

    Any line containing ``#`` anywhere is ignored. Remaining lines are split
    on tabs; rows whose third column contains ``CDS`` are used. Rows with
    fewer than three columns (for example blank lines) are skipped.

    Args:
        tool_pred: Path of the GFF file to read.
        genome: Sliceable genome sequence (presumably a nucleotide string).
            File coordinates are treated as 1-based and inclusive.

    Returns:
        Whatever ``sortORFs`` returns for an ``OrderedDict`` mapping
        ``"start,stop"`` to ``[strand, start_codon, stop_codon]``.

    Raises:
        SystemExit: With the message ``"SAS"`` when a CDS row does not have
            exactly 9 tab-separated columns.
    """
    GFF_ORFs = collections.OrderedDict()
    genome_size = len(genome)
    # Presumably the reverse complement of the genome; used for '-' strand rows.
    genome_rev = revCompIterative(genome)
    with open(tool_pred, 'r') as gff_input:
        for line in gff_input:
            if '#' in line:
                continue
            fields = line.split('\t')
            # Check the length before indexing so blank or short lines are
            # skipped instead of raising IndexError.
            if len(fields) < 3 or "CDS" not in fields[2]:
                continue
            if len(fields) != 9:
                # A CDS row with the wrong column count aborts the run.
                sys.exit("SAS")
            start = int(fields[3])
            stop = int(fields[4])
            strand = fields[6]
            if '-' in strand:
                # Translate forward coordinates into offsets on the reversed
                # sequence: the forward 'stop' is where the gene begins there.
                r_start = genome_size - stop
                r_stop = genome_size - start
                startCodon = genome_rev[r_start:r_start + 3]
                stopCodon = genome_rev[r_stop - 2:r_stop + 1]
            elif '+' in strand:
                startCodon = genome[start - 1:start + 2]
                stopCodon = genome[stop - 3:stop]
            else:
                # No usable strand: skip rather than reuse the codons left
                # over from the previous row.
                continue
            po = '{},{}'.format(start, stop)
            GFF_ORFs[po] = [strand, startCodon, stopCodon]
    return sortORFs(GFF_ORFs)
def GeneMark(tool_pred, genome):
    """Collect the ORF predictions from a GeneMark output file.

    Rows that split into exactly 7 whitespace-separated tokens are table
    rows; those whose third token contains ``direct`` or ``complement`` carry
    a prediction, with start and stop in the first two tokens.

    GeneMark lists several candidates per ORF set, so consecutive rows are
    de-duplicated (per the original author, to select the longest ORF from
    each set): a ``complement`` row is recorded only if its start differs
    from the previous prediction row's start, and a ``direct`` row only if
    its stop differs from the previous prediction row's stop. A blank line
    after the first table row resets that memory.

    Args:
        tool_pred: Path of the GeneMark prediction file to read.
        genome: Sliceable genome sequence (presumably a nucleotide string).
            File coordinates are treated as 1-based and inclusive.

    Returns:
        Whatever ``sortORFs`` returns for an ``OrderedDict`` mapping
        ``"start,stop"`` to ``[strand, start_codon, stop_codon]``, with
        strand written as ``'+'`` or ``'-'``.
    """
    predictions = collections.OrderedDict()
    genome_length = len(genome)
    # Presumably the reverse complement of the genome; used for complement rows.
    reversed_genome = revCompIterative(genome)
    last_start = 0
    last_stop = 0
    table_seen = False
    with open(tool_pred, 'r') as handle:
        for raw_line in handle:
            tokens = raw_line.split()
            if len(tokens) != 7:
                # A blank line once the table has begun closes the current set.
                if not tokens and table_seen:
                    last_stop = 0
                    last_start = 0
                continue
            table_seen = True
            orientation = tokens[2]
            if 'direct' not in orientation and 'complement' not in orientation:
                continue
            start = int(tokens[0])
            stop = int(tokens[1])
            key = str(start) + ',' + str(stop)
            if 'complement' in orientation:
                if start != last_start:
                    # Offsets of the gene on the reversed sequence.
                    rev_begin = genome_length - stop
                    rev_end = genome_length - start
                    first_codon = reversed_genome[rev_begin:rev_begin + 3]
                    last_codon = reversed_genome[rev_end - 2:rev_end + 1]
                    predictions.update({key: ['-', first_codon, last_codon]})
            elif 'direct' in orientation:
                if stop != last_stop:
                    first_codon = genome[start - 1:start + 2]
                    last_codon = genome[stop - 3:stop]
                    predictions.update({key: ['+', first_codon, last_codon]})
            # Remember this row whether or not it was recorded.
            last_start = start
            last_stop = stop
    predictions = sortORFs(predictions)
    return predictions