def check_F2(refseq, spec, VERBOSE=0): '''Check fragment F2: gag, pol''' check = check_length_fragment(refseq, 'F2' + spec, VERBOSE=VERBOSE, tolerance=80) if not check: return False # Check gag (there should be end) genename = 'gag' (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE) if (not end_found): print 'ERROR: end of ' + genename + ' not found in F2!' return False elif VERBOSE >= 3: print 'OK: end of ' + genename + ' found' geneseq = refseq[:end] geneseq = geneseq[len(geneseq) % 3:] gene = geneseq.seq prot = gene.translate() check = check_has_end(prot, 'gag', VERBOSE=VERBOSE) if not check: return False check = check_has_premature_stops(prot, 'gag', VERBOSE=VERBOSE) if not check: return False # Check pol (there should be the start) genename = 'pol' (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE) if (not start_found): print 'ERROR: start of ' + genename + ' not found in F2!' return False elif VERBOSE >= 3: print 'OK: start of ' + genename + ' found' geneseq = refseq[start:] geneseq = geneseq[:len(geneseq) - len(geneseq) % 3] gene = geneseq.seq prot = gene.translate() check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE) if not check: return False check = check_has_premature_stops_noend(prot, genename, VERBOSE=VERBOSE) if not check: return False return True
def check_F2(refseq, spec, VERBOSE=0): '''Check fragment F2: gag, pol''' check = check_length_fragment(refseq, 'F2'+spec, VERBOSE=VERBOSE, tolerance=80) if not check: return False # Check gag (there should be end) genename = 'gag' (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE) if (not end_found): print 'ERROR: end of '+genename+' not found in F2!' return False elif VERBOSE >= 3: print 'OK: end of '+genename+' found' geneseq = refseq[:end] geneseq = geneseq[len(geneseq) % 3:] gene = geneseq.seq prot = gene.translate() check = check_has_end(prot, 'gag', VERBOSE=VERBOSE) if not check: return False check = check_has_premature_stops(prot, 'gag', VERBOSE=VERBOSE) if not check: return False # Check pol (there should be the start) genename = 'pol' (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE) if (not start_found): print 'ERROR: start of '+genename+' not found in F2!' return False elif VERBOSE >= 3: print 'OK: start of '+genename+' found' geneseq = refseq[start:] geneseq = geneseq[:len(geneseq) - len(geneseq) % 3] gene = geneseq.seq prot = gene.translate() check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE) if not check: return False check = check_has_premature_stops_noend(prot, genename, VERBOSE=VERBOSE) if not check: return False return True
def check_F3(refseq, spec, VERBOSE=0): '''Check fragment F3: end of pol''' check = check_length_fragment(refseq, 'F3' + spec, VERBOSE=VERBOSE, tolerance=50) if not check: return False # Check pol: this depends on the spec: for F3bo there should be the end, # anything else has only the middle (it's all pol!) genename = 'pol' if spec == 'bo': (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE) if (not end_found): print 'ERROR: end of ' + genename + ' not found in F3!' return False elif VERBOSE >= 3: print 'OK: end of ' + genename + ' found' geneseq = refseq[:end] geneseq = geneseq[len(geneseq) % 3:] gene = geneseq.seq prot = gene.translate() check = check_has_end(prot, genename, VERBOSE=VERBOSE) if not check: return False check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE) if not check: return False else: # Try all 3 reading frames for offset in xrange(3): geneseq = refseq[offset:] geneseq = geneseq[:len(geneseq) - (len(geneseq) % 3)] gene = geneseq.seq prot = gene.translate() check = check_has_premature_stops_noend(prot, genename, VERBOSE=0) if check: if VERBOSE >= 3: print 'OK: ' + genename + ' has no premature stop codons' break else: if VERBOSE >= 1: print 'ERROR: ' + genename + ' has premature stop codons in all reading frames!' return False return True
def check_F3(refseq, spec, VERBOSE=0): '''Check fragment F3: end of pol''' check = check_length_fragment(refseq, 'F3'+spec, VERBOSE=VERBOSE, tolerance=50) if not check: return False # Check pol: this depends on the spec: for F3bo there should be the end, # anything else has only the middle (it's all pol!) genename = 'pol' if spec == 'bo': (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE) if (not end_found): print 'ERROR: end of '+genename+' not found in F3!' return False elif VERBOSE >= 3: print 'OK: end of '+genename+' found' geneseq = refseq[:end] geneseq = geneseq[len(geneseq) % 3:] gene = geneseq.seq prot = gene.translate() check = check_has_end(prot, genename, VERBOSE=VERBOSE) if not check: return False check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE) if not check: return False else: # Try all 3 reading frames for offset in xrange(3): geneseq = refseq[offset:] geneseq = geneseq[: len(geneseq) - (len(geneseq) % 3)] gene = geneseq.seq prot = gene.translate() check = check_has_premature_stops_noend(prot, genename, VERBOSE=0) if check: if VERBOSE >= 3: print 'OK: '+genename+' has no premature stop codons' break else: if VERBOSE >= 1: print 'ERROR: '+genename+' has premature stop codons in all reading frames!' return False return True
def check_genomewide(refseq, VERBOSE=0): '''Check the integrity of all genes in the genomewide consensus''' # Check single-exon genes length_tolerance = {'gag': 30, 'pol': 30, 'env': 70, 'vpr': 15, 'vpu': 15} for genename, tol in length_tolerance.iteritems(): (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE) if (not start_found) or (not end_found): print 'ERROR: '+genename+' not found in genomewide!' return False elif VERBOSE >= 3: print 'OK: start and end of '+genename+' found' gene_HXB2 = get_gene_HXB2(genename) check = check_has_similar_length(end - start, len(gene_HXB2), genename, VERBOSE=VERBOSE, maxdiff=tol) if not check: return False geneseq = refseq[start: end] gene = geneseq.seq check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE) if not check: # sometimes the gene ends a few nucleotides upstream, and there is a # frameshift mutation that screws up gene_new = refseq.seq[start:] gene_new = gene_new[:len(gene_new) - (len(gene_new) % 3)] prot_new = gene_new.translate() end_new = prot_new.find('*') end_diff = start + (3 * end_new + 3) - end if -90 < end_diff < 0: print genename.upper()+' ENDS '+str((end - start) // 3 - end_new - 1)+' AMINO ACIDS UPSTREAM!' gene = gene_new[:3 * (end_new + 1)] else: return False prot = gene.translate() check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE) if (not check): if genename != 'vpu': return False else: print 'ERROR IN VPU STARTING CODON, CONTINUING!' check = check_has_end(prot, genename, VERBOSE=VERBOSE) if not check: # sometimes a gene is a bit longer gene_new = refseq.seq[start:] gene_new = gene_new[:len(gene_new) - (len(gene_new) % 3)] prot_new = gene_new.translate() end_new = prot_new.find('*') end_diff = start + (3 * end_new + 3) - end if -90 < end_diff < 0: print genename.upper()+' ENDS '+str((end - start) // 3 - end_new - 1)+' AMINO ACIDS UPSTREAM!' 
gene = gene_new[:3 * (end_new + 1)] prot = gene.translate() elif 0 < end_diff < 90: print genename.upper()+' ENDS '+str(end_new + 1 - (end - start) // 3)+' AMINO ACIDS DOWNSTREAM!' gene = gene_new[:3 * (end_new + 1)] prot = gene.translate() else: return False check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE) if not check: return False # Vif is special because it can be longer than in HXB2 genename = 'vif' (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE) if (not start_found) or (not end_found): print 'ERROR: '+genename+' not found in genomewide!' return False elif VERBOSE >= 3: print 'OK: start and end of '+genename+' found' gene_HXB2 = get_gene_HXB2(genename) check = check_has_similar_length(end - start, len(gene_HXB2), genename, VERBOSE=VERBOSE, maxdiff=15) if not check: return False geneseq = refseq[start: end] gene = geneseq.seq check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE) if not check: return False prot = gene.translate() check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE) if not check: return False check = check_has_end(prot, genename, VERBOSE=0) if not check: # Vif tends to be a bit longer than in HXB2 for nc in xrange(1, 4): gene_ext = refseq[start: end + 3 * nc].seq prot_ext = gene_ext.translate() check = check_has_end(prot_ext, genename, VERBOSE=0) if check: gene = gene_ext prot = prot_ext if VERBOSE: print 'WARNING: '+genename+' actually ends '+str(nc)+' codons downstream' break else: print 'ERROR: '+genename+' does not end, not even slightly downstream' return False check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE) if not check: return False # Check 2-exon genes for genename_whole in ('tat', 'rev'): genename = genename_whole+'1' (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE) if (not start_found) or (not end_found): print 'ERROR: '+genename+' not found in genomewide!' 
return False elif VERBOSE >= 3: print 'OK: start and end of '+genename+' found' gene_HXB2 = get_gene_HXB2(genename) check = check_has_similar_length(end - start, len(gene_HXB2), genename, VERBOSE=VERBOSE, maxdiff=15) if not check: return False geneseq = refseq[start: end] geneseq = geneseq[:len(geneseq) - len(geneseq) % 3] gene = geneseq.seq prot = gene.translate() check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE) if not check: return False start_exon1 = start end_exon1 = end genename = genename_whole+'2' (start, end, start_found, end_found) = locate_gene(refseq[end_exon1 + 2000:], genename, VERBOSE=VERBOSE) if (not start_found) or (not end_found): print 'ERROR: '+genename+' not found in genomewide!' return False elif VERBOSE >= 3: print 'OK: start and end of '+genename+' found' start += end_exon1 + 2000 end += end_exon1 + 2000 # NOTE: rev2 overlaps with env gp41 and can have insertions or deletions if genename == 'rev2': tol = 45 else: tol = 15 gene_HXB2 = get_gene_HXB2(genename) check = check_has_similar_length(end - start, len(gene_HXB2), genename, VERBOSE=VERBOSE, maxdiff=tol) if not check: return False geneseq = refseq[start: end] frame = get_frame(geneseq, gene_HXB2, genename, VERBOSE=VERBOSE) geneseq = geneseq[frame:] gene = geneseq.seq prot = gene.translate() check = check_has_end(prot, genename, VERBOSE=VERBOSE) if not check: if genename != 'rev2': return False else: # rev2 can end a bit early end_new = prot.rfind('*') if end_new != -1: if len(prot) - 1 - end_new < 20: print 'REV2 ENDS '+str(len(prot) - end_new - 1)+' AMINO ACIDS UPSTREAM!' 
prot = prot[:end_new + 1] end = start + frame + 3 * (end_new + 1) else: return False else: # rev2 can also end quite a bit late gene_new = refseq.seq[start:] gene_new = gene_new[(end - start) % 3:] gene_new = gene_new[:len(gene_new) - (len(gene_new) % 3)] prot_new = gene_new.translate() end_new = prot_new.find('*') if (start + 3 * end_new) - end < 200: print 'REV2 ENDS '+str(end_new - len(prot) + 1)+' AMINO ACIDS DOWNSTREAM!' prot = prot_new[:end_new + 1] end = start + ((end - start) % 3) + 3 * (end_new + 1) else: return False check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE) if not check: return False start_exon2 = start end_exon2 = end genename = genename_whole gene_HXB2 = get_gene_HXB2(genename) from Bio.SeqFeature import FeatureLocation gene_loc = FeatureLocation(start_exon1, end_exon1, strand=+1) + \ FeatureLocation(start_exon2, end_exon2, strand=+1) geneseq = gene_loc.extract(refseq) gene = geneseq.seq check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE) if not check: return False prot = gene.translate() check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE) if not check: return False check = check_has_end(prot, genename, VERBOSE=VERBOSE) if not check: return False check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE) if not check: return False return True
def check_F6(refseq, spec, VERBOSE=0): '''Check fragment F6: end of env, tat2, rev2''' check = check_length_fragment(refseq, 'F6'+spec, VERBOSE=VERBOSE, tolerance=50) if not check: return False # Check env (there should be end) genename = 'env' (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE) if (not end_found): print 'ERROR: end of '+genename+' not found in F6!' return False elif VERBOSE >= 3: print 'OK: end of '+genename+' found' geneseq = refseq[:end] gene_HXB2 = get_gene_HXB2(genename) frame = get_frame(geneseq, gene_HXB2, genename) geneseq = geneseq[frame:] geneseq = geneseq[:len(geneseq) - (len(geneseq) % 3)] gene = geneseq.seq prot = gene.translate() check = check_has_end(prot, genename, VERBOSE=VERBOSE) # env can end a bit early or late if not check: gene_new = refseq.seq[frame:] gene_new = gene_new[:len(gene_new) - (len(gene_new) % 3)] prot_new = gene_new.translate() end_new = prot_new.find('*') end_diff = (frame + 3 * end_new) - end if 0 < end_diff < 200: print 'ENV ENDS '+str(end_new - len(prot) + 1)+' AMINO ACIDS DOWNSTREAM!' prot = prot_new[:end_new + 1] elif -200 < end_diff < 0: print 'ENV ENDS '+str(len(prot) - 1 - end_new)+' AMINO ACIDS UPSTREAM!' prot = prot_new[:end_new + 1] else: return False check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE) if not check: print prot return False # Check tat2 (second exon of tat, should be complete) genename = 'tat2' (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE) if (not start_found) or (not end_found): print 'ERROR: '+genename+' not found in F6!' 
return False elif VERBOSE >= 3: print 'OK: start and end of '+genename+' found' gene_HXB2 = get_gene_HXB2(genename) check = check_has_similar_length(end - start, len(gene_HXB2), genename, VERBOSE=VERBOSE, maxdiff=15) if not check: return False geneseq = refseq[start: end] geneseq = geneseq[len(geneseq) % 3:] gene = geneseq.seq prot = gene.translate() check = check_has_end(prot, genename, VERBOSE=VERBOSE) if not check: return False check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE) if not check: print 'ERROR IN TAT2 PREMATURE STOPS, CONTINUING!' # Check rev2 (second exon of rev, should be complete) genename = 'rev2' (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE) if (not start_found) or (not end_found): print 'ERROR: '+genename+' not found in F6!' return False elif VERBOSE >= 3: print 'OK: start and end of '+genename+' found' # NOTE: rev2 overlaps with env gp41 and can have insertions or deletions gene_HXB2 = get_gene_HXB2(genename) check = check_has_similar_length(end - start, len(gene_HXB2), genename, VERBOSE=VERBOSE, maxdiff=45) if not check: return False geneseq = refseq[start: end] geneseq = geneseq[len(geneseq) % 3:] gene = geneseq.seq prot = gene.translate() check = check_has_end(prot, genename, VERBOSE=VERBOSE) if not check: # rev2 can end a bit early end_new = prot.rfind('*') if end_new != -1: if len(prot) - 1 - end_new < 20: print 'REV2 ENDS '+str(len(prot) - end_new - 1)+' AMINO ACIDS UPSTREAM!' prot = prot[:end_new + 1] else: return False else: # rev2 can also end quite a bit late gene_new = refseq.seq[start:] gene_new = gene_new[(end - start) % 3:] gene_new = gene_new[:len(gene_new) - (len(gene_new) % 3)] prot_new = gene_new.translate() end_new = prot_new.find('*') if (start + 3 * end_new) - end < 200: print 'REV2 ENDS '+str(end_new - len(prot) + 1)+' AMINO ACIDS DOWNSTREAM!' 
prot = prot_new[:end_new + 1] else: return False check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE) if not check: return False return True
def check_F5(refseq, spec, VERBOSE=0): '''Check fragment F5: env''' if spec == 'a+bo': spec_inner = 'bo' else: spec_inner = spec check = check_length_fragment(refseq, 'F5'+spec_inner, VERBOSE=VERBOSE, tolerance=70) if not check: return False # Check env (there should be the start) genename = 'env' (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE) if (not start_found): print 'ERROR: start of '+genename+' not found in F5!' return False elif VERBOSE >= 3: print 'OK: start of '+genename+' found' geneseq = refseq[start:] geneseq = geneseq[:len(geneseq) - len(geneseq) % 3] gene = geneseq.seq prot = gene.translate() check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE) if not check: return False check = check_has_premature_stops_noend(prot, genename, VERBOSE=VERBOSE) if not check: return False # Check vpu (should be complete in F5ao) if spec_inner == 'ao': genename = 'vpu' (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE) if (not start_found) or (not end_found): print 'ERROR: '+genename+' not found in F4!' return False elif VERBOSE >= 3: print 'OK: start and end of '+genename+' found' gene_HXB2 = get_gene_HXB2(genename) check = check_has_similar_length(end - start, len(gene_HXB2), genename, VERBOSE=VERBOSE, maxdiff=15) if not check: return False geneseq = refseq[start: end] gene = geneseq.seq check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE) if not check: return False prot = gene.translate() check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE) if not check: print 'ERROR IN VPU STARTING CODON, CONTINUING!' #return False check = check_has_end(prot, genename, VERBOSE=VERBOSE) if not check: return False check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE) if not check: return False return True
def check_F4(refseq, spec, VERBOSE=0): '''Check fragment F4: pol, vif, vpr, vpu, tat1, rev1, env''' check = check_length_fragment(refseq, 'F4'+spec, VERBOSE=VERBOSE, tolerance=50) if not check: return False # Check pol (there should be end) genename = 'pol' (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE) if (not end_found): print 'ERROR: end of '+genename+' not found in F4!' return False elif VERBOSE >= 3: print 'OK: end of '+genename+' found' geneseq = refseq[:end] gene_HXB2 = get_gene_HXB2(genename) frame = get_frame(geneseq, gene_HXB2, genename) geneseq = geneseq[frame:] geneseq = geneseq[:len(geneseq) - (len(geneseq) % 3)] gene = geneseq.seq prot = gene.translate() check = check_has_end(prot, genename, VERBOSE=VERBOSE) # it can end a bit early or late if not check: gene_new = refseq.seq[frame:] gene_new = gene_new[:len(gene_new) - (len(gene_new) % 3)] prot_new = gene_new.translate() end_new = prot_new.find('*') end_diff = (frame + 3 * end_new) - end if 0 < end_diff < 200: print genename.upper()+' ENDS '+str(end_new - len(prot) + 1)+' AMINO ACIDS DOWNSTREAM!' prot = prot_new[:end_new + 1] elif -200 < end_diff < 0: print genename.upper()+' ENDS '+str(len(prot) - 1 - end_new)+' AMINO ACIDS UPSTREAM!' prot = prot_new[:end_new + 1] else: return False check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE) if not check: print prot return False # Check env (there should be the start) genename = 'env' (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE) if (not start_found): print 'ERROR: start of '+genename+' not found in F4!' 
return False elif VERBOSE >= 3: print 'OK: start of '+genename+' found' geneseq = refseq[start:] geneseq = geneseq[:len(geneseq) - len(geneseq) % 3] gene = geneseq.seq prot = gene.translate() check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE) if not check: return False check = check_has_premature_stops_noend(prot, genename, VERBOSE=VERBOSE) if not check: return False # Check vif (should be complete) genename = 'vif' (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE) if (not start_found) or (not end_found): print 'ERROR: '+genename+' not found in F4!' return False elif VERBOSE >= 3: print 'OK: start and end of '+genename+' found' gene_HXB2 = get_gene_HXB2(genename) check = check_has_similar_length(end - start, len(gene_HXB2), genename, VERBOSE=VERBOSE, maxdiff=15) if not check: return False geneseq = refseq[start: end] gene = geneseq.seq check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE) if not check: return False prot = gene.translate() check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE) if not check: return False check = check_has_end(prot, genename, VERBOSE=0) if check: if VERBOSE >= 3: print 'OK: '+genename+' ends with a *' else: # Vif tends to be a bit longer than in HXB2 for nc in xrange(1, 4): gene_ext = refseq[start: end + 3 * nc].seq prot_ext = gene_ext.translate() check = check_has_end(prot_ext, genename, VERBOSE=0) if check: gene = gene_ext prot = prot_ext if VERBOSE: print 'WARNING: '+genename+' actually ends '+str(nc)+' codons downstream' break else: print 'ERROR: '+genename+' does not end, not even slightly downstream' return False check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE) if not check: return False # Check vpu (should be complete) genename = 'vpu' (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE) if (not start_found) or (not end_found): print 'ERROR: '+genename+' not found in F4!' 
return False elif VERBOSE >= 3: print 'OK: start and end of '+genename+' found' gene_HXB2 = get_gene_HXB2(genename) check = check_has_similar_length(end - start, len(gene_HXB2), genename, VERBOSE=VERBOSE, maxdiff=15) if not check: return False geneseq = refseq[start: end] gene = geneseq.seq check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE) if not check: return False prot = gene.translate() check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE) if not check: print 'ERROR IN VPU STARTING CODON, CONTINUING!' #return False check = check_has_end(prot, genename, VERBOSE=VERBOSE) if not check: return False check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE) if not check: return False # Check vpr (should be complete) genename = 'vpr' (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE) if (not start_found) or (not end_found): print 'ERROR: '+genename+' not found in F4!' return False elif VERBOSE >= 3: print 'OK: start and end of '+genename+' found' gene_HXB2 = get_gene_HXB2(genename) check = check_has_similar_length(end - start, len(gene_HXB2), genename, VERBOSE=VERBOSE, maxdiff=15) if not check: return False geneseq = refseq[start: end] gene = geneseq.seq check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE) if not check: return False prot = gene.translate() check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE) if not check: return False check = check_has_end(prot, genename, VERBOSE=VERBOSE) if not check: return False check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE) if not check: return False # Check tat1 (first exon of tat, should be complete) genename = 'tat1' (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE) if (not start_found) or (not end_found): print 'ERROR: '+genename+' not found in F4!' 
return False elif VERBOSE >= 3: print 'OK: start and end of '+genename+' found' gene_HXB2 = get_gene_HXB2(genename) check = check_has_similar_length(end - start, len(gene_HXB2), genename, VERBOSE=VERBOSE, maxdiff=35) if not check: return False geneseq = refseq[start: end] geneseq = geneseq[:len(geneseq) - len(geneseq) % 3] gene = geneseq.seq prot = gene.translate() check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE) if not check: return False check = check_has_premature_stops_noend(prot, genename, VERBOSE=VERBOSE) if not check: return False # Check rev1 (first exon of rev, should be complete) genename = 'rev1' (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE) if (not start_found) or (not end_found): print 'ERROR: '+genename+' not found in F4!' return False elif VERBOSE >= 3: print 'OK: start and end of '+genename+' found' gene_HXB2 = get_gene_HXB2(genename) check = check_has_similar_length(end - start, len(gene_HXB2), genename, VERBOSE=VERBOSE, maxdiff=15) if not check: return False geneseq = refseq[start: end] geneseq = geneseq[:len(geneseq) - len(geneseq) % 3] gene = geneseq.seq prot = gene.translate() check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE) if not check: return False check = check_has_premature_stops_noend(prot, genename, VERBOSE=VERBOSE) if not check: return False return True
def check_F1(refseq, spec, VERBOSE=0): '''Check fragment F1: gag, pol''' check = check_length_fragment(refseq, 'F1'+spec, VERBOSE=VERBOSE, tolerance=50) if not check: return False # Check gag (should be complete) genename = 'gag' (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE) if (not start_found) or (not end_found): print 'ERROR: '+genename+' not found in F1!' return False elif VERBOSE >= 3: print 'OK: start and end of '+genename+' found' gene_HXB2 = get_gene_HXB2(genename) check = check_has_similar_length(end - start, len(gene_HXB2), genename, VERBOSE=VERBOSE, maxdiff=30) if not check: return False geneseq = refseq[start: end] gene = geneseq.seq check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE) if not check: return False prot = gene.translate() check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE) if not check: return False check = check_has_end(prot, genename, VERBOSE=VERBOSE) if not check: return False check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE) if not check: return False # Check pol (there should be the start) genename = 'pol' (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE) if (not start_found): print 'ERROR: start of '+genename+' not found in F1!' return False elif VERBOSE >= 3: print 'OK: start of '+genename+' found' geneseq = refseq[start:] geneseq = geneseq[:len(geneseq) - len(geneseq) % 3] gene = geneseq.seq prot = gene.translate() check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE) if not check: return False check = check_has_premature_stops_noend(prot, genename, VERBOSE=VERBOSE) if not check: return False return True
def annotate_sequence(seqrecord, additional_edges={}, additional_features=['chunk'], VERBOSE=0): '''Annotate a consensus with the genes and stuff (in place)''' # TODO: what do we do with genes that do not start/end where they are # supposed to? Do we follow biology and track their new locations? from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation from hivwholeseq.utils.genome_info import gene_edges, RNA_structure_edges, \ other_edges, find_region_edges, find_region_edges_multiple, \ locate_gene edge_dict = {'gene': gene_edges, 'RNA structure': RNA_structure_edges, 'other': other_edges} edge_dict.update(additional_edges) additional_features = ['protein'] + additional_features features = edge_dict.keys() + additional_features if VERBOSE: print 'Features:', ', '.join(features) smat = np.array(seqrecord) for feature_type in edge_dict: edges_all = edge_dict[feature_type] print feature_type, edge_dict[feature_type].keys() for name, edges in edges_all.iteritems(): if VERBOSE >= 2: print name, # Skip a feature if it's present already if name in map(lambda x: x.id, seqrecord.features): if VERBOSE >= 2: print 'already present.' 
continue # Behave differently for unsplit regions and split ones if len(edges) == 2: # LTR problems with F6 if 'F6' in name: pos_edge = find_region_edges(smat[6000::], [edges[0], None]) pos_edge[0] += 6000 elif feature_type == 'genes': pos_edge = locate_gene(smat, name, output_compact=True) else: pos_edge = find_region_edges(smat, edges) # Cut the primers for some features if (None not in pos_edge) and name in ['V1', 'V3', 'V4', 'V5']: pos_edge[0] += len(edges[0]) pos_edge[1] -= len(edges[1]) # Cut only the right primer for V2 if (None not in pos_edge) and name in ['V2']: pos_edge[1] -= len(edges[1]) if pos_edge[0] is None: if name not in ['F1', "LTR5'"]: print 'WARNING: start not found' pos_edge[0] = 0 if pos_edge[1] is None: if name not in ['F6', "LTR3'"]: print 'WARNING: end not found' pos_edge[1] = len(smat) location = FeatureLocation(*pos_edge) else: if feature_type == 'genes': pos_edges = [locate_gene(smat, name+suff, output_compact=True) for suff in ('1', '2')] else: pos_edges = find_region_edges_multiple(smat, edges, min_distance=1) locations = [FeatureLocation(*pos_edge) for pos_edge in pos_edges] location = CompoundLocation(locations) if VERBOSE >= 2: print 'found:', location feature = SeqFeature(location, type=feature_type, id=name, strand=1) seqrecord.features.append(feature) # Add proteins and other features from HXB2 from operator import attrgetter from seqanpy import align_overlap from hivwholeseq.utils.genome_info import proteins, chunks from hivwholeseq.reference import load_custom_reference additional_features_dict = {} if 'protein' in additional_features: additional_features_dict['protein'] = proteins if 'chunk' in additional_features: additional_features_dict['chunk'] = chunks ref_ann = load_custom_reference('HXB2', 'gb') for feagroup, additional_features_grp in additional_features_dict.iteritems(): for feaname in additional_features_grp: if VERBOSE >= 2: print feaname, fea = ref_ann.features[map(attrgetter('id'), 
ref_ann.features).index(feaname)] seq = fea.extract(ref_ann) (score, ali1, ali2) = align_overlap(seqrecord, seq, score_gapopen=-20) start = len(ali2) - len(ali2.lstrip('-')) end = len(ali2.rstrip('-')) end -= ali1[start: end].count('-') location = FeatureLocation(start, end) if VERBOSE >= 2: print 'found:', location feature = SeqFeature(location, type=feagroup, id=feaname, strand=1) seqrecord.features.append(feature)
def check_genomewide(refseq, VERBOSE=0): '''Check the integrity of all genes in the genomewide consensus''' # Check single-exon genes length_tolerance = {'gag': 30, 'pol': 30, 'env': 70, 'vpr': 15, 'vpu': 15} for genename, tol in length_tolerance.iteritems(): (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE) if (not start_found) or (not end_found): print 'ERROR: ' + genename + ' not found in genomewide!' return False elif VERBOSE >= 3: print 'OK: start and end of ' + genename + ' found' gene_HXB2 = get_gene_HXB2(genename) check = check_has_similar_length(end - start, len(gene_HXB2), genename, VERBOSE=VERBOSE, maxdiff=tol) if not check: return False geneseq = refseq[start:end] gene = geneseq.seq check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE) if not check: # sometimes the gene ends a few nucleotides upstream, and there is a # frameshift mutation that screws up gene_new = refseq.seq[start:] gene_new = gene_new[:len(gene_new) - (len(gene_new) % 3)] prot_new = gene_new.translate() end_new = prot_new.find('*') end_diff = start + (3 * end_new + 3) - end if -90 < end_diff < 0: print genename.upper() + ' ENDS ' + str( (end - start) // 3 - end_new - 1) + ' AMINO ACIDS UPSTREAM!' gene = gene_new[:3 * (end_new + 1)] else: return False prot = gene.translate() check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE) if (not check): if genename != 'vpu': return False else: print 'ERROR IN VPU STARTING CODON, CONTINUING!' check = check_has_end(prot, genename, VERBOSE=VERBOSE) if not check: # sometimes a gene is a bit longer gene_new = refseq.seq[start:] gene_new = gene_new[:len(gene_new) - (len(gene_new) % 3)] prot_new = gene_new.translate() end_new = prot_new.find('*') end_diff = start + (3 * end_new + 3) - end if -90 < end_diff < 0: print genename.upper() + ' ENDS ' + str( (end - start) // 3 - end_new - 1) + ' AMINO ACIDS UPSTREAM!' 
gene = gene_new[:3 * (end_new + 1)] prot = gene.translate() elif 0 < end_diff < 90: print genename.upper() + ' ENDS ' + str( end_new + 1 - (end - start) // 3) + ' AMINO ACIDS DOWNSTREAM!' gene = gene_new[:3 * (end_new + 1)] prot = gene.translate() else: return False check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE) if not check: return False # Vif is special because it can be longer than in HXB2 genename = 'vif' (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE) if (not start_found) or (not end_found): print 'ERROR: ' + genename + ' not found in genomewide!' return False elif VERBOSE >= 3: print 'OK: start and end of ' + genename + ' found' gene_HXB2 = get_gene_HXB2(genename) check = check_has_similar_length(end - start, len(gene_HXB2), genename, VERBOSE=VERBOSE, maxdiff=15) if not check: return False geneseq = refseq[start:end] gene = geneseq.seq check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE) if not check: return False prot = gene.translate() check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE) if not check: return False check = check_has_end(prot, genename, VERBOSE=0) if not check: # Vif tends to be a bit longer than in HXB2 for nc in xrange(1, 4): gene_ext = refseq[start:end + 3 * nc].seq prot_ext = gene_ext.translate() check = check_has_end(prot_ext, genename, VERBOSE=0) if check: gene = gene_ext prot = prot_ext if VERBOSE: print 'WARNING: ' + genename + ' actually ends ' + str( nc) + ' codons downstream' break else: print 'ERROR: ' + genename + ' does not end, not even slightly downstream' return False check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE) if not check: return False # Check 2-exon genes for genename_whole in ('tat', 'rev'): genename = genename_whole + '1' (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE) if (not start_found) or (not end_found): print 'ERROR: ' + genename + ' not found in genomewide!' 
return False elif VERBOSE >= 3: print 'OK: start and end of ' + genename + ' found' gene_HXB2 = get_gene_HXB2(genename) check = check_has_similar_length(end - start, len(gene_HXB2), genename, VERBOSE=VERBOSE, maxdiff=15) if not check: return False geneseq = refseq[start:end] geneseq = geneseq[:len(geneseq) - len(geneseq) % 3] gene = geneseq.seq prot = gene.translate() check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE) if not check: return False start_exon1 = start end_exon1 = end genename = genename_whole + '2' (start, end, start_found, end_found) = locate_gene(refseq[end_exon1 + 2000:], genename, VERBOSE=VERBOSE) if (not start_found) or (not end_found): print 'ERROR: ' + genename + ' not found in genomewide!' return False elif VERBOSE >= 3: print 'OK: start and end of ' + genename + ' found' start += end_exon1 + 2000 end += end_exon1 + 2000 # NOTE: rev2 overlaps with env gp41 and can have insertions or deletions if genename == 'rev2': tol = 45 else: tol = 15 gene_HXB2 = get_gene_HXB2(genename) check = check_has_similar_length(end - start, len(gene_HXB2), genename, VERBOSE=VERBOSE, maxdiff=tol) if not check: return False geneseq = refseq[start:end] frame = get_frame(geneseq, gene_HXB2, genename, VERBOSE=VERBOSE) geneseq = geneseq[frame:] gene = geneseq.seq prot = gene.translate() check = check_has_end(prot, genename, VERBOSE=VERBOSE) if not check: if genename != 'rev2': return False else: # rev2 can end a bit early end_new = prot.rfind('*') if end_new != -1: if len(prot) - 1 - end_new < 20: print 'REV2 ENDS ' + str(len(prot) - end_new - 1) + ' AMINO ACIDS UPSTREAM!' 
prot = prot[:end_new + 1] end = start + frame + 3 * (end_new + 1) else: return False else: # rev2 can also end quite a bit late gene_new = refseq.seq[start:] gene_new = gene_new[(end - start) % 3:] gene_new = gene_new[:len(gene_new) - (len(gene_new) % 3)] prot_new = gene_new.translate() end_new = prot_new.find('*') if (start + 3 * end_new) - end < 200: print 'REV2 ENDS ' + str(end_new - len(prot) + 1 ) + ' AMINO ACIDS DOWNSTREAM!' prot = prot_new[:end_new + 1] end = start + ((end - start) % 3) + 3 * (end_new + 1) else: return False check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE) if not check: return False start_exon2 = start end_exon2 = end genename = genename_whole gene_HXB2 = get_gene_HXB2(genename) from Bio.SeqFeature import FeatureLocation gene_loc = FeatureLocation(start_exon1, end_exon1, strand=+1) + \ FeatureLocation(start_exon2, end_exon2, strand=+1) geneseq = gene_loc.extract(refseq) gene = geneseq.seq check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE) if not check: return False prot = gene.translate() check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE) if not check: return False check = check_has_end(prot, genename, VERBOSE=VERBOSE) if not check: return False check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE) if not check: return False return True
def check_F6(refseq, spec, VERBOSE=0): '''Check fragment F6: end of env, tat2, rev2''' check = check_length_fragment(refseq, 'F6' + spec, VERBOSE=VERBOSE, tolerance=50) if not check: return False # Check env (there should be end) genename = 'env' (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE) if (not end_found): print 'ERROR: end of ' + genename + ' not found in F6!' return False elif VERBOSE >= 3: print 'OK: end of ' + genename + ' found' geneseq = refseq[:end] gene_HXB2 = get_gene_HXB2(genename) frame = get_frame(geneseq, gene_HXB2, genename) geneseq = geneseq[frame:] geneseq = geneseq[:len(geneseq) - (len(geneseq) % 3)] gene = geneseq.seq prot = gene.translate() check = check_has_end(prot, genename, VERBOSE=VERBOSE) # env can end a bit early or late if not check: gene_new = refseq.seq[frame:] gene_new = gene_new[:len(gene_new) - (len(gene_new) % 3)] prot_new = gene_new.translate() end_new = prot_new.find('*') end_diff = (frame + 3 * end_new) - end if 0 < end_diff < 200: print 'ENV ENDS ' + str(end_new - len(prot) + 1) + ' AMINO ACIDS DOWNSTREAM!' prot = prot_new[:end_new + 1] elif -200 < end_diff < 0: print 'ENV ENDS ' + str(len(prot) - 1 - end_new) + ' AMINO ACIDS UPSTREAM!' prot = prot_new[:end_new + 1] else: return False check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE) if not check: print prot return False # Check tat2 (second exon of tat, should be complete) genename = 'tat2' (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE) if (not start_found) or (not end_found): print 'ERROR: ' + genename + ' not found in F6!' 
return False elif VERBOSE >= 3: print 'OK: start and end of ' + genename + ' found' gene_HXB2 = get_gene_HXB2(genename) check = check_has_similar_length(end - start, len(gene_HXB2), genename, VERBOSE=VERBOSE, maxdiff=15) if not check: return False geneseq = refseq[start:end] geneseq = geneseq[len(geneseq) % 3:] gene = geneseq.seq prot = gene.translate() check = check_has_end(prot, genename, VERBOSE=VERBOSE) if not check: return False check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE) if not check: print 'ERROR IN TAT2 PREMATURE STOPS, CONTINUING!' # Check rev2 (second exon of rev, should be complete) genename = 'rev2' (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE) if (not start_found) or (not end_found): print 'ERROR: ' + genename + ' not found in F6!' return False elif VERBOSE >= 3: print 'OK: start and end of ' + genename + ' found' # NOTE: rev2 overlaps with env gp41 and can have insertions or deletions gene_HXB2 = get_gene_HXB2(genename) check = check_has_similar_length(end - start, len(gene_HXB2), genename, VERBOSE=VERBOSE, maxdiff=45) if not check: return False geneseq = refseq[start:end] geneseq = geneseq[len(geneseq) % 3:] gene = geneseq.seq prot = gene.translate() check = check_has_end(prot, genename, VERBOSE=VERBOSE) if not check: # rev2 can end a bit early end_new = prot.rfind('*') if end_new != -1: if len(prot) - 1 - end_new < 20: print 'REV2 ENDS ' + str(len(prot) - end_new - 1) + ' AMINO ACIDS UPSTREAM!' prot = prot[:end_new + 1] else: return False else: # rev2 can also end quite a bit late gene_new = refseq.seq[start:] gene_new = gene_new[(end - start) % 3:] gene_new = gene_new[:len(gene_new) - (len(gene_new) % 3)] prot_new = gene_new.translate() end_new = prot_new.find('*') if (start + 3 * end_new) - end < 200: print 'REV2 ENDS ' + str(end_new - len(prot) + 1) + ' AMINO ACIDS DOWNSTREAM!' 
prot = prot_new[:end_new + 1] else: return False check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE) if not check: return False return True
def check_F5(refseq, spec, VERBOSE=0): '''Check fragment F5: env''' if spec == 'a+bo': spec_inner = 'bo' else: spec_inner = spec check = check_length_fragment(refseq, 'F5' + spec_inner, VERBOSE=VERBOSE, tolerance=70) if not check: return False # Check env (there should be the start) genename = 'env' (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE) if (not start_found): print 'ERROR: start of ' + genename + ' not found in F5!' return False elif VERBOSE >= 3: print 'OK: start of ' + genename + ' found' geneseq = refseq[start:] geneseq = geneseq[:len(geneseq) - len(geneseq) % 3] gene = geneseq.seq prot = gene.translate() check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE) if not check: return False check = check_has_premature_stops_noend(prot, genename, VERBOSE=VERBOSE) if not check: return False # Check vpu (should be complete in F5ao) if spec_inner == 'ao': genename = 'vpu' (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE) if (not start_found) or (not end_found): print 'ERROR: ' + genename + ' not found in F4!' return False elif VERBOSE >= 3: print 'OK: start and end of ' + genename + ' found' gene_HXB2 = get_gene_HXB2(genename) check = check_has_similar_length(end - start, len(gene_HXB2), genename, VERBOSE=VERBOSE, maxdiff=15) if not check: return False geneseq = refseq[start:end] gene = geneseq.seq check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE) if not check: return False prot = gene.translate() check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE) if not check: print 'ERROR IN VPU STARTING CODON, CONTINUING!' #return False check = check_has_end(prot, genename, VERBOSE=VERBOSE) if not check: return False check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE) if not check: return False return True
def check_F4(refseq, spec, VERBOSE=0): '''Check fragment F4: pol, vif, vpr, vpu, tat1, rev1, env''' check = check_length_fragment(refseq, 'F4' + spec, VERBOSE=VERBOSE, tolerance=50) if not check: return False # Check pol (there should be end) genename = 'pol' (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE) if (not end_found): print 'ERROR: end of ' + genename + ' not found in F4!' return False elif VERBOSE >= 3: print 'OK: end of ' + genename + ' found' geneseq = refseq[:end] gene_HXB2 = get_gene_HXB2(genename) frame = get_frame(geneseq, gene_HXB2, genename) geneseq = geneseq[frame:] geneseq = geneseq[:len(geneseq) - (len(geneseq) % 3)] gene = geneseq.seq prot = gene.translate() check = check_has_end(prot, genename, VERBOSE=VERBOSE) # it can end a bit early or late if not check: gene_new = refseq.seq[frame:] gene_new = gene_new[:len(gene_new) - (len(gene_new) % 3)] prot_new = gene_new.translate() end_new = prot_new.find('*') end_diff = (frame + 3 * end_new) - end if 0 < end_diff < 200: print genename.upper() + ' ENDS ' + str( end_new - len(prot) + 1) + ' AMINO ACIDS DOWNSTREAM!' prot = prot_new[:end_new + 1] elif -200 < end_diff < 0: print genename.upper() + ' ENDS ' + str( len(prot) - 1 - end_new) + ' AMINO ACIDS UPSTREAM!' prot = prot_new[:end_new + 1] else: return False check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE) if not check: print prot return False # Check env (there should be the start) genename = 'env' (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE) if (not start_found): print 'ERROR: start of ' + genename + ' not found in F4!' 
return False elif VERBOSE >= 3: print 'OK: start of ' + genename + ' found' geneseq = refseq[start:] geneseq = geneseq[:len(geneseq) - len(geneseq) % 3] gene = geneseq.seq prot = gene.translate() check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE) if not check: return False check = check_has_premature_stops_noend(prot, genename, VERBOSE=VERBOSE) if not check: return False # Check vif (should be complete) genename = 'vif' (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE) if (not start_found) or (not end_found): print 'ERROR: ' + genename + ' not found in F4!' return False elif VERBOSE >= 3: print 'OK: start and end of ' + genename + ' found' gene_HXB2 = get_gene_HXB2(genename) check = check_has_similar_length(end - start, len(gene_HXB2), genename, VERBOSE=VERBOSE, maxdiff=15) if not check: return False geneseq = refseq[start:end] gene = geneseq.seq check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE) if not check: return False prot = gene.translate() check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE) if not check: return False check = check_has_end(prot, genename, VERBOSE=0) if check: if VERBOSE >= 3: print 'OK: ' + genename + ' ends with a *' else: # Vif tends to be a bit longer than in HXB2 for nc in xrange(1, 4): gene_ext = refseq[start:end + 3 * nc].seq prot_ext = gene_ext.translate() check = check_has_end(prot_ext, genename, VERBOSE=0) if check: gene = gene_ext prot = prot_ext if VERBOSE: print 'WARNING: ' + genename + ' actually ends ' + str( nc) + ' codons downstream' break else: print 'ERROR: ' + genename + ' does not end, not even slightly downstream' return False check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE) if not check: return False # Check vpu (should be complete) genename = 'vpu' (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE) if (not start_found) or (not end_found): print 'ERROR: ' + genename + ' not found in F4!' 
return False elif VERBOSE >= 3: print 'OK: start and end of ' + genename + ' found' gene_HXB2 = get_gene_HXB2(genename) check = check_has_similar_length(end - start, len(gene_HXB2), genename, VERBOSE=VERBOSE, maxdiff=15) if not check: return False geneseq = refseq[start:end] gene = geneseq.seq check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE) if not check: return False prot = gene.translate() check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE) if not check: print 'ERROR IN VPU STARTING CODON, CONTINUING!' #return False check = check_has_end(prot, genename, VERBOSE=VERBOSE) if not check: return False check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE) if not check: return False # Check vpr (should be complete) genename = 'vpr' (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE) if (not start_found) or (not end_found): print 'ERROR: ' + genename + ' not found in F4!' return False elif VERBOSE >= 3: print 'OK: start and end of ' + genename + ' found' gene_HXB2 = get_gene_HXB2(genename) check = check_has_similar_length(end - start, len(gene_HXB2), genename, VERBOSE=VERBOSE, maxdiff=15) if not check: return False geneseq = refseq[start:end] gene = geneseq.seq check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE) if not check: return False prot = gene.translate() check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE) if not check: return False check = check_has_end(prot, genename, VERBOSE=VERBOSE) if not check: return False check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE) if not check: return False # Check tat1 (first exon of tat, should be complete) genename = 'tat1' (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE) if (not start_found) or (not end_found): print 'ERROR: ' + genename + ' not found in F4!' 
return False elif VERBOSE >= 3: print 'OK: start and end of ' + genename + ' found' gene_HXB2 = get_gene_HXB2(genename) check = check_has_similar_length(end - start, len(gene_HXB2), genename, VERBOSE=VERBOSE, maxdiff=35) if not check: return False geneseq = refseq[start:end] geneseq = geneseq[:len(geneseq) - len(geneseq) % 3] gene = geneseq.seq prot = gene.translate() check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE) if not check: return False check = check_has_premature_stops_noend(prot, genename, VERBOSE=VERBOSE) if not check: return False # Check rev1 (first exon of rev, should be complete) genename = 'rev1' (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE) if (not start_found) or (not end_found): print 'ERROR: ' + genename + ' not found in F4!' return False elif VERBOSE >= 3: print 'OK: start and end of ' + genename + ' found' gene_HXB2 = get_gene_HXB2(genename) check = check_has_similar_length(end - start, len(gene_HXB2), genename, VERBOSE=VERBOSE, maxdiff=15) if not check: return False geneseq = refseq[start:end] geneseq = geneseq[:len(geneseq) - len(geneseq) % 3] gene = geneseq.seq prot = gene.translate() check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE) if not check: return False check = check_has_premature_stops_noend(prot, genename, VERBOSE=VERBOSE) if not check: return False return True
def check_F1(refseq, spec, VERBOSE=0): '''Check fragment F1: gag, pol''' check = check_length_fragment(refseq, 'F1' + spec, VERBOSE=VERBOSE, tolerance=50) if not check: return False # Check gag (should be complete) genename = 'gag' (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE) if (not start_found) or (not end_found): print 'ERROR: ' + genename + ' not found in F1!' return False elif VERBOSE >= 3: print 'OK: start and end of ' + genename + ' found' gene_HXB2 = get_gene_HXB2(genename) check = check_has_similar_length(end - start, len(gene_HXB2), genename, VERBOSE=VERBOSE, maxdiff=30) if not check: return False geneseq = refseq[start:end] gene = geneseq.seq check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE) if not check: return False prot = gene.translate() check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE) if not check: return False check = check_has_end(prot, genename, VERBOSE=VERBOSE) if not check: return False check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE) if not check: return False # Check pol (there should be the start) genename = 'pol' (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE) if (not start_found): print 'ERROR: start of ' + genename + ' not found in F1!' return False elif VERBOSE >= 3: print 'OK: start of ' + genename + ' found' geneseq = refseq[start:] geneseq = geneseq[:len(geneseq) - len(geneseq) % 3] gene = geneseq.seq prot = gene.translate() check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE) if not check: return False check = check_has_premature_stops_noend(prot, genename, VERBOSE=VERBOSE) if not check: return False return True
def annotate_sequence(seqrecord, additional_edges={}, additional_features=['chunk'], VERBOSE=0):
    '''Annotate a consensus with the genes and stuff (in place)

    Two kinds of features are appended to seqrecord.features:
     1. regions delimited by edges (genes, RNA structures, other regions,
        plus whatever is passed via additional_edges), located in the
        sequence itself;
     2. features of the annotated HXB2 reference ('protein' always, 'chunk'
        by default), located by aligning the reference feature to seqrecord.

    Parameters:
       seqrecord: the sequence to annotate. It is converted with np.array,
                  passed to align_overlap, and its `.features` list is
                  extended (presumably a Biopython SeqRecord).
       additional_edges: dict merged into the edge dictionary with
                         dict.update, i.e. feature type -> {name: edges}.
                         It is only read, never modified.
       additional_features: list of HXB2 feature groups to add; only
                            'protein' and 'chunk' are recognized, and
                            'protein' is always added. It is only read.
       VERBOSE: verbosity level; >= 2 prints every feature and its location.

    Returns:
       None: the annotation happens in place.
    '''
    # TODO: what do we do with genes that do not start/end where they are
    # supposed to? Do we follow biology and track their new locations?
    from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation
    from hivwholeseq.utils.genome_info import gene_edges, RNA_structure_edges, \
            other_edges, find_region_edges, find_region_edges_multiple, \
            locate_gene

    edge_dict = {
        'gene': gene_edges,
        'RNA structure': RNA_structure_edges,
        'other': other_edges
    }
    edge_dict.update(additional_edges)
    # A new list is built here, so the default argument is not modified
    additional_features = ['protein'] + additional_features
    features = edge_dict.keys() + additional_features

    if VERBOSE:
        print 'Features:', ', '.join(features)

    # NOTE: np is not imported in this function; it is expected to be
    # available at file level (presumably numpy)
    smat = np.array(seqrecord)

    for feature_type in edge_dict:
        edges_all = edge_dict[feature_type]
        # NOTE(review): this print is the only one not gated by VERBOSE; it
        # looks like a debugging leftover, confirm before removing
        print feature_type, edge_dict[feature_type].keys()
        for name, edges in edges_all.iteritems():
            if VERBOSE >= 2:
                print name,

            # Skip a feature if it's present already
            # (the ids are recomputed at every iteration because features are
            # appended to seqrecord.features within this loop)
            if name in map(lambda x: x.id, seqrecord.features):
                if VERBOSE >= 2:
                    print 'already present.'
                continue

            # Behave differently for unsplit regions and split ones
            if len(edges) == 2:
                # LTR problems with F6
                # Only the left edge is searched, and only from position 6000
                # on; the offset is added back to the position found
                if 'F6' in name:
                    pos_edge = find_region_edges(smat[6000::], [edges[0], None])
                    pos_edge[0] += 6000
                # NOTE(review): the key used in edge_dict above is 'gene', not
                # 'genes', so this branch is taken only if additional_edges
                # has a 'genes' key; confirm whether that is intended
                elif feature_type == 'genes':
                    pos_edge = locate_gene(smat, name, output_compact=True)
                else:
                    pos_edge = find_region_edges(smat, edges)

                # Cut the primers for some features
                if (None not in pos_edge) and name in ['V1', 'V3', 'V4', 'V5']:
                    pos_edge[0] += len(edges[0])
                    pos_edge[1] -= len(edges[1])

                # Cut only the right primer for V2
                if (None not in pos_edge) and name in ['V2']:
                    pos_edge[1] -= len(edges[1])

                # An edge that was not found defaults to the respective end
                # of the sequence; no warning for the features listed below
                if pos_edge[0] is None:
                    if name not in ['F1', "LTR5'"]:
                        print 'WARNING: start not found'
                    pos_edge[0] = 0

                if pos_edge[1] is None:
                    if name not in ['F6', "LTR3'"]:
                        print 'WARNING: end not found'
                    pos_edge[1] = len(smat)

                location = FeatureLocation(*pos_edge)
            else:
                # Split region: one location per part, joined together
                # (same 'genes' vs 'gene' caveat as above)
                if feature_type == 'genes':
                    pos_edges = [
                        locate_gene(smat, name + suff, output_compact=True)
                        for suff in ('1', '2')
                    ]
                else:
                    pos_edges = find_region_edges_multiple(smat, edges, min_distance=1)
                locations = [
                    FeatureLocation(*pos_edge) for pos_edge in pos_edges
                ]
                location = CompoundLocation(locations)

            if VERBOSE >= 2:
                print 'found:', location

            feature = SeqFeature(location, type=feature_type, id=name, strand=1)
            seqrecord.features.append(feature)

    # Add proteins and other features from HXB2
    from operator import attrgetter
    from seqanpy import align_overlap
    from hivwholeseq.utils.genome_info import proteins, chunks
    from hivwholeseq.reference import load_custom_reference
    additional_features_dict = {}
    if 'protein' in additional_features:
        additional_features_dict['protein'] = proteins
    if 'chunk' in additional_features:
        additional_features_dict['chunk'] = chunks

    ref_ann = load_custom_reference('HXB2', 'gb')
    for feagroup, additional_features_grp in additional_features_dict.iteritems():
        for feaname in additional_features_grp:
            if VERBOSE >= 2:
                print feaname,

            # Look the feature up by id in the annotated reference; index()
            # raises ValueError if the reference has no feature with that id
            fea = ref_ann.features[map(attrgetter('id'),
                                       ref_ann.features).index(feaname)]
            seq = fea.extract(ref_ann)
            (score, ali1, ali2) = align_overlap(seqrecord, seq, score_gapopen=-20)
            # start: number of leading gaps in ali2; end: length of ali2
            # without its trailing gaps
            start = len(ali2) - len(ali2.lstrip('-'))
            end = len(ali2.rstrip('-'))
            # Gaps of ali1 within that window are subtracted from the end
            # (presumably to go from alignment columns to positions in
            # seqrecord; confirm against align_overlap's output)
            end -= ali1[start:end].count('-')

            location = FeatureLocation(start, end)
            if VERBOSE >= 2:
                print 'found:', location

            feature = SeqFeature(location, type=feagroup, id=feaname, strand=1)
            seqrecord.features.append(feature)