def excessive_overlap(record, excess=15, excess_divergent=30):
    """Find pairs of coding genes whose CDSs overlap excessively.

    Every unordered pair of coding genes in ``record.features`` is compared
    using the first CDS sub-feature of each gene. An overlap is excessive
    when it spans at least ``excess`` bases for CDSs on the same strand, or
    at least ``excess_divergent`` bases for CDSs on different strands.

    Returns a 4-tuple ``(good, bad, results, qc_features)``:

    * ``good`` -- number of coding genes minus ``bad``, floored at zero.
    * ``bad`` -- summed overlap lengths, each divided by the smaller of the
      two thresholds, truncated to an int.
    * ``results`` -- ``(gene_a, gene_b, first_pos, last_pos)`` tuples, where
      the positions are the first and last overlapping coordinates.
    * ``qc_features`` -- one "Excessive Overlap" QC feature per hit.
    """
    results = []
    bad = 0
    qc_features = []

    for (gene_a, gene_b) in itertools.combinations(
            coding_genes(record.features), 2):
        # Get the CDS from the subfeature list.
        # TODO: not recursive.
        cds_a = list(genes(gene_a.sub_features, feature_type="CDS"))
        cds_b = list(genes(gene_b.sub_features, feature_type="CDS"))

        if len(cds_a) == 0:
            log.warn("Gene missing subfeatures; %s", get_gff3_id(gene_a))
            continue

        if len(cds_b) == 0:
            log.warn("Gene missing subfeatures; %s", get_gff3_id(gene_b))
            continue

        cds_a = cds_a[0]
        cds_b = cds_b[0]

        # The positions shared by two half-open intervals [start, end) are
        # [max(starts), min(ends)).  Computing that directly avoids
        # materialising a set with one entry per base for every gene pair.
        overlap_start = max(int(cds_a.location.start),
                            int(cds_b.location.start))
        overlap_end = min(int(cds_a.location.end), int(cds_b.location.end))
        overlap_len = overlap_end - overlap_start
        if overlap_len <= 0:
            # No shared bases: nothing to report, whatever the thresholds.
            continue

        if cds_a.location.strand == cds_b.location.strand:
            threshold = excess
        else:
            threshold = excess_divergent

        if overlap_len >= threshold:
            bad += float(overlap_len) / float(min(excess, excess_divergent))
            # Last overlapping coordinate is inclusive, hence the -1.
            first_pos = overlap_start
            last_pos = overlap_end - 1
            qc_features.append(
                gen_qc_feature(first_pos, last_pos, "Excessive Overlap",
                               id_src=gene_a))
            results.append((gene_a, gene_b, first_pos, last_pos))

    # Good isn't accurate here. It's a triangle number and just ugly, but we
    # don't care enough to fix it.
    good = len(list(coding_genes(record.features)))
    good = int(good - bad)
    if good < 0:
        good = 0
    return good, int(bad), results, qc_features
def bad_gene_model(record):
    """Find coding genes whose exon and CDS sub-features disagree.

    For every coding gene the exon sub-features longer than 10 bases and the
    CDS sub-features are collected.  A gene is reported when:

    * it has no such exon or no CDS,
    * the number of exons differs from the number of CDSs, or
    * an exon and the CDS paired with it (both sorted by start) differ in
      length.

    Returns a 4-tuple ``(good, bad, results, qc_features)`` where ``good``
    counts matching exon/CDS pairs, ``bad`` counts reported problems (one per
    entry in ``results``), ``results`` holds
    ``(gene_id, exon, cds, message)`` tuples and ``qc_features`` holds the QC
    features generated for the mismatches.
    """
    results = []
    good = 0
    bad = 0
    qc_features = []

    for gene in coding_genes(record.features):
        exons = [
            x for x in genes(gene.sub_features, feature_type='exon')
            if len(x) > 10
        ]
        cdss = list(genes(gene.sub_features, feature_type='CDS'))

        if not exons or not cdss:
            log.warn("Could not handle %s, %s", exons, cdss)
            results.append((
                get_gff3_id(gene),
                None,
                None,
                '{0} exons, {1} CDSs'.format(len(exons), len(cdss))
            ))
            bad += 1
            continue

        if len(exons) != len(cdss):
            message = 'Mismatched number of exons and CDSs in gff3 representation'
            results.append((get_gff3_id(gene), None, None, message))
            qc_features.append(gen_qc_feature(
                gene.location.start, gene.location.end,
                message,
                strand=gene.strand,
                id_src=gene
            ))
            bad += 1
            continue

        # Pair exons and CDSs positionally after sorting both by start.
        paired = zip(
            sorted(exons, key=lambda x: x.location.start),
            sorted(cdss, key=lambda x: x.location.start),
        )
        for (exon, cds) in paired:
            if len(exon) != len(cds):
                message = 'CDS does not extend to full length of gene'
                results.append((get_gff3_id(gene), exon, cds, message))
                qc_features.append(gen_qc_feature(
                    exon.location.start, exon.location.end,
                    message,
                    strand=exon.strand,
                    id_src=gene
                ))
                bad += 1
            else:
                good += 1

    # Every problem is recorded exactly once in both `results` and `bad`;
    # the previous `len(results) + bad` counted each mismatch twice.
    return good, bad, results, qc_features
def sd_spacing(record, feature):
    """Shine-Dalgarno spacing
    """
    # NOTE(review): the docstring above reads like a display label and is
    # presumably consumed at runtime, so it is left untouched -- confirm.
    #
    # Returns the distance between each RBS of `feature` and a CDS of the
    # same feature: the string "None" when there is no RBS, a single string
    # when there is exactly one RBS, otherwise a list of strings.
    rbss = get_rbs_from(feature)
    if len(rbss) == 0:
        return "None"

    # The CDS list does not depend on the RBS, so fetch it once.
    cdss = list(genes(feature.sub_features, feature_type="CDS", sort=True))

    resp = []
    for rbs in rbss:
        if rbs.location.strand > 0:
            nearest = min(
                cdss, key=lambda x: x.location.start - rbs.location.end)
            resp.append(str(nearest.location.start - rbs.location.end))
        else:
            # NOTE(review): this key is the negated reverse-strand distance,
            # so with several CDSs `min` selects the one with the largest
            # spacing rather than the smallest -- confirm this is intended.
            nearest = min(
                cdss, key=lambda x: x.location.end - rbs.location.start)
            resp.append(str(rbs.location.start - nearest.location.end))

    if len(resp) == 1:
        return str(resp[0])
    return resp
def missing_tags(record):
    """Report coding genes whose first CDS lacks a ``product`` qualifier.

    Genes without any CDS sub-feature are logged and skipped; they count as
    neither good nor bad.

    Returns a 4-tuple ``(good, bad, results, qc_features)`` where ``results``
    lists the offending CDS features and ``qc_features`` holds one
    "Missing product tag" QC feature per offending CDS.
    """
    flagged = []
    qc_features = []
    n_good = 0
    n_bad = 0

    for gene in coding_genes(record.features):
        cds_list = list(genes(gene.sub_features, feature_type="CDS"))
        if not cds_list:
            log.warn("Gene missing CDS subfeature %s", get_gff3_id(gene))
            continue

        first_cds = cds_list[0]
        if "product" in first_cds.qualifiers:
            n_good += 1
            continue

        log.info("Missing product tag on %s", get_gff3_id(gene))
        marker = gen_qc_feature(
            first_cds.location.start,
            first_cds.location.end,
            "Missing product tag",
            strand=first_cds.strand,
        )
        qc_features.append(marker)
        flagged.append(first_cds)
        n_bad += 1

    return n_good, n_bad, flagged, qc_features
def start_codon(record, feature):
    """Start Codon
    """
    # NOTE(review): the docstring above reads like a display label and is
    # presumably consumed at runtime, so it is left untouched -- confirm.
    #
    # With exactly one CDS the first three bases are returned as a string;
    # otherwise a list with one "<codon> (<start>..<end>:<strand>)" entry per
    # CDS is returned (an empty list when there is no CDS).
    cds_features = list(
        genes(feature.sub_features, feature_type="CDS", sort=True))

    if len(cds_features) == 1:
        only_cds = cds_features[0]
        return str(only_cds.extract(record).seq[0:3])

    template = "{0} ({1.location.start}..{1.location.end}:{1.location.strand})"
    described = []
    for cds in cds_features:
        described.append(template.format(cds.extract(record).seq[0:3], cds))
    return described
def exact_coding_density(record, mean=92.5, sd=20):
    """Return the fraction of the genome covered by at least one CDS.

    Each base is counted once, however many CDSs overlap it.  ``mean`` and
    ``sd`` are accepted for signature compatibility but are not used.

    Returns a float in ``[0, 1]``; ``0.0`` for an empty sequence.
    """
    genome_length = len(record.seq)
    if genome_length == 0:
        # Nothing to cover; avoid dividing by zero.
        return 0.0

    covered = numpy.zeros(genome_length, dtype=bool)
    for gene in coding_genes(record.features):
        for cds in genes(gene.sub_features, feature_type="CDS"):
            # Feature locations are 0-based and end-exclusive, which maps
            # directly onto a slice.  The old `data[i - 1]` loop over
            # `range(start, end + 1)` marked one extra base before every CDS
            # and wrapped around to the last base when start was 0.
            start = max(int(cds.location.start), 0)
            end = min(int(cds.location.end), genome_length)
            covered[start:end] = True

    return float(covered.sum()) / genome_length
def weird_starts(record):
    """Find coding genes whose first CDS has an unusual start codon.

    NOTE: a second ``weird_starts`` is defined later in this module; being
    defined later, it replaces this one at import time.

    The first CDS of every coding gene is extracted and its first and last
    three bases are stored on the CDS as ``__start`` and ``__stop``.  Start
    codons other than ATG, TTG and GTG are reported, and ``__error`` is set
    on the CDS.  Genes without a CDS are logged and skipped.

    Returns a 5-tuple ``(good, bad, results, qc_features, overall)`` where
    ``results`` lists the offending CDSs, ``qc_features`` marks the three
    start-codon bases of each, and ``overall`` maps every start codon seen to
    its number of occurrences.
    """
    good = 0
    bad = 0
    qc_features = []
    results = []

    overall = {}
    for gene in coding_genes(record.features):
        cdss = list(genes(gene.sub_features, feature_type='CDS'))
        if len(cdss) == 0:
            log.warn("No CDS for gene %s", get_gff3_id(gene))
            continue
        seq = cdss[0]

        seq_str = str(seq.extract(record.seq))
        start_codon = seq_str[0:3]
        # Slice, not index: `seq_str[-3]` is a single base, and it raises
        # IndexError on sequences shorter than three bases.
        stop_codon = seq_str[-3:]
        # These are plain attributes: no name mangling applies because this
        # function is not defined inside a class body.
        seq.__start = start_codon
        seq.__stop = stop_codon

        overall[start_codon] = overall.get(start_codon, 0) + 1

        if start_codon not in ('ATG', 'TTG', 'GTG'):
            log.warn("Weird start codon (%s) on %s", start_codon,
                     get_gff3_id(gene))
            seq.__error = 'Unusual start codon %s' % start_codon

            # Mark the three bases of the start codon.  On the reverse strand
            # they sit at the high-coordinate end of the CDS; pass them in
            # ascending order so the location is always start <= end.
            if seq.strand > 0:
                s = seq.location.start
                e = seq.location.start + 3
            else:
                s = seq.location.end - 3
                e = seq.location.end

            results.append(seq)
            qc_features.append(gen_qc_feature(
                s, e,
                'Weird start codon',
                strand=seq.strand,
                id_src=gene
            ))
            bad += 1
        else:
            good += 1

    return good, bad, results, qc_features, overall
def coding_density(record, mean=92.5, sd=20):
    """Score the genome's coding density.

    Sums the lengths of all CDS sub-features of the coding genes (overlapping
    CDSs are counted more than once) and divides by the sequence length.

    Returns a 2-tuple: the percentage passed through ``norm`` with the given
    ``mean`` and ``sd`` and scaled by 100, and the percentage itself, both
    truncated to ints.
    """
    total_cds_bases = 0
    for gene in coding_genes(record.features):
        for cds in genes(gene.sub_features, feature_type="CDS"):
            total_cds_bases += len(cds)

    fraction = float(total_cds_bases) / float(len(record.seq))
    percent = 100 * fraction
    score = int(norm(percent, mean=mean, sd=sd) * 100)
    return score, int(percent)
def length(record, feature):
    """CDS Length (AA)
    """
    # NOTE(review): the docstring above reads like a display label and is
    # presumably consumed at runtime, so it is left untouched -- confirm.
    #
    # Total CDS length in codons, minus one (presumably to exclude the stop
    # codon -- confirm), returned as a string.
    cdss = list(genes(feature.sub_features, feature_type="CDS", sort=True))
    total_nt = sum(len(cds) for cds in cdss)
    # Floor division keeps the count integral; true division rendered it as
    # e.g. "99.0" under Python 3.
    return str(total_nt // 3 - 1)
def missing_rbs(record, lookahead_min=5, lookahead_max=15):
    """Identify coding genes with a missing or distant RBS.

    For a gene without an annotated RBS, the sequence ``lookahead_min`` to
    ``lookahead_max`` bases upstream of the gene is extracted and scanned
    with ``NaiveSDCaller``.  The gene gets an ``__upstream`` attribute (the
    lower-cased upstream sequence, highlighted when a candidate was found)
    and a ``__message`` attribute, and is always reported as bad.

    For a gene with an annotated RBS, the distance between the first RBS and
    the first CDS is computed; the gene is reported as bad when that distance
    exceeds ``lookahead_max``.

    Returns a 5-tuple ``(good, bad, results, qc_features, any_rbss)`` where
    ``results`` lists the reported genes and ``any_rbss`` is True when at
    least one gene had an annotated RBS.
    """
    results = []
    good = 0
    bad = 0
    qc_features = []
    sd_finder = NaiveSDCaller()

    any_rbss = False

    for gene in coding_genes(record.features):
        # Check if there are RBSs, TODO: make this recursive. Each feature in
        # gene.sub_features can also have sub_features.
        rbss = get_rbs_from(gene)

        if len(rbss) == 0:
            # No RBS found: get the sequence lookahead_min to lookahead_max
            # upstream of the gene.
            if gene.strand > 0:
                start = gene.location.start - lookahead_max
                end = gene.location.start - lookahead_min
            else:
                start = gene.location.end + lookahead_min
                end = gene.location.end + lookahead_max
            # We have to ensure the feature is ON the genome, otherwise we may
            # be trying to access a location outside of the length of the
            # genome, which would be bad.
            (start, end) = __ensure_location_in_bounds(
                start=start, end=end, parent_length=len(record))
            # Temporary feature to extract sequence
            tmp = SeqFeature(FeatureLocation(start, end, strand=gene.strand),
                             type="domain")
            # Get the sequence
            seq = str(tmp.extract(record.seq))
            # Set the default properties
            gene.__upstream = seq.lower()
            gene.__message = "No RBS annotated, None found"

            # Try and do an automated shinefind call
            sds = sd_finder.list_sds(seq)
            if len(sds) > 0:
                sd = sds[0]
                gene.__upstream = sd_finder.highlight_sd(
                    seq.lower(), sd["start"], sd["end"])
                gene.__message = "Unannotated but valid RBS"

            qc_features.append(
                gen_qc_feature(start, end, "Missing RBS",
                               strand=gene.strand, id_src=gene))

            bad += 1
            results.append(gene)
        else:
            if len(rbss) > 1:
                # Log how many RBSs there are, as the message promises.
                log.warn("%s RBSs found for gene %s", len(rbss),
                         get_gff3_id(gene))
            any_rbss = True

            cdss = list(genes(gene.sub_features, feature_type="CDS"))
            if len(cdss) == 0:
                # Without a CDS there is nothing to measure against; skip
                # rather than fail on the [0] lookup below.
                log.warn("Gene missing CDS subfeature %s", get_gff3_id(gene))
                continue

            # get first RBS/CDS
            cds = cdss[0]
            rbs = rbss[0]

            # Get the distance between the two
            if gene.strand > 0:
                distance = cds.location.start - rbs.location.end
            else:
                distance = rbs.location.start - cds.location.end

            # If the RBS is too far away, annotate that
            if distance > lookahead_max:
                gene.__message = "RBS too far away (%s nt)" % distance

                qc_features.append(
                    gen_qc_feature(
                        rbs.location.start,
                        rbs.location.end,
                        gene.__message,
                        strand=gene.strand,
                        id_src=gene,
                    ))

                bad += 1
                results.append(gene)
            else:
                good += 1

    return good, bad, results, qc_features, any_rbss
def gene_model_correction_issues(record):
    """Report locus-tag problems recorded in ``cpt_gmc`` qualifiers.

    Each coding gene and each of its CDS sub-features is inspected for a
    qualifier whose key is ``cpt_gmc``.  Depending on the recorded problem,
    the ``locus_tag`` qualifiers of the gene and/or CDS are rewritten and the
    problem is reported.  A gene counts as bad when it produced at least one
    report, otherwise as good.

    Returns a 4-tuple ``(good, bad, results, qc_features)`` where ``results``
    holds ``(gene, cds, message)`` tuples.
    """
    good = 0
    bad = 0
    results = []
    qc_features = []

    for gene in coding_genes(record.features):
        # Child CDSs and the gene-level cpt_gmc qualifier (if any).
        child_cdss = list(genes(gene.sub_features, feature_type="CDS"))
        gene_flags = [
            (key, values)
            for (key, values) in gene.qualifiers.items()
            if key == "cpt_gmc"
        ]

        gene_results = []
        gene_qc = []

        # Problems recorded on the parent gene itself.
        for _, values in gene_flags:
            if "Missing Locus Tag" not in values:
                continue
            # Missing locus tag is an either or thing, if it hits here
            # there shouldn't be anything else wrong with it.
            # Obviously missing so we remove it
            gene.qualifiers["locus_tag"] = [""]
            # Translation from bp_genbank2gff3.py
            first_cds = child_cdss[0]
            first_cds.qualifiers["locus_tag"] = first_cds.qualifiers["Name"]
            gene_results.append(
                (gene, first_cds, "Gene is missing a locus_tag"))
            gene_qc.append(
                gen_qc_feature(
                    gene.location.start,
                    gene.location.end,
                    "Gene is missing a locus_tag",
                    strand=gene.strand,
                ))

        # Problems recorded on the children.
        for cds in child_cdss:
            # Snapshot the problems first; the qualifiers are edited below.
            cds_problems = [
                values[0]
                for (key, values) in cds.qualifiers.items()
                if key == "cpt_gmc"
            ]
            for problem in cds_problems:
                if problem == "BOTH Missing Locus Tag":
                    gene.qualifiers["locus_tag"] = [""]
                    cds.qualifiers["locus_tag"] = [""]
                    gene_results.append(
                        (gene, cds,
                         "Both gene and CDS are missing locus tags"))
                    gene_qc.append(
                        gen_qc_feature(
                            cds.location.start,
                            cds.location.end,
                            "CDS is missing a locus_tag",
                            strand=cds.strand,
                        ))
                    gene_qc.append(
                        gen_qc_feature(
                            gene.location.start,
                            gene.location.end,
                            "Gene is missing a locus_tag",
                            strand=gene.strand,
                        ))
                elif problem == "Different locus tag from associated gene.":
                    gene.qualifiers["locus_tag"] = gene.qualifiers["Name"]
                    cds.qualifiers["locus_tag"] = cds.qualifiers[
                        "cpt_gmc_locus"]
                    gene_results.append(
                        (gene, cds, "Gene and CDS have differing locus tags"))
                    gene_qc.append(
                        gen_qc_feature(
                            gene.location.start,
                            gene.location.end,
                            "Gene and CDS have differing locus tags",
                            strand=gene.strand,
                        ))
                elif problem == "Missing Locus Tag":
                    # The gene's tag is copied from its Name; the CDS one is
                    # the one that is missing.
                    gene.qualifiers["locus_tag"] = gene.qualifiers["Name"]
                    cds.qualifiers["locus_tag"] = [""]
                    gene_results.append(
                        (gene, cds, "CDS is missing a locus_tag"))
                    gene_qc.append(
                        gen_qc_feature(
                            cds.location.start,
                            cds.location.end,
                            "CDS is missing a locus_tag",
                            strand=cds.strand,
                        ))
                else:
                    log.warn("Cannot handle %s", problem)

        if gene_results:
            bad += 1
        else:
            good += 1

        qc_features.extend(gene_qc)
        results.extend(gene_results)

    return good, bad, results, qc_features
def weird_starts(record):
    """Find coding genes whose first CDS has an unusual start codon.

    The first CDS of every coding gene is extracted and its first and last
    three bases are stored on the CDS as ``__start`` and ``__stop``.  Start
    codons other than ATG, TTG and GTG are reported, and ``__error`` is set
    on the CDS.  Genes without a CDS are logged and skipped.

    A CDS shorter than three bases is fatal: a message is written to stderr
    and the process exits with status 2.

    Returns a 5-tuple ``(good, bad, results, qc_features, overall)`` where
    ``results`` lists the offending CDSs, ``qc_features`` marks the three
    start-codon bases of each, and ``overall`` maps every start codon seen to
    its number of occurrences.
    """
    good = 0
    bad = 0
    qc_features = []
    results = []

    overall = {}
    for gene in coding_genes(record.features):
        cdss = list(genes(gene.sub_features, feature_type="CDS"))
        if len(cdss) == 0:
            log.warn("No CDS for gene %s", get_gff3_id(gene))
            continue
        seq = cdss[0]

        seq_str = str(seq.extract(record.seq))
        if len(seq_str) < 3:
            sys.stderr.write("Fatal Error: CDS of length less than 3 at " +
                             str(seq.location) + '\n')
            # sys.exit rather than the `exit` builtin, which is only
            # installed by the `site` module.
            sys.exit(2)

        # TODO: a check reporting CDSs whose length is not a multiple of
        # three ("Bad CDS Length") used to be sketched here but was disabled.

        start_codon = seq_str[0:3]
        # Slice, not index: `seq_str[-3]` is a single base, not a codon.
        stop_codon = seq_str[-3:]
        # These are plain attributes: no name mangling applies because this
        # function is not defined inside a class body.
        seq.__start = start_codon
        seq.__stop = stop_codon

        overall[start_codon] = overall.get(start_codon, 0) + 1

        if start_codon not in ("ATG", "TTG", "GTG"):
            log.warn("Weird start codon (%s) on %s", start_codon,
                     get_gff3_id(gene))
            seq.__error = "Unusual start codon %s" % start_codon

            # Mark the three bases of the start codon.  On the reverse strand
            # they sit at the high-coordinate end of the CDS; pass them in
            # ascending order so the location is always start <= end.
            if seq.strand > 0:
                s = seq.location.start
                e = seq.location.start + 3
            else:
                s = seq.location.end - 3
                e = seq.location.end

            results.append(seq)
            qc_features.append(
                gen_qc_feature(s, e, "Weird start codon",
                               strand=seq.strand, id_src=gene))
            bad += 1
        else:
            good += 1

    return good, bad, results, qc_features, overall
def excessive_gap(
    record,
    excess=50,
    excess_divergent=200,
    min_gene=30,
    slop=30,
    lookahead_min=5,
    lookahead_max=15,
):
    """Identify excessive gaps between coding regions and search them for genes.

    CDSs (taken gene by gene, genes sorted by start) lying within ``excess``
    bases of each other are merged into contiguous regions.  The gaps before
    the first region, between regions and after the last region are then
    examined.  A gap is excessive when it is larger than ``excess`` (default
    50) and the features at both edges are on the same strand, or larger than
    ``excess_divergent`` (default 200) when the strands differ or an edge has
    no feature.

    Each excessive gap, widened by ``slop`` bases on both sides, is scanned
    with ``MGAFinder`` (``min_gene`` is passed through to it) and the hits
    are filtered with ``require_sd`` using ``lookahead_min`` and
    ``lookahead_max``.  Every surviving hit is added to the QC features as a
    "Possible gene" with RBS and CDS sub-features.

    Returns a 4-tuple ``(good, bad, results, qc_features)`` where ``results``
    holds ``[gap_start, gap_end, strand_before, strand_after,
    n_putative_genes]`` lists, ``bad`` is the number of gaps containing at
    least one putative gene and ``good`` is ``len(genes) + 1 - bad``.
    """
    results = []
    good = 0
    bad = 0

    contiguous_regions = []

    sorted_genes = sorted(genes(record.features),
                          key=lambda feature: feature.location.start)
    if len(sorted_genes) == 0:
        log.warn("NO GENES FOUND")
        return good, bad, results, []

    current_gene = None
    for gene in sorted_genes:
        # If the gene's start is contiguous to the "current_gene", then we
        # extend current_gene
        log.debug("gene.id %s", gene.id)

        for cds in genes(gene.sub_features, feature_type="CDS"):
            log.debug("\t%s %s", cds.id, cds.location)
            if current_gene is None:
                current_gene = [int(cds.location.start),
                                int(cds.location.end)]

            if cds.location.start <= current_gene[1] + excess:
                # Don't want to decrease size
                if int(cds.location.end) >= current_gene[1]:
                    current_gene[1] = int(cds.location.end)
            else:
                # If it's discontiguous, we append the region and clear.
                contiguous_regions.append(current_gene)
                current_gene = [int(cds.location.start),
                                int(cds.location.end)]

    if current_gene is None:
        # Genes exist but none has a CDS: there is no region to measure gaps
        # against, so report nothing instead of failing on a None region.
        log.warn("NO CDS FEATURES FOUND")
        return good, bad, results, []

    # This generally expected that annotations would NOT continue unto the end
    # of the genome, however that's a bug, and we can make it here with an
    # empty contiguous_regions list
    contiguous_regions.append(current_gene)

    # Sentinels for the genome edges let every gap be handled as the space
    # between two consecutive entries.
    boundaries = ([(1, 1)] + contiguous_regions
                  + [(len(record.seq), None)])
    for a, b in zip(boundaries, boundaries[1:]):
        gap_size = abs(b[0] - a[1])
        if gap_size <= min(excess, excess_divergent):
            continue

        # First gene (if any) found at each edge of the gap; None at the
        # ends of the genome.
        a_feat = next(
            iter(feature_lambda(
                sorted_genes,
                feature_test_location,
                {"loc": a[1]},
                subfeatures=False,
            )),
            None,
        )
        b_feat = next(
            iter(feature_lambda(
                sorted_genes,
                feature_test_location,
                {"loc": b[0]},
                subfeatures=False,
            )),
            None,
        )

        result_obj = [
            a[1],
            b[0],
            None if not a_feat else a_feat.location.strand,
            None if not b_feat else b_feat.location.strand,
        ]

        # The same-strand threshold applies only when both neighbours exist
        # and agree on strand; otherwise use the divergent threshold.
        if (a_feat is not None and b_feat is not None
                and a_feat.location.strand == b_feat.location.strand):
            threshold = excess
        else:
            threshold = excess_divergent

        if gap_size > threshold:
            results.append(result_obj)

    better_results = []
    qc_features = []
    of = MGAFinder(11, "CDS", "closed", min_gene)
    # of = OrfFinder(11, 'CDS', 'closed', min_gene)

    for result_obj in results:
        start = result_obj[0]
        end = result_obj[1]
        f = gen_qc_feature(start, end,
                           "Excessive gap, %s bases" % abs(end - start))
        qc_features.append(f)
        # NOTE(review): for the leading gap `start` is 1, so `start - slop`
        # is negative and the slice wraps around to the end of the record --
        # confirm whether this should be clamped at 0.
        putative_genes = of.putative_genes_in_sequence(
            str(record[start - slop:end + slop].seq))
        putative_genes = list(
            require_sd(putative_genes, record, start, lookahead_min,
                       lookahead_max))
        for putative_gene in putative_genes:
            # (0, 33, 1, 'ATTATTTTATCAAAACGCTTTACAATCTTTTAG', 'MILSKRFTIF', 123123, 124324)
            strand = putative_gene[2]
            possible_gene_start = start + putative_gene[0]
            possible_gene_end = start + putative_gene[1]

            possible_cds = SeqFeature(
                _ordered_location(possible_gene_start, possible_gene_end,
                                  strand),
                type="CDS",
            )

            # Now we adjust our boundaries for the RBS that's required
            # There are only two cases, the rbs is upstream of it, or downstream
            if putative_gene[5] < possible_gene_start:
                possible_gene_start = putative_gene[5]
            else:
                possible_gene_end = putative_gene[6]

            possible_rbs = SeqFeature(
                _ordered_location(putative_gene[5], putative_gene[6], strand),
                type="Shine_Dalgarno_sequence",
            )

            possible_gene = SeqFeature(
                _ordered_location(possible_gene_start, possible_gene_end,
                                  strand),
                type="gene",
                qualifiers={"note": ["Possible gene"]},
            )
            possible_gene.sub_features = [possible_rbs, possible_cds]
            qc_features.append(possible_gene)

        better_results.append(result_obj + [len(putative_genes)])

    # Bad gaps are those with more than zero possible genes found.  The count
    # is the last element appended above; index 2 is a strand, not a count.
    bad = len([x for x in better_results if x[-1] > 0])
    # Generally taking "good" here as every possible gap in the genome
    # Thus, good is TOTAL - gaps
    good = len(sorted_genes) + 1 - bad
    # and bad is just gaps
    return good, bad, better_results, qc_features


def _ordered_location(pos_a, pos_b, strand):
    """Return a FeatureLocation over the two positions with start <= end."""
    if pos_a <= pos_b:
        return FeatureLocation(pos_a, pos_b, strand=strand)
    return FeatureLocation(pos_b, pos_a, strand=strand)