def check_groups(self):
    """Collect the groups declared on the feature and validate them.

    For each tag in ``self.group_tags`` the qualifier with the same name
    (first letter lowercased) is looked up on ``self.f``.  Every value is
    normalised with ``self.check_group`` and then:

    * an empty result records a ``GROUP_UNKNOWN`` error;
    * a value already present in ``self.groups`` records a
      ``GROUP_MULTIPLE_SAME`` warning, otherwise it is added to
      ``self.groups``;
    * a value missing from ``self.allowed_groups`` records a
      ``GROUP_UNKNOWN`` error.

    Afterwards, when no group was collected, ``'Unknown'`` is stored and a
    ``GROUP_NONE`` error is recorded unless ``self.no_group`` is set; when
    more than one group was collected a ``GROUP_MULTIPLE`` warning is
    recorded.
    """
    for tag in self.group_tags:
        # In gff, the attribute will have a lowercase first letter
        qualifier_key = tag[0].lower() + tag[1:]
        if qualifier_key not in self.f.qualifiers:
            continue

        for raw_group in self.f.qualifiers[qualifier_key]:
            cleaned = self.check_group(raw_group)

            if cleaned == "":
                self.errors.append(
                    GeneError(GeneError.GROUP_UNKNOWN, self,
                              {'group': cleaned}))
                continue

            if cleaned in self.groups:
                self.warnings.append(
                    GeneError(GeneError.GROUP_MULTIPLE_SAME, self,
                              {'group': cleaned}))
            else:
                self.groups.append(cleaned)

            if cleaned not in self.allowed_groups:
                self.errors.append(
                    GeneError(GeneError.GROUP_UNKNOWN, self,
                              {'group': cleaned}))

    group_count = len(self.groups)
    if group_count == 0:
        self.groups.append('Unknown')
        if not self.no_group:
            self.errors.append(GeneError(GeneError.GROUP_NONE, self))
    elif group_count > 1:
        self.warnings.append(GeneError(GeneError.GROUP_MULTIPLE, self))
def check_deleted_name(self):
    """Read the ``Name`` qualifier and check it against the ID pattern.

    A ``DELETED_MISSING_NAME`` error is recorded when the qualifier is
    absent or its first value is ``""`` or ``"true"``.  Otherwise the
    stripped first value is stored in ``self.name`` and a
    ``DELETED_WRONG_NAME`` error is recorded if it does not match
    ``^[A-Z]{2,3}[0-9]{5,8}-R[A-Z]$``.
    """
    qualifiers = self.f.qualifiers
    if 'Name' not in qualifiers or qualifiers['Name'][0] in ("", "true"):
        self.errors.append(GeneError(GeneError.DELETED_MISSING_NAME, self))
        return

    self.name = qualifiers['Name'][0].strip()
    matches_id = re.match("^[A-Z]{2,3}[0-9]{5,8}-R[A-Z]$", self.name)
    if matches_id is None:
        self.errors.append(
            GeneError(GeneError.DELETED_WRONG_NAME, self,
                      {'name': self.name}))
def check_intron(self, min_intron_size=9):
    """Warn about introns shorter than ``min_intron_size``.

    Exons are the ``type == "exon"`` children of each sub-feature of
    ``self.f``.  Introns are computed separately for every sub-feature
    (transcript): pooling the exons of several transcripts would pair exons
    that belong to different isoforms and report overlapping exons as
    negative-size introns.  An intron shared by several transcripts is
    reported only once.

    Args:
        min_intron_size: smallest accepted gap between the end of an exon
            and the start of the next one; defaults to 9, the previously
            hard-coded threshold.

    Side effects:
        Appends one ``INTRON_TOO_SMALL`` warning to ``self.warnings`` per
        distinct offending intron.
    """
    reported = set()

    for mrna in self.f.sub_features:
        # Find positions: exon start -> exon end, for this transcript only
        exon_coords = {}
        for gchild in mrna.sub_features:
            if gchild.type == "exon":
                exon_coords[gchild.location.start] = gchild.location.end

        # Check minimum intron size between consecutive exons
        previous_end = None
        for exon_start in sorted(exon_coords):
            if previous_end is not None:
                intron_size = exon_start - previous_end
                intron = (previous_end, exon_start)
                if intron_size < min_intron_size and intron not in reported:
                    reported.add(intron)
                    self.warnings.append(
                        GeneError(
                            GeneError.INTRON_TOO_SMALL, self, {
                                'len': intron_size,
                                'start': exon_start,
                                'end': previous_end
                            }))
            previous_end = exon_coords[exon_start]
def check_cds(self):
    """Check the cumulated length of the CDS parts found under the feature.

    The lengths of all ``type == 'CDS'`` grandchildren of ``self.f`` are
    summed.  A ``CDS_IS_NULL`` error is recorded when the total is 0 and a
    ``CDS_IS_SMALL`` warning when it is below 20 (both are recorded for a
    total of 0).  Nothing is checked when ``self.f`` has no sub-features.
    """
    transcripts = self.f.sub_features
    if len(transcripts) == 0:
        return

    # Check the total length of CDS
    total_cds_length = 0
    for transcript in transcripts:
        for part in transcript.sub_features:
            if part.type != 'CDS':
                continue
            begin = part.location.start
            finish = part.location.end
            # Coordinates may come in either order; always add a positive span
            span = (finish - begin) if finish > begin else (begin - finish)
            total_cds_length += span

    if total_cds_length == 0:
        self.errors.append(GeneError(GeneError.CDS_IS_NULL, self))
    if total_cds_length < 20:
        self.warnings.append(
            GeneError(GeneError.CDS_IS_SMALL, self,
                      {'len': total_cds_length}))
def post_validation(self):
    """Run the checks that need the complete list of genes.

    * Every entry of ``self.splitted_genes`` holding a single part gets a
      ``PART_SINGLE`` error when the gene's ``display_id`` is a 32-character
      uppercase hexadecimal string, otherwise a ``PART_SINGLE_NAMED``
      warning.
    * Every entry of ``self.duplicated_genes`` holding a single allele gets
      an ``ALLELE_SINGLE`` error.
    * Genes flagged above are stored (again) in ``self.all_genes`` under
      their ``wa_id``.
    * Among genes that are neither alleles, parts nor deleted, symbols and
      names must be unique: every gene sharing a value receives one
      ``SYMBOL_NOT_UNIQUE`` / ``NAME_NOT_UNIQUE`` error.
    """
    # validate splitted and duplicated genes once we collected the whole list
    for parts in self.splitted_genes.values():
        if len(parts) != 1:
            continue
        for gene in parts.values():
            looks_like_raw_id = (len(gene.display_id) == 32 and re.match(
                "^[A-F0-9]+$", gene.display_id))
            if looks_like_raw_id:
                # If there is no name, it's probably the cause of the problem
                gene.errors.append(GeneError(GeneError.PART_SINGLE, gene))
            else:
                # If there is a symbol it's probably an incomplete gene
                gene.warnings.append(
                    GeneError(GeneError.PART_SINGLE_NAMED, gene))
            self.all_genes[gene.wa_id] = gene

    for alleles in self.duplicated_genes.values():
        if len(alleles) != 1:
            continue
        for gene in alleles.values():
            gene.errors.append(GeneError(GeneError.ALLELE_SINGLE, gene))
            self.all_genes[gene.wa_id] = gene

    def flag_duplicates(attribute, make_error):
        # Add one error to every gene sharing the same value of `attribute`
        first_seen = {}
        already_flagged = set()
        for candidate in self.all_genes.values():
            if candidate.allele or candidate.part or candidate.is_deleted:
                continue
            value = getattr(candidate, attribute)
            if value is None:
                continue
            if value not in first_seen:
                first_seen[value] = candidate
                continue
            if value not in already_flagged:
                # The first holder is only flagged once a duplicate shows up
                first_holder = first_seen[value]
                first_holder.errors.append(make_error(first_holder))
                already_flagged.add(value)
            candidate.errors.append(make_error(candidate))

    # Check symbol and name are unique
    flag_duplicates(
        'symbol', lambda g: GeneError(GeneError.SYMBOL_NOT_UNIQUE, g))
    flag_duplicates(
        'name', lambda g: GeneError(GeneError.NAME_NOT_UNIQUE, g))
def check_symbol(self):
    """Validate the ``symbol`` qualifier and store it when acceptable.

    A ``SYMBOL_MISSING`` error is recorded when the qualifier is absent or
    its first value is ``""`` or ``"true"``.  Otherwise the stripped value
    becomes ``self.display_id`` and is then checked:

    * characters outside ``[A-Za-z0-9-_.()/]`` record ``SYMBOL_INVALID``;
    * a value matching ``^[A-Z]{2,3}[0-9]{5,8}-R[A-Z]$`` records
      ``SYMBOL_NOT_ID``;
    * anything else is stored in ``self.symbol``.
    """
    qualifiers = self.f.qualifiers
    if 'symbol' not in qualifiers or qualifiers['symbol'][0] in ("", "true"):
        self.errors.append(GeneError(GeneError.SYMBOL_MISSING, self))
        return

    symbol = qualifiers['symbol'][0].strip()
    self.display_id = symbol

    if re.match("^[A-Za-z0-9-_.()/]+$", symbol) is None:
        self.errors.append(
            GeneError(GeneError.SYMBOL_INVALID, self, {'symbol': symbol}))
    elif re.match("^[A-Z]{2,3}[0-9]{5,8}-R[A-Z]$", symbol) is not None:
        self.errors.append(
            GeneError(GeneError.SYMBOL_NOT_ID, self, {'symbol': symbol}))
    else:
        self.symbol = symbol
def check_dbxref(self):
    """Validate the ``Dbxref`` qualifier values of the feature.

    The database name of each value is the text before the first ``:``,
    stripped, and compared case-insensitively:

    * a name equal to one of ``self.group_tags`` records a
      ``GROUP_MISPLACED`` error (once per matching tag);
    * a name other than ``go``, ``pmid``, ``ncbi`` or ``uniprot`` records a
      ``DBXREF_UNKNOWN`` warning;
    * the name ``go`` sets ``self.has_goid`` to ``True``.

    GO detection relies on the parsed database name instead of a
    case-sensitive prefix test on the raw value, so it agrees with the
    whitelist check: ``go:...`` and values with leading blanks are counted,
    while other databases whose name merely starts with ``GO`` are not.
    """
    if 'Dbxref' not in self.f.qualifiers:
        return

    known_dbs = ('go', 'pmid', 'ncbi', 'uniprot')

    for dbxref in self.f.qualifiers['Dbxref']:
        db = dbxref.split(":")[0].strip().lower()

        # A group tag used as a database name means the group was declared
        # in the wrong attribute
        for tag in self.group_tags:
            if tag.lower() == db:
                self.errors.append(
                    GeneError(GeneError.GROUP_MISPLACED, self, {'tag': tag}))

        if db not in known_dbs:
            self.warnings.append(
                GeneError(GeneError.DBXREF_UNKNOWN, self,
                          {'dbxref': dbxref}))

        if db == 'go':
            self.has_goid = True
def check_name(self):
    """Validate the ``Name`` qualifier and store it when acceptable.

    A ``NAME_MISSING`` error is recorded when the qualifier is absent or its
    first value is ``""`` or ``"true"``.  Otherwise the stripped value is
    checked, the first matching rule winning:

    * 32 characters of uppercase hexadecimal -> ``NAME_INVALID`` error;
    * contains ``putative`` (any case) -> ``PUTATIVE`` warning;
    * contains ``similar to`` (any case) -> ``SIMILAR_TO`` error;
    * contains ``-like`` (any case) -> ``SIMILAR_TO`` warning;
    * matches ``^[A-Z]{2,3}[0-9]{5,8}-R[A-Z]$`` -> ``NAME_NOT_ID`` error;
    * anything else is stored in ``self.name``.
    """
    qualifiers = self.f.qualifiers
    if 'Name' not in qualifiers or qualifiers['Name'][0] in ("", "true"):
        self.errors.append(GeneError(GeneError.NAME_MISSING, self))
        return

    name = qualifiers['Name'][0].strip()
    lowered = name.lower()
    details = {'name': name}

    if len(name) == 32 and re.match("^[A-F0-9]+$", name):
        self.errors.append(
            GeneError(GeneError.NAME_INVALID, self, details))
    elif 'putative' in lowered:
        self.warnings.append(GeneError(GeneError.PUTATIVE, self, details))
    elif 'similar to' in lowered:
        self.errors.append(GeneError(GeneError.SIMILAR_TO, self, details))
    elif '-like' in lowered:
        self.warnings.append(GeneError(GeneError.SIMILAR_TO, self, details))
    elif re.match("^[A-Z]{2,3}[0-9]{5,8}-R[A-Z]$", name):
        self.errors.append(GeneError(GeneError.NAME_NOT_ID, self, details))
    else:
        self.name = name
def get_tag_value(self, key, allowed=None):
    """Return the first value of the qualifier named ``key``.

    Qualifier names are compared case-insensitively after stripping
    surrounding whitespace; the first matching qualifier wins.

    Args:
        key: name of the qualifier to look up.
        allowed: optional collection of accepted values.  When it is
            non-empty and the value is not part of it, an
            ``ATTRIBUTE_INVALID`` error is appended to ``self.errors``; the
            value is returned anyway.  ``None`` (the default) or an empty
            collection disables the check.

    Returns:
        The stripped first value of the matching qualifier, or ``None`` when
        no qualifier matches.
    """
    wanted = key.lower()

    for qk in self.f.qualifiers:
        if qk.strip().lower() != wanted:
            continue

        new_value = self.f.qualifiers[qk][0].strip()
        if allowed and new_value not in allowed:
            self.errors.append(
                GeneError(GeneError.ATTRIBUTE_INVALID, self, {
                    'key': key,
                    'value': new_value
                }))
        return new_value

    return None
def check_multiple_mrnas(self):
    """Check how mRNAs are named when the feature has several sub-features.

    Only runs when ``self.f`` has more than one sub-feature.  Each child of
    type ``mRNA`` must be named after the gene: its ``Name`` has to start
    with the gene ``Name`` and the remainder has to match
    ``^ [A-F]{1,2}$``.  An ``INVALID_MRNA_NAME`` error is recorded for every
    child that does not comply.
    """
    if len(self.f.sub_features) <= 1:
        return

    gene_name = self.f.qualifiers['Name'][0]
    prefix_length = len(gene_name)

    for child in self.f.sub_features:
        if child.type != "mRNA":
            continue

        mrna_name = child.qualifiers['Name'][0]
        well_formed = (
            len(mrna_name) >= prefix_length
            and mrna_name.startswith(gene_name)
            and re.match("^ [A-F]{1,2}$", mrna_name[prefix_length:]))
        if not well_formed:
            self.errors.append(
                GeneError(GeneError.INVALID_MRNA_NAME, self,
                          {'gene_name': gene_name}))
def validate_genes(self):
    """Parse ``self.in_file`` with ``GFF.parse`` and register every feature.

    For each feature of each record:

    * a ``gene`` whose ``status`` qualifier is absent, empty or different
      from ``deleted`` is wrapped in a ``Gene``, stored in
      ``self.all_genes`` and used to update the statistics
      (``genes_with_goid``, ``groups_stats``, ``genes_seen_once``), the
      collected ``wa_errors`` and the ``splitted_genes`` /
      ``duplicated_genes`` indexes.  ``PART_SAME`` / ``ALLELE_SAME`` errors
      are recorded when the same part or allele was already indexed;
    * any feature whose ``status`` is ``deleted`` is wrapped in a ``Gene``
      and stored in ``self.all_genes`` only;
    * anything else records a ``WAError.UNEXPECTED_FEATURE``.

    The input file is opened in a ``with`` block so the handle is released
    even if parsing or gene construction raises.
    """

    def describe(other):
        # Payload identifying the previously seen gene in *_SAME errors
        return {
            'other_name': other.display_id,
            'other_scaff': other.scaffold,
            'other_start': other.f.location.start,
            'other_end': other.f.location.end
        }

    with open(self.in_file) as in_handle:
        for rec in GFF.parse(in_handle):
            for f in rec.features:
                status = (f.qualifiers['status']
                          if 'status' in f.qualifiers else None)
                is_deleted = bool(status) and status[0].lower() == "deleted"

                if f.type == "gene" and not is_deleted:
                    gene = Gene(f, rec.id, self.scaf_lengths[rec.id],
                                self.allowed_groups, self.group_tags,
                                self.no_group, self.split_users)

                    self.all_genes[gene.wa_id] = gene

                    # Count number of genes with goid
                    if gene.has_goid:
                        self.genes_with_goid += 1

                    # Collect stats on groups
                    for g in gene.groups:
                        if g not in self.groups_stats:
                            self.groups_stats[g] = 0
                        self.groups_stats[g] += 1

                    new_part = gene.part
                    new_allele = gene.allele

                    # Collect wa_errors
                    self.wa_errors.extend(gene.wa_errors)

                    if not new_part and not new_allele:
                        self.genes_seen_once += 1

                    # keep track of splitted genes
                    if new_part:
                        part_gene_key = gene.display_id
                        if new_allele:
                            # Parts of different alleles are tracked apart
                            part_gene_key = (gene.display_id + ", allele " +
                                             new_allele)

                        if part_gene_key not in self.splitted_genes:
                            self.splitted_genes[part_gene_key] = {}
                        known_parts = self.splitted_genes[part_gene_key]

                        if new_part in known_parts:
                            gene.errors.append(
                                GeneError(GeneError.PART_SAME, gene,
                                          describe(known_parts[new_part])))

                        known_parts[new_part] = gene

                    # keep track of duplicated genes
                    if new_allele:
                        allele_gene_key = gene.display_id

                        if allele_gene_key not in self.duplicated_genes:
                            self.duplicated_genes[allele_gene_key] = {}
                        known_alleles = self.duplicated_genes[allele_gene_key]

                        if new_allele in known_alleles:
                            identical = known_alleles[new_allele]
                            # Same allele is only an error for the same part
                            if identical.part == new_part:
                                gene.errors.append(
                                    GeneError(GeneError.ALLELE_SAME, gene,
                                              describe(identical)))

                        known_alleles[new_allele] = gene

                elif is_deleted:
                    gene = Gene(f, rec.id, self.scaf_lengths[rec.id],
                                self.allowed_groups, self.group_tags,
                                self.no_group, self.split_users)

                    self.all_genes[gene.wa_id] = gene
                else:
                    fake_gene = Gene(f, rec.id, self.scaf_lengths[rec.id],
                                     self.allowed_groups, self.group_tags)
                    self.wa_errors.append(
                        WAError(WAError.UNEXPECTED_FEATURE, fake_gene))
def check_status(self):
    """Record a ``NEEDS_REVIEW`` error when the status calls for a review.

    Nothing is checked when ``self.apollo_1x`` is set.  Otherwise the error
    is recorded when the ``status`` qualifier is absent or when its first
    value equals ``needs review`` (case-insensitive).
    """
    if self.apollo_1x:
        return

    qualifiers = self.f.qualifiers
    needs_review = ('status' not in qualifiers
                    or qualifiers['status'][0].lower() == "needs review")
    if needs_review:
        self.errors.append(GeneError(GeneError.NEEDS_REVIEW, self))