Example #1
0
    def check_groups(self):
        """Collect the groups declared on the feature and record problems.

        For every tag in ``self.group_tags`` the matching qualifier (same
        name with a lowercase first letter) is read, each value is passed
        through ``self.check_group`` and the result is stored in
        ``self.groups``. Repeated groups produce a warning; empty or
        not-allowed groups produce an error. When no group was collected,
        'Unknown' is stored instead.
        """

        for tag in self.group_tags:

            # In gff, the attribute will have a lowercase first letter
            attr_name = tag[0].lower() + tag[1:]
            if attr_name not in self.f.qualifiers:
                continue

            for raw_group in self.f.qualifiers[attr_name]:
                group = self.check_group(raw_group)

                # An empty result is reported as an unknown group
                if group == "":
                    self.errors.append(
                        GeneError(GeneError.GROUP_UNKNOWN, self,
                                  {'group': group}))
                    continue

                if group in self.groups:
                    self.warnings.append(
                        GeneError(GeneError.GROUP_MULTIPLE_SAME, self,
                                  {'group': group}))
                else:
                    self.groups.append(group)

                if group not in self.allowed_groups:
                    self.errors.append(
                        GeneError(GeneError.GROUP_UNKNOWN, self,
                                  {'group': group}))

        group_count = len(self.groups)
        if group_count > 1:
            self.warnings.append(GeneError(GeneError.GROUP_MULTIPLE, self))
        elif group_count == 0:
            # Fall back to a placeholder group; only an error when a group
            # is actually required (self.no_group is falsy)
            self.groups.append('Unknown')
            if not self.no_group:
                self.errors.append(GeneError(GeneError.GROUP_NONE, self))
Example #2
0
    def check_deleted_name(self):
        """Validate the 'Name' qualifier of a deleted gene.

        A missing, empty or "true" name is reported as
        DELETED_MISSING_NAME. Otherwise the stripped name is stored in
        ``self.name`` and must match the identifier pattern (2-3 capital
        letters, 5-8 digits, then "-R" and one capital letter); if it does
        not, DELETED_WRONG_NAME is reported.
        """

        qualifiers = self.f.qualifiers
        name_is_usable = ('Name' in qualifiers
                          and qualifiers['Name'][0] not in ("", "true"))

        if not name_is_usable:
            self.errors.append(GeneError(GeneError.DELETED_MISSING_NAME, self))
            return

        self.name = qualifiers['Name'][0].strip()
        looks_like_id = re.match("^[A-Z]{2,3}[0-9]{5,8}-R[A-Z]$", self.name)
        if looks_like_id is None:
            self.errors.append(
                GeneError(GeneError.DELETED_WRONG_NAME, self,
                          {'name': self.name}))
Example #3
0
    def check_intron(self):
        """Warn about gaps between consecutive exons shorter than 9.

        Exon coordinates of all grandchildren of type "exon" are pooled
        into a single start -> end mapping (two exons sharing a start keep
        only the last end seen), sorted by start, and every gap between
        one exon's end and the next exon's start is measured. Gaps below 9
        are recorded as INTRON_TOO_SMALL warnings. Nothing is done when the
        feature has no sub-features.
        """

        if len(self.f.sub_features) == 0:
            return

        # Find positions
        exon_coords = {}
        for mrna in self.f.sub_features:
            for gchild in mrna.sub_features:
                if gchild.type == "exon":
                    exon_coords[gchild.location.start] = gchild.location.end

        # Check minimum intron size; starts are unique dict keys, so
        # sorting the (start, end) pairs orders them by start
        previous_end = None
        for exon_start, exon_end in sorted(exon_coords.items()):
            if previous_end is not None:
                intron_size = exon_start - previous_end
                if intron_size < 9:
                    self.warnings.append(
                        GeneError(
                            GeneError.INTRON_TOO_SMALL, self, {
                                'len': intron_size,
                                'start': exon_start,
                                'end': previous_end
                            }))

            previous_end = exon_end
Example #4
0
    def check_cds(self):
        """Check the total length of the CDS parts below the feature.

        The lengths of all grandchildren of type 'CDS' are summed (the
        absolute difference of start and end, so reversed coordinates are
        tolerated). A total of 0 is reported as a CDS_IS_NULL error; a
        non-zero total below 20 is reported as a CDS_IS_SMALL warning.
        Nothing is done when the feature has no sub-features.
        """

        if len(self.f.sub_features) == 0:
            return

        cds_len = 0
        for sub1 in self.f.sub_features:
            for sub2 in sub1.sub_features:
                if sub2.type == 'CDS':
                    cds_len += abs(sub2.location.end - sub2.location.start)

        if cds_len == 0:
            self.errors.append(GeneError(GeneError.CDS_IS_NULL, self))
        elif cds_len < 20:
            # Only reached for a non-empty CDS: a null CDS is already
            # reported as an error above and is not reported a second time
            self.warnings.append(
                GeneError(GeneError.CDS_IS_SMALL, self, {'len': cds_len}))
Example #5
0
    def post_validation(self):
        """Run the checks that need the complete gene collection.

        * A split-gene group (entry of ``self.splitted_genes``) holding a
          single part is flagged: as a PART_SINGLE error when the gene's
          display id is 32 characters of A-F/0-9, otherwise as a
          PART_SINGLE_NAMED warning.
        * An allele group (entry of ``self.duplicated_genes``) holding a
          single allele is flagged with an ALLELE_SINGLE error.
        * Among genes that are neither alleles, parts nor deleted, every
          symbol and every name must be unique; each gene sharing a value
          gets a SYMBOL_NOT_UNIQUE / NAME_NOT_UNIQUE error (once per gene).

        Flagged split/allele genes are (re)stored in ``self.all_genes``
        under their ``wa_id``.
        """

        def genes_sharing_value(attr):
            """Yield each eligible gene whose `attr` value is used more than once."""
            first_seen = {}
            reported = set()
            for g in self.all_genes.values():
                if g.allele or g.part or g.is_deleted:
                    continue
                value = getattr(g, attr)
                if value is None:
                    continue
                if value not in first_seen:
                    first_seen[value] = g
                    continue
                # The first holder of the value is only reported once,
                # when its first duplicate shows up
                if value not in reported:
                    reported.add(value)
                    yield first_seen[value]
                yield g

        # validate splitted and duplicated genes once we collected the whole list
        for parts in self.splitted_genes.values():
            if len(parts) != 1:
                continue
            for gene in parts.values():
                if len(gene.display_id) == 32 and re.match(
                        "^[A-F0-9]+$", gene.display_id):
                    # If there is no name, it's probably the cause of the problem
                    gene.errors.append(GeneError(GeneError.PART_SINGLE, gene))
                else:
                    # If there is a symbol it's probably an incomplete gene
                    gene.warnings.append(
                        GeneError(GeneError.PART_SINGLE_NAMED, gene))

                self.all_genes[gene.wa_id] = gene

        for alleles in self.duplicated_genes.values():
            if len(alleles) != 1:
                continue
            for gene in alleles.values():
                gene.errors.append(GeneError(GeneError.ALLELE_SINGLE, gene))
                self.all_genes[gene.wa_id] = gene

        # Check symbol and name are unique
        for gene in genes_sharing_value('symbol'):
            gene.errors.append(GeneError(GeneError.SYMBOL_NOT_UNIQUE, gene))

        for gene in genes_sharing_value('name'):
            gene.errors.append(GeneError(GeneError.NAME_NOT_UNIQUE, gene))
Example #6
0
    def check_symbol(self):
        """Validate the 'symbol' qualifier and store it when acceptable.

        A missing, empty or "true" symbol is reported as SYMBOL_MISSING.
        Otherwise the stripped symbol becomes ``self.display_id``; it is
        rejected with SYMBOL_INVALID when it contains characters outside
        the allowed set, and with SYMBOL_NOT_ID when it looks like an
        identifier (2-3 capital letters, 5-8 digits, "-R" and one capital
        letter). Only an accepted symbol is stored in ``self.symbol``.
        """

        qualifiers = self.f.qualifiers
        if 'symbol' not in qualifiers or qualifiers['symbol'][0] in ("", "true"):
            self.errors.append(GeneError(GeneError.SYMBOL_MISSING, self))
            return

        symbol = qualifiers['symbol'][0].strip()
        self.display_id = symbol

        has_only_allowed_chars = re.match("^[A-Za-z0-9-_.()/]+$", symbol)
        if not has_only_allowed_chars:
            self.errors.append(
                GeneError(GeneError.SYMBOL_INVALID, self, {'symbol': symbol}))
            return

        if re.match("^[A-Z]{2,3}[0-9]{5,8}-R[A-Z]$", symbol):
            self.errors.append(
                GeneError(GeneError.SYMBOL_NOT_ID, self, {'symbol': symbol}))
            return

        self.symbol = symbol
Example #7
0
    def check_dbxref(self):
        """Validate the 'Dbxref' qualifier values of the feature.

        Each value is split on ":" and the part before the first colon is
        taken as the database name (compared case-insensitively):

        * a database name equal to one of ``self.group_tags`` is reported
          as a GROUP_MISPLACED error;
        * a database other than go, pmid, ncbi or uniprot is reported as a
          DBXREF_UNKNOWN warning;
        * a "go" database sets ``self.has_goid`` to True.
        """

        if 'Dbxref' not in self.f.qualifiers:
            return

        known_dbs = ('go', 'pmid', 'ncbi', 'uniprot')

        for dbxref in self.f.qualifiers['Dbxref']:
            db = dbxref.split(":")[0].strip().lower()

            for t in self.group_tags:
                if t.lower() == db:
                    self.errors.append(
                        GeneError(GeneError.GROUP_MISPLACED, self,
                                  {'tag': t}))

            if db not in known_dbs:
                self.warnings.append(
                    GeneError(GeneError.DBXREF_UNKNOWN, self,
                              {'dbxref': dbxref}))

            # Test the parsed database name rather than the raw prefix so
            # that the match is case-insensitive like the check above and
            # does not fire on other databases whose name begins with "GO"
            if db == 'go':
                self.has_goid = True
Example #8
0
    def check_name(self):
        """Validate the 'Name' qualifier and store it when acceptable.

        A missing, empty or "true" name is reported as NAME_MISSING.
        Otherwise the stripped name is checked, first match wins:

        * 32 characters of A-F/0-9        -> NAME_INVALID error
        * contains "putative"             -> PUTATIVE warning
        * contains "similar to"           -> SIMILAR_TO error
        * contains "-like"                -> SIMILAR_TO warning
        * looks like an identifier        -> NAME_NOT_ID error

        (substring tests are case-insensitive). Only a name passing all
        checks is stored in ``self.name``.
        """

        qualifiers = self.f.qualifiers
        if 'Name' not in qualifiers or qualifiers['Name'][0] in ("", "true"):
            self.errors.append(GeneError(GeneError.NAME_MISSING, self))
            return

        name = qualifiers['Name'][0].strip()
        lowered = name.lower()
        details = {'name': name}

        if len(name) == 32 and re.match("^[A-F0-9]+$", name):
            self.errors.append(GeneError(GeneError.NAME_INVALID, self, details))
        elif 'putative' in lowered:
            self.warnings.append(GeneError(GeneError.PUTATIVE, self, details))
        elif 'similar to' in lowered:
            self.errors.append(GeneError(GeneError.SIMILAR_TO, self, details))
        elif '-like' in lowered:
            self.warnings.append(GeneError(GeneError.SIMILAR_TO, self, details))
        elif re.match("^[A-Z]{2,3}[0-9]{5,8}-R[A-Z]$", name):
            self.errors.append(GeneError(GeneError.NAME_NOT_ID, self, details))
        else:
            self.name = name
Example #9
0
    def get_tag_value(self, key, allowed=()):
        """Return the first value of a qualifier, matched case-insensitively.

        Qualifier names are compared to ``key`` after stripping whitespace
        and lowercasing. The first value of the first matching qualifier is
        stripped and returned. When ``allowed`` is non-empty and the value
        is not in it, an ATTRIBUTE_INVALID error is recorded, but the value
        is still returned.

        :param key: qualifier name to look for
        :param allowed: optional collection of accepted values; empty
            (the default) disables the check
        :return: the stripped value, or None when no qualifier matches
        """
        wanted = key.lower()
        for qk in self.f.qualifiers:
            if wanted != qk.strip().lower():
                continue

            new_value = self.f.qualifiers[qk][0].strip()
            if len(allowed) > 0 and new_value not in allowed:
                self.errors.append(
                    GeneError(GeneError.ATTRIBUTE_INVALID, self, {
                        'key': key,
                        'value': new_value
                    }))

            return new_value

        return None
Example #10
0
    def check_multiple_mrnas(self):
        """Check mRNA naming when the feature has several sub-features.

        With more than one sub-feature, every child of type "mRNA" must be
        named after the gene: the gene's first 'Name' value followed by a
        space and one or two letters in A-F. Each child breaking that rule
        adds an INVALID_MRNA_NAME error.
        """

        if len(self.f.sub_features) <= 1:
            return

        gene_name = self.f.qualifiers['Name'][0]
        prefix_len = len(gene_name)

        for child in self.f.sub_features:
            if child.type != "mRNA":
                continue

            mrna_name = child.qualifiers['Name'][0]
            too_short = len(mrna_name) < prefix_len
            wrong_prefix = not mrna_name.startswith(gene_name)
            suffix_ok = re.match("^ [A-F]{1,2}$", mrna_name[prefix_len:])

            if too_short or wrong_prefix or not suffix_ok:
                self.errors.append(
                    GeneError(GeneError.INVALID_MRNA_NAME, self,
                              {'gene_name': gene_name}))
Example #11
0
    def validate_genes(self):
        """Parse ``self.in_file`` as GFF and validate every record feature.

        Each feature falls in one of three cases:

        * type "gene" and not marked deleted: a ``Gene`` is built and
          stored in ``self.all_genes``; GO, group and seen-once statistics
          are updated, its ``wa_errors`` are collected, and its part /
          allele are registered in ``self.splitted_genes`` /
          ``self.duplicated_genes``;
        * first 'status' value equal to "deleted" (case-insensitive): a
          ``Gene`` is built and stored in ``self.all_genes`` only;
        * anything else: an UNEXPECTED_FEATURE ``WAError`` is recorded.
        """
        # The context manager closes the file even when parsing or
        # validation raises
        with open(self.in_file) as in_handle:
            for rec in GFF.parse(in_handle):
                for f in rec.features:
                    is_deleted = bool(
                        'status' in f.qualifiers
                        and f.qualifiers['status']
                        and f.qualifiers['status'][0].lower() == "deleted")

                    if f.type == "gene" and not is_deleted:
                        gene = Gene(f, rec.id, self.scaf_lengths[rec.id],
                                    self.allowed_groups, self.group_tags,
                                    self.no_group, self.split_users)

                        self.all_genes[gene.wa_id] = gene

                        # Count number of genes with goid
                        if gene.has_goid:
                            self.genes_with_goid += 1

                        # Collect stats on groups
                        for g in gene.groups:
                            if g not in self.groups_stats:
                                self.groups_stats[g] = 0
                            self.groups_stats[g] += 1

                        # Collect wa_errors
                        self.wa_errors.extend(gene.wa_errors)

                        if not gene.part and not gene.allele:
                            self.genes_seen_once += 1

                        self._record_gene_part(gene)
                        self._record_gene_allele(gene)
                    elif is_deleted:
                        gene = Gene(f, rec.id, self.scaf_lengths[rec.id],
                                    self.allowed_groups, self.group_tags,
                                    self.no_group, self.split_users)

                        self.all_genes[gene.wa_id] = gene
                    else:
                        fake_gene = Gene(f, rec.id, self.scaf_lengths[rec.id],
                                         self.allowed_groups, self.group_tags)
                        self.wa_errors.append(
                            WAError(WAError.UNEXPECTED_FEATURE, fake_gene))

    def _record_gene_part(self, gene):
        """Register a split gene's part, flagging a part already seen (PART_SAME)."""
        new_part = gene.part
        if not new_part:
            return

        # Parts of different alleles are tracked separately
        part_gene_key = gene.display_id
        if gene.allele:
            part_gene_key = gene.display_id + ", allele " + gene.allele

        if part_gene_key not in self.splitted_genes:
            self.splitted_genes[part_gene_key] = {}
        known_parts = self.splitted_genes[part_gene_key]

        if new_part in known_parts:
            identical = known_parts[new_part]
            gene.errors.append(
                GeneError(
                    GeneError.PART_SAME, gene, {
                        'other_name': identical.display_id,
                        'other_scaff': identical.scaffold,
                        'other_start': identical.f.location.start,
                        'other_end': identical.f.location.end
                    }))

        # The latest gene replaces any previous holder of this part
        known_parts[new_part] = gene

    def _record_gene_allele(self, gene):
        """Register a gene's allele, flagging an allele already seen for the same part (ALLELE_SAME)."""
        new_allele = gene.allele
        if not new_allele:
            return

        allele_gene_key = gene.display_id
        if allele_gene_key not in self.duplicated_genes:
            self.duplicated_genes[allele_gene_key] = {}
        known_alleles = self.duplicated_genes[allele_gene_key]

        if new_allele in known_alleles:
            identical = known_alleles[new_allele]

            # Same allele on a different part is legitimate (split gene)
            if identical.part == gene.part:
                gene.errors.append(
                    GeneError(
                        GeneError.ALLELE_SAME, gene, {
                            'other_name': identical.display_id,
                            'other_scaff': identical.scaffold,
                            'other_start': identical.f.location.start,
                            'other_end': identical.f.location.end
                        }))

        # The latest gene replaces any previous holder of this allele
        known_alleles[new_allele] = gene
Example #12
0
    def check_status(self):
        """Report genes whose status is missing or still "needs review".

        The check is skipped when ``self.apollo_1x`` is truthy. Otherwise a
        NEEDS_REVIEW error is recorded when the 'status' qualifier is
        absent, has no value, or its first value is "needs review"
        (case-insensitive).
        """

        if self.apollo_1x:
            return

        qualifiers = self.f.qualifiers
        status_values = qualifiers['status'] if 'status' in qualifiers else None

        # An empty value list is handled like a missing status instead of
        # raising IndexError on the [0] access
        if not status_values or status_values[0].lower() == "needs review":
            self.errors.append(GeneError(GeneError.NEEDS_REVIEW, self))