Example #1
0
    def store_feature_annotation(self, feature: str, soterm: str, cvterm: str,
                                 annotation: str) -> None:
        """Store a single annotation (cvterm=annotation) on a feature.

        The annotation is serialized as a GFF-style attribute string and
        routed through FeatureAttributesLoader, so it is processed exactly
        like an attribute read from a genome file. Attributes/GO terms the
        loader could not handle are exposed on self afterwards.
        """
        target_feature_id = retrieve_feature_id(accession=feature,
                                                soterm=soterm)

        loader = FeatureAttributesLoader(filecontent="genome")
        parsed_attrs = loader.get_attributes("{}={};".format(cvterm,
                                                             annotation))
        loader.process_attributes(target_feature_id, parsed_attrs)

        # surface anything the loader skipped for the caller to inspect
        self.ignored_attrs = loader.ignored_attrs
        self.ignored_goterms = loader.ignored_goterms
    def test_process_attributes(self):
        """Tests - process attributes.

        Builds the minimal Cv/Cvterm/Db/Dbxref fixtures the loader needs,
        then runs get_attributes + process_attributes on a feature and
        asserts the resulting note, ontology term, dbxref, synonym, and
        ignored-GO-term bookkeeping.
        """
        test_organism = Organism.objects.create(genus="Mus",
                                                species="musculus")
        # creating test GO term (only GO:12345 exists; GO:54321 will be ignored)
        test_db = Db.objects.create(name="GO")
        test_dbxref = Dbxref.objects.create(accession="12345", db=test_db)
        test_cv = Cv.objects.create(name="biological_process")
        Cvterm.objects.create(
            name="go test term",
            cv=test_cv,
            dbxref=test_dbxref,
            is_obsolete=0,
            is_relationshiptype=0,
        )
        # creating test SO terms: gene, polypeptide, protein_match
        test_db = Db.objects.create(name="SO")
        test_dbxref = Dbxref.objects.create(accession="12345", db=test_db)
        test_cv = Cv.objects.create(name="sequence")
        Cvterm.objects.create(
            name="gene",
            cv=test_cv,
            dbxref=test_dbxref,
            is_obsolete=0,
            is_relationshiptype=0,
        )
        test_dbxref = Dbxref.objects.create(accession="123455", db=test_db)
        test_so_term = Cvterm.objects.create(
            name="polypeptide",
            cv=test_cv,
            dbxref=test_dbxref,
            is_obsolete=0,
            is_relationshiptype=0,
        )
        test_dbxref = Dbxref.objects.create(accession="1234555", db=test_db)
        Cvterm.objects.create(
            name="protein_match",
            cv=test_cv,
            dbxref=test_dbxref,
            is_obsolete=0,
            is_relationshiptype=0,
        )

        # creating test feature (typed 'polypeptide')
        test_feature = Feature.objects.create(
            organism=test_organism,
            uniquename="feat1",
            is_analysis=False,
            type_id=test_so_term.cvterm_id,
            is_obsolete=False,
            timeaccessioned=datetime.now(timezone.utc),
            timelastmodified=datetime.now(timezone.utc),
        )
        # creating 'exact' synonym-type term (needed to store aliases)
        test_db_global = Db.objects.create(name="_global")
        test_dbxref = Dbxref.objects.create(accession="exact",
                                            db=test_db_global)
        test_cv = Cv.objects.create(name="synonym_type")
        Cvterm.objects.create(
            name="exact",
            cv=test_cv,
            dbxref=test_dbxref,
            is_obsolete=0,
            is_relationshiptype=0,
        )
        # creating 'contained in' relationship term (used by FeatureLoader)
        test_db = Db.objects.create(name="RO")
        test_dbxref = Dbxref.objects.create(accession="00002", db=test_db)
        test_cv = Cv.objects.create(name="relationship")
        Cvterm.objects.create(
            name="contained in",
            cv=test_cv,
            dbxref=test_dbxref,
            is_obsolete=0,
            is_relationshiptype=0,
        )

        # new FeatureLoader
        FeatureLoader(filename="file.name", source="GFF_source")
        # running get_attributes
        test_attrs_file = FeatureAttributesLoader(filecontent="genome")
        test_attrs = test_attrs_file.get_attributes(
            "ID=1;name=feat1;note=Test feature;display=feat1;gene=gene1;"
            "orf_classification=1;ontology_term=GO:12345,GO:54321;parent=2;"
            "alias=Feature1;dbxref=GI:12345,NC:12345;noecziste=True")
        # running process_attributes
        test_attrs_file.process_attributes(feature_id=test_feature.feature_id,
                                           attrs=test_attrs)
        # fetching the feature_property cv
        cv_feature_property = Cv.objects.get(name="feature_property")
        # asserting note
        test_prop_cvterm = Cvterm.objects.get(name="note",
                                              cv=cv_feature_property)
        test_prop = Featureprop.objects.get(feature=test_feature,
                                            type_id=test_prop_cvterm.cvterm_id,
                                            rank=0)
        self.assertEqual("Test feature", test_prop.value)
        # asserting ontology_term
        test_feat_cvterm = FeatureCvterm.objects.get(feature=test_feature)
        test_cvterm = Cvterm.objects.get(cvterm_id=test_feat_cvterm.cvterm_id)
        self.assertEqual("go test term", test_cvterm.name)
        # asserting dbxref
        test_dbxref_ids = FeatureDbxref.objects.filter(
            feature=test_feature).values_list("dbxref_id", flat=True)
        test_db = Db.objects.get(name="GI")
        test_dbxref = Dbxref.objects.get(dbxref_id__in=test_dbxref_ids,
                                         db=test_db)
        self.assertEqual("12345", test_dbxref.accession)
        # asserting alias
        test_synonym = FeatureSynonym.objects.select_related("synonym").get(
            feature=test_feature)
        self.assertEqual("Feature1", test_synonym.synonym.name)
        # asserting ignored goterms: GO:54321 has no Cvterm loaded above,
        # so the loader must have skipped it
        self.assertEqual("GO:54321", test_attrs_file.ignored_goterms.pop())
 def test_get_attributes(self):
     """Tests - get attributes."""
     test_attrs_file = FeatureAttributesLoader(filecontent="genome")
     test_attrs = test_attrs_file.get_attributes("ID=1;name=feat1")
     self.assertEqual("1", test_attrs.get("id"))
     self.assertEqual("feat1", test_attrs.get("name"))
Example #4
0
    def store_tabix_GFF_feature(self, tabix_feature: GTFProxy, organism: str,
                                qtl: bool) -> None:
        """Store tabix feature.

        Registers one GFF record as a chado Feature together with its
        Dbxref, Featureloc, attribute-derived records, and queued parent
        relationships. For mRNA records an additional feature (typed by
        ``self.aa_cvterm`` — presumably a polypeptide term) is created and
        linked via a 'translation_of' relationship.

        Raises ImportingError when the feature type is not a sequence
        ontology term, the ID is already registered, or the reference
        sequence (FASTA) was not loaded beforehand.
        """
        organism_obj = retrieve_organism(organism)

        # QTL files get their attributes parsed with dedicated rules.
        filecontent = "qtl" if qtl else "genome"

        # Parse the attributes column and expose what the loader skipped.
        attrs_loader = FeatureAttributesLoader(filecontent=filecontent)
        attrs_dict = attrs_loader.get_attributes(tabix_feature.attributes)
        self.ignored_attrs = attrs_loader.ignored_attrs
        self.ignored_goterms = attrs_loader.ignored_goterms

        if qtl:
            # QTL records are always typed 'QTL'; the original GFF feature
            # column is preserved as the 'qtl_type' attribute instead.
            cvterm = Cvterm.objects.get(name="QTL", cv__name="sequence")
            attrs_dict["qtl_type"] = tabix_feature.feature
        else:
            try:
                cvterm = Cvterm.objects.get(name=tabix_feature.feature,
                                            cv__name="sequence")
            except ObjectDoesNotExist:
                raise ImportingError(
                    "{} is not a sequence ontology term.".format(
                        tabix_feature.feature))

        attrs_id = attrs_dict.get("id")
        attrs_name = attrs_dict.get("name")
        # 'parent' may hold a comma-separated list, or be absent entirely
        # (dict.get returns None -> AttributeError on .split).
        try:
            attrs_parent = attrs_dict.get("parent").split(",")
        except AttributeError:
            attrs_parent = list()

        # set id = auto# for features that lack it
        if attrs_id is None:
            attrs_id = "auto{}".format(str(time()))

        try:
            dbxref, created = Dbxref.objects.get_or_create(db=self.db,
                                                           accession=attrs_id)
            # Record which file this dbxref came from ('contained in').
            Dbxrefprop.objects.get_or_create(
                dbxref=dbxref,
                type_id=self.cvterm_contained_in.cvterm_id,
                value=self.filename,
                rank=0,
            )
            feature_id = Feature.objects.create(
                organism=organism_obj,
                uniquename=attrs_id,
                type_id=cvterm.cvterm_id,
                name=attrs_name,
                dbxref=dbxref,
                is_analysis=False,
                is_obsolete=False,
                timeaccessioned=datetime.now(timezone.utc),
                timelastmodified=datetime.now(timezone.utc),
            ).feature_id
        except IntegrityError as e:
            raise ImportingError("ID {} already registered. {}".format(
                attrs_id, e))

        # DOI: try to link feature to publication's DOI
        if feature_id and self.pub_dbxref_doi:
            try:
                FeaturePub.objects.get_or_create(
                    feature_id=feature_id, pub_id=self.pub_dbxref_doi.pub_id)
            except IntegrityError as e:
                raise ImportingError(e)

        # Locate the source sequence, loaded earlier from a FASTA file.
        srcdb = Db.objects.get(name="FASTA_SOURCE")
        srcdbxref = Dbxref.objects.get(accession=tabix_feature.contig,
                                       db=srcdb)
        srcfeature = Feature.objects.filter(dbxref=srcdbxref,
                                            organism=organism_obj).values_list(
                                                "feature_id", flat=True)
        if len(srcfeature) == 1:
            srcfeature_id = srcfeature.first()
        else:
            raise ImportingError(
                "Parent not found: {}. It's required to load "
                "a reference FASTA file before loading features.".format(
                    tabix_feature.contig))

        # the database requires -1, 0, and +1 for strand
        if tabix_feature.strand == "+":
            strand = +1
        elif tabix_feature.strand == "-":
            strand = -1
        else:
            strand = 0

        # if row.frame is . phase = None
        # some versions of pysam throws ValueError
        try:
            phase = tabix_feature.frame
            if tabix_feature.frame == ".":
                phase = None
        except ValueError:
            phase = None

        try:
            Featureloc.objects.get_or_create(
                feature_id=feature_id,
                srcfeature_id=srcfeature_id,
                fmin=tabix_feature.start,
                is_fmin_partial=False,
                fmax=tabix_feature.end,
                is_fmax_partial=False,
                strand=strand,
                phase=phase,
                locgroup=0,
                rank=0,
            )
        except IntegrityError as e:
            # Dump the conflicting location before aborting the import.
            print(
                attrs_id,
                srcdbxref,
                tabix_feature.start,
                tabix_feature.end,
                strand,
                phase,
            )
            raise ImportingError(e)

        # Process attrs_dict after the creation of the feature
        attrs_loader.process_attributes(feature_id, attrs_dict)

        # Parent/child links are queued here; presumably resolved later
        # from self.relationships — confirm against the caller.
        for parent in attrs_parent:
            self.relationships.append({
                "object_id": attrs_id,
                "subject_id": parent
            })

        # Additional protein record for each mRNA with the exact same ID
        if tabix_feature.feature == "mRNA":
            translation_of = Cvterm.objects.get(name="translation_of",
                                                cv__name="sequence")
            feature_mRNA_translation_id = Feature.objects.create(
                organism=organism_obj,
                uniquename=attrs_id,
                type_id=self.aa_cvterm.cvterm_id,
                name=attrs_name,
                dbxref=dbxref,
                is_analysis=False,
                is_obsolete=False,
                timeaccessioned=datetime.now(timezone.utc),
                timelastmodified=datetime.now(timezone.utc),
            ).feature_id
            FeatureRelationship.objects.create(
                object_id=feature_mRNA_translation_id,
                subject_id=feature_id,
                type=translation_of,
                rank=0,
            )
Example #5
0
    def store_tabix_VCF_feature(self, tabix_feature: VCFProxy,
                                organism: str) -> None:
        """Store tabix feature from VCF files.

        Registers one VCF record as a chado Feature, storing its quality
        score as a Featureprop and its alleles as Featureloc records:
        the reference allele at rank 0, alternative alleles from rank 1.

        Raises ImportingError when the variation type cannot be determined
        or is not a sequence ontology term, the ID is already registered,
        or the reference sequence (FASTA) was not loaded beforehand.
        """
        organism_obj = retrieve_organism(organism)

        # Parse the INFO column and expose what the loader skipped.
        attrs_loader = FeatureAttributesLoader(filecontent="polymorphism")
        attrs_dict = attrs_loader.get_attributes(tabix_feature.info)
        self.ignored_attrs = attrs_loader.ignored_attrs
        self.ignored_goterms = attrs_loader.ignored_goterms

        # The variation class comes from either the VC or the TSA INFO key.
        if attrs_dict.get("vc"):
            attrs_class = attrs_dict.get("vc")
        elif attrs_dict.get("tsa"):
            attrs_class = attrs_dict.get("tsa")
        else:
            raise ImportingError(
                "{}: Impossible to get the attribute which defines the type of variation (eg. TSA, VC)"
                .format(tabix_feature.id))

        try:
            cvterm = Cvterm.objects.get(name=attrs_class, cv__name="sequence")
        except ObjectDoesNotExist:
            raise ImportingError(
                "{} is not a sequence ontology term.".format(attrs_class))

        try:
            dbxref, created = Dbxref.objects.get_or_create(
                db=self.db, accession=tabix_feature.id)
            # Record which file this dbxref came from ('contained in').
            Dbxrefprop.objects.get_or_create(
                dbxref=dbxref,
                type_id=self.cvterm_contained_in.cvterm_id,
                value=self.filename,
                rank=0,
            )
            # Human-readable name, eg. 'A->T'.
            name = "{}->{}".format(tabix_feature.ref, tabix_feature.alt)
            feature_id = Feature.objects.create(
                organism=organism_obj,
                uniquename=tabix_feature.id,
                name=name,
                type_id=cvterm.cvterm_id,
                dbxref=dbxref,
                is_analysis=False,
                is_obsolete=False,
                timeaccessioned=datetime.now(timezone.utc),
                timelastmodified=datetime.now(timezone.utc),
            ).feature_id
        except IntegrityError as e:
            raise ImportingError("ID {} already registered. {}".format(
                tabix_feature.id, e))

        # '.' means the QUAL column is missing; only store real values.
        if tabix_feature.qual != ".":
            cvterm_qual = Cvterm.objects.get(name="quality_value",
                                             cv__name="sequence")
            featureprop_obj = Featureprop(
                feature_id=feature_id,
                type=cvterm_qual,
                value=tabix_feature.qual,
                rank=0,
            )
            featureprop_obj.save()

        # DOI: try to link feature to publication's DOI
        if feature_id and self.pub_dbxref_doi:
            try:
                FeaturePub.objects.get_or_create(
                    feature_id=feature_id, pub_id=self.pub_dbxref_doi.pub_id)
            except IntegrityError as e:
                raise ImportingError(e)

        # Locate the source sequence, loaded earlier from a FASTA file.
        srcdb = Db.objects.get(name="FASTA_SOURCE")
        srcdbxref = Dbxref.objects.get(accession=tabix_feature.contig,
                                       db=srcdb)
        srcfeature = Feature.objects.filter(dbxref=srcdbxref,
                                            organism=organism_obj).values_list(
                                                "feature_id", flat=True)
        if len(srcfeature) == 1:
            srcfeature_id = srcfeature.first()
        else:
            raise ImportingError(
                "Parent not found: {}. It's required to load "
                "a reference FASTA file before loading features.".format(
                    tabix_feature.contig))

        # Reference allele
        # NOTE(review): fmax = pos + 1 assumes a single-base variant —
        # confirm how multi-base refs/indels should be represented.
        try:
            Featureloc.objects.get_or_create(
                feature_id=feature_id,
                srcfeature_id=srcfeature_id,
                fmin=tabix_feature.pos,
                is_fmin_partial=False,
                fmax=tabix_feature.pos + 1,
                is_fmax_partial=False,
                residue_info=tabix_feature.ref,
                locgroup=0,
                rank=0,
            )
        except IntegrityError as e:
            print(tabix_feature.id, srcdbxref, tabix_feature.pos)
            raise ImportingError(e)

        # Alternative alleles (VCF ALT is comma-separated), ranks 1..n
        # NOTE(review): unlike the reference allele, no srcfeature_id is
        # set here — confirm this is intentional.
        rank = 1
        for allele in tabix_feature.alt.split(","):
            try:
                Featureloc.objects.get_or_create(
                    feature_id=feature_id,
                    fmin=tabix_feature.pos,
                    is_fmin_partial=False,
                    fmax=tabix_feature.pos + 1,
                    is_fmax_partial=False,
                    residue_info=allele,
                    locgroup=0,
                    rank=rank,
                )
            except IntegrityError as e:
                print(tabix_feature.id, srcdbxref, tabix_feature.pos)
                raise ImportingError(e)
            rank += 1