def handle(self, file: str, cvterm: str, soterm: str, doi: str = None,
           verbosity: int = 1, cpu: int = 1, **options):
    """Execute the main function."""
    if verbosity > 0:
        self.stdout.write("Preprocessing")

    try:
        FileValidator().validate(file)
    except ImportingError as e:
        raise CommandError(e)

    # retrieve only the file name
    filename = os.path.basename(file)
    try:
        feature_file = FeatureLoader(filename=filename, source="GFF_source")
    except ImportingError as e:
        raise CommandError(e)

    pool = ThreadPoolExecutor(max_workers=cpu)
    tasks = list()

    # Load the annotation file
    with open(file) as tab_file:
        for line in tab_file:
            if line.startswith("#"):
                continue
            feature, annotation = line.strip().split("\t")
            tasks.append(
                pool.submit(
                    feature_file.store_feature_annotation,
                    feature,
                    soterm,
                    cvterm,
                    annotation,
                    doi,
                )
            )

    if verbosity > 0:
        self.stdout.write("Loading feature annotations")
    for task in tqdm(as_completed(tasks), total=len(tasks)):
        try:
            task.result()
        except ImportingError as e:
            raise CommandError(e)
    pool.shutdown()

    if verbosity > 0:
        self.stdout.write(self.style.SUCCESS("Done"))
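# Usage sketch for the annotation loader above (an assumption, not part of
# this file): handle() is the entry point of a Django management command, so
# it can be driven with django.core.management.call_command. The command name
# "load_feature_annotation" is hypothetical; the keyword arguments mirror the
# handle() signature. The input is a headless, tab-separated file with one
# "feature<TAB>annotation" pair per line; lines starting with "#" are skipped.
def example_load_feature_annotation():
    """Illustrative only; the command name below is an assumption."""
    from django.core.management import call_command

    call_command(
        "load_feature_annotation",  # hypothetical command name
        file="feature_annotations.tab",
        cvterm="note",
        soterm="mRNA",
        doi=None,
        cpu=4,
        verbosity=1,
    )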
def handle(self, file: str, format: str, cpu: int = 1, verbosity: int = 1,
           **options):
    """Execute the main function."""
    try:
        FileValidator().validate(file)
    except ImportingError as e:
        raise CommandError(e)

    if format == "blast-xml":
        source = "BLAST_source"
    elif format == "interproscan-xml":
        source = "InterproScan_source"
    else:
        raise CommandError(
            "Allowed format options are blast-xml or "
            "interproscan-xml only, not {}".format(format)
        )

    # retrieve only the file name
    filename = os.path.basename(file)
    try:
        feature_file = FeatureLoader(filename=filename, source=source)
    except ImportingError as e:
        raise CommandError(e)

    if verbosity > 0:
        self.stdout.write("Processing file: {}".format(filename))

    try:
        records = SearchIO.parse(file, format)
    except ValueError as e:
        raise CommandError(e)

    pool = ThreadPoolExecutor(max_workers=cpu)
    tasks = list()
    for record in records:
        for hit in record.hits:
            tasks.append(
                pool.submit(
                    feature_file.store_bio_searchio_hit, hit, record.target
                )
            )

    if verbosity > 0:
        self.stdout.write("Loading")
    for task in tqdm(as_completed(tasks), total=len(tasks)):
        try:
            task.result()
        except ImportingError as e:
            raise CommandError(e)
    pool.shutdown()

    if len(feature_file.ignored_goterms) > 0:
        self.stdout.write(
            self.style.WARNING(
                "Ignored GO terms: {}".format(feature_file.ignored_goterms)
            )
        )
    if verbosity > 0:
        self.stdout.write(self.style.SUCCESS("Done with {}".format(filename)))
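# The loop above walks Bio.SearchIO query results and submits one task per
# hit. A minimal standalone sketch of the same traversal, handy for checking
# a report before loading it (the file name is illustrative):
def example_inspect_blast_xml(path="blast_result.xml"):
    """Print the number of hits per query in a BLAST XML report."""
    from Bio import SearchIO

    for record in SearchIO.parse(path, "blast-xml"):
        # record.hits is the list handed to store_bio_searchio_hit above
        print(record.id, len(record.hits))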
def test_get_attributes(self):
    """Tests - get attributes."""
    test_db = Db.objects.create(name="SO")
    test_dbxref = Dbxref.objects.create(accession="12345", db=test_db)
    test_cv = Cv.objects.create(name="sequence")
    Cvterm.objects.create(
        name="polypeptide",
        cv=test_cv,
        dbxref=test_dbxref,
        is_obsolete=0,
        is_relationshiptype=0,
    )
    test_dbxref = Dbxref.objects.create(accession="123455", db=test_db)
    Cvterm.objects.create(
        name="protein_match",
        cv=test_cv,
        dbxref=test_dbxref,
        is_obsolete=0,
        is_relationshiptype=0,
    )
    test_db = Db.objects.create(name="RO")
    test_dbxref = Dbxref.objects.create(accession="00002", db=test_db)
    test_cv = Cv.objects.create(name="relationship")
    Cvterm.objects.create(
        name="contained in",
        cv=test_cv,
        dbxref=test_dbxref,
        is_obsolete=0,
        is_relationshiptype=0,
    )
    Organism.objects.create(genus="Mus", species="musculus")
    test_feature_file = FeatureLoader(filename="file.name",
                                      source="GFF_loader")
    test_attrs = test_feature_file.get_attributes("ID=1;name=feat1")
    self.assertEqual("1", test_attrs.get("id"))
    self.assertEqual("feat1", test_attrs.get("name"))
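# The test above only exercises get_attributes indirectly. A minimal sketch
# of what such a GFF column-9 parser can look like, assuming the same
# observable behaviour the test asserts (keys lowercased, values kept as-is);
# this is not the FeatureLoader implementation itself:
def example_get_attributes(attributes: str) -> dict:
    """Parse 'ID=1;name=feat1' into {'id': '1', 'name': 'feat1'}."""
    result = {}
    for field in attributes.strip(";").split(";"):
        key, _, value = field.partition("=")
        result[key.strip().lower()] = value.strip()
    return result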
def test_load_coexpression_pairs(self): """Run tests of load_coexpression_pairs.""" """Load 'pcc.mcl.txt' output result file from LSTrAP. The 'pcc.mcl.txt' file is headless and have the format as follows: AT2G44195.1.TAIR10 AT1G30080.1.TAIR10 0.18189286870895194 AT2G44195.1.TAIR10 AT5G24750.1.TAIR10 0.1715779378273995 ... and so on. The value of the third column is a Pearson correlation coefficient subtracted from 0.7 (PCC - 0.7). To obtain the original PCC value, it must be added 0.7 to every value of the third column.""" # register multispecies organism test_organism, created = Organism.objects.get_or_create( abbreviation="multispecies", genus="multispecies", species="multispecies", common_name="multispecies", ) # creating test SO term test_db = Db.objects.create(name="SO") test_cv = Cv.objects.create(name="sequence") # creating test RO term test_db2 = Db.objects.create(name="RO") test_cv2 = Cv.objects.create(name="relationship") # test_dbxref = Dbxref.objects.create(accession='123456', db=test_db) test_dbxref2 = Dbxref.objects.create(accession="789", db=test_db2) test_dbxref3 = Dbxref.objects.create(accession="135", db=test_db) test_dbxref4 = Dbxref.objects.create(accession="246", db=test_db2) test_dbxref6 = Dbxref.objects.create(accession="357", db=test_db) test_dbxref7 = Dbxref.objects.create(accession="468", db=test_db) Cvterm.objects.create( name="mRNA", cv=test_cv, dbxref=test_dbxref3, is_obsolete=0, is_relationshiptype=0, ) # Cvterm.objects.create( # name='polypeptide', cv=test_cv, dbxref=test_dbxref, # is_obsolete=0, is_relationshiptype=0) # register features. cvterm_contained_in = Cvterm.objects.create( name="contained in", cv=test_cv2, dbxref=test_dbxref2, is_obsolete=0, is_relationshiptype=1, ) term = Cvterm.objects.create( name="correlated with", cv=test_cv2, dbxref=test_dbxref4, is_obsolete=0, is_relationshiptype=1, ) test_term = Cvterm.objects.create( name="polypeptide", cv=test_cv, dbxref=test_dbxref6, is_obsolete=0, is_relationshiptype=0, ) Cvterm.objects.create( name="protein_match", cv=test_cv, dbxref=test_dbxref7, is_obsolete=0, is_relationshiptype=0, ) db = Db.objects.create(name="FASTA_SOURCE") # creating test features test_featurename1 = "AT2G44195.1.TAIR10" dbxref1 = Dbxref.objects.create(db=db, accession=test_featurename1) test_feature1 = Feature.objects.create( organism=test_organism, dbxref=dbxref1, uniquename=test_featurename1, is_analysis=False, type_id=test_term.cvterm_id, is_obsolete=False, timeaccessioned=datetime.now(), timelastmodified=datetime.now(), ) test_featurename2 = "AT1G30080.1.TAIR10" dbxref2 = Dbxref.objects.create(db=db, accession=test_featurename2) test_feature2 = Feature.objects.create( organism=test_organism, dbxref=dbxref2, uniquename=test_featurename2, is_analysis=False, type_id=test_term.cvterm_id, is_obsolete=False, timeaccessioned=datetime.now(), timelastmodified=datetime.now(), ) test_featurename3 = "AT5G24750.1.TAIR10" dbxref3 = Dbxref.objects.create(db=db, accession=test_featurename3) test_feature3 = Feature.objects.create( dbxref=dbxref3, organism=test_organism, uniquename=test_featurename3, is_analysis=False, type_id=test_term.cvterm_id, is_obsolete=False, timeaccessioned=datetime.now(), timelastmodified=datetime.now(), ) test_pair1 = [test_featurename1, test_featurename2] test_pair2 = [test_featurename1, test_featurename3] test_value1 = 0.1818928687089519 test_value2 = 0.1715779378273995 test_pcc_value1 = str(test_value1 + 0.7) test_pcc_value2 = str(test_value2 + 0.7) # dummy coexpression variables test_filename = 
"pcc.mcl.dummy.txt" source = "null" soterm = "polypeptide" test_coexpression_loader = FeatureLoader(source=source, filename=test_filename) test_coexpression_loader.store_feature_pairs(pair=test_pair1, soterm=soterm, term=term, value=test_pcc_value1) test_coexpression_loader.store_feature_pairs(pair=test_pair2, soterm=soterm, term=term, value=test_pcc_value2) # start checking self.assertTrue( FeatureRelationship.objects.filter( subject_id=test_feature1.feature_id, object_id=test_feature2.feature_id, value=test_pcc_value1, ).exists()) self.assertTrue( FeatureRelationship.objects.filter( subject_id=test_feature1.feature_id, object_id=test_feature3.feature_id, value=test_pcc_value2, ).exists()) fr1 = FeatureRelationship.objects.get( subject_id=test_feature1.feature_id, object_id=test_feature2.feature_id, value=test_pcc_value1, ) fr2 = FeatureRelationship.objects.get( subject_id=test_feature1.feature_id, object_id=test_feature3.feature_id, value=test_pcc_value2, ) self.assertTrue( FeatureRelationshipprop.objects.filter( feature_relationship=fr1, type_id=cvterm_contained_in.cvterm_id, value=test_filename, ).exists()) self.assertTrue( FeatureRelationshipprop.objects.filter( feature_relationship=fr2, type_id=cvterm_contained_in.cvterm_id, value=test_filename, ).exists())
def test_load_coexpression_clusters(self): """Run tests of load_coexpression_pairs.""" """Load 'mcl.clusters.txt' output result file from LSTrAP. The 'mcl.clusters.txt' is a tab separated, headless file and have the format as follows (each line is a cluster): ath_coexpr_mcl_1: AT3G18715.1.TAIR10 AT3G08790.1.TAIR10 AT5G42230.1.TAIR10 ath_coexpr_mcl_1: AT1G27040.1.TAIR10 AT1G71692.1.TAIR10 ath_coexpr_mcl_1: AT5G24750.1.TAIR10 ... and so on. The features need to be loaded previously or won't be registered.""" # register multispecies organism test_organism = Organism.objects.create( abbreviation="multispecies", genus="multispecies", species="multispecies", common_name="multispecies", ) # creating test SO term test_db = Db.objects.create(name="SO") test_cv = Cv.objects.create(name="sequence") # creating test RO term test_db2 = Db.objects.create(name="RO") test_cv2 = Cv.objects.create(name="relationship") test_cv3 = Cv.objects.create(name="feature_property") # test_dbxref = Dbxref.objects.create(accession='123456', db=test_db) test_dbxref2 = Dbxref.objects.create(accession="028", db=test_db) test_dbxref3 = Dbxref.objects.create(accession="135", db=test_db) test_dbxref4 = Dbxref.objects.create(accession="246", db=test_db2) test_dbxref5 = Dbxref.objects.create(accession="579", db=test_db2) test_dbxref6 = Dbxref.objects.create(accession="357", db=test_db) test_dbxref7 = Dbxref.objects.create(accession="468", db=test_db) Cvterm.objects.create( name="mRNA", cv=test_cv, dbxref=test_dbxref3, is_obsolete=0, is_relationshiptype=0, ) # Cvterm.objects.create( # name='polypeptide', cv=test_cv, dbxref=test_dbxref, # is_obsolete=0, is_relationshiptype=0) # register features. Cvterm.objects.create( name="contained in", cv=test_cv2, dbxref=test_dbxref2, is_obsolete=0, is_relationshiptype=1, ) Cvterm.objects.create( name="correlated with", cv=test_cv2, dbxref=test_dbxref4, is_obsolete=0, is_relationshiptype=1, ) term = Cvterm.objects.create( name="coexpression group", cv=test_cv3, dbxref=test_dbxref5, is_obsolete=0, is_relationshiptype=0, ) test_term = Cvterm.objects.create( name="polypeptide", cv=test_cv, dbxref=test_dbxref6, is_obsolete=0, is_relationshiptype=0, ) Cvterm.objects.create( name="protein_match", cv=test_cv, dbxref=test_dbxref7, is_obsolete=0, is_relationshiptype=0, ) db = Db.objects.create(name="FASTA_SOURCE") test_featurename1 = "AT3G18715.1.TAIR10" dbxref1 = Dbxref.objects.create(db=db, accession=test_featurename1) test_feature1 = Feature.objects.create( dbxref=dbxref1, organism=test_organism, uniquename=test_featurename1, is_analysis=False, type_id=test_term.cvterm_id, is_obsolete=False, timeaccessioned=datetime.now(), timelastmodified=datetime.now(), ) test_featurename2 = "AT3G08790.1.TAIR10" dbxref2 = Dbxref.objects.create(db=db, accession=test_featurename2) test_feature2 = Feature.objects.create( dbxref=dbxref2, organism=test_organism, uniquename=test_featurename2, is_analysis=False, type_id=test_term.cvterm_id, is_obsolete=False, timeaccessioned=datetime.now(), timelastmodified=datetime.now(), ) test_featurename3 = "AT5G42230.1.TAIR10" dbxref3 = Dbxref.objects.create(db=db, accession=test_featurename3) test_feature3 = Feature.objects.create( dbxref=dbxref3, organism=test_organism, uniquename=test_featurename3, is_analysis=False, type_id=test_term.cvterm_id, is_obsolete=False, timeaccessioned=datetime.now(), timelastmodified=datetime.now(), ) test_featurename4 = "AT1G27040.1.TAIR10" dbxref4 = Dbxref.objects.create(db=db, accession=test_featurename4) test_feature4 = Feature.objects.create( 
dbxref=dbxref4, organism=test_organism, uniquename=test_featurename4, is_analysis=False, type_id=test_term.cvterm_id, is_obsolete=False, timeaccessioned=datetime.now(), timelastmodified=datetime.now(), ) test_featurename5 = "AT1G71692.1.TAIR10" dbxref5 = Dbxref.objects.create(db=db, accession=test_featurename5) test_feature5 = Feature.objects.create( dbxref=dbxref5, organism=test_organism, uniquename=test_featurename5, is_analysis=False, type_id=test_term.cvterm_id, is_obsolete=False, timeaccessioned=datetime.now(), timelastmodified=datetime.now(), ) test_featurename6 = "AT5G24750.1.TAIR10" dbxref6 = Dbxref.objects.create(db=db, accession=test_featurename6) test_feature6 = Feature.objects.create( dbxref=dbxref6, organism=test_organism, uniquename=test_featurename6, is_analysis=False, type_id=test_term.cvterm_id, is_obsolete=False, timeaccessioned=datetime.now(), timelastmodified=datetime.now(), ) # clusters setup test_cluster1_name = "ath_coexpr_mcl_1" test_cluster1 = [ test_featurename1, test_featurename2, test_featurename3 ] test_cluster2_name = "ath_coexpr_mcl_2" test_cluster2 = [test_featurename4, test_featurename5] test_cluster3_name = "ath_coexpr_mcl_3" test_cluster3 = [test_featurename6] test_filename = "mcl.clusters.dummy.txt" source = "null" test_coexpression_loader = FeatureLoader(source=source, filename=test_filename) soterm = "polypeptide" test_coexpression_loader.store_feature_groups(group=test_cluster1, soterm=soterm, term=term, value=test_cluster1_name) test_coexpression_loader.store_feature_groups(group=test_cluster2, soterm=soterm, term=term, value=test_cluster2_name) test_coexpression_loader.store_feature_groups(group=test_cluster3, soterm=soterm, term=term, value=test_cluster3_name) # check entire cluster1 relationships (not in reverse) self.assertTrue( Featureprop.objects.filter(feature_id=test_feature1.feature_id, type=term, value=test_cluster1_name).exists()) self.assertTrue( Featureprop.objects.filter(feature_id=test_feature3.feature_id, type=term, value=test_cluster1_name).exists()) self.assertTrue( Featureprop.objects.filter(feature_id=test_feature2.feature_id, type=term, value=test_cluster1_name).exists()) # check cluster2 relationships self.assertTrue( Featureprop.objects.filter(feature_id=test_feature5.feature_id, type=term, value=test_cluster2_name).exists()) self.assertTrue( Featureprop.objects.filter(feature_id=test_feature4.feature_id, type=term, value=test_cluster2_name).exists()) self.assertFalse( Featureprop.objects.filter(feature_id=test_feature6.feature_id, type=term, value=test_cluster3_name).exists())
def handle(
    self,
    file: str,
    organism: str,
    doi: str = None,
    cpu: int = 1,
    verbosity: int = 1,
    **options
):
    """Execute the main function."""
    # retrieve only the file name
    filename = os.path.basename(file)
    if verbosity > 0:
        self.stdout.write("Processing file: {}".format(filename))

    try:
        FileValidator().validate(file)
    except ImportingError as e:
        raise CommandError(e)

    try:
        index_file = "{}.tbi".format(file)
        FileValidator().validate(index_file)
    except ImportingError:
        try:
            index_file = "{}.csi".format(file)
            FileValidator().validate(index_file)
        except ImportingError:
            raise CommandError("No index found (.tbi/.csi)")

    try:
        feature_file = FeatureLoader(
            filename=filename, source="VCF_SOURCE", doi=doi
        )
    except ImportingError as e:
        raise CommandError(e)

    pool = ThreadPoolExecutor(max_workers=cpu)
    tasks = list()
    chunk_size = cpu * 2

    # Load the VCF file
    with open(file) as tbx_file:
        tbx = pysam.TabixFile(filename=tbx_file.name, index=index_file)
        for row in tqdm(
            tbx.fetch(parser=pysam.asVCF()), total=get_num_lines(file)
        ):
            tasks.append(
                pool.submit(feature_file.store_tabix_VCF_feature, row, organism)
            )
            if len(tasks) >= chunk_size:
                for task in as_completed(tasks):
                    try:
                        task.result()
                    except ImportingError as e:
                        raise CommandError(e)
                tasks.clear()
        else:
            for task in as_completed(tasks):
                try:
                    task.result()
                except ImportingError as e:
                    raise CommandError(e)
            tasks.clear()
    pool.shutdown()

    if verbosity > 0:
        self.stdout.write(self.style.SUCCESS("Done with {}".format(filename)))
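# The command above expects a bgzip-compressed VCF with a .tbi (or .csi)
# index next to it. A sketch of preparing such a file with pysam before
# running the loader (the file name is illustrative; tabix_compress and
# tabix_index are standard pysam helpers and the input must be sorted):
def example_prepare_vcf(path="variants.vcf"):
    """Compress a sorted VCF with bgzip and build a tabix index."""
    import pysam

    compressed = path + ".gz"
    pysam.tabix_compress(path, compressed, force=True)
    # creates variants.vcf.gz.tbi; the loader is then pointed at the .gz file
    pysam.tabix_index(compressed, preset="vcf", force=True)
    return compressed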
def handle(self, file: str, cpu: int = 1, verbosity: int = 0, **options):
    """Execute the main function."""
    try:
        FileValidator().validate(file)
    except ImportingError as e:
        raise CommandError(e)

    # retrieve only the file name
    filename = os.path.basename(file)
    if verbosity > 0:
        self.stdout.write("Processing file: {}".format(filename))

    try:
        groups = open(file, "r")
    except ImportingError as e:
        raise CommandError(e)

    pool = ThreadPoolExecutor(max_workers=cpu)
    tasks = list()

    cv, created = Cv.objects.get_or_create(name="feature_property")
    ortho_db, created = Db.objects.get_or_create(name="ORTHOMCL_SOURCE")
    ortho_dbxref, created = Dbxref.objects.get_or_create(
        accession="ORTHOMCL_SOURCE", db=ortho_db
    )
    cvterm_cluster, created = Cvterm.objects.get_or_create(
        name="orthologous group",
        cv=cv,
        dbxref=ortho_dbxref,
        is_obsolete=0,
        is_relationshiptype=0,
    )
    # hardcoded as OrthoMCL uses protein input
    soterm = "polypeptide"
    source = "null"
    featureloader = FeatureLoader(source=source, filename=filename)

    # each line is an orthologous group
    for line in groups:
        members = []
        name = ""
        fields = re.split(r"\s+", line.strip())
        if re.search(r"^(\w+)\:", fields[0]):
            group_field = re.match(r"^(\w+)\:", fields[0])
            name = group_field.group(1)
            fields.pop(0)
            for field in fields:
                if re.search(r"^(\w+)\|(\S+)", field):
                    member_field = re.match(r"^(\w+)\|(\S+)", field)
                    ident = member_field.group(2)
                    members.append(ident)
        else:
            raise CommandError("Cluster has no identification, check.")
        # only orthologous groups with 2 or more members allowed
        if len(members) > 1:
            tasks.append(
                pool.submit(
                    featureloader.store_feature_groups,
                    soterm=soterm,
                    group=members,
                    term=cvterm_cluster.cvterm_id,
                    value=name,
                )
            )

    if verbosity > 0:
        self.stdout.write("Loading")
    for task in tqdm(as_completed(tasks), total=len(tasks)):
        if task.result():
            raise (task.result())
    pool.shutdown()

    if verbosity > 0:
        self.stdout.write(self.style.SUCCESS("Done with {}".format(filename)))
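# Each groups file line handled above has the form
# "machado0001: Aco|Aqcoe0131s0001.1.v3.1 Brd|Bradi2g20400.1.v3.1 ...":
# the group name before the colon, then members given as "taxon|identifier",
# of which only the identifier is kept. A standalone sketch of that parsing
# logic (the example line is illustrative):
def example_parse_orthomcl_line(line: str):
    """Return (group_name, [member identifiers]) for one groups.txt line."""
    import re

    fields = re.split(r"\s+", line.strip())
    group_field = re.match(r"^(\w+)\:", fields.pop(0))
    if group_field is None:
        raise ValueError("Cluster has no identification, check.")
    members = []
    for field in fields:
        member_field = re.match(r"^(\w+)\|(\S+)", field)
        if member_field is not None:
            members.append(member_field.group(2))
    return group_field.group(1), members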
def test_store_feature_dbxref(self):
    """Tests - store feature dbxref."""
    # creating exact term
    test_db_global = Db.objects.create(name="_global")
    Dbxref.objects.create(accession="exact", db=test_db_global)
    test_db = Db.objects.create(name="RO")
    test_dbxref = Dbxref.objects.create(accession="00002", db=test_db)
    test_cv = Cv.objects.create(name="relationship")
    Cvterm.objects.create(
        name="contained in",
        cv=test_cv,
        dbxref=test_dbxref,
        is_obsolete=0,
        is_relationshiptype=0,
    )
    test_db = Db.objects.create(name="SO")
    test_dbxref = Dbxref.objects.create(accession="12345", db=test_db)
    test_cv = Cv.objects.create(name="sequence")
    test_so_term = Cvterm.objects.create(
        name="polypeptide",
        cv=test_cv,
        dbxref=test_dbxref,
        is_obsolete=0,
        is_relationshiptype=0,
    )
    test_dbxref = Dbxref.objects.create(accession="123455", db=test_db)
    Cvterm.objects.create(
        name="protein_match",
        cv=test_cv,
        dbxref=test_dbxref,
        is_obsolete=0,
        is_relationshiptype=0,
    )
    test_organism = Organism.objects.create(genus="Mus", species="musculus")
    test_db = Db.objects.create(name="GFF_SOURCE")
    test_dbxref = Dbxref.objects.create(accession="feat2", db=test_db)
    test_feature = Feature.objects.create(
        organism=test_organism,
        uniquename="feat2",
        dbxref=test_dbxref,
        is_analysis=False,
        type_id=test_so_term.cvterm_id,
        is_obsolete=False,
        timeaccessioned=datetime.now(timezone.utc),
        timelastmodified=datetime.now(timezone.utc),
    )
    test_feature_file = FeatureLoader(filename="file.name",
                                      source="GFF_loader")
    # store the feature dbxref
    test_feature_file.store_feature_dbxref(
        feature="feat2", soterm="polypeptide", dbxref="GI:12345"
    )
    test_featuredbxref = FeatureDbxref.objects.get(feature=test_feature)
    self.assertEqual("GI", test_featuredbxref.dbxref.db.name)
    self.assertEqual("12345", test_featuredbxref.dbxref.accession)
def test_store_feature_annotation(self): """Tests - store feature annotation.""" # creating exact term test_db_global = Db.objects.create(name="_global") test_dbxref = Dbxref.objects.create(accession="exact", db=test_db_global) test_cv = Cv.objects.create(name="synonym_type") Cvterm.objects.create( name="exact", cv=test_cv, dbxref=test_dbxref, is_obsolete=0, is_relationshiptype=0, ) test_db = Db.objects.create(name="RO") test_dbxref = Dbxref.objects.create(accession="00002", db=test_db) test_cv = Cv.objects.create(name="relationship") Cvterm.objects.create( name="contained in", cv=test_cv, dbxref=test_dbxref, is_obsolete=0, is_relationshiptype=0, ) test_db = Db.objects.create(name="SO") test_dbxref = Dbxref.objects.create(accession="12345", db=test_db) test_cv = Cv.objects.create(name="sequence") test_so_term = Cvterm.objects.create( name="polypeptide", cv=test_cv, dbxref=test_dbxref, is_obsolete=0, is_relationshiptype=0, ) test_dbxref = Dbxref.objects.create(accession="123455", db=test_db) Cvterm.objects.create( name="protein_match", cv=test_cv, dbxref=test_dbxref, is_obsolete=0, is_relationshiptype=0, ) test_db = Db.objects.create(name="GO") test_dbxref = Dbxref.objects.create(accession="12345", db=test_db) test_cv = Cv.objects.create(name="biological_process") Cvterm.objects.create( name="go test term", cv=test_cv, dbxref=test_dbxref, is_obsolete=0, is_relationshiptype=0, ) test_organism = Organism.objects.create(genus="Mus", species="musculus") test_db = Db.objects.create(name="GFF_SOURCE") test_dbxref = Dbxref.objects.create(accession="feat2", db=test_db) test_feature = Feature.objects.create( organism=test_organism, uniquename="feat2", dbxref=test_dbxref, is_analysis=False, type_id=test_so_term.cvterm_id, is_obsolete=False, timeaccessioned=datetime.now(timezone.utc), timelastmodified=datetime.now(timezone.utc), ) test_feature_file = FeatureLoader(filename="file.name", source="GFF_loader") # store the feature annotation test_feature_file.store_feature_annotation( feature="feat2", soterm="polypeptide", cvterm="display", annotation="feature one", doi=None, ) test_featureprop = Featureprop.objects.get(feature=test_feature) self.assertEqual("feature one", test_featureprop.value) # replace the feature annotation test_feature_file.store_feature_annotation( feature="feat2", soterm="polypeptide", cvterm="display", annotation="feature new", doi=None, ) test_featureprop = Featureprop.objects.get(feature=test_feature) self.assertEqual("feature new", test_featureprop.value) # store the ontology_term test_feature_file.store_feature_annotation( feature="feat2", soterm="polypeptide", cvterm="ontology_term", annotation="GO:12345", doi=None, ) test_cvterm = Cvterm.objects.get(name="go test term") test_feature_cvterm = FeatureCvterm.objects.get(feature=test_feature, cvterm=test_cvterm) self.assertIsNotNone(test_feature_cvterm) # store the dbxref test_feature_file.store_feature_annotation( feature="feat2", soterm="polypeptide", cvterm="dbxref", annotation="GEO:123456", doi=None, ) test_db = Db.objects.get(name="GEO") test_dbxref = Dbxref.objects.get(db=test_db, accession="123456") test_feature_dbxref = FeatureDbxref.objects.get(feature=test_feature, dbxref=test_dbxref) self.assertIsNotNone(test_feature_dbxref)
def test_store_bio_searchio_hit(self): """Tests - store bio searchio hit.""" # create RO term: contained in test_db = Db.objects.create(name="RO") test_dbxref = Dbxref.objects.create(accession="00002", db=test_db) test_cv = Cv.objects.create(name="relationship") Cvterm.objects.create( name="contained in", cv=test_cv, dbxref=test_dbxref, is_obsolete=0, is_relationshiptype=0, ) # create SO terms: protein_match test_cv = Cv.objects.create(name="sequence") test_db = Db.objects.create(name="SO") test_dbxref = Dbxref.objects.create(accession="00001", db=test_db) Cvterm.objects.create( name="protein_match", cv=test_cv, dbxref=test_dbxref, is_obsolete=0, is_relationshiptype=0, ) test_dbxref = Dbxref.objects.create(accession="00002", db=test_db) Cvterm.objects.create( name="polypeptide", cv=test_cv, dbxref=test_dbxref, is_obsolete=0, is_relationshiptype=0, ) # create GO term test_db = Db.objects.create(name="GO") test_dbxref = Dbxref.objects.create(accession="1234", db=test_db) test_cv = Cv.objects.create(name="biological_process") Cvterm.objects.create( name="GO:1234", cv=test_cv, dbxref=test_dbxref, is_obsolete=0, is_relationshiptype=0, ) # create a bio searchio hit test_searchio_hit = Hit() test_searchio_hit.id = "PF1234" test_searchio_hit.accession = "PFAM mock domain" test_searchio_hit.attributes["Target"] = "PFAM" test_searchio_hit.dbxrefs = [ "GO:1234", "IPR:IPR012345", "Reactome:R-HSA-12345" ] Organism.objects.create(genus="test", species="organism") # instantiate the loader test_feature_file = FeatureLoader(filename="file.name", source="InterproScan_source") # store the bio searchio hit # From interproscan target = "InterPro" test_feature_file.store_bio_searchio_hit(test_searchio_hit, target) test_feature = Feature.objects.get(uniquename="PF1234") self.assertEqual("PFAM mock domain", test_feature.name) test_dbxref = Dbxref.objects.get(accession="IPR012345") test_feature_dbxref = FeatureDbxref.objects.get(feature=test_feature, dbxref=test_dbxref) self.assertEqual(True, test_feature_dbxref.is_current) test_cvterm = Cvterm.objects.get(name="GO:1234") test_feature_cvterm = FeatureCvterm.objects.get(feature=test_feature, cvterm=test_cvterm) self.assertEqual(0, test_feature_cvterm.rank)
def test_store_tabix_GFF_feature(self): """Tests - store tabix feature / store relationships.""" # creating exact term test_db_global = Db.objects.create(name="_global") test_dbxref = Dbxref.objects.create(accession="exact", db=test_db_global) test_cv = Cv.objects.create(name="synonym_type") Cvterm.objects.create( name="exact", cv=test_cv, dbxref=test_dbxref, is_obsolete=0, is_relationshiptype=0, ) # creating part_of term test_dbxref = Dbxref.objects.create(accession="part_of", db=test_db_global) test_cv = Cv.objects.create(name="sequence") Cvterm.objects.create( name="part_of", cv=test_cv, dbxref=test_dbxref, is_obsolete=0, is_relationshiptype=0, ) # create SO terms: assembly, gene, and exon test_db = Db.objects.create(name="SO") test_dbxref = Dbxref.objects.create(accession="00001", db=test_db) test_cvterm_assembly = Cvterm.objects.create( name="assembly", cv=test_cv, dbxref=test_dbxref, is_obsolete=0, is_relationshiptype=0, ) test_dbxref = Dbxref.objects.create(accession="00002", db=test_db) Cvterm.objects.create( name="gene", cv=test_cv, dbxref=test_dbxref, is_obsolete=0, is_relationshiptype=0, ) test_dbxref = Dbxref.objects.create(accession="00003", db=test_db) Cvterm.objects.create( name="exon", cv=test_cv, dbxref=test_dbxref, is_obsolete=0, is_relationshiptype=0, ) test_dbxref = Dbxref.objects.create(accession="00004", db=test_db) Cvterm.objects.create( name="polypeptide", cv=test_cv, dbxref=test_dbxref, is_obsolete=0, is_relationshiptype=0, ) test_dbxref = Dbxref.objects.create(accession="00005", db=test_db) Cvterm.objects.create( name="protein_match", cv=test_cv, dbxref=test_dbxref, is_obsolete=0, is_relationshiptype=0, ) # create RO term: contained in test_db = Db.objects.create(name="RO") test_dbxref = Dbxref.objects.create(accession="00002", db=test_db) test_cv = Cv.objects.create(name="relationship") Cvterm.objects.create( name="contained in", cv=test_cv, dbxref=test_dbxref, is_obsolete=0, is_relationshiptype=0, ) # create an organism test_organism = Organism.objects.create(genus="Mus", species="musculus") # create a srcfeature test_db = Db.objects.create(name="FASTA_SOURCE") test_dbxref = Dbxref.objects.create(accession="contig1", db=test_db) feature = Feature.objects.create( dbxref=test_dbxref, organism=test_organism, name="contig1", type=test_cvterm_assembly, uniquename="contig1", is_analysis=False, is_obsolete=False, timeaccessioned=datetime.now(timezone.utc), timelastmodified=datetime.now(timezone.utc), ) # DOI TESTING db2 = BibDatabase() db2.entries = [{ "journal": "Nice Journal", "comments": "A comment", "pages": "12--23", "month": "jan", "abstract": "This is an abstract. This line should be " "long enough to test multilines...", "title": "An amazing title", "year": "2013", "doi": "10.1186/s12864-016-2535-300002", "volume": "12", "ID": "Teste2018", "author": "Foo, b. and Foo1, b. 
and Foo b.", "keyword": "keyword1, keyword2", "ENTRYTYPE": "article", }] for entry in db2.entries: bibtest3 = PublicationLoader() bibtest3.store_bibtex_entry(entry) test_bibtex3 = Pub.objects.get(uniquename="Teste2018") test_bibtex3_pubdbxref = PubDbxref.objects.get(pub=test_bibtex3) test_bibtex3_dbxref = Dbxref.objects.get( dbxref_id=test_bibtex3_pubdbxref.dbxref_id) self.assertEqual("10.1186/s12864-016-2535-300002", test_bibtex3_dbxref.accession) # DOI: try to link feature to publication's DOI featurepub_test = None if feature and test_bibtex3_pubdbxref: featurepub_test = FeaturePub.objects.create( feature_id=feature.feature_id, pub_id=test_bibtex3_pubdbxref.pub_id) test_pub = Pub.objects.get(pub_id=featurepub_test.pub_id) self.assertEqual("An amazing title", test_pub.title) test_pubdbxref = PubDbxref.objects.get(pub=test_pub) test_dbxref = Dbxref.objects.get(dbxref_id=test_pubdbxref.dbxref_id) self.assertEqual("10.1186/s12864-016-2535-300002", test_dbxref.accession) # create a tabix feature class TabixFeature(object): """mock tabix feature.""" test_tabix_feature1 = TabixFeature() test_tabix_feature1.contig = "contig1" test_tabix_feature1.feature = "gene" test_tabix_feature1.start = "10" test_tabix_feature1.end = "100" test_tabix_feature1.strand = "+" test_tabix_feature1.frame = "1" test_tabix_feature1.attributes = "id=id1;name=name1" test_tabix_feature2 = TabixFeature() test_tabix_feature2.contig = "contig1" test_tabix_feature2.feature = "exon" test_tabix_feature2.start = "10" test_tabix_feature2.end = "100" test_tabix_feature2.strand = "-" test_tabix_feature2.frame = "2" test_tabix_feature2.attributes = "id=id2;name=name2;parent=id1" # instantiate the loader test_feature_file = FeatureLoader(filename="file.name", source="GFF_source") organism = "Mus musculus" # store the tabix feature qtl = False test_feature_file.store_tabix_GFF_feature(test_tabix_feature1, organism, qtl) test_feature_file.store_tabix_GFF_feature(test_tabix_feature2, organism, qtl) # store the relationships for item in test_feature_file.relationships: test_feature_file.store_relationship(organism, item["subject_id"], item["object_id"]) test_feature = Feature.objects.get(uniquename="id2") test_featureloc = Featureloc.objects.get(feature=test_feature) test_feature_relationship = FeatureRelationship.objects.get( object=test_feature.feature_id) test_src_feature = Feature.objects.get( feature_id=test_feature_relationship.subject.feature_id) self.assertEqual("name2", test_feature.name) self.assertEqual(10, test_featureloc.fmin) self.assertEqual("id1", test_src_feature.uniquename)
def test_store_tabix_VCF_feature(self): """Tests - store tabix VCF feature / store relationships.""" # creating exact term test_db_global = Db.objects.create(name="_global") test_dbxref = Dbxref.objects.create(accession="exact", db=test_db_global) test_cv = Cv.objects.create(name="synonym_type") Cvterm.objects.create( name="exact", cv=test_cv, dbxref=test_dbxref, is_obsolete=0, is_relationshiptype=0, ) # creating part_of term test_dbxref = Dbxref.objects.create(accession="part_of", db=test_db_global) test_cv = Cv.objects.create(name="sequence") Cvterm.objects.create( name="part_of", cv=test_cv, dbxref=test_dbxref, is_obsolete=0, is_relationshiptype=0, ) # create SO terms: assembly, gene, and exon test_db = Db.objects.create(name="SO") test_dbxref = Dbxref.objects.create(accession="00001", db=test_db) test_cvterm_assembly = Cvterm.objects.create( name="assembly", cv=test_cv, dbxref=test_dbxref, is_obsolete=0, is_relationshiptype=0, ) test_dbxref = Dbxref.objects.create(accession="00002", db=test_db) Cvterm.objects.create( name="snv", cv=test_cv, dbxref=test_dbxref, is_obsolete=0, is_relationshiptype=0, ) test_dbxref = Dbxref.objects.create(accession="00003", db=test_db) Cvterm.objects.create( name="snp", cv=test_cv, dbxref=test_dbxref, is_obsolete=0, is_relationshiptype=0, ) test_dbxref = Dbxref.objects.create(accession="00004", db=test_db) Cvterm.objects.create( name="polypeptide", cv=test_cv, dbxref=test_dbxref, is_obsolete=0, is_relationshiptype=0, ) test_dbxref = Dbxref.objects.create(accession="00005", db=test_db) Cvterm.objects.create( name="protein_match", cv=test_cv, dbxref=test_dbxref, is_obsolete=0, is_relationshiptype=0, ) test_dbxref = Dbxref.objects.create(accession="00006", db=test_db) Cvterm.objects.create( name="quality_value", cv=test_cv, dbxref=test_dbxref, is_obsolete=0, is_relationshiptype=0, ) # create RO term: contained in test_db = Db.objects.create(name="RO") test_dbxref = Dbxref.objects.create(accession="00002", db=test_db) test_cv = Cv.objects.create(name="relationship") Cvterm.objects.create( name="contained in", cv=test_cv, dbxref=test_dbxref, is_obsolete=0, is_relationshiptype=0, ) # create an organism test_organism = Organism.objects.create(genus="Mus", species="musculus") # create a srcfeature test_db = Db.objects.create(name="FASTA_SOURCE") test_dbxref = Dbxref.objects.create(accession="contig1", db=test_db) feature = Feature.objects.create( dbxref=test_dbxref, organism=test_organism, name="contig1", type=test_cvterm_assembly, uniquename="contig1", is_analysis=False, is_obsolete=False, timeaccessioned=datetime.now(timezone.utc), timelastmodified=datetime.now(timezone.utc), ) # DOI TESTING db2 = BibDatabase() db2.entries = [{ "journal": "Nice Journal", "comments": "A comment", "pages": "12--23", "month": "jan", "abstract": "This is an abstract. This line should be " "long enough to test multilines...", "title": "An amazing title", "year": "2013", "doi": "10.1186/s12864-016-2535-300002", "volume": "12", "ID": "Teste2018", "author": "Foo, b. and Foo1, b. 
and Foo b.", "keyword": "keyword1, keyword2", "ENTRYTYPE": "article", }] for entry in db2.entries: bibtest3 = PublicationLoader() bibtest3.store_bibtex_entry(entry) test_bibtex3 = Pub.objects.get(uniquename="Teste2018") test_bibtex3_pubdbxref = PubDbxref.objects.get(pub=test_bibtex3) test_bibtex3_dbxref = Dbxref.objects.get( dbxref_id=test_bibtex3_pubdbxref.dbxref_id) self.assertEqual("10.1186/s12864-016-2535-300002", test_bibtex3_dbxref.accession) # DOI: try to link feature to publication's DOI featurepub_test = None if feature and test_bibtex3_pubdbxref: featurepub_test = FeaturePub.objects.create( feature_id=feature.feature_id, pub_id=test_bibtex3_pubdbxref.pub_id) test_pub = Pub.objects.get(pub_id=featurepub_test.pub_id) self.assertEqual("An amazing title", test_pub.title) test_pubdbxref = PubDbxref.objects.get(pub=test_pub) test_dbxref = Dbxref.objects.get(dbxref_id=test_pubdbxref.dbxref_id) self.assertEqual("10.1186/s12864-016-2535-300002", test_dbxref.accession) # create a tabix feature class TabixFeature(object): """mock tabix feature.""" test_tabix_feature1 = TabixFeature() test_tabix_feature1.contig = "contig1" test_tabix_feature1.feature = "snp" test_tabix_feature1.pos = 10 test_tabix_feature1.id = "id1" test_tabix_feature1.ref = "A" test_tabix_feature1.alt = "T,C" test_tabix_feature1.info = "TSA=snv" test_tabix_feature1.qual = 10 test_tabix_feature2 = TabixFeature() test_tabix_feature2.contig = "contig1" test_tabix_feature2.feature = "snv" test_tabix_feature2.pos = 100 test_tabix_feature2.id = "id2" test_tabix_feature2.ref = "G" test_tabix_feature2.alt = "C,A" test_tabix_feature2.info = "VC=snp;SAO=0" test_tabix_feature2.qual = 20 # instantiate the loader test_feature_file = FeatureLoader(filename="file.name", source="VCF_SOURCE") organism = "Mus musculus" # store the tabix feature test_feature_file.store_tabix_VCF_feature(test_tabix_feature1, organism) test_feature_file.store_tabix_VCF_feature(test_tabix_feature2, organism) test_feature = Feature.objects.get(uniquename="id2") test_featurelocs = Featureloc.objects.filter(feature=test_feature) self.assertEqual(100, test_featurelocs[0].fmin) self.assertEqual("G", test_featurelocs[0].residue_info) self.assertEqual("C", test_featurelocs[1].residue_info) self.assertEqual("A", test_featurelocs[2].residue_info) self.assertEqual(0, test_featurelocs[0].rank) self.assertEqual(1, test_featurelocs[1].rank) self.assertEqual(2, test_featurelocs[2].rank) self.assertEqual("contig1", test_featurelocs[0].srcfeature.uniquename)
def handle(self, file: str, organism: str, doi: str = None,
           ignore: str = None, qtl: bool = False, cpu: int = 1,
           verbosity: int = 1, **options):
    """Execute the main function."""
    # retrieve only the file name
    filename = os.path.basename(file)
    if verbosity > 0:
        self.stdout.write("Processing file: {}".format(filename))

    try:
        FileValidator().validate(file)
    except ImportingError as e:
        raise CommandError(e)

    try:
        index_file = "{}.tbi".format(file)
        FileValidator().validate(index_file)
    except ImportingError:
        try:
            index_file = "{}.csi".format(file)
            FileValidator().validate(index_file)
        except ImportingError:
            raise CommandError("No index found (.tbi/.csi)")

    try:
        feature_file = FeatureLoader(filename=filename, source="GFF_SOURCE",
                                     doi=doi)
    except ImportingError as e:
        raise CommandError(e)

    pool = ThreadPoolExecutor(max_workers=cpu)
    tasks = list()
    chunk_size = cpu * 2

    # Load the GFF3 file
    with open(file) as tbx_file:
        tbx = pysam.TabixFile(filename=tbx_file.name, index=index_file)
        for row in tqdm(
            tbx.fetch(parser=pysam.asGTF()), total=get_num_lines(file)
        ):
            if ignore is not None and row.feature in ignore:
                continue
            tasks.append(
                pool.submit(
                    feature_file.store_tabix_GFF_feature, row, organism, qtl
                )
            )
            if len(tasks) >= chunk_size:
                for task in as_completed(tasks):
                    try:
                        task.result()
                    except ImportingError as e:
                        raise CommandError(e)
                tasks.clear()
        else:
            for task in as_completed(tasks):
                try:
                    task.result()
                except ImportingError as e:
                    raise CommandError(e)
            tasks.clear()
    pool.shutdown()

    if verbosity > 0:
        self.stdout.write("Loading relationships")
    pool = ThreadPoolExecutor(max_workers=cpu)
    tasks = list()
    for item in feature_file.relationships:
        tasks.append(
            pool.submit(
                feature_file.store_relationship,
                organism,
                item["subject_id"],
                item["object_id"],
            )
        )
    for task in tqdm(as_completed(tasks), total=len(tasks)):
        try:
            task.result()
        except ImportingError as e:
            raise CommandError(e)
    pool.shutdown()

    if feature_file.ignored_attrs is not None:
        self.stdout.write(
            self.style.WARNING(
                "Ignored attrs: {}".format(feature_file.ignored_attrs)
            )
        )

    if verbosity > 0:
        self.stdout.write(self.style.SUCCESS("Done with {}".format(filename)))
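# Usage sketch for the GFF loader above (an assumption, not part of this
# file): the command name "load_gff" is hypothetical and the keyword
# arguments mirror the handle() signature. The GFF3 file must already be
# bgzip-compressed and tabix-indexed (.tbi or .csi), as checked above.
def example_load_gff():
    """Illustrative only; the command name below is an assumption."""
    from django.core.management import call_command

    call_command(
        "load_gff",  # hypothetical command name
        file="genome.gff3.gz",
        organism="Mus musculus",
        ignore="chromosome",  # rows whose feature type matches are skipped
        qtl=False,
        cpu=4,
        verbosity=1,
    )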
def handle(
    self,
    file: str,
    cpu: int = 1,
    soterm: str = "mRNA",
    verbosity: int = 0,
    **options
):
    """Execute the main function."""
    # retrieve only the file name
    filename = os.path.basename(file)
    if verbosity > 0:
        self.stdout.write("Processing file: {}".format(filename))

    try:
        FileValidator().validate(file)
    except ImportingError as e:
        raise CommandError(e)

    try:
        pairs = open(file, "r")
    except ImportingError as e:
        raise CommandError(e)

    cvterm_corel = Cvterm.objects.get(
        name="correlated with", cv__name="relationship"
    ).cvterm_id

    # feature source is not needed here
    source = "null"
    featureloader = FeatureLoader(source=source, filename=filename)
    size = get_num_lines(file)
    # every cpu should be able to handle 5 tasks
    chunk = cpu * 5

    with ThreadPoolExecutor(max_workers=cpu) as pool:
        tasks = list()
        for line in tqdm(pairs, total=size):
            nfields = 3
            fields = re.split(r"\s+", line.rstrip())
            try:
                FieldsValidator().validate(nfields, fields)
            except ImportingError as e:
                raise CommandError(e)
            # get corrected PCC value (last item from fields list)
            value = float(fields.pop()) + 0.7
            tasks.append(
                pool.submit(
                    featureloader.store_feature_pairs,
                    pair=fields,
                    soterm=soterm,
                    term=cvterm_corel,
                    value=value,
                )
            )
            if len(tasks) >= chunk:
                for task in as_completed(tasks):
                    if task.result():
                        raise (task.result())
                tasks.clear()
        else:
            for task in as_completed(tasks):
                if task.result():
                    raise (task.result())
            tasks.clear()
        pool.shutdown()

    if verbosity > 0:
        self.stdout.write(self.style.SUCCESS("Done with {}".format(filename)))
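# The loop above re-adds the 0.7 that LSTrAP subtracts from each Pearson
# correlation coefficient in pcc.mcl.txt. A small worked example of that
# correction on a single record (the example values come from the file
# format documented in the coexpression tests earlier in this module):
def example_corrected_pcc(
    line="AT2G44195.1.TAIR10\tAT1G30080.1.TAIR10\t0.18189286870895194",
):
    """Return (feature_a, feature_b, original PCC) for one pcc.mcl.txt line."""
    import re

    fields = re.split(r"\s+", line.rstrip())
    value = float(fields.pop()) + 0.7  # stored value is PCC - 0.7
    return fields[0], fields[1], value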
def test_orthology(self): """Tests - __init__.""" # register multispecies organism so_db = Db.objects.create(name="SO") so_cv = Cv.objects.create(name="sequence") # creating test RO term ro_db = Db.objects.create(name="RO") ro_cv = Cv.objects.create(name="relationship") fo_db = Db.objects.create(name="ORTHOMCL_SOURCE") fo_cv = Cv.objects.create(name="feature_property") # test_dbxref = Dbxref.objects.create(accession='123456', db=test_db) so_dbxref = Dbxref.objects.create(accession="357", db=so_db) so_dbxref2 = Dbxref.objects.create(accession="358", db=so_db) ro_dbxref = Dbxref.objects.create(accession="658", db=ro_db) # creating test SO term Cvterm.objects.create( name="contained in", cv=ro_cv, dbxref=ro_dbxref, is_obsolete=0, is_relationshiptype=1, ) ortho_dbxref = Dbxref.objects.create(accession="ORTHOMCL_SOURCE", db=fo_db) term = Cvterm.objects.create( name="orthologous group", cv=fo_cv, dbxref=ortho_dbxref, is_obsolete=0, is_relationshiptype=0, ) poly_cvterm = Cvterm.objects.create( name="polypeptide", cv=so_cv, dbxref=so_dbxref, is_obsolete=0, is_relationshiptype=0, ) Cvterm.objects.create( name="protein_match", cv=so_cv, dbxref=so_dbxref2, is_obsolete=0, is_relationshiptype=0, ) db_null = Db.objects.create(name="null") null_dbxref = Dbxref.objects.create(db=db_null, accession="null") null_cv = Cv.objects.create(name="null") Cvterm.objects.create( cv=null_cv, name="null", definition="", dbxref=null_dbxref, is_obsolete=0, is_relationshiptype=0, ) # need to insert organisms first organism1 = Organism.objects.create(species="coerulea", genus="Aquilegia", abbreviation="Aco") organism2 = Organism.objects.create(species="distachyon", genus="Brachypodium", abbreviation="Brd") organism3 = Organism.objects.create(species="clementina", genus="Citrus", abbreviation="Ccl") organism4 = Organism.objects.create(species="carota", genus="Dacus", abbreviation="Dca") organism5 = Organism.objects.create(species="grandis", genus="Eucalyptus", abbreviation="Egr") organism6 = Organism.objects.create(species="vesca", genus="Fragaria", abbreviation="Fve") organism7 = Organism.objects.create(species="max", genus="Glycine", abbreviation="Gma") organism8 = Organism.objects.create(species="fedtschenkoi", genus="Kalanchoe", abbreviation="Kld") self.assertTrue(Organism.objects.filter(abbreviation="Aco").exists()) self.assertTrue(Organism.objects.filter(abbreviation="Brd").exists()) self.assertTrue(Organism.objects.filter(abbreviation="Ccl").exists()) self.assertTrue(Organism.objects.filter(abbreviation="Dca").exists()) self.assertTrue(Organism.objects.filter(abbreviation="Egr").exists()) self.assertTrue(Organism.objects.filter(abbreviation="Fve").exists()) self.assertTrue(Organism.objects.filter(abbreviation="Gma").exists()) self.assertTrue(Organism.objects.filter(abbreviation="Kld").exists()) # also need to insert Features from fasta file first. 
# inserting: Aqcoe0131s0001.1.v3.1 db = Db.objects.create(name="FASTA_SOURCE") acc1 = "Aqcoe0131s0001.1.v3.1" dbxref1 = Dbxref.objects.create(db=db, accession=acc1) feature1 = Feature.objects.create( dbxref=dbxref1, organism=organism1, uniquename="Aqcoe0131s0001.1.v3.1", type=poly_cvterm, is_analysis=False, is_obsolete=False, timeaccessioned=datetime.now(timezone.utc), timelastmodified=datetime.now(timezone.utc), ) self.assertTrue( Feature.objects.filter( type__cv__name="sequence", type__name="polypeptide", dbxref__accession=acc1, dbxref__db__name="FASTA_SOURCE", ).exists()) # inserting: Bradi0180s00100.1.v3.1; Bradi2g20400.1.v3.1 acc2 = "Bradi0180s00100.1.v3.1" dbxref2 = Dbxref.objects.create(db=db, accession=acc2) Feature.objects.create( dbxref=dbxref2, organism=organism2, uniquename="Bradi0180s00100.1.v3.1", type=poly_cvterm, is_analysis=False, is_obsolete=False, timeaccessioned=datetime.now(timezone.utc), timelastmodified=datetime.now(timezone.utc), ) self.assertTrue( Feature.objects.filter( type__cv__name="sequence", type__name="polypeptide", dbxref__accession=acc2, dbxref__db__name__in=["GFF_SOURCE", "FASTA_SOURCE"], ).exists()) acc3 = "Bradi2g20400.1.v3.1" dbxref3 = Dbxref.objects.create(db=db, accession=acc3) Feature.objects.create( dbxref=dbxref3, organism=organism2, uniquename="Bradi2g20400.1.v3.1", type=poly_cvterm, is_analysis=False, is_obsolete=False, timeaccessioned=datetime.now(timezone.utc), timelastmodified=datetime.now(timezone.utc), ) self.assertTrue( Feature.objects.filter( type__cv__name="sequence", type__name="polypeptide", dbxref__accession=acc3, dbxref__db__name__in=["GFF_SOURCE", "FASTA_SOURCE"], ).exists()) # inserting: Ciclev10013963m.v1.0; Ciclev10013962m.v1.0; # Ciclev10013970m.v1.0 acc4 = "Ciclev10013963m.v1.0" dbxref4 = Dbxref.objects.create(db=db, accession=acc4) feature4 = Feature.objects.create( dbxref=dbxref4, organism=organism3, uniquename="Ciclev10013963m.v1.0", type=poly_cvterm, is_analysis=False, is_obsolete=False, timeaccessioned=datetime.now(timezone.utc), timelastmodified=datetime.now(timezone.utc), ) self.assertTrue( Feature.objects.filter( type__cv__name="sequence", type__name="polypeptide", dbxref__accession=acc4, dbxref__db__name__in=["GFF_SOURCE", "FASTA_SOURCE"], ).exists()) acc5 = "Ciclev10013962m.v1.0" dbxref5 = Dbxref.objects.create(db=db, accession=acc5) Feature.objects.create( dbxref=dbxref5, organism=organism3, uniquename="Ciclev10013962m.v1.0", type=poly_cvterm, is_analysis=False, is_obsolete=False, timeaccessioned=datetime.now(timezone.utc), timelastmodified=datetime.now(timezone.utc), ) self.assertTrue( Feature.objects.filter( type__cv__name="sequence", type__name="polypeptide", dbxref__accession=acc5, dbxref__db__name__in=["GFF_SOURCE", "FASTA_SOURCE"], ).exists()) acc6 = "Ciclev10013970m.v1.0" dbxref6 = Dbxref.objects.create(db=db, accession=acc6) Feature.objects.create( dbxref=dbxref6, organism=organism3, uniquename="Ciclev10013970m.v1.0", type=poly_cvterm, is_analysis=False, is_obsolete=False, timeaccessioned=datetime.now(timezone.utc), timelastmodified=datetime.now(timezone.utc), ) self.assertTrue( Feature.objects.filter( type__cv__name="sequence", type__name="polypeptide", dbxref__accession=acc6, dbxref__db__name__in=["GFF_SOURCE", "FASTA_SOURCE"], ).exists()) # inserting: DCAR_032182.v1.0.388; DCAR_031986.v1.0.388; # DCAR_032223.v1.0.388; DCAR_000323.v1.0.388 acc7 = "DCAR_032182.v1.0.388" dbxref7 = Dbxref.objects.create(db=db, accession=acc7) Feature.objects.create( dbxref=dbxref7, organism=organism4, 
uniquename="DCAR_032182.v1.0.388", type=poly_cvterm, is_analysis=False, is_obsolete=False, timeaccessioned=datetime.now(timezone.utc), timelastmodified=datetime.now(timezone.utc), ) self.assertTrue( Feature.objects.filter( type__cv__name="sequence", type__name="polypeptide", dbxref__accession=acc7, dbxref__db__name__in=["GFF_SOURCE", "FASTA_SOURCE"], ).exists()) acc8 = "DCAR_031986.v1.0.388" dbxref8 = Dbxref.objects.create(db=db, accession=acc8) Feature.objects.create( dbxref=dbxref8, organism=organism4, uniquename="DCAR_031986.v1.0.388", type=poly_cvterm, is_analysis=False, is_obsolete=False, timeaccessioned=datetime.now(timezone.utc), timelastmodified=datetime.now(timezone.utc), ) self.assertTrue( Feature.objects.filter( type__cv__name="sequence", type__name="polypeptide", dbxref__accession=acc8, dbxref__db__name__in=["GFF_SOURCE", "FASTA_SOURCE"], ).exists()) acc9 = "DCAR_032223.v1.0.388" dbxref9 = Dbxref.objects.create(db=db, accession=acc9) feature9 = Feature.objects.create( dbxref=dbxref9, organism=organism4, uniquename="DCAR_032223.v1.0.388", type=poly_cvterm, is_analysis=False, is_obsolete=False, timeaccessioned=datetime.now(timezone.utc), timelastmodified=datetime.now(timezone.utc), ) self.assertTrue( Feature.objects.filter( type__cv__name="sequence", type__name="polypeptide", dbxref__accession=acc9, dbxref__db__name__in=["GFF_SOURCE", "FASTA_SOURCE"], ).exists()) acc10 = "DCAR_000323.v1.0.388" dbxref10 = Dbxref.objects.create(db=db, accession=acc10) feature10 = Feature.objects.create( dbxref=dbxref10, organism=organism4, uniquename="DCAR_000323.v1.0.388", type=poly_cvterm, is_analysis=False, is_obsolete=False, timeaccessioned=datetime.now(timezone.utc), timelastmodified=datetime.now(timezone.utc), ) self.assertTrue( Feature.objects.filter( type__cv__name="sequence", type__name="polypeptide", dbxref__accession=acc10, dbxref__db__name__in=["GFF_SOURCE", "FASTA_SOURCE"], ).exists()) # inserting: Eucgr.L02820.1.v2.0 acc11 = "Eucgr.L02820.1.v2.0" dbxref11 = Dbxref.objects.create(db=db, accession=acc11) Feature.objects.create( dbxref=dbxref11, organism=organism5, uniquename="Eucgr.L02820.1.v2.0", type=poly_cvterm, is_analysis=False, is_obsolete=False, timeaccessioned=datetime.now(timezone.utc), timelastmodified=datetime.now(timezone.utc), ) self.assertTrue( Feature.objects.filter( type__cv__name="sequence", type__name="polypeptide", dbxref__accession=acc11, dbxref__db__name__in=["GFF_SOURCE", "FASTA_SOURCE"], ).exists()) # inserting: mrna13067.1-v1.0-hybrid.v1.1 acc12 = "mrna13067.1-v1.0-hybrid.v1.1" dbxref12 = Dbxref.objects.create(db=db, accession=acc12) Feature.objects.create( dbxref=dbxref12, organism=organism6, uniquename="mrna13067.1-v1.0-hybrid.v1.1", type=poly_cvterm, is_analysis=False, is_obsolete=False, timeaccessioned=datetime.now(timezone.utc), timelastmodified=datetime.now(timezone.utc), ) self.assertTrue( Feature.objects.filter( type__cv__name="sequence", type__name="polypeptide", dbxref__accession=acc12, dbxref__db__name__in=["GFF_SOURCE", "FASTA_SOURCE"], ).exists()) # inserting: Glyma.10G030500.1.Wm82.a2.v1; Glyma.10G053100.1.Wm82.a2.v1 acc13 = "Glyma.10G030500.1.Wm82.a2.v1" dbxref13 = Dbxref.objects.create(db=db, accession=acc13) Feature.objects.create( dbxref=dbxref13, organism=organism7, uniquename="Glyma.10G030500.1.Wm82.a2.v1", type_id=poly_cvterm.cvterm_id, is_analysis=False, is_obsolete=False, timeaccessioned=datetime.now(timezone.utc), timelastmodified=datetime.now(timezone.utc), ) self.assertTrue( Feature.objects.filter( type__cv__name="sequence", 
type__name="polypeptide", dbxref__accession=acc13, dbxref__db__name__in=["GFF_SOURCE", "FASTA_SOURCE"], ).exists()) acc14 = "Glyma.10G053100.1.Wm82.a2.v1" dbxref14 = Dbxref.objects.create(db=db, accession=acc14) feature14 = Feature.objects.create( dbxref=dbxref14, organism=organism7, uniquename="Glyma.10G053100.1.Wm82.a2.v1", type_id=poly_cvterm.cvterm_id, is_analysis=False, is_obsolete=False, timeaccessioned=datetime.now(timezone.utc), timelastmodified=datetime.now(timezone.utc), ) self.assertTrue( Feature.objects.filter( type__cv__name="sequence", type__name="polypeptide", dbxref__accession=acc14, dbxref__db__name__in=["GFF_SOURCE", "FASTA_SOURCE"], ).exists()) acc15 = "Glyma.10G008400.1.Wm82.a2.v1" dbxref15 = Dbxref.objects.create(db=db, accession=acc15) Feature.objects.create( dbxref=dbxref15, organism=organism7, uniquename="Glyma.10G008400.1.Wm82.a2.v1", type_id=poly_cvterm.cvterm_id, is_analysis=False, is_obsolete=False, timeaccessioned=datetime.now(timezone.utc), timelastmodified=datetime.now(timezone.utc), ) self.assertTrue( Feature.objects.filter( type__cv__name="sequence", type__name="polypeptide", dbxref__accession=acc15, dbxref__db__name__in=["GFF_SOURCE", "FASTA_SOURCE"], ).exists()) # inserting: Kaladp0598s0001.1.v1.1 acc16 = "Kaladp0598s0001.1.v1.1" dbxref16 = Dbxref.objects.create(db=db, accession=acc16) feature16 = Feature.objects.create( dbxref=dbxref16, organism=organism8, uniquename="Kaladp0598s0001.1.v1.1", type_id=poly_cvterm.cvterm_id, is_analysis=False, is_obsolete=False, timeaccessioned=datetime.now(timezone.utc), timelastmodified=datetime.now(timezone.utc), ) self.assertTrue( Feature.objects.filter( type__cv__name="sequence", type__name="polypeptide", dbxref__accession=acc16, dbxref__db__name__in=["GFF_SOURCE", "FASTA_SOURCE"], ).exists()) acc17 = "Kaladp0598s0002.1.v1.1" dbxref17 = Dbxref.objects.create(db=db, accession=acc17) feature17 = Feature.objects.create( dbxref=dbxref17, organism=organism8, uniquename="Kaladp0598s0002.1.v1.1", type_id=poly_cvterm.cvterm_id, is_analysis=False, is_obsolete=False, timeaccessioned=datetime.now(timezone.utc), timelastmodified=datetime.now(timezone.utc), ) self.assertTrue( Feature.objects.filter( type__cv__name="sequence", type__name="polypeptide", dbxref__accession=acc17, dbxref__db__name__in=["GFF_SOURCE", "FASTA_SOURCE"], ).exists()) # ######################## # store feature groups: filename = "groups.txt" organism, created = Organism.objects.get_or_create( abbreviation="multispecies", genus="multispecies", species="multispecies", common_name="multispecies", ) source = "null" soterm = "polypeptide" test_orthology_loader = FeatureLoader(source=source, filename=filename) # #################### # test store groups group1_name = "machado0001" members1 = [ "Aqcoe0131s0001.1.v3.1", "Bradi0180s00100.1.v3.1", "Bradi2g20400.1.v3.1", "Ciclev10013963m.v1.0", "DCAR_032223.v1.0.388", "UnknownProtein.v1.1", ] test_orthology_loader.store_feature_groups(group=members1, soterm=soterm, term=term, value=group1_name) group2_name = "machado0002" members2 = [ "Eucgr.L02820.1.v2.0", "mrna13067.1-v1.0-hybrid.v1.1", "Ciclev10013970m.v1.0", "DCAR_031986.v1.0.388", ] test_orthology_loader.store_feature_groups(group=members2, soterm=soterm, term=term, value=group2_name) group3_name = "machado0003" members3 = [ "Glyma.10G030500.1.Wm82.a2.v1", "Glyma.10G053100.1.Wm82.a2.v1", "DCAR_032182.v1.0.388", ] test_orthology_loader.store_feature_groups(group=members3, soterm=soterm, term=term, value=group3_name) group4_name = "machado0004" members4 = 
["Glyma.10G008400.1.Wm82.a2.v1", "", "UnknownProtein.v1.2"] test_orthology_loader.store_feature_groups(group=members4, soterm=soterm, term=term, value=group4_name) group5_name = "machado0005" members5 = ["DCAR_000323.v1.0.388", "Kaladp0598s0002.1.v1.1"] test_orthology_loader.store_feature_groups(group=members5, soterm=soterm, term=term, value=group5_name) group6_name = "machado0006" members6 = ["Kaladp0598s0001.1.v1.1", "UnknownProtein.v1.3"] test_orthology_loader.store_feature_groups(group=members6, soterm=soterm, term=term, value=group6_name) group7_name = "machado0007" members7 = ["UnknownProtein.v1.4"] test_orthology_loader.store_feature_groups(group=members7, soterm=soterm, term=term, value=group7_name) # ###check if relationships exist### # in a group (machado0001 and machado0005) self.assertTrue( Featureprop.objects.filter(feature_id=feature1.feature_id, type_id=term, value=group1_name).exists()) self.assertTrue( Featureprop.objects.filter(feature_id=feature9.feature_id, type_id=term, value=group1_name).exists()) self.assertTrue( Featureprop.objects.filter(feature_id=feature4.feature_id, type_id=term, value=group1_name).exists()) # another example group5 self.assertTrue( Featureprop.objects.filter(feature_id=feature10.feature_id, type_id=term, value=group5_name).exists()) self.assertTrue( Featureprop.objects.filter(feature_id=feature17.feature_id, type_id=term, value=group5_name).exists()) # another example: # ###check if a relationship does not exist### # between features from different groups (machado0004 and machado0003) self.assertFalse( Featureprop.objects.filter(feature_id=feature16.feature_id, type_id=term, value=group6_name).exists()) self.assertFalse( Featureprop.objects.filter(feature_id=feature4.feature_id, type_id=term, value=group2_name).exists()) self.assertFalse( Featureprop.objects.filter(feature_id=feature14.feature_id, type_id=term, value=group1_name).exists())
def test_store_feature_publication(self): """Tests - store feature publication.""" test_db = Db.objects.create(name="RO") test_dbxref = Dbxref.objects.create(accession="00002", db=test_db) test_cv = Cv.objects.create(name="relationship") Cvterm.objects.create( name="contained in", cv=test_cv, dbxref=test_dbxref, is_obsolete=0, is_relationshiptype=0, ) test_db = Db.objects.create(name="SO") test_dbxref = Dbxref.objects.create(accession="12345", db=test_db) test_cv = Cv.objects.create(name="sequence") test_so_term = Cvterm.objects.create( name="gene", cv=test_cv, dbxref=test_dbxref, is_obsolete=0, is_relationshiptype=0, ) test_dbxref = Dbxref.objects.create(accession="123456", db=test_db) Cvterm.objects.create( name="polypeptide", cv=test_cv, dbxref=test_dbxref, is_obsolete=0, is_relationshiptype=0, ) test_dbxref = Dbxref.objects.create(accession="123455", db=test_db) Cvterm.objects.create( name="protein_match", cv=test_cv, dbxref=test_dbxref, is_obsolete=0, is_relationshiptype=0, ) test_organism = Organism.objects.create(genus="Mus", species="musculus") test_db = Db.objects.create(name="GFF_SOURCE") test_dbxref = Dbxref.objects.create(accession="feat_gene", db=test_db) test_feature = Feature.objects.create( organism=test_organism, uniquename="feat_gene", dbxref=test_dbxref, is_analysis=False, type_id=test_so_term.cvterm_id, is_obsolete=False, timeaccessioned=datetime.now(timezone.utc), timelastmodified=datetime.now(timezone.utc), ) db2 = BibDatabase() db2.entries = [{ "journal": "Nice Journal", "comments": "A comment", "pages": "12--23", "month": "jan", "abstract": "This is an abstract. This line should be " "long enough to test multilines...", "title": "An amazing title", "year": "2013", "doi": "10.1186/s12864-016-2535-300002", "volume": "12", "ID": "Teste2018", "author": "Foo, b. and Foo1, b. and Foo b.", "keyword": "keyword1, keyword2", "ENTRYTYPE": "article", }] for entry in db2.entries: bibtest = PublicationLoader() bibtest.store_bibtex_entry(entry) test_feature_file = FeatureLoader(filename="file.name", source="GFF_loader") test_feature_file.store_feature_publication( feature="feat_gene", soterm="gene", doi="10.1186/s12864-016-2535-300002") test_featurepub = FeaturePub.objects.get(feature=test_feature) self.assertEqual("An amazing title", test_featurepub.pub.title)
def test_process_attributes(self): """Tests - get attributes.""" test_organism = Organism.objects.create(genus="Mus", species="musculus") # creating test GO term test_db = Db.objects.create(name="GO") test_dbxref = Dbxref.objects.create(accession="12345", db=test_db) test_cv = Cv.objects.create(name="biological_process") Cvterm.objects.create( name="go test term", cv=test_cv, dbxref=test_dbxref, is_obsolete=0, is_relationshiptype=0, ) # creating test SO term test_db = Db.objects.create(name="SO") test_dbxref = Dbxref.objects.create(accession="12345", db=test_db) test_cv = Cv.objects.create(name="sequence") Cvterm.objects.create( name="gene", cv=test_cv, dbxref=test_dbxref, is_obsolete=0, is_relationshiptype=0, ) test_dbxref = Dbxref.objects.create(accession="123455", db=test_db) test_so_term = Cvterm.objects.create( name="polypeptide", cv=test_cv, dbxref=test_dbxref, is_obsolete=0, is_relationshiptype=0, ) test_dbxref = Dbxref.objects.create(accession="1234555", db=test_db) Cvterm.objects.create( name="protein_match", cv=test_cv, dbxref=test_dbxref, is_obsolete=0, is_relationshiptype=0, ) # creating test feature test_feature = Feature.objects.create( organism=test_organism, uniquename="feat1", is_analysis=False, type_id=test_so_term.cvterm_id, is_obsolete=False, timeaccessioned=datetime.now(timezone.utc), timelastmodified=datetime.now(timezone.utc), ) # creating exact term test_db_global = Db.objects.create(name="_global") test_dbxref = Dbxref.objects.create(accession="exact", db=test_db_global) test_cv = Cv.objects.create(name="synonym_type") Cvterm.objects.create( name="exact", cv=test_cv, dbxref=test_dbxref, is_obsolete=0, is_relationshiptype=0, ) test_db = Db.objects.create(name="RO") test_dbxref = Dbxref.objects.create(accession="00002", db=test_db) test_cv = Cv.objects.create(name="relationship") Cvterm.objects.create( name="contained in", cv=test_cv, dbxref=test_dbxref, is_obsolete=0, is_relationshiptype=0, ) # new FeatureLoader FeatureLoader(filename="file.name", source="GFF_source") # running get_attributes test_attrs_file = FeatureAttributesLoader(filecontent="genome") test_attrs = test_attrs_file.get_attributes( "ID=1;name=feat1;note=Test feature;display=feat1;gene=gene1;" "orf_classification=1;ontology_term=GO:12345,GO:54321;parent=2;" "alias=Feature1;dbxref=GI:12345,NC:12345;noecziste=True") # running process_attributes test_attrs_file.process_attributes(feature_id=test_feature.feature_id, attrs=test_attrs) # creating feature_property cvterm cv_feature_property = Cv.objects.get(name="feature_property") # asserting note test_prop_cvterm = Cvterm.objects.get(name="note", cv=cv_feature_property) test_prop = Featureprop.objects.get(feature=test_feature, type_id=test_prop_cvterm.cvterm_id, rank=0) self.assertEqual("Test feature", test_prop.value) # asserting ontology_term test_feat_cvterm = FeatureCvterm.objects.get(feature=test_feature) test_cvterm = Cvterm.objects.get(cvterm_id=test_feat_cvterm.cvterm_id) self.assertEqual("go test term", test_cvterm.name) # asserting dbxref test_dbxref_ids = FeatureDbxref.objects.filter( feature=test_feature).values_list("dbxref_id", flat=True) test_db = Db.objects.get(name="GI") test_dbxref = Dbxref.objects.get(dbxref_id__in=test_dbxref_ids, db=test_db) self.assertEqual("12345", test_dbxref.accession) # asserting alias test_synonym = FeatureSynonym.objects.select_related("synonym").get( feature=test_feature) self.assertEqual("Feature1", test_synonym.synonym.name) # asserting ignored goterms self.assertEqual("GO:54321", 
test_attrs_file.ignored_goterms.pop())
def handle(self, file: str, organism: str, soterm: str = "mRNA",
           cpu: int = 1, verbosity: int = 0, **options):
    """Execute the main function."""
    # retrieve only the file name
    filename = os.path.basename(file)
    if verbosity > 0:
        self.stdout.write("Processing file: {}".format(filename))

    try:
        organism = retrieve_organism(organism)
    except IntegrityError as e:
        raise ImportingError(e)

    try:
        FileValidator().validate(file)
    except ImportingError as e:
        raise CommandError(e)

    try:
        clusters = open(file, "r")
    except ImportingError as e:
        raise CommandError(e)

    tasks = list()
    cv, created = Cv.objects.get_or_create(name="feature_property")
    coexp_db, created = Db.objects.get_or_create(name="LSTRAP_SOURCE")
    coexp_dbxref, created = Dbxref.objects.get_or_create(
        accession="LSTRAP_SOURCE", db=coexp_db
    )
    cvterm_cluster, created = Cvterm.objects.get_or_create(
        name="coexpression group",
        cv=cv,
        dbxref=coexp_dbxref,
        is_obsolete=0,
        is_relationshiptype=0,
    )
    # feature source is not needed here
    source = "null"
    featureloader = FeatureLoader(source=source, filename=filename)
    pool = ThreadPoolExecutor(max_workers=cpu)

    # each line is a coexpression cluster group
    for line in tqdm(clusters, total=get_num_lines(file)):
        name = ""
        fields = re.split(r"\s+", line.strip())
        nfields = len(fields)
        try:
            FieldsValidator().validate(nfields, fields)
        except ImportingError as e:
            raise CommandError(e)
        if re.search(r"^(\w+)\:", fields[0]):
            group_field = re.match(r"^(\w+)\:", fields[0])
            name = group_field.group(1)
        else:
            raise CommandError("Cluster identification has problems.")
        # remove the cluster name before loading
        fields.pop(0)
        tasks.append(
            pool.submit(
                featureloader.store_feature_groups,
                group=fields,
                soterm=soterm,
                term=cvterm_cluster.cvterm_id,
                value=name,
            )
        )

    if verbosity > 0:
        self.stdout.write("Loading")
    for task in tqdm(as_completed(tasks), total=len(tasks)):
        if task.result():
            raise (task.result())
    pool.shutdown()

    if verbosity > 0:
        self.stdout.write(self.style.SUCCESS("Done with {}".format(filename)))
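# Each mcl.clusters.txt line handled above starts with the cluster name
# followed by its member features, e.g.
# "ath_coexpr_mcl_1: AT3G18715.1.TAIR10 AT3G08790.1.TAIR10". A standalone
# sketch of the same split performed before store_feature_groups is called:
def example_parse_cluster_line(line: str):
    """Return (cluster_name, [member features]) for one clusters file line."""
    import re

    fields = re.split(r"\s+", line.strip())
    match = re.match(r"^(\w+)\:", fields[0])
    if match is None:
        raise ValueError("Cluster identification has problems.")
    return match.group(1), fields[1:]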