Example #1
0
 def testTopLevelIdsUnique(self):
     """
     Two different top-level container types that share a dataset and
     the same local id must still receive distinct compound ids.
     """
     parent = datasets.Dataset("a")
     sharedLocalId = "b"
     readGroupSet = reads.AbstractReadGroupSet(parent, sharedLocalId)
     variantSet = variants.AbstractVariantSet(parent, sharedLocalId)
     self.assertNotEqual(readGroupSet.getId(), variantSet.getId())
Example #2
0
 def testCreation(self):
     """
     Checks that a SimulatedVariantAnnotationSet is linked to its
     parent variant set, and that each generated annotation has an
     ISO8601 `created` timestamp and references its variant's id.
     """
     dataset = datasets.Dataset('dataset1')
     referenceSet = references.SimulatedReferenceSet("srs1")
     localId = "variantAnnotationSetId"
     simulatedVariantSet = variants.SimulatedVariantSet(
         dataset,
         referenceSet,
         'variantSet1',
         randomSeed=self.randomSeed,
         numCalls=self.numCalls,
         variantDensity=self.variantDensity)
     simulatedVariantAnnotationSet = variants.SimulatedVariantAnnotationSet(
         simulatedVariantSet, localId, self.randomSeed)
     annotations = simulatedVariantAnnotationSet.getVariantAnnotations(
         self.referenceName, self.startPosition, self.endPosition)
     # assertEquals is a deprecated alias of assertEqual; use the
     # canonical name so the test does not emit DeprecationWarnings.
     self.assertEqual(
         simulatedVariantSet.toProtocolElement().id,
         simulatedVariantAnnotationSet.toProtocolElement().variant_set_id,
         "Variant Set ID should match the annotation's variant set ID")
     for variant, ann in annotations:
         # Round-tripping through strptime/strftime reproduces the
         # original string only when it already matches the exact
         # ISO8601 format, so equality proves the format is correct.
         self.assertEqual(
             datetime.datetime.strptime(
                 ann.created,
                 "%Y-%m-%dT%H:%M:%S.%fZ").strftime("%Y-%m-%dT%H:%M:%S.%fZ"),
             ann.created, "Expect time format to be in ISO8601")
         self.assertEqual(variant.id, ann.variant_id)
Example #3
0
 def testToProtocolElement(self):
     """
     Round-trips a valid Individual through JSON and the datamodel,
     then checks that malformed JSON is rejected.
     """
     dataset = datasets.Dataset('dataset1')
     sexTerm = protocol.OntologyTerm()
     sexTerm.term = "male genotypic sex"
     sexTerm.id = "PATO:0020001"
     sexTerm.source_name = "PATO"
     sexTerm.source_version = pb.string("2015-11-18")
     # Write out a valid input
     print(protocol.toJsonDict(sexTerm))
     source = protocol.Individual(
         name="test",
         created="2016-05-19T21:00:19Z",
         updated="2016-05-19T21:00:19Z",
         sex=sexTerm)
     source.info['test'].values.add().string_value = 'test-info'
     # Populate a datamodel object from the serialized protocol object.
     modelIndividual = bioMetadata.Individual(dataset, "test")
     modelIndividual.populateFromJson(protocol.toJson(source))
     roundTripped = modelIndividual.toProtocolElement()
     # The timestamps must survive the round trip.
     self.assertEqual(roundTripped.created, source.created)
     self.assertEqual(roundTripped.updated, source.updated)
     # Malformed JSON must raise InvalidJsonException.
     badJson = '{"bad:", "json"}'
     modelIndividual = bioMetadata.Individual(dataset, "test")
     self.assertRaises(exceptions.InvalidJsonException,
                       modelIndividual.populateFromJson, badJson)
Example #4
0
 def testGetDatasetByIndexBadIndex(self):
     """
     Out-of-range indexes raise IndexError and non-integer indexes
     raise TypeError, both before and after a dataset is added.
     """
     getByIndex = self._dataRepo.getDatasetByIndex
     # Empty repo: index 0 is already out of range.
     self.assertRaises(IndexError, getByIndex, 0)
     self.assertRaises(TypeError, getByIndex, None)
     self.assertRaises(TypeError, getByIndex, "")
     self._dataRepo.addDataset(datasets.Dataset("ds"))
     # One dataset present: index 1 is still out of range.
     self.assertRaises(IndexError, getByIndex, 1)
Example #5
0
 def __init__(self, variantSetId, baseDir):
     """
     Reads every bgzipped VCF file in the data directory and caches
     the variant records for later comparison against the data model.

     :param variantSetId: local id forwarded to the base class.
     :param baseDir: directory name forwarded to the base class.
     """
     self._dataset = datasets.Dataset("ds")
     super(VariantSetTest, self).__init__(variantSetId, baseDir)
     self._variantRecords = []
     self._reference_names = set()
     # Read in all the VCF files in datadir and store each variant.
     # NOTE(review): self._dataPath is presumably set by the base-class
     # constructor above — confirm before reordering these statements.
     for vcfFile in glob.glob(os.path.join(self._dataPath, "*.vcf.gz")):
         self._readVcf(vcfFile)
Example #6
0
 def testCreation(self):
     """
     Every read group in a simulated read group set must yield at
     least one read alignment.
     """
     readGroupSet = reads.SimulatedReadGroupSet(
         datasets.Dataset('dataset1'), "readGroupSetId",
         references.SimulatedReferenceSet("srs1"))
     for group in readGroupSet.getReadGroups():
         self.assertGreater(len(list(group.getReadAlignments())), 0)
Example #7
0
 def addDataset(self):
     """
     Adds a new dataset into this repo.

     The dataset name, description and JSON-encoded info string come
     from the parsed command-line arguments.
     """
     self._openRepo()
     newDataset = datasets.Dataset(self._args.datasetName)
     newDataset.setDescription(self._args.description)
     newDataset.setInfo(json.loads(self._args.info))
     self._updateRepo(self._repo.insertDataset, newDataset)
Example #8
0
 def _readDatasetTable(self, cursor):
     """
     Loads every row of the Dataset table into the in-memory object
     model.
     """
     cursor.row_factory = sqlite3.Row
     cursor.execute("SELECT * FROM Dataset;")
     for datasetRow in cursor:
         newDataset = datasets.Dataset(datasetRow[b'name'])
         newDataset.populateFromRow(datasetRow)
         # The id stored in the row must match the derived compound id.
         assert newDataset.getId() == datasetRow[b"id"]
         self.addDataset(newDataset)
Example #9
0
 def __init__(self, localId, dataPath):
     """
     Opens the alignment file at dataPath and caches reference and
     alignment information for comparison against the data model.

     :param localId: local id forwarded to the base class.
     :param dataPath: path to the alignment file to read.
     """
     self._backend = backend.Backend(datarepo.AbstractDataRepository())
     self._referenceSet = None
     self._dataset = datasets.Dataset("ds")
     self._readGroupInfos = {}
     self._readGroupSetInfo = None
     # The file must be open before _readReferences can run.
     self._samFile = pysam.AlignmentFile(dataPath)
     self._readReferences()
     super(ReadGroupSetTest, self).__init__(localId, dataPath)
     # NOTE(review): called after the base constructor — presumably it
     # depends on state set up there; confirm before reordering.
     self._readAlignmentInfo()
Example #10
0
 def _getSimulatedVariantSet(self):
     """
     Builds and returns a SimulatedVariantSet using this test case's
     simulation parameters.
     """
     return variants.SimulatedVariantSet(
         datasets.Dataset('dataset1'),
         references.SimulatedReferenceSet("srs1"),
         'variantSet1',
         randomSeed=self.randomSeed,
         numCalls=self.numCalls,
         variantDensity=self.variantDensity)
Example #11
0
 def testAddOneDataset(self):
     """
     A single dataset added to an empty repo must be retrievable by
     index, by name and by id.
     """
     repo = self._dataRepo
     name = "ds"
     dataset = datasets.Dataset(name)
     # The repo starts out empty.
     self.assertEqual(repo.getNumDatasets(), 0)
     self.assertEqual(repo.getDatasets(), [])
     repo.addDataset(dataset)
     # Every lookup path must now return the dataset we added.
     self.assertEqual(repo.getNumDatasets(), 1)
     self.assertEqual(repo.getDatasets(), [dataset])
     self.assertEqual(repo.getDatasetByIndex(0), dataset)
     self.assertEqual(repo.getDatasetByName(name), dataset)
     self.assertEqual(repo.getDataset(dataset.getId()), dataset)
Example #12
0
 def testAddMultipleDatasets(self):
     """
     Two datasets added in order must be retrievable by index, by
     name and by id, preserving insertion order.
     """
     repo = self._dataRepo
     names = ["ds1", "ds2"]
     added = [datasets.Dataset(name) for name in names]
     # The repo starts out empty.
     self.assertEqual(repo.getNumDatasets(), 0)
     self.assertEqual(repo.getDatasets(), [])
     for dataset in added:
         repo.addDataset(dataset)
     self.assertEqual(repo.getNumDatasets(), 2)
     self.assertEqual(repo.getDatasets(), added)
     # Every lookup path must return the matching dataset.
     for index, dataset in enumerate(added):
         self.assertEqual(repo.getDatasetByIndex(index), dataset)
         self.assertEqual(repo.getDatasetByName(names[index]), dataset)
         self.assertEqual(repo.getDataset(dataset.getId()), dataset)
Example #13
0
 def getDataModelInstance(self, localId, dataPath):
     """
     Builds an HtslibVariantSet from dataPath and returns its first
     variant annotation set when the data is annotated, otherwise the
     variant set itself.
     """
     variantSet = variants.HtslibVariantSet(
         datasets.Dataset("ds"), localId)
     variantSet.populateFromDirectory(dataPath)
     variantSet.setReferenceSet(references.AbstractReferenceSet("rs"))
     if not variantSet.isAnnotated():
         return variantSet
     # Annotated sets need the sequence ontology attached before use.
     ontology = ontologies.Ontology(paths.ontologyName)
     ontology.populateFromFile(paths.ontologyPath)
     annotationSet = variantSet.getVariantAnnotationSets()[0]
     annotationSet.setOntology(ontology)
     return annotationSet
Example #14
0
 def __init__(self, featureSetLocalName, dataPath):
     """
     Opens the test data repository and prepares the fixture for one
     GFF3-derived feature set.

     :param featureSetLocalName: name of the GFF3 resource
         corresponding to a pair of files, .db and .gff3; the trailing
         '.db' is stripped to form the feature set's local name.
     :param dataPath: string representing full path to the .db file
     :return:
     """
     self._dataset = datasets.Dataset(_datasetName)
     self._repo = datarepo.SqlDataRepository(paths.testDataRepo)
     self._repo.open(datarepo.MODE_READ)
     self._ontology = self._repo.getOntologyByName(paths.ontologyName)
     self._referenceSet = references.AbstractReferenceSet("test_rs")
     featureSetLocalName = featureSetLocalName[:-3]  # remove '.db'
     # Expected results for this feature set, keyed by its local name.
     self._testData = _testDataForFeatureSetName[featureSetLocalName]
     super(FeatureSetTests, self).__init__(featureSetLocalName, dataPath)
Example #15
0
 def _createVariantAnnotationSet(self, vcfDir):
     """
     Creates a VariantAnnotationSet from the specified directory of
     VCF files.
     """
     self._variantSetName = "testVariantSet"
     # The repo is opened read-only; it supplies the ontology below.
     self._repo = datarepo.SqlDataRepository(paths.testDataRepo)
     self._repo.open(datarepo.MODE_READ)
     self._dataset = datasets.Dataset("testDs")
     self._variantSet = variants.HtslibVariantSet(
         self._dataset, self._variantSetName)
     self._variantSet.populateFromDirectory(vcfDir)
     # The annotation set wraps the variant set and needs the sequence
     # ontology before annotations can be served.
     self._variantAnnotationSet = variants.HtslibVariantAnnotationSet(
         self._variantSet, "testVAs")
     self._variantAnnotationSet.setOntology(
         self._repo.getOntologyByName(paths.ontologyName))
Example #16
0
    def createRepo(self):
        """
        Creates the repository for all the data we've just downloaded.

        Builds, in order: a reference set from the subset FASTA, a
        dataset, a variant set from the downloaded VCFs, and one read
        group set per sample BAM; then commits, closes and reopens the
        repo read-only to print a summary.
        """
        repo = datarepo.SqlDataRepository(self.repoPath)
        repo.open("w")
        repo.initialise()

        # Reference set: the subset FASTA, tagged as human (taxon 9606).
        referenceSet = references.HtslibReferenceSet("GRCh37-subset")
        referenceSet.populateFromFile(self.fastaFilePath)
        referenceSet.setDescription("Subset of GRCh37 used for demonstration")
        referenceSet.setNcbiTaxonId(9606)
        for reference in referenceSet.getReferences():
            reference.setNcbiTaxonId(9606)
            # ".subset" marks the accession as a trimmed demo sequence.
            reference.setSourceAccessions(
                self.accessions[reference.getName()] + ".subset")
        repo.insertReferenceSet(referenceSet)

        dataset = datasets.Dataset("1kg-p3-subset")
        dataset.setDescription("Sample data from 1000 Genomes phase 3")
        repo.insertDataset(dataset)

        variantSet = variants.HtslibVariantSet(dataset, "mvncall")
        variantSet.setReferenceSet(referenceSet)
        # self.vcfFilePaths holds (vcfFile, indexFile) pairs.
        dataUrls = [vcfFile for vcfFile, _ in self.vcfFilePaths]
        indexFiles = [indexFile for _, indexFile in self.vcfFilePaths]
        variantSet.populateFromFile(dataUrls, indexFiles)
        variantSet.checkConsistency()
        repo.insertVariantSet(variantSet)

        # One read group set per sample BAM/index pair.
        for sample, (bamFile, indexFile) in zip(self.samples,
                                                self.bamFilePaths):
            readGroupSet = reads.HtslibReadGroupSet(dataset, sample)
            readGroupSet.populateFromFile(bamFile, indexFile)
            readGroupSet.setReferenceSet(referenceSet)
            repo.insertReadGroupSet(readGroupSet)

        repo.commit()
        repo.close()
        self.log("Finished creating the repository; summary:\n")
        # Reopen read-only just to print a summary of what was written.
        repo.open("r")
        repo.printSummary()
Example #17
0
 def testToProtocolElement(self):
     """
     Round-trips a valid BioSample through JSON and the datamodel,
     then checks that malformed JSON is rejected.
     """
     dataset = datasets.Dataset('dataset1')
     # Write out a valid input
     validBioSample = protocol.BioSample(name="test",
                                         created="2016-05-19T21:00:19Z",
                                         updated="2016-05-19T21:00:19Z")
     validBioSample.info['test'].values.add().string_value = 'test-info'
     # pass through protocol creation
     bioSample = bioMetadata.BioSample(dataset, "test")
     bioSample.populateFromJson(protocol.toJson(validBioSample))
     gaBioSample = bioSample.toProtocolElement()
     # Verify elements exist
     self.assertEqual(gaBioSample.created, validBioSample.created)
     self.assertEqual(gaBioSample.updated, validBioSample.updated)
     # Invalid input
     invalidBioSample = '{"bad:", "json"}'
     # Fixed copy-paste bug: this is a BioSample test, so the
     # invalid-JSON check must exercise BioSample, not Individual.
     bioSample = bioMetadata.BioSample(dataset, "test")
     # Should fail
     self.assertRaises(exceptions.InvalidJsonException,
                       bioSample.populateFromJson, invalidBioSample)
 def setUp(self):
     """
     Creates an AbstractFeatureSet attached to a fresh dataset as the
     fixture for each test.
     """
     self._featureSetName = "testFeatureSet"
     self._dataset = datasets.Dataset("test_ds")
     self._featureSet = features.AbstractFeatureSet(self._dataset,
                                                    self._featureSetName)
Example #19
0
 def setUp(self):
     """
     Creates an AbstractVariantSet attached to a fresh dataset as the
     fixture for each test.
     """
     self._variantSetName = "testVariantSet"
     self._dataset = datasets.Dataset("datasetId")
     self._variantSet = variants.AbstractVariantSet(self._dataset,
                                                    self._variantSetName)
Example #20
0
 def setUp(self):
     """
     Points the fixture at the deliberately faulty variant test data.
     """
     self.testDataDir = "tests/faultydata/variants"
     self.dataset = datasets.Dataset('dataset1')
    def run(self):
        """
        Builds the brca1 compliance data repository.

        Populates, in order: a reference set from the brca1 FASTA, a
        dataset, individuals and biosamples from JSON sidecar files,
        read group sets converted from SAM to indexed BAM, a sequence
        ontology, variant sets, GFF3 and G2P feature sets, a phenotype
        association set and an RNA quantification set.
        """
        if not os.path.exists(self.outputDirectory):
            os.makedirs(self.outputDirectory)
        self.repo.open("w")
        self.repo.initialise()

        # Copy the FASTA into the output directory and bgzip it so
        # htslib can index it.
        referenceFileName = "ref_brca1.fa"
        inputRef = os.path.join(self.inputDirectory, referenceFileName)
        outputRef = os.path.join(self.outputDirectory, referenceFileName)
        shutil.copy(inputRef, outputRef)
        fastaFilePath = os.path.join(self.outputDirectory,
                                     referenceFileName + '.gz')
        pysam.tabix_compress(outputRef, fastaFilePath)

        # Reference and reference-set metadata live in JSON sidecars.
        with open(os.path.join(self.inputDirectory,
                               "ref_brca1.json")) as refMetadataFile:
            refMetadata = json.load(refMetadataFile)
        with open(os.path.join(self.inputDirectory,
                               "referenceset_hg37.json")) as refMetadataFile:
            refSetMetadata = json.load(refMetadataFile)

        referenceSet = references.HtslibReferenceSet(
            refSetMetadata['assemblyId'])

        referenceSet.populateFromFile(fastaFilePath)
        referenceSet.setAssemblyId(refSetMetadata['assemblyId'])
        referenceSet.setDescription(refSetMetadata['description'])
        referenceSet.setNcbiTaxonId(refSetMetadata['ncbiTaxonId'])
        referenceSet.setIsDerived(refSetMetadata['isDerived'])
        referenceSet.setSourceUri(refSetMetadata['sourceUri'])
        referenceSet.setSourceAccessions(refSetMetadata['sourceAccessions'])
        for reference in referenceSet.getReferences():
            reference.setNcbiTaxonId(refMetadata['ncbiTaxonId'])
            reference.setSourceAccessions(refMetadata['sourceAccessions'])
        self.repo.insertReferenceSet(referenceSet)

        dataset = datasets.Dataset("brca1")
        # Some info is set, it isn't important what
        dataset.setInfo({"version": ga4gh.__version__})
        self.repo.insertDataset(dataset)

        # One Individual and one BioSample per sample, populated from
        # JSON files; each biosample is linked to its individual's id.
        hg00096Individual = biodata.Individual(dataset, "HG00096")
        with open(os.path.join(self.inputDirectory,
                               "individual_HG00096.json")) as jsonString:
            hg00096Individual.populateFromJson(jsonString.read())
        self.repo.insertIndividual(hg00096Individual)
        hg00096BioSample = biodata.BioSample(dataset, "HG00096")
        with open(os.path.join(self.inputDirectory,
                               "bioSample_HG00096.json")) as jsonString:
            hg00096BioSample.populateFromJson(jsonString.read())
        hg00096BioSample.setIndividualId(hg00096Individual.getId())
        self.repo.insertBioSample(hg00096BioSample)
        hg00099Individual = biodata.Individual(dataset, "HG00099")
        with open(os.path.join(self.inputDirectory,
                               "individual_HG00099.json")) as jsonString:
            hg00099Individual.populateFromJson(jsonString.read())
        self.repo.insertIndividual(hg00099Individual)
        hg00099BioSample = biodata.BioSample(dataset, "HG00099")
        with open(os.path.join(self.inputDirectory,
                               "bioSample_HG00099.json")) as jsonString:
            hg00099BioSample.populateFromJson(jsonString.read())
        hg00099BioSample.setIndividualId(hg00099Individual.getId())
        self.repo.insertBioSample(hg00099BioSample)
        hg00101Individual = biodata.Individual(dataset, "HG00101")
        with open(os.path.join(self.inputDirectory,
                               "individual_HG00101.json")) as jsonString:
            hg00101Individual.populateFromJson(jsonString.read())
        self.repo.insertIndividual(hg00101Individual)
        hg00101BioSample = biodata.BioSample(dataset, "HG00101")
        with open(os.path.join(self.inputDirectory,
                               "bioSample_HG00101.json")) as jsonString:
            hg00101BioSample.populateFromJson(jsonString.read())
        hg00101BioSample.setIndividualId(hg00101Individual.getId())
        self.repo.insertBioSample(hg00101BioSample)

        readFiles = [
            "brca1_HG00096.sam", "brca1_HG00099.sam", "brca1_HG00101.sam"
        ]

        # Convert each SAM to an indexed BAM, then build a read group
        # set and link each read group to its biosample by sample name.
        for readFile in readFiles:
            # e.g. "brca1_HG00096.sam" -> "HG00096"
            name = readFile.split('_')[1].split('.')[0]
            readSrc = pysam.AlignmentFile(
                os.path.join(self.inputDirectory, readFile), "r")
            readDest = pysam.AlignmentFile(os.path.join(
                self.outputDirectory, name + ".bam"),
                                           "wb",
                                           header=readSrc.header)
            destFilePath = readDest.filename
            for readData in readSrc:
                readDest.write(readData)
            readDest.close()
            readSrc.close()
            pysam.index(destFilePath)
            readGroupSet = reads.HtslibReadGroupSet(dataset, name)
            readGroupSet.populateFromFile(destFilePath, destFilePath + ".bai")
            readGroupSet.setReferenceSet(referenceSet)
            dataset.addReadGroupSet(readGroupSet)
            bioSamples = [hg00096BioSample, hg00099BioSample, hg00101BioSample]
            for readGroup in readGroupSet.getReadGroups():
                for bioSample in bioSamples:
                    if bioSample.getLocalId() == readGroup.getSampleName():
                        readGroup.setBioSampleId(bioSample.getId())
            self.repo.insertReadGroupSet(readGroupSet)

        # Copy the sequence ontology OBO file alongside the output data.
        ontologyMapFileName = "so-xp-simple.obo"
        inputOntologyMap = os.path.join(self.inputDirectory,
                                        ontologyMapFileName)
        outputOntologyMap = os.path.join(self.outputDirectory,
                                         ontologyMapFileName)
        shutil.copy(inputOntologyMap, outputOntologyMap)

        sequenceOntology = ontologies.Ontology("so-xp-simple")
        sequenceOntology.populateFromFile(outputOntologyMap)
        sequenceOntology._id = "so-xp-simple"
        self.repo.insertOntology(sequenceOntology)
        self.repo.addOntology(sequenceOntology)

        vcfFiles = [
            "brca1_1kgPhase3_variants.vcf", "brca1_WASH7P_annotation.vcf",
            "brca1_OR4F_annotation.vcf"
        ]
        # NOTE(review): bioSamples is defined inside the readFiles loop
        # above and relied upon here via loop-variable leakage; if
        # readFiles were empty this would raise NameError.
        for vcfFile in vcfFiles:
            self.addVariantSet(vcfFile, dataset, referenceSet,
                               sequenceOntology, bioSamples)

        # Sequence annotations
        seqAnnFile = "brca1_gencodev19.gff3"
        seqAnnSrc = os.path.join(self.inputDirectory, seqAnnFile)
        seqAnnDest = os.path.join(self.outputDirectory, "gencodev19.db")
        dbgen = generate_gff3_db.Gff32Db(seqAnnSrc, seqAnnDest)
        dbgen.run()
        gencode = sequence_annotations.Gff3DbFeatureSet(dataset, "gencodev19")
        gencode.setOntology(sequenceOntology)
        gencode.populateFromFile(seqAnnDest)
        gencode.setReferenceSet(referenceSet)

        self.repo.insertFeatureSet(gencode)

        # add g2p featureSet
        g2pPath = os.path.join(self.inputDirectory, "cgd")
        # copy all files input directory to output path
        outputG2PPath = os.path.join(self.outputDirectory, "cgd")
        os.makedirs(outputG2PPath)
        for filename in glob.glob(os.path.join(g2pPath, '*.*')):
            shutil.copy(filename, outputG2PPath)

        featuresetG2P = g2p_featureset.PhenotypeAssociationFeatureSet(
            dataset, outputG2PPath)
        featuresetG2P.setOntology(sequenceOntology)
        featuresetG2P.setReferenceSet(referenceSet)
        featuresetG2P.populateFromFile(outputG2PPath)
        self.repo.insertFeatureSet(featuresetG2P)

        # add g2p phenotypeAssociationSet
        phenotypeAssociationSet = g2p_associationset\
            .RdfPhenotypeAssociationSet(dataset, "cgd", outputG2PPath)
        self.repo.insertPhenotypeAssociationSet(phenotypeAssociationSet)

        self.repo.commit()
        # NOTE(review): this in-memory registration happens after the
        # commit above — confirm that ordering is intentional.
        dataset.addFeatureSet(gencode)

        # RNA Quantification
        rnaDbName = os.path.join(self.outputDirectory, "rnaseq.db")
        store = rnaseq2ga.RnaSqliteStore(rnaDbName)
        store.createTables()
        rnaseq2ga.rnaseq2ga(self.inputDirectory + "/rna_brca1.tsv",
                            rnaDbName,
                            "rna_brca1.tsv",
                            "rsem",
                            featureType="transcript",
                            readGroupSetNames="HG00096",
                            featureSetNames="gencodev19",
                            dataset=dataset)
        rnaQuantificationSet = rna_quantification.SqliteRnaQuantificationSet(
            dataset, "rnaseq")
        rnaQuantificationSet.setReferenceSet(referenceSet)
        rnaQuantificationSet.populateFromFile(rnaDbName)
        self.repo.insertRnaQuantificationSet(rnaQuantificationSet)

        self.repo.commit()
Example #22
0
 def getDataset(self):
     """Returns a fresh Dataset named "dataset" for use by tests."""
     freshDataset = datasets.Dataset("dataset")
     return freshDataset
    def run(self):
        """
        Converts the brca1 compliance input data into a repository.

        Populates, in order: a reference set from the brca1 FASTA, a
        dataset, individuals and biosamples from JSON sidecar files,
        read group sets converted from SAM to indexed BAM, a sequence
        ontology, variant sets and a GFF3 feature set; then commits.
        """
        if not os.path.exists(self.outputDirectory):
            os.makedirs(self.outputDirectory)
        self.repo.open("w")
        self.repo.initialise()

        # Copy the FASTA into the output directory and bgzip it so
        # htslib can index it.
        referenceFileName = "ref_brca1.fa"
        inputRef = os.path.join(
            self.inputDirectory, referenceFileName)
        outputRef = os.path.join(
            self.outputDirectory, referenceFileName)
        shutil.copy(inputRef, outputRef)
        fastaFilePath = os.path.join(
            self.outputDirectory,
            referenceFileName + '.gz')
        pysam.tabix_compress(
            outputRef, fastaFilePath)

        # Reference and reference-set metadata live in JSON sidecars.
        with open(
                os.path.join(
                    self.inputDirectory, "ref_brca1.json")) as refMetadataFile:
            refMetadata = json.load(refMetadataFile)
        with open(
                os.path.join(
                    self.inputDirectory,
                    "referenceset_hg37.json")) as refMetadataFile:
            refSetMetadata = json.load(refMetadataFile)

        referenceSet = references.HtslibReferenceSet(
            refSetMetadata['assemblyId'])

        referenceSet.populateFromFile(fastaFilePath)
        referenceSet.setAssemblyId(refSetMetadata['assemblyId'])
        referenceSet.setDescription(refSetMetadata['description'])
        referenceSet.setNcbiTaxonId(refSetMetadata['ncbiTaxonId'])
        referenceSet.setIsDerived(refSetMetadata['isDerived'])
        referenceSet.setSourceUri(refSetMetadata['sourceUri'])
        referenceSet.setSourceAccessions(refSetMetadata['sourceAccessions'])
        for reference in referenceSet.getReferences():
            reference.setNcbiTaxonId(refMetadata['ncbiTaxonId'])
            reference.setSourceAccessions(
                refMetadata['sourceAccessions'])
        self.repo.insertReferenceSet(referenceSet)

        dataset = datasets.Dataset("brca1")
        self.repo.insertDataset(dataset)

        # One Individual and one BioSample per sample, populated from
        # JSON files; each biosample is linked to its individual's id.
        hg00096Individual = biodata.Individual(dataset, "HG00096")
        with open(
                os.path.join(
                    self.inputDirectory,
                    "individual_HG00096.json")) as jsonString:
            hg00096Individual.populateFromJson(jsonString.read())
        self.repo.insertIndividual(hg00096Individual)
        hg00096BioSample = biodata.BioSample(dataset, "HG00096")
        with open(
                os.path.join(
                    self.inputDirectory,
                    "bioSample_HG00096.json")) as jsonString:
            hg00096BioSample.populateFromJson(jsonString.read())
        hg00096BioSample.setIndividualId(hg00096Individual.getId())
        self.repo.insertBioSample(hg00096BioSample)
        hg00099Individual = biodata.Individual(dataset, "HG00099")
        with open(
                os.path.join(
                    self.inputDirectory,
                    "individual_HG00099.json")) as jsonString:
            hg00099Individual.populateFromJson(jsonString.read())
        self.repo.insertIndividual(hg00099Individual)
        hg00099BioSample = biodata.BioSample(dataset, "HG00099")
        with open(
                os.path.join(
                    self.inputDirectory,
                    "bioSample_HG00099.json")) as jsonString:
            hg00099BioSample.populateFromJson(jsonString.read())
        hg00099BioSample.setIndividualId(hg00099Individual.getId())
        self.repo.insertBioSample(hg00099BioSample)
        hg00101Individual = biodata.Individual(dataset, "HG00101")
        with open(
                os.path.join(
                    self.inputDirectory,
                    "individual_HG00101.json")) as jsonString:
            hg00101Individual.populateFromJson(jsonString.read())
        self.repo.insertIndividual(hg00101Individual)
        hg00101BioSample = biodata.BioSample(dataset, "HG00101")
        with open(
                os.path.join(
                    self.inputDirectory,
                    "bioSample_HG00101.json")) as jsonString:
            hg00101BioSample.populateFromJson(jsonString.read())
        hg00101BioSample.setIndividualId(hg00101Individual.getId())
        self.repo.insertBioSample(hg00101BioSample)

        readFiles = [
            "brca1_HG00096.sam",
            "brca1_HG00099.sam",
            "brca1_HG00101.sam"]

        # Convert each SAM to an indexed BAM, then build a read group
        # set and link each read group to its biosample by sample name.
        for readFile in readFiles:
            # e.g. "brca1_HG00096.sam" -> "HG00096"
            name = readFile.split('_')[1].split('.')[0]
            readSrc = pysam.AlignmentFile(
                os.path.join(self.inputDirectory, readFile), "r")
            readDest = pysam.AlignmentFile(
                os.path.join(
                    self.outputDirectory,
                    name + ".bam"),
                "wb", header=readSrc.header)
            destFilePath = readDest.filename
            for readData in readSrc:
                readDest.write(readData)
            readDest.close()
            readSrc.close()
            pysam.index(destFilePath)
            readGroupSet = reads.HtslibReadGroupSet(dataset, name)
            readGroupSet.populateFromFile(destFilePath, destFilePath + ".bai")
            readGroupSet.setReferenceSet(referenceSet)
            bioSamples = [hg00096BioSample, hg00099BioSample, hg00101BioSample]
            for readGroup in readGroupSet.getReadGroups():
                for bioSample in bioSamples:
                    if bioSample.getLocalId() == readGroup.getSampleName():
                        readGroup.setBioSampleId(bioSample.getId())
            self.repo.insertReadGroupSet(readGroupSet)

        # Copy the sequence ontology OBO file alongside the output data.
        ontologyMapFileName = "so-xp-simple.obo"
        inputOntologyMap = os.path.join(
            self.inputDirectory, ontologyMapFileName)
        outputOntologyMap = os.path.join(
            self.outputDirectory, ontologyMapFileName)
        shutil.copy(inputOntologyMap, outputOntologyMap)

        sequenceOntology = ontologies.Ontology("so-xp-simple")
        sequenceOntology.populateFromFile(outputOntologyMap)
        sequenceOntology._id = "so-xp-simple"
        self.repo.insertOntology(sequenceOntology)
        self.repo.addOntology(sequenceOntology)

        vcfFiles = [
            "brca1_1kgPhase3_variants.vcf",
            "brca1_WASH7P_annotation.vcf",
            "brca1_OR4F_annotation.vcf"]
        # NOTE(review): bioSamples is defined inside the readFiles loop
        # above and relied upon here via loop-variable leakage; if
        # readFiles were empty this would raise NameError.
        for vcfFile in vcfFiles:
            self.addVariantSet(
                vcfFile,
                dataset,
                referenceSet,
                sequenceOntology,
                bioSamples)

        # GFF3 sequence annotations: build the feature database, then
        # wrap it as a feature set.
        seqAnnFile = "brca1_gencodev19.gff3"
        seqAnnSrc = os.path.join(self.inputDirectory, seqAnnFile)
        seqAnnDest = os.path.join(self.outputDirectory, "gencodev19.db")
        dbgen = generate_gff3_db.Gff32Db(seqAnnSrc, seqAnnDest)
        dbgen.run()
        gencode = sequenceAnnotations.Gff3DbFeatureSet(dataset, "gencodev19")
        gencode.setOntology(sequenceOntology)
        gencode.populateFromFile(seqAnnDest)
        gencode.setReferenceSet(referenceSet)

        self.repo.insertFeatureSet(gencode)

        self.repo.commit()

        print("Done converting compliance data.", file=sys.stderr)
 def setUp(self):
     """
     Creates an AbstractFeatureSet attached to a fresh dataset as the
     fixture for each test.
     """
     self._featureSetName = "testFeatureSet"
     self._dataset = datasets.Dataset("test_ds")
     self._featureSet = sequence_annotations.AbstractFeatureSet(
         self._dataset, self._featureSetName)
 def __init__(self, localId, baseDir):
     """
     Builds the phenotype association set under test from the data in
     baseDir.

     :param localId: local id forwarded to the base class.
     :param baseDir: directory name forwarded to the base class.
     """
     self._dataset = datasets.Dataset("ds")
     super(PhenotypeAssociationSetTest, self).__init__(localId, baseDir)
     # NOTE(review): attribute name misspells "Association"; kept as-is
     # because external code may reference it.
     self.phenotypeAssocationSet = self.getDataModelInstance(
         localId, baseDir)