Beispiel #1
0
 def _readBiosampleTable(self):
     for biosampleRecord in m.Biosample.select():
         dataset = self.getDataset(biosampleRecord.datasetid.id)
         biosample = biodata.Biosample(
             dataset, biosampleRecord.name)
         biosample.populateFromRow(biosampleRecord)
         assert biosample.getId() == biosampleRecord.id
         dataset.addBiosample(biosample)
Beispiel #2
0
 def addBiosample(self):
     """
     Adds a new biosample into this repo
     """
     self._openRepo()
     dataset = self._repo.getDatasetByName(self._args.datasetName)
     biosample = bio_metadata.Biosample(
         dataset, self._args.biosampleName)
     biosample.populateFromJson(self._args.biosample)
     self._updateRepo(self._repo.insertBiosample, biosample)
 def testToProtocolElement(self):
     dataset = datasets.Dataset('dataset1')
     # Write out a valid input
     validBiosample = protocol.Biosample(name="test",
                                         created="2016-05-19T21:00:19Z",
                                         updated="2016-05-19T21:00:19Z")
     validBiosample.attributes.attr['test']. \
         values.add().string_value = 'test-info'
     # pass through protocol creation
     biosample = bioMetadata.Biosample(dataset, "test")
     biosample.populateFromJson(protocol.toJson(validBiosample))
     gaBiosample = biosample.toProtocolElement()
     # Verify elements exist
     self.assertEqual(gaBiosample.created, validBiosample.created)
     self.assertEqual(gaBiosample.updated, validBiosample.updated)
     # Invalid input
     invalidBiosample = '{"bad:", "json"}'
     biosample = bioMetadata.Individual(dataset, "test")
     # Should fail
     self.assertRaises(exceptions.InvalidJsonException,
                       biosample.populateFromJson, invalidBiosample)
Beispiel #4
0
    def run(self):
        if not os.path.exists(self.outputDirectory):
            os.makedirs(self.outputDirectory)
        self.repo.open("w")
        self.repo.initialise()

        referenceFileName = "ref_brca1.fa"
        inputRef = os.path.join(self.inputDirectory, referenceFileName)
        outputRef = os.path.join(self.outputDirectory, referenceFileName)
        shutil.copy(inputRef, outputRef)
        fastaFilePath = os.path.join(self.outputDirectory,
                                     referenceFileName + '.gz')
        pysam.tabix_compress(outputRef, fastaFilePath)

        with open(os.path.join(self.inputDirectory,
                               "ref_brca1.json")) as refMetadataFile:
            refMetadata = json.load(refMetadataFile)
        with open(os.path.join(self.inputDirectory,
                               "referenceset_hg37.json")) as refMetadataFile:
            refSetMetadata = json.load(refMetadataFile)

        referenceSet = references.HtslibReferenceSet(
            refSetMetadata['assemblyId'])

        referenceSet.populateFromFile(os.path.abspath(fastaFilePath))
        referenceSet.setAssemblyId(refSetMetadata['assemblyId'])
        referenceSet.setDescription(refSetMetadata['description'])
        if refSetMetadata['species']:
            speciesJson = json.dumps(refSetMetadata['species'])
            referenceSet.setSpeciesFromJson(speciesJson)  # needs a string
        referenceSet.setIsDerived(refSetMetadata['isDerived'])
        referenceSet.setSourceUri(refSetMetadata['sourceUri'])
        referenceSet.setSourceAccessions(refSetMetadata['sourceAccessions'])
        for reference in referenceSet.getReferences():
            if refSetMetadata['species']:
                speciesJsonStr = json.dumps(refMetadata['species'])
                reference.setSpeciesFromJson(speciesJsonStr)
            reference.setSourceAccessions(refMetadata['sourceAccessions'])
        self.repo.insertReferenceSet(referenceSet)

        dataset = datasets.Dataset("brca1")
        # Some info is set, it isn't important what
        dataset.setAttributes({"version": ga4gh.server.__version__})
        self.repo.insertDataset(dataset)

        hg00096Individual = biodata.Individual(dataset, "HG00096")
        with open(os.path.join(self.inputDirectory,
                               "individual_HG00096.json")) as jsonString:
            hg00096Individual.populateFromJson(jsonString.read())
        self.repo.insertIndividual(hg00096Individual)
        hg00096Biosample = biodata.Biosample(dataset, "HG00096")
        with open(os.path.join(self.inputDirectory,
                               "biosample_HG00096.json")) as jsonString:
            hg00096Biosample.populateFromJson(jsonString.read())
        hg00096Biosample.setIndividualId(hg00096Individual.getId())
        self.repo.insertBiosample(hg00096Biosample)
        hg00099Individual = biodata.Individual(dataset, "HG00099")
        with open(os.path.join(self.inputDirectory,
                               "individual_HG00099.json")) as jsonString:
            hg00099Individual.populateFromJson(jsonString.read())
        self.repo.insertIndividual(hg00099Individual)
        hg00099Biosample = biodata.Biosample(dataset, "HG00099")
        with open(os.path.join(self.inputDirectory,
                               "biosample_HG00099.json")) as jsonString:
            hg00099Biosample.populateFromJson(jsonString.read())
        hg00099Biosample.setIndividualId(hg00099Individual.getId())
        self.repo.insertBiosample(hg00099Biosample)
        hg00101Individual = biodata.Individual(dataset, "HG00101")
        with open(os.path.join(self.inputDirectory,
                               "individual_HG00101.json")) as jsonString:
            hg00101Individual.populateFromJson(jsonString.read())
        self.repo.insertIndividual(hg00101Individual)
        hg00101Biosample = biodata.Biosample(dataset, "HG00101")
        with open(os.path.join(self.inputDirectory,
                               "biosample_HG00101.json")) as jsonString:
            hg00101Biosample.populateFromJson(jsonString.read())
        hg00101Biosample.setIndividualId(hg00101Individual.getId())
        self.repo.insertBiosample(hg00101Biosample)
        readFiles = [
            "brca1_HG00096.sam", "brca1_HG00099.sam", "brca1_HG00101.sam"
        ]

        for readFile in readFiles:
            name = readFile.split('_')[1].split('.')[0]
            readSrc = pysam.AlignmentFile(
                os.path.join(self.inputDirectory, readFile), "r")
            readDest = pysam.AlignmentFile(os.path.join(
                self.outputDirectory, name + ".bam"),
                                           "wb",
                                           header=readSrc.header)
            destFilePath = readDest.filename
            for readData in readSrc:
                readDest.write(readData)
            readDest.close()
            readSrc.close()
            pysam.index(destFilePath)
            readGroupSet = reads.HtslibReadGroupSet(dataset, name)
            readGroupSet.populateFromFile(
                os.path.abspath(destFilePath),
                os.path.abspath(destFilePath + ".bai"))
            readGroupSet.setReferenceSet(referenceSet)
            dataset.addReadGroupSet(readGroupSet)
            biosamples = [hg00096Biosample, hg00099Biosample, hg00101Biosample]
            for readGroup in readGroupSet.getReadGroups():
                for biosample in biosamples:
                    if biosample.getLocalId() == readGroup.getSampleName():
                        readGroup.setBiosampleId(biosample.getId())
            self.repo.insertReadGroupSet(readGroupSet)

        ontologyMapFileName = "so-xp-simple.obo"
        inputOntologyMap = os.path.join(self.inputDirectory,
                                        ontologyMapFileName)
        outputOntologyMap = os.path.join(self.outputDirectory,
                                         ontologyMapFileName)
        shutil.copy(inputOntologyMap, outputOntologyMap)

        sequenceOntology = ontologies.Ontology("so-xp-simple")
        sequenceOntology.populateFromFile(os.path.abspath(outputOntologyMap))
        sequenceOntology._id = "so-xp-simple"
        self.repo.insertOntology(sequenceOntology)
        self.repo.addOntology(sequenceOntology)

        vcfFiles = [
            "brca1_1kgPhase3_variants.vcf", "brca1_WASH7P_annotation.vcf",
            "brca1_OR4F_annotation.vcf"
        ]
        for vcfFile in vcfFiles:
            self.addVariantSet(vcfFile, dataset, referenceSet,
                               sequenceOntology, biosamples)

        # Sequence annotations
        seqAnnFile = "brca1_gencodev19.gff3"
        seqAnnSrc = os.path.join(self.inputDirectory, seqAnnFile)
        seqAnnDest = os.path.join(self.outputDirectory, "gencodev19.db")
        dbgen = generate_gff3_db.Gff32Db(seqAnnSrc, seqAnnDest)
        dbgen.run()
        gencode = sequence_annotations.Gff3DbFeatureSet(dataset, "gencodev19")
        gencode.setOntology(sequenceOntology)
        gencode.populateFromFile(os.path.abspath(seqAnnDest))
        gencode.setReferenceSet(referenceSet)

        self.repo.insertFeatureSet(gencode)

        # add g2p featureSet
        g2pPath = os.path.join(self.inputDirectory, "cgd")
        # copy all files input directory to output path
        outputG2PPath = os.path.join(self.outputDirectory, "cgd")
        os.makedirs(outputG2PPath)
        for filename in glob.glob(os.path.join(g2pPath, '*.*')):
            shutil.copy(filename, outputG2PPath)

        featuresetG2P = g2p_featureset.PhenotypeAssociationFeatureSet(
            dataset, os.path.abspath(outputG2PPath))
        featuresetG2P.setOntology(sequenceOntology)
        featuresetG2P.setReferenceSet(referenceSet)
        featuresetG2P.populateFromFile(os.path.abspath(outputG2PPath))
        self.repo.insertFeatureSet(featuresetG2P)

        # add g2p phenotypeAssociationSet
        phenotypeAssociationSet = \
            g2p_associationset.RdfPhenotypeAssociationSet(
                dataset, "cgd", os.path.abspath(outputG2PPath))
        self.repo.insertPhenotypeAssociationSet(phenotypeAssociationSet)

        dataset.addFeatureSet(gencode)

        # RNA Quantification
        rnaDbName = os.path.join(self.outputDirectory, "rnaseq.db")
        store = rnaseq2ga.RnaSqliteStore(rnaDbName)
        store.createTables()
        rnaseq2ga.rnaseq2ga(self.inputDirectory + "/rna_brca1.tsv",
                            rnaDbName,
                            "rna_brca1.tsv",
                            "rsem",
                            featureType="transcript",
                            readGroupSetNames="HG00096",
                            dataset=dataset,
                            featureSetNames="gencodev19",
                            biosampleId=hg00096Biosample.getId())
        rnaQuantificationSet = rna_quantification.SqliteRnaQuantificationSet(
            dataset, "rnaseq")
        rnaQuantificationSet.setReferenceSet(referenceSet)
        rnaQuantificationSet.populateFromFile(os.path.abspath(rnaDbName))
        self.repo.insertRnaQuantificationSet(rnaQuantificationSet)
Beispiel #5
0
    def __init__(
            self, localId, referenceSet, randomSeed=0,
            numVariantSets=1, numCalls=1, variantDensity=0.5,
            numReadGroupSets=1, numReadGroupsPerReadGroupSet=1,
            numAlignments=1, numFeatureSets=1, numPhenotypeAssociationSets=1,
            numPhenotypeAssociations=2, numRnaQuantSets=2,
            numExpressionLevels=2):
        super(SimulatedDataset, self).__init__(localId)
        self._description = "Simulated dataset {}".format(localId)

        for i in range(numPhenotypeAssociationSets):
            localId = "simPas{}".format(i)
            seed = randomSeed + i
            phenotypeAssociationSet = g2p.SimulatedPhenotypeAssociationSet(
                self, localId, seed, numPhenotypeAssociations)
            self.addPhenotypeAssociationSet(phenotypeAssociationSet)

        # TODO create a simulated Ontology
        # Variants
        for i in range(numVariantSets):
            localId = "simVs{}".format(i)
            seed = randomSeed + i
            variantSet = variants.SimulatedVariantSet(
                self, referenceSet, localId, seed, numCalls, variantDensity)
            callSets = variantSet.getCallSets()
            # Add biosamples
            for callSet in callSets:
                biosample = biodata.Biosample(
                    self, callSet.getLocalId())
                biosample2 = biodata.Biosample(
                    self, callSet.getLocalId() + "2")
                individual = biodata.Individual(
                    self, callSet.getLocalId())
                biosample.setIndividualId(individual.getId())
                biosample2.setIndividualId(individual.getId())
                self.addIndividual(individual)
                self.addBiosample(biosample)
                self.addBiosample(biosample2)
            self.addVariantSet(variantSet)
            variantAnnotationSet = variants.SimulatedVariantAnnotationSet(
                variantSet, "simVas{}".format(i), seed)
            variantSet.addVariantAnnotationSet(variantAnnotationSet)
        # Reads
        for i in range(numReadGroupSets):
            localId = 'simRgs{}'.format(i)
            seed = randomSeed + i
            readGroupSet = reads.SimulatedReadGroupSet(
                self, localId, referenceSet, seed,
                numReadGroupsPerReadGroupSet, numAlignments)
            for rg in readGroupSet.getReadGroups():
                biosample = biodata.Biosample(
                    self, rg.getLocalId())
                individual = biodata.Individual(
                    self, rg.getLocalId())
                biosample.setIndividualId(individual.getId())
                rg.setBiosampleId(biosample.getId())
                self.addIndividual(individual)
                self.addBiosample(biosample)
            self.addReadGroupSet(readGroupSet)
        # Features
        for i in range(numFeatureSets):
            localId = "simFs{}".format(i)
            seed = randomSeed + i
            featureSet = sequence_annotations.SimulatedFeatureSet(
                self, localId, seed)
            featureSet.setReferenceSet(referenceSet)
            self.addFeatureSet(featureSet)
        # RnaQuantificationSets
        for i in range(numRnaQuantSets):
            localId = 'simRqs{}'.format(i)
            rnaQuantSet = rnaQuantification.SimulatedRnaQuantificationSet(
                self, localId)
            rnaQuantSet.setReferenceSet(referenceSet)
            self.addRnaQuantificationSet(rnaQuantSet)
Beispiel #6
0
def main():

    # Set for using hg38 rather than hg19
    # reference_set_path = '/mnt/ga4gh/repo_data/hg38.fa.gz'
    reference_set_path = '/mnt/ga4gh/repo_data/hs37d5.fa.gz'

    bio_tsv_location = 'SGDP_metadata.279public.21signedLetter.samples.Biosample.tsv'
    ind_tsv_location = 'SGDP_metadata.279public.21signedLetter.samples.individual.tsv'

    bio_samples = parse_file_biosamples(bio_tsv_location)
    individuals = parse_file_individuals(ind_tsv_location)
    repoPath = os.path.join("repo2.db")
    repo = datarepo.SqlDataRepository(repoPath)
    if (os.path.isfile("repo2.db") == True):
        os.system("rm repo2.db")
    repo.open("w")
    repo.initialise()

    dataset = datasets.Dataset("Simons")
    dataset.setDescription(
        "Variants from the Simons Foundation Genome Diversity Project")
    repo.insertDataset(dataset)

    print("Inserting biosamples")
    new_bio_samples = []
    for bio_sample in bio_samples:
        new_bio_sample = biodata.Biosample(
            dataset, unicode(bio_sample['name'], errors='replace'))
        new_bio_sample.populateFromJson(json.dumps(bio_sample))
        repo.insertBiosample(new_bio_sample)
        new_bio_samples.append(new_bio_sample)

    print("Inserting individuals")
    new_individuals = []
    for individual in individuals:
        new_individual = biodata.Individual(
            dataset, unicode(individual['name'], errors='replace'))
        new_individual.populateFromJson(json.dumps(individual))
        repo.insertIndividual(new_individual)
        new_individuals.append(new_individual)

    print("Adding reference set (takes a while)")
    reference_set = references.HtslibReferenceSet("NCBI37")
    reference_set.populateFromFile(reference_set_path)
    reference_set.setDescription("NCBI37 assembly of the human genome")
    reference_set.setNcbiTaxonId(9606)
    reference_set.setSourceUri(
        "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz"
    )
    for reference in reference_set.getReferences():
        reference.setNcbiTaxonId(9606)
    repo.insertReferenceSet(reference_set)

    seq_ontology = ontologies.Ontology("/mnt/ga4gh/repo_data/so-xp")
    ontology_file_path = '/mnt/ga4gh/repo_data/so-xp-simple.obo'
    seq_ontology.populateFromFile(ontology_file_path)
    seq_ontology._id = "so-xp"
    repo.insertOntology(seq_ontology)
    repo.addOntology(seq_ontology)

    vcf_directory = os.path.dirname('/mnt/ga4gh/data/vcf/')
    pattern = os.path.join(vcf_directory, "*.vcf.gz")
    for vcfFile in glob.glob(pattern):
        name = vcfFile.replace("/mnt/ga4gh/data/vcf/", "")
        name = name.replace(".annotated.nh2.variants.vcf.gz", "")
        print(name)
        variant_set = variants.HtslibVariantSet(dataset, name)
        variant_set.setReferenceSet(reference_set)
        variant_set.populateFromFile([vcfFile], [vcfFile + ".tbi"])
        variant_set.checkConsistency()
        for call_set in variant_set.getCallSets():
            for bio_sample in new_bio_samples:
                if bio_sample.getLocalId() == call_set.getLocalId():
                    call_set.setBioSampleId(bio_sample.getId())

        repo.insertVariantSet(variant_set)

        name = name + "-annotated-nh2"
        print(name)
        variant_set2 = variants.HtslibVariantSet(dataset, name)
        variant_set2.setReferenceSet(reference_set)
        variant_set2.populateFromFile([vcfFile], [vcfFile + ".tbi"])
        variant_set2.checkConsistency()
        repo.insertVariantSet(variant_set2)
        for annotation_set in variant_set2.getVariantAnnotationSets():
            print(str(annotation_set) + "found")
            annotation_set.setOntology(seq_ontology)
            repo.insertVariantAnnotationSet(annotation_set)

    repo.commit()
    print("database filled!")