def run(self):
    """
    Converts the compliance input files into the output directory layout.

    Copies the sequence ontology, rebuilds the reference set directory
    (one compressed FASTA plus one JSON metadata file per reference) and
    then, for every dataset, converts the SAM reads to indexed BAM,
    copies and tabix-indexes the variant files, and generates the
    sequence annotation databases.

    The sequence ontology, reference set and dataset directories are
    removed before being re-created, so the method can be run against an
    output directory that already holds a previous conversion.
    """
    if not os.path.exists(self.outputDirectory):
        os.makedirs(self.outputDirectory)

    # Export sequence ontologies
    print("Exporting ontologies...", file=sys.stderr)
    ontologiesDir = os.path.join(self.outputDirectory, "ontologymaps")
    sequenceOntologyDir = os.path.join(ontologiesDir, "sequence_ontology")
    # Clear any previous export first, as is done for the reference and
    # dataset directories below; without this os.makedirs raises when
    # the output directory already contains an earlier conversion.
    shutil.rmtree(sequenceOntologyDir, ignore_errors=True)
    os.makedirs(sequenceOntologyDir)
    shutil.copy(
        os.path.join(self.inputDirectory, "sequence_ontology.txt"),
        os.path.join(sequenceOntologyDir, "sequence_ontology.txt"))

    # Clean out, make and re-populate references directory
    # For now, assume a single, statically-named referenceSet
    utils.log("Converting references...")
    shutil.rmtree(self.refsetsDirectory, ignore_errors=True)
    os.makedirs(self.refsetsDirectory)
    shutil.copy(
        os.path.join(self.inputDirectory, "referenceset_hg37.json"),
        os.path.join(self.refsetsDirectory, "hg37.json"))

    os.makedirs(self.hg37Directory)
    for refFile in self.referenceFiles:
        refBase = os.path.splitext(refFile)[0]
        srcBase = os.path.join(self.inputDirectory, refBase)
        destBase = os.path.join(self.hg37Directory, refBase)
        destFastaFilename = destBase + ".fa"
        compressedFastaFilename = destFastaFilename + ".gz"
        shutil.copy(srcBase + ".fa", destFastaFilename)
        pysam.tabix_compress(destFastaFilename, compressedFastaFilename)
        # The handle is opened and closed without being read; presumably
        # this is relied on for pysam creating the FASTA index as a side
        # effect -- TODO confirm.
        refFasta = pysam.FastaFile(compressedFastaFilename)
        refFasta.close()
        # Only the compressed copy is kept in the output directory.
        os.remove(destFastaFilename)
        shutil.copy(srcBase + ".json", destBase + ".json")

    # Clean out, make and repopulate dataset directories
    shutil.rmtree(self.datasetsDirectory, ignore_errors=True)
    os.makedirs(self.datasetsDirectory)

    for ds in self.datasets:
        dsdir = os.path.join(self.datasetsDirectory, ds)
        os.makedirs(dsdir)

        # Reads
        utils.log("Converting reads...")
        dsReadsdir = os.path.join(dsdir, "reads")
        os.makedirs(dsReadsdir)
        for readFile in self.datasetReads[ds]:
            # "<prefix>_<name>.<ext>" becomes "<name>.bam"
            readName = readFile.split('_')[1].split('.')[0]
            destFile = os.path.join(dsReadsdir, readName) + ".bam"
            readSrc = pysam.AlignmentFile(
                os.path.join(self.inputDirectory, readFile), "r")
            # try/finally so neither handle is leaked if the copy fails.
            try:
                readDest = pysam.AlignmentFile(
                    destFile, "wb", header=readSrc.header)
                try:
                    for readData in readSrc:
                        readDest.write(readData)
                finally:
                    readDest.close()
            finally:
                readSrc.close()
            # Index using the path we built rather than reading it back
            # from the (now closed) output handle.
            pysam.index(destFile)

        # Variants
        utils.log("Converting variants...")
        dsVariantsdir = os.path.join(dsdir, "variants")
        os.makedirs(dsVariantsdir)
        for vgroup, variantFiles in self.datasetVariants[ds].items():
            vgroupdir = os.path.join(dsVariantsdir, vgroup)
            os.makedirs(vgroupdir)
            for variantFile in variantFiles:
                # The third "_"-separated field is used as the file name.
                destFile = os.path.join(
                    vgroupdir, variantFile.split('_')[2])
                shutil.copy(
                    os.path.join(self.inputDirectory, variantFile),
                    destFile)
                # Pysam's tabix_index automatically compresses the file
                # in place, creates a tabix index.
                pysam.tabix_index(destFile, preset="vcf")

        # Sequence Annotations
        print("Converting sequence annotations...", file=sys.stderr)
        dsSeqAnndir = os.path.join(dsdir, "sequenceAnnotations")
        os.makedirs(dsSeqAnndir)
        for seqAnnFile in self.datasetSequenceAnnotations[ds]:
            seqAnnName = seqAnnFile.split('_')[1].split('.')[0]
            seqAnnDest = os.path.join(dsSeqAnndir, seqAnnName) + ".db"
            seqAnnSrc = os.path.join(self.inputDirectory, seqAnnFile)
            dbgen = generate_gff3_db.Gff32Db(seqAnnSrc, seqAnnDest)
            dbgen.run()
    print("done converting compliance data.", file=sys.stderr)
def run(self):
    """
    Populates self.repo and self.outputDirectory from the BRCA1 input
    files found in self.inputDirectory.

    In order, this registers: the reference set (compressed FASTA plus
    JSON metadata), the "brca1" dataset, one individual and biosample
    per sample, the read group sets (SAM converted to indexed BAM), the
    sequence ontology, the variant sets (via self.addVariantSet), the
    gencode feature set, the G2P feature set and phenotype association
    set, and the RNA quantification set.
    """
    if not os.path.exists(self.outputDirectory):
        os.makedirs(self.outputDirectory)
    self.repo.open("w")
    self.repo.initialise()

    # Reference set
    referenceFileName = "ref_brca1.fa"
    inputRef = os.path.join(self.inputDirectory, referenceFileName)
    outputRef = os.path.join(self.outputDirectory, referenceFileName)
    shutil.copy(inputRef, outputRef)
    fastaFilePath = os.path.join(
        self.outputDirectory, referenceFileName + '.gz')
    pysam.tabix_compress(outputRef, fastaFilePath)

    with open(os.path.join(
            self.inputDirectory, "ref_brca1.json")) as refMetadataFile:
        refMetadata = json.load(refMetadataFile)
    with open(os.path.join(
            self.inputDirectory,
            "referenceset_hg37.json")) as refMetadataFile:
        refSetMetadata = json.load(refMetadataFile)

    referenceSet = references.HtslibReferenceSet(
        refSetMetadata['assemblyId'])
    referenceSet.populateFromFile(os.path.abspath(fastaFilePath))
    referenceSet.setAssemblyId(refSetMetadata['assemblyId'])
    referenceSet.setDescription(refSetMetadata['description'])
    if refSetMetadata['species']:
        speciesJson = json.dumps(refSetMetadata['species'])
        referenceSet.setSpeciesFromJson(speciesJson)  # needs a string
    referenceSet.setIsDerived(refSetMetadata['isDerived'])
    referenceSet.setSourceUri(refSetMetadata['sourceUri'])
    referenceSet.setSourceAccessions(refSetMetadata['sourceAccessions'])

    # Guard on the per-reference metadata, which is the value actually
    # serialised below. Testing the reference *set* metadata here would
    # pass a null species through whenever only the set declares one.
    referenceSpecies = refMetadata.get('species')
    referenceSpeciesJson = None
    if referenceSpecies:
        referenceSpeciesJson = json.dumps(referenceSpecies)
    for reference in referenceSet.getReferences():
        if referenceSpeciesJson is not None:
            reference.setSpeciesFromJson(referenceSpeciesJson)
        reference.setSourceAccessions(refMetadata['sourceAccessions'])
    self.repo.insertReferenceSet(referenceSet)

    # Dataset
    dataset = datasets.Dataset("brca1")
    # Some info is set, it isn't important what
    dataset.setAttributes({"version": ga4gh.server.__version__})
    self.repo.insertDataset(dataset)

    # Individuals and biosamples: one of each per sample, loaded from
    # "individual_<name>.json" and "biosample_<name>.json".
    sampleNames = ["HG00096", "HG00099", "HG00101"]
    biosamples = []
    biosamplesByName = {}
    for sampleName in sampleNames:
        individual = biodata.Individual(dataset, sampleName)
        individualPath = os.path.join(
            self.inputDirectory, "individual_%s.json" % sampleName)
        with open(individualPath) as jsonFile:
            individual.populateFromJson(jsonFile.read())
        self.repo.insertIndividual(individual)

        biosample = biodata.Biosample(dataset, sampleName)
        biosamplePath = os.path.join(
            self.inputDirectory, "biosample_%s.json" % sampleName)
        with open(biosamplePath) as jsonFile:
            biosample.populateFromJson(jsonFile.read())
        biosample.setIndividualId(individual.getId())
        self.repo.insertBiosample(biosample)

        biosamples.append(biosample)
        biosamplesByName[sampleName] = biosample

    # Reads: convert each SAM file to an indexed BAM and register it.
    readFiles = [
        "brca1_HG00096.sam",
        "brca1_HG00099.sam",
        "brca1_HG00101.sam"]
    for readFile in readFiles:
        # "brca1_<name>.sam" -> "<name>"
        name = readFile.split('_')[1].split('.')[0]
        bamPath = os.path.join(self.outputDirectory, name + ".bam")
        readSrc = pysam.AlignmentFile(
            os.path.join(self.inputDirectory, readFile), "r")
        # try/finally so neither handle is leaked if the copy fails.
        try:
            readDest = pysam.AlignmentFile(
                bamPath, "wb", header=readSrc.header)
            try:
                for readData in readSrc:
                    readDest.write(readData)
            finally:
                readDest.close()
        finally:
            readSrc.close()
        # Use the path we built rather than reading it back from the
        # (now closed) output handle.
        pysam.index(bamPath)

        readGroupSet = reads.HtslibReadGroupSet(dataset, name)
        readGroupSet.populateFromFile(
            os.path.abspath(bamPath),
            os.path.abspath(bamPath + ".bai"))
        readGroupSet.setReferenceSet(referenceSet)
        dataset.addReadGroupSet(readGroupSet)
        # Link each read group to the biosample whose local id equals
        # the read group's sample name.
        for readGroup in readGroupSet.getReadGroups():
            for biosample in biosamples:
                if biosample.getLocalId() == readGroup.getSampleName():
                    readGroup.setBiosampleId(biosample.getId())
        self.repo.insertReadGroupSet(readGroupSet)

    # Sequence ontology
    ontologyMapFileName = "so-xp-simple.obo"
    inputOntologyMap = os.path.join(
        self.inputDirectory, ontologyMapFileName)
    outputOntologyMap = os.path.join(
        self.outputDirectory, ontologyMapFileName)
    shutil.copy(inputOntologyMap, outputOntologyMap)

    sequenceOntology = ontologies.Ontology("so-xp-simple")
    sequenceOntology.populateFromFile(os.path.abspath(outputOntologyMap))
    sequenceOntology._id = "so-xp-simple"
    self.repo.insertOntology(sequenceOntology)
    self.repo.addOntology(sequenceOntology)

    # Variants
    vcfFiles = [
        "brca1_1kgPhase3_variants.vcf",
        "brca1_WASH7P_annotation.vcf",
        "brca1_OR4F_annotation.vcf"]
    for vcfFile in vcfFiles:
        self.addVariantSet(
            vcfFile, dataset, referenceSet, sequenceOntology, biosamples)

    # Sequence annotations
    seqAnnFile = "brca1_gencodev19.gff3"
    seqAnnSrc = os.path.join(self.inputDirectory, seqAnnFile)
    seqAnnDest = os.path.join(self.outputDirectory, "gencodev19.db")
    dbgen = generate_gff3_db.Gff32Db(seqAnnSrc, seqAnnDest)
    dbgen.run()
    gencode = sequence_annotations.Gff3DbFeatureSet(dataset, "gencodev19")
    gencode.setOntology(sequenceOntology)
    gencode.populateFromFile(os.path.abspath(seqAnnDest))
    gencode.setReferenceSet(referenceSet)
    self.repo.insertFeatureSet(gencode)

    # add g2p featureSet
    g2pPath = os.path.join(self.inputDirectory, "cgd")
    # copy all files input directory to output path
    outputG2PPath = os.path.join(self.outputDirectory, "cgd")
    # Guarded like the output directory above, so an existing "cgd"
    # directory does not make os.makedirs raise.
    if not os.path.exists(outputG2PPath):
        os.makedirs(outputG2PPath)
    for filename in glob.glob(os.path.join(g2pPath, '*.*')):
        shutil.copy(filename, outputG2PPath)

    featuresetG2P = g2p_featureset.PhenotypeAssociationFeatureSet(
        dataset, os.path.abspath(outputG2PPath))
    featuresetG2P.setOntology(sequenceOntology)
    featuresetG2P.setReferenceSet(referenceSet)
    featuresetG2P.populateFromFile(os.path.abspath(outputG2PPath))
    self.repo.insertFeatureSet(featuresetG2P)

    # add g2p phenotypeAssociationSet
    phenotypeAssociationSet = \
        g2p_associationset.RdfPhenotypeAssociationSet(
            dataset, "cgd", os.path.abspath(outputG2PPath))
    self.repo.insertPhenotypeAssociationSet(phenotypeAssociationSet)

    # Attached before the RNA import below, which is handed the dataset
    # together with the feature set name "gencodev19".
    dataset.addFeatureSet(gencode)

    # RNA Quantification
    rnaDbName = os.path.join(self.outputDirectory, "rnaseq.db")
    store = rnaseq2ga.RnaSqliteStore(rnaDbName)
    store.createTables()
    rnaseq2ga.rnaseq2ga(
        os.path.join(self.inputDirectory, "rna_brca1.tsv"),
        rnaDbName, "rna_brca1.tsv", "rsem",
        featureType="transcript",
        readGroupSetNames="HG00096",
        dataset=dataset,
        featureSetNames="gencodev19",
        biosampleId=biosamplesByName["HG00096"].getId())
    rnaQuantificationSet = rna_quantification.SqliteRnaQuantificationSet(
        dataset, "rnaseq")
    rnaQuantificationSet.setReferenceSet(referenceSet)
    rnaQuantificationSet.populateFromFile(os.path.abspath(rnaDbName))
    self.repo.insertRnaQuantificationSet(rnaQuantificationSet)
    # NOTE(review): nothing in this method calls self.repo.commit();
    # confirm the caller commits, otherwise the inserts above may not
    # be persisted.
def run(self):
    """
    Populates self.repo and self.outputDirectory from the BRCA1 input
    files found in self.inputDirectory, then commits the repository.

    In order, this registers: the reference set (compressed FASTA plus
    JSON metadata), the "brca1" dataset, one individual and bioSample
    per sample, the read group sets (SAM converted to indexed BAM), the
    sequence ontology, the variant sets (via self.addVariantSet) and
    the gencode feature set.
    """
    if not os.path.exists(self.outputDirectory):
        os.makedirs(self.outputDirectory)
    self.repo.open("w")
    self.repo.initialise()

    # Reference set
    referenceFileName = "ref_brca1.fa"
    inputRef = os.path.join(self.inputDirectory, referenceFileName)
    outputRef = os.path.join(self.outputDirectory, referenceFileName)
    shutil.copy(inputRef, outputRef)
    fastaFilePath = os.path.join(
        self.outputDirectory, referenceFileName + '.gz')
    pysam.tabix_compress(outputRef, fastaFilePath)

    with open(os.path.join(
            self.inputDirectory, "ref_brca1.json")) as refMetadataFile:
        refMetadata = json.load(refMetadataFile)
    with open(os.path.join(
            self.inputDirectory,
            "referenceset_hg37.json")) as refMetadataFile:
        refSetMetadata = json.load(refMetadataFile)

    referenceSet = references.HtslibReferenceSet(
        refSetMetadata['assemblyId'])
    referenceSet.populateFromFile(fastaFilePath)
    referenceSet.setAssemblyId(refSetMetadata['assemblyId'])
    referenceSet.setDescription(refSetMetadata['description'])
    referenceSet.setNcbiTaxonId(refSetMetadata['ncbiTaxonId'])
    referenceSet.setIsDerived(refSetMetadata['isDerived'])
    referenceSet.setSourceUri(refSetMetadata['sourceUri'])
    referenceSet.setSourceAccessions(refSetMetadata['sourceAccessions'])
    for reference in referenceSet.getReferences():
        reference.setNcbiTaxonId(refMetadata['ncbiTaxonId'])
        reference.setSourceAccessions(refMetadata['sourceAccessions'])
    self.repo.insertReferenceSet(referenceSet)

    # Dataset
    dataset = datasets.Dataset("brca1")
    self.repo.insertDataset(dataset)

    # Individuals and bioSamples: one of each per sample, loaded from
    # "individual_<name>.json" and "bioSample_<name>.json".
    bioSamples = []
    for sampleName in ["HG00096", "HG00099", "HG00101"]:
        individual = biodata.Individual(dataset, sampleName)
        individualPath = os.path.join(
            self.inputDirectory, "individual_%s.json" % sampleName)
        with open(individualPath) as jsonFile:
            individual.populateFromJson(jsonFile.read())
        self.repo.insertIndividual(individual)

        bioSample = biodata.BioSample(dataset, sampleName)
        bioSamplePath = os.path.join(
            self.inputDirectory, "bioSample_%s.json" % sampleName)
        with open(bioSamplePath) as jsonFile:
            bioSample.populateFromJson(jsonFile.read())
        bioSample.setIndividualId(individual.getId())
        self.repo.insertBioSample(bioSample)
        bioSamples.append(bioSample)

    # Reads: convert each SAM file to an indexed BAM and register it.
    readFiles = [
        "brca1_HG00096.sam",
        "brca1_HG00099.sam",
        "brca1_HG00101.sam"]
    for readFile in readFiles:
        # "brca1_<name>.sam" -> "<name>"
        name = readFile.split('_')[1].split('.')[0]
        bamPath = os.path.join(self.outputDirectory, name + ".bam")
        readSrc = pysam.AlignmentFile(
            os.path.join(self.inputDirectory, readFile), "r")
        # try/finally so neither handle is leaked if the copy fails.
        try:
            readDest = pysam.AlignmentFile(
                bamPath, "wb", header=readSrc.header)
            try:
                for readData in readSrc:
                    readDest.write(readData)
            finally:
                readDest.close()
        finally:
            readSrc.close()
        # Use the path we built rather than reading it back from the
        # (now closed) output handle, so the ".bai" suffix below is
        # always appended to a plain path string.
        pysam.index(bamPath)

        readGroupSet = reads.HtslibReadGroupSet(dataset, name)
        readGroupSet.populateFromFile(bamPath, bamPath + ".bai")
        readGroupSet.setReferenceSet(referenceSet)
        # Link each read group to the bioSample whose local id equals
        # the read group's sample name.
        for readGroup in readGroupSet.getReadGroups():
            for bioSample in bioSamples:
                if bioSample.getLocalId() == readGroup.getSampleName():
                    readGroup.setBioSampleId(bioSample.getId())
        self.repo.insertReadGroupSet(readGroupSet)

    # Sequence ontology
    ontologyMapFileName = "so-xp-simple.obo"
    inputOntologyMap = os.path.join(
        self.inputDirectory, ontologyMapFileName)
    outputOntologyMap = os.path.join(
        self.outputDirectory, ontologyMapFileName)
    shutil.copy(inputOntologyMap, outputOntologyMap)

    sequenceOntology = ontologies.Ontology("so-xp-simple")
    sequenceOntology.populateFromFile(outputOntologyMap)
    sequenceOntology._id = "so-xp-simple"
    self.repo.insertOntology(sequenceOntology)
    self.repo.addOntology(sequenceOntology)

    # Variants
    vcfFiles = [
        "brca1_1kgPhase3_variants.vcf",
        "brca1_WASH7P_annotation.vcf",
        "brca1_OR4F_annotation.vcf"]
    for vcfFile in vcfFiles:
        self.addVariantSet(
            vcfFile, dataset, referenceSet, sequenceOntology, bioSamples)

    # Sequence annotations
    seqAnnFile = "brca1_gencodev19.gff3"
    seqAnnSrc = os.path.join(self.inputDirectory, seqAnnFile)
    seqAnnDest = os.path.join(self.outputDirectory, "gencodev19.db")
    dbgen = generate_gff3_db.Gff32Db(seqAnnSrc, seqAnnDest)
    dbgen.run()
    gencode = sequenceAnnotations.Gff3DbFeatureSet(dataset, "gencodev19")
    gencode.setOntology(sequenceOntology)
    gencode.populateFromFile(seqAnnDest)
    gencode.setReferenceSet(referenceSet)
    self.repo.insertFeatureSet(gencode)

    self.repo.commit()

    print("Done converting compliance data.", file=sys.stderr)