def saveAsVcf(self,
              filePath,
              asSingleFile=True,
              deferMerging=False,
              stringency=LENIENT,
              disableFastConcat=False):
    """
    Writes the variants held by this RDD to disk in VCF format.

    :param str filePath: Destination path for the VCF output.
    :param bool asSingleFile: If true, the sharded output is merged into a
      single file once saving completes.
    :param bool deferMerging: If true, the output is prepared for merging
      into a single file, but the merge itself is not run.
    :param bdgenomics.adam.stringency stringency: The stringency to use
      when writing the VCF.
    :param bool disableFastConcat: If asSingleFile is true, disables the
      use of the fast concatenation engine for saving to HDFS.
    """
    writeVcf = self._jvmRdd.saveAsVcf
    javaStringency = _toJava(stringency, self.sc._jvm)

    # The JVM call takes disableFastConcat before the stringency, which is
    # not the order used by this Python signature.
    writeVcf(filePath,
             asSingleFile,
             deferMerging,
             disableFastConcat,
             javaStringency)
def loadIndexedBam(self, filePath, viewRegions, stringency=STRICT):
    """
    Loads alignment records from indexed BAM, restricted to a set of regions.

    Functions like loadAlignments, but uses BAM index files to look at fewer
    blocks, and only returns records within the specified ReferenceRegions.
    A BAM index file is required.

    :param str filePath: The path name to load indexed BAM formatted
      alignment records from. Globs/directories are supported.
    :param list<ReferenceRegion> viewRegions: List of ReferenceRegion to
      filter on.
    :param int stringency: The validation stringency to use when validating
      the BAM/CRAM/SAM format header. Defaults to
      ValidationStringency.STRICT.
    :return: Returns an AlignmentRecordRDD which wraps the RDD of alignment
      records, sequence dictionary representing contigs the alignment
      records may be aligned to, and the record group dictionary for the
      alignment records if one is available.
    :rtype: bdgenomics.adam.rdd.AlignmentRecordRDD
    """
    # Each Python ReferenceRegion is converted to its JVM counterpart before
    # being handed across the bridge.
    jvmRegions = [region._toJava(self._jvm) for region in viewRegions]

    loadOnJvm = self.__jac.loadIndexedBam
    javaStringency = _toJava(stringency, self._jvm)
    jvmAlignments = loadOnJvm(filePath, jvmRegions, javaStringency)

    return AlignmentRecordRDD(jvmAlignments, self._sc)
def loadAlignments(self, filePath, stringency=STRICT):
    """
    Reads alignment records from a path and wraps them in an
    AlignmentRecordRDD.

    Loads path names ending in:

    * .bam/.cram/.sam as BAM/CRAM/SAM format,
    * .fa/.fasta as FASTA format,
    * .fq/.fastq as FASTQ format, and
    * .ifq as interleaved FASTQ format.

    If none of these match, fall back to Parquet + Avro.

    For FASTA, FASTQ, and interleaved FASTQ formats, compressed files are
    supported through compression codecs configured in Hadoop, which by
    default include .gz and .bz2, but can include more.

    :param str filePath: The path to load the file from.
    :param stringency: The validation stringency to apply. Defaults to
      STRICT.
    :return: Returns an RDD containing reads.
    :rtype: bdgenomics.adam.rdd.AlignmentRecordRDD
    """
    loadOnJvm = self.__jac.loadAlignments
    javaStringency = _toJava(stringency, self._jvm)
    jvmAlignments = loadOnJvm(filePath, javaStringency)

    return AlignmentRecordRDD(jvmAlignments, self._sc)
def loadFeatures(self, filePath, stringency=STRICT):
    """
    Reads features from a path and wraps them in a FeatureDataset.

    Loads path names ending in:

    * .bed as BED6/12 format,
    * .gff3 as GFF3 format,
    * .gtf/.gff as GTF/GFF2 format,
    * .narrow[pP]eak as NarrowPeak format, and
    * .interval_list as IntervalList format.

    If none of these match, fall back to Parquet + Avro.

    For BED6/12, GFF3, GTF/GFF2, NarrowPeak, and IntervalList formats,
    compressed files are supported through compression codecs configured in
    Hadoop, which by default include .gz and .bz2, but can include more.

    :param str filePath: The path to load the file from.
    :param stringency: The validation stringency to apply. Defaults to
      STRICT.
    :return: Returns a genomic dataset containing features.
    :rtype: bdgenomics.adam.rdd.FeatureDataset
    """
    loadOnJvm = self.__jac.loadFeatures
    javaStringency = _toJava(stringency, self._jvm)
    jvmFeatures = loadOnJvm(filePath, javaStringency)

    return FeatureDataset(jvmFeatures, self._sc)
def saveAsPairedFastq(self,
                      fileName1,
                      fileName2,
                      persistLevel,
                      outputOriginalBaseQualities=False,
                      validationStringency=LENIENT):
    """
    Writes these AlignmentRecords out as a pair of FASTQ files.

    One file receives the first mate of each pair, and the other file
    receives the second mate.

    :param str fileName1: Path at which to save a FASTQ file containing the
      first mate of each pair.
    :param str fileName2: Path at which to save a FASTQ file containing the
      second mate of each pair.
    :param pyspark.storagelevel.StorageLevel persistLevel: The persistence
      level to cache reads at between passes.
    :param bool outputOriginalBaseQualities: If true, writes out reads with
      the base qualities from the original qualities (SAM "OQ") field. If
      false, writes out reads with the base qualities from the qual field.
      Default is false.
    :param bdgenomics.adam.stringency validationStringency: If strict,
      throw an exception if any read in this RDD is not accompanied by its
      mate.
    """
    writePairedFastq = self._jvmRdd.saveAsPairedFastq
    javaStringency = _toJava(validationStringency, self.sc._jvm)

    # persistLevel is the last JVM argument even though it is the third
    # Python parameter; it is forwarded without conversion.
    writePairedFastq(fileName1,
                     fileName2,
                     outputOriginalBaseQualities,
                     javaStringency,
                     persistLevel)
def saveAsPairedFastq(self,
                      fileName1,
                      fileName2,
                      persistLevel,
                      outputOriginalBaseQualities=False,
                      validationStringency=LENIENT):
    """
    Saves the reads in this RDD as two FASTQ files, split by mate.

    The first file holds the first mate of every pair; the second file
    holds the second mate of every pair.

    :param str fileName1: Path at which to save a FASTQ file containing the
      first mate of each pair.
    :param str fileName2: Path at which to save a FASTQ file containing the
      second mate of each pair.
    :param pyspark.storagelevel.StorageLevel persistLevel: The persistence
      level to cache reads at between passes.
    :param bool outputOriginalBaseQualities: If true, writes out reads with
      the base qualities from the original qualities (SAM "OQ") field. If
      false, writes out reads with the base qualities from the qual field.
      Default is false.
    :param bdgenomics.adam.stringency validationStringency: If strict,
      throw an exception if any read in this RDD is not accompanied by its
      mate.
    """
    jvmSave = self._jvmRdd.saveAsPairedFastq

    # Arguments are assembled in the order the JVM method takes them, which
    # places the stringency before the persistence level.
    jvmArgs = (fileName1,
               fileName2,
               outputOriginalBaseQualities,
               _toJava(validationStringency, self.sc._jvm),
               persistLevel)
    jvmSave(*jvmArgs)
def loadCoverage(self, filePath, stringency=STRICT):
    """
    Reads features from a path and exposes them as a CoverageRDD.

    Features are loaded into a FeatureRDD and converted to a CoverageRDD.
    Coverage is stored in the score field of Feature.

    Loads path names ending in:

    * .bed as BED6/12 format,
    * .gff3 as GFF3 format,
    * .gtf/.gff as GTF/GFF2 format,
    * .narrow[pP]eak as NarrowPeak format, and
    * .interval_list as IntervalList format.

    If none of these match, fall back to Parquet + Avro.

    For BED6/12, GFF3, GTF/GFF2, NarrowPeak, and IntervalList formats,
    compressed files are supported through compression codecs configured in
    Hadoop, which by default include .gz and .bz2, but can include more.

    :param str filePath: The path to load coverage data from.
    :param stringency: The validation stringency to apply. Defaults to
      STRICT.
    :return: Returns an RDD containing coverage.
    :rtype: bdgenomics.adam.rdd.CoverageRDD
    """
    loadOnJvm = self.__jac.loadCoverage
    javaStringency = _toJava(stringency, self._jvm)
    jvmCoverage = loadOnJvm(filePath, javaStringency)

    return CoverageRDD(jvmCoverage, self._sc)
def loadIndexedBam(self, filePath, viewRegions, stringency=STRICT):
    """
    Loads alignments from indexed BAM, restricted to a set of regions.

    Functions like loadAlignments, but uses BAM index files to look at fewer
    blocks, and only returns records within the specified ReferenceRegions.
    A BAM index file is required.

    :param str filePath: The path name to load indexed BAM formatted
      alignments from. Globs/directories are supported.
    :param list<ReferenceRegion> viewRegions: List of ReferenceRegion to
      filter on.
    :param int stringency: The validation stringency to use when validating
      the BAM/CRAM/SAM format header. Defaults to
      ValidationStringency.STRICT.
    :return: Returns an AlignmentDataset which wraps the RDD of alignment
      records, sequence dictionary representing contigs the alignments may
      be aligned to, and the read group dictionary for the alignment
      records if one is available.
    :rtype: bdgenomics.adam.ds.AlignmentDataset
    """
    # The JVM side cannot consume Python ReferenceRegions directly, so each
    # one is translated through its own _toJava first.
    jvmRegions = [region._toJava(self._jvm) for region in viewRegions]

    loadOnJvm = self.__jac.loadIndexedBam
    javaStringency = _toJava(stringency, self._jvm)
    jvmAlignments = loadOnJvm(filePath, jvmRegions, javaStringency)

    return AlignmentDataset(jvmAlignments, self._sc)
def recalibrateBaseQualities(self, knownSnps, validationStringency=LENIENT):
    """
    Runs base quality score recalibration on a set of reads.

    Uses a table of known SNPs to mask true variation during the
    recalibration process.

    :param bdgenomics.adam.rdd.VariantRDD knownSnps: A table of known SNPs
      to mask valid variants.
    :param bdgenomics.adam.stringency validationStringency: The stringency
      forwarded to the JVM recalibration call.
    :return: Returns the result of the JVM recalibration call, wrapped as
      an AlignmentRecordRDD.
    :rtype: bdgenomics.adam.rdd.AlignmentRecordRDD
    """
    javaStringency = _toJava(validationStringency, self.sc._jvm)
    recalibrated = self._jvmRdd.recalibrateBaseQualities(knownSnps._jvmRdd,
                                                         javaStringency)

    # The wrapper is built from the JVM RDD *and* the Spark context, as at
    # every other AlignmentRecordRDD construction site in this module; the
    # context was previously omitted here.
    return AlignmentRecordRDD(recalibrated, self.sc)
def loadGenotypes(self, filePath, stringency=STRICT):
    """
    Reads genotypes from a path and wraps them in a GenotypeRDD.

    If the path name has a .vcf/.vcf.gz/.vcf.bgz extension, load as VCF
    format. Else, fall back to Parquet + Avro.

    :param str filePath: The path to load the file from.
    :param stringency: The validation stringency to apply. Defaults to
      STRICT.
    :return: Returns an RDD containing genotypes.
    :rtype: bdgenomics.adam.rdd.GenotypeRDD
    """
    loadOnJvm = self.__jac.loadGenotypes
    javaStringency = _toJava(stringency, self._jvm)
    jvmGenotypes = loadOnJvm(filePath, javaStringency)

    return GenotypeRDD(jvmGenotypes, self._sc)
def loadVariants(self, filePath, stringency=STRICT):
    """
    Reads variants from a path and wraps them in a VariantDataset.

    If the path name has a .vcf/.vcf.gz/.vcf.bgz extension, load as VCF
    format. Else, fall back to Parquet + Avro.

    :param str filePath: The path to load the file from.
    :param stringency: The validation stringency to apply. Defaults to
      STRICT.
    :return: Returns a genomic dataset containing variants.
    :rtype: bdgenomics.adam.rdd.VariantDataset
    """
    loadOnJvm = self.__jac.loadVariants
    javaStringency = _toJava(stringency, self._jvm)
    jvmVariants = loadOnJvm(filePath, javaStringency)

    return VariantDataset(jvmVariants, self._sc)
def loadVariants(self, filePath, stringency=STRICT):
    """
    Reads variants from a path and wraps them in a VariantRDD.

    If the path name has a .vcf/.vcf.gz/.vcf.bgz extension, load as VCF
    format. Else, fall back to Parquet + Avro.

    :param str filePath: The path to load the file from.
    :param stringency: The validation stringency to apply. Defaults to
      STRICT.
    :return: Returns an RDD containing variants.
    :rtype: bdgenomics.adam.rdd.VariantRDD
    """
    loadOnJvm = self.__jac.loadVariants
    javaStringency = _toJava(stringency, self._jvm)
    jvmVariants = loadOnJvm(filePath, javaStringency)

    return VariantRDD(jvmVariants, self._sc)
def reassembleReadPairs(self, secondPairRdd, validationStringency=LENIENT):
    """
    Reassembles read pairs from two sets of unpaired reads.

    The assumption is that the two sets were _originally_ paired together.
    The RDD that this is called on should be the RDD with the first read
    from the pair.

    :param pyspark.rdd.RDD secondPairRdd: The rdd containing the second read
      from the pairs.
    :param bdgenomics.adam.stringency validationStringency: How stringently
      to validate the reads.
    :return: Returns an RDD with the pair information recomputed.
    :rtype: bdgenomics.adam.rdd.AlignmentRecordRDD
    """
    javaStringency = _toJava(validationStringency, self.sc._jvm)

    # Hand over the underlying Java RDD of the *argument*. The previous code
    # read `rdd._jrdd`, a name that is not defined in this function, so the
    # call either failed with NameError or picked up an unrelated global.
    reassembled = self._jvmRdd.reassembleReadPairs(secondPairRdd._jrdd,
                                                   javaStringency)

    return AlignmentRecordRDD(reassembled, self.sc)
def reassembleReadPairs(self, secondPairRdd, validationStringency=LENIENT):
    """
    Reassembles read pairs from two sets of unpaired reads.

    The assumption is that the two sets were _originally_ paired together.
    The RDD that this is called on should be the RDD with the first read
    from the pair.

    :param pyspark.rdd.RDD secondPairRdd: The rdd containing the second read
      from the pairs.
    :param bdgenomics.adam.stringency validationStringency: How stringently
      to validate the reads.
    :return: Returns an RDD with the pair information recomputed.
    :rtype: bdgenomics.adam.rdd.AlignmentRecordRDD
    """
    # Use the Java RDD behind the secondPairRdd parameter. The body used to
    # reference `rdd`, which is not a local or parameter of this function,
    # so the second-mate reads passed by the caller were never used.
    secondMates = secondPairRdd._jrdd
    javaStringency = _toJava(validationStringency, self.sc._jvm)

    return AlignmentRecordRDD(
        self._jvmRdd.reassembleReadPairs(secondMates, javaStringency),
        self.sc)
def saveAsVcf(self,
              filePath,
              asSingleFile=True,
              deferMerging=False,
              stringency=LENIENT,
              sortOnSave=None,
              disableFastConcat=False):
    """
    Writes the variants held by this RDD to disk in VCF format, optionally
    sorting them first.

    :param str filePath: Destination path for the VCF output.
    :param bool asSingleFile: If true, the sharded output is merged into a
      single file once saving completes.
    :param bool deferMerging: If true, the output is prepared for merging
      into a single file, but the merge itself is not run.
    :param bdgenomics.adam.stringency stringency: The stringency to use
      when writing the VCF.
    :param bool sortOnSave: Whether to sort when saving. If None, does not
      sort. If "lexicographically", sorts by contig name. Any other truthy
      value (e.g. True) sorts by contig index.
    :param bool disableFastConcat: If asSingleFile is true, disables the
      use of the fast concatenation engine for saving to HDFS.
    :raises RuntimeError: If sortOnSave is a falsy value other than None
      (for example, False).
    """
    # The conversion to variant contexts happens before sortOnSave is
    # inspected, so it runs even when the value turns out to be rejected.
    variantContexts = self._jvmRdd.toVariantContextRDD()

    if sortOnSave is not None:
        # The string check must come first: "lexicographically" is itself
        # truthy and would otherwise select the contig-index sort.
        if sortOnSave == "lexicographically":
            variantContexts = variantContexts.sortLexicographically()
        elif sortOnSave:
            variantContexts = variantContexts.sort()
        else:
            raise RuntimeError(
                'sortOnSave = %s. Expected None, "lexicographically", or True.' % sortOnSave)

    javaStringency = _toJava(stringency, self.sc._jvm)
    variantContexts.saveAsVcf(filePath,
                              asSingleFile,
                              deferMerging,
                              disableFastConcat,
                              javaStringency)
def saveAsFastq(self,
                fileName,
                validationStringency=LENIENT,
                sort=False,
                outputOriginalBaseQualities=False):
    """
    Writes the reads in this RDD out in FASTQ format.

    :param str fileName: Path to save files at.
    :param bdgenomics.adam.stringency validationStringency: If strict,
      throw an exception if any read in this RDD is not accompanied by its
      mate.
    :param bool sort: Whether to sort the FASTQ files by read name or not.
      Defaults to false. Sorting the output will recover pair order, if
      desired.
    :param bool outputOriginalBaseQualities: If true, writes out reads with
      the base qualities from the original qualities (SAM "OQ") field. If
      false, writes out reads with the base qualities from the qual field.
      Default is false.
    """
    writeFastq = self._jvmRdd.saveAsFastq
    javaStringency = _toJava(validationStringency, self.sc._jvm)

    # The JVM call takes its arguments in the reverse of the Python keyword
    # order: original-qualities flag, then sort flag, then stringency.
    writeFastq(fileName,
               outputOriginalBaseQualities,
               sort,
               javaStringency)
def saveAsFastq(self,
                fileName,
                validationStringency=LENIENT,
                sort=False,
                outputOriginalBaseQualities=False):
    """
    Saves the reads in this RDD as FASTQ.

    :param str fileName: Path to save files at.
    :param bdgenomics.adam.stringency validationStringency: If strict,
      throw an exception if any read in this RDD is not accompanied by its
      mate.
    :param bool sort: Whether to sort the FASTQ files by read name or not.
      Defaults to false. Sorting the output will recover pair order, if
      desired.
    :param bool outputOriginalBaseQualities: If true, writes out reads with
      the base qualities from the original qualities (SAM "OQ") field. If
      false, writes out reads with the base qualities from the qual field.
      Default is false.
    """
    jvmSave = self._jvmRdd.saveAsFastq

    # Arguments are assembled in the order the JVM method takes them, which
    # differs from the order of this method's keyword parameters.
    jvmArgs = (fileName,
               outputOriginalBaseQualities,
               sort,
               _toJava(validationStringency, self.sc._jvm))
    jvmSave(*jvmArgs)