def __init__(self, inputFnameLs=None, **keywords):
    """
    Initialise the mapper and normalise its chromosome and file-format options.

    After delegating to AbstractMapper.__init__, this:
      - replaces self.chromosomeList with the list parsed out of it by
        utils.getListOutOfStr and builds self.chromosomeSet from that list;
      - defines self.fileFormatDict (numeric code -> format name);
      - rewrites self.inputFileFormat and self.outputFileFormat as described
        below.

    Format resolution (applied to input and output independently):
      - a falsy value (0, None, '') is replaced by the filename suffix of
        self.inputFname / self.outputFname with its first character removed
        (the leading '.');
      - any other value is looked up in self.fileFormatDict; a code that is
        not in the dictionary yields None.

    2012.5.23
    """
    AbstractMapper.__init__(self, inputFnameLs=inputFnameLs, **keywords)

    chromosomes = utils.getListOutOfStr(
        self.chromosomeList, data_type=str, separator2=None)
    self.chromosomeList = chromosomes
    self.chromosomeSet = set(chromosomes)

    code2FormatName = {1: 'fasta', 2: 'fastq'}
    self.fileFormatDict = code2FormatName

    splitFilename = utils.getRealPrefixSuffixOfFilenameWithVariableSuffix

    if self.inputFileFormat:
        self.inputFileFormat = code2FormatName.get(self.inputFileFormat)
    else:
        # 0 or None or '': fall back to the suffix; [1:] drops the '.'
        inputSuffix = splitFilename(self.inputFname)[1]
        self.inputFileFormat = inputSuffix[1:]

    if self.outputFileFormat:
        self.outputFileFormat = code2FormatName.get(self.outputFileFormat)
    else:
        # 0 or None or '': fall back to the suffix; [1:] drops the '.'
        outputSuffix = splitFilename(self.outputFname)[1]
        self.outputFileFormat = outputSuffix[1:]
def __init__(self, **keywords):
    """
    Process the keyword options and default the output filename prefix.

    The keywords are handed to ProcessOptions.process_function_arguments
    together with self.option_default_dict; its return value is stored in
    self.ad. (self is passed as class_to_have_attr, so the options are
    presumably also set as attributes on this instance -- confirm against
    ProcessOptions.)

    If self.outputFnamePrefix is falsy afterwards, it is set to the prefix
    part of self.inputFname as returned by
    utils.getRealPrefixSuffixOfFilenameWithVariableSuffix.

    2011-7-11
    """
    self.ad = ProcessOptions.process_function_arguments(
        keywords,
        self.option_default_dict,
        error_doc=self.__doc__,
        class_to_have_attr=self,
    )

    if self.outputFnamePrefix:
        # an explicit prefix was supplied; nothing to derive
        return

    prefixAndSuffix = utils.getRealPrefixSuffixOfFilenameWithVariableSuffix(
        self.inputFname)
    self.outputFnamePrefix = prefixAndSuffix[0]
def getPEInputFiles(input_dir, isPE=True):
    """
    Group the fastq files found in input_dir by their filename prefix.

    Every entry of input_dir is split into (prefix, suffix) by
    utils.getRealPrefixSuffixOfFilenameWithVariableSuffix; entries whose
    suffix is not '.fastq' are skipped.

    Paired-end mode (isPE == True):
        The last two characters of the prefix are the mate tag and the
        rest is the pair key. Each key maps to a two-item list
        [first, second], initialised to ['', '']. A tag of '_1' fills the
        first slot; ANY other tag fills the second slot. E.g.
        sequence_628BWAAXX_1_1.fastq.gz and sequence_628BWAAXX_1_2.fastq.gz
        are regarded as one pair of two files.
    Single-end mode (anything else):
        Each prefix maps to a one-item list holding the filename.

    A progress line and a summary (average files per prefix, number of
    fastq files, total number of directory entries) go to sys.stderr.

    Returns the prefix -> list-of-filenames dictionary.

    2011-8-28 copied from MpiBWA.py
    2011-8-5 add argument isPE, which flags whether input_dir contains PE
        or single-end reads
    2011-2-7
    """
    sys.stderr.write("Pair input files from %s ..." % input_dir)

    dirEntries = os.listdir(input_dir)
    # kept as an explicit comparison with True (not plain truthiness) so the
    #  set of values that select paired-end mode stays the same
    pairedMode = (isPE == True)

    prefix2Files = {}
    fastqCount = 0
    for entry in dirEntries:
        stem, extension = utils.getRealPrefixSuffixOfFilenameWithVariableSuffix(
            entry)
        if extension == '.fastq':
            fastqCount += 1
            if pairedMode:
                pairKey, mateTag = stem[:-2], stem[-2:]
                mates = prefix2Files.setdefault(pairKey, ['', ''])
                # '_1' is the first mate; everything else goes second
                slot = 0 if mateTag == '_1' else 1
                mates[slot] = entry
            else:
                # single end
                prefix2Files[stem] = [entry]

    entryCount = len(dirEntries)
    prefixCount = len(prefix2Files)
    filesPerPrefix = fastqCount / float(prefixCount) if prefixCount > 0 else 0.0
    summary = (filesPerPrefix, fastqCount, entryCount)
    sys.stderr.write("%.2f files per one pairedEnd prefix. %s fastq files. %s total files. Done.\n" % summary)
    return prefix2Files
def selectIntervalFromInputFile(self, jobData=None, chromosome=None,
        intervalData=None, mapEachChromosomeData=None,
        passingData=None, transferOutput=False,
        **keywords):
    """
    Add a tabix-retrieve job that extracts one interval from an input file.

    The output file is placed in self.mapDirJob.output and named
    '<passingData.fileBasenamePrefix>_<intervalData.overlapInterval><suffix>',
    where <suffix> is the suffix of jobData.file.name as reported by
    utils.getRealPrefixSuffixOfFilenameWithVariableSuffix.

    Arguments:
        jobData: supplies the input file (.file), its parent jobs (.jobLs)
            and its file list (.fileLs). Everything in .fileLs after the
            first item is passed as extra dependent input (presumably the
            index files -- confirm against the caller).
        intervalData: its .overlapInterval is the region to retrieve.
        passingData: its .fileBasenamePrefix starts the output filename.
        transferOutput: forwarded to the tabix-retrieve job. Previously
            this argument was accepted but ignored (False was hard-coded);
            the default is still False.
        chromosome, mapEachChromosomeData, keywords: not used here.

    Returns whatever self.constructJobDataFromJob produces for the new job.

    2013.11.24
    """
    inputSuffix = utils.getRealPrefixSuffixOfFilenameWithVariableSuffix(
        jobData.file.name)[1]
    outputBasename = '%s_%s%s' % (passingData.fileBasenamePrefix,
        intervalData.overlapInterval, inputSuffix)
    outputFile = File(os.path.join(self.mapDirJob.output, outputBasename))

    tabixRetrieveJob = self.addTabixRetrieveJob(
        executable=self.tabixRetrieve,
        tabixPath=self.tabixPath,
        inputF=jobData.file,
        outputF=outputFile,
        regionOfInterest=intervalData.overlapInterval,
        includeHeader=True,
        # the output folder job must finish before this job writes into it
        parentJobLs=jobData.jobLs + [self.mapDirJob],
        job_max_memory=100,
        extraDependentInputLs=jobData.fileLs[1:],
        # honour the caller's choice instead of a hard-coded False
        transferOutput=transferOutput)
    return self.constructJobDataFromJob(job=tabixRetrieveJob)