def __init__(self, inputFnameLs=None, **keywords):
        """
        Initialise the mapper and normalise its chromosome and file-format options.

        After delegating to AbstractMapper.__init__, this:
          - replaces self.chromosomeList with the result of utils.getListOutOfStr()
            and builds self.chromosomeSet from it;
          - resolves self.inputFileFormat and self.outputFileFormat to a format
            name. A truthy value is looked up in self.fileFormatDict
            (1 -> 'fasta', 2 -> 'fastq'); a key that is not in the dictionary
            yields None. A falsy value (0, None or '') is replaced by the
            suffix of the corresponding filename with its first character
            (the '.') removed.

        2012.5.23
        """
        AbstractMapper.__init__(self, inputFnameLs=inputFnameLs, **keywords)

        self.chromosomeList = utils.getListOutOfStr(
            self.chromosomeList, data_type=str, separator2=None)
        self.chromosomeSet = set(self.chromosomeList)

        self.fileFormatDict = {1: 'fasta', 2: 'fastq'}

        # Input first, then output: the same rule applies to both options.
        formatAndFilenameAttrs = (
            ('inputFileFormat', 'inputFname'),
            ('outputFileFormat', 'outputFname'),
        )
        for formatAttr, filenameAttr in formatAndFilenameAttrs:
            formatCode = getattr(self, formatAttr)
            if formatCode:
                resolvedFormat = self.fileFormatDict.get(formatCode)
            else:
                filename = getattr(self, filenameAttr)
                suffix = utils.getRealPrefixSuffixOfFilenameWithVariableSuffix(
                    filename)[1]
                # [1:] drops the leading '.' of the suffix
                resolvedFormat = suffix[1:]
            setattr(self, formatAttr, resolvedFormat)
	def __init__(self, **keywords):
		"""
		Process the keyword options and default the output filename prefix.

		The keywords are handed to ProcessOptions.process_function_arguments()
		together with self.option_default_dict; its return value is kept in
		self.ad. If self.outputFnamePrefix is falsy afterwards, it is set to
		the prefix part of self.inputFname.

		2011-7-11
		"""
		self.ad = ProcessOptions.process_function_arguments(
			keywords,
			self.option_default_dict,
			error_doc=self.__doc__,
			class_to_have_attr=self,
		)

		if self.outputFnamePrefix:
			# an explicit prefix was supplied; nothing to derive
			return
		prefixAndSuffix = utils.getRealPrefixSuffixOfFilenameWithVariableSuffix(
			self.inputFname)
		self.outputFnamePrefix = prefixAndSuffix[0]
Exemple #3
0
def getPEInputFiles(input_dir, isPE=True):
    """
    Group the fastq files found in input_dir by filename prefix.

    Only directory entries whose suffix, as reported by
    utils.getRealPrefixSuffixOfFilenameWithVariableSuffix(), equals '.fastq'
    are considered; everything else is skipped.

    Arguments:
        input_dir: directory to list.
        isPE: when equal to True the directory is treated as paired-end,
            otherwise as single-end.

    Returns a dictionary:
        - paired-end: key is the filename prefix without its last two
          characters, value is a two-item list [first, second]. A file whose
          prefix ends in '_1' fills the first slot; a file with ANY other
          ending fills the second slot. A missing mate is left as ''.
          e.g. sequence_628BWAAXX_1_1.fastq.gz and
          sequence_628BWAAXX_1_2.fastq.gz are meant to form one pair
          (this presumes the helper reports '.fastq' for '.fastq.gz' names).
        - single-end: key is the whole filename prefix, value is [filename].

    Progress and summary statistics are written to sys.stderr.

    History:
        2011-8-28 copied from MpiBWA.py
        2011-8-5 add argument isPE
        2011-2-7 paired-end grouping
    """
    sys.stderr.write("Pair input files from %s ..." % input_dir)
    dirEntries = os.listdir(input_dir)
    prefix2FileLs = {}
    fastqCount = 0
    for entry in dirEntries:
        stem, suffix = utils.getRealPrefixSuffixOfFilenameWithVariableSuffix(
            entry)
        if suffix == '.fastq':
            fastqCount += 1
            # Equality test kept on purpose: only values equal to True
            # select the paired-end layout.
            if isPE == True:
                pairKey, mateTag = stem[:-2], stem[-2:]
                mates = prefix2FileLs.setdefault(pairKey, ['', ''])
                mateIndex = 0 if mateTag == '_1' else 1
                mates[mateIndex] = entry
            else:
                prefix2FileLs[stem] = [entry]  # single-end: one file per key

    groupCount = len(prefix2FileLs)
    # guard against division by zero when no fastq file was found
    filesPerGroup = fastqCount / float(groupCount) if groupCount > 0 else 0.0
    summary = "%.2f files per one pairedEnd prefix. %s fastq files. %s total files. Done.\n" % (
        filesPerGroup, fastqCount, len(dirEntries))
    sys.stderr.write(summary)
    return prefix2FileLs
    def selectIntervalFromInputFile(self, jobData=None, chromosome=None,
                                    intervalData=None, mapEachChromosomeData=None,
                                    passingData=None, transferOutput=False,
                                    **keywords):
        """
        Add a tabix-retrieve job that extracts one interval from an input file.

        The output file is placed under self.mapDirJob.output and named
        '<passingData.fileBasenamePrefix>_<intervalData.overlapInterval><suffix>',
        where <suffix> is the suffix of jobData.file.name. The job depends on
        jobData.jobLs plus self.mapDirJob, and jobData.fileLs[1:] are passed
        as extra dependent inputs.

        Note: the job is always added with transferOutput=False; the
        transferOutput argument of this method is not forwarded. The
        chromosome, mapEachChromosomeData and extra keyword arguments are
        not used here either.

        Returns the result of self.constructJobDataFromJob() for the new job.

        2013.11.24
        """
        inputFile = jobData.file
        suffix = utils.getRealPrefixSuffixOfFilenameWithVariableSuffix(
            inputFile.name)[1]
        region = intervalData.overlapInterval
        outputBasename = '%s_%s%s' % (
            passingData.fileBasenamePrefix, region, suffix)
        outputFile = File(os.path.join(self.mapDirJob.output, outputBasename))

        upstreamJobs = jobData.jobLs + [self.mapDirJob]
        retrieveJob = self.addTabixRetrieveJob(
            executable=self.tabixRetrieve,
            tabixPath=self.tabixPath,
            inputF=inputFile,
            outputF=outputFile,
            regionOfInterest=region,
            includeHeader=True,
            parentJobLs=upstreamJobs,
            job_max_memory=100,
            extraDependentInputLs=jobData.fileLs[1:],
            transferOutput=False)
        return self.constructJobDataFromJob(job=retrieveJob)