Example #1
    def run(self):
        """
        2013.2.11
            input looks like (inputFileFormat=1)
                msHOT-lite 2 1 -t 4781.50413187402 -r 790.4466018 ...
                //
                segsites: 40567
                
                positions: 0.0002 0.0003
                001001101011011001...
                101001010100101111...
                ...
        """

        if self.debug:
            import pdb
            pdb.set_trace()

        inf = utils.openGzipFile(self.inputFname, 'r')

        outf = utils.openGzipFile(self.outputFname, mode='w')
        self.convertFuncDict[self.inputFileFormat](inf=inf, outf=outf, \
            noOfHaplotypesDefault=self.noOfHaplotypesDefault,\
            chromosomeLengthToSimulate=self.chromosomeLengthToSimulate)

        inf.close()
        outf.close()
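
Every example on this page funnels its file I/O through utils.openGzipFile, which is not shown here. Judging from the call sites (and the commented-out legacy branch kept in Example #12 below), it opens .gz files through gzip and everything else with the plain built-in open(). A minimal sketch under that assumption:

import gzip

def openGzipFile(path, mode='r'):
    #Hypothetical stand-in for utils.openGzipFile: route *.gz files
    #through gzip, everything else through the built-in open().
    if path[-3:] == '.gz':
        #gzip.open() treats 'r'/'w'/'a' as binary, so force text mode
        #to match how the call sites above iterate over lines.
        if mode in ('r', 'w', 'a'):
            mode += 't'
        return gzip.open(path, mode)
    return open(path, mode)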
Example #2
    def selectSequences(self, inputFname=None, outputFname=None, inputFileFormat='fasta', outputFileFormat='fasta', chromosomeSet=None,\
        defaultBasePhredQuality=87):
        """
        2012.5.24
        """
        sys.stderr.write("Choosing %s chromosome sequences from %s ..." %
                         (len(chromosomeSet), inputFname))
        inf = utils.openGzipFile(inputFname, 'r')
        counter = 0
        real_counter = 0
        outputHandle = utils.openGzipFile(outputFname, 'w')
        for seq_record in SeqIO.parse(inf, inputFileFormat):
            counter += 1
            if seq_record.id in chromosomeSet:
                if outputFileFormat == 'fastq' and 'phred_quality' not in seq_record.letter_annotations:
                    #fake quality for fastq output
                    seq_record.letter_annotations['phred_quality'] = [
                        defaultBasePhredQuality
                    ] * len(seq_record.seq)
                SeqIO.write([seq_record], outputHandle, outputFileFormat)
                real_counter += 1
            elif real_counter == len(chromosomeSet):  #got enough chromosomes
                break
        inf.close()
        #close the input and the last output handle
        outputHandle.close()
        sys.stderr.write(" %s records chosen into %s.\n" %
                         (real_counter, outputFname))
Example #3
    def run(self):
        """
        """

        if self.debug:
            import pdb
            pdb.set_trace()

        inf = utils.openGzipFile(self.inputFname, mode='r')

        reader = csv.reader(inf, delimiter=figureOutDelimiter(inf))
        header = None
        for i in range(self.noOfLinesInHeader):
            if i == 0:
                header = next(reader)
            else:
                next(reader)
        if header is not None:
            colName2Index = getColName2IndexFromHeader(header)

        newHeader = [
            'alignmentID', 'total_base_count', 'sampled_base_count',
            'meanDepth', 'medianDepth', 'modeDepth'
        ]
        inputStatLs = []

        writer = csv.writer(utils.openGzipFile(self.outputFname, mode='w'),
                            delimiter='\t')
        writer.writerow(newHeader)
        counter = 0
        real_counter = 0
        for row in reader:
            counter += 1
            if real_counter < self.maxNumberOfSamplings:
                r = random.random()
                if r <= self.fractionToSample:
                    inputStatLs.append(float(row[self.whichColumn]))
                    real_counter += 1

        meanDepth = numpy.mean(inputStatLs)
        medianDepth = numpy.median(inputStatLs)
        modeDepth = scipy.stats.mode(inputStatLs)[0][0]
        outputRow = [
            self.alignmentID, counter, real_counter, meanDepth, medianDepth,
            modeDepth
        ]
        writer.writerow(outputRow)
        del writer
Example #4
def countNoOfChromosomesBasesInFastQFile(inputFname=None):
    """
    2013.2.16 add the try...except around the parser
    2013.2.9 count the #chromosomes, #bases of inputFname
    """
    sys.stderr.write("Counting #chromosomes, #bases of %s ..." % (inputFname))
    no_of_chromosomes = 0
    no_of_bases = 0
    inf = utils.openGzipFile(inputFname)
    try:
        from Bio import SeqIO
        for seq_record in SeqIO.parse(inf, 'fastq'):
            no_of_chromosomes += 1
            no_of_bases += len(seq_record)
    except:
        sys.stderr.write("Except after handling %s chromosomes & %s bases.\n" %
                         (no_of_chromosomes, no_of_bases))
        sys.stderr.write('Except type: %s\n' % repr(sys.exc_info()))
        import traceback
        traceback.print_exc()
        raise

    inf.close()
    sys.stderr.write("%s chromosomes, %s bases\n" %
                     (no_of_chromosomes, no_of_bases))
    return utils.PassingData(no_of_chromosomes=no_of_chromosomes,
                             no_of_bases=no_of_bases)
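
A hedged usage sketch (the file name is hypothetical); the returned PassingData exposes the two counts as attributes:

countData = countNoOfChromosomesBasesInFastQFile(inputFname='reads.fastq.gz')
print(countData.no_of_chromosomes, countData.no_of_bases)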
Example #5
    def run(self):

        if self.debug:
            import pdb
            pdb.set_trace()

        writer = csv.writer(open(self.outputFname, 'w'), delimiter='\t')
        writer.writerow(
            ['#sampleID', 'chromosome', 'meanDepth', 'medianDepth'])
        for inputFname in self.inputFnameLs:
            inputFile = utils.openGzipFile(inputFname)
            delimiter = figureOutDelimiter(inputFile)
            reader = csv.reader(inputFile, delimiter=delimiter)
            header = next(reader)
            col_name2index = getColName2IndexFromHeader(header)

            intervalIDIndex = col_name2index.get("Target")
            #Only the first read group in the output is used, so don't run
            #   DepthOfCoverageWalker over multi-read-group bam files.
            avgCoverageIndex = 4
            sampleID = header[avgCoverageIndex][:-9]
            #the column header looks like $sampleID_mean_cvg; strip the trailing _mean_cvg
            medianCoverageIndex = 6

            for row in reader:
                intervalID = row[intervalIDIndex]
                writer.writerow([
                    sampleID, intervalID, row[avgCoverageIndex],
                    row[medianCoverageIndex]
                ])
        del writer
        sys.stderr.write("Done.\n")
Example #6
 def run(self):
     """
     """
     
     if self.debug:
         import pdb
         pdb.set_trace()
     
     inf = utils.openGzipFile(self.inputFname)
     outf = open(self.outputFname, 'w')
     lineNumber = 0
     real_counter = 0
     for line in inf:
         lineNumber += 1
         if lineNumber>=self.startLineNumber and \
             lineNumber<=self.stopLineNumber:
             outf.write(line)
             real_counter += 1
         elif lineNumber>self.stopLineNumber:
             #stop here
             break
         
     inf.close()
     outf.close()
     sys.stderr.write("%s lines chosen.\n"%(real_counter))
Example #7
    def splitFastaFile(self,
                       inputFname=None,
                       outputFnamePrefix=None,
                       noOfSequences=1000,
                       suffixLength=3,
                       filenameSuffix=""):
        """
        2012.5.24
        """
        sys.stderr.write("Splitting fasta file %s ..." % (inputFname))
        inf = utils.openGzipFile(inputFname)
        counter = 0
        real_counter = 0
        outputFname = utils.comeUpSplitFilename(outputFnamePrefix=outputFnamePrefix, suffixLength=suffixLength, fileOrder=real_counter,\
                 filenameSuffix=filenameSuffix)
        outputHandle = open(outputFname, 'w')
        for seq_record in SeqIO.parse(inf, "fasta"):
            counter += 1
            SeqIO.write([seq_record], outputHandle, "fasta")
            if counter % noOfSequences == 0:
                outputHandle.close()
                real_counter += 1
                outputFname = utils.comeUpSplitFilename(outputFnamePrefix=outputFnamePrefix, suffixLength=suffixLength, fileOrder=real_counter,\
                       filenameSuffix=filenameSuffix)
                outputHandle = open(outputFname, 'w')
        #close the last handle
        outputHandle.close()
        sys.stderr.write(" into %s files.\n" %
                         (real_counter + 1))  #real_counter starts from 0
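
utils.comeUpSplitFilename is not shown on this page; from how it is called above, it presumably assembles prefix + zero-padded file order + suffix. A hypothetical reconstruction:

def comeUpSplitFilename(outputFnamePrefix=None, suffixLength=3, fileOrder=0, filenameSuffix=""):
    #e.g. ('split_', 3, 7, '.fasta') -> 'split_007.fasta'
    return '%s%0*d%s' % (outputFnamePrefix, suffixLength, fileOrder, filenameSuffix)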
Example #8
    def run(self):
        if self.debug:
            import pdb
            pdb.set_trace()

        header = None
        outf = utils.openGzipFile(self.outputFname, 'w')
        for inputFname in self.inputFnameLs:
            print(f"File {inputFname} ... ", flush=True)
            if not os.path.isfile(inputFname):
                if self.exitNonZeroIfAnyInputFileInexistent:
                    logging.error(f"{inputFname} doesn't exist.")
                    sys.exit(3)
                else:
                    continue
            inf = utils.openGzipFile(inputFname, 'r')
            if self.noHeader == 0:
                #in the case that every input has a common header
                if not header:
                    #if empty string or None, obtain a header
                    try:
                        header = inf.readline()
                        outf.write(header)
                    except:  #in case something is wrong (e.g. the file is empty)
                        logging.error('Except type: %s' % repr(sys.exc_info()))
                        import traceback
                        traceback.print_exc()
                        print(sys.exc_info())
                else:
                    #skip the header for other input files
                    try:
                        inf.readline()
                    except:
                        #in case something is wrong (e.g. the file is empty)
                        logging.error('Except type: %s' % repr(sys.exc_info()))
                        import traceback
                        traceback.print_exc()
                        print(sys.exc_info())
            for line in inf:
                isEmpty = self.isInputLineEmpty(
                    line.strip(),
                    inputFile=inf,
                    inputEmptyType=self.inputEmptyType)
                if not isEmpty:
                    outf.write(line)
            print(f"Done.", flush=True)
Example #9
    def run(self):

        if self.debug:
            import pdb
            pdb.set_trace()

        writer = csv.writer(open(self.outputFname, 'w'), delimiter='\t')
        writer.writerow(['#sampleID', 'chromosome', 'length',
            'noOfReadsAlignedByLength', 'noOfSingletonsByLength', \
            'noOfPairsOnSameContigByLength',
            'meanInferInsertSize', 'noOfPairsOnDifferentContigsByLength'])
        for inputFname in self.inputFnameLs:
            inputFile = utils.openGzipFile(inputFname)
            delimiter = figureOutDelimiter(inputFile)
            reader = csv.reader(inputFile, delimiter=delimiter)
            header = next(reader)
            col_name2index = getColName2IndexFromHeader(header)

            sampleIDIndex = col_name2index.get("readGroup")
            chromosomeIndex = col_name2index.get("firstReferenceName")
            chromosomeLengthIndex = col_name2index.get("firstReferenceLength")

            numberOfReadsIndex = col_name2index.get("numberOfReads")
            numberOfReadsAlignedIndex = col_name2index.get(
                "numberOfReadsAligned")
            numberOfSingletonsMappedIndex = col_name2index.get(
                "numberOfSingletonsMapped")
            numberOfPairsOnSameContigIndex = col_name2index.get(
                "numberOfPairsOnSameContig")
            numberOfPairsOnDifferentContigsIndex = col_name2index.get(
                "numberOfPairsOnDifferentContigs")
            meanInsertSizeIndex = col_name2index.get("meanInsertSize")

            for row in reader:
                sampleID = row[sampleIDIndex]
                chromosome = row[chromosomeIndex]
                chromosomeLength = int(row[chromosomeLengthIndex])

                numberOfReads = float(row[numberOfReadsIndex])
                numberOfReadsAligned = float(row[numberOfReadsAlignedIndex])
                numberOfSingletonsMapped = float(
                    row[numberOfSingletonsMappedIndex])
                numberOfPairsOnSameContig = float(
                    row[numberOfPairsOnSameContigIndex])
                numberOfPairsOnDifferentContigs = float(
                    row[numberOfPairsOnDifferentContigsIndex])
                meanInsertSize = row[meanInsertSizeIndex]

                writer.writerow([
                    sampleID, chromosome, chromosomeLength,
                    numberOfReadsAligned / chromosomeLength,
                    numberOfSingletonsMapped / chromosomeLength,
                    numberOfPairsOnSameContig / chromosomeLength,
                    meanInsertSize,
                    numberOfPairsOnDifferentContigs / chromosomeLength
                ])
        del writer
        sys.stderr.write("Done.\n")
Example #10
    def run(self):
        """
		input looks like (inputFileFormat=1)
				msHOT-lite 2 1 -t 4781.50413187402 -r 790.4466018 ...
				//
				segsites: 40567
				
				positions: 0.0002 0.0003
				001001101011011001...
				101001010100101111...
				...
			
			./msHOT-lite 2 1 -t 84989.8346003745 -r 34490.1412746802 30000000 -l
				-en 0.0013 1 0.0670 -en 0.0022 1 0.3866 -en 0.0032 1 0.3446
				-en 0.0044 1 0.2179 -en 0.0059 1 0.1513 -en 0.0076 1 0.1144
				-en 0.0096 1 0.0910 -en 0.0121 1 0.0757 -en 0.0150 1 0.0662
				-en 0.0184 1 0.0609 -en 0.0226 1 0.0583 -en 0.0275 1 0.0572
				-en 0.0333 1 0.0571 -en 0.0402 1 0.0577 -en 0.0485 1 0.0589
				-en 0.0583 1 0.0603 -en 0.0700 1 0.0615 -en 0.0839 1 0.0624
				-en 0.1005 1 0.0632 -en 0.1202 1 0.0641 -en 0.1437 1 0.0651
				-en 0.1716 1 0.0663 -en 0.2048 1 0.0678 -en 0.2444 1 0.0696
				-en 0.2914 1 0.0719 -en 0.3475 1 0.0752 -en 0.4935 1 0.0794
				//
				@begin 6422
				
				30000000
				1100    01
				6074    10
				
				29966899        10
				29971027        01
				29973740        01
				29982767        01
				29985696        10
				@end
		"""
        if self.debug:
            import pdb
            pdb.set_trace()

        if not os.path.isfile(self.inputFname):
            sys.stderr.write("Error: file, %s,  is not a file.\n" %
                             (self.inputFname))
            sys.exit(3)

        inputFile = utils.openGzipFile(self.inputFname, 'r')
        outputPolymorphismFile = PolymorphismTableFile(self.outputFname, mode='w', isPhased=1, \
                    ploidy=self.ploidy)

        commandline = next(inputFile).strip()
        outputPolymorphismFile.addAttribute('commandline',
                                            value=commandline,
                                            overwrite=True,
                                            tableName='polymorphism')

        self._convert(inputFile=inputFile,
                      outputPolymorphismFile=outputPolymorphismFile,
                      ploidy=self.ploidy)

        inputFile.close()
        outputPolymorphismFile.close()
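
The _convert method is not shown here. As a hedged sketch of what parsing the '@begin ... @end' block from the docstring involves (function and variable names are hypothetical):

def parseMsHOTLiteBlock(inputFile):
    #Lines between '@begin' and '@end' carry 'position<TAB>alleles';
    #a lone number (e.g. 30000000) is the simulated chromosome length.
    chromosomeLength = None
    segsiteList = []
    for line in inputFile:
        fields = line.strip().split()
        if not fields or fields[0] == '@begin':
            continue
        if fields[0] == '@end':
            break
        if len(fields) == 1:
            chromosomeLength = int(fields[0])
        else:
            segsiteList.append((int(fields[0]), fields[1]))
    return chromosomeLength, segsiteList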
Example #11
 def getNoOfSequencesFromFasta(self, inputFastaFname=None):
     """
     2012.5.24
     """
     sys.stderr.write("Getting number of sequences from %s ..."%(inputFastaFname))
     inf = utils.openGzipFile(inputFastaFname)
     no_of_sequences = 0
     for line in inf:
         if line[0]=='>':
             no_of_sequences += 1
     del inf
     sys.stderr.write("%s sequences.\n"%(no_of_sequences))
     return no_of_sequences
Example #12
 def _initializeInput(self, inputFname=None):
     """
     """
     if inputFname and self.mode[0] == 'r':
         self.inf = utils.openGzipFile(inputFname, mode='r')
         """
         if inputFname[-3:]=='.gz':
             import gzip
             self.inf = gzip.open(inputFname, 'rb')
         else:
             self.inf = open(inputFname)
         """
         self.reader = csv.reader(self.inf, delimiter='\t')
         self._parseHeader()
Example #13
    def parseArgumentsFromFile(self, inputFname):
        """
        20190206
        """
        #parse inputFname to get individual_sequence_id &
        # individual_sequence_file_raw_id and others.
        inputFile = utils.openGzipFile(inputFname)
        input_variable_dict = {}
        for line in inputFile:
            var_name, var_value = line.strip().split(": ")
            input_variable_dict[var_name] = var_value
        inputFile.close()

        individual_sequence_id = input_variable_dict.get(
            "individual_sequence_id", self.individual_sequence_id)
        if individual_sequence_id:
            individual_sequence_id = int(individual_sequence_id)
            self.individual_sequence_id = individual_sequence_id

        individual_sequence_file_raw_id = input_variable_dict.get(
            "individual_sequence_file_raw_id",
            self.individual_sequence_file_raw_id)
        if individual_sequence_file_raw_id:
            individual_sequence_file_raw_id = \
                int(individual_sequence_file_raw_id)
            self.individual_sequence_file_raw_id = \
                individual_sequence_file_raw_id

        self.outputDir = input_variable_dict.get("outputDir", self.outputDir)
        self.relativeOutputDir = input_variable_dict.get(
            "relativeOutputDir", self.relativeOutputDir)

        #check that relativeOutputDir is the trailing part of outputDir
        if not self.outputDir.endswith(self.relativeOutputDir):
            logging.error(f'relativeOutputDir {self.relativeOutputDir} is not'
                          f' the last part of outputDir {self.outputDir}.')
            sys.exit(4)
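
A hedged example of the 'var_name: var_value' file this method parses; it mirrors what the run() method in Example #22 below writes (all values are hypothetical):

individual_sequence_id: 3001
individual_sequence_file_raw_id: 5002
outputDir: /path/to/data_dir/individual_sequence/3001
relativeOutputDir: individual_sequence/3001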
Example #14
 def __init__(self, path=None, **keywords):
     self.ad = ProcessOptions.process_function_arguments(keywords,
         self.option_default_dict, error_doc=self.__doc__,
         class_to_have_attr=self)
     if not self.path:
         self.path = path
     
     if self.path and self.file_handle is None:
         self.file_handle = utils.openGzipFile(self.path, mode=self.mode)
     
     #2013.05.03 for easy access
     self.filename = self.path		
     self.csvFile = None
     self.isRealCSV = False
     if self.mode=='r':	#reading mode
         if self.delimiter is None:
             self.delimiter = figureOutDelimiter(self.file_handle)
         
         if self.delimiter=='\t' or self.delimiter==',':
             self.csvFile = csv.reader(self.file_handle, delimiter=self.delimiter)
             self.isRealCSV = True
         else:
             self.csvFile = self.file_handle
             self.isRealCSV = False
     else:	#writing mode
         if not self.delimiter:
             self.delimiter = '\t'
         self.csvFile = csv.writer(self.file_handle, delimiter=self.delimiter)
         self.isRealCSV = True
         #else:
         #	self.csvFile = self.file_handle
         #	self.isRealCSV = False
     self.col_name2index = None
     
     self._row = None	# store the current row being read
     self.headerPattern = re.compile(r'^[a-zA-Z]')
      #default header pattern: lines beginning with a letter
      self.commentPattern = re.compile(r'^#')	#default: lines beginning with #
     self.comment_row_list  = []
Example #15
def getReadBaseCount(inputFname,
                     ignore_set=set(['>', '+', '@']),
                     onlyForEmptyCheck=False):
    """
    inputFname could be fastq or fasta
    """
    inf = utils.openGzipFile(inputFname, mode='r')
    read_count = 0
    base_count = 0

    for line in inf:
        if line[0] in ignore_set:
            if line[0] == '+':
                #skip the quality-score line right after this "+" line
                inf.readline()
            continue
        read_count += 1
        base_count += len(line.strip())
        if onlyForEmptyCheck:
            #2012.3.19 one read is enough.
            break
    del inf
    return PassingData(read_count=read_count, base_count=base_count)
Example #16
    def run(self):
        """
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        if not os.path.isfile(self.inputFname):
            sys.stderr.write("Error: file, %s,  is not a file.\n" %
                             (self.inputFname))
            sys.exit(3)

        inputFile = utils.openGzipFile(self.inputFname, 'r')
        outputPolymorphismFile = PolymorphismTableFile(self.outputFname,
                                                       mode='w',
                                                       isPhased=1,
                                                       ploidy=self.ploidy)
        outputChromosomeSequenceFile = open(self.outputChromosomeSequenceFname,
                                            "w")

        commandline = next(inputFile).strip()
        outputPolymorphismFile.addAttribute('commandline',
                                            value=commandline,
                                            overwrite=True,
                                            tableName='polymorphism')

        for line in inputFile:
            if self.iterationPattern.search(
                    line):  #one iteration is regarded as one species
                self.outputOneIteration(inputFile=inputFile, iterationLine=line,
                    outputPolymorphismFile=outputPolymorphismFile,\
                    outputChromosomeSequenceFile=outputChromosomeSequenceFile,
                    ploidy=self.ploidy)

        inputFile.close()
        outputPolymorphismFile.close()
        outputChromosomeSequenceFile.close()
Example #17
    def run(self):
        """
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        if not os.path.isfile(self.inputFname):
            sys.stderr.write("Error: file, %s,  is not a file.\n" %
                             (self.inputFname))
            sys.exit(3)

        inf = utils.openGzipFile(self.inputFname, 'r')
        outf = open(self.outputFname, 'w')
        for line in inf:
            newLine = re.sub(r'%s' % (self.oldMSPath), r'%s' % (self.msPath),
                             line)
            if self.replaceTheHengLiOutputFlagAsWell:
                newLine = newLine.replace(" -l", "")
                #the replacement is global and exhaustive: every " -l" occurrence is removed
            outf.write(newLine)
        inf.close()
        outf.close()
Example #18
    def getQualityData(self,
                       inputFname,
                       read_sampling_rate=0.05,
                       quality_score_format='Sanger'):
        """
        """
        print(f"Getting base quality data from {inputFname} ...", flush=True)
        quality_ls_per_position = []
        quality_ls = []
        no_of_bases_per_position = []
        diNuc2count = {}
        diNuc2quality_ls = {}

        inf = utils.openGzipFile(inputFname, 'r')
        counter = 0
        real_counter = 0
        for line in inf:
            if line[0] == '@':
                counter += 1
                coin_toss = random.random()
                base_string = inf.readline().strip()
                inf.readline()
                quality_string = inf.readline().strip()
                if coin_toss <= read_sampling_rate:
                    real_counter += 1
                    read_length = len(base_string)
                    if len(quality_ls_per_position) < read_length:
                        # extend quality_ls_per_position to house more data
                        extraNoOfBases = read_length - len(
                            quality_ls_per_position)
                        for j in range(extraNoOfBases):
                            quality_ls_per_position.append([])
                            no_of_bases_per_position.append(0)

                    for i in range(read_length):
                        base = base_string[i]
                        base_quality = quality_string[i]
                        if quality_score_format == 'Illumina1.3':
                            phredScore = utils.converSolexaScoreToPhred(
                                base_quality)
                        else:
                            phredScore = ord(base_quality) - 33
                        quality_ls_per_position[i].append(phredScore)
                        quality_ls.append(phredScore)
                        if base != 'N':
                            no_of_bases_per_position[i] += 1
                            if i < read_length - 1:
                                nextBase = base_string[i + 1]
                                if nextBase != 'N':
                                    diNuc = base + nextBase
                                    if diNuc not in diNuc2quality_ls:
                                        diNuc2quality_ls[diNuc] = []
                                        diNuc2count[diNuc] = 0
                                    diNuc2quality_ls[diNuc].append(phredScore)
                                    diNuc2count[diNuc] += 1
            if counter % 5000 == 0 and self.report:
                sys.stderr.write("%s%s\t%s" %
                                 ('\x08' * 80, real_counter, counter))

            #if baseCount>10000:	#temporary, for testing
            #	break
        del inf
        print(f"{real_counter}/{counter} reads selected.", flush=True)
        return PassingData(
            quality_ls_per_position=quality_ls_per_position,
            quality_ls=quality_ls, \
            no_of_bases_per_position=no_of_bases_per_position,
            diNuc2quality_ls=diNuc2quality_ls,
            diNuc2count=diNuc2count)
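
In the Sanger branch above, a base quality character is Phred+33 encoded, so the score is just ord(char) - 33. A quick worked example:

#chr(73) is 'I', so ord('I') - 33 == 40,
#i.e. an error probability of 10 ** (-40 / 10.0) == 0.0001.
phredScore = ord('I') - 33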
Example #19
    def traverse(self):
        """
        """
        newHeader = []
        key2dataLs = {}
        #key is the keyColumn,
        #  dataLs corresponds to the sum of each column from valueColumnLs
        delimiter = None
        for inputFname in self.inputFnameLs:
            if not os.path.isfile(inputFname):
                if self.exitNonZeroIfAnyInputFileInexistent:
                    sys.exit(3)
                else:
                    continue
            reader = None
            try:
                inputFile = utils.openGzipFile(inputFname)
                delimiter = figureOutDelimiter(inputFile)
                reader = MatrixFile(file_handle=inputFile, delimiter=delimiter)
            except:
                logging.error(f'Except type: {sys.exc_info()}')
                import traceback
                traceback.print_exc()

            try:
                header = next(reader)
                self.handleNewHeader(header,
                                     newHeader,
                                     self.keyColumnLs,
                                     self.valueColumnLs,
                                     keyColumnSet=self.keyColumnSet)
                if self.noHeader:
                    inputFile.seek(0)
                    reader = MatrixFile(file_handle=inputFile,
                                        delimiter=delimiter)
            except:
                logging.error(f'Except type: {sys.exc_info()}')
                import traceback
                traceback.print_exc()

            if reader is not None:
                for row in reader:
                    try:
                        self.handleValueColumns(
                            row,
                            key2dataLs=key2dataLs,
                            keyColumnLs=self.keyColumnLs,
                            valueColumnLs=self.valueColumnLs)
                    except:
                        #in case something is wrong (e.g. the file is empty)
                        logging.error(f'Ignore this row: {row}.')
                        logging.error(f'Except type: {sys.exc_info()}')
                        import traceback
                        traceback.print_exc()
                del reader
        if self.noHeader:
            newHeader = None
        returnData = PassingData(key2dataLs=key2dataLs,
                                 delimiter=delimiter,
                                 header=newHeader)
        return returnData
Example #20
    def run(self):
        """
        """
        
        if self.debug:
            import pdb
            pdb.set_trace()
        """
        2012.4.3
        the output of samtools flagstat looks like:

20170602 new flagstat output

470131994 + 0 in total (QC-passed reads + QC-failed reads)
63918054 + 0 secondary
0 + 0 supplementary
3001858 + 0 duplicates
460732266 + 0 mapped (98.00% : N/A)
406213940 + 0 paired in sequencing
203106970 + 0 read1
203106970 + 0 read2
391157952 + 0 properly paired (96.29% : N/A)
394571382 + 0 with itself and mate mapped
2242830 + 0 singletons (0.55% : N/A)
2443798 + 0 with mate mapped to a different chr
1751451 + 0 with mate mapped to a different chr (mapQ>=5)

        """
        
        inf = utils.openGzipFile(self.inputFname, mode='r')
        writer = csv.writer(utils.openGzipFile(self.outputFname, mode='w'), delimiter='\t')
        header = ['alignmentID', 'total_no_of_reads', 'perc_secondary', 'perc_supplementary', \
                'perc_reads_mapped', 'perc_duplicates', 'perc_paired', 'perc_properly_paired', \
                'perc_both_mates_mapped', 'perc_singletons',\
                'perc_mapped_to_diff_chrs', 'perc_mapq5_mapped_to_diff_chrs']
        writer.writerow(header)
        
        #cast total_no_of_reads to float now so later divisions don't need float() casts
        total_no_of_reads = float(self.getNumberOutOfFlagStatLine(line=inf.readline(),
            grabPattern=re.compile(r'^(\d+) \+ (\d+) in total')))
        no_of_secondary = self.getNumberOutOfFlagStatLine(line=inf.readline(),
            grabPattern=re.compile(r'^(\d+) \+ (\d+) secondary'))
        no_of_supplementary = self.getNumberOutOfFlagStatLine(line=inf.readline(),
            grabPattern=re.compile(r'^(\d+) \+ (\d+) supplementary'))
        no_of_duplicates = self.getNumberOutOfFlagStatLine(line=inf.readline(),
            grabPattern=re.compile(r'^(\d+) \+ (\d+) duplicates'))
        no_of_mapped = self.getNumberOutOfFlagStatLine(line=inf.readline(),
            grabPattern=re.compile(r'^(\d+) \+ (\d+) mapped'))
        no_of_paired = self.getNumberOutOfFlagStatLine(line=inf.readline(),
            grabPattern=re.compile(r'^(\d+) \+ (\d+) paired in sequencing'))
        no_of_read1 = self.getNumberOutOfFlagStatLine(line=inf.readline(),
            grabPattern=re.compile(r'^(\d+) \+ (\d+) read1'))
        no_of_read2 = self.getNumberOutOfFlagStatLine(line=inf.readline(),
            grabPattern=re.compile(r'^(\d+) \+ (\d+) read2'))
        no_of_properly_paired = self.getNumberOutOfFlagStatLine(line=inf.readline(),
            grabPattern=re.compile(r'^(\d+) \+ (\d+) properly paired'))
        no_of_both_mates_mapped = self.getNumberOutOfFlagStatLine(line=inf.readline(),
            grabPattern=re.compile(r'^(\d+) \+ (\d+) with itself and mate mapped'))
        no_of_singletons = self.getNumberOutOfFlagStatLine(line=inf.readline(),
            grabPattern=re.compile(r'^(\d+) \+ (\d+) singletons'))
        no_of_mates_mapped_to_diff_chrs = self.getNumberOutOfFlagStatLine(line=inf.readline(),
            grabPattern=re.compile(r'^(\d+) \+ (\d+) with mate mapped to a different chr\n'))
        no_of_mates_mapped_to_diff_chrs_mapQAbove5 = self.getNumberOutOfFlagStatLine(line=inf.readline(),
            grabPattern=re.compile(r'^(\d+) \+ (\d+) with mate mapped to a different chr \(mapQ>=5\)'))
        #
        del inf
        
        data_row = [self.alignmentID, total_no_of_reads,
            no_of_secondary/total_no_of_reads*100,
            no_of_supplementary/total_no_of_reads*100,
            no_of_mapped/total_no_of_reads*100,
            no_of_duplicates/total_no_of_reads*100,
            no_of_paired/total_no_of_reads*100,
            no_of_properly_paired/total_no_of_reads*100,
            no_of_both_mates_mapped/total_no_of_reads*100,
            no_of_singletons/total_no_of_reads*100,
            no_of_mates_mapped_to_diff_chrs/total_no_of_reads*100,
            no_of_mates_mapped_to_diff_chrs_mapQAbove5/total_no_of_reads*100]
        writer.writerow(data_row)
        del writer
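
getNumberOutOfFlagStatLine is not shown on this page. Given how it is called, it presumably extracts the QC-passed count (the first captured group) from one 'N + M ...' flagstat line; a hypothetical sketch:

def getNumberOutOfFlagStatLine(self, line=None, grabPattern=None):
    #grabPattern captures (QC-passed, QC-failed); return the QC-passed
    #count, or 0 if the line does not match the expected pattern.
    searchResult = grabPattern.search(line)
    if searchResult:
        return int(searchResult.group(1))
    return 0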
Example #21
    def traverse(self):
        """
        """
        newHeader = []
        key2dataLs = {}
        #key is the keyColumn,
        #  dataLs corresponds to the sum of each column from valueColumnLs
        noOfDataColumnsFromPriorFiles = 0
        for inputFname in self.inputFnameLs:
            if not os.path.isfile(inputFname):
                if self.exitNonZeroIfAnyInputFileInexistent:
                    logging.error(f'{inputFname} does not exist.')
                    sys.exit(3)
                else:
                    continue
            reader = None
            try:
                inputFile = utils.openGzipFile(inputFname)
                if self.inputDelimiter is None or self.inputDelimiter == '':
                    self.inputDelimiter = figureOutDelimiter(inputFile)
                reader = MatrixFile(file_handle=inputFile,
                                    delimiter=self.inputDelimiter)
            except:
                logging.error(f'Except type: {sys.exc_info()}')
                import traceback
                traceback.print_exc()

            valueColumnLs = []
            try:
                header = next(reader)
                self.handleNewHeader(header,
                                     newHeader,
                                     self.keyColumnLs,
                                     valueColumnLs,
                                     keyColumnSet=self.keyColumnSet)
                if self.noHeader:
                    inputFile.seek(0)
                    reader = MatrixFile(file_handle=inputFile,
                                        delimiter=self.inputDelimiter)
            except:
                #in case something is wrong (e.g. the file is empty)
                logging.error(f'Except type: {sys.exc_info()}')
                import traceback
                traceback.print_exc()

            if reader is not None and valueColumnLs:
                visitedKeySet = set()
                for row in reader:
                    try:
                        self.handleValueColumns(row,
                                                key2dataLs=key2dataLs,
                                                keyColumnLs=self.keyColumnLs,
                                                valueColumnLs=valueColumnLs,
                                                noOfDataColumnsFromPriorFiles=
                                                noOfDataColumnsFromPriorFiles,
                                                visitedKeySet=visitedKeySet)
                    except:
                        logging.error(f'Ignore this row: {row}.')
                        logging.error(f'Except type: {sys.exc_info()}')
                        import traceback
                        traceback.print_exc()
                del reader
                #append empty data to keys who are missing in the current file.
                totalKeySet = set(key2dataLs.keys())
                unvisitedKeySet = totalKeySet - visitedKeySet
                for key in unvisitedKeySet:
                    for i in valueColumnLs:
                        key2dataLs[key].append('')
            noOfDataColumnsFromPriorFiles += len(valueColumnLs)
        if self.noHeader:
            newHeader = None
        returnData = PassingData(key2dataLs=key2dataLs,
                                 delimiter=self.inputDelimiter,
                                 header=newHeader)
        return returnData
Example #22
    def run(self):
        """
		"""
        if self.debug:
            import pdb
            pdb.set_trace()
        #check if inputFname is empty
        inputFile = utils.openGzipFile(self.inputFname)
        char_counter = 0
        for line in inputFile:
            #only need one line
            char_counter += len(line)
            break
        inputFile.close()
        sys.stderr.write("First line character count of %s: %s.\n" %
                         (self.inputFname, char_counter))
        if char_counter == 0:
            sys.stderr.write("ERROR: exit due to empty file.\n")
            sys.exit(2)

        db_main = self.db_main
        session = db_main.session
        session.begin()

        if self.data_dir:
            data_dir = self.data_dir
        else:
            data_dir = db_main.data_dir
        #uuid is for sequence only, add as isq.comment
        individual_sequence = db_main.getIndividualSequence(
         individual_id=self.individual_id,
         sequencer_id=self.sequencer_id,\
         sequence_type_name=self.sequence_type_name, \
         sequence_format=self.sequence_format,
         path_to_original_sequence=self.original_sequence_filepath, \
         copy_original_file=self.copy_original_file,\
         tissue_name=self.tissue_name, tissue_id=self.tissue_id, \
         coverage=self.coverage,\
         quality_score_format=self.quality_score_format, filtered=self.filtered,\
         parent_individual_sequence_id=self.parent_individual_sequence_id,\
         read_count=self.read_count, no_of_chromosomes=self.no_of_chromosomes, \
         sequence_batch_id=self.sequence_batch_id, version=self.version,
         subFolder=None, data_dir=data_dir,\
         is_contaminated=self.is_contaminated, outdated_index=self.outdated_index,
         comment=self.comment)
        file_raw_db_entry = None
        if self.original_sequence_filepath:
            file_raw_db_entry = db_main.registerOriginalSequenceFileToDB(
             self.original_sequence_filepath,
             library=self.original_sequence_library, \
             individual_sequence_id=individual_sequence.id, mate_id=self.original_sequence_mate_id, \
             md5sum=self.original_sequence_md5sum)

        #output isq_id to outputFname
        outputDir = os.path.join(data_dir, individual_sequence.path)
        if not os.path.isdir(outputDir):
            os.makedirs(outputDir)
        if self.outputFname:
            outf = open(self.outputFname, 'w')
            outf.write("individual_sequence_id: %s\n" %
                       (individual_sequence.id))
            if file_raw_db_entry:
                outf.write("individual_sequence_file_raw_id: %s\n" %
                           (file_raw_db_entry.id))
            outf.write("outputDir: %s\n" % (outputDir))
            outf.write("relativeOutputDir: %s\n" % (individual_sequence.path))
            outf.close()
        if self.commit:
            session.commit()
        else:
            self.sessionRollback(session)
Example #23
    def vcftoolsOutputStatFileWalker(self, inputFname, processFunc=None, run_type=1, \
        chrColumnHeader='CHR', minChrLength=1000000, chrLengthColumnHeader='chrLength',\
        xColumnHeader="BIN_START", valueForNonPositiveYValue=-1):
        """
        2012.10.26 skip sites if chr_cumu_start is not available
        2012.10.25 only skip except during file opening, not file reading
        2012.9.18 chrLengthColumnHeader could be nothing
        
        """
        sys.stderr.write("walking through %s ..." % (inputFname))
        counter = 0
        chr2xy_ls = self.chr2xy_ls
        try:
            inf = utils.openGzipFile(inputFname)
            delimiter = figureOutDelimiter(inf)
            sys.stderr.write(" delimiter is '%s'  " % (delimiter))
            reader = csv.reader(inf, delimiter=delimiter)
            header = next(reader)
            col_name2index = getColName2IndexFromHeader(header,
                                                        skipEmptyColumn=True)
        except:  #in case something is wrong (e.g. the file is empty)
            sys.stderr.write('Except type: %s\n' % repr(sys.exc_info()))
            import traceback
            traceback.print_exc()
            print(sys.exc_info())
            return

        chr_id_index = col_name2index.get(chrColumnHeader, None)
        if chr_id_index is None:
            chr_id_index = col_name2index.get("CHROM", None)
        if chr_id_index is None:
            chr_id_index = col_name2index.get("CHR", None)
        if chr_id_index is None:
            sys.stderr.write("Error chr_id_index is None.\n")
            sys.exit(3)
        bin_start_index = col_name2index.get(xColumnHeader, None)
        if chrLengthColumnHeader:  #could be nothing
            chrLength_index = col_name2index.get(chrLengthColumnHeader, None)
        else:
            chrLength_index = None
        if self.whichColumnHeader:
            whichColumn = col_name2index.get(self.whichColumnHeader, None)
        else:
            whichColumn = self.whichColumn

        for row in reader:
            if self.samplingRate < 1 and self.samplingRate >= 0:
                r = random.random()
                if r > self.samplingRate:
                    continue
            if chrLength_index:
                chrLength = int(row[chrLength_index])
                if chrLength < minChrLength:
                    continue
            chr_id = row[chr_id_index]
            bin_start = int(float(row[bin_start_index]))

            yValue = row[whichColumn]
            yValue = self.handleYValue(yValue)

            if chr_id not in chr2xy_ls:
                chr2xy_ls[chr_id] = [[], []]
            chr_cumu_start = self.chr_id2cumu_start.get(chr_id)
            if chr_cumu_start is None:  #2012.10.26 skip sites
                sys.stderr.write(
                    "Chromosome %s does not have chr_cumu_start.\n" % (chr_id))
                continue
            chr2xy_ls[chr_id][0].append(chr_cumu_start + bin_start + 1)
            chr2xy_ls[chr_id][1].append(yValue)
            counter += 1
        del reader
        inf.close()
        sys.stderr.write("%s data.\n" % (counter))
Example #24
    def outputSNPDataInNewCoordinate(self, querySNPDataFname=None, querySNPID2NewReferenceCoordinateLs=None,\
                                    newSNPDataOutputFname=None, newSNPDataOutputFormat=1):
        """
        2013.07.03 added argument newSNPDataOutputFormat
            
        2012.10.14
            split out of findSNPPositionOnNewRef()
        """
        sys.stderr.write("Converting querySNPDataFname %s into individual X SNP format, format=%s ... "%\
                        (querySNPDataFname, newSNPDataOutputFormat))
        """
Sample  Geno    SNP
1999010 CC      cs_primer1082_247
1999068 CC      cs_primer1082_247
2000022 CT      cs_primer1082_247
2000064 CT      cs_primer1082_247
2000117 CC      cs_primer1082_247

        """
        inf = utils.openGzipFile(querySNPDataFname)
        reader = csv.reader(inf, delimiter=figureOutDelimiter(inf))
        col_name2index = getColName2IndexFromHeader(next(reader))

        sampleIndex = col_name2index.get("Sample")
        genotypeIndex = col_name2index.get("Geno")
        SNPIDIndex = col_name2index.get("SNP")

        row_id2index = {}
        row_id_ls = []
        col_id_ls = []
        col_id2index = {}
        row_col_index2genotype = {}
        for row in reader:
            sampleID = row[sampleIndex]
            genotype = row[genotypeIndex]
            querySNPID = row[SNPIDIndex]
            if querySNPID in querySNPID2NewReferenceCoordinateLs:
                newRefCoordinateLs = querySNPID2NewReferenceCoordinateLs.get(
                    querySNPID)
                if len(newRefCoordinateLs) == 1:
                    newRefCoordinate = newRefCoordinateLs[0]
                    if newSNPDataOutputFormat == 2:
                        col_id = '%s_%s' % (newRefCoordinate.newChr,
                                            newRefCoordinate.newRefStart)
                    else:
                        col_id = '%s_%s_%s' % (newRefCoordinate.newChr,
                                               newRefCoordinate.newRefStart,
                                               newRefCoordinate.newRefStop)
                    queryStrand = newRefCoordinate.queryStrand
                    if col_id not in col_id2index:
                        col_id2index[col_id] = len(col_id2index)
                        col_id_ls.append(col_id)
                    if sampleID not in row_id2index:
                        row_id2index[sampleID] = len(row_id2index)
                        row_id_ls.append(sampleID)
                    if queryStrand == "-":
                        genotype = SNP.reverseComplement(genotype)
                    row_index = row_id2index[sampleID]
                    col_index = col_id2index[col_id]
                    row_col_index2genotype[(row_index, col_index)] = genotype
                else:
                    continue
        data_matrix = numpy.zeros(
            [len(row_id_ls), len(col_id2index)], dtype=numpy.int8)

        for row_col_index, genotype in row_col_index2genotype.items():
            row_index, col_index = row_col_index[:2]
            data_matrix[row_index, col_index] = SNP.nt2number[genotype]
        sys.stderr.write("\n")
        snpData = SNP.SNPData(row_id_ls=row_id_ls,
                              col_id_ls=col_id_ls,
                              data_matrix=data_matrix)
        snpData.tofile(newSNPDataOutputFname)
Example #25
 def parse_chromosome_fasta_file(self, db=None, filename=None, tax_id=None, 
     version=None, chunk_size=10000, \
     sequence_type_name=None, sequence_type_id=None, run_type=1,
     maxNoOfFastaRecords=500):
     """
     argument maxNoOfFastaRecords: the max number of fasta records before quitting
     argument run_type
         1: chromosome sequences from NCBI genbank
         2: vervet scaffolds from WUSTL
         3: full vervet BACs from McGill
     2010-12-15
         fix a bug that _tax_id shall be used in query AnnotAssembly.
         This bug caused the db redundancy check to fail.
     2010-12-15
         if entry already exists in AnnotAssembly, skip it.
     2008-07-29
         figure out tax_id via FigureOutTaxID
         filename could contain multiple fasta blocks
     2008-07-27
         change to use data structures from GenomeDB.py
      2008-07-06
          use the first line (header) of the fasta file to figure out which chromosome it is;
          using the filename is unreliable.
      """
     inf = utils.openGzipFile(filename, mode='r')
     
     line = inf.readline()
     #'line' is not enough to stop the 'while' loop. after the file reading is
     #  exhausted by "for line in inf:", 'line' still contains the stuff from the last line.
     new_fasta_block = 1
     no_of_fasta_blocks = 0
     while line and new_fasta_block:
         new_fasta_block = 0
         #set it to 0, assuming only one fasta block, change upon new fasta block
         if line[0]!='>':	#not fasta block header
             for line in inf:	#exhaust this fasta block as it's not what's wanted.
                 if line[0]=='>':
                     new_fasta_block = 1
                     break
             continue
         headerData = self.parseFastaDescriptionDict[run_type](line, self.FigureOutTaxID_ins)
         if not headerData.chromosome:
             sys.stderr.write("Error chromosome for header %s is empty %s.\n"%(
                 line, headerData.chromosome))
             import pdb
             pdb.set_trace()
         if tax_id is not None and headerData.tax_id and tax_id!=headerData.tax_id:
             sys.stderr.write("tax_id (%s) not matching the one given (%s). Ignore.\n"%(
                 headerData.tax_id, tax_id))
             line = inf.readline()
             new_fasta_block = 1
             continue
         
         chromosome = headerData.chromosome
         sequence_type = db.getSequenceType(short_name=sequence_type_name, entry_id=sequence_type_id)
         start = 1
         aa_attr_instance = db.checkAnnotAssembly(version=version, tax_id=tax_id, \
                             chromosome=chromosome, start=start, stop=None, \
                             sequence_type_id=sequence_type.id)
         if aa_attr_instance and aa_attr_instance.raw_sequence_start_id is not None:
             # if raw sequences have been associated with this AnnotAssembly and 
             sys.stderr.write("raw sequences have been associated with this AnnotAssembly "
                 "(tax_id %s, chr=%s, start=%s). Ignore.\n"%\
                 (tax_id, chromosome, start))
             line = inf.readline()
             new_fasta_block = 1
             continue
         if aa_attr_instance is None:
             aa_attr_instance = db.getAnnotAssembly(gi=headerData.gi, 
                 acc_ver=headerData.acc_ver, accession=None, \
                 version =version, tax_id=tax_id, chromosome =chromosome, \
                 start =start, stop =None, orientation=None, sequence = None,\
                 raw_sequence_start_id=None, original_path=os.path.abspath(filename),\
                 sequence_type_id=sequence_type.id, \
                 chromosome_type_id=None, chromosome_type_name=None, comment=headerData.comment)
             if aa_attr_instance.acc_ver and self.p_acc_ver.search(aa_attr_instance.acc_ver):
                 aa_attr_instance.accession, aa_attr_instance.version = self.p_acc_ver.search(
                     aa_attr_instance.acc_ver).groups()
                 aa_attr_instance.version = int(aa_attr_instance.version)
             else:
                 aa_attr_instance.accession = None
                 aa_attr_instance.version = version
             if self.debug:
                 sys.stderr.write("tax_id=%s for %s.\n"%(aa_attr_instance.tax_id, line))
             #aa_attr_instance.raw_sequence_start_id = 
             # self.get_current_max_raw_sequence_id(curs, raw_sequence_table)+1
         passingdata = PassingData()
         passingdata.current_start = 1
         passingdata.raw_sequence_initiated = False
         seq = ''
         for line in inf:
             if line[0]=='>':
                 if seq:	#last segment from the previous fasta block
                     self.saveRawSequence(db.session, seq, passingdata, aa_attr_instance)
                     seq = ''	#set to nothing to avoid saving one more RawSequence
                 new_fasta_block = 1
                 break	#start from while again
             
             seq += line.strip()
             if len(seq)>=chunk_size:
                 seq_to_db = seq[:chunk_size]
                 self.saveRawSequence(db.session, seq_to_db, passingdata, aa_attr_instance)
                 seq = seq[chunk_size:]	#remove the one already in db
                 if self.report:
                     sys.stderr.write("%s\t%s\t%s"%('\x08'*40, no_of_fasta_blocks, 
                         passingdata.current_start/chunk_size+1))
         if seq:	# last segment from last line
             self.saveRawSequence(db.session, seq, passingdata, aa_attr_instance)
         aa_attr_instance.stop = passingdata.current_stop
         db.session.add(aa_attr_instance)
         db.session.flush()
         no_of_fasta_blocks += 1
         if no_of_fasta_blocks>=maxNoOfFastaRecords:
             break
     sys.stderr.write("\n  Number of fasta records/chromosomes: %s.\n"%(no_of_fasta_blocks))
     del inf
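
The chunking loop above flushes fixed-size pieces of sequence to the database as they accumulate. The same buffering pattern in isolation (names are hypothetical):

def flushInChunks(lines, chunk_size, flush):
    #Accumulate stripped lines, emit whole chunk_size pieces as they
    #fill up, then emit whatever remainder is left at the end.
    seq = ''
    for line in lines:
        seq += line.strip()
        while len(seq) >= chunk_size:
            flush(seq[:chunk_size])
            seq = seq[chunk_size:]
    if seq:
        flush(seq)

flushInChunks(['ACGTACGT', 'ACGT'], 5, print)   #prints ACGTA, then CGTAC, then GT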