    def run(self):

        if self.debug:
            import pdb
            pdb.set_trace()

        writer = csv.writer(open(self.outputFname, 'w'), delimiter='\t')
        writer.writerow(
            ['#sampleID', 'chromosome', 'meanDepth', 'medianDepth'])
        for inputFname in self.inputFnameLs:
            inputFile = utils.openGzipFile(inputFname)
            delimiter = figureOutDelimiter(inputFile)
            reader = csv.reader(inputFile, delimiter=delimiter)
            header = next(reader)
            col_name2index = getColName2IndexFromHeader(header)

            intervalIDIndex = col_name2index.get("Target")
            # Only the first read group in the output is used, so don't
            #  run DepthOfCoverageWalker over multi-read-group BAM files.
            avgCoverageIndex = 4
            # The column header looks like $sampleID_mean_cvg;
            #  strip the trailing "_mean_cvg" (9 characters).
            sampleID = header[avgCoverageIndex][:-9]
            medianCoverageIndex = 6

            for row in reader:
                intervalID = row[intervalIDIndex]
                writer.writerow([
                    sampleID, intervalID, row[avgCoverageIndex],
                    row[medianCoverageIndex]
                ])
        del writer
        sys.stderr.write("Done.\n")
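
Note: figureOutDelimiter and getColName2IndexFromHeader come from the palos utility modules (the real figureOutDelimiter accepts either a filename or an open file handle, as the examples on this page show). A minimal sketch of the behavior these examples rely on, inferred from usage here rather than taken from the library:

def figureOutDelimiter_sketch(input_handle):
    # Guess tab vs comma from the first line, then rewind.
    # (Hypothetical re-implementation for illustration only.)
    pos = input_handle.tell()
    first_line = input_handle.readline()
    input_handle.seek(pos)
    return '\t' if first_line.count('\t') >= first_line.count(',') else ','

def getColName2IndexFromHeader_sketch(header, skipEmptyColumn=False):
    # Map each column name to its index, optionally ignoring empty names.
    return {name: i for i, name in enumerate(header)
            if name or not skipEmptyColumn}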
Example 2
    def run(self):
        """
        in case chop the whole figure into blocks, swap col_block_index and
            row_block_index to make row first, column 2nd
        """
        from palos.polymorphism.SNP import read_data
        from palos.utils import figureOutDelimiter, PassingData
        delimiter = figureOutDelimiter(self.input_fname)
        header, row_label_ls1, row_label_ls2, data_matrix = read_data(
            self.input_fname, matrix_data_type=float, delimiter=delimiter)
        import numpy
        data_matrix = numpy.array(data_matrix)
        min_value = numpy.min(data_matrix)
        if self.min_value_non_negative and min_value < 0:
            min_value = 0
        max_value = numpy.max(data_matrix)
        font = get_font(self.font_path, font_size=self.font_size)
        Value2Color.special_value2color[-2] = self.super_value_color
        value2color_func = lambda x: Value2Color.value2HSLcolor(
            x, min_value, max_value)
        im_legend = drawContinousLegend(min_value, max_value, self.no_of_ticks,
                                        value2color_func, font)

        fig_fname_prefix = os.path.splitext(self.fig_fname)[0]
        if self.split_legend_and_matrix:
            im_legend.save('%s_legend.png' % fig_fname_prefix)

        no_of_rows, no_of_cols = data_matrix.shape
        passParam = PassingData(
            value2color_func=value2color_func,
            im_legend=im_legend,
            font=font,
            split_legend_and_matrix=self.split_legend_and_matrix,
            no_grid=self.no_grid)

        if no_of_cols <= self.blockColUnit:
            self._drawMatrix(data_matrix, row_label_ls1, header[2:],
                             self.fig_fname, passParam)
        else:  #split into blocks
            # floor division: plain / yields a float under Python 3
            #  and would break range() below
            no_of_col_blocks = no_of_cols // self.blockColUnit + 1
            no_of_row_blocks = no_of_rows // self.blockRowUnit + 1
            for i in range(no_of_col_blocks):
                col_start_index = i * self.blockColUnit
                col_end_index = (i + 1) * self.blockColUnit
                if col_start_index < no_of_cols:
                    for j in range(no_of_row_blocks):
                        row_start_index = j * self.blockRowUnit
                        row_end_index = (j + 1) * self.blockRowUnit
                        if row_start_index < no_of_rows:
                            fig_fname = '%s_%s_%s.png' % (fig_fname_prefix, j,
                                                          i)
                            #row first, column 2nd
                            self._drawMatrix(
                                data_matrix[row_start_index:row_end_index,
                                            col_start_index:col_end_index],
                                row_label_ls1[row_start_index:row_end_index],
                                header[2 + col_start_index:2 + col_end_index],
                                fig_fname, passParam)
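
The block-splitting arithmetic above is worth a standalone check. A toy run of the same index logic (floor division plus one covers the trailing partial block; the start-index guard skips the possible empty extra block):

import numpy

data = numpy.arange(12).reshape(3, 4)
blockRowUnit, blockColUnit = 2, 3
no_of_rows, no_of_cols = data.shape
for i in range(no_of_cols // blockColUnit + 1):
    for j in range(no_of_rows // blockRowUnit + 1):
        rs, cs = j * blockRowUnit, i * blockColUnit
        if rs < no_of_rows and cs < no_of_cols:
            block = data[rs:rs + blockRowUnit, cs:cs + blockColUnit]
            print(j, i, block.shape)  # row block first, column block second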
Example 3
    def run(self):

        if self.debug:
            import pdb
            pdb.set_trace()

        writer = csv.writer(open(self.outputFname, 'w'), delimiter='\t')
        writer.writerow([
            '#sampleID', 'chromosome', 'length',
            'noOfReadsAlignedByLength', 'noOfSingletonsByLength',
            'noOfPairsOnSameContigByLength', 'meanInferInsertSize',
            'noOfPairsOnDifferentContigsByLength'])
        for inputFname in self.inputFnameLs:
            inputFile = utils.openGzipFile(inputFname)
            delimiter = figureOutDelimiter(inputFile)
            reader = csv.reader(inputFile, delimiter=delimiter)
            header = next(reader)
            col_name2index = getColName2IndexFromHeader(header)

            sampleIDIndex = col_name2index.get("readGroup")
            chromosomeIndex = col_name2index.get("firstReferenceName")
            chromosomeLengthIndex = col_name2index.get("firstReferenceLength")

            numberOfReadsIndex = col_name2index.get("numberOfReads")
            numberOfReadsAlignedIndex = col_name2index.get(
                "numberOfReadsAligned")
            numberOfSingletonsMappedIndex = col_name2index.get(
                "numberOfSingletonsMapped")
            numberOfPairsOnSameContigIndex = col_name2index.get(
                "numberOfPairsOnSameContig")
            numberOfPairsOnDifferentContigsIndex = col_name2index.get(
                "numberOfPairsOnDifferentContigs")
            meanInsertSizeIndex = col_name2index.get("meanInsertSize")

            for row in reader:
                sampleID = row[sampleIDIndex]
                chromosome = row[chromosomeIndex]
                chromosomeLength = int(row[chromosomeLengthIndex])

                numberOfReads = float(row[numberOfReadsIndex])
                numberOfReadsAligned = float(row[numberOfReadsAlignedIndex])
                numberOfSingletonsMapped = float(
                    row[numberOfSingletonsMappedIndex])
                numberOfPairsOnSameContig = float(
                    row[numberOfPairsOnSameContigIndex])
                numberOfPairsOnDifferentContigs = float(
                    row[numberOfPairsOnDifferentContigsIndex])
                meanInsertSize = row[meanInsertSizeIndex]

                writer.writerow([
                    sampleID, chromosome, chromosomeLength,
                    numberOfReadsAligned / chromosomeLength,
                    numberOfSingletonsMapped / chromosomeLength,
                    numberOfPairsOnSameContig / chromosomeLength,
                    meanInsertSize,
                    numberOfPairsOnDifferentContigs / chromosomeLength
                ])
        del writer
        sys.stderr.write("Done.\n")
    def run(self):
        """
		2008-5-12
		"""
        if self.debug:
            import pdb
            pdb.set_trace()

        # database connection etc.
        db = self.db_250k

        session = db.session
        session.begin()

        delimiter = figureOutDelimiter(self.inputFname, report=self.report)
        header, strain_acc_list, category_list, data_matrix = read_data(
            self.inputFname, delimiter=delimiter, matrix_data_type=int)

        if self.snp_id_type == 1:
            # 2011-2-27 Translate db_id into chr_pos because the new
            #  StrainXSNP dataset uses db_id to identify SNPs. If the
            #  column ID is already chr_pos, that is fine too.
            new_header = header[:2]
            data_matrix_col_index_to_be_kept = []
            for i in range(2, len(header)):
                snp_id = header[i]
                chr_pos = db.get_chr_pos_given_db_id2chr_pos(snp_id)
                if chr_pos is not None:
                    data_matrix_col_index_to_be_kept.append(i - 2)
                    new_header.append(chr_pos)
            # drop columns whose db_id had no chr_pos mapping
            data_matrix = numpy.array(data_matrix)
            data_matrix = data_matrix[:, data_matrix_col_index_to_be_kept]
            header = new_header

        if self.array_id_2nd_column:
            snpData = SNPData(header=header, strain_acc_list=strain_acc_list,
                category_list=category_list, data_matrix=data_matrix)
        else:
            snpData = SNPData(header=header, strain_acc_list=strain_acc_list,
                data_matrix=data_matrix)  #ignore category_list

        rawSnpsData_ls = SNPData2RawSnpsData_ls(snpData,
                                                need_transposeSNPData=1,
                                                report=self.report)
        chromosomes = [
            rawSnpsData.chromosome for rawSnpsData in rawSnpsData_ls
        ]
        snpsdata.writeRawSnpsDatasToFile(self.outputFname,
                                         rawSnpsData_ls,
                                         chromosomes=chromosomes,
                                         deliminator=',',
                                         withArrayIds=self.array_id_2nd_column)
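
The `i - 2` offset above is the subtle part: `header` carries two leading label columns that `data_matrix` does not. A toy illustration with hypothetical SNP IDs:

import numpy

header = ['ecotype_id', 'array_id', '101', '102', '103']
db_id2chr_pos = {'101': '1_657', '103': '2_4120'}  # '102' has no mapping

new_header = header[:2]
keep = []
for i in range(2, len(header)):
    chr_pos = db_id2chr_pos.get(header[i])
    if chr_pos is not None:
        keep.append(i - 2)  # matrix columns start at header column 2
        new_header.append(chr_pos)

data_matrix = numpy.array([[0, 1, 2], [3, 4, 5]])
print(new_header)            # ['ecotype_id', 'array_id', '1_657', '2_4120']
print(data_matrix[:, keep])  # columns 0 and 2 kept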
    def run(self):
        """
        """

        if self.debug:
            import pdb
            pdb.set_trace()

        inf = utils.openGzipFile(self.inputFname, mode='r')

        reader = csv.reader(inf, delimiter=figureOutDelimiter(inf))
        header = None
        for i in range(self.noOfLinesInHeader):
            if i == 0:
                header = next(reader)
            else:
                next(reader)
        if header is not None:
            colName2Index = getColName2IndexFromHeader(header)

        newHeader = [
            'alignmentID', 'total_base_count', 'sampled_base_count',
            'meanDepth', 'medianDepth', 'modeDepth'
        ]
        inputStatLs = []

        writer = csv.writer(utils.openGzipFile(self.outputFname, mode='w'),
                            delimiter='\t')
        writer.writerow(newHeader)
        counter = 0
        real_counter = 0
        for row in reader:
            counter += 1
            if real_counter < self.maxNumberOfSamplings:
                r = random.random()
                if r <= self.fractionToSample:
                    inputStatLs.append(float(row[self.whichColumn]))
                    real_counter += 1

        meanDepth = numpy.mean(inputStatLs)
        medianDepth = numpy.median(inputStatLs)
        modeDepth = scipy.stats.mode(inputStatLs)[0][0]
        outputRow = [
            self.alignmentID, counter, real_counter, meanDepth, medianDepth,
            modeDepth
        ]
        writer.writerow(outputRow)
        del writer
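
The loop above draws a capped Bernoulli sample, which stops sampling once the cap is reached and therefore favors early rows. If a uniform sample over the whole stream is wanted instead, reservoir sampling (Algorithm R) is the standard alternative; a sketch:

import random

def reservoir_sample(values, k, rng=random):
    # Uniform k-sample over a stream of unknown length (Algorithm R).
    sample = []
    for n, v in enumerate(values):
        if len(sample) < k:
            sample.append(v)
        else:
            j = rng.randint(0, n)  # keep item with probability k/(n+1)
            if j < k:
                sample[j] = v
    return sample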
    def get_isqID2coverage(self, seqCoverageFname, defaultCoverage=None):
        """
        2011-9-2
        """
        sys.stderr.write("Fetching sequence coverage info from %s ..." %
                         (seqCoverageFname))

        reader = csv.reader(open(seqCoverageFname, 'r'),
                            delimiter=figureOutDelimiter(seqCoverageFname))
        isqID2coverage = {}
        header = next(reader)
        for row in reader:
            isqID = int(row[0])
            coverage = float(row[1])
            isqID2coverage[isqID] = coverage
        sys.stderr.write("%s entries.\n" % len(isqID2coverage))
        return isqID2coverage
Esempio n. 7
0
    def trioInconsistentRateFileWalker(cls, inputFname, processFunc=None,
                                       minNoOfTotal=100, run_type=1):
        """
        Only catch exceptions during file opening, not during file reading.
        """
        try:
            reader = csv.reader(open(inputFname),
                                delimiter=figureOutDelimiter(inputFname))
            header = next(reader)
            col_name2index = getColName2IndexFromHeader(header,
                                                        skipEmptyColumn=True)
        except:
            sys.stderr.write('Except type: %s\n' % repr(sys.exc_info()))
            import traceback
            traceback.print_exc()
            return
        inconsistent_rate_index = col_name2index.get("inconsistency")
        if run_type == 1:
            index_of_x_data = col_name2index.get("stopFrequency")
        elif run_type == 2:
            index_of_x_data = col_name2index.get("stop")
        else:
            sys.stderr.write(
                "Unsupported run_type %s in trioInconsistentRateFileWalker().\n"
                % (run_type))
            sys.exit(3)
        index_of_no_of_total = col_name2index.get("no_of_total")
        inconsistent_rate_ls = []
        x_ls = []
        for row in reader:
            # samplingRate is a class attribute (this is a classmethod)
            if 0 <= cls.samplingRate < 1:
                r = random.random()
                if r > cls.samplingRate:
                    continue
            no_of_total = int(float(row[index_of_no_of_total]))
            if no_of_total <= minNoOfTotal:
                continue
            inconsistency = float(row[inconsistent_rate_index])
            inconsistent_rate_ls.append(inconsistency)
            x_data = float(row[index_of_x_data])
            x_ls.append(x_data)
        processFunc(x_ls, inconsistent_rate_ls)
        del reader
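
processFunc receives the two parallel lists collected by the walker. A minimal hypothetical callback, just to show the expected signature (filename below is made up):

import numpy

def reportCorrelation(x_ls, inconsistent_rate_ls):
    # Hypothetical processFunc: report how inconsistency tracks x.
    if len(x_ls) > 1:
        r = numpy.corrcoef(x_ls, inconsistent_rate_ls)[0, 1]
        print('n=%s, pearson r=%.4f' % (len(x_ls), r))

# SomeClass.trioInconsistentRateFileWalker('trio_stats.tsv',
#     processFunc=reportCorrelation, minNoOfTotal=100, run_type=1)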
Example 8
    def __init__(self, path=None, **keywords):
        self.ad = ProcessOptions.process_function_arguments(keywords,
            self.option_default_dict, error_doc=self.__doc__,
            class_to_have_attr=self)
        if not self.path:
            self.path = path

        if self.path and self.file_handle is None:
            self.file_handle = utils.openGzipFile(self.path, mode=self.mode)

        #2013.05.03 alias for easy access
        self.filename = self.path
        self.csvFile = None
        self.isRealCSV = False
        if self.mode == 'r':  #reading mode
            if self.delimiter is None:
                self.delimiter = figureOutDelimiter(self.file_handle)

            if self.delimiter == '\t' or self.delimiter == ',':
                self.csvFile = csv.reader(self.file_handle,
                                          delimiter=self.delimiter)
                self.isRealCSV = True
            else:
                self.csvFile = self.file_handle
                self.isRealCSV = False
        else:  #writing mode
            if not self.delimiter:
                self.delimiter = '\t'
            self.csvFile = csv.writer(self.file_handle,
                                      delimiter=self.delimiter)
            self.isRealCSV = True
            #else:
            #    self.csvFile = self.file_handle
            #    self.isRealCSV = False
        self.col_name2index = None

        self._row = None  # the current row being read
        self.headerPattern = re.compile(r'^[a-zA-Z]')
        #default header pattern: a line beginning with a letter
        self.commentPattern = re.compile(r'^#')  #default: beginning with #
        self.comment_row_list = []
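
A usage sketch for this wrapper. The import path is assumed, and the constructor keywords (path, mode, delimiter, file_handle) and the iteration protocol are inferred from the traverse() examples further down this page:

# from palos import MatrixFile  # import path assumed
reader = MatrixFile(path='input.tsv.gz', mode='r')  # delimiter auto-detected
header = next(reader)
for row in reader:
    pass  # a parsed list when isRealCSV is True, else a raw line
del reader

writer = MatrixFile(path='output.tsv', mode='w')  # defaults to tab-delimited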
    def run(self):
        """
		2008-9-7
		"""
        if self.debug:
            import pdb
            pdb.set_trace()

        delimiter = figureOutDelimiter(self.input_fname, report=self.report)
        header, strain_acc_list, category_list, data_matrix = read_data(
            self.input_fname, delimiter=delimiter)

        snpData = SNPData(header=header, strain_acc_list=strain_acc_list,
            category_list=category_list, data_matrix=data_matrix)
        newSnpData, allele_index2allele_ls = snpData.convert2Binary(
            self.report)

        if self.mapping_fname:  #output allele_index2allele_ls
            self.output_allele2index_ls(snpData, allele_index2allele_ls,
                                        self.mapping_fname)

        newSnpData.tofile(self.output_fname)
    def outputSNPDataInNewCoordinate(self, querySNPDataFname=None,
            querySNPID2NewReferenceCoordinateLs=None,
            newSNPDataOutputFname=None, newSNPDataOutputFormat=1):
        """
        2013.07.03 added argument newSNPDataOutputFormat
            
        2012.10.14
            split out of findSNPPositionOnNewRef()
        """
        sys.stderr.write("Converting querySNPDataFname %s into individual X SNP format, format=%s ... "%\
                        (querySNPDataFname, newSNPDataOutputFormat))
        """
Sample  Geno    SNP
1999010 CC      cs_primer1082_247
1999068 CC      cs_primer1082_247
2000022 CT      cs_primer1082_247
2000064 CT      cs_primer1082_247
2000117 CC      cs_primer1082_247

        """
        inf = utils.openGzipFile(querySNPDataFname)
        reader = csv.reader(inf, delimiter=figureOutDelimiter(inf))
        col_name2index = getColName2IndexFromHeader(next(reader))

        sampleIndex = col_name2index.get("Sample")
        genotypeIndex = col_name2index.get("Geno")
        SNPIDIndex = col_name2index.get("SNP")

        row_id2index = {}
        row_id_ls = []
        col_id_ls = []
        col_id2index = {}
        row_col_index2genotype = {}
        for row in reader:
            sampleID = row[sampleIndex]
            genotype = row[genotypeIndex]
            querySNPID = row[SNPIDIndex]
            if querySNPID in querySNPID2NewReferenceCoordinateLs:
                newRefCoordinateLs = querySNPID2NewReferenceCoordinateLs.get(
                    querySNPID)
                if len(newRefCoordinateLs) == 1:
                    newRefCoordinate = newRefCoordinateLs[0]
                    if newSNPDataOutputFormat == 2:
                        col_id = '%s_%s' % (newRefCoordinate.newChr,
                                            newRefCoordinate.newRefStart)
                    else:
                        col_id = '%s_%s_%s' % (newRefCoordinate.newChr,
                                               newRefCoordinate.newRefStart,
                                               newRefCoordinate.newRefStop)
                    queryStrand = newRefCoordinate.queryStrand
                    if col_id not in col_id2index:
                        col_id2index[col_id] = len(col_id2index)
                        col_id_ls.append(col_id)
                    if sampleID not in row_id2index:
                        row_id2index[sampleID] = len(row_id2index)
                        row_id_ls.append(sampleID)
                    if queryStrand == "-":
                        genotype = SNP.reverseComplement(genotype)
                    row_index = row_id2index[sampleID]
                    col_index = col_id2index[col_id]
                    row_col_index2genotype[(row_index, col_index)] = genotype
                # querySNPIDs that map to multiple new coordinates are skipped
        data_matrix = numpy.zeros(
            [len(row_id_ls), len(col_id2index)], dtype=numpy.int8)

        for row_col_index, genotype in row_col_index2genotype.items():
            row_index, col_index = row_col_index[:2]
            data_matrix[row_index, col_index] = SNP.nt2number[genotype]
        sys.stderr.write("%s individuals X %s SNPs.\n" %
                         (len(row_id_ls), len(col_id_ls)))
        snpData = SNP.SNPData(row_id_ls=row_id_ls,
                              col_id_ls=col_id_ls,
                              data_matrix=data_matrix)
        snpData.tofile(newSNPDataOutputFname)
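
The row_id2index/col_id2index bookkeeping above uses a common idiom: give each new label the next integer index while keeping an ordered label list alongside. In isolation:

col_id2index = {}
col_id_ls = []
for col_id in ['1_100_101', '1_250_251', '1_100_101']:
    if col_id not in col_id2index:
        col_id2index[col_id] = len(col_id2index)  # next free index
        col_id_ls.append(col_id)
print(col_id2index)  # {'1_100_101': 0, '1_250_251': 1}
print(col_id_ls)     # ['1_100_101', '1_250_251']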
Example 11
	def run(self):
		"""
		2012.5.7 new input looks like this (tab-delimited):
			alignmentID     total_base_count        sampled_base_count      meanDepth       medianDepth     modeDepth
			100     1005506 301614  70.0441756682   9.0     8.0
			27     1005506 301614  70.0441756682   9.0     8.0

		2012.4.3
			each input looks like this:
			
sample_id       total   mean    granular_third_quartile granular_median granular_first_quartile %_bases_above_15
553_2_VRC_ref_GA_vs_524 2434923137      8.25    11      9       6       4.4
Total   2434923137      8.25    N/A     N/A     N/A
554_3_Barbados_GA_vs_524        2136011136      7.23    11      8       6       3.5
Total   2136011136      7.23    N/A     N/A     N/A
...

		"""
		
		if self.debug:
			import pdb
			pdb.set_trace()
		session = self.db_main.session
		session.begin()
		
		no_of_total_lines = 0
		for inputFname in self.inputFnameLs:
			reader = csv.reader(open(inputFname), delimiter=figureOutDelimiter(inputFname))
			header = next(reader)
			col_name2index = utils.getColName2IndexFromHeader(header, skipEmptyColumn=True)
			
			sample_id_index = col_name2index.get("alignmentID")
			total_base_count_index = col_name2index.get('total_base_count')
			mean_depth_index = col_name2index.get("meanDepth")
			median_depth_index = col_name2index.get("medianDepth")
			mode_depth_index = col_name2index.get("modeDepth")
			for row in reader:
				sample_id = row[sample_id_index]
				if sample_id=='Total':	#ignore rows with this as sample id
					continue
				alignment_id = int(sample_id.split("_")[0])
				total_base_count = int(row[total_base_count_index])
				mean_depth = float(row[mean_depth_index])
				median_depth = float(row[median_depth_index])
				mode_depth = float(row[mode_depth_index])
				individual_alignment = self.db_main.queryTable(SunsetDB.IndividualAlignment).get(alignment_id)
				individual_alignment.pass_qc_read_base_count = total_base_count	#2012.9.17 no longer trustworthy because CalculateMedianMeanOfInputColumn skips data.
				individual_alignment.mean_depth = mean_depth
				individual_alignment.median_depth = median_depth
				individual_alignment.mode_depth = mode_depth
				session.add(individual_alignment)
				no_of_total_lines += 1
			del reader
		sys.stderr.write("%s alignments in total.\n"%(no_of_total_lines))
		
		if self.logFilename:
			logF = open(self.logFilename, 'w')
			logF.write("%s alignments in total.\n"%(no_of_total_lines))
			del logF
		
		if self.commit:
			session.flush()
			session.commit()
Example 12
    def vcftoolsOutputStatFileWalker(self, inputFname, processFunc=None,
        run_type=1, chrColumnHeader='CHR', minChrLength=1000000,
        chrLengthColumnHeader='chrLength', xColumnHeader="BIN_START",
        valueForNonPositiveYValue=-1):
        """
        2012.10.26 skip sites if chr_cumu_start is not available
        2012.10.25 only catch exceptions during file opening, not file reading
        2012.9.18 chrLengthColumnHeader could be nothing
        """
        sys.stderr.write("walking through %s ..." % (inputFname))
        counter = 0
        chr2xy_ls = self.chr2xy_ls
        try:
            inf = utils.openGzipFile(inputFname)
            delimiter = figureOutDelimiter(inf)
            sys.stderr.write(" delimiter is '%s'  " % (delimiter))
            reader = csv.reader(inf, delimiter=delimiter)
            header = next(reader)
            col_name2index = getColName2IndexFromHeader(header,
                                                        skipEmptyColumn=True)
        except:  #in case something went wrong (e.g. the file is empty)
            sys.stderr.write('Except type: %s\n' % repr(sys.exc_info()))
            import traceback
            traceback.print_exc()
            return

        chr_id_index = col_name2index.get(chrColumnHeader, None)
        if chr_id_index is None:
            chr_id_index = col_name2index.get("CHROM", None)
        if chr_id_index is None:
            chr_id_index = col_name2index.get("CHR", None)
        if chr_id_index is None:
            sys.stderr.write("Error chr_id_index is None.\n")
            sys.exit(3)
        bin_start_index = col_name2index.get(xColumnHeader, None)
        if chrLengthColumnHeader:  #could be nothing
            chrLength_index = col_name2index.get(chrLengthColumnHeader, None)
        else:
            chrLength_index = None
        if self.whichColumnHeader:
            whichColumn = col_name2index.get(self.whichColumnHeader, None)
        else:
            whichColumn = self.whichColumn

        for row in reader:
            if self.samplingRate < 1 and self.samplingRate >= 0:
                r = random.random()
                if r > self.samplingRate:
                    continue
            if chrLength_index is not None:  #index 0 is falsy; test for None
                chrLength = int(row[chrLength_index])
                if chrLength < minChrLength:
                    continue
            chr_id = row[chr_id_index]
            bin_start = int(float(row[bin_start_index]))

            yValue = row[whichColumn]
            yValue = self.handleYValue(yValue)

            if chr_id not in chr2xy_ls:
                chr2xy_ls[chr_id] = [[], []]
            chr_cumu_start = self.chr_id2cumu_start.get(chr_id)
            if chr_cumu_start is None:  #2012.10.26 skip sites
                sys.stderr.write(
                    "Chromosome %s does not have chr_cumu_start.\n" % (chr_id))
                continue
            chr2xy_ls[chr_id][0].append(chr_cumu_start + bin_start + 1)
            chr2xy_ls[chr_id][1].append(yValue)
            counter += 1
        del reader
        inf.close()
        sys.stderr.write("%s data.\n" % (counter))
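
self.chr_id2cumu_start maps each chromosome to its cumulative start on a genome-wide x axis; it is built elsewhere in the class. A sketch of the assumed construction from chromosome sizes:

def build_chr_id2cumu_start(chr_id2size):
    # Lay chromosomes end to end so per-chromosome positions become
    # genome-wide coordinates (assumed construction, for illustration).
    chr_id2cumu_start = {}
    cumu = 0
    for chr_id in sorted(chr_id2size):
        chr_id2cumu_start[chr_id] = cumu
        cumu += chr_id2size[chr_id]
    return chr_id2cumu_start

# build_chr_id2cumu_start({'1': 30000000, '2': 20000000})
# -> {'1': 0, '2': 30000000}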
Example 13
    def run(self):
        """
		2008-12-02
		"""
        if self.debug:
            import pdb
            pdb.set_trace()

        delimiter = figureOutDelimiter(self.input_fname, report=self.report)
        if self.matrix_data_type_int == 2:
            matrix_data_type = float
        else:
            matrix_data_type = int
        header, strain_acc_list, category_list, data_matrix = read_data(
            self.input_fname,
            matrix_data_type=matrix_data_type,
            delimiter=delimiter)

        if self.array_id_2nd_column:
            snpData = SNPData(header=header, strain_acc_list=strain_acc_list,
                category_list=category_list, data_matrix=data_matrix)
        else:
            snpData = SNPData(header=header, strain_acc_list=strain_acc_list,
                data_matrix=data_matrix)  #ignore category_list

        newSnpData, allele_index2allele_ls = snpData.convert2Binary(
            self.report)

        if self.phenotype_fname and self.phenotype_method_id:
            header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
                self.phenotype_fname, turn_into_integer=0)
            phenData = SNPData(
                header=header_phen,
                strain_acc_list=newSnpData.strain_acc_list,
                data_matrix=data_matrix_phen)
            # Row labels come from the SNP matrix because the phenotype
            #  matrix will be re-ordered to match it below.
            phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
                newSnpData.row_id_ls, strain_acc_list_phen,
                phenData.data_matrix)  #tricky: it uses strain_acc_list_phen

            phenotype_col_index = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(
                phenData, set([self.phenotype_method_id]))[0]
            phenotype_label = phenData.col_id_ls[phenotype_col_index]
            phenotype_f = open(
                '%s_%s.pheno' %
                (self.output_fname_prefix, phenotype_label.replace('/', '_')),
                'w')
            for phenotype_value in phenData.data_matrix[:,
                                                        phenotype_col_index]:
                if self.phenotype_is_binary:  #binary and non-binary have different NA designator
                    if numpy.isnan(phenotype_value):
                        phenotype_value = 9
                    else:
                        phenotype_value = int(phenotype_value)
                else:
                    if numpy.isnan(phenotype_value):
                        phenotype_value = -100.0
                phenotype_f.write('%s\n' % phenotype_value)
            del phenotype_f

        genotype_f = open('%s.geno' % self.output_fname_prefix, 'w')
        ind_writer = csv.writer(open('%s.ind' % self.output_fname_prefix, 'w'),
                                delimiter='\t')
        snp_writer = csv.writer(open('%s.snp' % self.output_fname_prefix, 'w'),
                                delimiter='\t')

        #transpose so that SNPs become rows and individuals become columns
        newSnpData = transposeSNPData(newSnpData)

        no_of_rows = len(newSnpData.data_matrix)
        no_of_cols = len(newSnpData.data_matrix[0])
        for i in range(no_of_rows):
            snp_id = newSnpData.row_id_ls[i]
            chr, pos = snp_id.split('_')
            allele1 = allele_index2allele_ls[i][0]  #major allele
            allele2 = allele_index2allele_ls[i][1]  #minor allele
            snp_writer.writerow([snp_id, chr, 0.0, pos, allele1, allele2])
            geno_line = ''
            for j in range(no_of_cols):
                if i == 0:  #write out the accessions
                    ind_writer.writerow([newSnpData.col_id_ls[j], 'U', 'Case'])
                allele = newSnpData.data_matrix[i][j]
                if allele == 0:
                    geno_line += '0'
                elif allele == 1:
                    geno_line += '2'
                else:
                    geno_line += '9'
            geno_line += '\n'
            genotype_f.write(geno_line)

        del genotype_f, ind_writer, snp_writer
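
The 0/2/9 characters written per individual follow EIGENSTRAT .geno coding (copies of the first allele, 9 for missing); only the homozygous values 0 and 2 appear here because the matrix was converted to binary upstream. The if/elif chain condenses to a lookup:

GENO_CHAR = {0: '0', 1: '2'}

def encode_geno_row(allele_row):
    # 0 -> '0' (homozygous allele1), 1 -> '2' (homozygous allele2),
    # anything else -> '9' (missing).
    return ''.join(GENO_CHAR.get(allele, '9') for allele in allele_row) + '\n'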
    def traverse(self):
        """
        """
        newHeader = []
        key2dataLs = {}
        #key is the keyColumn,
        #  dataLs corresponds to the sum of each column from valueColumnLs
        delimiter = None
        for inputFname in self.inputFnameLs:
            if not os.path.isfile(inputFname):
                if self.exitNonZeroIfAnyInputFileInexistent:
                    sys.exit(3)
                else:
                    continue
            reader = None
            try:
                inputFile = utils.openGzipFile(inputFname)
                delimiter = figureOutDelimiter(inputFile)
                reader = MatrixFile(file_handle=inputFile, delimiter=delimiter)
            except:
                logging.error(f'Except type: {sys.exc_info()}')
                import traceback
                traceback.print_exc()

            try:
                header = next(reader)
                self.handleNewHeader(header,
                                     newHeader,
                                     self.keyColumnLs,
                                     self.valueColumnLs,
                                     keyColumnSet=self.keyColumnSet)
                if self.noHeader:
                    inputFile.seek(0)
                    reader = MatrixFile(file_handle=inputFile,
                                        delimiter=delimiter)
            except:
                logging.error(f'Except type: {sys.exc_info()}')
                import traceback
                traceback.print_exc()

            if reader is not None:
                for row in reader:
                    try:
                        self.handleValueColumns(
                            row,
                            key2dataLs=key2dataLs,
                            keyColumnLs=self.keyColumnLs,
                            valueColumnLs=self.valueColumnLs)
                    except:
                        #in case something went wrong (e.g. a malformed row)
                        logging.error(f'Ignore this row: {row}.')
                        logging.error(f'Except type: {sys.exc_info()}')
                        import traceback
                        traceback.print_exc()
                del reader
        if self.noHeader:
            newHeader = None
        returnData = PassingData(key2dataLs=key2dataLs,
                                 delimiter=delimiter,
                                 header=newHeader)
        return returnData
    def traverse(self):
        """
        """
        newHeader = []
        key2dataLs = {}
        #key is the keyColumn,
        #  dataLs corresponds to the sum of each column from valueColumnLs
        noOfDataColumnsFromPriorFiles = 0
        for inputFname in self.inputFnameLs:
            if not os.path.isfile(inputFname):
                if self.exitNonZeroIfAnyInputFileInexistent:
                    logging.error(f'{inputFname} does not exist.')
                    sys.exit(3)
                else:
                    continue
            reader = None
            try:
                inputFile = utils.openGzipFile(inputFname)
                if self.inputDelimiter is None or self.inputDelimiter == '':
                    self.inputDelimiter = figureOutDelimiter(inputFile)
                reader = MatrixFile(file_handle=inputFile,
                                    delimiter=self.inputDelimiter)
            except:
                logging.error(f'Except type: {sys.exc_info()}')
                import traceback
                traceback.print_exc()

            valueColumnLs = []
            try:
                header = next(reader)
                self.handleNewHeader(header,
                                     newHeader,
                                     self.keyColumnLs,
                                     valueColumnLs,
                                     keyColumnSet=self.keyColumnSet)
                if self.noHeader:
                    inputFile.seek(0)
                    reader = MatrixFile(file_handle=inputFile,
                                        delimiter=self.inputDelimiter)
            except:
                #in case something went wrong (e.g. the file is empty)
                logging.error(f'Except type: {sys.exc_info()}')
                import traceback
                traceback.print_exc()

            if reader is not None and valueColumnLs:
                visitedKeySet = set()
                for row in reader:
                    try:
                        self.handleValueColumns(row,
                                                key2dataLs=key2dataLs,
                                                keyColumnLs=self.keyColumnLs,
                                                valueColumnLs=valueColumnLs,
                                                noOfDataColumnsFromPriorFiles=
                                                noOfDataColumnsFromPriorFiles,
                                                visitedKeySet=visitedKeySet)
                    except:
                        logging.error(f'Ignore this row: {row}.')
                        logging.error(f'Except type: {sys.exc_info()}')
                        import traceback
                        traceback.print_exc()
                del reader
                #append empty data for keys missing from the current file
                totalKeySet = set(key2dataLs.keys())
                unvisitedKeySet = totalKeySet - visitedKeySet
                for key in unvisitedKeySet:
                    for i in valueColumnLs:
                        key2dataLs[key].append('')
            noOfDataColumnsFromPriorFiles += len(valueColumnLs)
        if self.noHeader:
            newHeader = None
        returnData = PassingData(key2dataLs=key2dataLs,
                                 delimiter=self.inputDelimiter,
                                 header=newHeader)
        return returnData
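
When a later input file lacks a key collected earlier, the unvisitedKeySet loop pads with empty strings so every key keeps one cell per data column. A toy trace with hypothetical keys and column indices:

# After file 1:
key2dataLs = {('chr1',): ['1.0'], ('chr2',): ['2.0']}
# File 2 contributes one value column and only mentions chr1:
key2dataLs[('chr1',)].append('7.5')
visitedKeySet = {('chr1',)}
valueColumnLs = [3]  # hypothetical column index; one data column
for key in set(key2dataLs) - visitedKeySet:
    for _ in valueColumnLs:
        key2dataLs[key].append('')
print(key2dataLs)
# {('chr1',): ['1.0', '7.5'], ('chr2',): ['2.0', '']}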
Example 16
    def run(self):
        """
		2012.4.3
			each input has this as its header:
			
			['alignmentID', 'total_no_of_reads', 'perc_reads_mapped', 'perc_duplicates', 'perc_paired', 'perc_properly_paired', \
				'perc_both_mates_mapped', 'perc_singletons',\
				'perc_mapped_to_diff_chrs']
		"""

        if self.debug:
            import pdb
            pdb.set_trace()
        session = self.db_main.session
        session.begin()

        no_of_total_lines = 0
        for inputFname in self.inputFnameLs:
            reader = csv.reader(open(inputFname),
                                delimiter=figureOutDelimiter(inputFname))
            header = next(reader)
            colName2Index = utils.getColName2IndexFromHeader(
                header, skipEmptyColumn=True)
            alignment_id_index = colName2Index.get('alignmentID')
            total_no_of_reads_index = colName2Index.get('total_no_of_reads')
            perc_secondary_index = colName2Index.get("perc_secondary")
            perc_supplementary_index = colName2Index.get("perc_supplementary")
            perc_reads_mapped_index = colName2Index.get("perc_reads_mapped")
            perc_duplicates_index = colName2Index.get("perc_duplicates")
            perc_paired_index = colName2Index.get("perc_paired")
            perc_properly_paired_index = colName2Index.get(
                "perc_properly_paired")
            perc_both_mates_mapped_index = colName2Index.get(
                "perc_both_mates_mapped")
            perc_singletons_index = colName2Index.get("perc_singletons")
            perc_mapped_to_diff_chrs_index = colName2Index.get(
                "perc_mapped_to_diff_chrs")
            perc_mapq5_mapped_to_diff_chrs_index = colName2Index.get(
                "perc_mapq5_mapped_to_diff_chrs")
            for row in reader:
                alignmentID = int(row[alignment_id_index])
                alignment = self.db_main.queryTable(
                    SunsetDB.IndividualAlignment).get(alignmentID)

                alignment.perc_reads_mapped = float(
                    row[perc_reads_mapped_index])
                alignment.perc_secondary = float(row[perc_secondary_index])
                alignment.perc_supplementary = float(
                    row[perc_supplementary_index])
                alignment.perc_duplicates = float(row[perc_duplicates_index])
                alignment.perc_paired = float(row[perc_paired_index])
                alignment.perc_properly_paired = float(
                    row[perc_properly_paired_index])
                alignment.perc_both_mates_mapped = float(
                    row[perc_both_mates_mapped_index])
                alignment.perc_singletons = float(row[perc_singletons_index])
                alignment.perc_mapped_to_diff_chrs = float(
                    row[perc_mapped_to_diff_chrs_index])
                alignment.perc_mapq5_mapped_to_diff_chrs = float(
                    row[perc_mapq5_mapped_to_diff_chrs_index])
                alignment.total_no_of_reads = int(
                    float(row[total_no_of_reads_index]))
                session.add(alignment)
                no_of_total_lines += 1
            del reader
        sys.stderr.write("%s alignments in total.\n" % (no_of_total_lines))

        if self.logFilename:
            logF = open(self.logFilename, 'w')
            logF.write("%s alignments in total.\n" % (no_of_total_lines))
            del logF

        if self.commit:
            session.flush()
            session.commit()