# Example 1
    def RunInfo(self):
        """
        Write general information about the run to the run-info output file.

        The file receives the version string (Info.versionInfo), the current
        time (time.ctime()) and two placeholder lines for the pre- and
        post-filter variant counts.

        NOTE(review): the earlier description also mentioned the configuration
        used, but nothing in this method writes it - confirm whether that is
        still intended.

        Output: the file named by self.file_names['runinfo'], created inside
                self.outputDirName.

        Returns:
            True once the file has been written.
        """
        outputDirName = self.outputDirName
        outputFileName = self.file_names['runinfo']
        try:
            ofRunInfo = open(outputDirName + "/" + outputFileName, 'w+')
        except IOError:
            # abortWithMessage is defined elsewhere; presumably it ends the run - confirm.
            abortWithMessage("Error opening output file {0}/{1}".format(outputDirName, outputFileName))
        # Using the handle as a context manager guarantees it is flushed and
        # closed, even if one of the writes below fails.
        with ofRunInfo:
            # =========================
            # run_info.txt
            ofRunInfo.write(Info.versionInfo + "\n")
            ofRunInfo.write("{0}\n\n".format(time.ctime()))
            # TODO: total runtime? or runtime breakdown per segment of the program?
            ofRunInfo.write("Variants Pre-filter: \n")
            ofRunInfo.write("        Post-filter: \n")
        return True
# Example 2
    def VCF(self):
        """
        Write a VCF-style table of the variant locations.

        The rows of self.data are grouped by (chr, pos, ref, alt); collapseVCF
        (defined elsewhere) produces the INFO value for each group, and that
        value is merged back onto the rows. The result is reduced to the eight
        VCF columns, missing values are replaced by '.', and the table is
        written tab-separated with the official VCF column names as header row.

        NOTE(review): only the column header row is written here; no '##'
        meta-information lines are emitted by this method.

        Output: variant_locations.vcf (the file named by self.file_names['vcf'])

        Returns:
            True once the file has been written.
        """
        outputDirName = self.outputDirName
        outputFileName = self.file_names['vcf']
        varDF = self.data
        try:
            ofVariantVCF = open(outputDirName + "/" + outputFileName, 'w+')
        except IOError:
            abortWithMessage("Error opening output file {0}/{1}".format(outputDirName, outputFileName))
        # The context manager closes the handle whether or not the writing succeeds.
        with ofVariantVCF:
            # VCF header to stream
            grouped = varDF.groupby(['chr', 'pos', 'ref', 'alt'])
            # the relevant fields for a vcf file
            vcf_fields = ['chr', 'pos', 'id', 'ref', 'alt', 'qual', 'filter', 'INFO']
            # make the INFO column for this set of samples
            outSeries = grouped.apply(collapseVCF)
            outDF = pd.DataFrame(outSeries, columns=['INFO'])
            # add the info column to the original dataframe
            varDF = varDF.merge(outDF, left_on=['chr', 'pos', 'ref', 'alt'], right_index=True)
            # drop the columns that are unrelated to vcf format, while
            # reordering columns into proper vcf order
            out = varDF.reindex(columns=vcf_fields).fillna('.')
            official_vcf_fields = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']
            # rename columns to comply with official vcf standards
            out.columns = pd.Index(official_vcf_fields)
            # 'sparsify' is not a documented to_csv() parameter (it belongs to
            # to_string()/to_html()), so it is no longer passed: with the index
            # suppressed it could not affect the output anyway.
            out.to_csv(ofVariantVCF, sep='\t', na_rep='?', index=False, header=True)
        print("\t{0}: {1} rows".format(ofVariantVCF.name, len(out)))
        return True
# Example 3
def collapseVariantDetails(group):
    """
    Pandas operations to support the VariantDetails family of functions.
    Collapses variant rows that share the same contig, position, ref allele,
    alt allele, and feature into a single row.

    Input: the rows of one group, as handed over by groupby(...).apply();
           it must support .keys() and column lookup by name.
    Output: a pandas Series holding one collapsed value per column.

    Collapsing rules, per column:
      * 'vf', 'dp', 'sample', 'source': every row's value is kept, converted
        to str and joined with ", ".
      * any other column with exactly one distinct value: that value itself.
      * any other column with several distinct values: the distinct values
        that are truthy and not '?', joined with "/" in the order unique()
        returns them (no sorting is applied).
    """
    # the only columns that need to be concatenated;
    # the others are uniquified and "always" yield 1 value
    concatenated = ('vf', 'dp', 'sample', 'source')
    columns = list(group.keys().values)
    outvals = []
    for column in columns:
        if column in concatenated:
            outvals.append(", ".join(str(i) for i in group[column].values))
            continue
        distinct = group[column].unique()
        if len(distinct) == 1:
            # only 1 value; extract it since there's no need to preserve the array type
            outvals.append(distinct[0])
        else:
            # This should be pretty rare, such as inputting two files for the
            # same sample that were annotated for functional consequence by two
            # different tools, or a mixture of annotated and unannotated vcf
            # files. Placeholder '?' entries and empty values are left out.
            outvals.append("/".join(filter(bool, [x for x in distinct if x != '?'])))
    try:
        outD = pd.Series(dict(zip(columns, outvals)))
    except ValueError:
        abortWithMessage("Could not collapse variant {0} {1} {2}/{3} into a single row. \n\t\tThis mutation may have inconsistent effect and/or functional consequence values in different VCF files.".format( str(group['chr'].unique()[0]), str(group['pos'].unique()[0]), str(group['ref'].unique()[0]), str(group['alt'].unique()[0]) ))
    return outD
# Example 4
    def Feature_and_MutationXSampleVAF(self):
        """
        Write a mutation-by-sample table of variant allele frequencies (VAF).

        Sample names populate the table header and feature names populate the
        first column. Chromosome, position, ref, and alt populate the next
        columns. The table values are mutation VAF; cells without a value are
        written as 0.

        Output: feature_and_mutation_by_sample_vaf.xlsx
                (the file named by self.file_names['mutXsampVAF'])

        Returns:
            True once the workbook has been written.
        """
        outputDirName = self.outputDirName
        outputFileName = self.file_names['mutXsampVAF']
        varDF = self.data
        outputPath = str(outputDirName) + "/" + outputFileName

        # apply weighted average to collapse multiple VAFs
        # NOTE(review): wavg is defined elsewhere; presumably wavg('vf','dp')
        # returns a function averaging 'vf' weighted by 'dp' - confirm.
        outDF = varDF.groupby(['feature', 'chr', 'pos', 'ref', 'alt', 'sample']).apply(wavg('vf', 'dp'))
        outDF = pd.DataFrame(outDF)
        # collapse the hierarchical index and pivot; values=0 selects the
        # column labelled 0 that the DataFrame conversion above is expected
        # to produce from the unnamed apply() result
        outDF = pd.pivot_table(outDF.reset_index(), values=0, index=['feature', 'chr', 'pos', 'ref', 'alt'], columns='sample')

        try:
            # The context manager saves and closes the workbook; without that
            # step nothing is guaranteed to reach the disk.
            with pd.ExcelWriter(outputPath) as ofFeature_and_MutationXSample:
                outDF.to_excel(ofFeature_and_MutationXSample, sheet_name='Feat. + Mutation by Sample VAF', na_rep=0, index=True)
        except IOError:
            abortWithMessage("Error opening output file {0}/{1}".format(outputDirName, outputFileName))
        print("\t{0}: {1} rows".format(outputPath, len(outDF)))
        return True
# Example 5
    def VariantDetails(self):
        """
        Write all information about each mutation,
        combining all mutations (irrespective of in how many samples they appear)
        into a single row.

        Note: chrom, position, ref, alt, and feature are all required to uniquely identify a mutation
              indels may have the same chr, pos, but different ref/alt

        Output: variant_details.txt, variant_details.xls
                (the files named by self.file_names['txt'] and self.file_names['xls'])
        Note: switching the pandas ExcelWriter file extension to xlsx instead of xls requires openpyxl
              If openpyxl is unavailable, the xlwt library can write xls

        A format is written only when it is listed in self.config.outputFormats
        and is not already listed in self.attempted_formats.

        Returns:
            True, whether or not any file was written.
        """
        outputDirName = self.outputDirName
        varDF = self.data
        # requested by the user AND not already run
        # and not os.path.exists(outputDirName + "/variant_details.txt")
        txt = 'txt' in self.config.outputFormats and 'txt' not in self.attempted_formats
        # and not os.path.exists(outputDirName + "/variant_details.xls")
        xls = 'xls' in self.config.outputFormats and 'xls' not in self.attempted_formats

        # Open the outputs before the grouping work so an unwritable
        # destination is reported first.
        try:
            if txt:
                outputFileName = self.file_names['txt']
                ofVariantDetailsTXT = open(outputDirName + "/" + outputFileName, 'w+')
            if xls:
                outputFileName = self.file_names['xls']
                ofVariantDetailsXLS = pd.ExcelWriter(str(outputDirName) + '/' + outputFileName)
        except IOError:
            abortWithMessage("Error opening output file {0}/{1}".format(outputDirName, outputFileName))

        if txt or xls:
            # Group by (chr, pos, ref, alt, feature)
            grouped = varDF.groupby(['chr', 'pos', 'ref', 'alt', 'feature'])
            # apply collapsing function to each pandas group
            outSeries = grouped.apply(collapseVariantDetails)
            out = pd.DataFrame(outSeries.reset_index(drop=True), columns=varDF.columns)

        if txt:
            # print the new, collapsed dataframe to a file; the context
            # manager closes the handle afterwards
            with ofVariantDetailsTXT:
                mySort(out, ['feature', 'pos']).to_csv(ofVariantDetailsTXT, sep='\t', na_rep='?', index=False)
            print("\t{0}: {1} rows".format(ofVariantDetailsTXT.name, len(out)))
        if xls:
            # print the new, collapsed dataframe to a workbook; close() is what
            # actually saves it
            mySort(out, ['feature', 'pos']).to_excel(ofVariantDetailsXLS, sheet_name='Variant Details', na_rep='?', index=False)
            ofVariantDetailsXLS.close()
            print("\t{0}: {1} rows".format(str(outputDirName + '/' + outputFileName), len(out)))

        return True
# Example 6
    def LongVariantDetails(self):
        """
        Similar to VariantDetails, but writes each instance
        of a mutation to a new row.
        Each mutation is written once per source instead of combining
        reoccurring mutations in to 1 unique row.

        Output: long_variant_details.txt, long_variant_details.xls
                (the files named by self.file_names['longtxt'] and
                self.file_names['longxls'])
        Note: switching the pandas ExcelWriter file extension to xlsx instead
        of xls requires openpyxl

        A format is written only when it is listed in self.config.outputFormats
        and is not already listed in self.attempted_formats.

        Returns:
            True, whether or not any file was written.
        """
        outputDirName = self.outputDirName
        varDF = self.data

        # requested by the user AND not already run
        # and not os.path.exists(outputDirName + "/long_variant_details.txt")
        longtxt = 'longtxt' in self.config.outputFormats and 'longtxt' not in self.attempted_formats
        # and not os.path.exists(outputDirName + "/long_variant_details.xls")
        longxls = 'longxls' in self.config.outputFormats and 'longxls' not in self.attempted_formats

        try:
            if longtxt:
                outputFileName = self.file_names['longtxt']
                # 'with' closes the text file once the table has been written
                with open(outputDirName + "/" + outputFileName, 'w+') as ofLongVariantDetailsTXT:
                    mySort(varDF, ['feature', 'pos']).to_csv(ofLongVariantDetailsTXT, sep='\t', na_rep='?', index=False)
                print("\t{0}: {1} rows".format(str(outputDirName + '/' + outputFileName), len(varDF)))
            if longxls:
                outputFileName = self.file_names['longxls']
                # 'with' saves and closes the workbook
                with pd.ExcelWriter(str(outputDirName) + '/' + outputFileName) as ofLongVariantDetailsXLS:
                    mySort(varDF, ['feature', 'pos']).to_excel(ofLongVariantDetailsXLS, sheet_name='Long Variant Details', na_rep='?', index=False)
                print("\t{0}: {1} rows".format(str(outputDirName + '/' + outputFileName), len(varDF)))
        except IOError:
            abortWithMessage("Error opening output file {0}/{1}".format(outputDirName, outputFileName))
        return True
# Example 7
    def Counts(self):
        """
        Write counts per feature as a tab-separated table.

        One row per feature of self.data, with the columns:
            Hits          - count of 'sample' entries in the feature
            WeightedHits  - sum of 'vf' over the feature
            AverageWeight - WeightedHits divided by Hits
            UniqueHits    - number of distinct 'pos' values
            NumSamples    - number of distinct 'sample' values
        The index column is labelled 'FeatureName'.

        Output: counts.txt (the file named by self.file_names['counts'])

        Returns:
            True once the file has been written.
        """
        outputDirName = self.outputDirName
        outputFileName = self.file_names['counts']
        varDF = self.data
        try:
            ofCounts = open(outputDirName + "/" + outputFileName, 'w+')
        except IOError:
            abortWithMessage("Error opening output file {0}/{1}".format(outputDirName, outputFileName))
        # The context manager closes the handle whether or not the writing succeeds.
        with ofCounts:
            grouped = varDF.groupby('feature')

            numHits = grouped['sample'].count()
            numHits.name = 'Hits'
            numHits = pd.DataFrame(numHits)

            weightedHits = grouped['vf'].apply(np.sum)
            weightedHits.name = 'WeightedHits'
            weightedHits = pd.DataFrame(weightedHits)

            avgWeight = weightedHits.div(numHits.Hits, axis='index')
            # different kind of column re-name required for division result
            avgWeight = avgWeight.rename(columns={'WeightedHits': 'AverageWeight'})
            avgWeight = pd.DataFrame(avgWeight)

            uniqueHits = grouped['pos'].nunique()
            uniqueHits.name = 'UniqueHits'
            uniqueHits = pd.DataFrame(uniqueHits)

            numSamples = grouped['sample'].nunique()
            numSamples.name = 'NumSamples'
            numSamples = pd.DataFrame(numSamples)

            # merge the 5 1-column dataframes into a single, 5-column dataframe.
            # (6 columns including the feature name column)
            out = numHits.join(weightedHits).join(avgWeight).join(uniqueHits).join(numSamples)
            # change capitalization for consistency
            out.index.names = ['FeatureName']

            mySort(out).to_csv(ofCounts, sep='\t', na_rep='?', index=True)
        print("\t{0}: {1} rows".format(ofCounts.name, len(out)))
        return True
# Example 8
    def VariantBed(self):
        """
        Write a bed file of the variant locations.

        The rows of self.data are grouped by (chr, pos, ref, alt) and each
        group is collapsed by collapseVariantBed (defined elsewhere). The
        result is sorted by ('chr', 'start') and written tab-separated without
        header or index.

        Output: variant_locations.bed (the file named by self.file_names['bed'])

        Returns:
            True once the file has been written.
        """
        outputDirName = self.outputDirName
        outputFileName = self.file_names['bed']
        varDF = self.data
        try:
            ofVariantBeds = open(outputDirName + "/" + outputFileName, 'w+')
        except IOError:
            abortWithMessage("Error opening output file {0}/{1}".format(outputDirName, outputFileName))
        # The context manager closes the handle whether or not the writing succeeds.
        with ofVariantBeds:
            grouped = varDF.groupby(['chr', 'pos', 'ref', 'alt'])
            outSeries = grouped.apply(collapseVariantBed)
            out = pd.DataFrame(outSeries.reset_index(drop=True))
            mySort(out, ['chr', 'start']).to_csv(ofVariantBeds, sep='\t', na_rep='?', index=False, header=False)
        print("\t{0}: {1} rows".format(ofVariantBeds.name, len(out)))
        return True
# Example 9
    def FeatureXSample(self):
        """
        Write counts per feature per sample.

        Sample names populate the table header and feature names populate the
        first column. The count per sample per feature are the table values;
        cells without a value are written as 0.

        Output: feature_by_sample.xlsx
                (the file named by self.file_names['featXsamp'])

        Returns:
            True once the workbook has been written.
        """
        outputDirName = self.outputDirName
        outputFileName = self.file_names['featXsamp']
        varDF = self.data
        outputPath = str(outputDirName) + "/" + outputFileName

        # number of rows for every (feature, sample) pair
        groupedDF = pd.DataFrame(varDF.groupby(['feature', 'sample']).apply(len))
        # move the sample level (level 1) into the columns ...
        outDF = groupedDF.stack().unstack(1)
        # ... and drop the leftover index level introduced by stack()
        outDF.index = outDF.index.droplevel(1)

        try:
            # The context manager saves and closes the workbook; without that
            # step nothing is guaranteed to reach the disk.
            with pd.ExcelWriter(outputPath) as ofFeatureXSample:
                outDF.to_excel(ofFeatureXSample, sheet_name='Feature by Sample', na_rep=0, index=True)
        except IOError:
            abortWithMessage("Error opening output file {0}/{1}".format(outputDirName, outputFileName))
        print("\t{0}: {1} rows".format(outputPath, len(outDF)))
        return True
# Example 10
    def Feature_and_MutationXSample(self):
        """
        Write counts per mutation per sample.

        Sample names populate the table header and feature names populate the
        first column. Chromosome, position, ref, and alt populate the next
        columns. Each cell holds the number of rows of self.data for that
        mutation and sample; cells without a value are written as 0.

        NOTE(review): the earlier description called the values boolean
        (1 present / 0 missing); the code writes the group size, which is only
        0/1 when a mutation occurs at most once per sample - confirm.

        Output: feature_and_mutation_by_sample.xlsx
                (the file named by self.file_names['mutXsamp'])

        Returns:
            True once the workbook has been written.
        """
        outputDirName = self.outputDirName
        outputFileName = self.file_names['mutXsamp']
        varDF = self.data
        outputPath = str(outputDirName) + "/" + outputFileName

        # number of rows for every (feature, chr, pos, ref, alt, sample) combination
        groupedDF = pd.DataFrame(varDF.groupby(['feature', 'chr', 'pos', 'ref', 'alt', 'sample']).apply(len))
        # move the sample level (level 5) into the columns ...
        outDF = groupedDF.stack().unstack(5)
        # ... and drop the leftover index level introduced by stack()
        outDF.index = outDF.index.droplevel(5)

        try:
            # The context manager saves and closes the workbook; without that
            # step nothing is guaranteed to reach the disk.
            with pd.ExcelWriter(outputPath) as ofFeature_and_MutationXSample:
                outDF.to_excel(ofFeature_and_MutationXSample, sheet_name='Feature and Mutation by Sample', na_rep=0, index=True)
        except IOError:
            abortWithMessage("Error opening output file {0}/{1}".format(outputDirName, outputFileName))
        print("\t{0}: {1} rows".format(outputPath, len(outDF)))
        return True