Example #1
0
def sashmi_plot_list(bamdir,eventsToVisualizeFilename,PSIFilename,events=None):
    import gene_associations
    gene_to_symbol = gene_associations.getGeneToUid(species,('hide','Ensembl-Symbol'))
    import OBO_import; symbol_to_gene = OBO_import.swapKeyValues(gene_to_symbol)

    if events==None:
        splicing_events,expandedSearch = importSplicingEventsToVisualize(eventsToVisualizeFilename)
    else:
        ### Replace any ":" from the input events
        #for i in range(len(events)): events[i] = string.replace(events[i],':','__')
        expandedSearch = True
        
        for i in range(len(events)):
            gene = string.split(events[i],'__')[0]
            if gene in gene_to_symbol:
                symbol = gene_to_symbol[gene][0]
            elif 'ENS' not in gene or 'G0000' in gene:
                if gene in symbol_to_gene:
                    ensID = symbol_to_gene[gene][0]
                    symbol = gene
                    events[i] = ensID ### translate this ID to an Ensembl gene ID for propper SashimiPlot lookup
        splicing_events = events ### optionally get from supplied variable

    if len(splicing_events)==0:
        print eventsToVisualizeFilename
        forceNoCompatibleEventsInFile
    
    print 'Exporting plots',
    
    ### Determine Groups for Coloring
    groups_file = 'None'
    dir_list = unique.read_directory(root_dir+'/ExpressionInput')

    for file in dir_list:
         if 'groups.' in file:
            groups_file = root_dir+'/ExpressionInput/'+file

    if groups_file != None:
        try:
            import ExpressionBuilder
            sample_group_db = ExpressionBuilder.simplerGroupImport(groups_file)
            groups=[]
            for sample in sample_group_db:
                if sample_group_db[sample] not in groups:
                    groups.append(sample_group_db[sample]) ### create an ordered list of unique group
        except Exception:
            groups = ['None']
            #print traceback.format_exc()
            pass

    processed_events = formatAndSubmitSplicingEventsToSashimiPlot(PSIFilename, bamdir, splicing_events, sample_group_db, groups, False)
    mopup_events = getMopUpEvents(splicing_events, processed_events)

    ### Do the same for supplied gene queries or junctions that didn't map above using the gene expression values as a guide
    #print len(splicing_events),len(processed_events),len(mopup_events)
    processed_events = formatAndSubmitSplicingEventsToSashimiPlot(steady_state_exp_file,bamdir,mopup_events,sample_group_db,groups,expandedSearch)
    if len(processed_events)>0:
        mopup_events = getMopUpEvents(mopup_events, processed_events)
        processed_events = formatAndSubmitSplicingEventsToSashimiPlot(PSIFilename, bamdir, mopup_events, sample_group_db, groups, True)
    return gene_to_symbol
Example #2
0
def filterRows(input_file,output_file,filterDB=None,logData=False):
    orderlst={}
    counter=[]
    export_object = open(output_file,'w')
    firstLine = True
    Flag=0;
    species="Hs"
    import OBO_import; import ExpressionBuilder
    gene_to_symbol_db = ExpressionBuilder.importGeneAnnotations(species)
    symbol_to_gene = OBO_import.swapKeyValues(gene_to_symbol_db)
    
    for line in open(input_file,'rU').xreadlines():
        flag1 = 0
        data = cleanUpLine(line)
        values = string.split(data,'\t')
    
        if firstLine:
            firstLine = False
            if Flag==0:
                export_object.write(line)
        else:
            try: symbolID = gene_to_symbol_db[values[0]][0]
            except Exception: symbolID = values[0]
            if symbolID in filterDB:
                counter=[index for index, value in enumerate(filterDB) if value == symbolID]
                for it in range(0,len(counter)):
                    orderlst[counter[it]]=line
    try:
        for i in range(0,len(orderlst)):
            export_object.write(orderlst[i])
    except Exception:
        print i,filterDB[i]

    export_object.close()
    print 'Filtered rows printed to:',output_file
Example #3
0
def matrixImport(filename):
    """Import a tab-delimited matrix and regroup every row's values by sample group.

    The first line is the header.  If it contains ':' each sample column is
    expected to read 'group:sample'; otherwise the sample -> group mapping is
    loaded from a 'groups.' file in the 'ExpressionInput' directory derived
    from the part of `filename` preceding 'AltResults'.

    Returns (matrix, original_data):
      matrix -- row key -> list (one entry per group, in first-seen group
          order) of lists of floats.  A cell that is exactly '0' is stored
          as '' (missing value); cells that are not numeric are dropped.
      original_data -- row key -> the original line, newline terminated,
          plus the header line under the key 'header'.
    """
    matrix={}
    original_data={}
    group_db={} ### group name -> column indexes into the split row
    groups=[] ### group names in first-seen order
    headerRow=True
    ### 'rU' was removed in Python 3.11, where the default text mode already
    ### applies universal newlines
    try: input_file = open(filename,'rU')
    except ValueError: input_file = open(filename,'r')
    try:
        for line in input_file:
            data = line.rstrip()
            values = data.split('\t')
            if headerRow:
                headerRow = False
                original_data['header'] = line
                if ':' in data:
                    sample_groups=[]
                    for column in values[1:]:
                        g,s = column.split(':') ### exactly one ':' per sample column
                        sample_groups.append(g)
                else:
                    import ExpressionBuilder
                    search_dir = filename.split('AltResults')[0]+'ExpressionInput'
                    for fname in unique.read_directory(search_dir):
                        if 'groups.' in fname:
                            sample_group_db = ExpressionBuilder.simplerGroupImport(search_dir+'/'+fname)
                    sample_groups = [sample_group_db[s] for s in values[1:]]
                ### Column 0 holds the row key, so sample columns are numbered from 1
                ### in BOTH header formats (the groups-file branch used to start at 0,
                ### shifting every sample one column to the left)
                for index, g in enumerate(sample_groups,1):
                    if g not in group_db:
                        group_db[g] = []
                        groups.append(g)
                    group_db[g].append(index)
            else:
                key = values[0]
                grouped_floats=[]
                for g in groups:
                    gvalues_list=[]
                    for i in group_db[g]:
                        if values[i] == '0':
                            gvalues_list.append('') ### a literal '0' marks a missing value
                        else:
                            try: gvalues_list.append(float(values[i]))
                            except ValueError: pass ### non-numeric cells are skipped
                    grouped_floats.append(gvalues_list)
                matrix[key] = grouped_floats
                if '\n' not in line:
                    line+='\n' ### last line of a file without a trailing newline
                original_data[key] = line
    finally:
        input_file.close()
    return matrix,original_data
Example #4
0
def importGroups(fn):
    """Read a tab-delimited groups file into a group name -> sample list mapping.

    Every non-empty line must hold exactly three tab-separated fields:
    sample filename, group number and group name (a ValueError is raised
    otherwise).  Samples are appended in file order.

    Returns an OrderedDict when one is available (collections, or the
    `ordereddict` backport), otherwise a plain dict.
    """
    try: group_db=collections.OrderedDict()
    except (NameError, AttributeError):
        ### No collections.OrderedDict: try the backport, then give up on ordering
        try:
            import ordereddict
            group_db=ordereddict.OrderedDict()
        except ImportError: group_db={}
    with open(fn,'rU') as groups_file:
        for line in groups_file:
            data = ExpressionBuilder.cleanUpLine(line)
            if not data: continue ### tolerate blank lines (e.g. at the end of the file)
            sample_filename,group_number,group_name = string.split(data,'\t')
            group_db.setdefault(group_name,[]).append(sample_filename)
    return group_db
Example #5
0
def exportChromosomeStrandCoordinates(species):
    import EnsemblImport
    gene_location_db = EnsemblImport.getEnsemblGeneLocations(
        species, 'RNASeq', 'key_by_array')

    import ExpressionBuilder
    gene_biotype_db = ExpressionBuilder.importTranscriptBiotypeAnnotations(
        species)
    export_path = 'GenMAPPDBs/' + species + '/chr_gene_locations.txt'
    export_data = export.ExportFile(export_path)

    import ExonAnalyze_module
    gene_annotation_file = "AltDatabase/ensembl/" + species + "/" + species + "_Ensembl-annotations.txt"
    annotate_db = ExonAnalyze_module.import_annotations(
        gene_annotation_file, 'RNASeq')

    print 'Annotations for', len(gene_location_db), 'genes imported'

    sorted_list = []
    protein_coding = 0
    for gene in gene_location_db:
        chr, strand, start, end = gene_location_db[gene]
        if gene in gene_biotype_db:
            biotype = gene_biotype_db[gene][-1]
            if biotype == 'protein_coding': protein_coding += 1

        else: biotype = 'NA'
        if len(chr) < 7:
            sorted_list.append(
                [chr, strand, int(start),
                 int(end), gene, biotype])
        #else: print chr;sys.exit()
    print len(sorted_list), 'genes for typical chromosomes present'
    print protein_coding, 'protein coding genes present'
    sorted_list.sort()
    for values in sorted_list:
        chr, strand, start, end, gene, biotype = values
        try:
            symbol = annotate_db[gene].Symbol()
        except Exception:
            symbol = ''
        values = [gene, symbol, chr, strand, str(start), str(end), biotype]
        export_data.write(string.join(values, '\t') + '\n')
    export_data.close()
    print species, 'chromosome locations exported to:\n', export_path
Example #6
0
def exportChromosomeStrandCoordinates(species):
    import EnsemblImport
    gene_location_db = EnsemblImport.getEnsemblGeneLocations(species,'RNASeq','key_by_array')

    import ExpressionBuilder
    gene_biotype_db = ExpressionBuilder.importTranscriptBiotypeAnnotations(species)
    export_path = 'GenMAPPDBs/'+species+'/chr_gene_locations.txt'
    export_data = export.ExportFile(export_path)

    import ExonAnalyze_module
    gene_annotation_file = "AltDatabase/ensembl/"+species+"/"+species+"_Ensembl-annotations.txt"
    annotate_db = ExonAnalyze_module.import_annotations(gene_annotation_file,'RNASeq')
      
    print 'Annotations for',len(gene_location_db),'genes imported'
    
    sorted_list=[]; protein_coding=0 
    for gene in gene_location_db:
        chr,strand,start,end = gene_location_db[gene]
        if gene in gene_biotype_db:
            biotype = gene_biotype_db[gene][-1]
            if biotype == 'protein_coding': protein_coding+=1
                
        else: biotype = 'NA'
        if len(chr)<7:
            sorted_list.append([chr,strand,int(start),int(end),gene,biotype])
        #else: print chr;sys.exit()
    print len(sorted_list),'genes for typical chromosomes present'
    print protein_coding, 'protein coding genes present'
    sorted_list.sort()        
    for values in sorted_list:
        chr,strand,start,end,gene,biotype=values
        try: symbol = annotate_db[gene].Symbol()
        except Exception: symbol = ''
        values = [gene,symbol,chr,strand,str(start),str(end),biotype]
        export_data.write(string.join(values,'\t')+'\n')
    export_data.close()
    print species, 'chromosome locations exported to:\n',export_path
Example #7
0
def importTableEntries(filename,
                       filter_db,
                       ensembl_exon_db,
                       gene_db,
                       root_dir,
                       transpose,
                       display,
                       showIntrons,
                       analysisType='plot'):
    """Collect exon/junction-level rows from `filename` per gene and plot or cluster them.

    Rows whose ID is in `filter_db` (or, when `analysisType` contains
    'heatmap', whose gene prefix is in `gene_db`) are gathered per gene.  For
    each gene in `ensembl_exon_db` that has rows, the values are either
    written to `<root_dir><symbol>-heatmap.txt` and handed to
    clustering.runHCexplicit (heatmap mode) or passed to plotExonExpression
    (default).

    filename -- tab-delimited input; substrings of the name ('exp.',
        'counts.', 'AltResults', '_vs_') select how it is parsed
    filter_db -- container of row IDs to keep
    ensembl_exon_db -- gene -> iterable of (index, ed, id); `ed` must offer ExonID()
    gene_db -- gene -> value used as the gene symbol in file names and messages
    root_dir -- prefix prepended to exported file names
    transpose -- transpose the matrix before plotting (reassigned in heatmap mode)
    display -- forwarded to the plotting call (may be reassigned below)
    showIntrons -- 'yes' keeps regions whose ExonID() lacks an 'E'
    analysisType -- 'plot' (default); any value containing 'heatmap' selects clustering

    Reads the module-level `platform` and rebinds the global `colors`.
    Nothing is returned.
    """
    import collections
    average_samples = True
    if showIntrons == 'yes': include_introns = True
    else: include_introns = False
    uid_db = {}  ### probeset or AltAnalyze RNA-Seq ID keyed
    uid_list = {}  ### ordered from first to last exon region
    uid_gene_db = {}  ### Lets us look at multiple genes
    try:
        import UI
        biotypes = UI.getBiotypes(filename)
    except Exception:
        biotypes = {}
    ### Index every region of the requested genes
    for gene in ensembl_exon_db:
        uid_list[gene] = []
        for (index, ed, id) in ensembl_exon_db[gene]:
            proceed = False
            ### NOTE(review): both branches below apply the same test
            if 'exp.' in filename:
                if include_introns:
                    proceed = True
                elif 'E' in ed.ExonID():
                    proceed = True
            else:  ### Include introns for splicing index view
                if include_introns == True: proceed = True
                elif 'E' in ed.ExonID(): proceed = True
            if proceed:
                uid_db[id] = ed
                uid_list[gene].append(id)
            ### recorded even for regions that were not kept above
            uid_gene_db[id] = gene

    ### NOTE(review): alt_groups_dir is only bound inside this branch, but it is
    ### read further down whenever the primary groups file cannot be verified
    if '_vs_' in filename:  ### If one two groups, this is what will be output to the RawSplice folder - need to have this alternate way of getting the expression file location
        rootdir = string.split(filename, 'AltResults')[0]
        exp_dir = getValidExpFile(rootdir + 'ExpressionInput')
        alt_groups_dir = string.split(
            exp_dir, 'ExpressionInput'
        )[0] + 'ExpressionInput/groups.' + findFilename(exp_dir)
        alt_groups_dir = string.replace(alt_groups_dir, 'exp.', '')

    start_time = time.time()
    fn = filepath(filename)
    matrix_gene_db = {}
    stdev_gene_matrix_db = {}
    row_header_gene = {}
    ids = {}
    ### x == 0 means "the next non-comment line is the header"
    x = 0

    if 'heatmap' in analysisType:
        average_samples = False

    ### dataset name = file name without its last four characters
    if '/' in filename:
        dataset_name = string.split(filename, '/')[-1][:-4]
    else:
        dataset_name = string.split(filename, '\\')[-1][:-4]
    for line in open(fn, 'rU').xreadlines():
        data = line.strip()
        t = string.split(data, '\t')
        ### A line starting with '#' resets x, so the line after it is parsed as the header
        ### NOTE(review): an empty line makes data[0] raise IndexError
        if data[0] == '#': x = 0
        elif x == 0:
            if platform == 'RNASeq':
                removeExtension = True
            else:
                removeExtension = False
            group_db, column_header, sample_name_db = assignGroupColors(
                t[1:], '', removeExtension=removeExtension)
            x = 1
            altresults = False
            if average_samples:
                ### Locate the groups file matching this dataset
                if 'AltResults' in filename:
                    altresults = True
                    groups_dir = string.split(
                        filename, 'AltResults'
                    )[0] + 'ExpressionInput/groups.' + findFilename(filename)
                    if verifyFile(groups_dir) == False:
                        groups_dir = alt_groups_dir
                    new_column_header = reformatAltHeaders(t[3:])
                    start = 3
                else:
                    if 'exp.' in filename:
                        groups_dir = string.replace(filename, 'exp.',
                                                    'groups.')
                    else:
                        groups_dir = string.replace(filename, 'counts.',
                                                    'groups.')
                    new_column_header = column_header
                    start = 1  ### starting index with numeric values
                groups_dir = string.replace(groups_dir, 'stats.', 'groups.')
                groups_dir = string.replace(
                    groups_dir, '-steady-state.txt',
                    '.txt')  ### groups is for the non-steady-state file

                try:
                    group_index_db = collections.OrderedDict()
                except Exception:
                    import ordereddict
                    group_index_db = ordereddict.OrderedDict()
                ### use comps in the future to visualize group comparison changes
                sample_list, group_sample_db, group_db, group_name_sample_db, comp_groups, comps_name_db = ExpressionBuilder.simpleGroupImport(
                    groups_dir)
                ### Map every group name to the column positions of its samples
                for item in sample_list:
                    group_name = group_db[item]
                    proceed = False
                    try:
                        sample_index = new_column_header.index(item)
                        proceed = True
                    except Exception:
                        ### Retry with common file extensions removed from the sample name
                        try:
                            item = string.replace(item, '.bed', '')
                            item = string.replace(
                                item, '.CEL',
                                '')  ### Probe-level analyses as RNA-Seq
                            item = string.replace(item, '.cel', '')
                            item = string.replace(item, '.txt', '')
                            item = string.replace(item, '.TXT', '')
                            item = string.replace(item, '.TAB', '')
                            item = string.replace(item, '.tab', '')
                            sample_index = new_column_header.index(item)
                            proceed = True
                        except Exception:
                            pass
                            #print [item]
                            #print column_header
                            #print Error
                    if proceed:
                        try:
                            group_index_db[group_name].append(sample_index)
                        except Exception:
                            try:
                                group_index_db[group_name] = [
                                    sample_index
                                ]  ### dictionary of group to input file sample indexes
                            except Exception:
                                pass  ### Occurs when analyzing splicing-index for two groups when more than two groups exist (error from 5 lines up)
                groups = map(str, group_index_db)  ### store group names
                new_sample_list = map(
                    lambda item: group_db[item], sample_list
                )  ### lookup index of each sample in the ordered group sample list
                column_header = groups
            else:
                if 'AltResults' in filename: start = 3
                else: start = 1  ### starting index with numeric values
                column_header = t[start - 1:]
            row_number = 1
        else:
            if ' ' not in t and '' not in t:  ### Occurs for rows with missing data
                uid = t[start - 1]
                if ';' in uid:
                    uid = string.split(uid, ';')[0]
                ids[uid] = None
                ens_geneID = string.split(uid, ':')[0]
                #if ens_geneID in gene_db: print uid
                if uid in filter_db or ('heatmap' in analysisType
                                        and ens_geneID in gene_db):
                    ### Any failure while processing this row silently skips the row
                    try:
                        if len(biotypes) == 1 and 'junction' in biotypes:
                            gene = ens_geneID
                        else:
                            gene = uid_gene_db[uid]
                        try:
                            row_header_gene[gene].append(uid)
                        except Exception:
                            row_header_gene[gene] = [uid]
                        if average_samples == False:
                            values = map(float, t[start:])
                            try:
                                matrix_gene_db[gene].append(values)
                            except Exception:
                                matrix_gene_db[gene] = [values]
                        else:
                            if platform == 'RNASeq' and altresults == False:
                                ### Convert to log2 RPKM values - or counts
                                values = map(lambda x: math.log(float(x), 2),
                                             t[start:])
                            else:
                                values = map(float, t[start:])

                            if 'AltResults' in filename:  ### If splicing scores, normalize these to the mean values
                                mean = statistics.avg(values)
                                values = map(lambda x: x - mean, values)
                            avg_ls = []
                            std_ls = []
                            ### One mean and one standard error per group
                            for group_name in group_index_db:
                                group_values = map(
                                    lambda x: values[x],
                                    group_index_db[group_name]
                                )  ### simple and fast way to reorganize the samples
                                avg = statistics.avg(group_values)
                                try:
                                    st_err = statistics.stdev(
                                        group_values) / math.sqrt(
                                            len(group_values))
                                except Exception:
                                    ### Occurs if no replicates in the dataset
                                    st_err = 0
                                avg_ls.append(avg)
                                std_ls.append(st_err)
                            try:
                                matrix_gene_db[gene].append(avg_ls)
                            except Exception:
                                matrix_gene_db[gene] = [avg_ls]
                            try:
                                stdev_gene_matrix_db[gene].append(std_ls)
                            except Exception:
                                stdev_gene_matrix_db[gene] = [std_ls]
                    except Exception:
                        #print traceback.format_exc()
                        pass
            x += 1

    global colors
    original_column_header = list(column_header)
    if len(uid_list) == 0:
        print 'No genes found in the exon expression database'
        ### Undefined name: evaluating it raises NameError and aborts the function
        forceNoExonExpError
    successfully_output_genes = 0
    display_count = 0  ### Only display a certain number of genes

    ### Leaves last_gene bound to the final key of uid_list
    for last_gene in uid_list:
        pass
    for gene in uid_list:
        fig = pylab.figure(
        )  ### Create this here - resulting in a single figure for memory purposes
        new_header = []
        new_matrix = []
        new_stdev = []
        annotation_list = []
        gene_symbol = gene_db[gene]
        try:
            matrix = matrix_gene_db[gene]
        except Exception:
            print gene_symbol, 'not in alternative expression database'
            continue  ### go the next gene - no alt.expression for this gene
        row_header = row_header_gene[gene]

        ### NOTE(review): on failure stdev_matrix keeps its value from a previous
        ### gene (or stays unbound on the first one)
        try:
            stdev_matrix = stdev_gene_matrix_db[gene]
        except Exception:
            pass
        ### Reorder the imported rows to follow the region order in uid_list
        for uid in uid_list[gene]:
            #print row_header;sys.exit()
            try:
                i = row_header.index(
                    uid
                )  ### If the ID is in the filtered annotated exon list (not just core)
                new_header.append(uid)
                try:
                    new_matrix.append(matrix[i])
                except Exception:
                    print uid, i, len(matrix)
                    sys.exit()
                ed = uid_db[uid]
                annotation_list.append(ed)
                try:
                    new_stdev.append(stdev_matrix[i])
                except Exception:
                    pass
            except Exception:
                pass

        if len(new_matrix) > 0:
            matrix = new_matrix
        if len(new_header) > 0:
            row_header = new_header
        if 'heatmap' in analysisType:
            export_dir = root_dir + gene_symbol + '-heatmap.txt'
            export_obj = export.ExportFile(export_dir)
            export_obj.write(string.join(column_header, '\t') + '\n')
            ki = 0
            if len(annotation_list) > 0:
                for ed in annotation_list:
                    if 'AltResults' not in filename and platform == 'RNASeq':
                        values = map(lambda x: math.log(x, 2), matrix[ki])
                    else:
                        values = matrix[ki]
                    export_obj.write(
                        string.join([ed.ExonID()] + map(str, values), '\t') +
                        '\n')
                    ki += 1
                row_metric = 'euclidean'
                row_method = None
            else:
                ### Just junctions analyzed here... no sorted junctions yet
                ki = 0
                for uid in row_header_gene[gene]:
                    if 'AltResults' not in filename and platform == 'RNASeq':
                        values = map(lambda x: math.log(x, 2), matrix[ki])
                    else:
                        values = matrix[ki]
                    export_obj.write(
                        string.join([uid] + map(str, values), '\t') + '\n')
                    ki += 1
                row_metric = 'euclidean'
                row_method = 'average'
            export_obj.close()
            import clustering

            column_metric = 'euclidean'
            column_method = 'hopach'
            color_gradient = 'red_black_sky'
            ### The caller-supplied transpose/display values are overridden in heatmap mode
            transpose = False
            graphic_links = []
            if ki > 100: transpose = True
            if gene == last_gene: display = True
            else: display = False
            graphic_links = clustering.runHCexplicit(export_dir,
                                                     graphic_links,
                                                     row_method,
                                                     row_metric,
                                                     column_method,
                                                     column_metric,
                                                     color_gradient,
                                                     transpose,
                                                     display=display,
                                                     Normalize=True,
                                                     compressAxis=False,
                                                     contrast=2.5)
            successfully_output_genes += 1
        else:
            stdev_matrix = new_stdev
            time_diff = str(round(time.time() - start_time, 1))
            #print '%d rows and %d columns imported for %s in %s seconds...' % (len(matrix),len(column_header),dataset_name,time_diff)
            if transpose == True:
                matrix = map(numpy.array,
                             zip(*matrix))  ### coverts these to tuples
                column_header, row_header = row_header, original_column_header
                stdev_matrix = map(numpy.array, zip(*stdev_matrix))
            matrix = numpy.array(matrix)

            stdev_matrix = numpy.array(stdev_matrix)
            try:
                if len(uid_list) > 10:
                    #if display_count==5: display=False
                    display = False
                if display_count == 0:
                    ### store a consistent color palete to use
                    colors = []
                    """
                    k=0
                    while k < len(row_header):
                        colors.append(tuple(rand(3)))
                        k+=1"""
                    #http://stackoverflow.com/questions/3016283/create-a-color-generator-from-given-colormap-in-matplotlib
                    cm = pylab.cm.get_cmap('gist_rainbow')  #gist_ncar
                    for i in range(len(row_header)):
                        colors.append(cm(1. * i / len(row_header))
                                      )  # color will now be an RGBA tuple

                plotExonExpression(fig,
                                   matrix,
                                   stdev_matrix,
                                   row_header,
                                   column_header,
                                   dataset_name,
                                   annotation_list,
                                   gene_symbol,
                                   root_dir,
                                   display=display)
                successfully_output_genes += 1
                display_count += 1
            except Exception:
                print traceback.format_exc()
                ### NOTE(review): exiting here makes the print below unreachable
                sys.exit()
                print gene_symbol, 'failed'
        try:
            pylab.close()
        except Exception:
            pass
        ### NOTE(review): this check sits inside the per-gene loop, so it runs after
        ### every gene that reaches this point rather than once at the end
        if successfully_output_genes > 0:
            try:
                print 'Gene graphs exported to ExonPlots...'
            except Exception:
                pass
        else:
            print '\nWARNING!!!! No genes with associated alternative exon evidence found\n'
            ### Undefined name: evaluating it raises NameError and aborts the function
            forceNoExonExpError
        ### Release the figure before moving on to the next gene
        try:
            import gc
            fig.clf()
            pylab.close()
            gc.collect()
        except Exception:
            pass
Example #8
0
def remoteSashimiPlot(Species, fl, bamdir, eventsToVisualizeFilename, events=None, show=False):
    global PSIFilename
    global outputdir
    global root_dir
    global steady_state_exp_file
    global species
    species = Species

    try:
        countinp = fl.CountsFile()
        root_dir = fl.RootDir()
    except Exception:
        root_dir = fl
        search_dir = root_dir + "/ExpressionInput"
        files = unique.read_directory(search_dir)
        for file in files:
            if "counts." in file and "steady-state.txt" not in file:
                countinp = search_dir + "/" + file

    PSIFilename = root_dir + "/AltResults/AlternativeOutput/" + species + "_RNASeq_top_alt_junctions-PSI.txt"

    import ExpressionBuilder

    dir_list = unique.read_directory(root_dir + "/ExpressionInput")
    for file in dir_list:
        if "exp." in file and "steady-state" not in file:
            exp_file = root_dir + "/ExpressionInput/" + file
        elif "exp." in file and "steady-state" in file:
            steady_state_exp_file = root_dir + "/ExpressionInput/" + file
    global sample_group_db
    sample_group_db = ExpressionBuilder.simplerGroupImport(exp_file)

    # outputdir=findParentDir(PSIFilename)+"sashimiplots"
    outputdir = root_dir + "/ExonPlots"
    outputdir = root_dir + "/SashimiPlots"
    try:
        os.mkdir(unique.filepath(outputdir))
    except Exception:
        pass

    if show:
        s = open(outputdir + "/show.txt", "w")
        s.write("TRUE")
        s.close()
    else:
        s = open(outputdir + "/show.txt", "w")
        s.write("FALSE")
        s.close()

    geneSymbol_db = Sashimiplottting(bamdir, countinp, PSIFilename, eventsToVisualizeFilename, events=events)
    for filename in os.listdir(outputdir):
        if ".pdf" in filename or ".png" in filename:
            fn = string.replace(filename, ".pdf", "")
            fn = string.replace(fn, ".png", "")
            newname = string.split(fn, "__")
            if newname[0] in geneSymbol_db:
                new_filename = str(filename)
                if "__" in filename:
                    new_filename = string.split(filename, "__")[1]
                elif "\\" in filename:
                    new_filename = string.split(filename, "\\")[1]
                elif "/" in filename:
                    new_filename = string.split(filename, "/")[1]
                nnname = geneSymbol_db[newname[0]][0] + "-SashimiPlot_" + new_filename
                try:
                    os.rename(os.path.join(outputdir, filename), os.path.join(outputdir, nnname))
                except Exception:
                    if "already exists" in traceback.format_exc():
                        ### File already exists, delete the new one
                        try:
                            os.remove(os.path.join(outputdir, nnname))
                        except Exception:
                            pass
                        ### Now right the new one
                        try:
                            os.rename(os.path.join(outputdir, filename), os.path.join(outputdir, nnname))
                        except Exception:
                            pass
                    pass
            else:
                continue
    print ""
Example #9
0
def performGroupNormalization(filename,export_dir,platform):
    expressionDataFormat,increment,convertNonLogToLog = ExpressionBuilder.checkExpressionFileFormat(filename)
    groups_dir = string.replace(export_dir,'exp.','batch.')
    fn=unique.filepath(filename); row_number=0; exp_db={}; relative_headers_exported = False
    group_db = importGroups(groups_dir)
    export_data = export.ExportFile(export_dir)
    for line in open(fn,'rU').xreadlines():
        data = ExpressionBuilder.cleanUpLine(line)
        t = string.split(data,'\t')
        if data[0]=='#' and row_number==0: row_number = 0
        elif row_number==0:
            sample_list = t[1:]
            new_sample_list = []
            for group in group_db:
                group_samples = group_db[group]
                try:
                    sample_index_list = map(lambda x: sample_list.index(x), group_samples)
                    group_db[group] = sample_index_list
                    new_sample_list+=group_samples
                except Exception:
                    missing=[]
                    for x in sample_list:
                        if x not in t[1:]: missing.append(x)
                    print 'missing:',missing
                    print t
                    print sample_list
                    print filename, groups_dir
                    print 'Unknown Error!!! Skipping cluster input file build (check column and row formats for conflicts)'; forceExit
            title = string.join([t[0]]+new_sample_list,'\t')+'\n' ### output the new sample order (group file order)
            export_data.write(title)
            row_number=1
        else:
            gene = t[0]
            if expressionDataFormat == 'non-log' and (convertNonLogToLog or platform == 'RNASeq'):
                ### Convert to log2 RPKM values - or counts
    
                try: all_values = map(lambda x: math.log(float(x)+increment,2), t[1:])
                except Exception:
                    all_values = ExpressionBuilder.logTransformWithNAs(t[1:],increment)
            else:
                try: all_values = map(float,t[1:])
                except Exception:
                    all_values = ExpressionBuilder.logTransformWithNAs(t[1:],increment)
            row_number+=1 ### Keep track of the first gene as to write out column headers for the relative outputs
            gene_log_folds = []

            for group in group_db:
                sample_index_list = group_db[group]
                ### Calculate log-fold values relative to the mean of all sample expression values
                try: values = map(lambda x: all_values[x], sample_index_list) ### simple and fast way to reorganize the samples
                except Exception:
                    print len(values), sample_index_list;kill
                try: avg = statistics.avg(values)
                except Exception:
                    values2=[]
                    for v in values:
                        try: values2.append(float(v))
                        except Exception: pass
                    values = values2
                    try: avg = statistics.avg(values)
                    except Exception:
                        if len(values)>0: avg = values[0]
                        else: avg = 0
                try: log_folds = map(lambda x: (x-avg), values)
                except Exception: 
                    log_folds=[]
                    for x in values:
                        try: log_folds.append(x-avg)
                        except Exception: log_folds.append('')
                gene_log_folds+=log_folds                            
            gene_log_folds = map(lambda x: str(x),gene_log_folds)
            export_data.write(string.join([gene]+gene_log_folds,'\t')+'\n')
    export_data.close()
Example #10
0
def importTableEntries(filename,filter_db,ensembl_exon_db,gene_db,root_dir,transpose,display,showIntrons,analysisType='plot'):
    """Import feature-level values from a tab-delimited file and export one plot or heatmap per gene.

    The file is read once: the first non-comment row supplies the sample headers and each
    following row whose ID is selected contributes one row of values to its gene. When
    analysisType contains 'heatmap' the per-sample values are kept, each gene is written to
    root_dir + symbol + '-heatmap.txt' and passed to clustering.runHCexplicit. Otherwise the
    values are averaged per sample group (with stdev/sqrt(n) as the error) and passed to
    plotExonExpression.

    filename -- input path; substrings of it ('exp.', 'counts.', 'AltResults', '_vs_') select how it is parsed
    filter_db -- container of row IDs to import (only membership is tested)
    ensembl_exon_db -- gene -> iterable of (index, annotation object, ID); the object must provide ExonID()
    gene_db -- gene -> symbol; in heatmap mode also used as a gene membership filter
    root_dir -- output location; used as a string prefix for heatmap files and passed to plotExonExpression
    transpose -- when True the plot matrix is transposed; overwritten in heatmap mode
    display -- passed on to the plotting call; overridden for more than 10 genes and in heatmap mode
    showIntrons -- 'yes' to also import entries whose ExonID() lacks an 'E'
    analysisType -- any value containing 'heatmap' selects the heatmap export, anything else the plot

    Returns None. Reads the name `platform`, which is not a parameter (presumably a module
    global - confirm), and rebinds the module global `colors`.
    """
    import collections
    average_samples = True
    if showIntrons == 'yes': include_introns = True
    else: include_introns = False
    uid_db={} ### probeset or AltAnalyze RNA-Seq ID keyed
    uid_list={} ### ordered from first to last exon region
    uid_gene_db={} ### Lets us look at multiple genes
    try:
        import UI
        biotypes = UI.getBiotypes(filename)
    except Exception: biotypes={}
    for gene in ensembl_exon_db:
        uid_list[gene]=[]
        for (index,ed,id) in ensembl_exon_db[gene]:
            proceed = False
            ### NOTE(review): both branches below apply the same test (introns requested or 'E' in the exon ID)
            if 'exp.' in filename:
                if include_introns:
                    proceed = True
                elif 'E' in ed.ExonID():
                    proceed = True
            else: ### Include introns for splicing index view
                if include_introns == True: proceed = True
                elif 'E' in ed.ExonID(): proceed = True
            if proceed:
                uid_db[id] = ed
                uid_list[gene].append(id)
            uid_gene_db[id]=gene

    ### NOTE(review): alt_groups_dir is only bound for '_vs_' filenames but is read below whenever the primary groups file fails verifyFile
    if '_vs_' in filename: ### If one two groups, this is what will be output to the RawSplice folder - need to have this alternate way of getting the expression file location
        rootdir = string.split(filename, 'AltResults')[0]
        exp_dir = getValidExpFile(rootdir+'ExpressionInput')
        alt_groups_dir = string.split(exp_dir, 'ExpressionInput')[0]+'ExpressionInput/groups.'+findFilename(exp_dir)
        alt_groups_dir = string.replace(alt_groups_dir,'exp.','')
        
    start_time = time.time()
    fn = filepath(filename)
    matrix_gene_db={} ### gene -> list of value rows (per-sample values or per-group averages)
    stdev_gene_matrix_db={} ### gene -> list of per-group error rows (only filled when averaging)
    row_header_gene={} ### gene -> row IDs, in the same order as matrix_gene_db[gene]
    ids={}
    x=0
    
    if 'heatmap' in analysisType:
        average_samples = False
        
    ### Dataset name = file name without its last four characters (the extension)
    if '/' in filename:
        dataset_name = string.split(filename,'/')[-1][:-4]
    else:
        dataset_name = string.split(filename,'\\')[-1][:-4]
    for line in open(fn,'rU').xreadlines():         
        data = line.strip()
        t = string.split(data,'\t')
        ### NOTE(review): data[0] raises IndexError on an empty line
        if data[0]=='#': x=0
        elif x==0:
            ### Header row: determine the sample columns and, when averaging, the column indexes of each group
            if platform == 'RNASeq':
                removeExtension=True
            else:
                removeExtension=False
            group_db, column_header, sample_name_db = assignGroupColors(t[1:],'',removeExtension=removeExtension)
            x=1
            altresults = False
            if average_samples:
                if 'AltResults' in filename:
                    altresults=True
                    groups_dir = string.split(filename, 'AltResults')[0]+'ExpressionInput/groups.'+findFilename(filename)
                    if verifyFile(groups_dir)==False:
                        groups_dir = alt_groups_dir
                    new_column_header = reformatAltHeaders(t[3:])
                    start = 3
                else:
                    if 'exp.' in filename:
                        groups_dir = string.replace(filename,'exp.','groups.')
                    else:
                        groups_dir = string.replace(filename,'counts.','groups.')
                    new_column_header = column_header
                    start = 1 ### starting index with numeric values
                groups_dir = string.replace(groups_dir,'stats.','groups.')
                groups_dir = string.replace(groups_dir,'-steady-state.txt','.txt') ### groups is for the non-steady-state file
                
                ### Ordered mapping so that groups keep their first-seen order (fallback module when collections lacks OrderedDict)
                try: group_index_db=collections.OrderedDict()
                except Exception:
                    import ordereddict
                    group_index_db = ordereddict.OrderedDict()
                ### use comps in the future to visualize group comparison changes
                sample_list,group_sample_db,group_db,group_name_sample_db,comp_groups,comps_name_db = ExpressionBuilder.simpleGroupImport(groups_dir)
                for item in sample_list:
                    group_name = group_db[item]
                    proceed=False
                    try: sample_index = new_column_header.index(item); proceed=True
                    except Exception:
                        ### Retry the lookup with known file extensions removed from the sample name
                        try:
                            item = string.replace(item,'.bed','')
                            item = string.replace(item,'.CEL','') ### Probe-level analyses as RNA-Seq
                            item = string.replace(item,'.cel','')
                            item = string.replace(item,'.txt','')
                            item = string.replace(item,'.TXT','')
                            item = string.replace(item,'.TAB','')
                            item = string.replace(item,'.tab','')
                            sample_index = new_column_header.index(item)
                            proceed=True
                        except Exception:
                            pass
                            #print [item]
                            #print column_header
                            #print Error
                    if proceed:
                        try: group_index_db[group_name].append(sample_index)
                        except Exception:
                            try: group_index_db[group_name] = [sample_index] ### dictionary of group to input file sample indexes
                            except Exception: pass ### Occurs when analyzing splicing-index for two groups when more than two groups exist (error from 5 lines up)
                groups = map(str, group_index_db) ### store group names
                new_sample_list = map(lambda item: group_db[item], sample_list) ### lookup index of each sample in the ordered group sample list
                column_header = groups
            else:
                if 'AltResults' in filename: start = 3
                else: start = 1 ### starting index with numeric values
                column_header = t[start-1:]
            row_number=1   
        else:
            ### Data row: the ID is the column just before the first numeric column
            if ' ' not in t and '' not in t: ### Occurs for rows with missing data
                uid = t[start-1]
                if ';' in uid:
                    uid = string.split(uid,';')[0]
                ids[uid]=None
                ens_geneID = string.split(uid,':')[0]
                #if ens_geneID in gene_db: print uid
                if uid in filter_db or ('heatmap' in analysisType and ens_geneID in gene_db):
                    ### Any failure while processing this row (unknown ID, non-numeric value, ...) silently skips the row
                    try:
                        if len(biotypes)==1 and 'junction' in biotypes:
                            gene = ens_geneID
                        else:
                            gene = uid_gene_db[uid]
                        try: row_header_gene[gene].append(uid)
                        except Exception: row_header_gene[gene] = [uid]
                        if average_samples == False:
                            values = map(float,t[start:])
                            try: matrix_gene_db[gene].append(values)
                            except Exception: matrix_gene_db[gene]=[values]
                        else:
                            if platform == 'RNASeq' and altresults==False:
                                ### Convert to log2 RPKM values - or counts
                                values = map(lambda x: math.log(float(x),2), t[start:])
                            else:
                                values = map(float,t[start:])
                                
                            if 'AltResults' in filename: ### If splicing scores, normalize these to the mean values
                                mean = statistics.avg(values)
                                values = map(lambda x: x-mean, values)
                            avg_ls=[]; std_ls = []
                            for group_name in group_index_db:
                                group_values = map(lambda x: values[x], group_index_db[group_name]) ### simple and fast way to reorganize the samples
                                avg = statistics.avg(group_values)
                                try: st_err = statistics.stdev(group_values)/math.sqrt(len(group_values))
                                except Exception:
                                    ### Occurs if no replicates in the dataset
                                    st_err = 0
                                avg_ls.append(avg)
                                std_ls.append(st_err)
                            try: matrix_gene_db[gene].append(avg_ls)
                            except Exception: matrix_gene_db[gene]=[avg_ls]
                            try: stdev_gene_matrix_db[gene].append(std_ls)
                            except Exception: stdev_gene_matrix_db[gene]=[std_ls]
                    except Exception:
                        #print traceback.format_exc()
                        pass
            x+=1

    global colors
    original_column_header = list(column_header)
    if len(uid_list)==0:
        ### The bare name on the next line is not defined in this function; referencing it presumably raises NameError to abort - confirm
        print 'No genes found in the exon expression database'; forceNoExonExpError
    successfully_output_genes=0
    display_count=0 ### Only display a certain number of genes
    
    for last_gene in uid_list: pass ### leaves last_gene bound to the final key of uid_list
    for gene in uid_list:
        fig = pylab.figure() ### Create this here - resulting in a single figure for memory purposes
        new_header = []
        new_matrix = []
        new_stdev = []
        annotation_list=[]
        gene_symbol = gene_db[gene]
        try: matrix = matrix_gene_db[gene]
        except Exception:
            print gene_symbol, 'not in alternative expression database'
            continue ### go the next gene - no alt.expression for this gene
        row_header = row_header_gene[gene]

        try: stdev_matrix = stdev_gene_matrix_db[gene]
        except Exception: pass
        ### Re-order the imported rows to follow the annotation order stored in uid_list
        for uid in uid_list[gene]:
            #print row_header;sys.exit()
            try:
                i = row_header.index(uid) ### If the ID is in the filtered annotated exon list (not just core)
                new_header.append(uid)
                try: new_matrix.append(matrix[i])
                except Exception: print uid, i,len(matrix);sys.exit()
                ed = uid_db[uid]
                annotation_list.append(ed)
                try: new_stdev.append(stdev_matrix[i])
                except Exception: pass
            except Exception: pass

        ### Only replace the imported order when at least one annotated ID was matched
        if len(new_matrix)>0:
            matrix = new_matrix
        if len(new_header)>0:
            row_header = new_header
        if 'heatmap' in analysisType:
            export_dir = root_dir + gene_symbol + '-heatmap.txt'
            export_obj = export.ExportFile(export_dir)
            export_obj.write(string.join(column_header,'\t')+'\n')
            ki=0
            if len(annotation_list)>0:
                for ed in annotation_list:
                    if 'AltResults' not in filename and platform == 'RNASeq':
                        values = map(lambda x: math.log(x,2), matrix[ki])
                    else: values = matrix[ki]
                    export_obj.write(string.join([ed.ExonID()] + map(str,values),'\t')+'\n')
                    ki+=1
                row_metric = 'euclidean'; row_method = None
            else:
                ### Just junctions analyzed here... no sorted junctions yet
                ki=0
                for uid in row_header_gene[gene]:
                    if 'AltResults' not in filename and platform == 'RNASeq':
                        values = map(lambda x: math.log(x,2), matrix[ki])
                    else: values = matrix[ki]
                    export_obj.write(string.join([uid] + map(str,values),'\t')+'\n')
                    ki+=1
                row_metric = 'euclidean'; row_method = 'average'
            export_obj.close()
            import clustering
            
            column_metric = 'euclidean'; column_method = 'hopach'
            color_gradient = 'red_black_sky'; transpose = False; graphic_links=[]
            if ki>100: transpose = True ### more than 100 exported rows
            if gene == last_gene: display = True
            else: display = False
            graphic_links = clustering.runHCexplicit(export_dir, graphic_links, row_method, row_metric, column_method, column_metric, color_gradient, transpose, display=display, Normalize=True, compressAxis = False, contrast = 2.5)
            successfully_output_genes+=1
        else:
            stdev_matrix = new_stdev
            time_diff = str(round(time.time()-start_time,1))
            #print '%d rows and %d columns imported for %s in %s seconds...' % (len(matrix),len(column_header),dataset_name,time_diff)
            if transpose == True:
                matrix = map(numpy.array, zip(*matrix)) ### coverts these to tuples
                column_header, row_header = row_header, original_column_header
                stdev_matrix = map(numpy.array, zip(*stdev_matrix))
            matrix = numpy.array(matrix)

            stdev_matrix = numpy.array(stdev_matrix)
            try:
                if len(uid_list)>10:
                    #if display_count==5: display=False
                    display=False
                if display_count==0:
                    ### store a consistent color palete to use
                    colors=[]
                    """
                    k=0
                    while k < len(row_header):
                        colors.append(tuple(rand(3)))
                        k+=1"""
                    #http://stackoverflow.com/questions/3016283/create-a-color-generator-from-given-colormap-in-matplotlib
                    cm = pylab.cm.get_cmap('gist_rainbow') #gist_ncar
                    for i in range(len(row_header)):
                        colors.append(cm(1.*i/len(row_header)))  # color will now be an RGBA tuple
        
                plotExonExpression(fig,matrix,stdev_matrix,row_header,column_header,dataset_name,annotation_list,gene_symbol,root_dir,display=display)
                successfully_output_genes+=1
                display_count+=1
            except Exception:
                ### NOTE(review): sys.exit() ends the program here, so the 'failed' print below is never reached
                print traceback.format_exc();sys.exit()
                print gene_symbol, 'failed'
        try: pylab.close()
        except Exception: pass
        ### NOTE(review): this check runs once per gene, inside the loop, rather than once after all genes
        if successfully_output_genes>0:
            #try: print 'Gene graphs exported to ExonPlots...'
            #except Exception: pass
            pass
        else:
            print '\nWARNING!!!! No genes with associated alternative exon evidence found\n'; forceNoExonExpError
        ### Best-effort release of the figure before the next gene
        try:
            import gc
            fig.clf()
            pylab.close()
            gc.collect()
        except Exception:
            pass
Example #11
0
def matrixImport(filename):
    """Import a tab-delimited matrix and organize the values of every row by sample group.

    The first line is the header. When it contains ':' every sample column is expected to
    be named 'group:sample'; otherwise the sample-to-group assignments are loaded from a
    'groups.*.txt' file in the ExpressionInput directory derived from filename (the part of
    the path preceding 'AltResults').

    Returns (matrix, compared_groups, original_data):
      matrix[key] -- one list of floats per group, in order of first appearance of the group
      compared_groups[key] -- the groups that have more than one usable value in this row
      original_data[key] -- the unmodified input line (newline terminated); the header line
                            is stored under 'header'
    Values equal to the string '0', non-numeric values and columns missing from a short row
    are treated as missing and left out of the group lists.
    """
    matrix={}
    compared_groups={} ### track which values correspond to which groups for pairwise group comparisons
    original_data={}
    group_db={} ### group -> indexes into the sample columns (ID column excluded)
    groups=[] ### groups ordered by first appearance in the header
    headerRow=True
    try: input_file = open(filename,'rU')
    except ValueError: input_file = open(filename,'r') ### interpreters without the 'U' flag use universal newlines by default
    with input_file:
        for line in input_file:
            original_line = line
            data = line.rstrip()
            values = data.split('\t')
            if headerRow:
                group_db={}
                groups=[]
                if ':' in data:
                    ### Indexes are relative to the sample columns since the ID column is
                    ### removed from every data row before lookup - hence start from 0
                    index=0
                    for (g,s) in [x.split(':') for x in values[1:]]:
                        try: group_db[g].append(index)
                        except KeyError: group_db[g] = [index]
                        index+=1
                        if g not in groups: groups.append(g)
                else:
                    import ExpressionBuilder
                    search_dir = filename.split('AltResults')[0]+'ExpressionInput'
                    ### NOTE(review): when no groups file is present sample_group_db is never
                    ### bound and the lookup below raises NameError; the last match wins
                    for file in unique.read_directory(search_dir):
                        if 'groups.' in file and '.txt' in file:
                            sample_group_db = ExpressionBuilder.simplerGroupImport(search_dir+'/'+file)

                    index=0
                    for s in values[1:]:
                        if s in sample_group_db:
                            g = sample_group_db[s]
                            try: group_db[g].append(index)
                            except KeyError: group_db[g] = [index]
                            if g not in groups: groups.append(g)
                        index+=1
                headerRow = False
                original_data['header'] = original_line
            else:
                key = values[0]
                values = values[1:]
                grouped_floats=[]
                associated_groups=[]
                for g in groups:
                    gvalues_list=[]
                    for i in group_db[g]:
                        try: value = values[i]
                        except IndexError: continue ### row has fewer columns than the header
                        if value == '0': continue ### zeros denote missing values
                        try: gvalues_list.append(float(value))
                        except ValueError: pass ### non-numeric entries are missing values
                    grouped_floats.append(gvalues_list)
                    if len(gvalues_list)>1:
                        associated_groups.append(g)
                matrix[key] = grouped_floats
                compared_groups[key] = associated_groups
                if '\n' not in original_line:
                    original_line+='\n'
                original_data[key] = original_line
    return matrix,compared_groups,original_data
Example #12
0
def getAnnotations(fl,Array_type,p_threshold,e_threshold,data_source,manufacturer,constitutive_source,Species,avg_all_for_ss,filter_by_DABG,perform_alt_analysis,expression_data_format):
    """Load the feature annotations for a dataset and run the grouped export and filtering steps.

    Stores most of its arguments in module-level globals (read by functions outside this
    block), loads the annotation databases for the platform, builds the full-dataset export
    path and then calls, in order, exportGroupedComparisonProbesetData,
    removeNonExpressedProbesets, filterExpressionData and makeGeneLevelAnnotations. For
    array_type 'RNASeq' the grouped export is run again for the counts file and for the
    'exp.' file.

    fl -- options object; ExpFile(), StatsFile() and RootDir() are required, the other accessors are optional
    Array_type -- 'AltMouse', 'gene', 'junction', 'RNASeq' or another array type
    p_threshold -- DABG p-value threshold (converted with float())
    e_threshold -- expression threshold; log2 transformed when expression_data_format is 'log'
    data_source -- not used in this function
    manufacturer -- 'Affymetrix' selects the ExonArrayEnsemblRules annotation import
    constitutive_source -- passed to ExonArrayEnsemblRules.getAnnotations
    Species -- species code; stored globally and used in the export path
    avg_all_for_ss, filter_by_DABG -- stored globally; filter_by_DABG == 'yes' enables the DABG export
    perform_alt_analysis -- passed through to the export and filter calls
    expression_data_format -- 'log' or another value (treated as non-log)

    Returns (probeset_gene_db, annotate_db, comparison_filename_list).
    """
    global species; species = Species; global average_all_probesets; average_all_probesets={}
    global avg_all_probes_for_steady_state; avg_all_probes_for_steady_state = avg_all_for_ss; global filter_by_dabg; filter_by_dabg = filter_by_DABG
    global dabg_p_threshold; dabg_p_threshold = float(p_threshold); global root_dir; global biotypes; global normalize_feature_exp
    global expression_threshold; global exp_data_format; exp_data_format = expression_data_format; global UserOptions; UserOptions = fl
    global full_dataset_export_dir; global excludeLowExpressionExons

    """
    try: exon_exp_threshold = fl.ExonExpThreshold()
    except Exception: exon_exp_threshold = 0
    try: exon_rpkm_threshold = fl.ExonRPKMThreshold()
    except Exception: exon_rpkm_threshold = 0
    try: gene_rpkm_threshold = fl.RPKMThreshold()
    except Exception: gene_rpkm_threshold = 0
    try: gene_exp_threshold = fl.GeneExpThreshold()
    except Exception: gene_exp_threshold = 0
    """
    
    ### The input expression data can be log or non-log. If non-log, transform to log in FilterDABG prior to the alternative exon analysis - v.1.16    
    if expression_data_format == 'log':
        try: expression_threshold = math.log(float(e_threshold),2)
        except Exception: expression_threshold = 0 ### Applies to RNASeq datasets
    else:
        expression_threshold = float(e_threshold)
    
    process_from_scratch = 'no' ###internal variables used while testing
    global dabg_summary; global expression_summary; dabg_summary={};expression_summary={}
    global fulldataset_export_object; global array_type; array_type = Array_type
    global exp_analysis_type; exp_analysis_type = 'expression'
    global stats_input_dir
    expr_input_dir = fl.ExpFile(); stats_input_dir = fl.StatsFile(); root_dir = fl.RootDir()
    ### Optional settings: fall back to a default when the options object lacks the accessor
    try: normalize_feature_exp = fl.FeatureNormalization()
    except Exception: normalize_feature_exp = 'NA'
    try: excludeLowExpressionExons = fl.excludeLowExpressionExons()
    except Exception: excludeLowExpressionExons = True
    try:
        useJunctionsForGeneExpression = fl.useJunctionsForGeneExpression()
        if useJunctionsForGeneExpression:
            print 'Using known junction only to estimate gene expression!!!'
    except Exception: useJunctionsForGeneExpression = False
    
    source_biotype = 'mRNA'
    if array_type == 'gene': source_biotype = 'gene'
    elif array_type == 'junction': source_biotype = 'junction'
    ###Get annotations using Affymetrix as a trusted source or via links to Ensembl

    ### NOTE(review): if neither branch below matches, probeset_db, annotate_db and constitutive_gene_db stay unbound and the later calls raise NameError
    if array_type == 'AltMouse':
        probeset_db,constitutive_gene_db = ExpressionBuilder.importAltMerge('full'); annotate_db={}
        source_biotype = 'AltMouse'
    elif manufacturer == 'Affymetrix' or array_type == 'RNASeq':
        if array_type == 'RNASeq':
            source_biotype = array_type, root_dir ### a (array_type, root_dir) tuple rather than a string

        probeset_db,annotate_db,constitutive_gene_db,splicing_analysis_db = ExonArrayEnsemblRules.getAnnotations(process_from_scratch,constitutive_source,source_biotype,species)

    ### Get all file locations and get array headers
    #print len(splicing_analysis_db),"genes included in the splicing annotation database (constitutive only containing)"
    stats_file_status = verifyFile(stats_input_dir)
    array_linker_db,array_names = importExonProbesetData(expr_input_dir,{},'arraynames')
    input_dir_split = string.split(expr_input_dir,'/')
    ### Export path: <root>AltExpression/FullDatasets/<platform folder>/<species>/<input file name without 'exp.'>
    full_dataset_export_dir = root_dir+'AltExpression/FullDatasets/ExonArray/'+species+'/'+string.replace(input_dir_split[-1],'exp.','')
    if array_type == 'gene': full_dataset_export_dir = string.replace(full_dataset_export_dir,'ExonArray','GeneArray')
    if array_type == 'junction': full_dataset_export_dir = string.replace(full_dataset_export_dir,'ExonArray','JunctionArray')
    if array_type == 'AltMouse': full_dataset_export_dir = string.replace(full_dataset_export_dir,'ExonArray','AltMouse')
    if array_type == 'RNASeq': full_dataset_export_dir = string.replace(full_dataset_export_dir,'ExonArray','RNASeq')
    try: fulldataset_export_object = export.ExportFile(full_dataset_export_dir)
    except Exception:
        print 'AltAnalyze is having trouble creating the directory:\n',full_dataset_export_dir
        ### The bare name ending the next line is not defined in this function; referencing it presumably raises NameError to abort - confirm
        print 'Report this issue to the AltAnalyze help desk or create this directory manually (Error Code X1).'; force_exception
    ### Organize arrays according to groups and export all probeset data and any pairwise comparisons
    data_type = 'expression'
    if array_type == 'RNASeq':
        expr_input_dir = string.replace(expr_input_dir,'exp.','counts.') ### Filter based on the counts file and then replace values with the normalized as the last step
    comparison_filename_list,biotypes = exportGroupedComparisonProbesetData(expr_input_dir,probeset_db,data_type,array_names,array_linker_db,perform_alt_analysis)
    ### When junctions are used for gene expression and junction data exists, drop the 'exon' biotype
    if useJunctionsForGeneExpression:
        if 'junction' in biotypes:
            if 'exon' in biotypes: del biotypes['exon']
    if filter_by_dabg == 'yes' and stats_file_status == 'found':
        data_type = 'dabg'
        exportGroupedComparisonProbesetData(stats_input_dir,probeset_db,data_type,array_names,array_linker_db,perform_alt_analysis)
    ###Filter expression data based on DABG and annotation filtered probesets (will work without DABG filtering as well) - won't work for RNA-Seq (execute function later)
    filtered_exon_db = removeNonExpressedProbesets(probeset_db,full_dataset_export_dir)
    filterExpressionData(expr_input_dir,filtered_exon_db,constitutive_gene_db,probeset_db,'expression',array_names,perform_alt_analysis)
    constitutive_gene_db={}; probeset_gene_db = makeGeneLevelAnnotations(probeset_db)

    if array_type == 'RNASeq':
        ### The export object is re-created before each of the two additional export passes
        fulldataset_export_object = export.ExportFile(full_dataset_export_dir)
        data_type = 'expression' ### Repeat with counts and then with exp. to add gene-level estimates to both
        exportGroupedComparisonProbesetData(expr_input_dir,probeset_db,data_type,array_names,array_linker_db,perform_alt_analysis)
        fulldataset_export_object = export.ExportFile(full_dataset_export_dir)
        expr_input_dir = string.replace(expr_input_dir,'counts.','exp.')
        exportGroupedComparisonProbesetData(expr_input_dir,probeset_db,data_type,array_names,array_linker_db,perform_alt_analysis)
        
    ### splicing_analysis_db is not assigned on the AltMouse path; the except below absorbs the resulting NameError
    try: clearObjectsFromMemory(average_all_probesets); clearObjectsFromMemory(expression_summary); clearObjectsFromMemory(splicing_analysis_db)
    except Exception: null=[]
    ### Rebind the large containers to empty objects so the originals are no longer referenced from here
    filtered_exon_db=[]; probeset_db={}; average_all_probesets={}; expression_summary={}; splicing_analysis_db={}
    #filtered_exp_db,group_count,ranked_array_headers = filterExpressionData(expr_input_dir,filtered_exon_db,constitutive_gene_db,probeset_db)
    #filtered_gene_db = permformFtests(filtered_exp_db,group_count,probeset_db)

    """    
    pre_filtered_db=[]
    print 'global vars'
    returnLargeGlobalVars()    
    print 'local vars'
    all = [var for var in locals() if (var[:2], var[-2:]) != ("__", "__")]
    for var in all:
            try:
                if len(locals()[var])>500: print var, len(locals()[var])
            except Exception: null=[]
    """
    return probeset_gene_db, annotate_db, comparison_filename_list
Example #13
0
def exportGroupedComparisonProbesetData(filename,probeset_db,data_type,array_names,array_linker_db,perform_alt_analysis):
    """Open one export file per pairwise group comparison and export the expression data into them.

    The groups. and comps. file paths are derived from `filename` by replacing its data-type
    prefix ('residuals.', 'exp.' or 'stats.'; a 'counts.' prefix is also replaced for
    'expression' input, which then switches data_type to 'counts'). For every comparison a
    file with a 'UID' + sample-name header row is created and its handle is stored in the
    global comparision_export_db before importExonProbesetData is run in
    'reorderFilterAndExportAll' mode (presumably that function writes the data rows through
    these handles -- confirm there). All handles are closed before returning.

    Reads the module globals array_type, root_dir, species and normalize_feature_exp, and
    sets the globals expr_group_list, comp_group_list, expr_group_db and comparision_export_db.
    perform_alt_analysis is accepted but not used in this function.

    Returns (comparison_filename_list, biotypes).
    """
    global expr_group_list, comp_group_list, expr_group_db, comparision_export_db
    comparison_filename_list = []

    ### Pick the filename prefix that gets swapped for 'groups.' and 'comps.'
    if data_type == 'residuals':
        source_prefix = 'residuals.'
    elif data_type == 'expression':
        source_prefix = 'exp.'
    elif data_type == 'dabg':
        source_prefix = 'stats.'
    expr_group_dir = filename.replace(source_prefix,'groups.')
    comp_group_dir = filename.replace(source_prefix,'comps.')
    if data_type == 'expression' and 'counts.' in filename:
        expr_group_dir = expr_group_dir.replace('counts.','groups.')
        comp_group_dir = comp_group_dir.replace('counts.','comps.')
        data_type = 'counts'

    comp_group_list, comp_group_list2 = ExpressionBuilder.importComparisonGroups(comp_group_dir)
    expr_group_list, expr_group_db = ExpressionBuilder.importArrayGroups(expr_group_dir,array_linker_db)

    print("Reorganizing expression data into comparison groups for export to down-stream splicing analysis software")
    ### Only the header (sample name) information is reordered here
    group_count, raw_data_comp_headers = reorder_arrays.reorderArrayHeaders(array_names,expr_group_list,comp_group_list,array_linker_db)

    ### Name used for the platform inside each comparison filename
    if array_type == 'junction':
        array_type_name = 'Junction'
    elif array_type == 'RNASeq':
        array_type_name = 'RNASeq'
    else:
        array_type_name = 'Exon'

    if data_type == 'residuals':
        ### These files do not need to be filtered until AltAnalyze.py
        output_dir = root_dir+"AltExpression/FIRMA/residuals/"+array_type+'/'+species+'/'
    else:
        output_dir = root_dir+"AltExpression/pre-filtered/"+data_type+'/'

    ### Create the export files, write their headers and keep the handles for the later export
    comparision_export_db = {}
    for comparison in comp_group_list2:
        group1_name = expr_group_db[comparison[0]]
        group2_name = expr_group_db[comparison[1]]
        comparison_filename = ''.join([species,'_',array_type_name,'_',group1_name,'_vs_',group2_name,'.txt'])
        comparison_filename_list.append(comparison_filename)

        ### output_dir[:-1] is the directory without its trailing '/'
        export_handle = export.createExportFile(output_dir+comparison_filename,output_dir[:-1])

        try:
            comparison_headers = raw_data_comp_headers[comparison]
        except KeyError:
            print(raw_data_comp_headers)
            kill ### undefined name: evaluating it raises NameError and aborts the export
        export_handle.write('\t'.join(['UID']+comparison_headers)+'\n')
        comparision_export_db[comparison] = export_handle

    biotypes = importExonProbesetData(filename,probeset_db,'reorderFilterAndExportAll')

    if normalize_feature_exp == 'RPKM':
        ### Add the gene-level RPKM data (this is in addition to the counts. file)
        exp_gene_db = dict((probeset_db[uid][0],[]) for uid in probeset_db)
        filename = filename.replace('.txt','-steady-state.txt')
        importExonProbesetData(filename,exp_gene_db,'reorderFilterAndExportAll')

    for export_handle in comparision_export_db.values():
        export_handle.close()
    print("Pairwise comparisons for AltAnalyze exported...")
    try:
        fulldataset_export_object.close()
    except Exception:
        pass ### best effort: the object may not be defined or may already be closed
    return comparison_filename_list, biotypes
Example #14
0
    eo.close()


if __name__ == '__main__':
    ################  Command-line arguments ################
    ### Script entry point: reads --species and --clip, then loads the transcript biotype
    ### annotations for the species and imports the eCLIP peaks found under the --clip path.
    ###
    ### Usage (preparing the input with bedtools):
    ### bedtools intersect -wb -a /Clip_merged_reproducible_ENCODE/K562/AARS-human.bed -b /annotations/combined/hg19_annotations-full.bed > /test.bed
    import getopt
    CLIP_dir = None
    species = 'Hs'

    if len(sys.argv[1:]) <= 1:
        ### Insufficient number of command-line arguments.
        ### NOTE: execution still continues below with the defaults set above.
        print('WARNING!!!! Too few commands supplied.')
    else:
        options, remainder = getopt.getopt(sys.argv[1:], '',
                                           ['species=', 'clip='])
        for opt, arg in options:
            if opt == '--species':
                species = arg
            elif opt == '--clip':
                CLIP_dir = arg

    import ExpressionBuilder
    coding_db = ExpressionBuilder.importTranscriptBiotypeAnnotations(species)
    dataset_peaks = eCLIPimport(CLIP_dir)
Example #15
0
def sashmi_plot_list(bamdir,
                     eventsToVisualizeFilename,
                     PSIFilename,
                     events=None):
    """Submit splicing events to SashimiPlot in up to three passes.

    When `events` is None the events are read with importSplicingEventsToVisualize from
    eventsToVisualizeFilename. Otherwise the supplied list is used, and entries whose gene
    part (the text before '__') is a known gene symbol are replaced, in place, by the
    matching Ensembl gene ID.

    Passes: (1) all events against PSIFilename; (2) the events left over after pass 1
    against the global steady_state_exp_file; (3) only if pass 2 processed anything, the
    events still left over against PSIFilename again.

    Reads the module globals species, root_dir and steady_state_exp_file.

    Returns the gene -> symbol dictionary (empty when the annotations could not be loaded).
    """
    ### Default both lookups so a failed annotation import cannot leave either name unbound
    ### (gene_to_symbol is read below and returned at the end).
    gene_to_symbol = {}
    symbol_to_gene = {}
    try:
        import gene_associations
        gene_to_symbol = gene_associations.getGeneToUid(
            species, ('hide', 'Ensembl-Symbol'))
        from import_scripts import OBO_import
        symbol_to_gene = OBO_import.swapKeyValues(gene_to_symbol)
    except Exception:
        ### Keep whatever was loaded before the failure; symbol lookups are disabled
        symbol_to_gene = {}

    if events is None:
        splicing_events, expandedSearch = importSplicingEventsToVisualize(
            eventsToVisualizeFilename)
    else:
        expandedSearch = True
        for i, event in enumerate(events):
            gene = event.split('__')[0]
            if gene in gene_to_symbol:
                continue  ### already a recognized gene ID
            if ('ENS' not in gene or 'G0000' in gene) and gene in symbol_to_gene:
                ### translate this ID to an Ensembl gene ID for proper SashimiPlot lookup
                events[i] = symbol_to_gene[gene][0]
        splicing_events = events  ### optionally get from supplied variable

    if len(splicing_events) == 0:
        print(eventsToVisualizeFilename)
        ### Undefined name: evaluating it raises NameError, aborting when there is nothing to plot
        forceNoCompatibleEventsInFile

    ### The trailing comma (Python 2 print statement) keeps the output on the same line
    print('Exporting plots'),

    ### Determine Groups for Coloring
    groups_file = 'None'
    expression_dir = root_dir + '/ExpressionInput'
    for entry in unique.read_directory(expression_dir):
        if 'groups.' in entry:
            groups_file = expression_dir + '/' + entry

    ### groups_file is always a string here (the 'None' placeholder when nothing matched),
    ### so the import is always attempted and a failure selects the ['None'] fallback.
    ### NOTE(review): on that failure sample_group_db stays unbound in this function and the
    ### next call raises UnboundLocalError -- confirm the intended fallback value.
    try:
        import ExpressionBuilder
        sample_group_db = ExpressionBuilder.simplerGroupImport(groups_file)
        groups = []
        for sample in sample_group_db:
            ### create an ordered list of unique groups
            if sample_group_db[sample] not in groups:
                groups.append(sample_group_db[sample])
    except Exception:
        groups = ['None']

    processed_events = formatAndSubmitSplicingEventsToSashimiPlot(
        PSIFilename, bamdir, splicing_events, sample_group_db, groups, False)
    mopup_events = getMopUpEvents(splicing_events, processed_events)

    ### Do the same for supplied gene queries or junctions that didn't map above using the gene expression values as a guide
    processed_events = formatAndSubmitSplicingEventsToSashimiPlot(
        steady_state_exp_file, bamdir, mopup_events, sample_group_db, groups,
        expandedSearch)
    if len(processed_events) > 0:
        mopup_events = getMopUpEvents(mopup_events, processed_events)
        processed_events = formatAndSubmitSplicingEventsToSashimiPlot(
            PSIFilename, bamdir, mopup_events, sample_group_db, groups, True)
    return gene_to_symbol
Example #16
0
def remoteSashimiPlot(Species,fl,bamdir,eventsToVisualizeFilename,events=None,show=False):
    """Set up the module-level plotting state, run Sashimiplottting and rename its output files.

    Species -- species code; stored in the global `species` and used in the PSI filename
    fl -- either an object providing CountsFile() and RootDir(), or the root directory path
    bamdir, eventsToVisualizeFilename, events -- passed through to Sashimiplottting
    show -- written as 'TRUE' or 'FALSE' to show.txt in the output directory

    Sets the globals PSIFilename, outputdir, root_dir, species and sample_group_db, and
    steady_state_exp_file when a steady-state exp. file is present. Every .pdf/.png file
    in <root_dir>/SashimiPlots whose leading ID (the text before '__') is a key of the
    dictionary returned by Sashimiplottting is renamed to
    '<first value for that ID>-SashimiPlot_<rest of the name>'.
    """
    global PSIFilename, outputdir, root_dir, steady_state_exp_file, species, sample_group_db
    species = Species

    try:
        countinp = fl.CountsFile()
        root_dir = fl.RootDir()
    except Exception:
        ### fl does not provide the accessors (or they failed): use fl itself as the root directory
        root_dir = fl
        search_dir = root_dir+'/ExpressionInput'
        for entry in unique.read_directory(search_dir):
            if 'counts.' in entry and 'steady-state.txt' not in entry:
                countinp = search_dir+'/'+entry ### the last matching file wins

    PSIFilename = root_dir+'/AltResults/AlternativeOutput/'+species+'_RNASeq_top_alt_junctions-PSI.txt'

    import ExpressionBuilder
    expression_dir = root_dir+'/ExpressionInput'
    for entry in unique.read_directory(expression_dir):
        if 'exp.' not in entry: continue
        if 'steady-state' in entry:
            steady_state_exp_file = expression_dir+'/'+entry
        else:
            exp_file = expression_dir+'/'+entry
    ### NOTE: exp_file is only bound when a non-steady-state exp. file was found above
    sample_group_db = ExpressionBuilder.simplerGroupImport(exp_file)

    outputdir = root_dir+'/SashimiPlots'
    try: os.mkdir(unique.filepath(outputdir))
    except Exception: pass ### best effort (e.g. the directory already exists)

    ### Record the display preference next to the plots
    with open(outputdir+'/show.txt','w') as show_flag:
        show_flag.write('TRUE' if show else 'FALSE')

    geneSymbol_db=Sashimiplottting(bamdir,countinp,PSIFilename,eventsToVisualizeFilename,events=events)
    for filename in os.listdir(outputdir):
        if '.pdf' not in filename and '.png' not in filename: continue
        gene_id = filename.replace('.pdf','').replace('.png','').split('__')[0]
        if gene_id not in geneSymbol_db: continue

        ### Keep only the part after the first separator that is present
        new_filename = str(filename)
        for separator in ('__','\\','/'):
            if separator in filename:
                new_filename = filename.split(separator)[1]
                break
        nnname = geneSymbol_db[gene_id][0]+'-SashimiPlot_'+new_filename
        source_path = os.path.join(outputdir,filename)
        target_path = os.path.join(outputdir,nnname)
        try: os.rename(source_path,target_path)
        except Exception:
            if 'already exists' in traceback.format_exc():
                ### The target already exists: delete it, then retry the rename
                try: os.remove(target_path)
                except Exception: pass
                try: os.rename(source_path,target_path)
                except Exception: pass
    print('')
Example #17
0
def remoteSashimiPlot(Species,
                      fl,
                      bamdir,
                      eventsToVisualizeFilename,
                      events=None,
                      show=False):
    """Set up the module-level plotting state, run Sashimiplottting and rename its output.

    Args:
        Species: species code; stored in the global `species` and used in the PSI filename.
        fl: either an object providing CountsFile() and RootDir(), or the root directory path.
        bamdir, eventsToVisualizeFilename, events: passed through to Sashimiplottting.
        show: written as 'TRUE' or 'FALSE' to show.txt in the output directory.

    Sets the globals PSIFilename, outputdir, root_dir, species and sample_group_db, and
    steady_state_exp_file when a steady-state exp. file is present. Every .pdf/.png file
    in <root_dir>/SashimiPlots whose leading ID (the text before '__') is a key of the
    dictionary returned by Sashimiplottting is renamed to
    '<first value for that ID>-SashimiPlot_<rest of the name>'.
    """
    global PSIFilename
    global outputdir
    global root_dir
    global steady_state_exp_file
    global species
    global sample_group_db
    species = Species

    try:
        countinp = fl.CountsFile()
        root_dir = fl.RootDir()
    except Exception:
        ### fl does not provide the accessors (or they failed): use fl as the root directory
        root_dir = fl
        search_dir = root_dir + '/ExpressionInput'
        for entry in unique.read_directory(search_dir):
            if 'counts.' in entry and 'steady-state.txt' not in entry:
                countinp = search_dir + '/' + entry  ### the last matching file wins

    PSIFilename = root_dir + '/AltResults/AlternativeOutput/' + species + '_RNASeq_top_alt_junctions-PSI.txt'

    import ExpressionBuilder
    expression_dir = root_dir + '/ExpressionInput'
    for entry in unique.read_directory(expression_dir):
        if 'exp.' not in entry:
            continue
        if 'steady-state' in entry:
            steady_state_exp_file = expression_dir + '/' + entry
        else:
            exp_file = expression_dir + '/' + entry
    ### NOTE: exp_file is only bound when a non-steady-state exp. file was found above
    sample_group_db = ExpressionBuilder.simplerGroupImport(exp_file)

    outputdir = root_dir + '/SashimiPlots'
    try:
        os.mkdir(unique.filepath(outputdir))
    except Exception:
        pass  ### best effort (e.g. the directory already exists)

    ### Record the display preference next to the plots
    with open(outputdir + '/show.txt', 'w') as show_flag:
        show_flag.write('TRUE' if show else 'FALSE')

    geneSymbol_db = Sashimiplottting(bamdir,
                                     countinp,
                                     PSIFilename,
                                     eventsToVisualizeFilename,
                                     events=events)
    for filename in os.listdir(outputdir):
        if '.pdf' not in filename and '.png' not in filename:
            continue
        stem = filename.replace('.pdf', '').replace('.png', '')
        gene_id = stem.split('__')[0]
        if gene_id not in geneSymbol_db:
            continue

        ### Keep only the part after the first separator that is present
        new_filename = str(filename)
        for separator in ('__', '\\', '/'):
            if separator in filename:
                new_filename = filename.split(separator)[1]
                break
        nnname = geneSymbol_db[gene_id][0] + '-SashimiPlot_' + new_filename
        source_path = os.path.join(outputdir, filename)
        target_path = os.path.join(outputdir, nnname)
        try:
            os.rename(source_path, target_path)
        except Exception:
            if 'already exists' in traceback.format_exc():
                ### The target already exists: delete it, then retry the rename
                try:
                    os.remove(target_path)
                except Exception:
                    pass
                try:
                    os.rename(source_path, target_path)
                except Exception:
                    pass
    print('')
Example #18
0
def matrixImport(filename):
    """Read a tab-delimited matrix file and collect each row's numeric values per sample group.

    The first line is the header: column one labels the row IDs and every other column
    names a sample. When the header contains ':', each sample column is expected to be
    'group:sample'. Otherwise the sample -> group assignments are loaded with
    ExpressionBuilder.simplerGroupImport from a 'groups.*.txt' file in the
    'ExpressionInput' directory located where 'AltResults' starts in `filename`.

    Returns a tuple of three dictionaries:
        matrix          -- row ID -> one list of floats per group (groups in header order)
        compared_groups -- row ID -> names of the groups that have more than one usable value
        original_data   -- row ID (and the key 'header') -> the original text line; data
                           lines are guaranteed to end with a newline

    Values equal to the string '0', non-numeric values and columns absent from a row
    (rstrip() removes trailing empty columns) are skipped.
    """
    matrix={}
    compared_groups={} ### track which values correspond to which groups for pairwise group comparisons
    original_data={}
    group_db={} ### group name -> sample column positions, counted from the first sample column
    groups=[] ### group names in order of first appearance in the header
    headerRow=True

    try: source = open(filename,'rU')
    except ValueError: source = open(filename,'r') ### Python 3.11+ rejects 'U'; text mode there already translates newlines
    try:
        for line in source:
            data = line.rstrip()
            values = data.split('\t')
            if headerRow:
                headerRow = False
                original_data['header'] = line
                if ':' in data:
                    ### Positions are 0-based because data rows are indexed after their ID
                    ### column has been removed (same convention as the branch below)
                    for index,column in enumerate(values[1:]):
                        g,s = column.split(':')
                        group_db.setdefault(g,[]).append(index)
                        if g not in groups: groups.append(g)
                else:
                    import ExpressionBuilder
                    search_dir = filename.split('AltResults')[0]+'ExpressionInput'
                    for group_file in unique.read_directory(search_dir):
                        if 'groups.' in group_file and '.txt' in group_file:
                            sample_group_db = ExpressionBuilder.simplerGroupImport(search_dir+'/'+group_file)
                    ### NOTE: sample_group_db is only bound when a groups file was found above
                    for index,s in enumerate(values[1:]):
                        if s in sample_group_db:
                            g = sample_group_db[s]
                            group_db.setdefault(g,[]).append(index)
                            if g not in groups: groups.append(g)
            else:
                key = values[0]
                values = values[1:]
                grouped_floats=[]
                associated_groups=[]
                for g in groups:
                    gvalues_list=[]
                    for i in group_db[g]:
                        try: value = values[i]
                        except IndexError: continue ### column missing from this row
                        if value == '0': continue ### '0' entries are not used
                        try: gvalues_list.append(float(value))
                        except ValueError: pass ### non-numeric entry
                    grouped_floats.append(gvalues_list)
                    if len(gvalues_list)>1:
                        associated_groups.append(g)
                matrix[key] = grouped_floats
                compared_groups[key] = associated_groups
                if '\n' not in line:
                    line+='\n' ### the last line of the file may lack a newline
                original_data[key] = line
    finally:
        source.close()
    return matrix,compared_groups,original_data