def retreiveAllKnownSpliceSites(returnExonRetention=False,DesignatedSpecies=None,path=None):
    ### Uses a priori strand information when none present
    import export, unique
    chromosomes_found={}
    try: parent_dir = export.findParentDir(bam_file)
    except Exception: parent_dir = export.findParentDir(path)
    species = None
    for file in os.listdir(parent_dir):
        if 'AltAnalyze_report' in file and '.log' in file:
            log_file = unique.filepath(parent_dir+'/'+file)
            log_contents = open(log_file, "rU")
            species_tag = '\tspecies: '
            for line in log_contents:
                line = line.rstrip()
                if species_tag in line:
                    species = string.split(line,species_tag)[1]
    if species == None:
        try: species = IndicatedSpecies
        except Exception: species = DesignatedSpecies
    
    splicesite_db={}
    gene_coord_db={}
    try:
        if ExonReference==None:
            exon_dir = 'AltDatabase/ensembl/'+species+'/'+species+'_Ensembl_exon.txt'
            length = verifyFileLength(exon_dir)
    except Exception:
        #print traceback.format_exc();sys.exit()
        length = 0
    if length==0:
        exon_dir = ExonReference
    refExonCoordinateFile = unique.filepath(exon_dir)
    firstLine=True
    for line in open(refExonCoordinateFile,'rU').xreadlines():
        if firstLine: firstLine=False
        else:
            line = line.rstrip('\n')
            t = string.split(line,'\t'); #'gene', 'exon-id', 'chromosome', 'strand', 'exon-region-start(s)', 'exon-region-stop(s)', 'constitutive_call', 'ens_exon_ids', 'splice_events', 'splice_junctions'
            geneID, exon, chr, strand, start, stop = t[:6]
            spliceEvent = t[-2]
            #start = int(start); stop = int(stop)
            #geneID = string.split(exon,':')[0]
            try:
                gene_coord_db[geneID,chr].append(int(start))
                gene_coord_db[geneID,chr].append(int(stop))
            except Exception:
                gene_coord_db[geneID,chr] = [int(start)]
                gene_coord_db[geneID,chr].append(int(stop))
            if returnExonRetention:
                if 'exclusion' in spliceEvent:
                    splicesite_db[geneID+':'+exon]=[]
            else:
                splicesite_db[chr,start]=strand
                splicesite_db[chr,stop]=strand
                if len(chr)<5 or ('GL0' not in chr and 'GL' not in chr and 'JH' not in chr and 'MG' not in chr):
                    chromosomes_found[string.replace(chr,'chr','')] = []
    for i in gene_coord_db:
        gene_coord_db[i].sort()
        gene_coord_db[i] = [gene_coord_db[i][0],gene_coord_db[i][-1]]
    return splicesite_db,chromosomes_found,gene_coord_db
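### Hypothetical usage sketch (not part of the original module). It assumes the globals this
### snippet references (bam_file, IndicatedSpecies, ExonReference) are defined at module level,
### and the BAM path below is illustrative only.
splicesite_db, chromosomes_found, gene_coord_db = retreiveAllKnownSpliceSites(
    DesignatedSpecies='Hs', path='/BAMs/sample1.bam')
### splicesite_db is keyed by (chromosome, position) -> strand, while gene_coord_db is keyed
### by (geneID, chromosome) -> [min_coordinate, max_coordinate]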
Example #2
def FilterFile(Guidefile, PSI, turn=0):
    if 'Clustering' in Guidefile:
        count = 1
    else:
        count = 0
    val = []
    head = 0
    for line in open(Guidefile, 'rU').xreadlines():
        if head > count:
            line = line.rstrip('\r\n')
            q = string.split(line, '\t')
            val.append(q[0])
        else:
            head += 1
            continue

    dire = export.findParentDir(export.findParentDir(Guidefile)[:-1])
    output_dir = dire + 'SubtypeAnalyses-Results'
    if os.path.exists(output_dir) == False:
        export.createExportFolder(output_dir)

    #output_file = output_dir+'/round'+str(turn)+'/'+export.findFilename(PSI)+'-filtered.txt'
    output_file = output_dir + '/round' + str(
        turn) + '/' + export.findFilename(PSI)[:-4] + '-filtered.txt'
    try:
        os.mkdir(output_dir + '/round' + str(turn))
    except:
        pass  ### already exists
    if turn == 1:
        ### No need to filter this file
        shutil.copyfile(PSI, output_file)
    else:
        filterRows(PSI, output_file, filterDB=val)

    return output_file
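### Hypothetical usage sketch (paths are illustrative only). On turn 1 the PSI file is copied
### unfiltered; on later turns it is restricted to the IDs listed in the guide file.
guide_file = '/results/Clustering-MyStudy-Guide3.txt'
psi_file = '/results/exp.MyStudy-PSI-values.txt'
filtered_psi = FilterFile(guide_file, psi_file, turn=2)
print 'Filtered PSI file written to:', filtered_psi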
Example #3
def getPathwayAs(pathway_db, species_code, mod):
    begin_time = time.time()
    try:
        export.deleteFolder(
            'BuildDBs/WPs')  ### clear any remaining pathway files
    except Exception:
        null = []
    for wpid in pathway_db:
        file_type = 'gpml'
        wp_id_data = client.service.getPathwayAs(fileType=file_type,
                                                 pwId=wpid,
                                                 revision=0)
        wp_id_data = base64.b64decode(wp_id_data)
        gpml_path = filepath('BuildDBs/WPs/' + wpid + '.gpml')
        outfile = export.ExportFile(gpml_path)
        outfile.write(wp_id_data)
        outfile.close()
        gene_system_list = string.split(wp_id_data, '\n')
        parent_path = export.findParentDir(gpml_path)
        pathway_db = gene_associations.getGPMLGraphData(
            parent_path, species_code, mod)  ### get GPML data back
        os.remove(gpml_path)  ### Only store the file temporarily

    end_time = time.time()
    time_diff = float(end_time - begin_time)
    """
    try: print "WikiPathways data imported in %d seconds" % time_diff
    except Exception: null=None ### Occurs when transitioning back from the Official Database download window (not sure why) -- TclError: can't invoke "update" command
    """
    return pathway_db
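### Hypothetical invocation (mirrors how viewLineageProfilerResults further below builds the
### input dictionary); assumes the PathwayData class and the SOAP client used above are available.
pathway_db = {'WP2062': PathwayData('TissueFateMap')}
pathway_db = getPathwayAs(pathway_db, 'Hs', 'Ensembl')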
Example #4
def runPyCombat(fl):
    """ This method was added specifically for AltAnalyze version 2.0.8 (not in the original GitHub code) """
    print 'Running Combat...',
    expr_input_dir = fl.ExpFile()
    pheno_dir = formatPhenoFile(fl)

    moved_exp_dir = export.findParentDir(
        expr_input_dir) + 'Non-Combat/' + export.findFilename(expr_input_dir)
    try:
        export.copyFile(expr_input_dir, moved_exp_dir)
        print 'Moved original expression file to:'
        print '\t' + moved_exp_dir
        ### now overwrite the original file, excluding the commented rows
        export.cleanFile(
            expr_input_dir,
            removeExtra='#')  ### remove comments from the original file
    except Exception:
        None

    pheno = pa.read_table(pheno_dir, index_col=0)
    dat = pa.read_table(expr_input_dir, index_col=0)

    mod = patsy.dmatrix("group", pheno, return_type="dataframe")
    t = time.time()
    #print dat, pheno.batch, mod;sys.exit()
    ebat = combat(dat, pheno.batch, mod, 0)
    print "...Combat completed in %.2f seconds" % (time.time() - t)

    print 'Original expression file over-written with batch effect removal results...'
    ebat.to_csv(expr_input_dir, sep="\t")
def getPathwayAs(pathway_db,species_code,mod):
    begin_time = time.time()
    for wpid in pathway_db:
        #print [wpid],'pathway_db',len(pathway_db)
        file_type = 'gpml'
        #file_type = 'application/gpml+xml'
        processor_time = str(time.clock())
        #try: export.deleteFolder('BuildDBs/WPs') ### clear any remaining pathway files
        #except Exception: pass
        #wp_id_data = client.service.getPathwayAs(fileType = file_type,pwId = wpid, revision = 0)
        kwargs = {
            'identifier': 'WP2062',
            'version': 0,
            'file_format': 'application/gpml+xml'}
        #wp_id_data = wikipathways_api_client_instance.get_pathway_as(**kwargs)
        wp_id_data = wikipathways_api_client_instance.get_pathway_as(file_format = file_type,identifier = wpid, version = 0)
        #wp_id_data = base64.b64decode(wp_id_data)
        gpml_path = filepath('BuildDBs/WPs/'+processor_time+'/'+wpid+'.gpml')
        #print gpml_path
        outfile = export.ExportFile(gpml_path)
        outfile.write(wp_id_data); outfile.close()
        gene_system_list = string.split(wp_id_data,'\n')
        parent_path = export.findParentDir(gpml_path)
        pathway_db = gene_associations.getGPMLGraphData(parent_path,species_code,mod) ### get GPML data back
        
        #os.remove(gpml_path) ### Only store the file temporarily
        try: export.deleteFolder('BuildDBs/WPs/'+processor_time) ### clear any remaining pathway files
        except Exception: pass
        
    end_time = time.time(); time_diff = float(end_time-begin_time)
    """
    try: print "WikiPathways data imported in %d seconds" % time_diff
    except Exception: null=None ### Occurs when transitioning back from the Official Database download window (not sure why) -- TclError: can't invoke "update" command
    """
    return pathway_db
Example #6
def downloadCurrentVersion(filename,secondary_dir,file_type):
    import UI
    file_location_defaults = UI.importDefaultFileLocations()

    ud = file_location_defaults['url'] ### Get the location of the download site from Config/default-files.csv
    url_dir = ud.Location() ### Only one entry
    
    dir = export.findParentDir(filename)
    dir = string.replace(dir,'hGlue','')  ### Used since the hGlue data is in a sub-directory
    filename = export.findFilename(filename)
    url = url_dir+secondary_dir+'/'+filename
    file,status = download(url,dir,file_type); continue_analysis = 'yes'
    if 'Internet' in status and 'nnot' not in filename: ### Exclude for Affymetrix annotation files
        print_out = "File:\n"+url+"\ncould not be found on the server or an internet connection is unavailable."
        if len(sys.argv)<2:
            try:
                UI.WarningWindow(print_out,'WARNING!!!')
                continue_analysis = 'no'
            except Exception:
                print 'cannot be downloaded';force_error ### undefined name - intentionally halts with a NameError
        else: print 'cannot be downloaded';force_error
    elif status == 'remove' and ('.zip' in file or '.tar' in file or '.gz' in file):
        try: os.remove(file) ### Not sure why this works now and not before
        except Exception: status = status
    return continue_analysis
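### Hypothetical call (argument values are illustrative only). The function downloads
### url_dir + secondary_dir + '/' + filename into the parent directory of the local filename
### and returns 'yes' or 'no' to indicate whether the analysis should continue.
continue_analysis = downloadCurrentVersion('AltDatabase/Config/Hs_annotations.zip', 'Config', '.zip')
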
def retreiveAllKnownSpliceSites():
    ### Uses a priori strand information when none present
    import export, unique
    chromosomes_found={}
    parent_dir = export.findParentDir(bam_file)
    species = None
    for file in os.listdir(parent_dir):
        if 'AltAnalyze_report' in file and '.log' in file:
            log_file = unique.filepath(parent_dir+'/'+file)
            log_contents = open(log_file, "rU")
            species_tag = '\tspecies: '
            for line in log_contents:
                line = line.rstrip()
                if species_tag in line:
                    species = string.split(line,species_tag)[1]
    if species == None:
        species = IndicatedSpecies

    splicesite_db={}
    refExonCoordinateFile = unique.filepath('AltDatabase/ensembl/'+species+'/'+species+'_Ensembl_exon.txt')
    firstLine=True
    for line in open(refExonCoordinateFile,'rU').xreadlines():
        if firstLine: firstLine=False
        else:
            line = line.rstrip('\n')
            t = string.split(line,'\t'); #'gene', 'exon-id', 'chromosome', 'strand', 'exon-region-start(s)', 'exon-region-stop(s)', 'constitutive_call', 'ens_exon_ids', 'splice_events', 'splice_junctions'
            geneID, exon, chr, strand, start, stop = t[:6]
            #start = int(start); stop = int(stop)
            #geneID = string.split(exon,':')[0]
            splicesite_db[chr,start]=strand
            splicesite_db[chr,stop]=strand
            if len(chr)<5 or ('GL0' not in chr and 'GL' not in chr and 'JH' not in chr and 'MG' not in chr):
                chromosomes_found[string.replace(chr,'chr','')] = []
    
    return splicesite_db,chromosomes_found
Example #8
def unzipFiles(filename,dir):
    import zipfile
    output_filepath = filepath(dir+'/'+filename)
    try:
        zfile = zipfile.ZipFile(output_filepath)
        for name in zfile.namelist():
            if name.endswith('/'):null=[] ### Don't need to export
            else:
                if 'EnsMart' in name and 'EnsMart' in dir:
                    dir = export.findParentDir(dir[:-1]) ### Remove EnsMart suffix directory
                try: outfile = export.ExportFile(filepath(dir+name))
                except Exception: outfile = export.ExportFile(filepath(dir+name[1:]))
                outfile.write(zfile.read(name)); outfile.close()
        #print 'Zip extracted to:',output_filepath
        status = 'completed'
    except Exception, e:
        try:
            ### Use the operating system's unzip if all else fails
            extracted_path = string.replace(output_filepath,'.zip','')
            try: os.remove(extracted_path) ### This is necessary, otherwise the empty file created above will require user authorization to delete
            except Exception: null=[]
            subprocessUnzip(dir,output_filepath)
            status = 'completed'
        except IOError:
            print e
            print 'WARNING!!!! The zip file',output_filepath,'does not appear to be a valid zip archive file or is corrupt.'
            status = 'failed'
    return status
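### Hypothetical usage (values are illustrative only): extract a downloaded zip archive into the
### target directory, falling back to the operating system's unzip utility if zipfile fails.
status = unzipFiles('EnsMart72Genes.zip', '/AltDatabase/EnsMart72/')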
Example #9
def covertAffyFormatToBED(filename, ConversionDB=None):
    print 'processing:',filename
    parent = export.findParentDir(filename)
    if ConversionDB==None:
        output_file = 'simple_chr.bed'
    else:
        output_file = export.findFilename(filename)
        output_file = string.replace(output_file,'mm9','mm10')
    export_obj = export.ExportFile(parent+'/'+output_file)
    fn=filepath(filename); entry_count=0; readfiles = False
    for line in open(fn,'rU').xreadlines():
        data = cleanUpLine(line)
        if data[0]=='#': readfiles = False
        elif readfiles==False:
            readfiles = True
            if ConversionDB!=None:
               export_obj.write(line) ### Write header 
        else:
            try:
                t = string.split(data[1:-1],'","')
                probeset_id,chr,strand,start,stop = t[:5]
                int(start)
                if ConversionDB==None:
                    if 'chr' in chr:
                        export_obj.write(chr+'\t'+start+'\t'+stop+'\t'+probeset_id+'\n')
                else:
                    chr,start,stop = ConversionDB[probeset_id]
                    t = [probeset_id,chr,strand,start,stop] + t[5:]
                    values = '"'+string.join(t,'","')+'"\n'
                    export_obj.write(values)
                entry_count+=1
            except Exception:
                pass
    export_obj.close()
    print entry_count, 'entries saved to:',parent+'/'+output_file
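### Hypothetical usage (file path is illustrative only). With no ConversionDB supplied, the
### quoted Affymetrix probeset CSV is reduced to a four-column simple_chr.bed in the same folder.
covertAffyFormatToBED('/annotations/MoGene-1_0-st-v1.mm9.probeset.csv')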
Example #10
def downloadCurrentVersion(filename,secondary_dir,file_type):
    import UI
    file_location_defaults = UI.importDefaultFileLocations()

    ud = file_location_defaults['url'] ### Get the location of the download site from Config/default-files.csv
    url_dir = ud.Location() ### Only one entry
    
    dir = export.findParentDir(filename)
    dir = string.replace(dir,'hGlue','')  ### Used since the hGlue data is in a sub-directory
    filename = export.findFilename(filename)
    url = url_dir+secondary_dir+'/'+filename
    print url
    file,status = download(url,dir,file_type); continue_analysis = 'yes'
    if 'Internet' in status and 'nnot' not in filename: ### Exclude for Affymetrix annotation files
        print_out = "File:\n"+url+"\ncould not be found on the server or an internet connection is unavailable."
        if len(sys.argv)<2:
            try:
                UI.WarningWindow(print_out,'WARNING!!!')
                continue_analysis = 'no'
            except Exception:
                print 'cannot be downloaded';force_error
        else: print 'cannot be downloaded';force_error
    elif status == 'remove' and ('.zip' in file or '.tar' in file or '.gz' in file):
        try: os.remove(file) ### Not sure why this works now and not before
        except Exception: status = status
    return continue_analysis
Example #11
def runPyCombat(fl):
    """ This method was added specifically for AltAnalyze version 2.0.8 (not in the original GitHub code) """
    print "Running Combat...",
    expr_input_dir = fl.ExpFile()
    pheno_dir = formatPhenoFile(fl)

    moved_exp_dir = export.findParentDir(expr_input_dir) + "Non-Combat/" + export.findFilename(expr_input_dir)
    try:
        export.copyFile(expr_input_dir, moved_exp_dir)
        print "Moved original expression file to:"
        print "\t" + moved_exp_dir
        ### now overwrite the original file, excluding the commented rows
        export.cleanFile(expr_input_dir, removeExtra="#")  ### remove comments from the original file
    except Exception:
        None

    pheno = pa.read_table(pheno_dir, index_col=0)
    dat = pa.read_table(expr_input_dir, index_col=0)

    mod = patsy.dmatrix("group", pheno, return_type="dataframe")
    t = time.time()
    # print dat, pheno.batch, mod;sys.exit()
    ebat = combat(dat, pheno.batch, mod, 0)
    print "...Combat completed in %.2f seconds" % (time.time() - t)

    print "Original expression file over-written with batch effect removal results..."
    ebat.to_csv(expr_input_dir, sep="\t")
Example #12
def normalizeDataset(filename,
                     output=None,
                     normalization='quantile',
                     platform="3'array"):
    """ Perform Quantile Normalization on an input expression dataset """

    if output == None:
        output = filename
        moved_exp_dir = export.findParentDir(
            filename) + 'Non-Normalized/' + export.findFilename(filename)
        try:
            export.copyFile(filename, moved_exp_dir)
            print 'Moved original expression file to:'
            print '\t' + moved_exp_dir
        except Exception:
            None

    if normalization == 'Quantile' or normalization == 'quantile':
        print "Importing data..."
        sample_expression_db = importExpressionValues(filename)
        print "Performing quantile normalization..."
        sample_expression_db = RNASeq.quantileNormalizationSimple(
            sample_expression_db)
        exportExpressionData(output, sample_expression_db)
    elif normalization == 'group':
        performGroupNormalization(moved_exp_dir, filename, platform)
    print 'Exported expression input file to:', output
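### Hypothetical call (path is illustrative only): quantile-normalize an expression matrix in
### place after backing the original up to a Non-Normalized/ subdirectory.
normalizeDataset('/ExpressionInput/exp.MyStudy.txt', normalization='quantile')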
Example #13
def retreiveAllKnownSpliceSites():
    ### Uses a priori strand information when none present
    import export, unique
    chromosomes_found={}
    parent_dir = export.findParentDir(bam_file)
    for file in os.listdir(parent_dir):
        if 'AltAnalyze_report' in file and '.log' in file:
            log_file = unique.filepath(parent_dir+'/'+file)
            log_contents = open(log_file, "rU")
            species_tag = '\tspecies: '
            for line in log_contents:
                line = line.rstrip()
                if species_tag in line:
                    species = string.split(line,species_tag)[1]
    splicesite_db={}
    refExonCoordinateFile = unique.filepath('AltDatabase/ensembl/'+species+'/'+species+'_Ensembl_exon.txt')
    firstLine=True
    for line in open(refExonCoordinateFile,'rU').xreadlines():
        if firstLine: firstLine=False
        else:
            line = line.rstrip('\n')
            t = string.split(line,'\t'); #'gene', 'exon-id', 'chromosome', 'strand', 'exon-region-start(s)', 'exon-region-stop(s)', 'constitutive_call', 'ens_exon_ids', 'splice_events', 'splice_junctions'
            geneID, exon, chr, strand, start, stop = t[:6]
            #start = int(start); stop = int(stop)
            #geneID = string.split(exon,':')[0]
            splicesite_db[chr,start]=strand
            splicesite_db[chr,stop]=strand
            if len(chr)<5 or ('GL0' not in chr and 'GL' not in chr and 'JH' not in chr and 'MG' not in chr):
                chromosomes_found[string.replace(chr,'chr','')] = []
    
    return splicesite_db,chromosomes_found
Example #14
def downloadCurrentVersion(filename, secondary_dir, file_type):
    import UI
    file_location_defaults = UI.importDefaultFileLocations()

    uds = file_location_defaults[
        'url']  ### Get the location of the download site from Config/default-files.csv
    for ud in uds:
        url_dir = ud.Location()  ### Only one entry

    dir = export.findParentDir(filename)
    filename = export.findFilename(filename)
    url = url_dir + secondary_dir + '/' + filename

    file, status = download(url, dir, file_type)
    continue_analysis = 'yes'
    if 'Internet' in status:
        print_out = "File:\n" + url + "\ncould not be found on server or internet connection is unavailable."
        try:
            UI.WarningWindow(print_out, 'WARNING!!!')
            continue_analysis = 'no'
        except Exception:
            print url
            print 'cannot be downloaded'
            die
    elif status == 'remove':
        try:
            os.remove(file)  ### Not sure why this works now and not before
        except Exception:
            status = status
    return continue_analysis
def visualizePathwayAssociations(filename,
                                 species,
                                 mod_type,
                                 wpid,
                                 imageExport=True):
    ### Log any potential problems
    log_file = filepath('webservice.log')
    log_report = open(log_file, 'w')
    if wpid == None:
        force_invalid_pathway ### undefined name - intentionally raises a NameError when no WikiPathways ID is supplied

    global mod
    global species_code
    global graphic_link
    graphic_link = {}
    mod = mod_type
    species_code = species
    root_dir = export.findParentDir(filename)
    criterion_name = export.findFilename(filename)[:-4]
    log_report.write('Filename: %s and WPID %s\n' % (filename, wpid))
    if 'GO-Elite/input' in root_dir:
        root_dir = string.replace(root_dir, 'GO-Elite/input', 'WikiPathways')
    else:
        root_dir += 'WikiPathways/'
    analysis_type = 'Genes'
    id_db, column_headers = importDataSimple(filename, 'GO-Elite')
    log_report.write('GO-Elite input ID file imported successfully\n')
    log_report.write('%d IDs imported\n' % len(id_db))
    pathway_db = {}
    pathway_db[wpid] = PathwayData(
        None
    )  ### only need to analyze object (method allows for analysis of any number)
    pathway_db = getPathwayAs(pathway_db, species_code, mod)
    log_report.write(
        'Pathway data imported from GPML files obtained from webservice\n')
    id_color_db = getHexadecimalColorRanges(
        id_db, analysis_type)  ### example id_db" is key:gene, value:fold
    graphID_db = getGraphIDAssociations(id_color_db, pathway_db, 'MOD')
    if imageExport != 'png':
        file_type = 'pdf'  ### svg, pdf, png
        getColoredPathway(root_dir,
                          graphID_db,
                          file_type,
                          '-' + criterion_name,
                          WPID=wpid)
    if imageExport != 'pdf':
        file_type = 'png'  ### svg, pdf, png
        getColoredPathway(root_dir,
                          graphID_db,
                          file_type,
                          '-' + criterion_name,
                          WPID=wpid)
    log_report.write(
        'Pathways colored and image data returned. Exiting webservice.\n')
    log_report.close()
    return graphic_link
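### Hypothetical call (input path is illustrative; WP2062 is the WikiPathways ID used elsewhere
### in this module): color the pathway using the fold changes from a GO-Elite input file and
### return the graphic_link collection of exported PDF/PNG images.
graphic_links = visualizePathwayAssociations('/GO-Elite/input/GE.MyCriterion.txt', 'Hs', 'Ensembl', 'WP2062')
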
def viewLineageProfilerResults(filename, graphic_links):
    global graphic_link
    graphic_link = graphic_links  ### This is a list of tuples containing name and file location

    ### Log any potential problems
    log_file = filepath('webservice.log')
    log_report = open(log_file, 'w')

    root_dir = export.findParentDir(filename)
    root_dir = string.replace(root_dir, 'ExpressionOutput/Clustering',
                              'DataPlots')
    if 'DataPlots' not in root_dir:  ### Occurs when directly supplying an input matrix by the user
        root_dir += '/DataPlots/'
        try:
            os.mkdir(root_dir)  ### May need to create this directory
        except Exception:
            None
    id_db, column_headers = importDataSimple(filename, 'LineageProfiler')
    log_report.write('LineageProfiler input ID file imported successfully\n')
    pathway_db = {}
    pathway_db['WP2062'] = PathwayData('TissueFateMap')
    ### MOD and species are not particularly important for Lineage analysis
    pathway_db = getPathwayAs(pathway_db, 'Hs', 'Ensembl')
    log_report.write(
        'Pathway data imported from GPML files obtained from webservice\n')
    i = 0
    group_id_db = {}  ### store the results separately for each sample
    ### When analyzing z-scores, you can have multiple samples you wish to visualize results for (not so for regulated gene lists)
    for biological_group in column_headers:
        group_id_db[biological_group] = db = {}
        for gene in id_db:
            group_id_db[biological_group][gene] = id_db[gene][
                i]  ### get the index value of that biological group (z-score change)
        i += 1
    for biological_group in group_id_db:
        group_specific = group_id_db[biological_group]
        analysis_type = 'Lineage'
        id_color_db = getHexadecimalColorRanges(
            group_specific,
            analysis_type)  ### example "id_db" is key:tissue, value:z-score
        graphID_db = getGraphIDAssociations(id_color_db, pathway_db, 'Label')
        file_type = 'png'  ### svg, pdf, png
        getColoredPathway(root_dir, graphID_db, file_type,
                          '-' + biological_group)
        file_type = 'pdf'  ### svg, pdf, png
        getColoredPathway(root_dir, graphID_db, file_type,
                          '-' + biological_group)

    log_report.write(
        'Pathways colored and images saved to disk. Exiting webservice.\n')
    log_report.close()
    return graphic_link
def viewLineageProfilerResults(filename, graphic_links):
    global graphic_link
    graphic_link = graphic_links  ### This is a list of tuples containing name and file location

    ### Log any potential problems
    log_file = filepath("webservice.log")
    log_report = open(log_file, "w")

    root_dir = export.findParentDir(filename)
    root_dir = string.replace(root_dir, "ExpressionOutput/Clustering", "DataPlots")
    if "DataPlots" not in root_dir:  ### Occurs when directly supplying an input matrix by the user
        root_dir += "/DataPlots/"
        try:
            os.mkdir(root_dir)  ### May need to create this directory
        except Exception:
            None
    id_db, column_headers = importDataSimple(filename, "LineageProfiler")
    log_report.write("LineageProfiler input ID file imported successfully\n")
    pathway_db = {}
    pathway_db["WP2062"] = PathwayData("TissueFateMap")
    ### MOD and species are not particularly important for Lineage analysis
    pathway_db = getPathwayAs(pathway_db, "Hs", "Ensembl")
    log_report.write("Pathway data imported from GPML files obtained from webservice\n")
    i = 0
    group_id_db = {}  ### store the results separately for each sample
    ### When analyzing z-scores, you can have multiple samples you wish to visualize results for (not so for regulated gene lists)
    for biological_group in column_headers:
        group_id_db[biological_group] = db = {}
        for gene in id_db:
            group_id_db[biological_group][gene] = id_db[gene][
                i
            ]  ### get the index value of that biological group (z-score change)
        i += 1
    for biological_group in group_id_db:
        group_specific = group_id_db[biological_group]
        analysis_type = "Lineage"
        id_color_db = getHexadecimalColorRanges(
            group_specific, analysis_type
        )  ### example "id_db" is key:tissue, value:z-score
        graphID_db = getGraphIDAssociations(id_color_db, pathway_db, "Label")
        file_type = "png"  ### svg, pdf, png
        getColoredPathway(root_dir, graphID_db, file_type, "-" + biological_group)
        file_type = "pdf"  ### svg, pdf, png
        getColoredPathway(root_dir, graphID_db, file_type, "-" + biological_group)

    log_report.write("Pathways colored and images saved to disk. Exiting webservice.\n")
    log_report.close()
    return graphic_link
def getPathwayAs(pathway_db, species_code, mod):
    begin_time = time.time()
    for wpid in pathway_db:
        #print [wpid],'pathway_db',len(pathway_db)
        file_type = 'gpml'
        #file_type = 'application/gpml+xml'
        processor_time = str(time.clock())
        #try: export.deleteFolder('BuildDBs/WPs') ### clear any remaining pathway files
        #except Exception: pass
        #wp_id_data = client.service.getPathwayAs(fileType = file_type,pwId = wpid, revision = 0)
        kwargs = {
            'identifier': 'WP2062',
            'version': 0,
            'file_format': 'application/gpml+xml'
        }
        #wp_id_data = wikipathways_api_client_instance.get_pathway_as(**kwargs)
        wp_id_data = wikipathways_api_client_instance.get_pathway_as(
            file_format=file_type, identifier=wpid, version=0)
        #wp_id_data = base64.b64decode(wp_id_data)
        gpml_path = filepath('BuildDBs/WPs/' + processor_time + '/' + wpid +
                             '.gpml')
        #print gpml_path
        outfile = export.ExportFile(gpml_path)
        outfile.write(wp_id_data)
        outfile.close()
        gene_system_list = string.split(wp_id_data, '\n')
        parent_path = export.findParentDir(gpml_path)
        pathway_db = gene_associations.getGPMLGraphData(
            parent_path, species_code, mod)  ### get GPML data back

        #os.remove(gpml_path) ### Only store the file temporarily
        try:
            export.deleteFolder(
                'BuildDBs/WPs/' +
                processor_time)  ### clear any remaining pathway files
        except Exception:
            pass

    end_time = time.time()
    time_diff = float(end_time - begin_time)
    """
    try: print "WikiPathways data imported in %d seconds" % time_diff
    except Exception: null=None ### Occurs when transitioning back from the Official Database download window (not sure why) -- TclError: can't invoke "update" command
    """
    return pathway_db
Example #19
def importConvertedBED(filename):
    print 'processing:',filename
    parent = export.findParentDir(filename)
    fn=filepath(filename); entry_count=0; newCoordinates={}
    for line in open(fn,'rU').xreadlines():
        data = cleanUpLine(line)
        if data[0]!='#':
            try:
                t = string.split(data,'\t')
                chr,start,stop,probeset_id = t
                int(start)
                if 'chr' in chr:
                    entry_count+=1
                newCoordinates[probeset_id] = chr,start,stop
            except ZeroDivisionError:
                pass
    print entry_count, 'imported and saved.'
    return newCoordinates
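### Hypothetical usage (path is illustrative only): re-import converted BED coordinates as a
### probeset_id -> (chr, start, stop) dictionary.
newCoordinates = importConvertedBED('/annotations/MoGene-1_0-st-v1.mm10.probeset.bed')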
Example #20
def FilterFile(Guidefile,Guidefile_block,PSI,turn):
    if 'Clustering' in Guidefile:
        count=1
        flag=True
        rank_Count=0
        prev=0
    else:
        count=0
    val=[]
    head=0
    
    print Guidefile_block
    for line in open(Guidefile_block,'rU').xreadlines():
        if head >count:
            line=line.rstrip('\r\n')
            q= string.split(line,'\t')
            if flag:
               
                if int(q[1])==prev:
                    continue
                else:
                    rank_Count+=1
                    prev=int(q[1])
        else:
            head+=1
            continue
    head=0
    print Guidefile
    for line in open(Guidefile,'rU').xreadlines():
        if head >count:
            line=line.rstrip('\r\n')
            q= string.split(line,'\t')
            val.append(q[0])
        else:
            head+=1
            continue
    dire = export.findParentDir(PSI)
    output_dir = dire+'OncoInputs'
    if os.path.exists(output_dir)==False:
        export.createExportFolder(output_dir)
    
    output_file = output_dir+'/NMFInput-Round'+str(turn)+'.txt'
    filterRows(PSI,output_file,filterDB=val)
    return output_file,rank_Count
Example #21
def importConvertedBED(filename):
    print 'processing:', filename
    parent = export.findParentDir(filename)
    fn = filepath(filename)
    entry_count = 0
    newCoordinates = {}
    for line in open(fn, 'rU').xreadlines():
        data = cleanUpLine(line)
        if data[0] != '#':
            try:
                t = string.split(data, '\t')
                chr, start, stop, probeset_id = t
                int(start)
                if 'chr' in chr:
                    entry_count += 1
                newCoordinates[probeset_id] = chr, start, stop
            except ZeroDivisionError:
                pass
    print entry_count, 'imported and saved.'
    return newCoordinates
Example #22
def normalizeDataset(filename, output=None):
    """ Perform Quantile Normalization on an input expression dataset """

    print "Importing data..."
    sample_expression_db = importExpressionValues(filename)
    print "Performing quantile normalization..."
    sample_expression_db = RNASeq.quantileNormalizationSimple(sample_expression_db)

    if output == None:
        output = filename
        moved_exp_dir = export.findParentDir(filename) + "Non-Quantile/" + export.findFilename(filename)
        try:
            export.copyFile(filename, moved_exp_dir)
            print "Moved original expression file to:"
            print "\t" + moved_exp_dir
        except Exception:
            None

    exportExpressionData(output, sample_expression_db)
    print "Exported expression input file to:", output
Example #23
def normalizeDataset(filename,output = None, normalization='quantile',platform="3'array"):
    """ Perform Quantile Normalization on an input expression dataset """
    
    if output==None:
        output = filename
        moved_exp_dir = export.findParentDir(filename)+'Non-Normalized/'+export.findFilename(filename)
        try:
            export.copyFile(filename, moved_exp_dir)
            print 'Moved original expression file to:'
            print '\t'+moved_exp_dir
        except Exception: None
        
    if normalization == 'Quantile' or normalization == 'quantile':
        print "Importing data..."
        sample_expression_db = importExpressionValues(filename)
        print "Performing quantile normalization..."    
        sample_expression_db = RNASeq.quantileNormalizationSimple(sample_expression_db)
        exportExpressionData(output,sample_expression_db)
    elif normalization == 'group':
        performGroupNormalization(moved_exp_dir,filename,platform)    
    print 'Exported expression input file to:',output
def visualizePathwayAssociations(filename,species,mod_type,wpid,imageExport=True):
    ### Log any potential problems
    log_file = filepath('webservice.log')
    log_report = open(log_file,'w')
    if wpid == None:
        force_invalid_pathway
        
    global mod
    global species_code
    global graphic_link
    graphic_link={}
    mod = mod_type
    species_code = species
    root_dir = export.findParentDir(filename)
    criterion_name = export.findFilename(filename)[:-4]
    log_report.write('Filename: %s and WPID %s\n' % (filename,wpid))
    if 'GO-Elite/input' in root_dir:
        root_dir = string.replace(root_dir,'GO-Elite/input','WikiPathways')
    else:
        root_dir+='WikiPathways/'
    analysis_type = 'Genes'
    id_db,column_headers = importDataSimple(filename,'GO-Elite')
    log_report.write('GO-Elite input ID file imported successfully\n')
    log_report.write('%d IDs imported\n' % len(id_db))
    pathway_db={}
    pathway_db[wpid] = PathwayData(None) ### only need to analyze object (method allows for analysis of any number)
    pathway_db = getPathwayAs(pathway_db,species_code,mod)
    log_report.write('Pathway data imported from GPML files obtained from webservice\n')
    id_color_db = getHexadecimalColorRanges(id_db,analysis_type) ### example id_db" is key:gene, value:fold
    graphID_db = getGraphIDAssociations(id_color_db,pathway_db,'MOD')
    if imageExport != 'png':
        file_type = 'pdf' ### svg, pdf, png
        getColoredPathway(root_dir,graphID_db,file_type,'-'+criterion_name,WPID=wpid)
    if imageExport != 'pdf':
        file_type = 'png' ### svg, pdf, png
        getColoredPathway(root_dir,graphID_db,file_type,'-'+criterion_name,WPID=wpid)
    log_report.write('Pathways colored and image data returned. Exiting webservice.\n')
    log_report.close()
    return graphic_link
def normalizeDataset(filename, output=None):
    """ Perform Quantile Normalization on an input expression dataset """

    print "Importing data..."
    sample_expression_db = importExpressionValues(filename)
    print "Performing quantile normalization..."
    sample_expression_db = RNASeq.quantileNormalizationSimple(
        sample_expression_db)

    if output == None:
        output = filename
        moved_exp_dir = export.findParentDir(
            filename) + 'Non-Quantile/' + export.findFilename(filename)
        try:
            export.copyFile(filename, moved_exp_dir)
            print 'Moved original expression file to:'
            print '\t' + moved_exp_dir
        except Exception:
            None

    exportExpressionData(output, sample_expression_db)
    print 'Exported expression input file to:', output
Example #26
def covertAffyFormatToBED(filename, ConversionDB=None):
    print 'processing:', filename
    parent = export.findParentDir(filename)
    if ConversionDB == None:
        output_file = 'simple_chr.bed'
    else:
        output_file = export.findFilename(filename)
        output_file = string.replace(output_file, 'mm9', 'mm10')
    export_obj = export.ExportFile(parent + '/' + output_file)
    fn = filepath(filename)
    entry_count = 0
    readfiles = False
    for line in open(fn, 'rU').xreadlines():
        data = cleanUpLine(line)
        if data[0] == '#': readfiles = False
        elif readfiles == False:
            readfiles = True
            if ConversionDB != None:
                export_obj.write(line)  ### Write header
        else:
            try:
                t = string.split(data[1:-1], '","')
                probeset_id, chr, strand, start, stop = t[:5]
                int(start)
                if ConversionDB == None:
                    if 'chr' in chr:
                        export_obj.write(chr + '\t' + start + '\t' + stop +
                                         '\t' + probeset_id + '\n')
                else:
                    chr, start, stop = ConversionDB[probeset_id]
                    t = [probeset_id, chr, strand, start, stop] + t[5:]
                    values = '"' + string.join(t, '","') + '"\n'
                    export_obj.write(values)
                entry_count += 1
            except Exception:
                pass
    export_obj.close()
    print entry_count, 'entries saved to:', parent + '/' + output_file
Example #27
def downloadCurrentVersion(filename,secondary_dir,file_type):
    import UI
    file_location_defaults = UI.importDefaultFileLocations()

    uds = file_location_defaults['url'] ### Get the location of the download site from Config/default-files.csv
    for ud in uds: url_dir = ud.Location() ### Only one entry
    
    dir = export.findParentDir(filename)  
    filename = export.findFilename(filename)
    url = url_dir+secondary_dir+'/'+filename
    
    file,status = download(url,dir,file_type); continue_analysis = 'yes'
    if 'Internet' in status:
        print_out = "File:\n"+url+"\ncould not be found on server or internet connection is unavailable."
        try:
            UI.WarningWindow(print_out,'WARNING!!!')
            continue_analysis = 'no'
        except Exception:
            print url
            print 'cannot be downloaded';die
    elif status == 'remove':
        try: os.remove(file) ### Not sure why this works now and not before
        except Exception: status = status
    return continue_analysis
Example #28
def NMFAnalysis(filename,Rank,turn=0,strategy="conservative"):
    
    X=[]
    header=[]
    head=0
    exportnam=export.findParentDir(filename)+'/NMF/round'+str(turn)+'NMFsnmf_versionr.txt'#+str(Rank)+'.txt'
    export_res=export.ExportFile(exportnam)
    exportnam_bin=export.findParentDir(filename)+'/NMF/round'+str(turn)+'NMFsnmf_binary.txt'#+str(Rank)+'.txt'
    export_res1=export.ExportFile(exportnam_bin)
    exportnam_bint=export.findParentDir(filename)+'/NMF/round'+str(turn)+'NMFsnmf_binary_t_.txt'#+str(Rank)+'.txt'
    export_res5=export.ExportFile(exportnam_bint)
    exportnam2=export.findParentDir(filename)+'/SubtypeAnalyses/round'+str(turn)+'Metadata.txt'#+str(Rank)+'.txt'
    export_res2=export.ExportFile(exportnam2)
    exportnam3=export.findParentDir(filename)+'/SubtypeAnalyses/round'+str(turn)+'Annotation.txt'#+str(Rank)+'.txt'
    export_res3=export.ExportFile(exportnam3)
    if 'Clustering' in filename:
        count=1
        start=2
    else:
        count=0
        start=1
        
    print filename
    for line in open(filename,'rU').xreadlines():
        line=line.rstrip('\r\n')
        q= string.split(line,'\t')
        if head >count:
            val=[]
            val2=[]
            me=0.0
            
            for i in range(start,len(q)):
                try:
                    val2.append(float(q[i]))
                except Exception:
                    continue
            me=np.median(val2)
            for i in range(start,len(q)):
                try:
                    val.append(float(q[i]))
                except Exception:
                    val.append(float(me))
            X.append(val)
          
        else:
            export_res1.write(line)
            export_res.write(line)
            export_res1.write("\n")
            export_res.write("\n")
            header=q
            head+=1
            continue

    group=defaultdict(list)
        
    sh=[]
    X=np.array(X)
    mat=[]
    mat=zip(*X)
    mat=np.array(mat)
    nmf = nimfa.Snmf(mat,seed="nndsvd", rank=int(Rank), max_iter=20,n_run=10,track_factor=True)
    nmf_fit = nmf()
    W = nmf_fit.basis()
    W=np.array(W)
    H=nmf_fit.coef()
    H=np.array(H)

    sh=W.shape
    export_res3.write("uid\tUID\tUID\n")
    if int(Rank)==2:
        par=1
    else:
        par=2

    W=zip(*W)
    W=np.array(W)
    sh=W.shape
    Z=[]
    for i in range(sh[0]):
        new_val=[]
        val=W[i,:]
        num=sum(i > 0.10 for i in val)
        if num >40 or num <3:
            compstd=True
        else:
            compstd=False
        me=np.mean(val)
        st=np.std(val)
        #print 'V'+str(i)
        export_res.write('V'+str(i))
        export_res1.write('V'+str(i))
        for j in range(sh[1]):
            if compstd:   
                if float(W[i][j])>=float(me+(par*st)):
                
                    export_res1.write("\t"+str(1))
                    new_val.append(1)
                else:
                    export_res1.write("\t"+str(0))
                    new_val.append(0)
            else:
                if float(W[i][j])>0.1:
                
                    export_res1.write("\t"+str(1))
                    new_val.append(1)
                else:
                    export_res1.write("\t"+str(0))
                    new_val.append(0)
            export_res.write("\t"+str(W[i][j]))
        Z.append(new_val)
        export_res.write("\n")
        export_res1.write("\n")
        
    Z=np.array(Z)
    sh=Z.shape
    Z_new=[]
    val1=[]
    Z1=[]
    dellst=[]
    export_res2.write("uid")
    export_res5.write("uid")
    for i in range(sh[0]):
        indices=[]
        val1=Z[i,:]
        sum1=sum(val1)
        flag=False
        indices=[index for index, value in enumerate(val1) if value == 1]
        for j in range(sh[0]):
            val2=[]
            
            if i!=j:
                val2=Z[j,:]
                
                sum2=sum([val2[x] for x in indices])
                summ2=sum(val2)
                try:
                    if float(sum2)/float(sum1)>0.5:
                        if summ2>sum1:
                            flag=True
                            #print str(i)
                except Exception:
                    continue
        if flag==False:

            Z1.append(val1)
            export_res2.write("\t"+'V'+str(i))
            export_res5.write("\t"+'V'+str(i))
            export_res3.write('V'+str(i)+"\t"+"Covariate"+"\t"+str(1)+"\n")
    
    export_res2.write("\n")
    export_res5.write("\n")
    Z1=np.array(Z1)
    Z=Z1
    Z=zip(*Z)
    Z=np.array(Z)
    sh=Z.shape
    print "stringency = ",[strategy]
    for i in range(sh[0]):
        val1=Z[i,:]
        #print sum(val1)
        #if sum(val)>2:
        if sum(val1)>2:
            val=[0 if x==1 else x for x in val1]
        else:
            val=val1
        me=np.mean(val)
        st=np.std(val)
        export_res2.write(header[i+1])
        export_res5.write(header[i+1])
        
        for j in range(sh[1]):
            if strategy=="conservative":
                #print header[i+1]
                export_res2.write("\t"+str(val1[j]))
                export_res5.write("\t"+str(val1[j]))
            else:
               #print header[i+1] 
               export_res2.write("\t"+str(val[j]))
               export_res5.write("\t"+str(val[j])) 
        export_res2.write("\n")
        export_res5.write("\n")
        Z_new.append(val)
        
    Z_new=zip(*Z_new)
    Z_new=np.array(Z_new)
    sh=Z_new.shape
    export_res5.close()
    Orderedheatmap.Classify(exportnam_bint)
    return exportnam,exportnam_bin,exportnam2,exportnam3
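### Hypothetical call (path is illustrative only), mirroring how CompleteWorkflow below invokes
### it: factor the filtered splicing matrix with rank k=2 and return the paths of the NMF,
### binarized, metadata and annotation output files.
NMFResult, BinarizedOutput, Metadata, Annotation = NMFAnalysis('/results/NMFInput-Round1.txt', 2, turn=1, strategy='conservative')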
Example #29
def performNMF(species, NMFinput, full_PSI_InputFile, filtered_EventAnnot_dir, k, AnalysisRound, strategy):
    """ Run NMF and determine the number of valid clusters based on the magnitude of detected differential splicing """
    
    use_adjusted_p=True
           
    print "Running NMF analyses for dimension reduction using "+str(k)+" k - Round"+str(AnalysisRound)
    NMFResult,BinarizedOutput,metaData,Annotation=NMF_Analysis.NMFAnalysis(NMFinput,k,AnalysisRound,strategy) ### This is where we get the correct version
    print "Running metaData Analyses for finding differential splicing events"
    rootdir,CovariateQuery=metaDataAnalysis.remoteAnalysis(species,filtered_EventAnnot_dir,metaData,'PSI',0.1,use_adjusted_p,0.05,Annotation)
    counter=1
    dPSI_results_dir=rootdir+CovariateQuery
    global upd_guides
    upd_guides=[]
    name=[]
    group=[]
    grplst=[]
    for filename in os.listdir(dPSI_results_dir):
        if filename.startswith("PSI."):
            dPSI_results_fn=os.path.join(dPSI_results_dir, filename)
            dPSI_comparison_alt_name=string.replace(filename,"PSI.","")
            omitcluster=FindTopUniqueEvents(dPSI_results_fn,dPSI_comparison_alt_name,dPSI_results_dir)
            if omitcluster==0: ### Hence, clustering succeeded and did not fail in this dPSI comparison
                group.append(counter)
                name.append(string.replace(filename,"PSI.",""))
                counter+=1
                
    print counter, 'robust splicing subtypes identified in round',AnalysisRound
    if counter>0: #counter>2 --- changed to 0 to force NMF
        dire = export.findParentDir(full_PSI_InputFile)
        output_dir = dire+'OncoInputs'
        if os.path.exists(output_dir)==False:
            export.createExportFolder(output_dir)

        output_file = output_dir+'/SVMInput-Round'+str(AnalysisRound)+'.txt'
        ExpandSampleClusters.filterRows(full_PSI_InputFile,output_file,filterDB=upd_guides,logData=False)
        header=ExpandSampleClusters.header_file(output_file)
        print "Running SVM prediction for improved subtypes - Round"+str(AnalysisRound)
        #print 'AAAAAAAAAAAAAAAAAAAAAAAA',output_file
        #print 'BBBBBBBBBBBBBBBBBBBBBBBB',BinarizedOutput
        train=ExpandSampleClusters.TrainDataGeneration(output_file,BinarizedOutput,name)
        grplst.append(group)
        ExpandSampleClusters.Classify(header,train,output_file,grplst,name,AnalysisRound) ### This is where we write the wrong version
        header=Correlationdepletion.header_file(NMFResult)
        
        output_file=output_dir+'/DepletionInput-Round'+str(AnalysisRound)+".txt"
        sampleIndexSelection.filterFile(full_PSI_InputFile,output_file,header)
        print "Running Correlation Depletion - Round"+str(AnalysisRound)
        commonkeys,count=Correlationdepletion.FindCorrelations(NMFResult,output_file,name)
        Depleted=Correlationdepletion.DepleteSplicingevents(commonkeys,output_file,count,full_PSI_InputFile)
        full_PSI_InputFile=Depleted
    
        flag=True ### Indicates that K-means was not run - hence, another round of splice-ICGS should be performed
    """"
    else:
        try:
            print "Running K-means analyses instead of NMF - Round"+str(AnalysisRound)
            header=[]
            header=Kmeans.header_file(dPSI_results_fn_block)
            Kmeans.KmeansAnalysis(dPSI_results_fn_block,header,full_PSI_InputFile,AnalysisRound)
            flag=True
        except Exception:
            print 'WARNING!!!!! DID NOT RUN K-MEANS!!!!!'
            print traceback.format_exc()
            AnalysisRound = True
    """
    return flag,full_PSI_InputFile
Example #30
def Enrichment(Inputfile,mutdict,mutfile,Expand,header):
    import collections
    import mappfinder
    X=defaultdict(list)
    prev=""
    head=0
    group=defaultdict(list)
    enrichdict=defaultdict(float)
    mut=export.findFilename(mutfile)
    dire=export.findParentDir(Inputfile)
    output_dir = dire+'MutationEnrichment'
    export.createExportFolder(output_dir)

    exportnam=output_dir+'/Enrichment_Results.txt'
    export_enrich=open(exportnam,"w")
    exportnam=output_dir+'/Enrichment_tophits.txt'
    export_hit=open(exportnam,"w")
   
    export_enrich.write("Mutations"+"\t"+"Cluster"+"\t"+"r"+"\t"+"R"+"\t"+"n"+"\t"+"Sensitivity"+"\t"+"Specificity"+"\t"+"z-score"+"\t"+"Fisher exact test"+"\t"+"adjp value"+"\n")
    if Expand=="yes":
        header2=header_file(Inputfile,Expand="yes")
        
        for line in open(Inputfile,'rU').xreadlines():
            if head >0:
                line=line.rstrip('\r\n')
                q= string.split(line,'\t')
                for i in range(1,len(q)):
                    if q[i]==str(1):
                        #group[q[0]].append(header2[i-1])
                        group[header2[i-1]].append(q[0])
           
            else:
                head+=1
                continue
    else:
        for line in open(Inputfile,'rU').xreadlines():
            line=line.rstrip('\r\n')
            line=string.split(line,'\t')
            #for i in range(1,len(line)):
            group[line[2]].append(line[0])
   
    total_Scores={}
    for kiy in mutdict:
        if kiy =="MDP":
            print mutdict[kiy]
        groupdict={}
        remaining=[]
        remaining=list(set(header) - set(mutdict[kiy]))
        groupdict[1]=mutdict[kiy]
        groupdict[2]=remaining
       # export_enrich1.write(kiy)
        for key2 in group:
           
            
            r=float(len(list(set(group[key2])))-len(list(set(group[key2]) - set(mutdict[kiy]))))
            n=float(len(group[key2]))
            R=float(len(set(mutdict[kiy])))
            N=float(len(header))
        
            if r==0 or R==1.0:
                print kiy,key2,r,n,R,N
                pval=float(1)
                z=float(0)
                null_z = 0.000
                zsd = mappfinder.ZScoreData(key2,r,R,z,null_z,n)
                zsd.SetP(pval)
            else:
                try: z = Zscore(r,n,N,R)
                except : z = 0.0000
                ### Calculate a Z-score assuming zero matching entries
                try: null_z = Zscore(0,n,N,R)
                except Exception: null_z = 0.000
               
                
                try:
                    pval = mappfinder.FishersExactTest(r,n,R,N)
                    zsd = mappfinder.ZScoreData(key2,r,R,z,null_z,n)
                    zsd.SetP(pval)
                except Exception:
                    pval=1.0
                    zsd = mappfinder.ZScoreData(key2,r,R,z,null_z,n)
                    zsd.SetP(pval)
                    #pass
                
          
            if kiy in total_Scores:
                    signature_db = total_Scores[kiy]
                    signature_db[key2]=zsd ### Necessary format for the permutation function
            else:
                    signature_db={key2:zsd}
                    total_Scores[kiy] = signature_db
    sorted_results=[]
    mutlabels={}
    for kiy in total_Scores:
        
        signature_db = total_Scores[kiy]
        ### Updates the adjusted p-value instances
        mappfinder.adjustPermuteStats(signature_db)
        for signature in signature_db:
            zsd = signature_db[signature]
           
            results = [kiy,signature,zsd.Changed(),zsd.Measured(),zsd.InPathway(),str(float(zsd.PercentChanged())/100.0),str(float(float(zsd.Changed())/float(zsd.InPathway()))), zsd.ZScore(), zsd.PermuteP(), zsd.AdjP()] #string.join(zsd.AssociatedIDs(),'|')
            sorted_results.append([signature,float(zsd.PermuteP()),results])
    sorted_results.sort() ### Sort by p-value
    prev=""
    for (sig,p,values) in sorted_results:
        if sig!=prev:
            flag=True
            export_hit.write(string.join(values,'\t')+'\n')
        if flag:
            if (float(values[5])>=0.5 and float(values[6])>=0.5) or float(values[5])>=0.6 :
                mutlabels[values[1]]=values[0]
                flag=False
                export_hit.write(string.join(values,'\t')+'\n')
        export_enrich.write(string.join(values,'\t')+'\n')
        prev=sig
    if len(sorted_results)==0:
            export_enrich.write(string.join([splicing_factor,'NONE','NONE','NONE','NONE','NONE','NONE'],'\t')+'\n')
    export_enrich.close()
    #print mutlabels
    return mutlabels
Example #31
def CompleteWorkflow(InputFile, EventAnnot, turn, rho_cutoff, strategy, seq):

    species = "Hs"
    row_method = 'hopach'
    column_method = 'hopach'
    row_metric = 'correlation'
    column_metric = 'euclidean'
    color_gradient = 'yellow_black_blue'
    contrast = 3
    vendor = "RNASeq"
    GeneSelection = ''
    PathwaySelection = ''
    GeneSetSelection = 'None Selected'
    excludeCellCycle = False
    #rho_cutoff = 0.4
    restrictBy = 'protein_coding'
    featurestoEvaluate = 'Genes'
    ExpressionCutoff = 0
    CountsCutoff = 0
    FoldDiff = 1.2
    SamplesDiffering = 4
    JustShowTheseIDs = ''
    removeOutliers = False
    PathwaySelection = []
    array_type = "RNASeq"
    #rho_cutoff=0.4
    gsp = UI.GeneSelectionParameters(species, array_type, vendor)
    gsp.setGeneSet(GeneSetSelection)
    gsp.setPathwaySelect(PathwaySelection)
    gsp.setGeneSelection(GeneSelection)
    gsp.setJustShowTheseIDs(JustShowTheseIDs)
    gsp.setNormalize('median')
    gsp.setSampleDiscoveryParameters(ExpressionCutoff, CountsCutoff, FoldDiff,
                                     SamplesDiffering, removeOutliers,
                                     featurestoEvaluate, restrictBy,
                                     excludeCellCycle, column_metric,
                                     column_method, rho_cutoff)
    #Run splice ICGS
    """import UI
        species='Mm'; platform = "3'array"; vendor = 'Ensembl'
        gsp = UI.GeneSelectionParameters(species,platform,vendor)
        gsp.setGeneSet('None Selected')
        gsp.setPathwaySelect('')
        gsp.setGeneSelection('')
        gsp.setJustShowTheseIDs('')
        gsp.setNormalize('median')
        gsp.setSampleDiscoveryParameters(0,0,1.5,3,
        False,'PSI','protein_coding',False,'cosine','hopach',0.35)"""

    FilteredEventAnnot = filterEventAnnotation.FilterFile(
        InputFile, EventAnnot, turn)

    try:
        print "Running splice-ICGS for feature selection - Round" + str(turn)
        #except Exception:Rank=0
        graphic_links3 = RNASeq.singleCellRNASeqWorkflow(species,
                                                         'exons',
                                                         InputFile,
                                                         mlp,
                                                         exp_threshold=0,
                                                         rpkm_threshold=0,
                                                         parameters=gsp)

        Guidefile = graphic_links3[-1][-1]
        Guidefile = Guidefile[:-4] + '.txt'

        print "Running block identification for rank analyses - Round" + str(
            turn)
        RNASeq_blockIdentification.correlateClusteredGenesParameters(
            Guidefile,
            rho_cutoff=0.4,
            hits_cutoff=4,
            hits_to_report=50,
            ReDefinedClusterBlocks=True,
            filter=True)
        Guidefile_block = Guidefile[:-4] + '-BlockIDs.txt'
        NMFinput, Rank = NMF_Analysis.FilterFile(Guidefile, Guidefile_block,
                                                 InputFile, turn)
    except Exception:
        print 'UNKNOWN ERROR!!!!!'
        print traceback.format_exc()
        Rank = 0

    if Rank > 1:
        print 'Current turn:', turn, 'k =',
        if turn == 1:
            Rank = 2
        elif Rank > 2:
            Rank = 30
        else:
            Rank = 2
        if seq == "bulk":
            use_adjusted_p = True
        else:
            use_adjusted_p = False
        print Rank
        print "Running NMF analyses for dimension reduction using " + str(
            Rank) + " ranks - Round" + str(turn)
        NMFResult, BinarizedOutput, Metadata, Annotation = NMF_Analysis.NMFAnalysis(
            NMFinput, Rank, turn, strategy)
        print "Running Metadata Analyses for finding differential splicing events"
        rootdir, CovariateQuery = metaDataAnalysis.remoteAnalysis(
            'Hs', FilteredEventAnnot, Metadata, 'PSI', 0.1, use_adjusted_p,
            0.05, Annotation)
        counter = 1
        Guidedir = rootdir + CovariateQuery
        PSIdir = rootdir + 'ExpressionProfiles'
        global upd_guides
        upd_guides = []
        name = []
        group = []
        grplst = []
        for filename in os.listdir(Guidedir):
            if filename.startswith("PSI."):
                Guidefile = os.path.join(Guidedir, filename)
                psi = string.replace(filename, "PSI.", "")
                PSIfile = os.path.join(PSIdir, psi)
                omitcluster = FindTopUniqueEvents(Guidefile, psi, Guidedir)

                if omitcluster == 0:
                    group.append(counter)
                    name.append(psi)
                    counter += 1
        if counter > 2:
            dire = export.findParentDir(InputFile)
            output_dir = dire + 'OncoInputs'
            if os.path.exists(output_dir) == False:
                export.createExportFolder(output_dir)

            output_file = output_dir + '/SVMInput-Round' + str(turn) + '.txt'
            ExpandSampleClusters.filterRows(InputFile,
                                            output_file,
                                            filterDB=upd_guides,
                                            logData=False)
            header = ExpandSampleClusters.header_file(output_file)
            print "Running SVM prediction for improved subtypes - Round" + str(
                turn)
            train = ExpandSampleClusters.TrainDataGeneration(
                output_file, BinarizedOutput, name)
            grplst.append(group)
            ExpandSampleClusters.Classify(header, train, output_file, grplst,
                                          name, turn)
            header = Correlationdepletion.header_file(NMFResult)

            output_file = output_dir + '/DepletionInput-Round' + str(
                turn) + ".txt"
            sampleIndexSelection.filterFile(InputFile, output_file, header)
            print "Running Correlation Depletion - Round" + str(turn)
            commonkeys, count = Correlationdepletion.FindCorrelations(
                NMFResult, output_file, name)
            Depleted = Correlationdepletion.DepleteSplicingevents(
                commonkeys, output_file, count, InputFile)
            InputFile = Depleted

            flag = True
        else:
            try:
                print "Running K-means analyses instead of NMF - Round" + str(
                    turn)
                header = []
                header = Kmeans.header_file(Guidefile_block)
                Kmeans.KmeansAnalysis(Guidefile_block, header, InputFile, turn)
                flag = False
            except Exception:
                print 'WARNING!!!!! DID NOT RUN K-MEANS!!!!!'
                print traceback.format_exc()
                flag = False

    else:
        if Rank == 1:
            try:
                print "Running K-means analyses instead of NMF - Round" + str(
                    turn)
                header = []
                header = Kmeans.header_file(Guidefile_block)
                Kmeans.KmeansAnalysis(Guidefile_block, header, InputFile, turn)

                flag = False
            except Exception:
                print 'WARNING!!!!! DID NOT RUN K-MEANS!!!!!'
                print traceback.format_exc()
                flag = False
        else:
            flag = False

    return flag, InputFile, FilteredEventAnnot
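A minimal driver sketch for this round-based workflow follows; the file path, the number of rounds, and the 'bulk' flag are illustrative assumptions (not from the source), and reusing the returned FilteredEventAnnot as the next round's annotation file is likewise an assumption.

### Hypothetical driver loop (paths and round count are placeholders):
### each round consumes the signature-depleted input produced by the previous round.
EventAnnot = '/data/Hs_RNASeq_top_alt_junctions-PSI_EventAnnotation.txt' ### assumed path
InputFile = EventAnnot
for turn in range(1, 4):
    flag, InputFile, EventAnnot = CompleteWorkflow(InputFile, EventAnnot, turn,
                                                   0.4, 'conservative', 'bulk')
    if flag == False:
        break ### K-means fallback reached; no further NMF rounds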
Example #32
                if column_method == 'None': column_method = None
            elif opt == '--row_metric': row_metric=arg
            elif opt == '--column_metric': column_metric=arg
            elif opt == '--ExpressionCutoff': ExpressionCutoff=arg
            elif opt == '--normalization': normalization=arg
            elif opt == '--rho': rho_cutoff=float(arg)
            elif opt == '--CountsCutoff':CountsCutoff=int(float(arg))
            elif opt == '--FoldDiff':FoldDiff=float(arg)
            elif opt == '--SamplesDiffering':SamplesDiffering=int(float(arg))
            elif opt == '--removeOutliers':
                removeOutliers=arg
                if removeOutliers=='yes' or removeOutliers=='True':
                    removeOutliers = True
    
    print "Subtype discovery stringency:",strategy
    dire = export.findParentDir(EventAnnot)

    if EnrichmentOnly==False:
        
        print 'PSI input files:',EventAnnot
        print 'Using a rho-cutoff of:',rho_cutoff
    
        if filters==True: ### Filter based on a default percentage of samples with detected PSI values
            EventAnnot,SampleNumber=filterPSIValues(EventAnnot,percentCutoff=percentCutoff,filterStatus=True)
        else:
            SampleNumber=filterPSIValues(EventAnnot,percentCutoff=percentCutoff,filterStatus=False)
        output_dir = dire+'ExpressionInput'
    
        export.createExportFolder(output_dir)
        full_PSI_InputFile=output_dir+"/exp.input.txt"
        header=header_list(EventAnnot)
Example #33
def Classify(header,Xobs,output_file,grplst,name,turn):
    count=0
    start=1
    Y=[]
    head=0
    for line in open(output_file,'rU').xreadlines():
        if head >count:
            val=[]
            counter2=0
            val2=[]
            me=0.0
            line=line.rstrip('\r\n')
            q= string.split(line,'\t')
            for i in range(start,len(q)):
                try:
                    val2.append(float(q[i]))
                except Exception:
                    continue
            me=np.median(val2)
            for i in range(start,len(q)):
                try:
                    val.append(float(q[i]))
                except Exception:
                    val.append(float(me))
            Y.append(val)
        else:
            head+=1
            continue

    Xobs=zip(*Xobs)
    Xobs=np.array(Xobs)
    Xobs=zip(*Xobs)
    Xobs=np.array(Xobs)
    X=grplst
    X=zip(*X)
    X=np.array(X)
    Y=zip(*Y)
    Y=np.array(Y)

    dire = export.findParentDir(export.findParentDir(export.findParentDir(output_file)[:-1])[:-1])
    output_dir = dire+'SVMOutputs'
    if os.path.exists(output_dir)==False:
        export.createExportFolder(output_dir)

    exportnam1=output_dir+'/round'+str(turn)+'SVC_decision_func.txt'
    export_class1=open(exportnam1,"w")
    exportnam2=output_dir+'/round'+str(turn)+'SVC_Results.txt'
    export_class2=open(exportnam2,"w")
    regr = LinearSVC()
    regr.fit(Xobs,X[:,0])
    q=regr.predict(Y)
    count=1

    if len(X[:,0])>2:
        prob_=regr.fit(Xobs,X[:,0]).decision_function(Y)
        export_class1.write("uid")
        export_class2.write("uid")
        for ni in name:
            sub=string.split(ni,"_")[0]
            export_class1.write("\t"+"R"+str(turn)+"-"+sub)
            export_class2.write("\t"+"R"+str(turn)+"-"+sub)
        export_class1.write("\n")
        export_class2.write("\n")

        for iq in range(0,len(header)-1):
            export_class1.write(header[iq+1])
            export_class2.write(header[iq+1])
            for jq in range(0,len(X[:,0])):
                export_class1.write("\t"+str(prob_[iq][jq]))
                if prob_[iq][jq]>0:
                    export_class2.write("\t"+str(1))
                else:
                    export_class2.write("\t"+str(0))
            export_class1.write("\n")
            export_class2.write("\n")
    else:
        prob_=regr.fit(Xobs,X[:,0]).decision_function(Y)
        export_class1.write("uid"+"\t")
        export_class2.write("uid"+"\t")
        export_class1.write("group")
        export_class2.write("R"+str(turn)+"-V1"+"\t"+"R"+str(turn)+"-V2")
        export_class1.write("\n")
        export_class2.write("\n")

        for iq in range(0,len(header)-1):
            export_class1.write(header[iq+1])
            export_class2.write(header[iq+1])
            export_class1.write("\t"+str(prob_[iq]))
            if prob_[iq]>0.5:
                export_class2.write("\t"+str(1)+"\t"+str(0))
            else:
                if prob_[iq]<-0.5:  
                    export_class2.write("\t"+str(0)+"\t"+str(1))
                else:
                    export_class2.write("\t"+str(0)+"\t"+str(0))
            export_class1.write("\n")
            export_class2.write("\n")
    export_class2.close() 
    Orderedheatmap.Classify(exportnam2)
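The branching on len(X[:,0]) > 2 above mirrors scikit-learn's behavior: LinearSVC.decision_function returns a 2-D (n_samples, n_classes) array when three or more classes are present but a 1-D array for binary problems. A minimal, self-contained illustration with synthetic data (all names and values below are assumptions for demonstration only):

import numpy as np
from sklearn.svm import LinearSVC

rng = np.random.RandomState(0)
X_train = rng.rand(12, 5)            ### 12 training profiles, 5 features (synthetic)
y_three = np.repeat([1, 2, 3], 4)    ### three cluster labels -> 2-D decision scores
y_two = np.repeat([1, 2], 6)         ### two cluster labels   -> 1-D decision scores
X_new = rng.rand(4, 5)

scores3 = LinearSVC().fit(X_train, y_three).decision_function(X_new)
scores2 = LinearSVC().fit(X_train, y_two).decision_function(X_new)
print(scores3.shape) ### (4, 3): one column per class, indexed as prob_[iq][jq] above
print(scores2.shape) ### (4,):   one value per sample, indexed as prob_[iq] above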
Example #34
def Mergeresults(filename):
    Newlist = defaultdict(list)
    Newval = {}
    genelst = defaultdict(list)
    Allcomp = {}
    dire = export.findParentDir(filename)

    output = dire + "/Motifresults_merged.txt"
    #MergedOutput="/Volumes/Pass/MotifAnalyses/Bridger/Exons_MotifAnalyses/merged_output_allpvalues_nofold.txt"

    #output="/Volumes/Pass/MotifAnalyses/Bridger/Exons_MotifAnalyses/merged_output_allpvalues_nofold_upd.txt"
    output1 = open(output, "w")

    output = dire + "/Motifresults_zscores.txt"
    output2 = open(output, "w")

    output1.write("signature" + "\t" + "gene" + "\t" + "technique" + "\t" +
                  "p-value" + "\t" + "log-transformed" + "\t" + "signature" +
                  "\t" + "gene" + "\t" + "technique" + "\t" + "p-value" +
                  "\t" + "log-transformed" + "\t" + "signature" + "\t" +
                  "gene" + "\t" + "technique" + "\t" + "p-value" + "\t" +
                  "log-transformed" + "\t" + "signature" + "\t" + "gene" +
                  "\t" + "technique" + "\t" + "p-value" + "\t" +
                  "log-transformed" + "\n")
    output2.write("signature" + "\t" + "gene" + "\t" + "cisbp-zscore" + "\t" +
                  "CLIPseq-zscore" + "\t" + "GE-zscore" + "\n")
    for lin in open(filename, 'rU').xreadlines():
        genes = []
        s = lin.rstrip('\r\n')

        s1 = string.split(s, '\t')
        sig = s1[0]

        if s1[2] == "GE":
            genes = [s1[1]]

        else:
            genes = string.split(s1[1], ":")

        tool = s1[2]

        if 'Cisbp_denovo' in tool:
            tool = "Cisbp_denovo"
        if "UpstreamIntron_known" in sig:
            sig = string.replace(sig, "UpstreamIntron_known", "Upstream")

        if "Intron_known" in s1[0]:
            sig = string.replace(sig, "Intron_known", "Combined_intron_new")

        if "Exons_known" in s1[0]:
            sig = string.replace(sig, "Exons_known", "Exon")

        if "DownstreamIntron_known" in s1[0]:
            sig = string.replace(sig, "DownstreamIntron_known", "Downstream")

        for i in range(len(genes)):
            if tool not in genelst[sig, genes[i].upper()]:
                genelst[sig, genes[i].upper()].append(tool)

            Newval[sig, tool, genes[i].upper()] = float(s1[3])
            if tool == "GE":
                sig1 = "Exon:" + sig
                Newval[sig1, tool, genes[i].upper()] = float(s1[3])

                genelst[sig1, genes[i].upper()].append(tool)
                sig1 = "Combined_intron_new:" + sig
                Newval[sig1, tool, genes[i].upper()] = float(s1[3])
                genelst[sig1, genes[i].upper()].append(tool)

    zscoredt = {}
    cisbp = []
    clipseq = []
    ge = []

    for sig, genes in genelst:

        tools = []
        cisbpact = True
        cisbpden = True
        tools = genelst[sig, genes]
        # if genes=="MBNL1":
        # print tools,sig
        a = len(tools)
        if 'Cisbp_Actual' in tools and 'Cisbp_denovo' in tools:
            a = a - 1
            if Newval[sig, "Cisbp_Actual", genes] < Newval[sig, "Cisbp_denovo",
                                                           genes]:
                cisbpden = False
            else:
                cisbpact = False

        pval = 0.0
        count = 0
        if a > 1:
            pval = 0.0
            count = 0
            if "Cisbp_Actual" in tools and cisbpact:

                count += 1
                # print str(Newval[sig,"Cisbp_Actual",genes])
                pval = 0.0 - math.log10(Newval[sig, "Cisbp_Actual", genes])
                output1.write(sig + "\t" + genes + "\t" + "Cisbp_Actual" +
                              "\t" + str(Newval[sig, "Cisbp_Actual", genes]) +
                              "\t" + str(pval) + "\t")
                zscoredt[sig, genes] = [
                    pval,
                ]
                cisbp.append(pval)
            else:

                output1.write(sig + "\t" + genes + "\t" + "Cisbp_Actual" +
                              "\t" + "NA" + "\t" + "NA" + "\t")
            if 'Cisbp_denovo' in tools and cisbpden:
                count += 1
                #print str(Newval[sig,"Cisbp_denovo",genes])
                pval = 0.0 - math.log10(Newval[sig, "Cisbp_denovo", genes])
                output1.write(sig + "\t" + genes + "\t" + "Cisbp_denovo" +
                              "\t" + str(Newval[sig, "Cisbp_denovo", genes]) +
                              "\t" + str(pval) + "\t")
                zscoredt[sig, genes] = [
                    pval,
                ]
                cisbp.append(pval)
            else:
                output1.write(sig + "\t" + genes + "\t" + "Cisbp_denovo" +
                              "\t" + "NA" + "\t" + "NA" + "\t")

            if (sig, genes) not in zscoredt:
                zscoredt[sig, genes] = [
                    0.0,
                ]
                cisbp.append(0.0)

            if "Clipseq" in tools:
                count += 1
                #print str(Newval[sig,"Clipseq",genes])
                pval = 0.0 - math.log10(Newval[sig, "Clipseq", genes])
                output1.write(sig + "\t" + genes + "\t" + "Clipseq" + "\t" +
                              str(Newval[sig, "Clipseq", genes]) + "\t" +
                              str(pval) + "\t")
                zscoredt[sig, genes].append(pval)
                clipseq.append(pval)
            else:
                output1.write(sig + "\t" + genes + "\t" + "Clipseq" + "\t" +
                              "NA" + "\t" + "NA" + "\t")
                zscoredt[sig, genes].append(0.0)
                clipseq.append(0.0)
            if "GE" in tools:
                count += 1
                #print str(Newval[sig,"GE",genes])
                pval = 0.0 - math.log10(Newval[sig, "GE", genes])
                output1.write(sig + "\t" + genes + "\t" + "GE" + "\t" +
                              str(Newval[sig, "GE", genes]) + "\t" +
                              str(pval) + "\n")
                zscoredt[sig, genes].append(pval)
                ge.append(pval)
            else:
                output1.write(sig + "\t" + genes + "\t" + "GE" + "\t" + "NA" +
                              "\t" + "NA" + "\n")
                zscoredt[sig, genes].append(0.0)
                ge.append(0.0)
    meancis = np.mean(cisbp)
    meanclip = np.mean(clipseq)
    meange = np.mean(ge)
    sdcis = np.std(cisbp)
    sdclip = np.std(clipseq)
    sdge = np.std(ge)

    for sig, genes in zscoredt:
        scores = []
        scores = zscoredt[sig, genes]
        if len(scores) == 3:
            val1 = (float(scores[0]) - float(meancis)) / float(sdcis)
            val2 = (float(scores[1]) - float(meanclip)) / float(sdclip)
            val3 = (float(scores[2]) - float(meange)) / float(sdge)
            output2.write(sig + "\t" + genes + "\t" + str(val1) + "\t" +
                          str(val2) + "\t" + str(val3) + "\n")
        else:
            print "error in zscore calculation"
            print sig, genes
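The final loop above applies a per-tool z-score: each -log10(p) is centered on that tool's mean and scaled by its standard deviation. A compact numpy equivalent (the column order cisbp, CLIP-seq, GE and the toy values are assumptions for illustration):

import numpy as np

### rows: (signature, gene) pairs; columns: cisbp, CLIP-seq, GE -log10(p) scores (toy values)
scores = np.array([[1.3, 0.0, 2.1],
                   [0.7, 0.9, 0.0],
                   [2.2, 1.4, 0.3]])
zscores = (scores - scores.mean(axis=0)) / scores.std(axis=0) ### same np.mean/np.std convention as above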
Example #35
def FilterGuideGeneFile(Guidefile,Guidefile_block,expressionInputFile,iteration,platform,uniqueIDs,symbolIDs):
    """ Filters the original input expression file for Guide3 genes/events. Needed
    Since NMF only can deal with positive values [Guide3 has negative values]"""
    
    root_dir = export.findParentDir(expressionInputFile)[:-1]
    if 'ExpressionInput' in root_dir:
        root_dir = export.findParentDir(root_dir)
    
    if 'Clustering' in Guidefile:
        count=1
        flag=True
        rank_Count=0
        prev=0
    else:
        count=0
    val=[]
    head=0
    for line in open(Guidefile_block,'rU').xreadlines():
        if head >count:
            line=line.rstrip('\r\n')
            q= string.split(line,'\t')
            #val.append(q[0])
            if flag:
                if int(q[1])==prev:
                    continue
                else:
                    rank_Count+=1
                    prev=int(q[1])
        else:
            head+=1
            continue
    head=0
    for line in open(Guidefile,'rU').xreadlines():
        line=line.rstrip('\r\n')
        q= string.split(line,'\t')
        n=len(q)
        if head >count:
            line=line.rstrip('\r\n')
            q= string.split(line,'\t')
            uid = q[0]
            if uid not in uniqueIDs:
                if uid in symbolIDs:
                    uid = symbolIDs[uid] ### map the gene symbol to its primary ID
                else:
                    continue
            val.append(uid) ### append each retained ID once
            if platform != "PSI" and head==2:
                rank_Count=rank_Count+int(q[1])
                print rank_Count
            head=head+1
        else:
            head+=1
            if platform != "PSI" and q[0]=="column_clusters-flat":
                    rank_Count=int(q[n-1])
            continue

    output_dir = root_dir+'/NMF-SVM'
    if os.path.exists(output_dir)==False:
        export.createExportFolder(output_dir)
    
    output_file = output_dir+'/NMFInput-Round'+str(iteration)+'.txt'
    filterRows(expressionInputFile,output_file,filterDB=val)
    
    return output_file,rank_Count
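A hedged usage sketch for the function above; every path and identifier below is a placeholder, and uniqueIDs/symbolIDs are assumed to be the ID-lookup dictionaries built elsewhere in the pipeline.

### Hypothetical call (all values are placeholders):
uniqueIDs = {'ENSG00000141510:E4.1-E5.1': None}            ### valid primary IDs
symbolIDs = {'TP53': 'ENSG00000141510:E4.1-E5.1'}          ### gene symbol -> primary ID
NMFinput, rank_Count = FilterGuideGeneFile(
    '/data/ICGS/Clustering-exp.input-Guide3.txt',          ### Guidefile
    '/data/ICGS/Clustering-exp.input-Guide3-BlockIDs.txt', ### Guidefile_block
    '/data/ExpressionInput/exp.input.txt',                 ### expressionInputFile
    1, 'PSI', uniqueIDs, symbolIDs)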
Example #36
def latteralMerge(files_to_merge,original_filename,outputPath = None):
    """ Merging files can be dangerous, if there are duplicate IDs (e.g., gene symbols).
    To overcome issues in redundant gene IDs that are improperly matched (one row with zeros
    and the other with values), this function determines if a lateral merge is more appropriate.
    The latter merge:
    1) Checks to see if the IDs are the same with the same order between the two or more datasets
    2) merges the two or more matrices without looking at the genes.
    
    Note: This function is attempts to be memory efficient and should be updated in the future to
    merge blocks of row IDs sequentially."""
    
    files_to_merge_revised = []
    for filename in files_to_merge:
        ### If a sparse matrix - rename and convert to flat file
        if '.h5' in filename or '.mtx' in filename:
            from import_scripts import ChromiumProcessing
            import export
            
            file = export.findFilename(filename)
            export_name = file[:-4]+'-filt'
            if file == 'filtered_feature_bc_matrix.h5' or file == 'raw_feature_bc_matrix.h5' or file =='filtered_gene_bc_matrix.h5' or file == 'raw_gene_bc_matrix.h5':
                export_name = export.findParentDir(filename)
                export_name = export.findFilename(export_name[:-1])
            elif file == 'matrix.mtx.gz' or file == 'matrix.mtx':
                parent = export.findParentDir(filename)
                export_name = export.findParentDir(parent)
                export_name = export.findFilename(export_name[:-1])
            else:
                export_name = string.replace(file,'.mtx.gz','')
                export_name = string.replace(export_name,'.mtx','')
                export_name = string.replace(export_name,'.h5','')
                export_name = string.replace(export_name,'_matrix','')
            filename = ChromiumProcessing.import10XSparseMatrix(filename,'species',export_name)
        files_to_merge_revised.append(filename)
    files_to_merge = files_to_merge_revised
    print 'Files to merge:',files_to_merge
        
    includeFilenames = True
    file_uids = {}
    for filename in files_to_merge:
        firstRow=True
        fn=filepath(filename); x=0
        if '/' in filename:
            file = string.split(filename,'/')[-1][:-4]
        else:
            file = string.split(filename,'\\')[-1][:-4]
        for line in open(fn,'rU').xreadlines():         
            data = cleanUpLine(line)
            if '\t' in data:
                t = string.split(data,'\t')
            elif ',' in data:
                t = string.split(data,',')
            else:
                t = string.split(data,'\t')
            if firstRow:
                firstRow = False
            else:
                uid = t[0]
                try:
                    file_uids[file].append(uid)
                except:
                    file_uids[file] = [uid]

    perfectMatch = True
    for file1 in file_uids:
        uids1 = file_uids[file1]
        for file2 in file_uids:
            uids2 = file_uids[file2]
            if uids1 != uids2:
                print file1,file2
                perfectMatch = False

    if perfectMatch:
        print 'All ordered IDs match in the files ... performing lateral merge instead of key ID merge to prevent multi-matches...'
        firstRow=True
        increment = 5000
        low = 1
        high = 5000
        added = 1
        eo = open(output_dir+'/MergedFiles.txt','w')
        import collections 
        
        def exportMergedRows(low,high):
            uid_values=collections.OrderedDict()
            for filename in files_to_merge:
                fn=filepath(filename); x=0; file_uids = {}
                if '/' in filename:
                    file = string.split(filename,'/')[-1][:-4]
                else:
                    file = string.split(filename,'\\')[-1][:-4]
                firstRow=True
                row_count = 0
                uids=[] ### Over-ride this for each file
                for line in open(fn,'rU').xreadlines():
                    row_count+=1
                    if row_count<=high and row_count>=low:
                        data = cleanUpLine(line)
                        if '\t' in data:
                            t = string.split(data,'\t')
                        elif ',' in data:
                            t = string.split(data,',')
                        else:
                            t = string.split(data,'\t')
                        if firstRow and low==1:
                            file = string.replace(file,'_matrix_CPTT','')
                            if includeFilenames:
                                header = [s + "."+file for s in t[1:]] ### add filename suffix
                            else:
                                header = t[1:]
                            try: uid_values[row_count]+=header
                            except: uid_values[row_count]=header
                            uids.append('UID')
                            firstRow=False
                        else:
                            uid = t[0]
                            try: uid_values[row_count] += t[1:]
                            except: uid_values[row_count] = t[1:]
                            uids.append(uid)
            i=0
            for index in uid_values:
                uid = uids[i]
                eo.write(string.join([uid]+uid_values[index],'\t')+'\n')
                i+=1
            print 'completed',low,high
        
        uid_list = file_uids[file]
        while (len(uid_list)+increment)>high:
            exportMergedRows(low,high)
            high+=increment
            low+=increment
        eo.close()
        return True
    else:
        print 'Different identifier order in the input files encountered...'
        return False
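A minimal sketch of the idea behind the lateral merge, independent of the helper functions above: if every file lists the same row IDs in the same order, the matrices can be concatenated column-wise with no key lookup. The function name, file handling, and in-memory reading below are assumptions for illustration (the real function streams rows in blocks).

import csv

def lateralMergeSketch(paths, out_path):
    ### Read each tab-delimited file fully (a sketch only)
    tables = []
    for p in paths:
        with open(p) as f:
            tables.append([row for row in csv.reader(f, delimiter='\t')])
    ### A lateral merge is only safe if every file lists the same row IDs in the same order
    ids = [[row[0] for row in t] for t in tables]
    if any(col != ids[0] for col in ids[1:]):
        return False ### fall back to a key-based merge
    with open(out_path, 'w') as out:
        for rows in zip(*tables):
            merged = rows[0] + [c for r in rows[1:] for c in r[1:]] ### keep a single ID column
            out.write('\t'.join(merged) + '\n')
    return True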
Example #37
py2app_dirs = py2app_ge_dirs + py2app_aa_dirs

for i in py2app_aa_dirs:    
    i = string.replace(i,'AltAnalyze.app','AltAnalyzeViewer.app')
    py2app_dirs.append(i)

if ('linux' in sys.platform or 'posix' in sys.platform) and getattr(sys, 'frozen', False): ### For PyInstaller
    application_path = os.path.dirname(sys.executable)
    #application_path = sys._MEIPASS  ### should be the same as the above
else:
    if '..' in __file__:
        """ Indicates the file callin unique.py is in a subdirectory """
        try:
            if '.py' in __file__:
                import export
                application_path = export.findParentDir(string.split(__file__,'..')[0][:-1])
            else:
                application_path = os.getcwd()
        except Exception:
            application_path = os.getcwd()
    else:
        application_path = os.path.dirname(__file__)

if len(application_path)==0:
    application_path = os.getcwd()

if 'AltAnalyze?' in application_path:
    application_path = string.replace(application_path,'//','/')
    application_path = string.replace(application_path,'\\','/') ### If /// present
    application_path = string.split(application_path,'AltAnalyze?')[0]
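For reference, the frozen check above is the standard PyInstaller idiom: sys.frozen is only defined inside a bundled executable, so getattr with a False default keeps ordinary interpreter runs working. A minimal standalone version of the same decision (a sketch, not the module's exact logic):

import os, sys

if getattr(sys, 'frozen', False):
    ### Bundled executable (PyInstaller/py2app): resources live next to the binary
    application_path = os.path.dirname(sys.executable)
else:
    ### Plain interpreter: resolve relative to this source file
    application_path = os.path.dirname(os.path.abspath(__file__))
if len(application_path) == 0:
    application_path = os.getcwd()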
Example #38
def processBarcodes(viral_barcode_file, cell_cluster_file, reference_48mers):
    eo = export.ExportFile(viral_barcode_file[:-4] + '-cleaned.txt')
    parent = export.findParentDir(viral_barcode_file)
    eom = export.ExportFile(parent + '/MultiLin-cells.txt')
    ### Import a file with the sample names in the groups file in the correct order
    viral_barcodes = {}
    repair = {}
    short = {}
    cluster_header = []

    cell_clusters = {}
    for line in open(cell_cluster_file, 'rU').xreadlines():
        data = cleanUpLine(line)
        cell, cluster, cluster_name = string.split(data, '\t')
        cell_clusters[cell] = cluster_name
        if cluster_name not in cluster_header:
            cluster_header.append(cluster_name)

    cells_with_virus = {}
    for line in open(viral_barcode_file, 'rU').xreadlines():
        data = cleanUpLine(line)
        cellular, viral = string.split(data, '\t')
        if cellular in cell_clusters:
            try:
                if viral not in cells_with_virus[cellular]:
                    cells_with_virus[cellular].append(viral)
            except Exception:
                cells_with_virus[cellular] = [viral]
            if len(viral) < 48:
                #if len(viral)<38:
                if viral not in repair:
                    repair[viral] = [cellular]
                else:
                    if cellular not in repair[viral]:
                        repair[viral].append(cellular)
            else:
                #short[viral[:35]]=viral
                try:
                    if cellular not in viral_barcodes[viral]:
                        viral_barcodes[viral].append(cellular)
                except Exception:
                    viral_barcodes[viral] = [cellular]

    ### Repair the short sequences
    for viral_short in repair:
        cellular_barcodes = repair[viral_short]
        if viral_short[:35] in short:
            viral = short[viral_short[:35]]
            for cellular in cellular_barcodes:
                try:
                    if cellular not in viral_barcodes[viral]:
                        viral_barcodes[viral].append(cellular)
                except Exception:
                    viral_barcodes[viral] = [cellular]
    print len(viral_barcodes), 'unique viral barcodes present'

    #print cells_with_virus['ACGCCGATCTGTTGAG']
    #print cells_with_virus['CAGAATCCAAACTGCT']
    #sys.exit()

    if reference_48mers != None:
        valid_barcodes = 0
        for viral in viral_barcodes:
            if viral in reference_48mers:
                valid_barcodes += 1
        print valid_barcodes, 'unique valid viral barcodes present'

    #"""
    ### If the viral barcodes have frequent errors - associate the error with the reference in a cell-specific manner
    ### Only one virus for cell should be present unless it is a doublet
    print len(cells_with_virus), 'cells with viral barcodes'
    doublet_cell = {}
    mismatch_to_match = {}
    cells_with_valid_barcodes = 0
    viral_barcodes_overide = {}
    cellular_barcodes_overide = {}
    for cellular in cells_with_virus:
        cell_5prime = {}
        cell_3prime = {}
        ref_sequences = []
        if len(cells_with_virus[cellular]) > 1:
            for i in cells_with_virus[cellular]:
                try:
                    cell_5prime[i[:10]].append(i)
                except Exception:
                    cell_5prime[i[:10]] = [i]
                try:
                    cell_3prime[i[-10:]].append(i)
                except Exception:
                    cell_3prime[i[-10:]] = [i]
                if reference_48mers == None:
                    ref_sequences.append(i)
                elif i in reference_48mers:
                    ref_sequences.append(i)
            if len(ref_sequences) > 0:
                cells_with_valid_barcodes += 1  ### Determine how many cells have valid viral barcodes
            cell_5prime_ls = []
            cell_3prime_ls = []
            for i in cell_5prime:
                cell_5prime_ls.append([len(cell_5prime[i]), i])
            for i in cell_3prime:
                cell_3prime_ls.append([len(cell_3prime[i]), i])
            cell_5prime_ls.sort()
            cell_3prime_ls.sort()

            for seq in ref_sequences:
                if cell_5prime_ls[-1][1] in seq and cell_3prime_ls[-1][
                        1] in seq:
                    ref_seq = seq
            try:
                viral_barcodes_overide[ref_seq].append(cellular)
            except:
                viral_barcodes_overide[ref_seq] = [cellular]
            cellular_barcodes_overide[cellular] = [ref_seq]
            for y in cell_5prime[cell_5prime_ls[-1][1]]:
                mismatch_to_match[y] = ref_seq
            for y in cell_3prime[cell_3prime_ls[-1][1]]:
                mismatch_to_match[y] = ref_seq

        else:
            for i in cells_with_virus[cellular]:
                if reference_48mers == None:
                    cells_with_valid_barcodes += 1
                elif i in reference_48mers:
                    cells_with_valid_barcodes += 1  ### Determine how many cells have valid viral barcodes
                try:
                    viral_barcodes_overide[i].append(cellular)
                except:
                    viral_barcodes_overide[i] = [cellular]

    viral_barcodes = viral_barcodes_overide
    cells_with_virus = cellular_barcodes_overide

    ### Update the viral_barcodes dictionary
    viral_barcodes2 = {}
    cells_with_virus2 = {}
    for v in viral_barcodes:
        cell_barcodes = viral_barcodes[v]
        proceed = False
        if v in mismatch_to_match:
            v = mismatch_to_match[v]
            proceed = True
        elif reference_48mers == None:
            proceed = True
        elif v in reference_48mers:
            proceed = True
        if proceed:
            if v in viral_barcodes2:
                for c in cell_barcodes:
                    if c not in viral_barcodes2:
                        viral_barcodes2[v].append(c)
            else:
                viral_barcodes2[v] = cell_barcodes

    print cells_with_valid_barcodes, 'cells with valid viral barcodes.'
    viral_barcodes = viral_barcodes2
    ### Update the cells_with_virus dictionary
    for v in viral_barcodes:
        cell_barcodes = viral_barcodes[v]
        for c in cell_barcodes:
            if c in cells_with_virus2:
                if v not in cells_with_virus2[c]:
                    cells_with_virus2[c].append(v)
            else:
                cells_with_virus2[c] = [v]
    cells_with_virus = cells_with_virus2

    for c in cells_with_virus:
        if len(cells_with_virus[c]) > 1:
            doublet_cell[c] = []
    print len(doublet_cell), 'doublets'
    #print cells_with_virus['ACGCCGATCTGTTGAG']
    #print cells_with_virus['CAGAATCCAAACTGCT']
    #sys.exit()

    print len(cells_with_virus), 'updated cells with virus'
    print len(viral_barcodes), 'updated unique viral barcodes'
    #"""

    #reference_48mers={}

    multi_cell_mapping = 0
    unique_cells = {}
    multiMappingFinal = {}
    import collections
    import unique
    event_db = collections.OrderedDict()
    for cluster in cluster_header:
        event_db[cluster] = '0'
    k_value = 1
    import unique
    cluster_hits_counts = {}
    cluster_pairs = {}
    custom = []
    cells_per_pattern = {}
    for viral in viral_barcodes:
        clusters = []
        k = len(unique.unique(viral_barcodes[viral]))
        if k > k_value:
            proceed = True
            if reference_48mers == None:
                proceed = True
            elif len(reference_48mers) > 0:
                if viral in reference_48mers:
                    proceed = True
                else:
                    proceed = False
            if proceed:
                viral_cluster_db = copy.deepcopy(event_db)  ### copy this
                multi_cell_mapping += 1
                cell_tracker = []
                multilin = []
                all_cells = []
                for cell in viral_barcodes[viral]:
                    #if cell not in doublet_cell:
                    cell_tracker.append(cell)
                    try:
                        unique_cells[cell].append(viral)
                    except:
                        unique_cells[cell] = [viral]
                    if cell in cell_clusters:
                        cluster = cell_clusters[cell]
                        if 'Multi-Lin' == cluster:
                            multilin.append(cell)
                        all_cells.append(cell)
                        viral_cluster_db[cluster] = '1'
                        clusters.append(cluster)
                c1 = unique.unique(clusters)
                c2 = string.join(c1, '|')
                try:
                    cells_per_pattern[c2] += all_cells
                except:
                    cells_per_pattern[c2] = all_cells
                #if c1 == ['Multi-Lin c4-Mast']:
                #if c1 == ['MultiLin','MEP','Myelo-1'] or  c1 == ['MultiLin','MEP','Myelo-2'] or  c1 == ['MultiLin','MEP','Myelo-4']:
                #if 'Multi-Lin c4-Mast' in c1 and ('ERP-primed' not in c1 and 'MEP' not in c1 and 'MKP-primed' not in c1 and 'MKP' not in c1 and 'ERP' not in c1) and 'Monocyte' not in c1 and 'e-Mono' not in c1 and ('Gran' in c1 or 'Myelo-1' in c1 or 'Myelo-2' in c1 and 'Myelo-3' in c1 and 'Myelo-4' in c1):
                #if 'Multi-Lin' in c1 and ('e-Mono' in c1 or 'Monocyte' in c1) and ('ERP-primed' in c1 or 'MEP' in c1 or 'MKP-primed' in c1 or 'MKP' in c1) and ('Gran' in c1 or 'Myelo-4' in c1 or 'Myelo-1' in c1 or 'Myelo-2' in c1 or 'Myelo-3' in c1):
                if 'Multi-Lin' in c1:
                    for cell in multilin:
                        eom.write(
                            string.join(c1, '|') + '\t' + cell + '\t' + viral +
                            '\n')
                    custom += viral_barcodes[viral]
                    #print 'custom:',custom

                multiMappingFinal[viral] = viral_cluster_db

        ### Count the number of cluster pairs to make a weighted network
        for c1 in clusters:
            for c2 in clusters:
                if c1 != c2:
                    try:
                        cx = cluster_pairs[c1]
                        try:
                            cx[c2] += 1
                        except:
                            cx[c2] = 1
                    except:
                        cx = {}
                        cx[c2] = 1
                        cluster_pairs[c1] = cx
        clusters = string.join(unique.unique(clusters), '|')
        try:
            cluster_hits_counts[clusters] += 1
        except Exception:
            cluster_hits_counts[clusters] = 1
    #sys.exit()
    #print custom

    for cluster in cluster_pairs:
        cluster_counts = []
        cx = cluster_pairs[cluster]
        for c2 in cx:
            count = cx[c2]
            cluster_counts.append([count, c2])
        cluster_counts.sort()
        cluster_counts.reverse()
        #print cluster, cluster_counts
    print len(multiMappingFinal)

    final_ranked_cluster_hits = []
    for clusters in cluster_hits_counts:
        final_ranked_cluster_hits.append(
            [cluster_hits_counts[clusters], clusters])
    final_ranked_cluster_hits.sort()
    final_ranked_cluster_hits.reverse()
    for (counts, clusters) in final_ranked_cluster_hits:
        try:
            print str(counts) + '\t' + clusters + '\t' + str(
                len(unique.unique(cells_per_pattern[clusters])))
            #print cells_per_pattern[clusters];sys.exit()
        except:
            print str(counts) + '\t' + clusters

    eo.write(string.join(['UID'] + cluster_header, '\t') + '\n')
    for viral_barcode in multiMappingFinal:
        cluster_db = multiMappingFinal[viral_barcode]
        hits = []
        for cluster in cluster_db:
            hits.append(cluster_db[cluster])
        eo.write(string.join([viral_barcode] + hits, '\t') + '\n')
    eo.close()

    eo = export.ExportFile(viral_barcode_file[:-4] + '-cells-' + str(k_value) +
                           '.txt')
    for cell in unique_cells:
        #eo.write(cell+'\t1\t1\t'+str(len(unique_cells[cell]))+'\t'+string.join(unique_cells[cell],'|')+'\n')
        eo.write(cell + '\t1\t1\t\n')
    eo.close()
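A hedged usage sketch for processBarcodes; the file paths are placeholders, and the 48-mer whitelist is read here from a plain one-column text file (an assumption, since the source does not show how reference_48mers is built).

### Hypothetical invocation (paths are placeholders):
reference_48mers = {}
for line in open('/data/LARRY/reference_48mer_barcodes.txt', 'rU'):
    reference_48mers[line.rstrip('\r\n')] = []
processBarcodes('/data/LARRY/viral_cell_barcodes.txt',     ### cell barcode <tab> viral barcode
                '/data/ICGS/cell_cluster_assignments.txt', ### cell <tab> cluster <tab> cluster name
                reference_48mers)                          ### or None to skip whitelist filtering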
Example #39
def CompleteWorkflow(InputFile, EventAnnot, rho_cutoff, strategy, seq, gsp,
                     forceBroadClusters, turn):
    """ This function is used perform a single-iteration of the OncoSplice workflow (called from main),
    including the unsupervised splicing analysis (splice-ICGS) and signature depletion """

    ### Filter the EventAnnotation PSI file with non-depleted events from the prior round
    FilteredEventAnnot = filterEventAnnotation.FilterFile(
        InputFile, EventAnnot, turn)

    try:
        print "Running splice-ICGS for feature selection - Round" + str(turn)

        ### Reset the below variables which can be altered in prior rounds
        gsp.setGeneSelection('')
        gsp.setGeneSet('None Selected')
        gsp.setPathwaySelect([])
        if forceBroadClusters == True:
            ### Find Broad clusters with at least 25% of all samples
            originalSamplesDiffering = gsp.SamplesDiffering()
            gsp.setSamplesDiffering(int(SampleNumber * 0.25))

        print 'Number varying samples to identify:', gsp.SamplesDiffering()

        graphic_links3 = RNASeq.singleCellRNASeqWorkflow(species,
                                                         'exons',
                                                         InputFile,
                                                         mlp,
                                                         exp_threshold=0,
                                                         rpkm_threshold=0,
                                                         parameters=gsp)
        if forceBroadClusters == True:
            gsp.setSamplesDiffering(originalSamplesDiffering)

        Guidefile = graphic_links3[-1][-1]
        Guidefile = Guidefile[:-4] + '.txt'

        print "Running block identification for rank analyses - Round" + str(
            turn)
        ### Parameters are fixed as they are distinct
        RNASeq_blockIdentification.correlateClusteredGenesParameters(
            Guidefile,
            rho_cutoff=0.4,
            hits_cutoff=4,
            hits_to_report=50,
            ReDefinedClusterBlocks=True,
            filter=True)
        Guidefile_block = Guidefile[:-4] + '-BlockIDs.txt'
        NMFinput, Rank = NMF_Analysis.FilterFile(Guidefile, Guidefile_block,
                                                 InputFile, turn)

    except Exception:
        print 'UNKNOWN ERROR!!!!! Setting Rank=0'
        #print traceback.format_exc()
        Rank = 0

    if Rank > 1:
        ### ADJUST THE RANKS - MUST UPDATE!!!!
        if turn == 1:
            if force_broad_round1:
                #Rank=2
                Rank = Rank
            else:
                if Rank > 2:
                    Rank = 30
        else:
            if Rank > 2:
                Rank = 30
        if seq == "bulk":
            use_adjusted_p = True
        else:
            use_adjusted_p = False

        print "Running NMF analyses for dimension reduction using " + str(
            Rank) + " ranks - Round" + str(turn)
        NMFResult, BinarizedOutput, Metadata, Annotation = NMF_Analysis.NMFAnalysis(
            NMFinput, Rank, turn, strategy)
        print "Running Metadata Analyses for finding differential splicing events"
        rootdir, CovariateQuery = metaDataAnalysis.remoteAnalysis(
            'Hs', FilteredEventAnnot, Metadata, 'PSI', 0.1, use_adjusted_p,
            0.05, Annotation)
        counter = 1
        Guidedir = rootdir + CovariateQuery
        PSIdir = rootdir + 'ExpressionProfiles'
        global upd_guides
        upd_guides = []
        name = []
        group = []
        grplst = []
        for filename in os.listdir(Guidedir):
            if filename.startswith("PSI."):
                Guidefile = os.path.join(Guidedir, filename)
                psi = string.replace(filename, "PSI.", "")
                PSIfile = os.path.join(PSIdir, psi)
                omitcluster = FindTopUniqueEvents(Guidefile, psi, Guidedir)

                if omitcluster == 0:
                    group.append(counter)
                    name.append(psi)
                    counter += 1
        if counter > 2:
            dire = export.findParentDir(InputFile)
            output_dir = dire + 'OncoInputs'
            if os.path.exists(output_dir) == False:
                export.createExportFolder(output_dir)

            output_file = output_dir + '/SVMInput-Round' + str(turn) + '.txt'
            ExpandSampleClusters.filterRows(InputFile,
                                            output_file,
                                            filterDB=upd_guides,
                                            logData=False)
            header = ExpandSampleClusters.header_file(output_file)
            print "Running SVM prediction for improved subtypes - Round" + str(
                turn)
            train = ExpandSampleClusters.TrainDataGeneration(
                output_file, BinarizedOutput, name)
            grplst.append(group)
            ExpandSampleClusters.Classify(header, train, output_file, grplst,
                                          name, turn)
            header = Correlationdepletion.header_file(NMFResult)

            output_file = output_dir + '/DepletionInput-Round' + str(
                turn) + ".txt"
            sampleIndexSelection.filterFile(InputFile, output_file, header)
            print "Running Correlation Depletion - Round" + str(turn)
            commonkeys, count = Correlationdepletion.FindCorrelations(
                NMFResult, output_file, name)
            Depleted = Correlationdepletion.DepleteSplicingevents(
                commonkeys, output_file, count, InputFile)
            InputFile = Depleted

            flag = True
        else:
            try:
                print "Running K-means analyses instead of NMF - Round" + str(
                    turn)
                header = []
                header = Kmeans.header_file(Guidefile_block)
                Kmeans.KmeansAnalysis(Guidefile_block, header, InputFile, turn)
                flag = False
            except Exception:
                print 'WARNING!!!!! DID NOT RUN K-MEANS!!!!!'
                print traceback.format_exc()
                flag = False
    else:
        if Rank == 1:
            try:
                print "Running K-means analyses instead of NMF - Round" + str(
                    turn)
                header = []
                header = Kmeans.header_file(Guidefile_block)
                Kmeans.KmeansAnalysis(Guidefile_block, header, InputFile, turn)
                flag = False
            except Exception:
                print 'WARNING!!!!! DID NOT RUN K-MEANS!!!!!'
                print traceback.format_exc()
                flag = False
        else:
            flag = False

    return flag, InputFile, FilteredEventAnnot
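Unlike Example #31, this variant expects the caller to build and reuse the GeneSelectionParameters object across rounds. A hedged caller-side sketch follows; the species, file path, and every parameter value are illustrative assumptions only.

### Hypothetical set-up (all values and paths are illustrative):
EventAnnot = '/data/Hs_RNASeq_top_alt_junctions-PSI_EventAnnotation.txt'
InputFile = EventAnnot
turn = 1
gsp = UI.GeneSelectionParameters('Hs', 'RNASeq', 'RNASeq')
gsp.setNormalize('median')
gsp.setSampleDiscoveryParameters(0, 0, 1.2, 4, False, 'Genes', 'protein_coding',
                                 False, 'euclidean', 'hopach', 0.4)
flag, InputFile, EventAnnot = CompleteWorkflow(InputFile, EventAnnot, 0.4,
                                               'conservative', 'bulk', gsp,
                                               False, turn)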
Example #40
def parseResultfolders(motifdir, GEdir, SFlist):
    sfs = []
    for lin in open(SFlist, 'rU').xreadlines():
        s = lin.rstrip('\r\n')
        s1 = string.split(s, '\t')
        sfs.append(s1[0])

    mappingdict = defaultdict(list)
    allden = []
    for filename in os.listdir(motifdir):
        name = filename
        mapping = []
        dellst = []

        if "._" not in filename and "Events" not in filename:
            fol = os.path.join(motifdir, filename)
            if os.path.isdir(fol):
                #for filename2 in os.listdir(fol):
                #filnam2=os.path.join(fol,filename2)
                #if "._" not in filnam2:
                #   if os.path.isdir(filnam2):
                #       #print filnam2
                #       flag=0
                #       if "._" not in filename2:
                #           name=filename+":"+filename2
                #           flag=1
                #
                #       if flag==1:
                for filename3 in os.listdir(fol):

                    if filename3 == "finalResults.tab":

                        clipres = os.path.join(fol, filename3)
                        for lin in open(clipres, 'rU').xreadlines():

                            q = lin.rstrip('\r\n')
                            q1 = string.split(q, '\t')

                            clipnam = q1[0] + ":" + q1[1] + ":" + q1[2]
                            mappingdict[name, clipnam, "Clipseq"] = q1[11]

                    if filename3 == "output_TF_strand":
                        knownrbp = os.path.join(fol, filename3)
                        for filename4 in os.listdir(knownrbp):
                            if filename4 == "knownResults.txt":
                                filenam4 = os.path.join(knownrbp, filename4)
                                try:
                                    head = 0
                                    for line in open(filenam4,
                                                     'rU').xreadlines():
                                        q = line.rstrip('\r\n')
                                        q1 = string.split(q, '\t')
                                        if head == 0:
                                            motif = q1.index('Motif Name')
                                            pval = q1.index('P-value')
                                            head = 1
                                            continue
                                        else:
                                            mappingdict[
                                                name, q1[motif],
                                                "Cisbp_Actual"] = q1[pval]

                                except Exception:
                                    continue

                    if filename3 == "output1":
                        denovorbp = os.path.join(fol, filename3)
                        for filename4 in os.listdir(denovorbp):
                            if filename4 == "homerResults.html":
                                denolink = "file://" + str(
                                    os.path.join(denovorbp, filename4))
                                #print denolink
                                html = urllib2.urlopen(denolink).read()
                                soup = BeautifulSoup(html)
                                for table in soup.find_all('table'):
                                    for row in table.find_all('tr'):
                                        col = map(
                                            cell_text,
                                            row.find_all(re.compile('t[dh]')))

                                        if col[2] == "P-value":
                                            continue
                                        else:

                                            motname = string.split(
                                                col[7], "(")[0]
                                            mapping.append([
                                                name + ";" + motname,
                                                float(col[2])
                                            ])
                                            #mappingdict[name,motname,"Cisbp_denovo"]=col[2]

                    if filename3 == "output2":
                        denovorbp = os.path.join(fol, filename3)
                        for filename4 in os.listdir(denovorbp):
                            if filename4 == "homerResults.html":
                                denolink = "file://" + str(
                                    os.path.join(denovorbp, filename4))
                                #print denolink
                                html = urllib2.urlopen(denolink).read()
                                soup = BeautifulSoup(html)
                                for table in soup.find_all('table'):
                                    for row in table.find_all('tr'):
                                        col = map(
                                            cell_text,
                                            row.find_all(re.compile('t[dh]')))
                                        if col[2] == "P-value":
                                            continue
                                        else:
                                            motname = string.split(
                                                col[7], "(")[0]
                                            mapping.append([
                                                name + ";" + motname,
                                                float(col[2])
                                            ])
                                            #mappingdict[name,motname,"Cisbp_denovo"]=col[2]
                    if filename3 == "output3":
                        denovorbp = os.path.join(fol, filename3)
                        for filename4 in os.listdir(denovorbp):
                            if filename4 == "homerResults.html":
                                denolink = "file://" + str(
                                    os.path.join(denovorbp, filename4))
                                #print denolink
                                html = urllib2.urlopen(denolink).read()
                                soup = BeautifulSoup(html)
                                for table in soup.find_all('table'):
                                    for row in table.find_all('tr'):
                                        col = map(
                                            cell_text,
                                            row.find_all(re.compile('t[dh]')))
                                        if col[2] == "P-value":
                                            continue
                                        else:
                                            motname = string.split(
                                                col[7], "(")[0]
                                            mapping.append([
                                                name + ";" + motname,
                                                float(col[2])
                                            ])
                                            #mappingdict[name,motname,"Cisbp_denovo"]=col[2]
                    if filename3 == "output4":
                        denovorbp = os.path.join(fol, filename3)
                        for filename4 in os.listdir(denovorbp):
                            if filename4 == "homerResults.html":
                                denolink = "file://" + str(
                                    os.path.join(denovorbp, filename4))
                                #print denolink
                                html = urllib2.urlopen(denolink).read()
                                soup = BeautifulSoup(html)
                                for table in soup.find_all('table'):
                                    for row in table.find_all('tr'):
                                        col = map(
                                            cell_text,
                                            row.find_all(re.compile('t[dh]')))
                                        if col[2] == "P-value":
                                            continue
                                        else:
                                            motname = string.split(
                                                col[7], "(")[0]
                                            mapping.append([
                                                name + ";" + motname,
                                                float(col[2])
                                            ])
                                            #mappingdict[name,motname,"Cisbp_denovo"]=col[2]
                    if filename3 == "output5":
                        denovorbp = os.path.join(fol, filename3)
                        for filename4 in os.listdir(denovorbp):
                            if filename4 == "homerResults.html":
                                denolink = "file://" + str(
                                    os.path.join(denovorbp, filename4))
                                #print denolink
                                html = urllib2.urlopen(denolink).read()
                                soup = BeautifulSoup(html)
                                for table in soup.find_all('table'):
                                    for row in table.find_all('tr'):
                                        col = map(
                                            cell_text,
                                            row.find_all(re.compile('t[dh]')))
                                        if col[2] == "P-value":
                                            continue
                                        else:
                                            motname = string.split(
                                                col[7], "(")[0]
                                            mapping.append([
                                                name + ";" + motname,
                                                float(col[2])
                                            ])
                                            #print name,motname,col[2]
                                            #sys.exit()
                                            #mappingdict[name,motname,"Cisbp_denovo"]=col[2]
        mapping.sort(key=lambda x: x[0])

        mapping.sort(key=lambda x: x[1])
        #prev=""
        #output=os.path.join(motifdir,"test.txt")
        #output_w=open(output,"a")
        for i in range(len(mapping)):
            if mapping[i][0] not in dellst:
                mot = string.split(mapping[i][0], ";")[1]
                genes = []
                genes = string.split(mot, ":")[1:]
                allden.append([filename, mot, genes, mapping[i][1]])
                #output_w.write(mapping[i][0]+"\t"+str(mapping[i][1]))
                #      output_w.write("\n")
                dellst.append(mapping[i][0])
        final = {}
        for i in range(len(allden)):
            de = []
            de = allden[i]

            for q in de[2]:
                if q in final:
                    if de[3] < final[q][1]:
                        final[q] = [de[0], de[3], de[1]]
                else:
                    final[q] = [de[0], de[3], de[1]]
        for genes in final:

            de = []
            de = final[genes]
            mappingdict[de[0], de[2], "Cisbp_denovo"] = str(de[1])

    for filename in os.listdir(GEdir):
        if "GE" in filename and "._GE" not in filename:
            InputFile = os.path.join(GEdir, filename)
            name = string.replace(filename, "GE.", "")
            name = string.replace(name, "_vs_Others.txt", "")
            head = 0
            for line in open(InputFile, 'rU').xreadlines():
                q = line.rstrip('\r\n')
                q1 = string.split(q, '\t')
                if head == 0:
                    symbol = q1.index('Symbol')
                    adjp = q1.index('adjp')
                    head = 1
                    continue
                else:
                    if q1[symbol] in sfs:
                        mappingdict[name, q1[symbol], "GE"] = q1[adjp]
    dire = export.findParentDir(motifdir)
    output_dir = dire + 'MotifResults'
    export.createExportFolder(output_dir)
    output = output_dir + "/Motifresults.txt"

    #output=os.path.join(motifdir,"merged_output_allpvalues_nofold.txt")
    output1 = open(output, "w")
    #output1.write("signature"+"\t"+"gene"+"\t"+"tool"+"\t"+"p-value"+"\n")
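    # One row per (signature, gene, evidence-type) key in mappingdict, with the
    # associated p-value (motif enrichment) or adjusted p-value (GE).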
    for name, gene, key in mappingdict:
        output1.write(name + "\t" + gene + "\t" + key + "\t" +
                      mappingdict[name, gene, key] + "\n")
    output1.close()
    return output
Beispiel #41
0
def NMFAnalysis(expressionInputFile,NMFinputDir,Rank,platform,iteration=0,strategy="conservative"):
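    """Factorize the input expression matrix with sparse NMF (nimfa.Snmf) at the
    requested Rank, binarize the per-sample component assignments, and write the
    basis, binary, Metadata and Annotation files under <root_dir>/NMF-SVM/.
    Returns the paths to those four files."""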

    root_dir = export.findParentDir(NMFinputDir)[:-1]
    if 'ExpressionInput' in root_dir:
        root_dir = export.findParentDir(root_dir)
    if 'NMF-SVM' in root_dir:
        root_dir = export.findParentDir(root_dir)
        
    export.findFilename(NMFinputDir) # return value unused (no-op call)
        
    X=[]
    header=[]
    head=0
    exportnam=root_dir+'/NMF-SVM/NMF/round'+str(iteration)+'NMFsnmf_versionr'+str(Rank)+'.txt'
    export_res=export.ExportFile(exportnam)
    exportnam_bin=root_dir+'/NMF-SVM/NMF/round'+str(iteration)+'NMFsnmf_binary'+str(Rank)+'.txt'
    export_res1=export.ExportFile(exportnam_bin)
    exportnam_bint=root_dir+'/NMF-SVM/NMF/round'+str(iteration)+'NMFsnmf_binary_t_'+str(Rank)+'.txt'
    export_res5=export.ExportFile(exportnam_bint)
    MF_input = root_dir+'/NMF-SVM/ExpressionInput/exp.NMF-MarkerFinder.txt'
    export.customFileCopy(expressionInputFile,root_dir+'/NMF-SVM/ExpressionInput/exp.NMF-MarkerFinder.txt')
    export_res4=open(string.replace(MF_input,'exp.','groups.'),"w")
    export_res7=open(string.replace(MF_input,'exp.','comps.'),"w")
    exportnam2=root_dir+'/NMF-SVM/SubtypeAnalyses/round'+str(iteration)+'Metadata'+str(Rank)+'.txt'
    export_res2=export.ExportFile(exportnam2)
    exportnam3=root_dir+'/NMF-SVM/SubtypeAnalyses/round'+str(iteration)+'Annotation'+str(Rank)+'.txt'
    export_res3=export.ExportFile(exportnam3)
    #if 'Clustering' in NMFinputDir:
     #   count=1
      #  start=2
    #else:
    count=0
    start=1
    #print Rank
    for line in open(NMFinputDir,'rU').xreadlines():
        line=line.rstrip('\r\n')
        q= string.split(line,'\t')
        if head >count:
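            # Data row: parse sample values; entries that fail float() are
            # imputed with the row median computed from the parseable values.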
            val=[]
            val2=[]
            me=0.0
            
            for i in range(start,len(q)):
                try:
                    val2.append(float(q[i]))
                except Exception:
                    continue
            me=np.median(val2)
            for i in range(start,len(q)):
                try:
                    val.append(float(q[i]))
                except Exception:
                    val.append(float(me))
            #if q[1]==prev:
            X.append(val)
          
        else:
            export_res1.write(line)
            export_res.write(line)
            export_res1.write("\n")
            #export_res4.write(line)
            #export_res4.write("\n")
            export_res.write("\n")
            header=q
            head+=1
            continue   
    group=defaultdict(list)
        
    sh=[]
    X=np.array(X)
    #print X.shape
    mat=[]
    #mat=X
    mat=zip(*X)
    mat=np.array(mat)
    #print mat.shape
    #model = NMF(n_components=15, init='random', random_state=0)
    #W = model.fit_transform(mat)
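    # Sparse NMF on the samples-x-features matrix (mat = X transposed):
    # W (basis) is samples x Rank, H (coefficients) is Rank x features.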
    nmf = nimfa.Snmf(mat,seed="nndsvd", rank=int(Rank), max_iter=20,n_run=1,track_factor=False,theta=0.95)
    nmf_fit = nmf()
    W = nmf_fit.basis()
    W=np.array(W)
    #np.savetxt("basismatrix2.txt",W,delimiter="\t")
    H=nmf_fit.coef()
    H=np.array(H)
   # np.savetxt("coefficientmatrix2.txt",H,delimiter="\t")
    #print W.shape
    sh=W.shape
    export_res3.write("uid\tUID\tUID\n")
    if int(Rank)==2:
        par=1
    else:
        par=2
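    # 'par' is the standard-deviation multiplier used in the PSI branch below
    # to binarize component loadings at mean + par*std.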
    #for i in range(sh[1]):
    #    val=W[:,i]
    #    me=np.mean(val)
    #    st=np.std(val)
    #    export_res2.write(header[i+1])
    #    for j in range(sh[0]):
    #        if float(W[i][j])>=float(me+(par*st)):
    #          
    #            export_res2.write("\t"+str(1))
    #        else:
    #            export_res2.write("\t"+str(0))
    #       
    #    export_res2.write("\n")
    if platform != 'PSI':
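        # Non-PSI platforms: hard-assign each sample to the component with its
        # maximal basis weight (a single 1 per sample in the binary outputs).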
        sh=W.shape
        Z=[]
        export_res5.write("uid")
        export_res2.write("uid")
        for i in range(sh[1]):
            
            export_res5.write("\t"+'V'+str(i))
            export_res2.write("\t"+'V'+str(i))
            export_res3.write('V'+str(i)+"\t"+"Covariate"+"\t"+str(1)+"\n")
            
        export_res5.write("\n")
        export_res2.write("\n")
        export_res3.write("\n")
        for i in range(sh[0]):
            new_val=[]
            val=W[i,:]
            export_res2.write(header[i+1])
            export_res5.write(header[i+1])
            export_res4.write(header[i+1])
            flag=True
            for j in range(sh[1]):
                if W[i][j]==max(val) and flag:
                    export_res5.write("\t"+str(1))
                    export_res2.write("\t"+str(1))
                    new_val.append(1)
                    export_res4.write("\t"+str(j+1)+"\t"+'V'+str(j))
                    flag=False
                else:
                    export_res5.write("\t"+str(0))
                    export_res2.write("\t"+str(0))
                    new_val.append(0)
                
            Z.append(new_val)
            export_res5.write("\n")
            export_res2.write("\n")
            export_res4.write("\n")
        W=zip(*W)
        W=np.array(W)
        sh=W.shape
        Z=zip(*Z)
        Z=np.array(Z)
        for i in range(sh[0]):
            export_res.write('V'+str(i))
            export_res1.write('V'+str(i))
            for j in range(sh[1]):
                export_res.write("\t"+str(W[i][j]))
                export_res1.write("\t"+str(Z[i][j]))
            export_res.write("\n")
            export_res1.write("\n")
            
        export_res.close()
        export_res1.close()
        export_res2.close()
        export_res4.close()
        export_res5.close()
        Orderedheatmap.Classify(exportnam_bint)    
        
        return exportnam,exportnam_bin,exportnam2,exportnam3
    
    else:
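        # PSI data: binarize each component's loadings with an adaptive cutoff
        # (mean + par*std when the fixed 0.10 threshold would select fewer than
        # 3 or more than 40 samples, otherwise 0.10), then prune redundant
        # components.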
        W=zip(*W)
        W=np.array(W)
        sh=W.shape
        Z=[]
        for i in range(sh[0]):
            new_val=[]
            val=W[i,:]
            num=sum(x > 0.10 for x in val)
            if num >40 or num <3:
                compstd=True
            else:
                compstd=False
            me=np.mean(val)
            st=np.std(val)
            #print 'V'+str(i)
            export_res.write('V'+str(i))
            export_res1.write('V'+str(i))
           
            for j in range(sh[1]):
                
                if compstd:   
                    if float(W[i][j])>=float(me+(par*st)):
                    
                        export_res1.write("\t"+str(1))
                        new_val.append(1)
                    else:
                        export_res1.write("\t"+str(0))
                        new_val.append(0)
                else:
                    if float(W[i][j])>0.1:
                    
                        export_res1.write("\t"+str(1))
                        new_val.append(1)
                    else:
                        export_res1.write("\t"+str(0))
                        new_val.append(0)
                export_res.write("\t"+str(W[i][j]))
                
            Z.append(new_val)
            export_res.write("\n")
            export_res1.write("\n")
       # Z=zip(*Z)
        Z=np.array(Z)
        sh=Z.shape
        Z_new=[]
        val1=[]
        Z1=[]
        dellst=[]
        export_res2.write("uid")
        export_res5.write("uid")
        for i in range(sh[0]):
            indices=[]
            val1=Z[i,:]
            sum1=sum(val1)
            flag=False
            indices=[index for index, value in enumerate(val1) if value == 1]
            for j in range(sh[0]):
                val2=[]
                
                if i!=j:
                    val2=Z[j,:]
                    
                    sum2=sum([val2[x] for x in indices])
                    summ2=sum(val2)
                    try:
                        if float(sum2)/float(sum1)>0.5:
                            if summ2>sum1:
                                flag=True
                                #print str(i)
                    except Exception:
                        continue
            if flag==False:
    
                Z1.append(val1)
                export_res2.write("\t"+'V'+str(i))
                export_res5.write("\t"+'V'+str(i))
                export_res3.write('V'+str(i)+"\t"+"Covariate"+"\t"+str(1)+"\n")
        
        export_res2.write("\n")
        export_res5.write("\n")
        Z1=np.array(Z1)
        Z=Z1
        Z=zip(*Z)
        Z=np.array(Z)
        sh=Z.shape
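        # Per-sample rows over the retained components: a sample assigned to
        # more than two components is treated as ambiguous and zeroed out,
        # while the "conservative" strategy keeps the original assignments.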
            
        for i in range(sh[0]):
            val1=Z[i,:]
            #print sum(val1)
            #if sum(val)>2: 
            if sum(val1)>2:
                val=[0 if x==1 else x for x in val1]
            else:
                val=val1
            me=np.mean(val)
            st=np.std(val)
            export_res2.write(header[i+1])
            export_res5.write(header[i+1])
            for j in range(sh[1]):
                if strategy=="conservative":
                    export_res2.write("\t"+str(val1[j]))
                    export_res5.write("\t"+str(val1[j]))
                else:
                   export_res2.write("\t"+str(val[j]))
                   export_res5.write("\t"+str(val[j])) 
            export_res2.write("\n")
            export_res5.write("\n")
            Z_new.append(val)
        Z_new=zip(*Z_new)
        Z_new=np.array(Z_new)
        
        sh=Z_new.shape

        export_res.close()
        export_res1.close()
        export_res2.close()
        export_res5.close()
        Orderedheatmap.Classify(exportnam_bint)    
        if strategy=="conservative":
            return exportnam,exportnam_bin,exportnam2,exportnam3
        else:
            return exportnam,exportnam_bin,exportnam2,exportnam3