Example #1
def exportCorrelationResults(exp_input):
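    """ Write the per-tissue correlation results (one row per tissue, samples as
    columns), ranked by each tissue's best correlation; when use_scipy is set,
    also export z-score-converted p-values and return that file's path """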
    input_file = export.findFilename(exp_input)
    if '.txt' in exp_output_file:
        corr_output_file = string.replace(exp_output_file, 'DATASET',
                                          'LineageCorrelations')
    else:  ### Occurs when processing a non-standard AltAnalyze file
        corr_output_file = exp_output_file + '/' + input_file
    corr_output_file = string.replace(
        corr_output_file, '.txt',
        '-' + coding_type + '-' + compendiumPlatform + '.txt')
    if analysis_type == 'AltExon':
        corr_output_file = string.replace(corr_output_file, coding_type,
                                          'AltExon')
    filename = export.findFilename(corr_output_file)
    score_data = export.ExportFile(corr_output_file)
    zscore_output_dir = None  ### Defined up front so the return below cannot raise a NameError when use_scipy is False
    if use_scipy:
        zscore_output_dir = string.replace(corr_output_file, '.txt',
                                           '-zscores.txt')
        probability_data = export.ExportFile(zscore_output_dir)
        #adjustPValues()
        replacePearsonPvalueWithZscore()
    ### Make title row
    headers = ['Sample_name']
    for tissue in tissue_comparison_scores:
        for (r, p, sample) in tissue_comparison_scores[tissue]:
            headers.append(sample)
        break
    title_row = string.join(headers, '\t') + '\n'
    score_data.write(title_row)
    if use_scipy:
        probability_data.write(title_row)
    ### Export correlation data
    tissue_scores = {}
    tissue_probabilities = {}
    tissue_score_list = []  ### store and rank tissues according to max(score)
    for tissue in tissue_comparison_scores:
        scores = []
        probabilities = []
        for (r, p, sample) in tissue_comparison_scores[tissue]:
            scores.append(r)
            probabilities.append(p)
        tissue_score_list.append((max(scores), tissue))
        tissue_scores[tissue] = string.join(map(str, [tissue] + scores),
                                            '\t') + '\n'  ### export line
        if use_scipy:
            tissue_probabilities[tissue] = string.join(
                map(str, [tissue] + probabilities), '\t') + '\n'

    tissue_score_list.sort()
    tissue_score_list.reverse()
    for (score, tissue) in tissue_score_list:
        score_data.write(tissue_scores[tissue])
        if use_scipy:
            probability_data.write(tissue_probabilities[tissue])
    score_data.close()
    if use_scipy:
        probability_data.close()
    print filename, 'exported...'
    return zscore_output_dir
Example #2
def exportCorrelationResults(exp_input):
    input_file = export.findFilename(exp_input)
    if '.txt' in exp_output_file:
        corr_output_file = string.replace(exp_output_file,'DATASET','LineageCorrelations')
    else: ### Occurs when processing a non-standard AltAnalyze file
        corr_output_file = exp_output_file+'/'+input_file
    corr_output_file = string.replace(corr_output_file,'.txt','-'+coding_type+'-'+compendiumPlatform+'.txt')
    if analysis_type == 'AltExon':
        corr_output_file = string.replace(corr_output_file,coding_type,'AltExon')
    filename = export.findFilename(corr_output_file)
    score_data = export.ExportFile(corr_output_file)
    zscore_output_dir = None ### Defined up front so the return below cannot raise a NameError when use_scipy is False
    if use_scipy:
        zscore_output_dir = string.replace(corr_output_file,'.txt','-zscores.txt')
        probability_data = export.ExportFile(zscore_output_dir)
        #adjustPValues()
        replacePearsonPvalueWithZscore()
    ### Make title row
    headers=['Sample_name']
    for tissue in tissue_comparison_scores:
        for (r,p,sample) in tissue_comparison_scores[tissue]: headers.append(sample)
        break
    title_row = string.join(headers,'\t')+'\n'
    score_data.write(title_row)
    if use_scipy:
        probability_data.write(title_row)
    ### Export correlation data
    tissue_scores = {}; tissue_probabilities={}; tissue_score_list = [] ### store and rank tissues according to max(score)
    for tissue in tissue_comparison_scores:
        scores=[]
        probabilities=[]
        for (r,p,sample) in tissue_comparison_scores[tissue]:
            scores.append(r)
            probabilities.append(p)
        tissue_score_list.append((max(scores),tissue))
        tissue_scores[tissue] = string.join(map(str,[tissue]+scores),'\t')+'\n' ### export line
        if use_scipy:
            tissue_probabilities[tissue] = string.join(map(str,[tissue]+probabilities),'\t')+'\n'
        
    tissue_score_list.sort()
    tissue_score_list.reverse()
    for (score,tissue) in tissue_score_list:
        score_data.write(tissue_scores[tissue])
        if use_scipy:
            probability_data.write(tissue_probabilities[tissue])
    score_data.close()
    if use_scipy:
        probability_data.close()
    print filename,'exported...'
    return zscore_output_dir
Example #3
def runPyCombat(fl):
    """ This method was added specifically for AltAnalyze version 2.0.8 (not in the original GitHub code) """
    print 'Running Combat...',
    expr_input_dir = fl.ExpFile()
    pheno_dir = formatPhenoFile(fl)

    moved_exp_dir = export.findParentDir(
        expr_input_dir) + 'Non-Combat/' + export.findFilename(expr_input_dir)
    try:
        export.copyFile(expr_input_dir, moved_exp_dir)
        print 'Moved original expression file to:'
        print '\t' + moved_exp_dir
        ### now overwrite the original, excluding the commented rows
        export.cleanFile(
            expr_input_dir,
            removeExtra='#')  ### remove comments from the original file
    except Exception:
        pass

    pheno = pa.read_table(pheno_dir, index_col=0)
    dat = pa.read_table(expr_input_dir, index_col=0)

    mod = patsy.dmatrix("group", pheno, return_type="dataframe")
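    ### combat() below adjusts dat for the pheno 'batch' column while protecting
    ### the biological 'group' covariate captured in this model matrix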
    t = time.time()
    #print dat, pheno.batch, mod;sys.exit()
    ebat = combat(dat, pheno.batch, mod, 0)
    print "...Combat completed in %.2f seconds" % (time.time() - t)

    print 'Original expression file over-written with batch effect removal results...'
    ebat.to_csv(expr_input_dir, sep="\t")
Example #4
def FilterFile(Guidefile, PSI, turn=0):
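    """ Collect the row IDs from Guidefile (skipping one or two header lines for
    Clustering files) and write a copy of PSI filtered to those IDs under
    SubtypeAnalyses-Results/round<turn>/; round 1 files are copied unfiltered """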
    if 'Clustering' in Guidefile:
        count = 1
    else:
        count = 0
    val = []
    head = 0
    for line in open(Guidefile, 'rU').xreadlines():
        if head > count:
            line = line.rstrip('\r\n')
            q = string.split(line, '\t')
            val.append(q[0])
        else:
            head += 1
            continue

    dire = export.findParentDir(export.findParentDir(Guidefile)[:-1])
    output_dir = dire + 'SubtypeAnalyses-Results'
    if not os.path.exists(output_dir):
        export.createExportFolder(output_dir)

    #output_file = output_dir+'/round'+str(turn)+'/'+export.findFilename(PSI)+'-filtered.txt'
    output_file = output_dir + '/round' + str(
        turn) + '/' + export.findFilename(PSI)[:-4] + '-filtered.txt'
    try:
        os.mkdir(output_dir + '/round' + str(turn))
    except OSError:
        pass  ### round folder already exists
    if turn == 1:
        ### No need to filter this file
        shutil.copyfile(PSI, output_file)
    else:
        filterRows(PSI, output_file, filterDB=val)

    return output_file
Example #5
def downloadCurrentVersion(filename,secondary_dir,file_type):
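    """ Download secondary_dir/filename from the configured AltAnalyze download
    server into the file's parent directory; returns 'yes' unless the user
    cancels after a failed connection """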
    import UI
    file_location_defaults = UI.importDefaultFileLocations()

    ud = file_location_defaults['url'] ### Get the location of the download site from Config/default-files.csv
    url_dir = ud.Location() ### Only one entry
    
    dir = export.findParentDir(filename)
    dir = string.replace(dir,'hGlue','')  ### Used since the hGlue data is in a sub-directory
    filename = export.findFilename(filename)
    url = url_dir+secondary_dir+'/'+filename
    file,status = download(url,dir,file_type); continue_analysis = 'yes'
    if 'Internet' in status and 'nnot' not in filename: ### Exclude for Affymetrix annotation files
        print_out = "File:\n"+url+"\ncould not be found on the server or an internet connection is unavailable."
        if len(sys.argv)<2:
            try:
                UI.WarningWindow(print_out,'WARNING!!!')
                continue_analysis = 'no'
            except Exception:
                print 'cannot be downloaded';force_error ### undefined name - intentionally raises an error to halt
        else: print 'cannot be downloaded';force_error ### undefined name - intentionally raises an error to halt
    elif status == 'remove' and ('.zip' in file or '.tar' in file or '.gz' in file):
        try: os.remove(file) ### Not sure why this works now and not before
        except Exception: status = status
    return continue_analysis
Example #6
def runPyCombat(fl):
    """ This method was added specifically for AltAnalyze version 2.0.8 (not in the original GitHub code) """
    print "Running Combat...",
    expr_input_dir = fl.ExpFile()
    pheno_dir = formatPhenoFile(fl)

    moved_exp_dir = export.findParentDir(expr_input_dir) + "Non-Combat/" + export.findFilename(expr_input_dir)
    try:
        export.copyFile(expr_input_dir, moved_exp_dir)
        print "Moved original expression file to:"
        print "\t" + moved_exp_dir
        ### now overwrite the original, excluding the commented rows
        export.cleanFile(expr_input_dir, removeExtra="#")  ### remove comments from the original file
    except Exception:
        pass

    pheno = pa.read_table(pheno_dir, index_col=0)
    dat = pa.read_table(expr_input_dir, index_col=0)

    mod = patsy.dmatrix("group", pheno, return_type="dataframe")
    t = time.time()
    # print dat, pheno.batch, mod;sys.exit()
    ebat = combat(dat, pheno.batch, mod, 0)
    print "...Combat completed in %.2f seconds" % (time.time() - t)

    print "Original expression file over-written with batch effect removal results..."
    ebat.to_csv(expr_input_dir, sep="\t")
Example #7
def normalizeDataset(filename,
                     output=None,
                     normalization='quantile',
                     platform="3'array"):
    """ Perform Quantile Normalization on an input expression dataset """

    moved_exp_dir = filename  ### Fallback so the 'group' branch below is defined when an explicit output is supplied
    if output == None:
        output = filename
        moved_exp_dir = export.findParentDir(
            filename) + 'Non-Normalized/' + export.findFilename(filename)
        try:
            export.copyFile(filename, moved_exp_dir)
            print 'Moved original expression file to:'
            print '\t' + moved_exp_dir
        except Exception:
            pass

    if normalization == 'Quantile' or normalization == 'quantile':
        print "Importing data..."
        sample_expression_db = importExpressionValues(filename)
        print "Performing quantile normalization..."
        sample_expression_db = RNASeq.quantileNormalizationSimple(
            sample_expression_db)
        exportExpressionData(output, sample_expression_db)
    elif normalization == 'group':
        performGroupNormalization(moved_exp_dir, filename, platform)
    print 'Exported expression input file to:', output
Example #8
def importExonIDTranslations(array_type,species,translate_to_genearray):
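    """ Import exon-array probeset translations for the target platform; when
    targetPlatform is the gene array, map platform IDs through to gene-array
    probeset IDs via a recursive call """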
    gene_translation_db={}; gene_translation_db2={}
    if targetPlatform == 'gene' and translate_to_genearray == 'no':
        ### Get gene array to exon array probeset associations
        gene_translation_db = importExonIDTranslations('gene',species,'yes')
        for geneid in gene_translation_db:
            exonid = gene_translation_db[geneid]
            gene_translation_db2[exonid] = geneid
            #print exonid, geneid
        translation_db = gene_translation_db2
    else:

        filename = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_'+array_type+'-exon_probesets.txt'
        ### Import exon array to target platform translations (built for DomainGraph visualization)
        fn=filepath(filename); x=0; translation_db={}
        print 'Importing the translation file',export.findFilename(fn)
        for line in open(fn,'rU').xreadlines():
            data = cleanUpLine(line)
            t = string.split(data,'\t')
            if x==0:  x=1
            else:
                platform_id,exon_id = t
                if targetPlatform == 'gene' and translate_to_genearray == 'no':
                    try:
                        translation_db[platform_id] = gene_translation_db[exon_id] ### return RNA-Seq to gene array probeset ID
                        #print platform_id, exon_id, gene_translation_db[exon_id];sys.exit()
                    except Exception: pass
                else:
                    translation_db[platform_id] = exon_id
        del gene_translation_db; del gene_translation_db2
    return translation_db
Example #9
def covertAffyFormatToBED(filename, ConversionDB=None):
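    """ Convert an Affymetrix annotation CSV to a simple BED file or, when a
    ConversionDB of probeset coordinates is supplied, rewrite the quoted CSV
    rows (and the mm9 filename) with the updated mm10 coordinates """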
    print 'processing:',filename
    parent = export.findParentDir(filename)
    if ConversionDB==None:
        output_file = 'simple_chr.bed'
    else:
        output_file = export.findFilename(filename)
        output_file = string.replace(output_file,'mm9','mm10')
    export_obj = export.ExportFile(parent+'/'+output_file)
    fn=filepath(filename); entry_count=0; readfiles = False
    for line in open(fn,'rU').xreadlines():
        data = cleanUpLine(line)
        if data[:1]=='#': readfiles = False ### slice rather than index to avoid an IndexError on blank lines
        elif readfiles==False:
            readfiles = True
            if ConversionDB!=None:
               export_obj.write(line) ### Write header 
        else:
            try:
                t = string.split(data[1:-1],'","')
                probeset_id,chr,strand,start,stop = t[:5]
                int(start)
                if ConversionDB==None:
                    if 'chr' in chr:
                        export_obj.write(chr+'\t'+start+'\t'+stop+'\t'+probeset_id+'\n')
                else:
                    chr,start,stop = ConversionDB[probeset_id]
                    t = [probeset_id,chr,strand,start,stop] + t[5:]
                    values = '"'+string.join(t,'","')+'"\n'
                    export_obj.write(values)
                entry_count+=1
            except Exception:
                pass
    export_obj.close()
    print entry_count, 'entries saved to:',parent+'/'+output_file
Example #11
def downloadCurrentVersion(filename,secondary_dir,file_type):
    import UI
    file_location_defaults = UI.importDefaultFileLocations()

    ud = file_location_defaults['url'] ### Get the location of the download site from Config/default-files.csv
    url_dir = ud.Location() ### Only one entry
    
    dir = export.findParentDir(filename)
    dir = string.replace(dir,'hGlue','')  ### Used since the hGlue data is in a sub-directory
    filename = export.findFilename(filename)
    url = url_dir+secondary_dir+'/'+filename
    print url
    file,status = download(url,dir,file_type); continue_analysis = 'yes'
    if 'Internet' in status and 'nnot' not in filename: ### Exclude for Affymetrix annotation files
        print_out = "File:\n"+url+"\ncould not be found on the server or an internet connection is unavailable."
        if len(sys.argv)<2:
            try:
                UI.WarningWindow(print_out,'WARNING!!!')
                continue_analysis = 'no'
            except Exception:
                print 'cannot be downloaded';force_error ### undefined name - intentionally raises an error to halt
        else: print 'cannot be downloaded';force_error ### undefined name - intentionally raises an error to halt
    elif status == 'remove' and ('.zip' in file or '.tar' in file or '.gz' in file):
        try: os.remove(file) ### Not sure why this works now and not before
        except Exception: status = status
    return continue_analysis
Example #12
def downloadCurrentVersion(filename, secondary_dir, file_type):
    import UI
    file_location_defaults = UI.importDefaultFileLocations()

    uds = file_location_defaults[
        'url']  ### Get the location of the download site from Config/default-files.csv
    for ud in uds:
        url_dir = ud.Location()  ### Only one entry

    dir = export.findParentDir(filename)
    filename = export.findFilename(filename)
    url = url_dir + secondary_dir + '/' + filename

    file, status = download(url, dir, file_type)
    continue_analysis = 'yes'
    if 'Internet' in status:
        print_out = "File:\n" + url + "\ncould not be found on server or internet connection is unavailable."
        try:
            UI.WarningWindow(print_out, 'WARNING!!!')
            continue_analysis = 'no'
        except Exception:
            print url
            print 'cannot be downloaded'
            die  ### undefined name - intentionally raises an error to halt
    elif status == 'remove':
        try:
            os.remove(file)  ### Not sure why this works now and not before
        except Exception:
            status = status
    return continue_analysis
Example #13
def importTissueSpecificProfiles(species):
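    """ Load the tissue-specific marker database (or a custom marker file) for
    this species/platform into tissue_specific_db and record the tissue column
    names in tissues """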
    if analysis_type == 'AltExon':
        filename = 'AltDatabase/ensembl/'+species+'/'+species+'_'+targetPlatform +'_tissue-specific_AltExon_protein_coding.txt'
    else:
        filename = 'AltDatabase/ensembl/'+species+'/'+species+'_'+targetPlatform +'_tissue-specific_'+coding_type+'.txt'
    if customMarkerFile != False and customMarkerFile != None:
        if len(customMarkerFile)>0:
            filename = customMarkerFile
            
    #filename = 'AltDatabase/ensembl/'+species+'/random.txt'
    #print 'Target platform used for analysis:',species, targetPlatform, coding_type
    if value_type == 'calls':
        filename = string.replace(filename,'.txt','_stats.txt')
    fn=filepath(filename); x=0
    tissues_added={}
    for line in open(fn,'rU').xreadlines():
        data = cleanUpLine(line)
        t = string.split(data,'\t')
        
        if x==0:
            print 'Importing the tissue compendium database:',export.findFilename(filename)
            headers = t; x=1; index=0
            for i in headers:
                if 'UID' == i: ens_index = index; uid_index = index
                if analysis_type == 'AltExon': ens_index = ens_index ### Assigned above when analyzing probesets
                elif 'Ensembl' in i: ens_index = index
                if 'marker-in' in i: tissue_index = index+1; marker_in = index
                index+=1
            try:
                for i in t[tissue_index:]: tissues.append(i)
            except Exception:
                for i in t[1:]: tissues.append(i)
            if keyed_by == 'primaryID':
                try: ens_index = uid_index
                except Exception: pass
        else:
            try:
                gene = t[0]
                tissue_exp = map(float, t[1:])
                tissue_specific_db[gene]=x,tissue_exp ### Use this to only grab relevant gene expression profiles from the input dataset
            except Exception:
                try: gene = string.split(t[ens_index],'|')[0] ### Only consider the first listed gene - this gene is the best option based on ExpressionBuilder rankings
                except Exception: pass
                #if 'Pluripotent Stem Cells' in t[marker_in] or 'Heart' in t[marker_in]:
                #if t[marker_in] not in tissues_added: ### Only add the first instance of a gene for that tissue - used more for testing to quickly run the analysis
                tissue_exp = map(float, t[tissue_index:])
                if value_type == 'calls':
                    tissue_exp = produceDetectionCalls(tissue_exp,platform) ### 0 or 1 calls
                tissue_specific_db[gene]=x,tissue_exp ### Use this to only grab relevant gene expression profiles from the input dataset
                tissues_added[t[marker_in]]=[]
            x+=1
    print len(tissue_specific_db), 'genes in the tissue compendium database'

    if correlate_to_tissue_specific == 'yes':
        try: importTissueCorrelations(filename)
        except Exception:
            pass
Example #15
def visualizePathwayAssociations(filename,
                                 species,
                                 mod_type,
                                 wpid,
                                 imageExport=True):
    ### Log any potential problems
    log_file = filepath('webservice.log')
    log_report = open(log_file, 'w')
    if wpid == None:
        force_invalid_pathway  ### undefined name - intentionally raises an error when no WikiPathways ID is supplied

    global mod
    global species_code
    global graphic_link
    graphic_link = {}
    mod = mod_type
    species_code = species
    root_dir = export.findParentDir(filename)
    criterion_name = export.findFilename(filename)[:-4]
    log_report.write('Filename: %s and WPID %s\n' % (filename, wpid))
    if 'GO-Elite/input' in root_dir:
        root_dir = string.replace(root_dir, 'GO-Elite/input', 'WikiPathways')
    else:
        root_dir += 'WikiPathways/'
    analysis_type = 'Genes'
    id_db, column_headers = importDataSimple(filename, 'GO-Elite')
    log_report.write('GO-Elite input ID file imported successfully\n')
    log_report.write('%d IDs imported\n' % len(id_db))
    pathway_db = {}
    pathway_db[wpid] = PathwayData(
        None
    )  ### only need to analyze object (method allows for analysis of any number)
    pathway_db = getPathwayAs(pathway_db, species_code, mod)
    log_report.write(
        'Pathway data imported from GPML files obtained from webservice\n')
    id_color_db = getHexadecimalColorRanges(
        id_db, analysis_type)  ### example id_db is key:gene, value:fold
    graphID_db = getGraphIDAssociations(id_color_db, pathway_db, 'MOD')
    if imageExport != 'png':
        file_type = 'pdf'  ### svg, pdf, png
        getColoredPathway(root_dir,
                          graphID_db,
                          file_type,
                          '-' + criterion_name,
                          WPID=wpid)
    if imageExport != 'pdf':
        file_type = 'png'  ### svg, pdf, png
        getColoredPathway(root_dir,
                          graphID_db,
                          file_type,
                          '-' + criterion_name,
                          WPID=wpid)
    log_report.write(
        'Pathways colored and image data returned. Exiting webservice.\n')
    log_report.close()
    return graphic_link
Example #16
def normalizeDataset(filename, output=None):
    """ Perform Quantile Normalization on an input expression dataset """

    print "Importing data..."
    sample_expression_db = importExpressionValues(filename)
    print "Performing quantile normalization..."
    sample_expression_db = RNASeq.quantileNormalizationSimple(sample_expression_db)

    if output == None:
        output = filename
        moved_exp_dir = export.findParentDir(filename) + "Non-Quantile/" + export.findFilename(filename)
        try:
            export.copyFile(filename, moved_exp_dir)
            print "Moved original expression file to:"
            print "\t" + moved_exp_dir
        except Exception:
            pass

    exportExpressionData(output, sample_expression_db)
    print "Exported expression input file to:", output
Example #17
def normalizeDataset(filename,output = None, normalization='quantile',platform="3'array"):
    """ Perform Quantile Normalization on an input expression dataset """
    
    moved_exp_dir = filename ### Fallback so the 'group' branch below is defined when an explicit output is supplied
    if output==None:
        output = filename
        moved_exp_dir = export.findParentDir(filename)+'Non-Normalized/'+export.findFilename(filename)
        try:
            export.copyFile(filename, moved_exp_dir)
            print 'Moved original expression file to:'
            print '\t'+moved_exp_dir
        except Exception: pass
        
    if normalization == 'Quantile' or normalization == 'quantile':
        print "Importing data..."
        sample_expression_db = importExpressionValues(filename)
        print "Performing quantile normalization..."    
        sample_expression_db = RNASeq.quantileNormalizationSimple(sample_expression_db)
        exportExpressionData(output,sample_expression_db)
    elif normalization == 'group':
        performGroupNormalization(moved_exp_dir,filename,platform)    
    print 'Exported expression input file to:',output
Example #18
def visualizePathwayAssociations(filename,species,mod_type,wpid,imageExport=True):
    ### Log any potential problems
    log_file = filepath('webservice.log')
    log_report = open(log_file,'w')
    if wpid == None:
        force_invalid_pathway ### undefined name - intentionally raises an error when no WikiPathways ID is supplied
        
    global mod
    global species_code
    global graphic_link
    graphic_link={}
    mod = mod_type
    species_code = species
    root_dir = export.findParentDir(filename)
    criterion_name = export.findFilename(filename)[:-4]
    log_report.write('Filename: %s and WPID %s\n' % (filename,wpid))
    if 'GO-Elite/input' in root_dir:
        root_dir = string.replace(root_dir,'GO-Elite/input','WikiPathways')
    else:
        root_dir+='WikiPathways/'
    analysis_type = 'Genes'
    id_db,column_headers = importDataSimple(filename,'GO-Elite')
    log_report.write('GO-Elite input ID file imported successfully\n')
    log_report.write('%d IDs imported\n' % len(id_db))
    pathway_db={}
    pathway_db[wpid] = PathwayData(None) ### only need to analyze object (method allows for analysis of any number)
    pathway_db = getPathwayAs(pathway_db,species_code,mod)
    log_report.write('Pathway data imported from GPML files obtained from webservice\n')
    id_color_db = getHexadecimalColorRanges(id_db,analysis_type) ### example id_db is key:gene, value:fold
    graphID_db = getGraphIDAssociations(id_color_db,pathway_db,'MOD')
    if imageExport != 'png':
        file_type = 'pdf' ### svg, pdf, png
        getColoredPathway(root_dir,graphID_db,file_type,'-'+criterion_name,WPID=wpid)
    if imageExport != 'pdf':
        file_type = 'png' ### svg, pdf, png
        getColoredPathway(root_dir,graphID_db,file_type,'-'+criterion_name,WPID=wpid)
    log_report.write('Pathways colored and image data returned. Exiting webservice.\n')
    log_report.close()
    return graphic_link
Example #19
def normalizeDataset(filename, output=None):
    """ Perform Quantile Normalization on an input expression dataset """

    print "Importing data..."
    sample_expression_db = importExpressionValues(filename)
    print "Performing quantile normalization..."
    sample_expression_db = RNASeq.quantileNormalizationSimple(
        sample_expression_db)

    if output == None:
        output = filename
        moved_exp_dir = export.findParentDir(
            filename) + 'Non-Quantile/' + export.findFilename(filename)
        try:
            export.copyFile(filename, moved_exp_dir)
            print 'Moved original expression file to:'
            print '\t' + moved_exp_dir
        except Exception:
            pass

    exportExpressionData(output, sample_expression_db)
    print 'Exported expression input file to:', output
Example #20
def readFPKMs(path):
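    """ Parse a Cufflinks .fpkm_tracking file (plain or gzipped) and store each
    transcript's FPKM in the global sample_FPKM_db under the sample name """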
    if ".gz" in path:
        f = gzip.open(path, "rb")
    else:
        f = open(path, "rU")
    file_content = f.read()
    fpkm_data = string.split(file_content, "\n")
    sample = export.findFilename(path)
    if "fpkm_tracking" in sample:
        sample = string.split(sample, ".fpkm_tracking")[0]
        sample = string.replace(sample, ".sorted.genes", "")
    fpkm_db = {}
    transcript_db = {}
    firstLine = True
    row_count = 0
    for line in fpkm_data:
        data = cleanUpLine(line)
        t = string.split(data, "\t")
        if firstLine:
            try:
                track_i = t.index("tracking_id")
                gene_i = t.index("gene_id")
                fpkm_i = t.index("FPKM")
            except Exception:
                fpkm_i = 9
                gene_i = 3
                row_count = 1
            firstLine = False
        if firstLine == False and row_count > 0:
            if len(t) > 1:
                geneID = t[gene_i]
                transcriptID = t[gene_i]
                fpkm = t[fpkm_i]
                fpkm_db[transcriptID] = float(fpkm)
                transcript_db[transcriptID] = geneID
        row_count += 1
    sample_FPKM_db[sample] = fpkm_db
    return sample_FPKM_db, transcript_db
Example #21
def covertAffyFormatToBED(filename, ConversionDB=None):
    print 'processing:', filename
    parent = export.findParentDir(filename)
    if ConversionDB == None:
        output_file = 'simple_chr.bed'
    else:
        output_file = export.findFilename(filename)
        output_file = string.replace(output_file, 'mm9', 'mm10')
    export_obj = export.ExportFile(parent + '/' + output_file)
    fn = filepath(filename)
    entry_count = 0
    readfiles = False
    for line in open(fn, 'rU').xreadlines():
        data = cleanUpLine(line)
        if data[:1] == '#': readfiles = False  ### slice rather than index to avoid an IndexError on blank lines
        elif readfiles == False:
            readfiles = True
            if ConversionDB != None:
                export_obj.write(line)  ### Write header
        else:
            try:
                t = string.split(data[1:-1], '","')
                probeset_id, chr, strand, start, stop = t[:5]
                int(start)
                if ConversionDB == None:
                    if 'chr' in chr:
                        export_obj.write(chr + '\t' + start + '\t' + stop +
                                         '\t' + probeset_id + '\n')
                else:
                    chr, start, stop = ConversionDB[probeset_id]
                    t = [probeset_id, chr, strand, start, stop] + t[5:]
                    values = '"' + string.join(t, '","') + '"\n'
                    export_obj.write(values)
                entry_count += 1
            except Exception:
                pass
    export_obj.close()
    print entry_count, 'entries saved to:', parent + '/' + output_file
Example #22
def readFPKMs(path):
    if '.gz' in path:
        f=gzip.open(path,'rb')
    else:
        f=open(path,"rU")
    file_content=f.read()
    fpkm_data = string.split(file_content,'\n')
    sample = export.findFilename(path)
    if 'fpkm_tracking' in sample:
        sample = string.split(sample,'.fpkm_tracking')[0]
        sample = string.replace(sample,'.sorted.genes','')
    fpkm_db={}
    transcript_db={}
    firstLine=True
    row_count=0
    for line in fpkm_data:
        data = cleanUpLine(line)
        t = string.split(data,'\t')
        if firstLine:
            try:
                track_i = t.index('tracking_id')
                gene_i = t.index('gene_id')
                fpkm_i = t.index('FPKM')
            except Exception:
                fpkm_i = 9
                gene_i = 3
                row_count = 1
            firstLine = False
        if firstLine == False and row_count>0:
            if len(t)>1:
                geneID = t[gene_i]
                transcriptID = t[gene_i]
                fpkm = t[fpkm_i]
                fpkm_db[transcriptID] = float(fpkm)
                transcript_db[transcriptID] = geneID
        row_count+=1
    sample_FPKM_db[sample] = fpkm_db
    return sample_FPKM_db,transcript_db
Example #23
def readFPKMs(path):
    f=gzip.open(path,'rb')
    file_content=f.read()
    fpkm_data = string.split(file_content,'\n')
    sample = export.findFilename(path)
    fpkm_db={}
    transcript_db={}
    firstLine=True
    for line in fpkm_data:
        data = cleanUpLine(line)
        t = string.split(data,'\t')
        if firstLine:
            track_i = t.index('tracking_id')
            gene_i = t.index('gene_id')
            fpkm_i = t.index('FPKM')
            firstLine = False
        else:
            if len(t) > 1:  ### guard against the blank trailing line produced by the final '\n' split
                geneID = t[gene_i]
                transcriptID = t[gene_i]
                fpkm = t[fpkm_i]
                fpkm_db[transcriptID] = float(fpkm)
                transcript_db[transcriptID] = geneID
    sample_FPKM_db[sample] = fpkm_db
    return sample_FPKM_db,transcript_db
Example #24
def downloadCurrentVersion(filename,secondary_dir,file_type):
    import UI
    file_location_defaults = UI.importDefaultFileLocations()

    uds = file_location_defaults['url'] ### Get the location of the download site from Config/default-files.csv
    for ud in uds: url_dir = ud.Location() ### Only one entry
    
    dir = export.findParentDir(filename)  
    filename = export.findFilename(filename)
    url = url_dir+secondary_dir+'/'+filename
    
    file,status = download(url,dir,file_type); continue_analysis = 'yes'
    if 'Internet' in status:
        print_out = "File:\n"+url+"\ncould not be found on server or internet connection is unavailable."
        try:
            UI.WarningWindow(print_out,'WARNING!!!')
            continue_analysis = 'no'
        except Exception:
            print url
            print 'cannot be downloaded';die ### undefined name - intentionally raises an error to halt
    elif status == 'remove':
        try: os.remove(file) ### Not sure why this works now and not before
        except Exception: status = status
    return continue_analysis
Example #25
def readFPKMs(path):
    f = gzip.open(path, 'rb')
    file_content = f.read()
    fpkm_data = string.split(file_content, '\n')
    sample = export.findFilename(path)
    fpkm_db = {}
    transcript_db = {}
    firstLine = True
    for line in fpkm_data:
        data = cleanUpLine(line)
        t = string.split(data, '\t')
        if firstLine:
            track_i = t.index('tracking_id')
            gene_i = t.index('gene_id')
            fpkm_i = t.index('FPKM')
            firstLine = False
        else:
            if len(t) > 1:  ### guard against the blank trailing line produced by the final '\n' split
                geneID = t[gene_i]
                transcriptID = t[gene_i]
                fpkm = t[fpkm_i]
                fpkm_db[transcriptID] = float(fpkm)
                transcript_db[transcriptID] = geneID
    sample_FPKM_db[sample] = fpkm_db
    return sample_FPKM_db, transcript_db
Example #26
def Enrichment(Inputfile,mutdict,mutfile,Expand,header):
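    """ Test each cluster in Inputfile for enrichment of each mutation set in
    mutdict (z-score plus Fisher exact test with adjusted p-values), write full
    and top-hit result tables to a MutationEnrichment folder and return the
    best mutation label per cluster """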
    from collections import defaultdict  ### defaultdict is used unqualified below
    import mappfinder
    X=defaultdict(list)
    prev=""
    head=0
    group=defaultdict(list)
    enrichdict=defaultdict(float)
    mut=export.findFilename(mutfile)
    dire=export.findParentDir(Inputfile)
    output_dir = dire+'MutationEnrichment'
    export.createExportFolder(output_dir)

    exportnam=output_dir+'/Enrichment_Results.txt'
    export_enrich=open(exportnam,"w")
    exportnam=output_dir+'/Enrichment_tophits.txt'
    export_hit=open(exportnam,"w")
   
    export_enrich.write("Mutations"+"\t"+"Cluster"+"\t"+"r"+"\t"+"R"+"\t"+"n"+"\t"+"Sensitivity"+"\t"+"Specificity"+"\t"+"z-score"+"\t"+"Fisher exact test"+"\t"+"adjp value"+"\n")
    if Expand=="yes":
        header2=header_file(Inputfile,Expand="yes")
        
        for line in open(Inputfile,'rU').xreadlines():
            if head >0:
                line=line.rstrip('\r\n')
                q= string.split(line,'\t')
                for i in range(1,len(q)):
                    if q[i]==str(1):
                        #group[q[0]].append(header2[i-1])
                        group[header2[i-1]].append(q[0])
           
            else:
                head+=1
                continue
    else:
        for line in open(Inputfile,'rU').xreadlines():
            line=line.rstrip('\r\n')
            line=string.split(line,'\t')
            #for i in range(1,len(line)):
            group[line[2]].append(line[0])
   
    total_Scores={}
    for kiy in mutdict:
        if kiy =="MDP":
            print mutdict[kiy]
        groupdict={}
        remaining=[]
        remaining=list(set(header) - set(mutdict[kiy]))
        groupdict[1]=mutdict[kiy]
        groupdict[2]=remaining
        # export_enrich1.write(kiy)
        for key2 in group:
            r=float(len(list(set(group[key2])))-len(list(set(group[key2]) - set(mutdict[kiy]))))
            n=float(len(group[key2]))
            R=float(len(set(mutdict[kiy])))
            N=float(len(header))
        
            if r==0 or R==1.0:
                print kiy,key2,r,n,R,N
                pval=float(1)
                z=float(0)
                null_z = 0.000
                zsd = mappfinder.ZScoreData(key2,r,R,z,null_z,n)
                zsd.SetP(pval)
            else:
                try: z = Zscore(r,n,N,R)
                except : z = 0.0000
                ### Calculate a Z-score assuming zero matching entries
                try: null_z = Zscore(0,n,N,R)
                except Exception: null_z = 0.000

                try:
                    pval = mappfinder.FishersExactTest(r,n,R,N)
                    zsd = mappfinder.ZScoreData(key2,r,R,z,null_z,n)
                    zsd.SetP(pval)
                except Exception:
                    pval=1.0
                    zsd = mappfinder.ZScoreData(key2,r,R,z,null_z,n)
                    zsd.SetP(pval)

            if kiy in total_Scores:
                signature_db = total_Scores[kiy]
                signature_db[key2]=zsd ### Necessary format for the permutation function
            else:
                signature_db={key2:zsd}
                total_Scores[kiy] = signature_db
    sorted_results=[]
    mutlabels={}
    for kiy in total_Scores:
        signature_db = total_Scores[kiy]
        ### Updates the adjusted p-value instances
        mappfinder.adjustPermuteStats(signature_db)
        for signature in signature_db:
            zsd = signature_db[signature]
            results = [kiy,signature,zsd.Changed(),zsd.Measured(),zsd.InPathway(),str(float(zsd.PercentChanged())/100.0),str(float(float(zsd.Changed())/float(zsd.InPathway()))), zsd.ZScore(), zsd.PermuteP(), zsd.AdjP()] #string.join(zsd.AssociatedIDs(),'|')
            sorted_results.append([signature,float(zsd.PermuteP()),results])
    sorted_results.sort() ### Sort by p-value
    prev=""
    for (sig,p,values) in sorted_results:
        if sig!=prev:
            flag=True
            export_hit.write(string.join(values,'\t')+'\n')
        if flag:
            if (float(values[5])>=0.5 and float(values[6])>=0.5) or float(values[5])>=0.6 :
                mutlabels[values[1]]=values[0]
                flag=False
                export_hit.write(string.join(values,'\t')+'\n')
        export_enrich.write(string.join(values,'\t')+'\n')
        prev=sig
    if len(sorted_results)==0:
        export_enrich.write(string.join([splicing_factor,'NONE','NONE','NONE','NONE','NONE','NONE'],'\t')+'\n') ### splicing_factor is assumed to be defined at module level
    export_enrich.close()
    #print mutlabels
    return mutlabels
Example #27
def Enrichment(Guidefile, mutdict, mutfile, Expand, header):
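    """ Simpler variant of the enrichment test above: writes a single Fisher
    exact test result table alongside the Guidefile """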

    X = defaultdict(list)
    prev = ""
    head = 0
    group = defaultdict(list)
    mut = export.findFilename(mutfile)
    exportnam = Guidefile[:-4] + mut[:-4] + 'enrichment.txt'
    export_enrich = open(exportnam, "w")
    export_enrich.write("Mutations" + "\t" + "Cluster" + "\t" + "Pvalue" +
                        "\t" + "r" + "\t" + "R" + "\t" + "n" + "\t" +
                        "z-score" + "\t" + "Sensitivity" + "\t" +
                        "Specificity" + "\n")
    if Expand == "yes":
        header2 = header_file(Guidefile)

        for line in open(Guidefile, 'rU').xreadlines():
            if head > 0:
                line = line.rstrip('\r\n')
                q = string.split(line, '\t')
                for i in range(1, len(q)):
                    if q[i] == str(1):
                        group[q[0]].append(header2[i - 1])

            else:
                head += 1
                continue
    else:
        for line in open(Guidefile, 'rU').xreadlines():
            line = line.rstrip('\r\n')
            line = string.split(line, '\t')
            #for i in range(1,len(line)):
            group[line[2]].append(line[0])

    for kiy in mutdict:

        groupdict = {}
        remaining = []
        remaining = list(set(header) - set(mutdict[kiy]))
        groupdict[1] = mutdict[kiy]
        groupdict[2] = remaining
        for key2 in group:

            r = float(
                len(group[key2]) -
                len(list(set(group[key2]) - set(mutdict[kiy]))))
            n = float(len(group[key2]))
            R = float(len(set(mutdict[kiy])))
            N = float(len(header))
            #print kiy,key2,r,n,R,N
            if r == 0:
                pval = float(1)
                z = float(0)
            else:
                try:
                    pval, z = FishersExactTest(r, n, R, N)
                    export_enrich.write(
                        str(kiy) + "\t" + str(key2) + "\t" + str(pval) + "\t" +
                        str(r) + "\t" + str(R) + "\t" + str(n) + "\t" +
                        str(z) + "\t" + str(float(r) / (float(R))) + "\t" +
                        str(float(r) / (float(n))) + "\n")
                except Exception:
                    print r, n, R, N
                    pass
Example #28
def remoteGene(gene,Species,root_dir,comparison_file):
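    """ Draw an interactive matplotlib view of every transcript of a gene:
    exons colored by splicing-index direction, junction arrows, protein domain
    tracks and microRNA binding-site markers """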
    global Transcript_Annotations_File
    global ExonRegion_File
    global Selected_Gene
    global Prt_Trans_File
    global Prt_Regions_File
    global Prt_Boundaries_File
    global SplicingIndex_File
    global UniPrt_Regions_File
    global microRNA_File
    global domainAnnotation_db
    global platform
    global species

    Selected_Gene = str(gene)
    species = Species
    
    comparison_name = string.split(export.findFilename(comparison_file),'.')[0]
    ExonRegion_File = unique.filepath("AltDatabase/ensembl/"+species+"/"+species+"_Ensembl_exon.txt")
    Transcript_Annotations_File = unique.filepath("AltDatabase/ensembl/"+species+"/"+species+"_Ensembl_transcript-annotations.txt")
    Prt_Trans_File = searchDirectory("AltDatabase/ensembl/"+species+"/",'Ensembl_Protein')
    Prt_Regions_File = searchDirectory("AltDatabase/ensembl/"+species+"/",'ProteinFeatures')
    Prt_Boundaries_File = searchDirectory("AltDatabase/ensembl/"+species+"/",'ProteinCoordinates')
    UniPrt_Regions_File = searchDirectory("AltDatabase/uniprot/"+species+"/",'FeatureCoordinate')
    SplicingIndex_File = searchDirectory(root_dir+'/AltResults/ProcessedSpliceData/','splicing-index',secondary=comparison_name)
    platform = getPlatform(SplicingIndex_File)
    microRNA_File = searchDirectory("AltDatabase/"+species+"/"+platform,'microRNAs_multiple')
    #print(SplicingIndex_File)

    total_val = ProteinCentricIsoformView(Selected_Gene)
    junctions = total_val[0]
    p_boundaries = total_val[1]
    p_domains = total_val[2]
    transcript_db = total_val[3]
    exon_db = total_val[4]
    splice_db = total_val[5]
    microRNA_db = total_val[6]
    domainAnnotation_db = total_val[7]

    #for i in exon_db:
    #    print("THE", i, exon_db[i], "\n")

    #for i in microRNA_db:
    #        m_test = microRNA_db[i]
    #    print(len(m_test))
    #    for q in m_test:
    #        print("microRNA", q.ExonBlock(), q.Description(), q.BP(), "\n")

    #for i in exon_db["ENST00000349238"]:
    #    print(i[2].EnsemblRegion())
    
    domain_color_list = []
    for i in p_domains:
        ploy = p_domains[i]
        for a in ploy:
            domain_color_list.append(a[1])

    domain_color_list = list(set(domain_color_list))
    ### Cycle through six preset domain colors, repeating when there are more domains
    c_colors = [[0.8, 0.6, 0.1], [0.1, 0.6, 0.8], [0.6, 0.1, 0.8],
                [0.95, 0.6, 0.3], [0.3, 0.6, 0.95], [0.6, 0.3, 0.95]]
    domain_color_key = {}
    for index, item in enumerate(domain_color_list):
        domain_color_key[item] = c_colors[index % len(c_colors)]

    #for i in domain_color_key:
        #print(i, domain_color_key[i], "\n")
    
    Y = 100
    Transcript_to_Y = {}
    for transcript in transcript_db:
        Transcript_to_Y[transcript] = Y
        Y = Y + 300
    import traceback

    def onpick(event):
        #ind = event.ind
        print(event.artist.get_label())

    #for i in domainAnnotation_db: print(i,len(domainAnnotation_db));break
    
    fig = pylab.figure()
    
    ylim = Y + 200
    currentAxis = pylab.gca()
    #ax = pylab.axes()
    ax = fig.add_subplot(111)
    X_Pos_List = []
    CoordsBank = []
    
    for transcript in transcript_db:
        try:
            Junc_List = junctions[transcript]
            y_pos = Transcript_to_Y[transcript]
            Gene_List = exon_db[transcript]
            for entry in Gene_List:
                G_start = entry[0][0]
                G_end = entry[0][1]
                Exon_Object = entry[2]
                try:
                    LabelClass = splice_db[Exon_Object.EnsemblRegion()]
                    ExonName = Exon_Object.EnsemblExon()
                    RegCall = LabelClass.RegCall()
                    SplicingIndex = LabelClass.SplicingIndex()
                    PVal = LabelClass.PVal()
                    Midas = LabelClass.Midas()
                    Label = "\n" + "Exon: " + str(ExonName) + "\n" + "RegCall: "  + str(RegCall) + "\n" + "Splicing Index: " + str(SplicingIndex) + "\n" + "P-Value: " + str(PVal) + "\n" + "Midas Value: " + str(Midas) + "\n"
                    Label = string.replace(Label,"\n"," ")
                    if(RegCall == "UC"):
                        color_choice = "Grey"
                    else:
                        S_Int = float(SplicingIndex)
                        if(S_Int > 0):
                            #color_choice = (0.7, 0.7, 0.99)
                            color_choice = 'blue'
                        if(S_Int < 0):
                            #color_choice = (0.8, 0.4, 0.4)
                            color_choice = 'red'
                                            
                except:
                    #print(traceback.format_exc());sys.exit()
                    Label = ""
                    color_choice = "Grey"
                #print("Start", G_start, "end", G_end, "Region", entry[2].EnsemblRegion())
                ### The two color_flag parity branches drew identical patches, so a single call suffices
                currentAxis.add_patch(Rectangle((G_start, y_pos), (G_end - G_start), 50, color = color_choice, label = (entry[2].EnsemblRegion() + Label), picker = True))
                y_end = y_pos + 50
                try: CoordsBank.append((G_start, G_end, y_pos, y_end, 'Exon: '+entry[2].EnsemblRegion()+' '+ 'SI: '+str(SplicingIndex)[:4]+' p-value: '+str(Midas)[:4]))
                except Exception:
                    CoordsBank.append((G_start, G_end, y_pos, y_end, 'Exon: '+entry[2].EnsemblRegion()))
                #print(entry[2].EnsemblRegion(),y_pos,y_end)
                if(entry[2].EnsemblRegion() in microRNA_db):
                    microRNA_object = microRNA_db[entry[2].EnsemblRegion()]
                    mr_label = "MICRORNA MATCHES" + "\n"
                    for class_object in microRNA_object:
                        mr_exonname = class_object.ExonBlock()
                        mr_desc = class_object.Description() + " " + class_object.Algorithms()
                        #print(mr_desc)
                        mr_label = mr_label + mr_desc + "\n"
                    
                    currentAxis.add_patch(Rectangle((G_start, (y_pos - 75)), (G_end - G_start), 40, color = "Green", label = (mr_label), picker = True))
                    y_start = y_pos - 75
                    y_end = y_pos - 35
                    CoordsBank.append((G_start, G_end, y_start, y_end, mr_desc))
                
            for entry in Junc_List:
                junctionID = entry[-1]
                try:
                    LabelClass = splice_db[entry[2]]
                    RegCall = LabelClass.RegCall()
                    SplicingIndex = LabelClass.SplicingIndex()
                    PVal = LabelClass.PVal()
                    Midas = LabelClass.Midas()
                    Label = "\n" + "RegCall: " + str(RegCall) + "\n" + "Splicing Index: " + str(SplicingIndex) + "\n" + "P-Value: " + str(PVal) + "\n" + "Midas Value: " + str(Midas) + "\n"
                    if(float(SplicingIndex) > 0):
                        color_junc = "blue"
                    if(float(SplicingIndex) < 0):
                        color_junc = "red"
                    if(RegCall == "UC"):
                        color_junc = "grey"
                except:
                    Label = ""
                    color_junc = "grey"
                currentAxis.add_patch(Rectangle((entry[0], y_pos), (entry[1] - entry[0]), 50, color = "White", label = (str(entry[2]) + Label), picker = True))
                ax.arrow(entry[0], (y_pos+50), 8, 40, label = (str(entry[2]) + Label), color = color_junc, picker = True)
                ax.arrow((entry[0] + 8), (y_pos+90), 11, -40, label = (str(entry[2]) + Label), color = color_junc, picker = True)
                y_start = y_pos
                y_end = y_pos + 30
                #print(junctionID,y_start,y_end)
                CoordsBank.append((G_start, G_end, y_start, y_end, junctionID)) ### note: reuses the enclosing exon's G_start/G_end rather than this junction's own span

            try:
                P_Bound_List = p_boundaries[transcript]
                E_Start = P_Bound_List[-2]
                E_End = P_Bound_List[-1]
                P_Start = P_Bound_List[1]
                P_End = P_Bound_List[2]
                #print("Boundaries: ", P_Start, P_End)
                X_Pos_List.append(int(E_End))
                #currentAxis.add_patch(Rectangle((E_Start, y_pos), E_End, 50, color = "Blue"))
                try:
                    currentAxis.add_patch(Rectangle((P_Start, (y_pos + 120)), (P_End - P_Start), 10))
                except:
                    pass
                p_label_list = ["DEF"]
                #CoordsBank.append((P_Start, P_End, y_pos, P_End - P_Start, transcript)) ### Added by NS - needs work
                try: P_Domain_List = p_domains[transcript]
                except Exception: P_Domain_List=[]
                for entry in P_Domain_List:
                    #print("Domain", entry)
                    color_domain_choice = domain_color_key[entry[1]]
                    domain_annotation = domainAnnotation_db[entry[1]]
                    #domain_annotation = string.replace(domain_annotation,'REGION-','')
                    p_label = (str(entry[0]) +  " " + str(domain_annotation))
                    #print(entry[0], entry[2], entry[3], P_Start, P_End, domain_annotation, )
                    Repeat_Flag = 0
                    for i in p_label_list:
                        if(p_label == i):
                            Repeat_Flag = 1
                    if(Repeat_Flag == 1):
                        continue
                    p_label_list.append(p_label)               
                    currentAxis.add_patch(Rectangle((entry[2], y_pos + 100), (entry[3] - entry[2]), 50, color = color_domain_choice, label= p_label, picker = True))
                    y_start = y_pos + 100
                    y_end = y_pos + 150
                    CoordsBank.append((entry[2], entry[3], y_start, y_end, p_label))
            except Exception:
                pass
                #print(traceback.format_exc())
        except:
            #print(traceback.format_exc())
            pass
    pylab.ylim([0.0, ylim])
    try:
        max_x = max(X_Pos_List)
    except:
        max_x = 5000
    try:
        pylab.xlim([0.0, max_x])
    except:
        pylab.xlim([0.0, 3000])
    fig.canvas.mpl_connect('pick_event', onpick)
    def format_coord(x, y):
        for m in CoordsBank:
            if(x >= m[0] and x <= m[1] and y >= m[2] and y <= m[3]):
                string_display = m[4]
                return string_display
        string_display = "  "
        return string_display

    ax.format_coord = format_coord
    #datacursor(hover=True, formatter='{label}'.format, bbox=dict(fc='yellow', alpha=1), arrowprops=None)
    pylab.show()
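
The format_coord override above turns the static canvas into a browsable annotation track: every junction and domain patch records its bounding box and label in CoordsBank, and the cursor readout becomes a hit-test against that list. A minimal self-contained sketch of the same idea (the region list and label below are hypothetical):

import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle

### Hypothetical regions: (x_start, x_end, y_start, y_end, label)
coords_bank = [(10, 60, 20, 70, 'ENSG00000999999:E1.1-E2.1')]

fig, ax = plt.subplots()
for (x1, x2, y1, y2, label) in coords_bank:
    ax.add_patch(Rectangle((x1, y1), x2 - x1, y2 - y1, label=label, picker=True))

def format_coord(x, y):
    ### Show the feature label instead of raw coordinates when hovering a region
    for (x1, x2, y1, y2, label) in coords_bank:
        if x1 <= x <= x2 and y1 <= y <= y2:
            return label
    return '  '

ax.format_coord = format_coord
ax.set_xlim(0, 100); ax.set_ylim(0, 100)
plt.show()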
Example #29
0
def getPlatform(filename):
    prefix = string.split(export.findFilename(filename),'.')[0]
    array_type = string.split(prefix,'_')[1]
    if array_type != 'RNASeq':
        array_type = string.lower(array_type)
    return array_type
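
getPlatform assumes AltAnalyze's species_platform filename convention: the platform is the token after the first underscore in the prefix before the first period. A standalone sketch of the same parsing rule (the filenames are hypothetical; os.path.basename stands in for export.findFilename):

import os

def get_platform(filename):
    ### '<Species>_<Platform>_...' before the first '.', e.g. 'Hs_RNASeq_run1.txt'
    prefix = os.path.basename(filename).split('.')[0]
    array_type = prefix.split('_')[1]
    if array_type != 'RNASeq':
        array_type = array_type.lower()  ### 'RNASeq' keeps its capitalization
    return array_type

print(get_platform('/data/Hs_RNASeq_run1.txt'))  # RNASeq
print(get_platform('/data/Mm_Exon_liver.txt'))   # exon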
Example #30
0
def parseJunctionEntries(bam_dir,multi=False, Species=None):
    global bam_file
    global splicesite_db
    global IndicatedSpecies
    IndicatedSpecies = Species
    bam_file = bam_dir
    try: splicesite_db,chromosomes_found = retreiveAllKnownSpliceSites()
    except Exception: splicesite_db={}; chromosomes_found={}
    start = time.time()
    
    try: import collections; junction_db=collections.OrderedDict()
    except Exception:
        try: import ordereddict; junction_db = ordereddict.OrderedDict()
        except Exception: junction_db={}
    original_junction_db = copy.deepcopy(junction_db)
    
    bamf = pysam.Samfile(bam_dir, "rb" )
    ### Is there an indexed .bai file for the BAM? Check.
    try:
        for entry in bamf.fetch():
            codes = map(lambda x: x[0],entry.cigar)
            break
    except Exception:
        ### Make BAM Index
        if multi == False:
            print 'Building BAM index file for', bam_dir
        bam_dir = str(bam_dir)
        #On Windows, this indexing step will fail if the __init__ pysam file line 51 is not set to - catch_stdout = False
        pysam.index(bam_dir)
        bamf = pysam.Samfile(bam_dir, "rb" )

    chromosome = False
    chromosomes={}
    count=0
    jid = 1
    prior_jc_start=0
    l1 = None; l2=None
    o = open (string.replace(bam_dir,'.bam','__junction.bed'),"w")
    o.write('track name=junctions description="TopHat junctions"\n')
    export_isoform_models = False
    if export_isoform_models:
        io = open (string.replace(bam_dir,'.bam','__isoforms.txt'),"w")
    isoform_junctions = copy.deepcopy(junction_db)
    outlier_start = 0; outlier_end = 0; read_count = 0; c=0
    for entry in bamf.fetch():
      try: cigarstring = entry.cigarstring
      except Exception:
          codes = map(lambda x: x[0],entry.cigar)
          if 3 in codes: cigarstring = 'N'
          else: cigarstring = None
    
      if cigarstring != None:
        if 'N' in cigarstring: ### Hence a junction
            """
            if entry.cigar[0][1]<60 and entry.cigar[0][1]>20:
                if count<310:
                    a1 = entry.seq[entry.cigar[0][1]-5:entry.cigar[0][1]]
                    a2 = entry.seq[entry.cigar[0][1]:entry.cigar[0][1]+6]
                    if l1==a1 and l2==a2: continue
                    else:
                        print entry.opt('XS'), a1,a2, entry.seq
                        l1 = a1; l2 = a2
                else: sys.exit()
            """
            if prior_jc_start == 0: pass
            elif (entry.pos-prior_jc_start) > 5000 or bamf.getrname( entry.rname ) != chromosome: ### New chr or far from prior reads
                writeJunctionBedFile(junction_db,jid,o)
                #writeIsoformFile(isoform_junctions,io)
                junction_db = copy.deepcopy(original_junction_db) ### Re-set this object
                jid+=1

            chromosome = bamf.getrname( entry.rname )
            chromosomes[chromosome]=[] ### keep track
            X=entry.pos
            Y=entry.pos+entry.alen
            prior_jc_start = X
            """
            if entry.is_reverse:
                strand = '-' ### This is the strand the seq aligns to but not necessarily the REAL strand the mRNA aligns to (see XS below)
            else:                
                strand = '+' """
            try: tophat_strand = entry.opt('XS') ### TopHat knows which sequences are likely real splice sites so it assigns a real strand to the read
            except Exception:
                #if multi == False:  print 'No TopHat strand information';sys.exit()
                tophat_strand = None
            coordinates,up_to_intron_dist = getSpliceSites(entry.cigar,X)
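            ### each 'N' (skipped region/intron) operation in the CIGAR should yield one
            ### (five_prime_ss,three_prime_ss) pair of genomic splice-site coordinates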

            for (five_prime_ss,three_prime_ss) in coordinates:
                jc = five_prime_ss,three_prime_ss
                #print X, Y, jc, entry.cigarstring, entry.cigar
                try: junction_db[chromosome,jc,tophat_strand].append([X,Y,up_to_intron_dist])
                except Exception: junction_db[chromosome,jc,tophat_strand] = [[X,Y,up_to_intron_dist]]
                
            if export_isoform_models:
                try:
                    mate = bamf.mate(entry) #https://groups.google.com/forum/#!topic/pysam-user-group/9HM6nx_f2CI
    
                    if 'N' in mate.cigarstring:
                        mate_coordinates,mate_up_to_intron_dist = getSpliceSites(mate.cigar,mate.pos)
                    else: mate_coordinates=[]
                except Exception: mate_coordinates=[]
                #print coordinates,mate_coordinates
                junctions = map(lambda x: tuple(x),coordinates)
                if len(mate_coordinates)>0:
                    try:
                        isoform_junctions[chromosome,tuple(junctions),tophat_strand].append(mate_coordinates)
                    except Exception:
                        isoform_junctions[chromosome,tuple(junctions),tophat_strand] = [mate_coordinates]
                else:
                    if (chromosome,tuple(junctions),tophat_strand) not in isoform_junctions:
                        isoform_junctions[chromosome,tuple(junctions),tophat_strand] = []
                
            count+=1
    writeJunctionBedFile(junction_db,jid,o) ### One last read-out
    if multi == False:
        print time.time()-start, 'seconds required to parse the BAM file'
    o.close()
    bamf.close()
    
    missing_chromosomes=[]
    for chr in chromosomes_found:
        if chr not in chromosomes:
            chr = string.replace(chr,'chr','')
            if chr not in chromosomes: ### re-check against the BAM chromosomes without the 'chr' prefix
                if chr != 'M' and chr != 'MT':
                    missing_chromosomes.append(chr)
    #missing_chromosomes = ['A','B','C','D']
    try: bam_file = export.findFilename(bam_file)
    except Exception: pass
    return bam_file, missing_chromosomes
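
A hedged usage sketch (the BAM path is hypothetical; the call assumes pysam plus the module's export and splice-site helpers are importable):

### Writes <sample>__junction.bed alongside the BAM, indexing it first if no .bai exists
bam_name, missing_chromosomes = parseJunctionEntries('/data/sample1.bam', multi=False, Species='Hs')
if len(missing_chromosomes) > 0:
    print('In the splice-site database but absent from the BAM:')
    print(missing_chromosomes)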
Example #32
0
def filepath(filename, force=None):
    altDatabaseCheck = True
    #dir=os.path.dirname(dirfile.__file__)       #directory file is input as a variable under the main
    dir = application_path
    """
    if os.path.isfile(filename):
        fn = filename
        return fn
    elif os.path.isfile(dir+'/'+filename):
        fn = filename
        return fn
    #"""
    """ If a local file without the full path (e.g., Config/options.txt). Checks in the software directory."""
    import export
    parent_dir = export.findParentDir(filename)
    actual_file = export.findFilename(filename)
    try:
        #if os.path.exists(dir+'/'+parent_dir):
        dir_list = os.listdir(dir + '/' + parent_dir)
        fn = dir + '/' + parent_dir + '/' + actual_file
        if '.txt' in fn or '.log' in fn:
            return fn
    except:
        pass

    if filename == '':  ### Windows will actually recognize '' as the AltAnalyze root in certain situations but not others
        fn = dir
    elif ':' in filename:
        fn = filename
    else:
        try:
            try:
                dir_list = os.listdir(dir + '/' + filename)
                fn = dir + '/' + filename
            except:
                dir_list = os.listdir(filename)
                fn = filename  ### test to see if the path can be found (then it is the full path)
        except Exception:
            fn = os.path.join(dir, filename)
            fileExists = os.path.isfile(fn)
            #print 'filename:',filename, fileExists
            """"When AltAnalyze installed through pypi - AltDatabase and possibly Config in user-directory """
            if 'Config' in fn:
                if fileExists == False and force != 'application-path' and ignoreHome == False:
                    fn = os.path.join(userHomeDir, filename)
            if 'AltDatabase' in fn:
                getCurrentGeneDatabaseVersion()
                fn = correctGeneDatabaseDir(fn)
                altanalyze_dir = string.split(fn,
                                              'AltDatabase')[0] + 'AltDatabase'
                ### Check the AltDatabase dir not the fn, since the fn may not exist yet
                fileExists = os.path.isfile(altanalyze_dir)
                try:
                    dir_list = os.listdir(altanalyze_dir)
                    fileExists = True
                except Exception:
                    pass
                #print 2, [fn],fileExists
                if fileExists == False and ignoreHome == False:
                    fn = os.path.join(userHomeDir, filename)
                    fn = correctGeneDatabaseDir(fn)
                altDatabaseCheck = False

    if '/Volumes/' in filename and altDatabaseCheck:
        filenames = string.split(filename, '/Volumes/')
        fn = '/Volumes/' + filenames[-1]
    for py2app_dir in py2app_dirs:
        fn = string.replace(fn, py2app_dir, '')
    if (('Databases' in fn) or ('AltDatabase' in fn)) and altDatabaseCheck:
        getCurrentGeneDatabaseVersion()
        fn = correctGeneDatabaseDir(fn)
    fn = string.replace(fn, '.txt.txt', '.txt')
    fn = string.replace(fn, '//', '/')
    fn = string.replace(fn, '//', '/')  ### If /// present
    return fn
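
A hedged sketch of a typical call (the relative name follows AltAnalyze's Config convention, but the resolved path is hypothetical; application_path, userHomeDir, and ignoreHome are module globals set elsewhere):

### Relative names resolve against the application directory first; for PyPI
### installs, Config and AltDatabase fall back to the user home directory
fn = filepath('Config/options.txt')
print(fn)  # e.g. <application_path>/Config/options.txt on a source checkout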
Example #33
0
def associateQueryGenesWithInteractions(query_db,query_interactions,dir_file):
    suffix=''
    if dir_file!=None:
        if len(dir_file)!=0:
            suffix='-'+intNameShort+'_'+export.findFilename(dir_file)[:-4]
    if len(suffix)==0:
        try: suffix = '_'+FileName
        except Exception: None
    file_name = 'AltAnalyze-network'+suffix
    
    query_interactions_unique={}
    interacting_genes={}
    connections = 1
    primary=0
    secondary=0
    terciary=0
    for ensemblGene in query_db:
        if ensemblGene in interaction_db:
            for interacting_ensembl in interaction_db[ensemblGene]:
                if interacting_ensembl not in blackList:
                    ###Only allow direct interactions found in query
                    if interacting_ensembl in query_db:
                        try: query_interactions[ensemblGene].append(interacting_ensembl)
                        except KeyError: query_interactions[ensemblGene] = [interacting_ensembl]
                        try: query_interactions[interacting_ensembl].append(ensemblGene)
                        except KeyError: query_interactions[interacting_ensembl] = [ensemblGene]
                        primary+=1
                    if degrees == 2 or degrees == 'indirect':
                        try: interacting_genes[interacting_ensembl].append(ensemblGene)
                        except KeyError: interacting_genes[interacting_ensembl] = [ensemblGene]
                    elif degrees == 'allInteracting' or degrees == 'all possible':
                        try: query_interactions[ensemblGene].append(interacting_ensembl)
                        except KeyError: query_interactions[ensemblGene] = [interacting_ensembl]
                    if interacting_ensembl in secondaryQueryIDs: ### IDs in the expression file
                        secondary+=1 ### When indirect degrees selected, no additional power added by this (only for direct or shortest path)
                        try: query_interactions[ensemblGene].append(interacting_ensembl)
                        except KeyError: query_interactions[ensemblGene] = [interacting_ensembl]    
        if ensemblGene in second_degree_obligatory:
            for interacting_ensembl in second_degree_obligatory[ensemblGene]:
                try: interacting_genes[interacting_ensembl].append(ensemblGene)
                except KeyError: interacting_genes[interacting_ensembl] = [ensemblGene]

    ### Include indirect interactions to secondaryQueryIDs from the expression file
    if degrees == 2 or degrees == 'indirect':
        for ensemblGene in secondaryQueryIDs:
            if ensemblGene in interaction_db:
                for interacting_ensembl in interaction_db[ensemblGene]:
                    if interacting_ensembl not in blackList:
                        try:
                            interacting_genes[interacting_ensembl].append(ensemblGene)
                            terciary+=1#; print interacting_ensembl
                        except KeyError: None ### Only increase the interacting_genes count if the interacting partner is present from the primary query list
    #print primary,secondary,terciary
    
    ### Report the number of unique interacting genes
    for interacting_ensembl in interacting_genes:
        if len(interacting_genes[interacting_ensembl])==1:
            interacting_genes[interacting_ensembl] = 1
        else:
            unique_interactions = unique.unique(interacting_genes[interacting_ensembl])
            interacting_genes[interacting_ensembl] = len(unique_interactions)
    
    query_indirect_interactions={}; indirect_interacting_gene_list=[]; interacting_gene_list=[]; added=[] 
    if degrees=='shortestPath' or degrees=='shortest path': ### Typically identifying the single smallest path(s) between two nodes.
        query_indirect_interactions, indirect_interacting_gene_list, interacting_gene_list = evaluateShortestPath(query_db,interaction_db,10)
        
    else:
        if degrees==2 or degrees=='indirect' or len(secondDegreeObligatoryCategories)>0:
            for ensembl in interacting_genes:
                if interacting_genes[ensembl] > connections:
                    if ensembl in interaction_db: ### Only nodes removed due to promiscuity will not be found
                        for interacting_ensembl in interaction_db[ensembl]:
                            if interacting_ensembl in query_db or interacting_ensembl in secondaryQueryIDs:
                                try: query_indirect_interactions[interacting_ensembl].append(ensembl)
                                except KeyError: query_indirect_interactions[interacting_ensembl] = [ensembl]
                        ###Record the highest linked nodes
                        indirect_interacting_gene_list.append((interacting_genes[ensembl],ensembl)) 
        if len(obligatory_interactions)>0: ### Include always
            all_reported_genes = combineDBs(query_interactions,query_indirect_interactions) ### combinesDBs and returns a unique list of genes
            for ensemblGene in all_reported_genes: ###This only includes genes in the original input list
                if ensemblGene in obligatory_interactions:
                    for interacting_ensembl in obligatory_interactions[ensemblGene]:
                        #symbol = ensembl_symbol_db[ensemblGene]                    
                        try: query_interactions[ensemblGene].append(interacting_ensembl)
                        except KeyError: query_interactions[ensemblGene] = [interacting_ensembl]
    
    z = dict(query_interactions.items() + query_indirect_interactions.items())
    interaction_restricted_db={}
    for ensembl in z:
        interacting_nodes = z[ensembl]
        for node in interacting_nodes:
            if ensembl in interaction_restricted_db:
                db = interaction_restricted_db[ensembl]
                db[node] = 1
            else: interaction_restricted_db[ensembl] = {node:1}

            if node in interaction_restricted_db:
                db = interaction_restricted_db[node]
                db[ensembl] = 1
            else: interaction_restricted_db[node] = {ensembl:1}
            
    if degrees==2 or degrees=='indirect': ### get rid of non-specific interactions
        query_indirect_interactions, indirect_interacting_gene_list, interacting_gene_list = evaluateShortestPath(query_db,interaction_restricted_db,4)
        
    ###Record the highest linked nodes
    for ensembl in query_interactions:
        linked_nodes = len(unique.unique(query_interactions[ensembl]))
        interacting_gene_list.append((linked_nodes,ensembl))
    interacting_gene_list.sort(); interacting_gene_list.reverse()
    indirect_interacting_gene_list.sort();  indirect_interacting_gene_list.reverse()
    
    print "Length of query_interactions:",len(query_interactions)
    query_interactions_unique=[]
    for gene1 in query_interactions:
        for gene2 in query_interactions[gene1]:
            temp = []; temp.append(gene2); temp.append(gene1)#; temp.sort()
            if gene1 == gene2: interaction_type = 'self'
            else: interaction_type = 'distinct'
            temp.append(interaction_type); temp.reverse()
            query_interactions_unique.append(temp)
    for gene1 in query_indirect_interactions:
        for gene2 in query_indirect_interactions[gene1]:
            temp = []; temp.append(gene2); temp.append(gene1)#; temp.sort()
            if gene1 == gene2: interaction_type = 'self'
            else: interaction_type = 'indirect'
            temp.append(interaction_type); temp.reverse()
            query_interactions_unique.append(temp)
    query_interactions_unique = unique.unique(query_interactions_unique)
    query_interactions_unique.sort()
    

    ###Write out nodes linked to many other nodes
    new_file = outputDir+'/networks/'+file_name+ '-interactions_'+str(degrees)+'_degrees_summary.txt'
    data = export.ExportFile(new_file)
    for (linked_nodes,ensembl) in interacting_gene_list:
        try: symbol = query_db[ensembl]
        except KeyError: symbol = ensembl_symbol_db[ensembl]
        data.write(str(linked_nodes)+'\t'+ensembl+'\t'+symbol+'\t'+'direct'+'\n')
    for (linked_nodes,ensembl) in indirect_interacting_gene_list:
        try: symbol = query_db[ensembl]
        except KeyError:
            try: symbol = ensembl_symbol_db[ensembl]
            except KeyError: symbol = ensembl
            if 'HMDB' in symbol:
                try: symbol = hmdb_symbol_db[ensembl]
                except Exception: pass
        data.write(str(linked_nodes)+'\t'+ensembl+'\t'+symbol+'\t'+'indirect'+'\n')
    data.close()

    regulated_gene_db = query_db    
    sif_export,symbol_pair_unique = exportInteractionData(file_name,query_interactions_unique,regulated_gene_db)
    return sif_export,symbol_pair_unique
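
The unique-interaction step above reduces each gene pair to a (type, geneA, geneB) triple, marking self-loops separately from distinct pairs. A minimal sketch of that reduction with hypothetical Ensembl IDs:

### Hypothetical adjacency: gene -> interacting partners (may contain repeats)
query_interactions = {'ENSG01': ['ENSG02', 'ENSG01'], 'ENSG02': ['ENSG01']}

triples = set()
for gene1 in query_interactions:
    for gene2 in query_interactions[gene1]:
        interaction_type = 'self' if gene1 == gene2 else 'distinct'
        triples.add((interaction_type, gene1, gene2))

for triple in sorted(triples):
    print(triple)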
Example #34
0
def compareImportedTables(file_list,outputDir,importDir=False,considerNumericDirection=False,display=True):
    ### added for AltAnalyze
    print 'Creating Venn Diagram from input files...'
    import UI
    import export
    file_id_db={}
    file_list2=[]
    for file in file_list:
        x=0
        if '.txt' in file:
            if importDir !=False: ### When all files in a directory are analyzed
                fn=UI.filepath(importDir+'/'+file)
            else:
                fn = file
                file = export.findFilename(fn) ### Only report the actual filename
            file_list2.append(file)
            for line in open(fn,'rU').xreadlines():
                if x == 0:
                    data_type = examineFields(line)
                    x+=1
                else:
                    data = UI.cleanUpLine(line)
                    t = string.split(data,'\t')
                    uid = t[0]
                    valid = True
                    if data_type != 'first':
                        if data_type == 'comparison':
                            score = float(string.split(t[6],'|')[0])
                            if 'yes' not in t[5]:
                                valid = False ### not replicated independently
                        if data_type == 'reciprocal':
                            uid = t[8]+'-'+t[10]
                            score = float(t[1])
                        if data_type == 'single':
                            uid = t[6]
                            score = float(t[1])
                    else:
                        try:
                            score = float(t[1]) #t[2]
                        except Exception: score = None
                    if score != None and considerNumericDirection: ### change the UID so that it only matches if the same direction
                        if score>0:
                            uid+='+' ### encode the ID with a positive sign
                        else:
                            uid+='-' ### encode the ID with a negative sign
                    #if score>0:
                    if valid:
                        try: file_id_db[file].append(uid)
                        except Exception: file_id_db[file] = [uid]
                    
    id_lists=[]
    new_file_list=[]
    for file in file_list2: ### Use the sorted names
        if file in file_id_db:
            uids = file_id_db[file]
            id_lists.append(uids)
            new_file_list.append(file)
            #print file, len(new_file_list), len(uids)
            
    if len(file_id_db):
        if len(new_file_list)==2 or len(new_file_list)==3:
            SimpleMatplotVenn(new_file_list,id_lists,outputDir=outputDir,display=False) ### display both below
        venn(id_lists, new_file_list, fill="number", show_names=False, outputDir=outputDir, show_plot=display)
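
When considerNumericDirection is set, compareImportedTables suffixes each ID with the sign of its score, so two files only share an event when the fold direction also agrees. A minimal sketch of that encoding (hypothetical IDs and scores):

def encode_uid(uid, score):
    ### Suffix the fold-change sign: 'ENSG01+' never matches 'ENSG01-'
    if score > 0:
        return uid + '+'
    return uid + '-'

print(encode_uid('ENSG01', 1.5))   # ENSG01+
print(encode_uid('ENSG01', -0.7))  # ENSG01-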
Example #35
0
def importInteractionDatabases(interactionDirs):
    """ Import multiple interaction format file types (designated by the user) """
    exclude=[]
    for file in interactionDirs:
        status = verifyFile(file)
        if status == 'not found':
            exclude.append(file)
    for i in exclude:
        interactionDirs.remove(i)
        
    for fn in interactionDirs:    #loop through each file in the directory to output results
        x=0; imported=0; stored=0
        file = export.findFilename(fn)
        count=0
        print "Parsing interactions from:",file
        for line in open(fn,'rU').xreadlines():
            data = cleanUpLine(line)
            t = string.split(data,'\t')
            count+=1
            if x==0: x=1
            #elif 'PAZAR' in data or 'Amadeus' in data:x+=0
            else:
                obligatory = False
                imported+=1
                proceed = True
                source=''
                interaction_type = 'interaction'
                try:
                    symbol1,interaction_type, symbol2, ensembl1,ensembl2,source = t
                    ens_ls1=[ensembl1]; ens_ls2=[ensembl2]
                    if 'HMDB' in ensembl1:
                        ensembl1 = string.replace(ensembl1,' ','') ### HMDB ID sometimes preceded by ' '
                        symbol_hmdb_db[symbol1]=ensembl1
                        hmdb_symbol_db[ensembl1] = symbol1
                        interaction_type = 'Metabolic'
                    if 'HMDB' in ensembl2:
                        ensembl2 = string.replace(ensembl2,' ','') ### HMDB ID sometimes preceded by ' '
                        symbol_hmdb_db[symbol2]=ensembl2
                        hmdb_symbol_db[ensembl2] = symbol2
                        interaction_type = 'Metabolic'
                except Exception:
                    try:
                        ensembl1,ensembl2,symbol1,symbol2,interaction_type=t
                        if ensembl1 == '':
                            try:
                                ens_ls1 = symbol_ensembl_db[symbol1]
                                ens_ls2 = symbol_ensembl_db[symbol2]
                            except Exception: None
                    except Exception:
                        proceed = False
                if proceed: ### If the interaction data conformed to one of the two above types (typically two valid interacting gene IDs)
                    if (len(ens_ls1)>0 and len(ens_ls2)>0):
                        secondary_proceed = True
                        stored+=1
                        for ensembl1 in ens_ls1:
                            for ensembl2 in ens_ls2:
                                """
                                if (ensembl1,ensembl2) == ('ENSG00000111704','ENSG00000152284'):
                                    print t;sys.exit()
                                if (ensembl1,ensembl2) == ('ENSG00000152284','ENSG00000111704'):
                                    print t;sys.exit()
                                """
                                if 'WikiPathways' in file or 'KEGG' in file:
                                    if ensembl2 != ensembl1:
                                        if (ensembl2,ensembl1) in interaction_annotation_dbase:
                                            del interaction_annotation_dbase[(ensembl2,ensembl1)]
                                            ### Exclude redundant entries with fewer interaction details (e.g., arrow direction BIOGRID) - overwrite with the opposite gene arrangement below
                                        if (ensembl1,ensembl2) in interaction_annotation_dbase:
                                            if interaction_annotation_dbase[(ensembl1,ensembl2)].InteractionType() !='physical':
                                                secondary_proceed = False ### Don't overwrite a more informative annotation like transcriptional regulation or microRNA targeting
                                if 'DrugBank' in fn:
                                    source = 'DrugBank'
                                    interaction_type = 'drugInteraction'
                                    obligatory=True
                                    ensembl1, ensembl2 = ensembl2, ensembl1 ### switch the order of these (drugs reported as first ID and gene as the second)
                                
                                if secondary_proceed:
                                    z = InteractionInformation(ensembl1,ensembl2,source,interaction_type)
                                    interaction_annotation_dbase[ensembl1,ensembl2] = z
                                    #z = InteractionInformation(ensembl2,ensembl1,source,interaction_type)
                                    #interaction_annotation_dbase[ensembl2,ensembl1] = z
                                    try: interaction_db[ensembl1][ensembl2]=1
                                    except KeyError: db = {ensembl2:1}; interaction_db[ensembl1] = db ###weight of 1 (weights currently not-supported)
                                    try: interaction_db[ensembl2][ensembl1]=1
                                    except KeyError: db = {ensembl1:1}; interaction_db[ensembl2] = db ###weight of 1 (weights currently not-supported)
                                
                                if obligatory and source in obligatoryList: ### Include these in the final pathway if linked to any input node (e.g., miRNAs, drugs)
                                    try: obligatory_interactions[ensembl1][ensembl2]=1
                                    except KeyError: db = {ensembl2:1}; obligatory_interactions[ensembl1] = db ###weight of 1 (weights currently not supported)
                                elif source in secondDegreeObligatoryCategories:
                                    try: second_degree_obligatory[ensembl1][ensembl2]=1
                                    except KeyError: db = {ensembl2:1}; second_degree_obligatory[ensembl1] = db ###weight of 1 (weights currently not-supported)
                                    
                else:
                    proceed = False
                    try:
                        ID1, null, ID2 = t
                        proceed = True
                    except Exception:
                        try:
                            ID1, ID2 = t
                            proceed = True
                        except Exception:
                            None
                            
                    if proceed:
                        if 'microRNATargets' in fn:
                            if 'mir' in ID2: prefix = 'MIR'
                            else: prefix = 'LET'
                            ID2=prefix+string.split(ID2,'-')[2] ### Ensembl naming convention
                            source = 'microRNATargets'
                            interaction_type = 'microRNAInteraction'
                            obligatory=True
                        try: ID_ls1 = symbol_ensembl_db[ID1]
                        except Exception: ID_ls1 = [ID1]
                        try: ID_ls2 = symbol_ensembl_db[ID2]
                        except Exception: ID_ls2 = [ID2]
                        """if 'microRNATargets' in fn:
                            if '*' not in ID2: print ID_ls2;sys.exit()"""
                        addInteractions = True
                        for ID1 in ID_ls1:
                            for ID2 in ID_ls2:
                                z = InteractionInformation(ID2,ID1,source,interaction_type)
                                interaction_annotation_dbase[ID2,ID1] = z ### This is the interaction direction that is appropriate
                                try: interaction_db[ID1][ID2]=1
                                except KeyError: db = {ID2:1}; interaction_db[ID1] = db ###weight of 1 (weights currently not supported)
                                try: interaction_db[ID2][ID1]=1
                                except KeyError: db = {ID1:1}; interaction_db[ID2] = db ###weight of 1 (weights currently not supported)
                                    
                                if source in secondDegreeObligatoryCategories:
                                    try: second_degree_obligatory[ID1][ID2]=1
                                    except KeyError: db = {ID2:1}; second_degree_obligatory[ID1] = db ###weight of 1 (weights currently not supported)
                                    
                                elif obligatory and source in obligatoryList: ### Include these in the final pathway if linked to any input node (e.g., miRNAs, drugs)
                                    try: obligatory_interactions[ID1][ID2]=1
                                    except KeyError: db = {ID2:1}; obligatory_interactions[ID1] = db ###weight of 1 (weights currently not supported)
                           
    ### Evaluate the most promiscuous interactors (e.g., UBC)
    remove_list=[]
    for ID in interaction_db:
        if len(interaction_db[ID])>2000:
            remove_list.append(ID)
            #print len(interaction_db[ID]),ensembl_symbol_db[ID]
    for ID in remove_list:
        #print 'removing', ID
        del interaction_db[ID]
        blackList[ID] = [] ### blacklist each removed promiscuous interactor

    print 'Imported interactions:',len(interaction_annotation_dbase)
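
A minimal sketch of the promiscuity filter that ends the import (the >2000-partner threshold is from the code above; the toy network and hub name are hypothetical):

### Hypothetical network in which hub 'UBC' touches everything
interaction_db = {'UBC': dict(('G%d' % i, 1) for i in range(2500)), 'G1': {'UBC': 1}}
blackList = {}

remove_list = [ID for ID in interaction_db if len(interaction_db[ID]) > 2000]
for ID in remove_list:
    del interaction_db[ID]
    blackList[ID] = []  ### excluded from all later network walks

print(sorted(blackList))  # ['UBC']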
Example #36
0
def buildInteractions(species,Degrees,inputType,inputDir,outputdir,interactionDirs,Genes=None,
                      geneSetType=None,PathwayFilter=None,OntologyID=None,directory=None,expressionFile=None,
                      obligatorySet=None,secondarySet=None,IncludeExpIDs=False):
    
    global degrees
    global outputDir
    global inputDataType
    global obligatoryList ### Add these if connected to anything
    global secondaryQueryIDs
    global secondDegreeObligatoryCategories ### Add if common to anything in the input - Indicates systems to apply this to
    global symbol_hmdb_db; symbol_hmdb_db={}; global hmdb_symbol_db; hmdb_symbol_db={} ### Create an annotation database for HMDB IDs
    global FileName
    global intNameShort
    secondaryQueryIDs = {}
    degrees = Degrees
    outputDir = outputdir
    inputDataType = inputType
    obligatoryList = obligatorySet
    secondDegreeObligatoryCategories=[]
    intNameShort=''
    if obligatoryList == None:
        obligatoryList=[]
    if expressionFile == None:
        expressionFile = inputDir ### If it doesn't contain expression values, view as yellow nodes
    if secondarySet!= None and (degrees==1 or degrees=='direct'): ### If degrees == 2, this is redundant
        ### This currently adds a lot of predictions - either make this more stringent or exclude it for now
        secondDegreeObligatoryCategories = secondarySet
    if PathwayFilter != None:
        if len(PathwayFilter)==1:
            FileName = PathwayFilter[0]
        if isinstance(PathwayFilter, tuple) or isinstance(PathwayFilter, list):
            FileName = string.join(list(PathwayFilter),' ')
            FileName = string.replace(FileName,':','-')
        else:
            FileName = PathwayFilter
        if len(FileName)>40:
            FileName = FileName[:40]
    elif OntologyID != None: FileName = OntologyID
    elif Genes != None: FileName = Genes
    
    ### Import Ensembl-Symbol annotations
    getEnsemblGeneData('AltDatabase/ensembl/'+species+'/'+species+'_Ensembl-annotations.txt')
    if len(interactionDirs[0]) == 1: interactionDirs = [interactionDirs] ### a bare path string was passed (its first element is a single character); wrap it in a list
    ### Import interaction databases indicated in interactionDirs
    for i in interactionDirs:
        print i
        i = export.findFilename(i)
        i=string.split(i,'-')[1]
        intNameShort+=i[0]

    importInteractionData(interactionDirs)
    getHMDBData(species) ### overwrite the symbol annotation from any HMDB that comes from a WikiPathway or KEGG pathway that we also include (for consistent official annotation) 
    
    input_IDs = getGeneIDs(Genes)
    try:
        if isinstance(PathwayFilter, tuple):
            for pathway in PathwayFilter:
                IDs = gene_associations.simpleGenePathwayImport(species,geneSetType,pathway,OntologyID,directory)
                for id in IDs:input_IDs[id]=None
        else:
            input_IDs = gene_associations.simpleGenePathwayImport(species,geneSetType,PathwayFilter,OntologyID,directory)
    except Exception: None
    if expressionFile == None or len(expressionFile)==0:
        expressionFile = exportSelectedIDs(input_IDs) ### create an expression file
    elif IncludeExpIDs: ### Prioritize selection of IDs for interactions WITH the primary query set (not among expression input IDs)
        secondaryQueryIDs = importqueryResults(species,expressionFile,{})[0]
    input_IDs,query_interactions,dir_file = importqueryResults(species,inputDir,input_IDs)
    sif_file,symbol_pair_unique = associateQueryGenesWithInteractions(input_IDs,query_interactions,dir_file)
    output_filename = exportGraphImage(species,sif_file,expressionFile)
    return output_filename
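
A hedged invocation sketch (every path and argument value below is hypothetical; interaction filenames must contain a '-' because intNameShort is built from the token after it, and the species annotation files must already be installed):

### Hypothetical direct-interaction network for three query genes
png = buildInteractions('Hs', 'direct', 'IDs', '/data/input/genes.txt',
                        '/data/output', ['/data/interactions/Hs-BioGRID.txt'],
                        Genes='SOX2 POU5F1 NANOG', IncludeExpIDs=False)
print(png)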
Example #38
0
def importGeneExpressionValues(filename, tissue_specific_db, translation_db):
    ### Import gene-level expression raw values
    fn = filepath(filename)
    x = 0
    genes_added = {}
    gene_expression_db = {}
    dataset_name = export.findFilename(filename)
    print 'importing:', dataset_name
    for line in open(fn, 'rU').xreadlines():
        data = cleanUpLine(line)
        t = string.split(data, '\t')

        if x == 0:
            if '#' not in data:
                for i in t[1:]:
                    sample_headers.append(i)
                x = 1
        else:
            gene = t[0]
            #if '-' not in gene and ':E' in gene: print gene;sys.exit()
            if analysis_type == 'AltExon':
                try:
                    ens_gene, exon = string.split(gene, '-')[:2]
                except Exception:
                    exon = gene
                gene = exon
            if keyed_by == 'translation':  ### alternative value is 'primaryID'
                """if gene == 'ENSMUSG00000025915-E19.3':
                    for i in translation_db: print [i], len(translation_db); break
                    print gene, [translation_db[gene]];sys.exit()"""
                try:
                    gene = translation_db[gene]  ### Ensembl annotations
                except Exception:
                    pass
            if gene in tissue_specific_db:
                index, tissue_exp = tissue_specific_db[gene]
                try:
                    genes_added[gene] += 1
                except Exception:
                    genes_added[gene] = 1
                proceed = True
                try:
                    exp_vals = map(float, t[1:])
                    if platform == 'RNASeq':
                        #if max(exp_vals)<3: proceed=False
                        exp_vals = map(lambda x: math.log(x + 1, 2), exp_vals)
                    if value_type == 'calls':  ### Hence, this is a DABG or RNA-Seq expression
                        exp_vals = produceDetectionCalls(
                            exp_vals, targetPlatform)  ### 0 or 1 calls
                    if proceed:
                        gene_expression_db[gene] = [index, exp_vals]
                except Exception:
                    print 'Formatting error encountered in:', dataset_name
                    raise ### re-raise to halt on malformed expression values

    print len(gene_expression_db), 'matching genes in the dataset and tissue compendium database'

    for gene in genes_added:
        if genes_added[gene] > 1:
            ### delete entries that are present in the input set multiple times (not trustworthy)
            del gene_expression_db[gene]
        else:
            expession_subset.append(gene_expression_db[gene]) ### These contain the rank order and expression
    #print len(expession_subset);sys.exit()
    expession_subset.sort()  ### This order now matches that of the tissue compendium rows (entries lead with the compendium index)
    gene_expression_db = []
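
Each retained gene carries the compendium row index it mapped to, so the final sort restores compendium order before downstream correlation. A minimal sketch of that alignment (hypothetical IDs and values):

### gene -> [compendium_row_index, expression_values]
gene_expression_db = {'ENSG03': [2, [5.1, 4.8]], 'ENSG01': [0, [7.2, 7.0]]}

expression_subset = sorted(gene_expression_db.values())  ### lists sort on the leading index
print(expression_subset)  # [[0, [7.2, 7.0]], [2, [5.1, 4.8]]]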
Example #39
0
def NMFAnalysis(expressionInputFile,NMFinputDir,Rank,platform,iteration=0,strategy="conservative"):

    root_dir = export.findParentDir(NMFinputDir)[:-1]
    if 'ExpressionInput' in root_dir:
        root_dir = export.findParentDir(root_dir)
    if 'NMF-SVM' in root_dir:
        root_dir = export.findParentDir(root_dir)
        
    export.findFilename(NMFinputDir)
        
    X=[]
    header=[]
    head=0
    exportnam=root_dir+'/NMF-SVM/NMF/round'+str(iteration)+'NMFsnmf_versionr'+str(Rank)+'.txt'
    export_res=export.ExportFile(exportnam)
    exportnam_bin=root_dir+'/NMF-SVM/NMF/round'+str(iteration)+'NMFsnmf_binary'+str(Rank)+'.txt'
    export_res1=export.ExportFile(exportnam_bin)
    exportnam_bint=root_dir+'/NMF-SVM/NMF/round'+str(iteration)+'NMFsnmf_binary_t_'+str(Rank)+'.txt'
    export_res5=export.ExportFile(exportnam_bint)
    MF_input = root_dir+'/NMF-SVM/ExpressionInput/exp.NMF-MarkerFinder.txt'
    export.customFileCopy(expressionInputFile,root_dir+'/NMF-SVM/ExpressionInput/exp.NMF-MarkerFinder.txt')
    export_res4=open(string.replace(MF_input,'exp.','groups.'),"w")
    export_res7=open(string.replace(MF_input,'exp.','comps.'),"w")
    exportnam2=root_dir+'/NMF-SVM/SubtypeAnalyses/round'+str(iteration)+'Metadata'+str(Rank)+'.txt'
    export_res2=export.ExportFile(exportnam2)
    exportnam3=root_dir+'/NMF-SVM/SubtypeAnalyses/round'+str(iteration)+'Annotation'+str(Rank)+'.txt'
    export_res3=export.ExportFile(exportnam3)
    #if 'Clustering' in NMFinputDir:
    #    count=1
    #    start=2
    #else:
    count=0
    start=1
    #print Rank
    for line in open(NMFinputDir,'rU').xreadlines():
        line=line.rstrip('\r\n')
        q= string.split(line,'\t')
        if head >count:
            val=[]
            val2=[]
            me=0.0
            
            for i in range(start,len(q)):
                try:
                    val2.append(float(q[i]))
                except Exception:
                    continue
            me=np.median(val2)
            for i in range(start,len(q)):
                try:
                    val.append(float(q[i]))
                except Exception:
                    val.append(float(me))
            #if q[1]==prev:
            X.append(val)
          
        else:
            export_res1.write(line)
            export_res.write(line)
            export_res1.write("\n")
            #export_res4.write(line)
            #export_res4.write("\n")
            export_res.write("\n")
            header=q
            head+=1
            continue   
    group=defaultdict(list)
        
    sh=[]
    X=np.array(X)
    #print X.shape
    mat=[]
    #mat=X
    mat=zip(*X)
    mat=np.array(mat)
    #print mat.shape
    #model = NMF(n_components=15, init='random', random_state=0)
    #W = model.fit_transform(mat)
    nmf = nimfa.Snmf(mat,seed="nndsvd", rank=int(Rank), max_iter=20,n_run=1,track_factor=False,theta=0.95)
    nmf_fit = nmf()
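    ### Note: mat is samples x genes after the zip(*X) transpose above, so the basis
    ### matrix W has one row per sample giving its loading on each of the Rank
    ### metagenes, while the coefficient matrix H is Rank x genes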
    W = nmf_fit.basis()
    W=np.array(W)
    #np.savetxt("basismatrix2.txt",W,delimiter="\t")
    H=nmf_fit.coef()
    H=np.array(H)
   # np.savetxt("coefficientmatrix2.txt",H,delimiter="\t")
    #print W.shape
    sh=W.shape
    export_res3.write("uid\tUID\tUID\n")
    if int(Rank)==2:
        par=1
    else:
        par=2
    #for i in range(sh[1]):
    #    val=W[:,i]
    #    me=np.mean(val)
    #    st=np.std(val)
    #    export_res2.write(header[i+1])
    #    for j in range(sh[0]):
    #        if float(W[i][j])>=float(me+(par*st)):
    #          
    #            export_res2.write("\t"+str(1))
    #        else:
    #            export_res2.write("\t"+str(0))
    #       
    #    export_res2.write("\n")
    if platform != 'PSI':
        sh=W.shape
        Z=[]
        export_res5.write("uid")
        export_res2.write("uid")
        for i in range(sh[1]):
            
            export_res5.write("\t"+'V'+str(i))
            export_res2.write("\t"+'V'+str(i))
            export_res3.write('V'+str(i)+"\t"+"Covariate"+"\t"+str(1)+"\n")
            
        export_res5.write("\n")
        export_res2.write("\n")
        export_res3.write("\n")
        for i in range(sh[0]):
            new_val=[]
            val=W[i,:]
            export_res2.write(header[i+1])
            export_res5.write(header[i+1])
            export_res4.write(header[i+1])
            flag=True
            for j in range(sh[1]):
                if W[i][j]==max(val) and flag:
                    export_res5.write("\t"+str(1))
                    export_res2.write("\t"+str(1))
                    new_val.append(1)
                    export_res4.write("\t"+str(j+1)+"\t"+'V'+str(j))
                    flag=False
                else:
                    export_res5.write("\t"+str(0))
                    export_res2.write("\t"+str(0))
                    new_val.append(0)
                
            Z.append(new_val)
            export_res5.write("\n")
            export_res2.write("\n")
            export_res4.write("\n")
        W=zip(*W)
        W=np.array(W)
        sh=W.shape
        Z=zip(*Z)
        Z=np.array(Z)
        for i in range(sh[0]):
            export_res.write('V'+str(i))
            export_res1.write('V'+str(i))
            for j in range(sh[1]):
                export_res.write("\t"+str(W[i][j]))
                export_res1.write("\t"+str(Z[i][j]))
            export_res.write("\n")
            export_res1.write("\n")
            
        export_res.close()
        export_res1.close()
        export_res2.close()
        export_res3.close()
        export_res4.close()
        export_res5.close()
        Orderedheatmap.Classify(exportnam_bint)    
        
        return exportnam,exportnam_bin,exportnam2,exportnam3
    
    else:
        W=zip(*W)
        W=np.array(W)
        sh=W.shape
        Z=[]
        for i in range(sh[0]):
            new_val=[]
            val=W[i,:]
            num=sum(v > 0.10 for v in val) ### count of samples above the 0.10 threshold
            if num>40 or num<3:
                compstd=True
            else:
                compstd=False
            me=np.mean(val)
            st=np.std(val)
            #print 'V'+str(i)
            export_res.write('V'+str(i))
            export_res1.write('V'+str(i))
           
            for j in range(sh[1]):
                
                if compstd:   
                    if float(W[i][j])>=float(me+(par*st)):
                    
                        export_res1.write("\t"+str(1))
                        new_val.append(1)
                    else:
                        export_res1.write("\t"+str(0))
                        new_val.append(0)
                else:
                    if float(W[i][j])>0.1:
                    
                        export_res1.write("\t"+str(1))
                        new_val.append(1)
                    else:
                        export_res1.write("\t"+str(0))
                        new_val.append(0)
                export_res.write("\t"+str(W[i][j]))
                
            Z.append(new_val)
            export_res.write("\n")
            export_res1.write("\n")
       # Z=zip(*Z)
        Z=np.array(Z)
        sh=Z.shape
        Z_new=[]
        val1=[]
        Z1=[]
        dellst=[]
        export_res2.write("uid")
        export_res5.write("uid")
        for i in range(sh[0]):
            indices=[]
            val1=Z[i,:]
            sum1=sum(val1)
            flag=False
            indices=[index for index, value in enumerate(val1) if value == 1]
            for j in range(sh[0]):
                val2=[]
                
                if i!=j:
                    val2=Z[j,:]
                    
                    sum2=sum([val2[x] for x in indices])
                    summ2=sum(val2)
                    try:
                        if float(sum2)/float(sum1)>0.5:
                            if summ2>sum1:
                                flag=True
                                #print str(i)
                    except Exception:
                        continue
            if flag==False:
    
                Z1.append(val1)
                export_res2.write("\t"+'V'+str(i))
                export_res5.write("\t"+'V'+str(i))
                export_res3.write('V'+str(i)+"\t"+"Covariate"+"\t"+str(1)+"\n")
        
        export_res2.write("\n")
        export_res5.write("\n")
        Z1=np.array(Z1)
        Z=Z1
        Z=zip(*Z)
        Z=np.array(Z)
        sh=Z.shape
            
        for i in range(sh[0]):
            val1=Z[i,:]
            #print sum(val1)
            #if sum(val)>2: 
            if sum(val1)>2:
                val=[0 if x==1 else x for x in val1]
            else:
                val=val1
            me=np.mean(val)
            st=np.std(val)
            export_res2.write(header[i+1])
            export_res5.write(header[i+1])
            for j in range(sh[1]):
                if strategy=="conservative":
                    export_res2.write("\t"+str(val1[j]))
                    export_res5.write("\t"+str(val1[j]))
                else:
                   export_res2.write("\t"+str(val[j]))
                   export_res5.write("\t"+str(val[j])) 
            export_res2.write("\n")
            export_res5.write("\n")
            Z_new.append(val)
        Z_new=zip(*Z_new)
        Z_new=np.array(Z_new)
        
        sh=Z_new.shape

        export_res.close()
        export_res1.close()
        export_res2.close()
        export_res3.close()
        export_res5.close()
        Orderedheatmap.Classify(exportnam_bint)    
        if strategy=="conservative":
            return exportnam,exportnam_bin,exportnam2,exportnam3
        else:
            return exportnam,exportnam_bin,exportnam2,exportnam3
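A minimal, self-contained sketch of the sparse-NMF factorization step used above, assuming numpy and nimfa are installed; the toy matrix, rank, and variable names are illustrative rather than values taken from the pipeline:

import numpy as np
import nimfa

### Rows = features, columns = variables (mirrors the transposed matrix above)
mat = np.array([[1.0, 0.2, 0.1],
                [0.9, 0.3, 0.0],
                [0.1, 0.8, 0.9],
                [0.0, 0.7, 1.0]])
snmf = nimfa.Snmf(mat, seed="nndsvd", rank=2, max_iter=20,
                  n_run=1, track_factor=False, theta=0.95)
fit = snmf()
W = np.array(fit.basis())   ### basis matrix: one row per input row, one column per rank component
H = np.array(fit.coef())    ### coefficient matrix: rank x columns
### Hard-assign each row to the component with the largest basis weight,
### mirroring the max(val) binarization performed above
assignments = W.argmax(axis=1)
print assignments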
Example #40
0
def importGeneExpressionValues(filename,tissue_specific_db,translation_db,useLog=False,previouslyRun=False,species=None):
    ### Import gene-level expression raw values           
    fn=filepath(filename); x=0; genes_added={}; gene_expression_db={}
    dataset_name = export.findFilename(filename)
    max_val=0
    print 'importing:',dataset_name
    
    try:
        import gene_associations, OBO_import
        gene_to_symbol = gene_associations.getGeneToUid(species,('hide','Ensembl-Symbol'))
        symbol_to_gene = OBO_import.swapKeyValues(gene_to_symbol)
    except Exception: symbol_to_gene={}
    
    for line in open(fn,'rU').xreadlines():
        data = cleanUpLine(line)
        t = string.split(data,'\t')
        
        if x==0:
            if '#' not in data:
                for i in t[1:]: sample_headers.append(i)
                x=1
        else:
            gene = t[0]
            try: gene = string.split(t[0],'|')[0]
            except Exception: pass
            #if '-' not in gene and ':E' in gene: print gene;sys.exit()
            if analysis_type == 'AltExon':
                try: ens_gene,exon = string.split(gene,'-')[:2]
                except Exception: exon = gene
                gene = exon
            if keyed_by == 'translation': ### alternative value is 'primaryID'
                """if gene == 'ENSMUSG00000025915-E19.3':
                    for i in translation_db: print [i], len(translation_db); break
                    print gene, [translation_db[gene]];sys.exit()"""
                try: gene = translation_db[gene] ### Ensembl annotations
                except Exception: pass
            try: gene = symbol_to_gene[gene][0] ### If RNASeq is the selected platform and Symbol is the uid
            except Exception: pass
            if gene in tissue_specific_db:
                index,tissue_exp=tissue_specific_db[gene]
                try: genes_added[gene]+=1
                except Exception: genes_added[gene]=1
                proceed=True
                try:
                    exp_vals = t[1:]
                    if '' in exp_vals:
                        ### If missing values present (PSI values)
                        exp_vals = ['0.000101' if i=='' else i for i in exp_vals]
                        useLog = False
                    exp_vals = map(float, exp_vals)
                    if platform == 'RNASeq':
                        if max(exp_vals)>max_val: max_val = max(exp_vals)
                        #if max(exp_vals)<3: proceed=False
                        if useLog==False:
                            exp_vals = map(lambda x: math.log(x+1,2),exp_vals)
                    if value_type == 'calls': ### Hence, this is a DABG or RNA-Seq expression
                        exp_vals = produceDetectionCalls(exp_vals,targetPlatform) ### 0 or 1 calls
                    if proceed:
                        gene_expression_db[gene] = [index,exp_vals]
                except Exception:
                    print 'Non-numeric values detected:'
                    x = 5
                    print t[:x]
                    while x < len(t):
                        print t[x:x+5]
                        x+=5
                    print 'Formatting error encountered in:',dataset_name; forceError ### intentional NameError to halt execution
            """else:
                for gene in tissue_specific_db:
                    if 'Ndufa9:ENSMUSG00000000399:I2.1-E3.1' in gene:
                        print gene, 'dog';sys.exit()
                print gene;kill"""
        
    print len(gene_expression_db), 'matching genes in the dataset and tissue compendium database'
    
    for gene in genes_added:
        if genes_added[gene]>1:
            del gene_expression_db[gene] ### delete entries that are present in the input set multiple times (not trustworthy)
        else: expession_subset.append(gene_expression_db[gene]) ### These contain the rank order and expression
    #print len(expession_subset);sys.exit()
    expession_subset.sort() ### This order now matches that of 
    gene_expression_db=[]
    
    if max_val<20 and platform == 'RNASeq' and previouslyRun==False: ### Only allow to happen once
        importGeneExpressionValues(filename,tissue_specific_db,translation_db,useLog=True,previouslyRun=True,species=species)
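The function above substitutes a small sentinel for missing PSI fields and log2-transforms raw RNASeq values. A short sketch of those two conventions in isolation, assuming plain Python; normalizeExpressionRow is an illustrative helper, not part of the module:

import math

def normalizeExpressionRow(exp_vals, useLog=False):
    ### Replace missing fields with the small sentinel used above
    exp_vals = ['0.000101' if v == '' else v for v in exp_vals]
    exp_vals = map(float, exp_vals)
    if useLog == False:
        ### log2(x+1), as applied to raw RNASeq values above
        exp_vals = map(lambda x: math.log(x + 1, 2), exp_vals)
    return exp_vals

print normalizeExpressionRow(['8', '', '64'])  ### [~3.17, ~0.0001, ~6.02]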
Example #41
0
def importInteractionDatabases(interactionDirs):
    """ Import multiple interaction format file types (designated by the user) """
    exclude=[]
    for file in interactionDirs:
        status = verifyFile(file)
        if status == 'not found':
            exclude.append(file)
    for i in exclude:
        interactionDirs.remove(i)
        
    for fn in interactionDirs:    #loop through each file in the directory to output results
        x=0; imported=0; stored=0
        file = export.findFilename(fn)
        print "Parsing interactions from:",file
        for line in open(fn,'rU').xreadlines():
            data,null = string.split(line,'\n')
            t = string.split(data,'\t')
            if x==0: x=1
            #elif 'PAZAR' in data or 'Amadeus' in data:x+=0
            else:
                obligatory = False
                imported+=1
                proceed = True
                source=''
                interaction_type = 'interaction'
                try:
                    symbol1,interaction_type, symbol2, ensembl1,ensembl2,source = t
                    ens_ls1=[ensembl1]; ens_ls2=[ensembl2]
                    if 'HMDB' in ensembl1:
                        ensembl1 = string.replace(ensembl1,' ','') ### HMDB ID sometimes preceded by ' '
                        symbol_hmdb_db[symbol1]=ensembl1
                        hmdb_symbol_db[ensembl1] = symbol1
                        interaction_type = 'Metabolic'
                    if 'HMDB' in ensembl2:
                        ensembl2 = string.replace(ensembl2,' ','') ### HMDB ID sometimes preceded by ' '
                        symbol_hmdb_db[symbol2]=ensembl2
                        hmdb_symbol_db[ensembl2] = symbol2
                        interaction_type = 'Metabolic'
                except Exception:
                    try:
                        ensembl1,ensembl2,symbol1,symbol2,interaction_type=t
                        if ensembl1 == '':
                            try:
                                ens_ls1 = symbol_ensembl_db[symbol1]
                                ens_ls2 = symbol_ensembl_db[symbol2]
                            except Exception: None
                    except Exception:
                        proceed = False
                if proceed: ### If the interaction data conformed to one of the two above types (typically two valid interacting gene IDs)
                    if (len(ens_ls1)>0 and len(ens_ls2)>0):
                        secondary_proceed = True
                        stored+=1
                        for ensembl1 in ens_ls1:
                            for ensembl2 in ens_ls2:
                                """
                                if (ensembl1,ensembl2) == ('ENSG00000111704','ENSG00000152284'):
                                    print t;sys.exit()
                                if (ensembl1,ensembl2) == ('ENSG00000152284','ENSG00000111704'):
                                    print t;sys.exit()
                                """
                                if 'WikiPathways' in file or 'KEGG' in file:
                                    if ensembl2 != ensembl1:
                                        if (ensembl2,ensembl1) in interaction_annotation_dbase:
                                            del interaction_annotation_dbase[(ensembl2,ensembl1)]
                                            ### Exclude redundant entries with fewer interaction details (e.g., arrow direction in BioGRID) - overwrite with the opposite gene arrangement below
                                        if (ensembl1,ensembl2) in interaction_annotation_dbase:
                                            if interaction_annotation_dbase[(ensembl1,ensembl2)].InteractionType() !='physical':
                                                secondary_proceed = False ### Don't overwrite a more informative annotation like transcriptional regulation or microRNA targeting
                                if 'DrugBank' in fn:
                                    source = 'DrugBank'
                                    interaction_type = 'drugInteraction'
                                    obligatory=True
                                    ensembl1, ensembl2 = ensembl2, ensembl1 ### switch the order of these (drugs reported as first ID and gene as the second)
                                
                                if secondary_proceed:
                                    z = InteractionInformation(ensembl1,ensembl2,source,interaction_type)
                                    interaction_annotation_dbase[ensembl1,ensembl2] = z
                                    #z = InteractionInformation(ensembl2,ensembl1,source,interaction_type)
                                    #interaction_annotation_dbase[ensembl2,ensembl1] = z
                                    try: interaction_db[ensembl1][ensembl2]=1
                                    except KeyError: db = {ensembl2:1}; interaction_db[ensembl1] = db ###weight of 1 (weights currently not-supported)
                                    try: interaction_db[ensembl2][ensembl1]=1
                                    except KeyError: db = {ensembl1:1}; interaction_db[ensembl2] = db ###weight of 1 (weights currently not-supported)
                                
                                if obligatory and source in obligatoryList: ### Include these in the final pathway if linked to any input node (e.g., miRNAs, drugs)
                                    try: obligatory_interactions[ensembl1][ensembl2]=1
                                    except KeyError: db = {ensembl2:1}; obligatory_interactions[ensembl1] = db ###weight of 1 (weights currently not supported)
                                elif source in secondDegreeObligatoryCategories:
                                    try: second_degree_obligatory[ensembl1][ensembl2]=1
                                    except KeyError: db = {ensembl2:1}; second_degree_obligatory[ensembl1] = db ###weight of 1 (weights currently not-supported)
                                    
                else:
                    proceed = False
                    try:
                        ID1, null, ID2 = t
                        proceed = True
                    except Exception:
                        try:
                            ID1, ID2 = t
                            proceed = True
                        except Exception:
                            None
                            
                    if proceed:
                        if 'microRNATargets' in fn:
                            if 'mir' in ID2: prefix = 'MIR'
                            else: prefix = 'LET'
                            ID2=prefix+string.split(ID2,'-')[2] ### Ensembl naming convention, using the MIR/LET prefix computed above
                            source = 'microRNATargets'
                            interaction_type = 'microRNAInteraction'
                            obligatory=True
                        try: ID_ls1 = symbol_ensembl_db[ID1]
                        except Exception: ID_ls1 = [ID1]
                        try: ID_ls2 = symbol_ensembl_db[ID2]
                        except Exception: ID_ls2 = [ID2]
                        """if 'microRNATargets' in fn:
                            if '*' not in ID2: print ID_ls2;sys.exit()"""
                        addInteractions = True
                        for ID1 in ID_ls1:
                            for ID2 in ID_ls2:
                                z = InteractionInformation(ID2,ID1,source,interaction_type)
                                interaction_annotation_dbase[ID2,ID1] = z ### This is the interaction direction that is appropriate
                                try: interaction_db[ID1][ID2]=1
                                except KeyError: db = {ID2:1}; interaction_db[ID1] = db ###weight of 1 (weights currently not supported)
                                try: interaction_db[ID2][ID1]=1
                                except KeyError: db = {ID1:1}; interaction_db[ID2] = db ###weight of 1 (weights currently not supported)
                                    
                                if source in secondDegreeObligatoryCategories:
                                    try: second_degree_obligatory[ID1][ID2]=1
                                    except KeyError: db = {ID2:1}; second_degree_obligatory[ID1] = db ###weight of 1 (weights currently not supported)
                                    
                                elif obligatory and source in obligatoryList: ### Include these in the final pathway if linked to any input node (e.g., miRNAs, drugs)
                                    try: obligatory_interactions[ID1][ID2]=1
                                    except KeyError: db = {ID2:1}; obligatory_interactions[ID1] = db ###weight of 1 (weights currently not supported)
                           
    ### Evaluate the most promiscuous interactors (e.g., UBC)
    remove_list=[]
    for ID in interaction_db:
        if len(interaction_db[ID])>2000:
            remove_list.append(ID)
            #print len(interaction_db[ID]),ensembl_symbol_db[ID]
    for ID in remove_list:
        #print 'removing', ID
        del interaction_db[ID]
        blackList[ID] = [] ### record each removed hub so it can be excluded downstream

    print 'Imported interactions:',len(interaction_annotation_dbase)
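The interaction network above is held in nested dictionaries keyed in both directions. A minimal sketch of that storage idiom, assuming plain Python; addEdge is an illustrative name, not part of the module:

interaction_db = {}

def addEdge(id1, id2, weight=1):
    ### Store the edge symmetrically; the weight is currently always 1
    try: interaction_db[id1][id2] = weight
    except KeyError: interaction_db[id1] = {id2: weight}
    try: interaction_db[id2][id1] = weight
    except KeyError: interaction_db[id2] = {id1: weight}

addEdge('ENSG00000111704', 'ENSG00000152284')
print len(interaction_db['ENSG00000111704'])  ### 1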
Example #42
0
def compareImportedTables(file_list,outputDir,importDir=False,considerNumericDirection=False,display=True):
    ### added for AltAnalyze
    print 'Creating Venn Diagram from input files...'
    import UI
    import export
    file_id_db={}
    file_list2=[]
    for file in file_list:
        x=0
        if '.txt' in file:
            if importDir !=False: ### When all files in a directory are analyzed
                fn=UI.filepath(importDir+'/'+file)
            else:
                fn = file
                file = export.findFilename(fn) ### Only report the actual filename
            file_list2.append(file)
            for line in open(fn,'rU').xreadlines():
                if x == 0:
                    data_type = examineFields(line)
                    x+=1
                else:
                    data = UI.cleanUpLine(line)
                    t = string.split(data,'\t')
                    uid = t[0]
                    valid = True
                    if data_type != 'first':
                        if data_type == 'comparison':
                            score = float(string.split(t[6],'|')[0])
                            if 'yes' not in t[5]:
                                valid = False ### not replicated independently
                        if data_type == 'reciprocal':
                            uid = t[8]+'-'+t[10]
                            score = float(t[1])
                        if data_type == 'single':
                            uid = t[6]
                            score = float(t[1])
                    else:
                        try:
                            score = float(t[1]) #t[2]
                        except Exception: score = None
                    if score != None and considerNumericDirection: ### change the UID so that it only matches if the same direction
                        if score>0:
                            uid+='+' ### encode the ID with a positive sign
                        else:
                            uid+='-' ### encode the ID with a negative sign
                    #if score>0:
                    if valid:
                        try: file_id_db[file].append(uid)
                        except Exception: file_id_db[file] = [uid]
                    
    id_lists=[]
    new_file_list=[]
    for file in file_list2: ### Use the sorted names
        if file in file_id_db:
            uids = file_id_db[file]
            id_lists.append(uids)
            new_file_list.append(file)
            #print file, len(new_file_list), len(uids)
            
    if len(file_id_db):
        if len(new_file_list)==2 or len(new_file_list)==3:
            SimpleMatplotVenn(new_file_list,id_lists,outputDir=outputDir,display=False) ### display both below
        venn(id_lists, new_file_list, fill="number", show_names=False, outputDir=outputDir, show_plot=display)
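When considerNumericDirection is set, the function above suffixes each ID with its fold direction so that two files only share an ID when the direction also agrees. A small sketch of that encoding, with an illustrative helper name:

def encodeDirection(uid, score, considerNumericDirection=True):
    ### Append the fold direction so IDs only overlap when the direction matches
    if score != None and considerNumericDirection:
        if score > 0: return uid + '+'
        else: return uid + '-'
    return uid

print encodeDirection('ENSG00000111704', 1.8)   ### ENSG00000111704+
print encodeDirection('ENSG00000111704', -0.4)  ### ENSG00000111704-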
Example #43
0
def latteralMerge(files_to_merge,original_filename,outputPath = None):
    """ Merging files can be dangerous if there are duplicate IDs (e.g., gene symbols).
    To overcome issues with redundant gene IDs that are improperly matched (one row with zeros
    and the other with values), this function determines if a lateral merge is more appropriate.
    A lateral merge:
    1) Checks whether the IDs are identical and in the same order across the two or more datasets
    2) Merges the two or more matrices without looking at the gene IDs.
    
    Note: This function attempts to be memory efficient and should be updated in the future to
    merge blocks of row IDs sequentially."""
    
    files_to_merge_revised = []
    for filename in files_to_merge:
        ### If a sparse matrix - rename and convert to flat file
        if '.h5' in filename or '.mtx' in filename:
            from import_scripts import ChromiumProcessing
            import export
            
            file = export.findFilename(filename)
            export_name = file[:-4]+'-filt'
            if file == 'filtered_feature_bc_matrix.h5' or file == 'raw_feature_bc_matrix.h5' or file =='filtered_gene_bc_matrix.h5' or file == 'raw_gene_bc_matrix.h5':
                export_name = export.findParentDir(filename)
                export_name = export.findFilename(export_name[:-1])
            elif file == 'matrix.mtx.gz' or file == 'matrix.mtx':
                parent = export.findParentDir(filename)
                export_name = export.findParentDir(parent)
                export_name = export.findFilename(export_name[:-1])
            else:
                export_name = string.replace(file,'.mtx.gz','')
                export_name = string.replace(export_name,'.mtx','')
                export_name = string.replace(export_name,'.h5','')
                export_name = string.replace(export_name,'_matrix','')
            filename = ChromiumProcessing.import10XSparseMatrix(filename,'species',export_name)
        files_to_merge_revised.append(filename)
    files_to_merge = files_to_merge_revised
    print 'Files to merge:',files_to_merge
        
    includeFilenames = True
    file_uids = {}
    for filename in files_to_merge:
        firstRow=True
        fn=filepath(filename); x=0
        if '/' in filename:
            file = string.split(filename,'/')[-1][:-4]
        else:
            file = string.split(filename,'\\')[-1][:-4]
        for line in open(fn,'rU').xreadlines():         
            data = cleanUpLine(line)
            if '\t' in data:
                t = string.split(data,'\t')
            elif ',' in data:
                t = string.split(data,',')
            else:
                t = string.split(data,'\t')
            if firstRow:
                firstRow = False
            else:
                uid = t[0]
                try:
                    file_uids[file].append(uid)
                except:
                    file_uids[file] = [uid]

    perfectMatch = True
    for file1 in file_uids:
        uids1 = file_uids[file1]
        for file2 in file_uids:
            uids2 = file_uids[file2]
            if uids1 != uids2:
                print file1,file2
                perfectMatch = False

    if perfectMatch:
        print 'All ordered IDs match in the files ... performing lateral merge instead of key ID merge to prevent multi-matches...'
        firstRow=True
        increment = 5000
        low = 1
        high = 5000
        added = 1
        eo = open(output_dir+'/MergedFiles.txt','w') ### output_dir is assumed to be defined at module level
        import collections 
        
        def exportMergedRows(low,high):
            uid_values=collections.OrderedDict()
            for filename in files_to_merge:
                fn=filepath(filename); x=0; file_uids = {}
                if '/' in filename:
                    file = string.split(filename,'/')[-1][:-4]
                else:
                    file = string.split(filename,'\\')[-1][:-4]
                firstRow=True
                row_count = 0
                uids=[] ### Over-ride this for each file
                for line in open(fn,'rU').xreadlines():
                    row_count+=1
                    if row_count<=high and row_count>=low:
                        data = cleanUpLine(line)
                        if '\t' in data:
                            t = string.split(data,'\t')
                        elif ',' in data:
                            t = string.split(data,',')
                        else:
                            t = string.split(data,'\t')
                        if firstRow and low==1:
                            file = string.replace(file,'_matrix_CPTT','')
                            if includeFilenames:
                                header = [s + "."+file for s in t[1:]] ### add filename suffix
                            else:
                                header = t[1:]
                            try: uid_values[row_count]+=header
                            except: uid_values[row_count]=header
                            uids.append('UID')
                            firstRow=False
                        else:
                            uid = t[0]
                            try: uid_values[row_count] += t[1:]
                            except: uid_values[row_count] = t[1:]
                            uids.append(uid)
            i=0
            for index in uid_values:
                uid = uids[i]
                eo.write(string.join([uid]+uid_values[index],'\t')+'\n')
                i+=1
            print 'completed',low,high
        
        uid_list = file_uids[file]
        while (len(uid_list)+increment)>high:
            exportMergedRows(low,high)
            high+=increment
            low+=increment
        eo.close()
        return True
    else:
        print 'Different identifier order in the input files encountered...'
        return False
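The lateral merge above is gated on every file having identical, identically ordered row IDs. A condensed sketch of that check, assuming plain Python; canLaterallyMerge is an illustrative helper, not part of the module:

def canLaterallyMerge(id_lists):
    ### id_lists: one ordered list of row IDs per input file
    first = id_lists[0]
    for ids in id_lists[1:]:
        if ids != first:
            return False
    return True

print canLaterallyMerge([['g1','g2','g3'], ['g1','g2','g3']])  ### True
print canLaterallyMerge([['g1','g2','g3'], ['g2','g1','g3']])  ### False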
Example #44
0
def remoteGene(gene, Species, root_dir, comparison_file):
    global Transcript_Annotations_File
    global ExonRegion_File
    global Selected_Gene
    global Prt_Trans_File
    global Prt_Regions_File
    global Prt_Boundaries_File
    global SplicingIndex_File
    global UniPrt_Regions_File
    global microRNA_File
    global domainAnnotation_db
    global platform
    global species

    Selected_Gene = str(gene)
    species = Species

    comparison_name = string.split(export.findFilename(comparison_file),
                                   '.')[0]
    ExonRegion_File = unique.filepath("AltDatabase/ensembl/" + species + "/" +
                                      species + "_Ensembl_exon.txt")
    Transcript_Annotations_File = unique.filepath(
        "AltDatabase/ensembl/" + species + "/" + species +
        "_Ensembl_transcript-annotations.txt")
    Prt_Trans_File = searchDirectory("AltDatabase/ensembl/" + species + "/",
                                     'Ensembl_Protein')
    Prt_Regions_File = searchDirectory("AltDatabase/ensembl/" + species + "/",
                                       'ProteinFeatures')
    Prt_Boundaries_File = searchDirectory(
        "AltDatabase/ensembl/" + species + "/", 'ProteinCoordinates')
    UniPrt_Regions_File = searchDirectory(
        "AltDatabase/uniprot/" + species + "/", 'FeatureCoordinate')
    SplicingIndex_File = searchDirectory(root_dir +
                                         '/AltResults/ProcessedSpliceData/',
                                         'splicing-index',
                                         secondary=comparison_name)
    platform = getPlatform(SplicingIndex_File)
    microRNA_File = searchDirectory("AltDatabase/" + species + "/" + platform,
                                    'microRNAs_multiple')
    #print(SplicingIndex_File)

    total_val = ProteinCentricIsoformView(Selected_Gene)
    junctions = total_val[0]
    p_boundaries = total_val[1]
    p_domains = total_val[2]
    transcript_db = total_val[3]
    exon_db = total_val[4]
    splice_db = total_val[5]
    microRNA_db = total_val[6]
    domainAnnotation_db = total_val[7]

    #for i in exon_db:
    #    print("THE", i, exon_db[i], "\n")

    #for i in microRNA_db:
    #        m_test = microRNA_db[i]
    #    print(len(m_test))
    #    for q in m_test:
    #        print("microRNA", q.ExonBlock(), q.Description(), q.BP(), "\n")

    #for i in exon_db["ENST00000349238"]:
    #    print(i[2].EnsemblRegion())

    domain_color_list = []
    for i in p_domains:
        ploy = p_domains[i]
        for a in ploy:
            domain_color_list.append(a[1])

    domain_color_list = list(set(domain_color_list))
    domain_color_key = {}
    c_color1 = [0.8, 0.6, 0.1]
    c_color2 = [0.1, 0.6, 0.8]
    c_color3 = [0.6, 0.1, 0.8]
    c_color4 = [0.95, 0.6, 0.3]
    c_color5 = [0.3, 0.6, 0.95]
    c_color6 = [0.6, 0.3, 0.95]
    ### Cycle through the six colors, repeating once the palette is exhausted
    c_colors = [c_color1, c_color2, c_color3, c_color4, c_color5, c_color6]
    for index, item in enumerate(domain_color_list):
        domain_color_key[item] = c_colors[index % len(c_colors)]

    #for i in domain_color_key:
    #print(i, domain_color_key[i], "\n")

    Y = 100
    Transcript_to_Y = {}
    for transcript in transcript_db:
        Transcript_to_Y[transcript] = Y
        Y = Y + 300
    import traceback

    def onpick(event):
        #ind = event.ind
        print(event.artist.get_label())

    #for i in domainAnnotation_db: print(i,len(domainAnnotation_db));break

    fig = pylab.figure()

    ylim = Y + 200
    currentAxis = pylab.gca()
    #ax = pylab.axes()
    ax = fig.add_subplot(111)
    X_Pos_List = []
    CoordsBank = []

    for transcript in transcript_db:
        try:
            Junc_List = junctions[transcript]
            y_pos = Transcript_to_Y[transcript]
            Gene_List = exon_db[transcript]
            color_flag = 1
            for entry in Gene_List:
                G_start = entry[0][0]
                G_end = entry[0][1]
                Exon_Object = entry[2]
                try:
                    LabelClass = splice_db[Exon_Object.EnsemblRegion()]
                    ExonName = Exon_Object.EnsemblExon()
                    RegCall = LabelClass.RegCall()
                    SplicingIndex = LabelClass.SplicingIndex()
                    PVal = LabelClass.PVal()
                    Midas = LabelClass.Midas()
                    Label = "\n" + "Exon: " + str(
                        ExonName) + "\n" + "RegCall: " + str(
                            RegCall) + "\n" + "Splicing Index: " + str(
                                SplicingIndex) + "\n" + "P-Value: " + str(
                                    PVal) + "\n" + "Midas Value: " + str(
                                        Midas) + "\n"
                    Label = string.replace(Label, "\n", " ")
                    if (RegCall == "UC"):
                        color_choice = "Grey"
                    else:
                        S_Int = float(SplicingIndex)
                        if (S_Int > 0):
                            #color_choice = (0.7, 0.7, 0.99)
                            color_choice = 'blue'
                        elif (S_Int < 0):
                            #color_choice = (0.8, 0.4, 0.4)
                            color_choice = 'red'
                        else:
                            color_choice = 'Grey' ### splicing index of exactly zero

                except:
                    #print(traceback.format_exc());sys.exit()
                    Label = ""
                    color_choice = "Grey"
                #print("Start", G_start, "end", G_end, "Region", entry[2].EnsemblRegion())
                ### The even/odd color_flag branches drew identical patches; a single draw suffices
                currentAxis.add_patch(
                    Rectangle((G_start, y_pos), (G_end - G_start),
                              50,
                              color=color_choice,
                              label=(entry[2].EnsemblRegion() + Label),
                              picker=True))
                y_end = y_pos + 50
                try:
                    CoordsBank.append(
                        (G_start, G_end, y_pos, y_end,
                         'Exon: ' + entry[2].EnsemblRegion() + ' ' +
                         'SI: ' + str(SplicingIndex)[:4] + ' p-value: ' +
                         str(Midas)[:4]))
                except Exception:
                    CoordsBank.append(
                        (G_start, G_end, y_pos, y_end,
                         'Exon: ' + entry[2].EnsemblRegion()))
                #print(entry[2].EnsemblRegion(),y_pos,y_end)
                color_flag = color_flag + 1
                if (entry[2].EnsemblRegion() in microRNA_db):
                    microRNA_object = microRNA_db[entry[2].EnsemblRegion()]
                    mr_label = "MICRORNA MATCHES" + "\n"
                    for class_object in microRNA_object:
                        mr_exonname = class_object.ExonBlock()
                        mr_desc = class_object.Description(
                        ) + " " + class_object.Algorithms()
                        #print(mr_desc)
                        mr_label = mr_label + mr_desc + "\n"

                    currentAxis.add_patch(
                        Rectangle((G_start, (y_pos - 75)), (G_end - G_start),
                                  40,
                                  color="Green",
                                  label=(mr_label),
                                  picker=True))
                    y_start = y_pos - 75
                    y_end = y_pos - 35
                    CoordsBank.append(
                        (G_start, G_end, y_start, y_end, mr_desc))

            for entry in Junc_List:
                junctionID = entry[-1]
                try:
                    LabelClass = splice_db[entry[2]]
                    RegCall = LabelClass.RegCall()
                    SplicingIndex = LabelClass.SplicingIndex()
                    PVal = LabelClass.PVal()
                    Midas = LabelClass.Midas()
                    Label = "\n" + "RegCall: " + str(
                        RegCall) + "\n" + "Splicing Index: " + str(
                            SplicingIndex) + "\n" + "P-Value: " + str(
                                PVal) + "\n" + "Midas Value: " + str(
                                    Midas) + "\n"
                    S_Int = float(SplicingIndex)
                    if (RegCall == "UC"):
                        color_junc = "grey"
                    elif (S_Int > 0):
                        color_junc = "blue"
                    elif (S_Int < 0):
                        color_junc = "red"
                    else:
                        color_junc = "grey" ### splicing index of exactly zero
                except:
                    Label = ""
                    color_junc = "grey"
                currentAxis.add_patch(
                    Rectangle((entry[0], y_pos), (entry[1] - entry[0]),
                              50,
                              color="White",
                              label=(str(entry[2]) + Label),
                              picker=True))
                ax.arrow(entry[0], (y_pos + 50),
                         8,
                         40,
                         label=(str(entry[2]) + Label),
                         color=color_junc,
                         picker=True)
                ax.arrow((entry[0] + 8), (y_pos + 90),
                         11,
                         -40,
                         label=(str(entry[2]) + Label),
                         color=color_junc,
                         picker=True)
                y_start = y_pos
                y_end = y_pos + 30
                #print(junctionID,y_start,y_end)
                CoordsBank.append((entry[0], entry[1], y_start, y_end, junctionID)) ### use the junction's own coordinates for the hover lookup

            try:
                P_Bound_List = p_boundaries[transcript]
                E_Start = P_Bound_List[-2]
                E_End = P_Bound_List[-1]
                P_Start = P_Bound_List[1]
                P_End = P_Bound_List[2]
                #print("Boundaries: ", P_Start, P_End)
                X_Pos_List.append(int(E_End))
                #currentAxis.add_patch(Rectangle((E_Start, y_pos), E_End, 50, color = "Blue"))
                try:
                    currentAxis.add_patch(
                        Rectangle((P_Start, (y_pos + 120)), (P_End - P_Start),
                                  10))
                except:
                    pass
                p_label_list = ["DEF"]
                #CoordsBank.append((P_Start, P_End, y_pos, P_End - P_Start, transcript)) ### Added by NS - needs work
                try:
                    P_Domain_List = p_domains[transcript]
                except Exception:
                    P_Domain_List = []
                for entry in P_Domain_List:
                    #print("Domain", entry)
                    color_domain_choice = domain_color_key[entry[1]]
                    domain_annotation = domainAnnotation_db[entry[1]]
                    #domain_annotation = string.replace(domain_annotation,'REGION-','')
                    p_label = (str(entry[0]) + " " + str(domain_annotation))
                    #print(entry[0], entry[2], entry[3], P_Start, P_End, domain_annotation, )
                    if p_label in p_label_list:
                        continue ### skip repeated domain labels
                    p_label_list.append(p_label)
                    currentAxis.add_patch(
                        Rectangle((entry[2], y_pos + 100),
                                  (entry[3] - entry[2]),
                                  50,
                                  color=color_domain_choice,
                                  label=p_label,
                                  picker=True))
                    y_start = y_pos + 100
                    y_end = y_pos + 150
                    CoordsBank.append(
                        (entry[2], entry[3], y_start, y_end, p_label))
            except Exception:
                pass
                #print(traceback.format_exc())
        except:
            #print(traceback.format_exc())
            pass
    pylab.ylim([0.0, ylim])
    try:
        max_x = max(X_Pos_List)
    except:
        max_x = 5000
    try:
        pylab.xlim([0.0, max_x])
    except:
        pylab.xlim([0.0, 3000])
    fig.canvas.mpl_connect('pick_event', onpick)

    def format_coord(x, y):
        for m in CoordsBank:
            if (x >= m[0] and x <= m[1] and y >= m[2] and y <= m[3]):
                string_display = m[4]
                return string_display
        string_display = "  "
        return string_display

    ax.format_coord = format_coord
    #datacursor(hover=True, formatter='{label}'.format, bbox=dict(fc='yellow', alpha=1), arrowprops=None)
    pylab.show()
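The hover labels above come from mapping cursor coordinates back to the rectangle drawn at that position (via format_coord) and from pick events on labeled patches. A stripped-down sketch of the same pattern, assuming matplotlib is available; the coordinates and label are illustrative:

import pylab
from matplotlib.patches import Rectangle

fig = pylab.figure()
ax = fig.add_subplot(111)
CoordsBank = []

### One labeled, pickable patch spanning x 10-50, y 100-150
ax.add_patch(Rectangle((10, 100), 40, 50, color='blue',
                       label='Exon: E2.1', picker=True))
CoordsBank.append((10, 50, 100, 150, 'Exon: E2.1'))

def format_coord(x, y):
    ### Return the stored label for whichever patch the cursor is over
    for x1, x2, y1, y2, label in CoordsBank:
        if x1 <= x <= x2 and y1 <= y <= y2:
            return label
    return '  '

def onpick(event):
    print(event.artist.get_label())

ax.format_coord = format_coord
fig.canvas.mpl_connect('pick_event', onpick)
pylab.xlim([0, 100]); pylab.ylim([0, 300])
pylab.show()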
Example #45
0
def buildInteractions(species,Degrees,inputType,inputDir,outputdir,interactionDirs,Genes=None,
                      geneSetType=None,PathwayFilter=None,OntologyID=None,directory=None,expressionFile=None,
                      obligatorySet=None,secondarySet=None,IncludeExpIDs=False):
    
    global degrees
    global outputDir
    global inputDataType
    global obligatoryList ### Add these if connected to anything
    global secondaryQueryIDs
    global secondDegreeObligatoryCategories ### Add if common to anything in the input - Indicates systems to apply this to
    global symbol_hmdb_db; symbol_hmdb_db={}; global hmdb_symbol_db; hmdb_symbol_db={} ### Create an annotation database for HMDB IDs
    global FileName
    global intNameShort
    secondaryQueryIDs = {}
    degrees = Degrees
    outputDir = outputdir
    inputDataType = inputType
    obligatoryList = obligatorySet
    secondDegreeObligatoryCategories=[]
    intNameShort=''
    if obligatoryList == None:
        obligatoryList=[]
    if expressionFile == None:
        expressionFile = inputDir ### If it doesn't contain expression values, view as yellow nodes
    if secondarySet!= None and (degrees==1 or degrees=='direct'): ### If degrees == 2, this is redundant
        ### This currently adds a lot of predictions - either make it more stringent or exclude it for now
        secondDegreeObligatoryCategories = secondarySet
    if PathwayFilter != None:
        if len(PathwayFilter)==1:
            FileName = PathwayFilter[0]
        if isinstance(PathwayFilter, tuple) or isinstance(PathwayFilter, list):
            FileName = string.join(list(PathwayFilter),' ')
            FileName = string.replace(FileName,':','-')
        else:
            FileName = PathwayFilter
        if len(FileName)>40:
            FileName = FileName[:40]
    elif OntologyID != None: FileName = OntologyID
    elif Genes != None: FileName = Genes
    
    ### Import Ensembl-Symbol annotations
    getEnsemblGeneData('AltDatabase/ensembl/'+species+'/'+species+'_Ensembl-annotations.txt')
    if len(interactionDirs[0]) == 1: interactionDirs = [interactionDirs] ### a single path string was passed rather than a list
    ### Import interaction databases indicated in interactionDirs
    for i in interactionDirs:
        print i
        i = export.findFilename(i)
        i=string.split(i,'-')[1]
        intNameShort+=i[0]

    importInteractionData(interactionDirs)
    getHMDBData(species) ### overwrite the symbol annotation from any HMDB that comes from a WikiPathway or KEGG pathway that we also include (for consistent official annotation) 
    
    input_IDs = getGeneIDs(Genes)
    try:
        if isinstance(PathwayFilter, tuple):
            for pathway in PathwayFilter:
                IDs = gene_associations.simpleGenePathwayImport(species,geneSetType,pathway,OntologyID,directory)
                for id in IDs:input_IDs[id]=None
        else:
            input_IDs = gene_associations.simpleGenePathwayImport(species,geneSetType,PathwayFilter,OntologyID,directory)
    except Exception: None
    if expressionFile == None or len(expressionFile)==0:
        expressionFile = exportSelectedIDs(input_IDs) ### create an expression file
    elif IncludeExpIDs: ### Prioritize selection of IDs for interactions WITH the primary query set (not among expression input IDs)
        secondaryQueryIDs = importqueryResults(species,expressionFile,{})[0]
    input_IDs,query_interactions,dir_file = importqueryResults(species,inputDir,input_IDs)
    sif_file,symbol_pair_unique = associateQueryGenesWithInteractions(input_IDs,query_interactions,dir_file)
    output_filename = exportGraphImage(species,sif_file,expressionFile)
    return output_filename
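The short interaction-set code (intNameShort) above takes one letter per interaction file. A hedged sketch of that derivation, assuming the 'interactions-<Name>' file-naming pattern implied by the split on '-'; shortCode and the paths are illustrative:

import string

def shortCode(interactionDirs):
    code = ''
    for path in interactionDirs:
        name = path.split('/')[-1]           ### equivalent of export.findFilename
        name = string.split(name, '-')[1]    ### e.g., 'interactions-WikiPathways.txt' -> 'WikiPathways.txt'
        code += name[0]                      ### first letter of each database name
    return code

print shortCode(['/db/interactions-WikiPathways.txt', '/db/interactions-BioGRID.txt'])  ### 'WB'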
Example #46
0
def getPlatform(filename):
    prefix = string.split(export.findFilename(filename), '.')[0]
    array_type = string.split(prefix, '_')[1]
    if array_type != 'RNASeq':
        array_type = string.lower(array_type)
    return array_type
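A usage note for getPlatform: the platform is read from the second underscore-delimited token of the file's base name and lower-cased unless it is 'RNASeq'. The file names below are hypothetical, chosen only to illustrate the split:

### assuming files named '<species>_<platform>_...' as the split implies
print getPlatform('/results/Hs_RNASeq_comparison.txt')  ### 'RNASeq'
print getPlatform('/results/Hs_Exon_comparison.txt')    ### 'exon'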
Example #47
0
def parseJunctionEntries(bam_dir,
                         multi=False,
                         Species=None,
                         ReferenceDir=None):
    global bam_file
    global splicesite_db
    global IndicatedSpecies
    global ExonReference
    IndicatedSpecies = Species
    ExonReference = ReferenceDir
    bam_file = bam_dir
    try:
        splicesite_db, chromosomes_found, gene_coord_db = retreiveAllKnownSpliceSites(
        )
    except Exception:
        print traceback.format_exc()
        splicesite_db = {}
        chromosomes_found = {}

    start = time.time()
    try:
        import collections
        junction_db = collections.OrderedDict()
    except Exception:
        try:
            import ordereddict
            junction_db = ordereddict.OrderedDict()
        except Exception:
            junction_db = {}
    original_junction_db = copy.deepcopy(junction_db)

    bamf = pysam.Samfile(bam_dir, "rb")
    ### Is there an indexed .bai for the BAM? Check.
    try:
        for entry in bamf.fetch():
            codes = map(lambda x: x[0], entry.cigar)
            break
    except Exception:
        ### Make BAM Index
        if multi == False:
            print 'Building BAM index file for', bam_dir
        bam_dir = str(bam_dir)
        #On Windows, this indexing step will fail if the __init__ pysam file line 51 is not set to - catch_stdout = False
        pysam.index(bam_dir)
        bamf = pysam.Samfile(bam_dir, "rb")

    chromosome = False
    chromosomes = {}
    bam_reads = 0
    count = 0
    jid = 1
    prior_jc_start = 0
    l1 = None
    l2 = None
    o = open(string.replace(bam_dir, '.bam', '__junction.bed'), "w")
    o.write('track name=junctions description="TopHat junctions"\n')
    export_isoform_models = False
    if export_isoform_models:
        io = open(string.replace(bam_dir, '.bam', '__isoforms.txt'), "w")
        isoform_junctions = copy.deepcopy(junction_db)
    outlier_start = 0
    outlier_end = 0
    read_count = 0
    c = 0
    for entry in bamf.fetch():
        bam_reads += 1
        try:
            cigarstring = entry.cigarstring
        except Exception:
            codes = map(lambda x: x[0], entry.cigar)
            if 3 in codes: cigarstring = 'N'
            else: cigarstring = None

        if cigarstring != None:
            if 'N' in cigarstring:  ### Hence a junction
                if prior_jc_start == 0: pass
                elif (entry.pos - prior_jc_start) > 5000 or bamf.getrname(
                        entry.rname
                ) != chromosome:  ### New chr or far from prior reads
                    writeJunctionBedFile(junction_db, jid, o)
                    #writeIsoformFile(isoform_junctions,io)
                    junction_db = copy.deepcopy(
                        original_junction_db)  ### Re-set this object
                    jid += 1

                chromosome = bamf.getrname(entry.rname)
                chromosomes[chromosome] = []  ### keep track
                X = entry.pos
                #if entry.query_name == 'SRR791044.33673569':
                #print chromosome, entry.pos, entry.reference_length, entry.alen, entry.query_name
                Y = entry.pos + entry.alen
                prior_jc_start = X

                try:
                    tophat_strand = entry.opt(
                        'XS'
                    )  ### TopHat knows which sequences are likely real splice sites so it assigns a real strand to the read
                except Exception:
                    #if multi == False:  print 'No TopHat strand information';sys.exit()
                    tophat_strand = None
                coordinates, up_to_intron_dist = getSpliceSites(entry.cigar, X)
                #if count > 100: sys.exit()
                #print entry.query_name,X, Y, entry.cigarstring, entry.cigar, tophat_strand
                for (five_prime_ss, three_prime_ss) in coordinates:
                    jc = five_prime_ss, three_prime_ss
                    #print X, Y, jc, entry.cigarstring, entry.cigar
                    try:
                        junction_db[chromosome, jc, tophat_strand].append(
                            [X, Y, up_to_intron_dist])
                    except Exception:
                        junction_db[chromosome, jc, tophat_strand] = [[
                            X, Y, up_to_intron_dist
                        ]]

                if export_isoform_models:
                    try:
                        mate = bamf.mate(
                            entry
                        )  #https://groups.google.com/forum/#!topic/pysam-user-group/9HM6nx_f2CI

                        if 'N' in mate.cigarstring:
                            mate_coordinates, mate_up_to_intron_dist = getSpliceSites(
                                mate.cigar, mate.pos)
                        else:
                            mate_coordinates = []
                    except Exception:
                        mate_coordinates = []
                    #print coordinates,mate_coordinates
                    junctions = map(lambda x: tuple(x), coordinates)
                    if len(mate_coordinates) > 0:
                        try:
                            isoform_junctions[chromosome,
                                              tuple(junctions),
                                              tophat_strand].append(
                                                  mate_coordinates)
                        except Exception:
                            isoform_junctions[chromosome,
                                              tuple(junctions),
                                              tophat_strand] = [
                                                  mate_coordinates
                                              ]
                    else:
                        if (chromosome, tuple(junctions),
                                tophat_strand) not in isoform_junctions:
                            isoform_junctions[chromosome,
                                              tuple(junctions),
                                              tophat_strand] = []

                count += 1
    writeJunctionBedFile(junction_db, jid, o)  ### One last read-out
    if multi == False:
        print bam_reads, count, time.time(
        ) - start, 'seconds required to parse the BAM file'
    o.close()
    bamf.close()

    missing_chromosomes = []
    for chr in chromosomes_found:
        if chr not in chromosomes:
            chr = string.replace(chr, 'chr', '')
            if chr not in chromosomes_found:
                if chr != 'M' and chr != 'MT':
                    missing_chromosomes.append(chr)
    #missing_chromosomes = ['A','B','C','D']
    try:
        bam_file = export.findFilename(bam_file)
    except Exception:
        pass
    return bam_file, missing_chromosomes
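Junctions above are detected from 'N' (skipped-region) operations in each read's CIGAR. A simplified stand-in for the getSpliceSites call, assuming pysam's (operation, length) tuple encoding with 0=M, 2=D, 3=N; the exact splice-site offset convention here is illustrative:

def spliceSitesFromCigar(cigar, pos):
    ### cigar: list of (operation, length) tuples; pos: 0-based alignment start
    coordinates = []
    cursor = pos
    for op, length in cigar:
        if op == 3:   ### 'N' = skipped region (intron): record the junction span
            coordinates.append((cursor, cursor + length + 1))
            cursor += length
        elif op in (0, 2):   ### 'M' match and 'D' deletion both consume the reference
            cursor += length
    return coordinates

### A 50M100N50M read starting at position 1000 implies one junction
print spliceSitesFromCigar([(0, 50), (3, 100), (0, 50)], 1000)  ### [(1050, 1151)]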
Example #48
0
def importGeneExpressionValues(filename,tissue_specific_db,translation_db,useLog=False,previouslyRun=False,species=None):
    ### Import gene-level expression raw values           
    fn=filepath(filename); x=0; genes_added={}; gene_expression_db={}
    dataset_name = export.findFilename(filename)
    max_val=0
    print 'importing:',dataset_name
    
    try:
        import gene_associations, OBO_import
        gene_to_symbol = gene_associations.getGeneToUid(species,('hide','Ensembl-Symbol'))
        symbol_to_gene = OBO_import.swapKeyValues(gene_to_symbol)
    except Exception: symbol_to_gene={}
    
    for line in open(fn,'rU').xreadlines():
        data = cleanUpLine(line)
        t = string.split(data,'\t')
        
        if x==0:
            if '#' not in data:
                for i in t[1:]: sample_headers.append(i)
                x=1
        else:
            gene = t[0]
            #if '-' not in gene and ':E' in gene: print gene;sys.exit()
            if analysis_type == 'AltExon':
                try: ens_gene,exon = string.split(gene,'-')[:2]
                except Exception: exon = gene
                gene = exon
            if keyed_by == 'translation': ### alternative value is 'primaryID'
                """if gene == 'ENSMUSG00000025915-E19.3':
                    for i in translation_db: print [i], len(translation_db); break
                    print gene, [translation_db[gene]];sys.exit()"""
                try: gene = translation_db[gene] ### Ensembl annotations
                except Exception: pass
            try: gene = symbol_to_gene[gene][0] ### If RNASeq is the selected platform and Symbol is the uid
            except Exception: pass
            if gene in tissue_specific_db:
                index,tissue_exp=tissue_specific_db[gene]
                try: genes_added[gene]+=1
                except Exception: genes_added[gene]=1
                proceed=True
                try:
                    exp_vals = map(float, t[1:])
                    if platform == 'RNASeq':
                        if max(exp_vals)>max_val: max_val = max(exp_vals)
                        #if max(exp_vals)<3: proceed=False
                        if useLog==False:
                            exp_vals = map(lambda x: math.log(x+1,2),exp_vals)
                    if value_type == 'calls': ### Hence, this is a DABG or RNA-Seq expression
                        exp_vals = produceDetectionCalls(exp_vals,targetPlatform) ### 0 or 1 calls
                    if proceed:
                        gene_expression_db[gene] = [index,exp_vals]
                except Exception:
                    print 'Non-numeric values detected:'
                    x = 5
                    print t[:x]
                    while x < len(t):
                        print t[x:x+5]
                        x+=5
                    print 'Formatting error encountered in:',dataset_name; forceError ### intentional NameError to halt execution

    print len(gene_expression_db), 'matching genes in the dataset and tissue compendium database'
    
    for gene in genes_added:
        if genes_added[gene]>1: del gene_expression_db[gene] ### delete entries that are present in the input set multiple times (not trustworthy)
        else: expession_subset.append(gene_expression_db[gene]) ### These contain the rank order and expression
    #print len(expession_subset);sys.exit()
    expession_subset.sort() ### This order now matches that of 
    gene_expression_db=[]
    
    if max_val<20 and platform == 'RNASeq' and previouslyRun==False: ### Only allow to happen once
        importGeneExpressionValues(filename,tissue_specific_db,translation_db,useLog=True,previouslyRun=True,species=species)
Example #49
0
def parseJunctionEntries(bam_dir,multi=False):
    global bam_file
    global splicesite_db
    bam_file = bam_dir
    try: splicesite_db,chromosomes_found = retreiveAllKnownSpliceSites()
    except Exception: splicesite_db={}; chromosomes_found={}
    start = time.time()
    
    try: import collections; junction_db=collections.OrderedDict()
    except Exception:
        try: import ordereddict; junction_db = ordereddict.OrderedDict()
        except Exception: junction_db={}
    original_junction_db = copy.deepcopy(junction_db)
    
    bamf = pysam.Samfile(bam_dir, "rb" )
    chromosome = False
    chromosomes={}
    count=0
    jid = 1
    prior_jc_start=0
    l1 = None; l2=None
    o = open (string.replace(bam_dir,'.bam','__junction.bed'),"w")
    o.write('track name=junctions description="TopHat junctions"\n')
    outlier_start = 0; outlier_end = 0; read_count = 0
    for entry in bamf.fetch():
      #chromosome = bamf.getrname( entry.rname ) 
      codes = map(lambda x: x[0],entry.cigar)
      try: cigarstring = entry.cigarstring
      except Exception:
          if 3 in codes: cigarstring = 'N'
          else: cigarstring = None
      if cigarstring is not None:
        if 'N' in cigarstring: ### Hence a junction
            if entry.cigar[0][1]<60 and entry.cigar[0][1]>20:
                """
                if count<310:
                    a1 = entry.seq[entry.cigar[0][1]-5:entry.cigar[0][1]]
                    a2 = entry.seq[entry.cigar[0][1]:entry.cigar[0][1]+6]
                    if l1==a1 and l2==a2: continue
                    else:
                        print entry.opt('XS'), a1,a2, entry.seq
                        l1 = a1; l2 = a2
                else: sys.exit()"""
                
            if prior_jc_start == 0: pass
            elif (entry.pos-prior_jc_start) > 5000 or bamf.getrname( entry.rname ) != chromosome: ### New chr or far from prior reads
                writeJunctionBedFile(junction_db,jid,o)
                junction_db = copy.deepcopy(original_junction_db) ### Re-set this object
                jid+=1
            chromosome = bamf.getrname( entry.rname )
            chromosomes[chromosome]=[] ### keep track
            X=entry.pos
            Y=entry.pos+entry.alen
            prior_jc_start = X
            if entry.is_reverse:
                strand = '-' ### This is the strand the seq aligns to but not necessarily the REAL strand the mRNA aligns to (see XS below)
            else:                
                strand = '+'
            try: tophat_strand = entry.opt('XS') ### TopHat knows which sequences are likely real splice sites so it assigns a real strand to the read
            except Exception:
                #if multi == False:  print 'No TopHat strand information';sys.exit()
                tophat_strand = None
            coordinates,up_to_intron_dist = getSpliceSites(entry.cigar,X)
            for (five_prime_ss,three_prime_ss) in coordinates:
                jc = five_prime_ss,three_prime_ss
                #print X, Y, jc, entry.cigarstring, entry.cigar
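                ### junction_db key: (chromosome, (5'ss, 3'ss), strand) -> per-read [start, end, up_to_intron_dist]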
                try: junction_db[chromosome,jc,tophat_strand].append([X,Y,up_to_intron_dist])
                except Exception: junction_db[chromosome,jc,tophat_strand] = [[X,Y,up_to_intron_dist]]
            count+=1
    writeJunctionBedFile(junction_db,jid,o) ### One last read-out
    if multi == False:
        print time.time()-start, 'seconds required to parse the BAM file'
    o.close()
    bamf.close()
    
    missing_chromosomes=[]
    for chr in chromosomes_found: ### chromosomes with known splice sites
        if chr not in chromosomes: ### not encountered in the BAM
            ### Retry without the 'chr' prefix; if the stripped name is itself a database
            ### key it will be evaluated on its own pass, so only report it when it is not
            chr = string.replace(chr,'chr','')
            if chr not in chromosomes_found:
                if chr != 'M' and chr != 'MT': ### ignore mitochondrial naming variants
                    missing_chromosomes.append(chr)
    #missing_chromosomes = ['A','B','C','D']
    try: bam_file = export.findFilename(bam_file)
    except Exception: pass
    return bam_file, missing_chromosomes
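
### getSpliceSites above is defined elsewhere in this module. A minimal sketch of
### the CIGAR walk it implies: advance a reference-genome cursor over operations
### that consume the reference (M=0, D=2, N=3 in pysam's numeric codes) and record
### a junction for each N (skipped/intron) operation. Names and the exact +/-1
### coordinate conventions here are illustrative, not AltAnalyze's implementation:

def getSpliceSitesSketch(cigar_tuples, read_start):
    coordinates = []
    pos = read_start
    up_to_intron_dist = 0  ### reference bases consumed before the first intron
    seen_intron = False
    for code, length in cigar_tuples:
        if code == 3:  ### N: the skipped region is the intron
            five_prime_ss = pos              ### last exonic base before the intron
            pos += length
            three_prime_ss = pos + 1         ### first exonic base after the intron
            coordinates.append((five_prime_ss, three_prime_ss))
            seen_intron = True
        elif code in (0, 2):  ### M (match) and D (deletion) consume the reference
            pos += length
            if not seen_intron:
                up_to_intron_dist += length
    return coordinates, up_to_intron_dist

### e.g. getSpliceSitesSketch([(0,50),(3,1000),(0,50)], 100) yields one junction,
### ([(150, 1151)], 50), for a read with 50 nt anchors spanning a 1 kb intron
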
Example #50
0
def Enrichment(Inputfile,mutdict,mutfile,metaDataMatrixFormat,header):
    from collections import defaultdict ### defaultdict is referenced directly below
    import mappfinder
    X=defaultdict(list)
    prev=""
    head=0
    group=defaultdict(list)
    enrichdict=defaultdict(float)
    mut=export.findFilename(mutfile)
    dire=export.findParentDir(Inputfile)
    output_dir = dire+'MutationEnrichment'
    print output_dir
    export.createExportFolder(output_dir)
    number_of_samples = 0
    
    ### All enrichment results
    exportnam=output_dir+'/Enrichment_Results.txt'
    export_enrich=open(exportnam,"w")
    
    ### Selected Enrichment results based on p-value, sensitivity and specificity for association with cluster names
    exportnam=output_dir+'/Enrichment_tophits.txt'
    export_hit=open(exportnam,"w")
   
    header = "Mutations"+"\t"+"Cluster"+"\t"+"r"+"\t"+"R"+"\t"+"n"+"\t"+"Sensitivity"+"\t"+"Specificity"+"\t"+"z-score"+"\t"+"Fisher exact test"+"\t"+"adjp value"+"\n"
    export_enrich.write(header)
    export_hit.write(header)
    header2=returnSamplesInMetaData(Inputfile,metaDataMatrixFormat=True)
    print header2
    for line in open(Inputfile,'rU').xreadlines():
        if head > 0:
            number_of_samples+=1
            line=line.rstrip('\r\n')
            q = string.split(line,'\t')
            for i in range(1,len(q)):
                if q[i]=='1': ### binary membership flag
                    #group[q[0]].append(header2[i-1])
                    group[header2[i-1]].append(q[0]) ### [Cluster] = [full_sample_ID]
        else:
            head+=1
            continue
   
    print 'Number of patient samples in dataset =',number_of_samples
    total_Scores={}
    for kiy in mutdict:
        if kiy == "MDP":
            print mutdict[kiy]
        groupdict={}
        remaining=list(set(header) - set(mutdict[kiy])) ### samples without this mutation
        groupdict[1]=mutdict[kiy]
        groupdict[2]=remaining
        #export_enrich1.write(kiy)
        for key2 in group:
            r=float(len(list(set(group[key2])))-len(list(set(group[key2]) - set(mutdict[kiy])))) ### cluster members carrying the mutation
            n=float(len(group[key2])) ### cluster size
            R=float(len(set(mutdict[kiy]))) ### mutation carriers overall
            N=float(number_of_samples) ### total samples
            if r==0 or key2=="1" or R==1.0:
                #print kiy,key2,r,n,R,N
                pval=float(1)
                z=float(0)
                null_z = 0.000
                zsd = mappfinder.ZScoreData(key2,r,R,z,null_z,n)
                zsd.SetP(pval)
            else:
                try: z = Zscore(r,n,N,R)
                except Exception: z=0
                ### Calculate a Z-score assuming zero matching entries
                try: null_z = Zscore(0,n,N,R)
                except Exception: null_z = 0.000
               
                try:
                    pval = mappfinder.FishersExactTest(r,n,R,N)
                    zsd = mappfinder.ZScoreData(key2,r,R,z,null_z,n)
                    zsd.SetP(pval)
                except Exception:
                    pval=1.0
                    zsd = mappfinder.ZScoreData(key2,r,R,z,null_z,n)
                    zsd.SetP(pval)
                    #pass
                
            if kiy in total_Scores:
                signature_db = total_Scores[kiy]
                signature_db[key2]=zsd ### Necessary format for the permutation function
            else:
                signature_db={key2:zsd}
                total_Scores[kiy] = signature_db
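    ### total_Scores structure: {mutation: {cluster: ZScoreData}}; each inner dict is
    ### the format mappfinder.adjustPermuteStats consumes below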
    sorted_results=[]
    mutlabels={}
    for kiy in total_Scores:
        signature_db = total_Scores[kiy]
        ### Updates the adjusted p-value instances
        mappfinder.adjustPermuteStats(signature_db)
        for signature in signature_db:
            zsd = signature_db[signature]
            results = [kiy,signature,zsd.Changed(),zsd.Measured(),zsd.InPathway(),str(float(zsd.PercentChanged())/100.0),str(float(float(zsd.Changed())/float(zsd.InPathway()))), zsd.ZScore(), zsd.PermuteP(), zsd.AdjP()] #string.join(zsd.AssociatedIDs(),'|')
            sorted_results.append([signature,-1*float(zsd.ZScore()),results])
    sorted_results.sort() ### Sort by z-score (stored negated, so highest z first)
    
    prev=""
    for (sig,p,values) in sorted_results:
        if sig!=prev:
            flag=True
            export_hit.write(string.join(values,'\t')+'\n')
        if flag:
            ### Update the cluster label to include the top enriched term meeting the sensitivity and specificity cutoffs
            #print values[5],values[6],values[6],values[2]; sys.exit()
            if (float(values[5])>=0.2 and float(values[6])>=0.2 and float(values[7])>=1.95 and float(values[2])>=2):
                clusterID = values[1]
                topEnrichedTerm=values[0]
                mutlabels[clusterID]=clusterID+' ('+topEnrichedTerm+')'
                flag=False
                export_hit.write(string.join(values,'\t')+'\n')
        export_enrich.write(string.join(values,'\t')+'\n')
        prev=sig
    if len(sorted_results)==0:
        ### 'splicing_factor' was undefined in this scope; report the mutation file name instead
        export_enrich.write(string.join([mut,'NONE','NONE','NONE','NONE','NONE','NONE'],'\t')+'\n')
    export_enrich.close()

    return mutlabels
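
### Zscore, referenced in Example #50, is defined elsewhere. MAPPFinder-style tools
### conventionally standardize the hit count r against a hypergeometric expectation:
### z = (r - nR/N) / sqrt(n * (R/N) * (1 - R/N) * (1 - (n-1)/(N-1))).
### A minimal sketch under that assumption (not necessarily the exact implementation):

def ZscoreSketch(r, n, N, R):
    import math
    expected = n * R / N  ### expected mutation carriers in a cluster of size n
    variance = n * (R / N) * (1.0 - R / N) * (1.0 - (n - 1.0) / (N - 1.0))
    return (r - expected) / math.sqrt(variance)

### e.g. ZscoreSketch(8.0, 20.0, 100.0, 15.0) standardizes 8 observed carriers
### against an expectation of 3 in a cluster of 20 samples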