Code example #1
File: update.py Project: venkatmi/altanalyze
    def __init__(self,url,dir,file_type):
        try: self.suppress = suppress_printouts
        except Exception: self.suppress = 'no'
        
        """Copy the contents of a file from a given URL to a local file."""
        filename = url.split('/')[-1]; self.status = ''
        #print [url, dir]
        if file_type == None: file_type =''
        if len(file_type) == 2: filename, file_type = file_type ### Added this feature for when a file has an invalid filename
        output_filepath_object = export.createExportFile(dir+filename,dir[:-1])
        output_filepath = filepath(dir+filename); self.output_filepath = output_filepath
        
        if self.suppress == 'no':
            print "Downloading the following file:",filename,' ',
        
        self.original_increment = 5
        self.increment = 0
        from urllib import urlretrieve
        webfile, msg = urlretrieve(url, output_filepath,reporthook=self.reporthookFunction)
        if self.suppress == 'no': print ''
        self.testFile()
        if self.suppress == 'no': print self.status

        if 'Internet' not in self.status:
            if '.zip' in filename:
                if self.suppress == 'no': print "Extracting zip file...",
                try: decompressZipStackOverflow(filename,dir); status = 'completed'
                except Exception:
                    #print 'Native unzip not present...trying python unzip methods...'
                    status = unzipFiles(filename,dir)
                    if status == 'failed': print 'zip extraction failed!'
                self.gz_filepath = filepath(output_filepath); self.status = 'remove'
                if self.suppress == 'no': print "zip file extracted"
            elif '.gz' in filename:
                self.gz_filepath = output_filepath
                if len(file_type)==0: extension = '.gz'
                else: extension = 'gz'
                decompressed_filepath = string.replace(self.gz_filepath,extension,file_type)
                ### Below code can be too memory intensive
                #file_size = os.path.getsize(output_filepath)
                #megabytes = file_size/1000000.00
                #if megabytes>5000: force_error ### force_error is an undefined variable which causes an exception
                import gzip; content = gzip.GzipFile(self.gz_filepath, 'rb')
                data = open(decompressed_filepath,'wb')
                #print "\nExtracting downloaded file:",self.gz_filepath
                import shutil; shutil.copyfileobj(content,data)
                # http://pythonicprose.blogspot.com/2009/10/python-extract-or-unzip-tar-file.html
                os.chdir(filepath(dir))
                if '.tar' in decompressed_filepath:
                    import tarfile
                    tfile = tarfile.open(decompressed_filepath)
                    tfile.extractall()
                    tfile.close()
                    tar_dir = string.replace(decompressed_filepath,'.tar','')
                self.status = 'remove'
            else: self.gz_filepath = ''; self.status = 'remove'
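Note: the example above is Python 2 (print statements, urllib.urlretrieve) and depends on AltAnalyze's own export/filepath helpers, so it will not run standalone. Below is a minimal Python 3 sketch of the same download-then-decompress pattern using only the standard library; the function name and extension handling are illustrative, not AltAnalyze's API.

import gzip, os, shutil, tarfile, zipfile
import urllib.request

def download_and_decompress(url, dest_dir):
    """Download url into dest_dir and unpack .zip/.tar.gz/.gz archives."""
    filename = url.split('/')[-1]
    os.makedirs(dest_dir, exist_ok=True)
    output_path = os.path.join(dest_dir, filename)
    urllib.request.urlretrieve(url, output_path)
    if filename.endswith('.zip'):
        with zipfile.ZipFile(output_path) as zf:
            zf.extractall(dest_dir)
    elif filename.endswith('.tar.gz'):
        with tarfile.open(output_path, 'r:gz') as tf:
            tf.extractall(dest_dir)
    elif filename.endswith('.gz'):
        with gzip.open(output_path, 'rb') as src, \
             open(output_path[:-3], 'wb') as dst:
            shutil.copyfileobj(src, dst)  # streamed copy keeps memory use flat
    return output_path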
Code example #2
File: FeatureAlignment.py Project: wuxue/altanalyze
def exportProbesetDomainMappings(species,array_type,indirect_mapping,probeset_domain_match_db):            
    if (array_type == 'junction' or array_type == 'RNASeq') and data_type != 'null':
        export_file = "AltDatabase/"+species+"/"+array_type+"/"+data_type+"/"+species+"_Ensembl_"+indirect_mapping+"domain_aligning_probesets.txt" 
    else:
        export_file = "AltDatabase/"+species+"/"+array_type+"/"+species+"_Ensembl_"+indirect_mapping+"domain_aligning_probesets.txt"                       
    data = export.createExportFile(export_file,"AltDatabase/"+species+"/"+array_type)
    data.write('Probeset\tInterPro-Description\n')
    for probeset in probeset_domain_match_db:
        domain_info_list = probeset_domain_match_db[probeset]
        for ipd in domain_info_list: data.write(probeset+'\t'+ipd+'\n')
    data.close()
    print "Direct probeset to domain associations exported to:",export_file
Code example #3
def outputSummaryResults(summary_results_db,name,analysis_method,root_dir):
    #summary_results_db[dataset_name] = udI,udI-up_diff,ddI,ddI-down_diff,udI_mx,udI_mx-mx_diff,up_dI_genes,down_gene, annotation_list
    annotation_db = {}
    for dataset in summary_results_db:
        for entry in summary_results_db[dataset][-1]:
            annotation = entry[0]
            count = entry[1]
            if 'AA:' not in annotation:
                try: annotation_db[annotation].append((dataset,count))
                except KeyError: annotation_db[annotation] = [(dataset,count)]
    annotation_ls = []

    for annotation in annotation_db: annotation_ls.append(annotation)
    annotation_ls.sort()
    annotation_db2={}
    for annotation in annotation_ls:
        for dataset in summary_results_db:
            y=0
            for entry in summary_results_db[dataset][-1]:
                annotation2 = entry[0]
                count = entry[1]
                if annotation2 == annotation:
                    y=1; new_count = count
            if y == 1:
                try: annotation_db2[dataset].append((annotation,new_count))
                except KeyError: annotation_db2[dataset] = [(annotation,new_count)]
            else:
                try: annotation_db2[dataset].append((annotation,0))
                except KeyError: annotation_db2[dataset] = [(annotation,0)]
      
    summary_output = root_dir+'AltResults/AlternativeOutput/'+analysis_method+'-summary-results'+name+'.txt'
    fn=filepath(summary_output)
    data = export.createExportFile(summary_output,'AltResults/AlternativeOutput')
    if analysis_method == 'splicing-index' or analysis_method == 'FIRMA':
        event_type1 = 'inclusion-events'; event_type2 = 'exclusion-events'; event_type3 = 'alternative-exons'
    else:
        event_type1 = 'inclusion-events'; event_type2 = 'exclusion-events'; event_type3 = 'mutually-exlusive-events'
    title = 'Dataset-name' +'\t'+ event_type1+'\t'+event_type2 +'\t'+ event_type3 +'\t'+ 'up-deltaI-genes' +'\t'+ 'down-deltaI-genes' +'\t'+ 'total-'+analysis_method+'-genes'
    title = title +'\t' + 'upregulated_genes' +'\t'+ 'downregulated_genes' +'\t'+ analysis_method+'-genes-differentially-exp'+'\t'+ 'RNA_processing/binding-factors-upregulated' +'\t'+ 'RNA_processing/binding-factors-downregulated' +'\t'+ analysis_method+'_RNA_processing/binding-factors'
    title = title +'\t'+ 'avg-downregulated-peptide-length' +'\t'+ 'std-downregulated-peptide-length' +'\t'+ 'avg-upregulated-peptide-length' +'\t'+ 'std-upregulated-peptide-length' +'\t'+ 'ttest-peptide-length' +'\t'+ 'median-peptide-length-fold-change'

    for entry in annotation_ls: title = title +'\t'+ entry
    data.write(title+'\n')
    for dataset in summary_results_db:
        values = dataset
        for entry in summary_results_db[dataset][0:-1]: values = values +'\t'+ str(entry)
        if dataset in annotation_db2:
            for entry in annotation_db2[dataset]: values = values +'\t'+ str(entry[1])
        data.write(values+'\n')
    data.close()
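Building annotation_db2 above rescans every dataset's annotation list once per annotation. Below is a sketch of the same dataset-by-annotation count matrix built with a lookup dictionary instead; names are illustrative, and it assumes summary_results_db[dataset][-1] is a list of (annotation, count) tuples, as the comment at the top of the function states.

def build_annotation_matrix(summary_results_db, annotation_ls):
    """Per dataset, list (annotation, count) in a fixed annotation order,
    with 0 where the dataset lacks that annotation."""
    matrix = {}
    for dataset, results in summary_results_db.items():
        counts = dict(results[-1])  # annotation -> count for this dataset
        matrix[dataset] = [(a, counts.get(a, 0)) for a in annotation_ls]
    return matrix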
Code example #4
def exportData(gene_db,data_type,species):
    export_file = 'AltDatabase/ensembl/SubGeneViewer/'+species+'/Xport_sgv_'+data_type+'.csv'
    if data_type == 'feature': title = 'gene'+'\t'+'symbol'+'\t'+'sgv_feature'+'\n'
    if data_type == 'structure': title = 'gene'+'\t'+'symbol'+'\t'+'sgv_structure'+'\n'
    if data_type == 'splice': title = 'gene'+'\t'+'symbol'+'\t'+'sgv_splice'+'\n'
    data = export.createExportFile(export_file,'AltDatabase/ensembl/SubGeneViewer/'+species)
    #fn=filepath(export_file); data = open(fn,'w')
    data.write(title)
    for gene in gene_db:
        try:
            symbol = gene_symbol_db[gene]
            value_str_list = gene_db[gene]
            value_str = string.join(value_str_list,',')
            values = string.join([gene,symbol,value_str],'\t')+'\n'; data.write(values)
        except KeyError: null = []
    data.close()
    print "exported to",export_file
Code example #5
    def __init__(self,url,dir,file_type):
        """Copy the contents of a file from a given URL to a local file."""
        filename = url.split('/')[-1]
        if len(file_type) == 2: filename, file_type = file_type ### Added this feature for when a file has an invalid filename
        output_filepath_object = export.createExportFile(dir+filename,dir[:-1])
        output_filepath = filepath(dir+filename)

        print "Downloading the following file:",filename,' ',
        self.original_increment = 10
        self.increment = 0
        import urllib
        from urllib import urlretrieve
        try:
            try: webfile, msg = urlretrieve(url,output_filepath,reporthook=self.reporthookFunction)
            except IOError:
                if 'Binary' in traceback.format_exc(): #IOError: [Errno ftp error] 200 Switching to Binary mode.
                    ### https://bugs.python.org/issue1067702 - some machines the socket doesn't close and causes an error - reload to close the socket
                    reload(urllib)
                    webfile, msg = urlretrieve(url,output_filepath,reporthook=self.reporthookFunction)
                    reload(urllib)
        except:
            print 'Unknown URL error encountered...'; forceURLError
        print ''
        print "\nFile downloaded to:",output_filepath
        if '.zip' in filename:
            try: decompressZipStackOverflow(filename,dir); status = 'completed'
            except Exception:
                status = unzipFiles(filename,dir)
                if status == 'failed': print 'Zip Extraction failed'
            self.gz_filepath = filepath(output_filepath); self.status = 'remove'
            print "zip file extracted..."
        elif '.gz' in filename:
            self.gz_filepath = output_filepath
            if len(file_type)==0: extension = '.gz'
            else: extension = 'gz'
            decompressed_filepath = string.replace(self.gz_filepath,extension,file_type)
            ### Below code can be too memory intensive
            #file_size = os.path.getsize(output_filepath)
            #megabytes = file_size/1000000.00
            #if megabytes>5000: force_error ### force_error is an undefined variable which causes an exception
            import gzip; content = gzip.GzipFile(self.gz_filepath, 'rb')
            data = open(decompressed_filepath,'wb')
            #print "\nExtracting downloaded file:",self.gz_filepath
            import shutil; shutil.copyfileobj(content,data)
            self.status = 'remove'
        else: self.gz_filepath = ''; self.status = 'NA'
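The nested try blocks above work around an FTP quirk ("[Errno ftp error] 200 Switching to Binary mode", see https://bugs.python.org/issue1067702): reloading urllib drops the stale socket, after which the download is retried once. A generic retry sketch of the same idea in Python 3 terms follows; whether the reload trick is still needed on Python 3 is untested, so this only mirrors the original's intent.

import importlib, traceback
import urllib
import urllib.request

def fetch_with_retry(url, output_path, reporthook=None):
    """Retry once after the FTP 'Switching to Binary mode' IOError."""
    try:
        return urllib.request.urlretrieve(url, output_path, reporthook)
    except IOError:
        if 'Binary' not in traceback.format_exc():
            raise
        importlib.reload(urllib)  # drop the lingering FTP socket
        return urllib.request.urlretrieve(url, output_path, reporthook)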
Code example #6
def exportData(gene_db, data_type, species):
    export_file = "AltDatabase/ensembl/SubGeneViewer/" + species + "/Xport_sgv_" + data_type + ".csv"
    if data_type == "feature":
        title = "gene" + "\t" + "symbol" + "\t" + "sgv_feature" + "\n"
    if data_type == "structure":
        title = "gene" + "\t" + "symbol" + "\t" + "sgv_structure" + "\n"
    if data_type == "splice":
        title = "gene" + "\t" + "symbol" + "\t" + "sgv_splice" + "\n"
    data = export.createExportFile(export_file, "AltDatabase/ensembl/SubGeneViewer/" + species)
    # fn=filepath(export_file); data = open(fn,'w')
    data.write(title)
    for gene in gene_db:
        try:
            symbol = gene_symbol_db[gene]
            value_str_list = gene_db[gene]
            value_str = string.join(value_str_list, ",")
            values = string.join([gene, symbol, value_str], "\t") + "\n"
            data.write(values)
        except KeyError:
            null = []
    data.close()
    print "exported to", export_file
Code example #7
File: download.py Project: wuxue/altanalyze
 def __init__(self,url,dir,file_type):
     """Copy the contents of a file from a given URL to a local file."""
     filename = url.split('/')[-1]
     if len(file_type) == 2: filename, file_type = file_type ### Added this feature for when a file has an invalid filename
     output_filepath_object = export.createExportFile(dir+filename,dir[:-1])
     output_filepath = filepath(dir+filename)
     
     print "Downloading the following file:",filename,' ',
     self.original_increment = 10
     self.increment = 0
     from urllib import urlretrieve
     webfile, msg = urlretrieve(url, output_filepath,reporthook=self.reporthookFunction)
     print ''
     print "\nFile downloaded to:",output_filepath
     if '.zip' in filename:
         try: decompressZipStackOverflow(filename,dir); status = 'completed'
         except Exception:
             status = unzipFiles(filename,dir)
             if status == 'failed': print 'Zip Extraction failed'
         self.gz_filepath = filepath(output_filepath); self.status = 'remove'
         print "zip file extracted..."
     elif '.gz' in filename:
         self.gz_filepath = output_filepath
         if len(file_type)==0: extension = '.gz'
         else: extension = 'gz'
         decompressed_filepath = string.replace(self.gz_filepath,extension,file_type)
         ### Below code can be too memory intensive
         #file_size = os.path.getsize(output_filepath)
         #megabytes = file_size/1000000.00
         #if megabytes>5000: force_error ### force_error is an undefined variable which causes an exception
         import gzip; content = gzip.GzipFile(self.gz_filepath, 'rb')
         data = open(decompressed_filepath,'wb')
         #print "\nExtracting downloaded file:",self.gz_filepath
         import shutil; shutil.copyfileobj(content,data)
         self.status = 'remove'
     else: self.gz_filepath = ''; self.status = 'NA'
Code example #8
File: FilterDabg.py Project: venkatmi/altanalyze
def parse_input_data(filename,data_type):
    fn=filepath(filename); first_line = 1; array_group_name_db = {}; z=0; array_group_db = {}; output_file = []
    #print "Reading",filename
    secondary_data_type = export.getParentDir(filename) ### e.g., expression or counts
    
    for line in open(fn,'rU').xreadlines():
      data = cleanUpLine(line); t = string.split(data,'\t'); probeset = t[0]; z+=1
      if first_line == 1:
          first_line = 0 #makes this value null for the next loop of actual array data
          ###Below occurs if the data is raw as opposed to precomputed
          if data_type == 'export':
              if array_type == 'exon': folder = 'ExonArray'+'/'+species + '/'
              elif array_type == 'gene': folder = 'GeneArray'+'/'+species + '/'
              elif array_type == 'junction': folder = 'JunctionArray'+'/'+species + '/'
              elif array_type == 'RNASeq': folder = 'RNASeq'+'/'+species + '/'
              else: folder = array_type + '/'
              parent_path = root_dir+'AltExpression/'+folder
              if array_type == 'RNASeq':
                  output_file =  altanalzye_input[0:-4] + '.ExpCutoff-' + str(original_exp_threshold) +'_'+ filter_method+'.txt'
              else:
                  output_file = altanalzye_input[0:-4] + '.p' + str(int(100*p)) +'_'+ filter_method+'.txt'
              output_file_dir = parent_path+output_file
              print "...Exporting",output_file_dir
              export_data = export.createExportFile(output_file_dir,root_dir+'AltExpression/'+folder)
              fn=filepath(output_file_dir); export_data = open(fn,'w');
              export_data.write(line)
          if ':' in t[1]:
              array_group_list = []; x=0 ###gives us an original index value for each entry in the group
              for entry in t[1:]:
                  array_group,array_name = string.split(entry,':')
                  try:
                      array_group_db[array_group].append(x)
                      array_group_name_db[array_group].append(array_name)
                  except KeyError:
                      array_group_db[array_group] = [x]
                      array_group_name_db[array_group] = [array_name]
                      ### below only occurs with a new group addition
                      array_group_list.append(array_group) #use this to generate comparisons in the below linked function
                  x += 1
          #print '##### array_group_list',array_group_list
      elif len(probeset)>0 and data_type != 'export':
          ###Use the index values from above to assign each expression value to a new database
          temp_group_array={}; array_index_list = []  ###Use this list for permutation analysis
          for group in array_group_db:
              #array_index_list.append(array_group_db[group])
              group_values = []
              for array_index in array_group_db[group]:
                  try: exp_val = float(t[array_index+1])
                  except IndexError: print t, z,'\n',array_index,'\n',group, probeset;kill
                  group_values.append(exp_val)
              avg_stat = statistics.avg(group_values)

              if data_type == 'expression':
                  ###If non-log array data
                  if exp_data_format == 'non-log':
                      ### This works better for RNASeq as opposed to log transforming and then filtering which is more stringent and different than the filtering in ExonArray().
                      if array_type == 'RNASeq':
                        if normalization_method == 'RPKM' and secondary_data_type == 'expression':
                            if ':I' in probeset: k=1 ### Don't require an RPKM threshold for intron IDs (these will likely never meet this unless small or fully retained and highly expressed)
                            elif ':' not in probeset:
                                if avg_stat>=gene_rpkm_threshold: k=1
                                else: k=0
                            elif avg_stat>=exon_rpkm_threshold: k=1
                            elif '-' in probeset: k=1 ### Don't consider RPKM for junctions, just counts
                            else: k=0
                            #if 'ENSMUSG00000045991:E2.2' in probeset: print [probeset, normalization_method, secondary_data_type, gene_rpkm_threshold, avg_stat, k]
                        else: ### Otherwise, we are looking at count data
                            if '-' in probeset: ### junction meeting minimum read-count number
                                if avg_stat>=junction_exp_threshold: k=1 ### junction_exp_threshold is the same as nonlog_exp_threshold
                                else: k=0
                            elif ':' not in probeset:
                                if avg_stat>=gene_exp_threshold: k=1
                                else: k=0
                            else: ### exon or intron meeting minimum read-count number
                                if avg_stat>=exon_exp_threshold: k=1
                                else: k=0
                            #if 'ENSMUSG00000045991:E2.2' in probeset: print [probeset, normalization_method, secondary_data_type, exon_exp_threshold, junction_exp_threshold, avg_stat, k]
                      else:
                        if avg_stat>=nonlog_exp_threshold: k=1
                        else: k=0
                  elif avg_stat>=log_expression_threshold: k=1
                  else: k=0
                  if normalization_method == 'RPKM' and secondary_data_type == 'expression': ### Treat as dabg p-value
                      try: pvalue_status_db[probeset].append(k)
                      except KeyError: pvalue_status_db[probeset] = [k]
                  else:
                      try: expression_status_db[probeset].append(k)
                      except KeyError: expression_status_db[probeset] = [k]
                  #if probeset == '3209315': print [group],k,len(group_values),array_group_list
              if data_type == 'p-value':
                  if avg_stat<=p: k=1
                  else: k=0
                  #if 'G7216513_a_at' in probeset: print k, avg_stat
                  try: pvalue_status_db[probeset].append(k)
                  except KeyError: pvalue_status_db[probeset] = [k]
      elif data_type == 'export':
          if exp_data_format == 'non-log':
              ### This code was added in version 1.16 in conjunction with a switch from logstatus to
              ### non-log in AltAnalyze to prevent "Process AltAnalyze Filtered" associated errors
              exp_values = t[1:]; exp_values_log2=[]
              for exp_val in exp_values:
                  exp_values_log2.append(str(math.log(float(exp_val),2))) ### exp_val+=1 was removed in 2.0.5
              line = string.join([probeset]+exp_values_log2,'\t')+'\n'
          try: null = export_db[probeset]; export_data.write(line)
          except KeyError: null = [] ### occurs if not a probeset to include in the filtered results export file
    if data_type == 'export': export_data.close()
    return output_file
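The heart of the RNASeq branch is deciding pass/fail (k) from the feature type encoded in the ID: ':I' marks introns, '-' marks junctions, IDs without ':' are genes, and everything else is an exon. Below is a behavior-equivalent restatement of the RPKM case as a sketch; the threshold names mirror the globals used above, and the branch order is rearranged but yields the same results.

def passes_rpkm_filter(feature_id, avg_rpkm,
                       gene_rpkm_threshold, exon_rpkm_threshold):
    """Introns and junctions always pass; genes and exons need their threshold."""
    if ':I' in feature_id:              # intron: no RPKM requirement
        return True
    if ':' not in feature_id:           # gene
        return avg_rpkm >= gene_rpkm_threshold
    if '-' in feature_id:               # junction: judged on counts instead
        return True
    return avg_rpkm >= exon_rpkm_threshold  # exon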
Code example #9
def exportTransitResults(array_group_list,array_raw_group_values,array_group_db,avg_const_exp_db,adj_fold_dbase,exon_db,dataset_name,apt_dir):
    """Export processed raw expression values (e.g. add global fudge factor or eliminate probe sets based on filters) to txt files
    for analysis with MiDAS"""
    #array_group_list contains group names in order of analysis
    #array_raw_group_values contains expression values for the x number of groups in above list
    #array_group_db key is the group name and values are the list of array names
    #avg_const_exp_db contains the average expression values for all arrays for all constitutive probesets, with gene as the key

    ordered_array_header_list=[]
    for group in array_group_list: ###contains the correct order for each group
        for array_id in array_group_db[group]:
            ordered_array_header_list.append(str(array_id))
    ordered_exp_val_db = {} ###new dictionary containing all expression values together, but organized based on group
    probeset_affygene_db = {} ###lists all altsplice probesets and corresponding affygenes
    for probeset in array_raw_group_values:
        try:
            include_probeset = 'yes'
            ###Examines user input parameters for inclusion of probeset types in the analysis
            if include_probeset == 'yes':
                if probeset in adj_fold_dbase: ###indicates that this probeset is analyzed for splicing (e.g. has a constitutive probeset)
                    for group_val_list in array_raw_group_values[probeset]:
                        non_log_group_exp_vals = statistics.log_fold_conversion(group_val_list)
                        for val in non_log_group_exp_vals:
                            try: ordered_exp_val_db[probeset].append(str(val))
                            except KeyError: ordered_exp_val_db[probeset] = [str(val)]
                    affygene = exon_db[probeset].GeneID()
                    try: probeset_affygene_db[affygene].append(probeset)
                    except KeyError: probeset_affygene_db[affygene] = [probeset]
        except KeyError:
            ###Indicates that the expression dataset file was not filtered for whether annotations exist in the exon annotation file
            ###In that case, just ignore the entry
            null = ''

    gene_count = 0
    ordered_gene_val_db={}
    for affygene in avg_const_exp_db: ###now, add all constitutive gene level expression values (only per analyzed gene)
        if affygene in probeset_affygene_db: ###ensures we only include gene data where there are altsplice examined probesets
            non_log_ordered_exp_const_val = statistics.log_fold_conversion(avg_const_exp_db[affygene])
            gene_count+=1
            for val in non_log_ordered_exp_const_val:
                try: ordered_gene_val_db[affygene].append(str(val))
                except KeyError: ordered_gene_val_db[affygene] = [str(val)]

    convert_probesets_to_numbers={}
    convert_affygene_to_numbers={}; array_type = 'junction'
    probeset_affygene_number_db={}; x=0; y=0
    for affygene in probeset_affygene_db:
        x+=1; y = x  ###each affygene gets a number unique from all other affygenes and probesets; probesets count up from each affygene
        x_copy = x
        example_gene_probeset = probeset_affygene_db[affygene][0]
        #if exon_db[example_gene_probeset].ArrayType() == 'exon': x_copy = exon_db[example_gene_probeset].SecondaryGeneID()
        if x_copy not in exon_db:
            convert_affygene_to_numbers[affygene] = str(x_copy)
        else: print affygene, x_copy,'new numeric for MIDAS already exists as a probeset ID number'; kill
        for probeset in probeset_affygene_db[affygene]:
            y = y+1; y_copy = y
            if exon_db[probeset].ArrayType() == 'exon':
                y_copy = probeset ### Only appropriate when the probeset ID is a number
                array_type = 'exon'
            convert_probesets_to_numbers[probeset] = str(y_copy)
            try: probeset_affygene_number_db[str(x_copy)].append(str(y_copy))
            except KeyError: probeset_affygene_number_db[str(x_copy)] = [str(y_copy)]
        x=y
    
    metafile = 'AltResults/MIDAS/meta-'+dataset_name[0:-1]+'.txt'
    data1 = export.createExportFile(metafile,'AltResults/MIDAS')
    title = 'probeset_id\ttranscript_cluster_id\tprobeset_list\tprobe_count\n'    
    data1.write(title)
    for affygene in probeset_affygene_number_db:
        probeset_list = probeset_affygene_number_db[affygene]; probe_number = str(len(probeset_list)*6)
        probeset_list = [string.join(probeset_list,' ')]
        probeset_list.append(affygene); probeset_list.append(affygene); probeset_list.reverse(); probeset_list.append(probe_number)
        probeset_list = string.join(probeset_list,'\t'); probeset_list=probeset_list+'\n'
        data1.write(probeset_list)
    data1.close()

    junction_exp_file = 'AltResults/MIDAS/'+array_type+'-exp-'+dataset_name[0:-1]+'.txt'
    fn2=filepath(junction_exp_file)
    data2 = open(fn2,'w')
    ordered_array_header_list.reverse(); ordered_array_header_list.append('probeset_id'); ordered_array_header_list.reverse()
    title = string.join(ordered_array_header_list,'\t')
    data2.write(title+'\n')
    for probeset in ordered_exp_val_db:
        probeset_number = convert_probesets_to_numbers[probeset]
        exp_values = ordered_exp_val_db[probeset]; exp_values.reverse(); exp_values.append(probeset_number); exp_values.reverse()
        exp_values = string.join(exp_values,'\t'); exp_values = exp_values +'\n'
        data2.write(exp_values)
    data2.close()

    gene_exp_file = 'AltResults/MIDAS/gene-exp-'+dataset_name[0:-1]+'.txt'
    fn3=filepath(gene_exp_file)
    data3 = open(fn3,'w')
    title = string.join(ordered_array_header_list,'\t')
    data3.write(title+'\n')
    for affygene in ordered_gene_val_db:
        try: affygene_number = convert_affygene_to_numbers[affygene]
        except KeyError: print len(convert_affygene_to_numbers), len(ordered_gene_val_db); kill
        exp_values = ordered_gene_val_db[affygene]; exp_values.reverse(); exp_values.append(affygene_number); exp_values.reverse()
        exp_values = string.join(exp_values,'\t'); exp_values = exp_values +'\n'
        data3.write(exp_values)
    data3.close()

    exportMiDASArrayNames(array_group_list,array_group_db,dataset_name,'new')
    
    coversionfile = 'AltResults/MIDAS/probeset-conversion-'+dataset_name[0:-1]+'.txt'
    fn5=filepath(coversionfile)
    data5 = open(fn5,'w')
    title = 'probeset\tprobeset_number\n'; data5.write(title)
    for probeset in convert_probesets_to_numbers: ###contains the correct order for each group
        probeset_number = convert_probesets_to_numbers[probeset]
        values = probeset+'\t'+probeset_number+'\n'
        data5.write(values)
    data5.close()

    """
    ### This code is obsolete... used before AltAnalyze could connect to APT directly.
    commands = 'AltResults/MIDAS/commands-'+dataset_name[0:-1]+'.txt'
    data = export.createExportFile(commands,'AltResults/MIDAS')
    path = filepath('AltResults/MIDAS'); path = string.replace(path,'\\','/'); path = 'cd '+path+'\n\n'
    metafile = 'meta-'+dataset_name[0:-1]+'.txt'
    junction_exp_file = array_type+'-exp-'+dataset_name[0:-1]+'.txt'
    gene_exp_file = 'gene-exp-'+dataset_name[0:-1]+'.txt'
    celfiles = 'celfiles-'+dataset_name[0:-1]+'.txt'
    command_line = 'apt-midas -c '+celfiles+' -g '+gene_exp_file+' -e '+junction_exp_file+' -m '+metafile+' -o '+dataset_name[0:-1]+'-output'
    data.write(path); data.write(command_line); data.close()
    """

    status = runMiDAS(apt_dir,array_type,dataset_name,array_group_list,array_group_db)    
    return status
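The repeated reverse()/append()/reverse() sequence above is a Python 2-era idiom for prepending an ID to a row before tab-joining it. A clearer equivalent is sketched below and could serve the data1/data2/data3 writes the same way; the helper name is made up.

def write_row(out, row_id, values):
    """Prepend row_id and emit one tab-delimited line."""
    out.write('\t'.join([str(row_id)] + [str(v) for v in values]) + '\n')

# e.g., instead of exp_values.reverse(); exp_values.append(probeset_number); exp_values.reverse():
# write_row(data2, probeset_number, ordered_exp_val_db[probeset])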
Code example #10
File: update.py Project: cwt1/altanalyze
    def __init__(self,url,dir,file_type):
        try: self.suppress = suppress_printouts
        except Exception: self.suppress = 'no'
        """Copy the contents of a file from a given URL to a local file."""
        filename = url.split('/')[-1]; self.status = ''
        #print [url, dir, file_type]
        #dir = unique.filepath(dir) ### Can screw up directory structures
        if file_type == None: file_type =''
        if len(file_type) == 2: filename, file_type = file_type ### Added this feature for when a file has an invalid filename
        output_filepath_object = export.createExportFile(dir+filename,dir[:-1])
        output_filepath = filepath(dir+filename); self.output_filepath = output_filepath
        
        if self.suppress == 'no':
            print "Downloading the following file:",filename,' ',
        self.original_increment = 5
        self.increment = 0
        import urllib
        from urllib import urlretrieve
        #if 'gene.txt.gz' in url: print [self.reporthookFunction];sys.exit()
        try:
            try: webfile, msg = urlretrieve(url,output_filepath,reporthook=self.reporthookFunction)
            except IOError:
                if 'Binary' in traceback.format_exc(): #IOError: [Errno ftp error] 200 Switching to Binary mode.
                    ### https://bugs.python.org/issue1067702 - some machines the socket doesn't close and causes an error - reload to close the socket
                    reload(urllib)
                    webfile, msg = urlretrieve(url,output_filepath,reporthook=self.reporthookFunction)
                    reload(urllib)
        except:
            print 'Unknown URL error encountered...'; forceURLError

        if self.suppress == 'no': print ''
        self.testFile()
        if self.suppress == 'no': print self.status

        if 'Internet' not in self.status:
            if '.zip' in filename:
                if self.suppress == 'no': print "Extracting zip file...",
                try: decompressZipStackOverflow(filename,dir); status = 'completed'
                except Exception:
                    #print 'Native unzip not present...trying python unzip methods...'
                    status = unzipFiles(filename,dir)
                    if status == 'failed': print 'zip extraction failed!'
                self.gz_filepath = filepath(output_filepath); self.status = 'remove'
                if self.suppress == 'no': print "zip file extracted"
            elif '.gz' in filename:
                self.gz_filepath = output_filepath
                if len(file_type)==0: extension = '.gz'
                else: extension = 'gz'
                decompressed_filepath = string.replace(self.gz_filepath,extension,file_type)
                ### Below code can be too memory intensive
                #file_size = os.path.getsize(output_filepath)
                #megabytes = file_size/1000000.00
                #if megabytes>5000: force_error ### force_error is an undefined variable which causes an exception
                import gzip; content = gzip.GzipFile(self.gz_filepath, 'rb')
                data = open(decompressed_filepath,'wb')
                #print "\nExtracting downloaded file:",self.gz_filepath
                import shutil; shutil.copyfileobj(content,data)
                # http://pythonicprose.blogspot.com/2009/10/python-extract-or-unzip-tar-file.html
                os.chdir(filepath(dir))
                if '.tar' in decompressed_filepath:
                    import tarfile
                    tfile = tarfile.open(decompressed_filepath)
                    tfile.extractall()
                    tfile.close()
                    tar_dir = string.replace(decompressed_filepath,'.tar','')
                self.status = 'remove'
            else: self.gz_filepath = ''; self.status = 'remove'
Code example #11
def exportTransitResults(array_group_list, array_raw_group_values,
                         array_group_db, avg_const_exp_db, adj_fold_dbase,
                         exon_db, dataset_name, apt_dir):
    """Export processed raw expression values (e.g. add global fudge factor or eliminate probe sets based on filters) to txt files
    for analysis with MiDAS"""
    #array_group_list contains group names in order of analysis
    #array_raw_group_values contains expression values for the x number of groups in above list
    #array_group_db key is the group name and values are the list of array names
    #avg_const_exp_db contains the average expression values for all arrays for all constitutive probesets, with gene as the key

    ordered_array_header_list = []
    for group in array_group_list:  ###contains the correct order for each group
        for array_id in array_group_db[group]:
            ordered_array_header_list.append(str(array_id))
    ordered_exp_val_db = {
    }  ###new dictionary containing all expression values together, but organized based on group
    probeset_affygene_db = {
    }  ###lists all altsplice probesets and corresponding affygenes
    for probeset in array_raw_group_values:
        try:
            include_probeset = 'yes'
            ###Examines user input parameters for inclusion of probeset types in the analysis
            if include_probeset == 'yes':
                if probeset in adj_fold_dbase:  ###indicates that this probeset is analyzed for splicing (e.g. has a constitutive probeset)
                    for group_val_list in array_raw_group_values[probeset]:
                        non_log_group_exp_vals = statistics.log_fold_conversion(
                            group_val_list)
                        for val in non_log_group_exp_vals:
                            try:
                                ordered_exp_val_db[probeset].append(str(val))
                            except KeyError:
                                ordered_exp_val_db[probeset] = [str(val)]
                    affygene = exon_db[probeset].GeneID()
                    try:
                        probeset_affygene_db[affygene].append(probeset)
                    except KeyError:
                        probeset_affygene_db[affygene] = [probeset]
        except KeyError:
            ###Indicates that the expression dataset file was not filtered for whether annotations exist in the exon annotation file
            ###In that case, just ignore the entry
            null = ''

    gene_count = 0
    ordered_gene_val_db = {}
    for affygene in avg_const_exp_db:  ###now, add all constitutive gene level expression values (only per analyzed gene)
        if affygene in probeset_affygene_db:  ###ensures we only include gene data where there are altsplice examined probesets
            non_log_ordered_exp_const_val = statistics.log_fold_conversion(
                avg_const_exp_db[affygene])
            gene_count += 1
            for val in non_log_ordered_exp_const_val:
                try:
                    ordered_gene_val_db[affygene].append(str(val))
                except KeyError:
                    ordered_gene_val_db[affygene] = [str(val)]

    convert_probesets_to_numbers = {}
    convert_affygene_to_numbers = {}
    array_type = 'junction'
    probeset_affygene_number_db = {}
    x = 0
    y = 0
    for affygene in probeset_affygene_db:
        x += 1
        y = x  ###each affygene gets a number unique from all other affygenes and probesets; probesets count up from each affygene
        x_copy = x
        example_gene_probeset = probeset_affygene_db[affygene][0]
        #if exon_db[example_gene_probeset].ArrayType() == 'exon': x_copy = exon_db[example_gene_probeset].SecondaryGeneID()
        if x_copy not in exon_db:
            convert_affygene_to_numbers[affygene] = str(x_copy)
        else:
            print affygene, x_copy, 'new numeric for MIDAS already exists as a probeset ID number'
            kill
        for probeset in probeset_affygene_db[affygene]:
            y = y + 1
            y_copy = y
            if exon_db[probeset].ArrayType() == 'exon':
                y_copy = probeset  ### Only appropriate when the probeset ID is a number
                array_type = 'exon'
            convert_probesets_to_numbers[probeset] = str(y_copy)
            try:
                probeset_affygene_number_db[str(x_copy)].append(str(y_copy))
            except KeyError:
                probeset_affygene_number_db[str(x_copy)] = [str(y_copy)]
        x = y

    metafile = 'AltResults/MIDAS/meta-' + dataset_name[0:-1] + '.txt'
    data1 = export.createExportFile(metafile, 'AltResults/MIDAS')
    title = 'probeset_id\ttranscript_cluster_id\tprobeset_list\tprobe_count\n'
    data1.write(title)
    for affygene in probeset_affygene_number_db:
        probeset_list = probeset_affygene_number_db[affygene]
        probe_number = str(len(probeset_list) * 6)
        probeset_list = [string.join(probeset_list, ' ')]
        probeset_list.append(affygene)
        probeset_list.append(affygene)
        probeset_list.reverse()
        probeset_list.append(probe_number)
        probeset_list = string.join(probeset_list, '\t')
        probeset_list = probeset_list + '\n'
        data1.write(probeset_list)
    data1.close()

    junction_exp_file = 'AltResults/MIDAS/' + array_type + '-exp-' + dataset_name[
        0:-1] + '.txt'
    fn2 = filepath(junction_exp_file)
    data2 = open(fn2, 'w')
    ordered_array_header_list.reverse()
    ordered_array_header_list.append('probeset_id')
    ordered_array_header_list.reverse()
    title = string.join(ordered_array_header_list, '\t')
    data2.write(title + '\n')
    for probeset in ordered_exp_val_db:
        probeset_number = convert_probesets_to_numbers[probeset]
        exp_values = ordered_exp_val_db[probeset]
        exp_values.reverse()
        exp_values.append(probeset_number)
        exp_values.reverse()
        exp_values = string.join(exp_values, '\t')
        exp_values = exp_values + '\n'
        data2.write(exp_values)
    data2.close()

    gene_exp_file = 'AltResults/MIDAS/gene-exp-' + dataset_name[0:-1] + '.txt'
    fn3 = filepath(gene_exp_file)
    data3 = open(fn3, 'w')
    title = string.join(ordered_array_header_list, '\t')
    data3.write(title + '\n')
    for affygene in ordered_gene_val_db:
        try:
            affygene_number = convert_affygene_to_numbers[affygene]
        except KeyError:
            print len(convert_affygene_to_numbers), len(ordered_gene_val_db)
            kill
        exp_values = ordered_gene_val_db[affygene]
        exp_values.reverse()
        exp_values.append(affygene_number)
        exp_values.reverse()
        exp_values = string.join(exp_values, '\t')
        exp_values = exp_values + '\n'
        data3.write(exp_values)
    data3.close()

    exportMiDASArrayNames(array_group_list, array_group_db, dataset_name,
                          'new')

    coversionfile = 'AltResults/MIDAS/probeset-conversion-' + dataset_name[
        0:-1] + '.txt'
    fn5 = filepath(coversionfile)
    data5 = open(fn5, 'w')
    title = 'probeset\tprobeset_number\n'
    data5.write(title)
    for probeset in convert_probesets_to_numbers:  ###contains the correct order for each group
        probeset_number = convert_probesets_to_numbers[probeset]
        values = probeset + '\t' + probeset_number + '\n'
        data5.write(values)
    data5.close()
    """
    ### This code is obsolete... used before AltAnalyze could connect to APT directly.
    commands = 'AltResults/MIDAS/commands-'+dataset_name[0:-1]+'.txt'
    data = export.createExportFile(commands,'AltResults/MIDAS')
    path = filepath('AltResults/MIDAS'); path = string.replace(path,'\\','/'); path = 'cd '+path+'\n\n'
    metafile = 'meta-'+dataset_name[0:-1]+'.txt'
    junction_exp_file = array_type+'-exp-'+dataset_name[0:-1]+'.txt'
    gene_exp_file = 'gene-exp-'+dataset_name[0:-1]+'.txt'
    celfiles = 'celfiles-'+dataset_name[0:-1]+'.txt'
    command_line = 'apt-midas -c '+celfiles+' -g '+gene_exp_file+' -e '+junction_exp_file+' -m '+metafile+' -o '+dataset_name[0:-1]+'-output'
    data.write(path); data.write(command_line); data.close()
    """

    status = runMiDAS(apt_dir, array_type, dataset_name, array_group_list,
                      array_group_db)
    return status
Code example #12
File: FilterDabg.py Project: xflicsu/altanalyze
def parse_input_data(filename, data_type):
    fn = filepath(filename)
    first_line = 1
    array_group_name_db = {}
    z = 0
    array_group_db = {}
    output_file = []
    #print "Reading",filename
    secondary_data_type = export.getParentDir(
        filename)  ### e.g., expression or counts

    for line in open(fn, 'rU').xreadlines():
        data = cleanUpLine(line)
        t = string.split(data, '\t')
        probeset = t[0]
        z += 1
        if first_line == 1:
            first_line = 0  #makes this value null for the next loop of actual array data
            ###Below occurs if the data is raw as opposed to precomputed
            if data_type == 'export':
                if array_type == 'exon':
                    folder = 'ExonArray' + '/' + species + '/'
                elif array_type == 'gene':
                    folder = 'GeneArray' + '/' + species + '/'
                elif array_type == 'junction':
                    folder = 'JunctionArray' + '/' + species + '/'
                elif array_type == 'RNASeq':
                    folder = 'RNASeq' + '/' + species + '/'
                else:
                    folder = array_type + '/'
                parent_path = root_dir + 'AltExpression/' + folder
                if array_type == 'RNASeq':
                    output_file = altanalzye_input[0:-4] + '.ExpCutoff-' + str(
                        original_exp_threshold) + '_' + filter_method + '.txt'
                else:
                    output_file = altanalzye_input[0:-4] + '.p' + str(
                        int(100 * p)) + '_' + filter_method + '.txt'
                output_file_dir = parent_path + output_file
                print "...Exporting", output_file_dir
                export_data = export.createExportFile(
                    output_file_dir, root_dir + 'AltExpression/' + folder)
                fn = filepath(output_file_dir)
                export_data = open(fn, 'w')
                export_data.write(line)
            if ':' in t[1]:
                array_group_list = []
                x = 0  ###gives us an original index value for each entry in the group
                for entry in t[1:]:
                    array_group, array_name = string.split(entry, ':')
                    try:
                        array_group_db[array_group].append(x)
                        array_group_name_db[array_group].append(array_name)
                    except KeyError:
                        array_group_db[array_group] = [x]
                        array_group_name_db[array_group] = [array_name]
                        ### below only occurs with a new group addition
                        array_group_list.append(
                            array_group
                        )  #use this to generate comparisons in the below linked function
                    x += 1
            #print '##### array_group_list',array_group_list
        elif len(probeset) > 0 and data_type != 'export':
            ###Use the index values from above to assign each expression value to a new database
            temp_group_array = {}
            array_index_list = []  ###Use this list for permutation analysis
            for group in array_group_db:
                #array_index_list.append(array_group_db[group])
                group_values = []
                for array_index in array_group_db[group]:
                    try:
                        exp_val = float(t[array_index + 1])
                    except IndexError:
                        print t, z, '\n', array_index, '\n', group, probeset
                        kill
                    group_values.append(exp_val)
                avg_stat = statistics.avg(group_values)

                if data_type == 'expression':
                    ###If non-log array data
                    if exp_data_format == 'non-log':
                        ### This works better for RNASeq as opposed to log transforming and then filtering which is more stringent and different than the filtering in ExonArray().
                        if array_type == 'RNASeq':
                            if normalization_method == 'RPKM' and secondary_data_type == 'expression':
                                if ':I' in probeset:
                                    k = 1  ### Don't require an RPKM threshold for intron IDs (these will likely never meet this unless small or fully retained and highly expressed)
                                elif ':' not in probeset:
                                    if avg_stat >= gene_rpkm_threshold: k = 1
                                    else: k = 0
                                elif avg_stat >= exon_rpkm_threshold: k = 1
                                elif '-' in probeset:
                                    k = 1  ### Don't consider RPKM for junctions, just counts
                                else:
                                    k = 0
                                #if 'ENSMUSG00000045991:E2.2' in probeset: print [probeset, normalization_method, secondary_data_type, gene_rpkm_threshold, avg_stat, k]
                            else:  ### Otherwise, we are looking at count data
                                if '-' in probeset:  ### junction meeting minimum read-count number
                                    if avg_stat >= junction_exp_threshold:
                                        k = 1  ### junction_exp_threshold is the same as nonlog_exp_threshold
                                    else:
                                        k = 0
                                elif ':' not in probeset:
                                    if avg_stat >= gene_exp_threshold: k = 1
                                    else: k = 0
                                else:  ### exon or intron meeting minimum read-count number
                                    if avg_stat >= exon_exp_threshold: k = 1
                                    else: k = 0
                                #if 'ENSMUSG00000045991:E2.2' in probeset: print [probeset, normalization_method, secondary_data_type, exon_exp_threshold, junction_exp_threshold, avg_stat, k]
                        else:
                            if avg_stat >= nonlog_exp_threshold: k = 1
                            else: k = 0
                    elif avg_stat >= log_expression_threshold: k = 1
                    else: k = 0
                    if normalization_method == 'RPKM' and secondary_data_type == 'expression':  ### Treat as dabg p-value
                        try:
                            pvalue_status_db[probeset].append(k)
                        except KeyError:
                            pvalue_status_db[probeset] = [k]
                    else:
                        try:
                            expression_status_db[probeset].append(k)
                        except KeyError:
                            expression_status_db[probeset] = [k]
                    #if probeset == '3209315': print [group],k,len(group_values),array_group_list
                if data_type == 'p-value':
                    if avg_stat <= p: k = 1
                    else: k = 0
                    #if 'G7216513_a_at' in probeset: print k, avg_stat
                    try:
                        pvalue_status_db[probeset].append(k)
                    except KeyError:
                        pvalue_status_db[probeset] = [k]
        elif data_type == 'export':
            if exp_data_format == 'non-log':
                ### This code was added in version 1.16 in conjunction with a switch from logstatus to
                ### non-log in AltAnalyze to prevent "Process AltAnalyze Filtered" associated errors
                exp_values = t[1:]
                exp_values_log2 = []
                for exp_val in exp_values:
                    exp_values_log2.append(str(
                        math.log(float(exp_val),
                                 2)))  ### exp_val+=1 was removed in 2.0.5
                line = string.join([probeset] + exp_values_log2, '\t') + '\n'
            try:
                null = export_db[probeset]
                export_data.write(line)
            except KeyError:
                null = [
                ]  ### occurs if not a probeset to include in the filtered results export file
    if data_type == 'export': export_data.close()
    return output_file
Code example #13
File: ExonArray.py Project: wuxue/altanalyze
def exportGroupedComparisonProbesetData(filename,probeset_db,data_type,array_names,array_linker_db,perform_alt_analysis):
        """This function organizes the raw expression data into sorted groups, exports the organized data for all conditions and comparisons
        and calculates which probesets have groups that meet the user defined dabg and expression thresholds."""
        #comparison_filename_list=[]
        #if perform_alt_analysis != 'expression': ### User Option (removed in version 2.0 since the option prevented proper filtering)
        comparison_filename_list=[]
        probeset_dbase={}; exp_dbase={}; constitutive_gene_db={}; probeset_gene_db={} ### reset databases to conserve memory
        global expr_group_list; global comp_group_list; global expr_group_db
        if data_type == 'residuals':
            expr_group_dir = string.replace(filename,'residuals.','groups.')
            comp_group_dir = string.replace(filename,'residuals.','comps.')
        elif data_type == 'expression':
            expr_group_dir = string.replace(filename,'exp.','groups.')
            comp_group_dir = string.replace(filename,'exp.','comps.')
            if 'counts.' in filename:
                expr_group_dir = string.replace(expr_group_dir,'counts.','groups.')
                comp_group_dir = string.replace(comp_group_dir,'counts.','comps.')
                data_type = 'counts'
        elif data_type == 'dabg':
            expr_group_dir = string.replace(filename,'stats.','groups.')
            comp_group_dir = string.replace(filename,'stats.','comps.')

        comp_group_list, comp_group_list2 = ExpressionBuilder.importComparisonGroups(comp_group_dir)
        expr_group_list,expr_group_db = ExpressionBuilder.importArrayGroups(expr_group_dir,array_linker_db)

        print "Reorganizing expression data into comparison groups for export to down-stream splicing analysis software"
        ###Do this only for the header data
        group_count,raw_data_comp_headers = reorder_arrays.reorderArrayHeaders(array_names,expr_group_list,comp_group_list,array_linker_db)

        ###Export the header info and store the export write data for reorder_arrays
        global comparision_export_db; comparision_export_db={}; array_type_name = 'Exon'
        if array_type == 'junction': array_type_name = 'Junction'
        elif array_type == 'RNASeq': array_type_name = 'RNASeq'
        if data_type != 'residuals': AltAnalzye_input_dir = root_dir+"AltExpression/pre-filtered/"+data_type+'/'
        else: AltAnalzye_input_dir = root_dir+"AltExpression/FIRMA/residuals/"+array_type+'/'+species+'/' ### These files do not need to be filtered until AltAnalyze.py

        for comparison in comp_group_list2: ###loop through the list of comparisons
            group1 = comparison[0]; group2 = comparison[1]
            group1_name = expr_group_db[group1]; group2_name = expr_group_db[group2]
            comparison_filename = species+'_'+array_type_name+'_'+ group1_name + '_vs_' + group2_name + '.txt'
                
            new_file = AltAnalzye_input_dir + comparison_filename; comparison_filename_list.append(comparison_filename)
            data = export.createExportFile(new_file,AltAnalzye_input_dir[:-1])

            try: array_names = raw_data_comp_headers[comparison]
            except KeyError: print raw_data_comp_headers;kill
            title = ['UID']+array_names; title = string.join(title,'\t')+'\n'; data.write(title)
            comparision_export_db[comparison] = data ###store the export file write data so we can write after organizing
        #print filename, normalize_feature_exp
        biotypes = importExonProbesetData(filename,probeset_db,'reorderFilterAndExportAll')
        
        if normalize_feature_exp == 'RPKM': ### Add the gene-level RPKM data (this is in addition to the counts. file)
            exp_gene_db={}
            for i in probeset_db: exp_gene_db[probeset_db[i][0]]=[]
            filename = string.replace(filename,'.txt','-steady-state.txt')
            #print filename, normalize_feature_exp, 'here'
            importExonProbesetData(filename,exp_gene_db,'reorderFilterAndExportAll')
            
        for comparison in comparision_export_db:
            data = comparision_export_db[comparison]; data.close()
        print "Pairwise comparisons for AltAnalyze exported..."
        try: fulldataset_export_object.close()
        except Exception: null=[]
        return comparison_filename_list, biotypes
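The groups. and comps. companion files are derived from the expression file name by swapping its prefix marker. A compact sketch of that mapping follows; the marker list is inferred from the branches above and the helper is illustrative.

def companion_file(filename, prefix):
    """Swap the residuals./counts./exp./stats. marker for groups. or comps."""
    for marker in ('residuals.', 'counts.', 'exp.', 'stats.'):
        if marker in filename:
            return filename.replace(marker, prefix)
    return filename

# companion_file('exp.Hs_dataset.txt', 'groups.')  -> 'groups.Hs_dataset.txt'
# companion_file('stats.Hs_dataset.txt', 'comps.') -> 'comps.Hs_dataset.txt'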