Example no. 1
0
def justConvertFilenames(species, outputdir):
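    ### Rename SashimiPlot image files (.pdf/.png) in outputdir so the Ensembl gene ID
    ### prefix is replaced by the corresponding gene symbol (assumes the os and string
    ### modules are imported at module level, as in the source module)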
    import gene_associations

    gene_to_symbol = gene_associations.getGeneToUid(species, ("hide", "Ensembl-Symbol"))
    import OBO_import

    symbol_to_gene = OBO_import.swapKeyValues(gene_to_symbol)

    for filename in os.listdir(outputdir):
        if ".pdf" in filename or ".png" in filename:
            fn = string.replace(filename, ".pdf", "")
            fn = string.replace(fn, ".png", "")
            newname = string.split(fn, "__")

            if newname[0] in gene_to_symbol:
                new_filename = str(filename)
                if "__" in filename:
                    new_filename = string.split(filename, "__")[1]
                elif "\\" in filename:
                    new_filename = string.split(filename, "\\")[1]
                elif "/" in filename:
                    new_filename = string.split(filename, "/")[1]
                nnname = gene_to_symbol[newname[0]][0] + "-SashimiPlot_" + new_filename
                try:
                    os.rename(os.path.join(outputdir, filename), os.path.join(outputdir, nnname))
                except Exception:
                    pass
            else:
                continue
Example no. 2
0
def sashmi_plot_list(bamdir,eventsToVisualizeFilename,PSIFilename,events=None):
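    ### Generate SashimiPlot figures for a list of splicing events: translate supplied gene
    ### symbols to Ensembl gene IDs, read sample groups from an ExpressionInput/groups. file,
    ### then submit the events in up to three passes (PSI file, steady-state expression, PSI retry).
    ### Assumes module-level globals such as species, root_dir and steady_state_exp_file.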
    import gene_associations
    gene_to_symbol = gene_associations.getGeneToUid(species,('hide','Ensembl-Symbol'))
    import OBO_import; symbol_to_gene = OBO_import.swapKeyValues(gene_to_symbol)

    if events==None:
        splicing_events,expandedSearch = importSplicingEventsToVisualize(eventsToVisualizeFilename)
    else:
        ### Replace any ":" from the input events
        #for i in range(len(events)): events[i] = string.replace(events[i],':','__')
        expandedSearch = True
        
        for i in range(len(events)):
            gene = string.split(events[i],'__')[0]
            if gene in gene_to_symbol:
                symbol = gene_to_symbol[gene][0]
            elif 'ENS' not in gene or 'G0000' in gene:
                if gene in symbol_to_gene:
                    ensID = symbol_to_gene[gene][0]
                    symbol = gene
                    events[i] = ensID ### translate this ID to an Ensembl gene ID for proper SashimiPlot lookup
        splicing_events = events ### optionally get from supplied variable

    if len(splicing_events)==0:
        print eventsToVisualizeFilename
        forceNoCompatibleEventsInFile
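        ### referencing this undefined name raises a NameError, halting when no compatible events are found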
    
    print 'Exporting plots',
    
    ### Determine Groups for Coloring
    groups_file = 'None'
    dir_list = unique.read_directory(root_dir+'/ExpressionInput')

    for file in dir_list:
         if 'groups.' in file:
            groups_file = root_dir+'/ExpressionInput/'+file

    if groups_file != None:
        try:
            import ExpressionBuilder
            sample_group_db = ExpressionBuilder.simplerGroupImport(groups_file)
            groups=[]
            for sample in sample_group_db:
                if sample_group_db[sample] not in groups:
                    groups.append(sample_group_db[sample]) ### create an ordered list of unique groups
        except Exception:
            groups = ['None']
            #print traceback.format_exc()
            pass

    processed_events = formatAndSubmitSplicingEventsToSashimiPlot(PSIFilename, bamdir, splicing_events, sample_group_db, groups, False)
    mopup_events = getMopUpEvents(splicing_events, processed_events)

    ### Do the same for supplied gene queries or junctions that didn't map above using the gene expression values as a guide
    #print len(splicing_events),len(processed_events),len(mopup_events)
    processed_events = formatAndSubmitSplicingEventsToSashimiPlot(steady_state_exp_file,bamdir,mopup_events,sample_group_db,groups,expandedSearch)
    if len(processed_events)>0:
        mopup_events = getMopUpEvents(mopup_events, processed_events)
        processed_events = formatAndSubmitSplicingEventsToSashimiPlot(PSIFilename, bamdir, mopup_events, sample_group_db, groups, True)
    return gene_to_symbol
Example no. 3
0
def translateToEntrezGene(species,filename):
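    ### Convert an Ensembl-keyed ontology/pathway association file into an EntrezGene-keyed
    ### copy using the species' Ensembl-EntrezGene relationships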
    x=0; type = 'pathway'
    try: ens_to_entrez = gene_associations.getGeneToUid(species,('hide','Ensembl-EntrezGene'))
    except Exception: ens_to_entrez ={}

    if len(ens_to_entrez)>0:
        export_file = string.replace(filename,'Ensembl','EntrezGene')
        export_data = export.ExportFile(export_file)
        export_data.write('EntrezGene\tOntologyID\n')
        fn = filepath(filename)
        for line in open(fn,'rU').xreadlines():
            if x==0: x=1
            else:
                data = cleanUpLine(line)
                try:
                    ensembl,pathway = string.split(data,'\t')
                    type = 'ontology'
                except Exception:
                    ensembl,null,pathway = string.split(data,'\t')
                try:
                    entrezs = ens_to_entrez[ensembl]
                    for entrez in entrezs:
                        if type == 'ontology':
                            export_data.write(entrez+'\t'+pathway+'\n')
                        else:
                            export_data.write(entrez+'\tEn\t'+pathway+'\n')
                except Exception:
                    null=[]
        export_data.close()
Example no. 4
0
def justConvertFilenames(species, outputdir):
    import gene_associations
    gene_to_symbol = gene_associations.getGeneToUid(species,
                                                    ('hide', 'Ensembl-Symbol'))
    from import_scripts import OBO_import
    symbol_to_gene = OBO_import.swapKeyValues(gene_to_symbol)

    for filename in os.listdir(outputdir):
        if '.pdf' in filename or '.png' in filename:
            fn = string.replace(filename, '.pdf', '')
            fn = string.replace(fn, '.png', '')
            newname = string.split(fn, '__')

            if newname[0] in gene_to_symbol:
                new_filename = str(filename)
                if '__' in filename:
                    new_filename = string.split(filename, '__')[1]
                elif '\\' in filename:
                    new_filename = string.split(filename, '\\')[1]
                elif '/' in filename:
                    new_filename = string.split(filename, '/')[1]
                nnname = gene_to_symbol[
                    newname[0]][0] + '-SashimiPlot_' + new_filename
                try:
                    os.rename(os.path.join(outputdir, filename),
                              os.path.join(outputdir, nnname))
                except Exception:
                    pass
            else:
                continue
Example no. 5
0
def importGeneSymbols(species):
    import gene_associations
    gene_to_symbol = gene_associations.getGeneToUid(species,
                                                    ('hide', 'Ensembl-Symbol'))

    from import_scripts import OBO_import
    symbol_to_gene = OBO_import.swapKeyValues(gene_to_symbol)

    return gene_to_symbol, symbol_to_gene
def importDataSimple(filename,input_type,MOD=None,Species=None):
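    ### Import a tab-delimited ID/system-code/value file into a {gene ID: value} dictionary,
    ### translating source IDs to the MOD (e.g. Ensembl) when the system code differs;
    ### falls back to the MOD/Species arguments if the mod/species_code globals are absent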
    id_db={}
    fn = filepath(filename)
    x=0
    for line in open(fn,'rU').xreadlines():         
        data = cleanUpLine(line)
        t = string.split(data,'\t')
        if data[0]=='#' and x==0: x=0
        elif x==0:
            column_headers = t[1:]
            if input_type != 'LineageProfiler':
                try: column_headers = t[2] ### exclude the ID, system code and p-value column headers
                except Exception: column_headers = 'NA'
            x=1
        else:
            if x==1 and input_type != 'LineageProfiler':
                ### get system conversions
                system_code = t[1]
                import GO_Elite
                import OBO_import
                system_codes,source_types,mod_types = GO_Elite.getSourceData()
                source_data = system_codes[system_code]
                try:
                    Mod=mod ### global established in upstream functions
                    speciescode = species_code
                except Exception:
                    Mod=MOD
                    speciescode = Species
                if source_data == Mod:
                    source_is_mod = True
                else:
                    source_is_mod = False
                    mod_source = Mod+'-'+source_data+'.txt'
                    gene_to_source_id = gene_associations.getGeneToUid(speciescode,('hide',mod_source))
                    source_to_gene = OBO_import.swapKeyValues(gene_to_source_id)
            if input_type != 'LineageProfiler':
                if source_is_mod == True:
                    try: id_db[t[0]] = float(t[2])
                    except Exception: id_db[t[0]] = 'NA'
                elif t[0] in source_to_gene:
                    mod_ids = source_to_gene[t[0]]
                    for mod_id in mod_ids:
                        try: value = t[2]
                        except Exception: value = 'NA'
                        if value == '+': value = 1
                        elif value == '-': value = -1
                        try: id_db[mod_id] = float(value) ### If multiple Ensembl IDs in dataset, only record the last associated fold change
                        except Exception: id_db[mod_id] = 'NA'
                        break
            else:
                id_db[t[0]]= map(float,t[1:]) ### Applies to LineageProfiler
            x+=1
    #print len(id_db),column_headers
    return id_db,column_headers
Example no. 7
0
def exportSymbolRelationships(pathway_to_symbol,selected_species,pathway_type,type):    
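    ### Export Ensembl- and EntrezGene-keyed gene-set association files for each species,
    ### converting the symbol-based pathway relationships via Ensembl-Symbol mappings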
    if selected_species != None: ### Restrict to selected species only
        current_species_dirs=selected_species
    else:
        current_species_dirs = unique.read_directory('/'+database_dir)
    
    for species in current_species_dirs:
        if '.' not in species:
            ens_dir = database_dir+'/'+species+'/gene-'+type+'/Ensembl-'+pathway_type+'.txt'
            ens_data = export.ExportFile(ens_dir)
            if 'mapp' in type: ens_data.write('GeneID\tSystem\tGeneSet\n')
            else: ens_data.write('GeneID\tGeneSet\n')
            try: ens_to_entrez = gene_associations.getGeneToUid(species,('hide','Ensembl-EntrezGene'))
            except Exception: ens_to_entrez ={}
            if len(ens_to_entrez)>0:
                entrez_dir = database_dir+'/'+species+'/gene-'+type+'/EntrezGene-'+pathway_type+'.txt'
                entrez_data = export.ExportFile(entrez_dir)
                if 'mapp' in type: entrez_data.write('GeneID\tSystem\tGeneSet\n')
                else: entrez_data.write('GeneID\tGeneSet\n')
            #print 'Exporting '+pathway_type+' databases for:',species
            try: gene_to_source_id = gene_associations.getGeneToUid(species,('hide','Ensembl-Symbol'))
            except Exception: gene_to_source_id={}
            source_to_gene = OBO_import.swapKeyValues(gene_to_source_id)
            source_to_gene = lowerSymbolDB(source_to_gene)
            for pathway in pathway_to_symbol:
                for symbol in pathway_to_symbol[pathway]:
                    try:
                        genes = source_to_gene[symbol]
                        for gene in genes:
                            if 'mapp' in type: ens_data.write(gene+'\tEn\t'+pathway+'\n')
                            else: ens_data.write(gene+'\t'+pathway+'\n')
                            if gene in ens_to_entrez:
                                for entrez in ens_to_entrez[gene]:
                                    if 'mapp' in type: entrez_data.write(entrez+'\tL\t'+pathway+'\n')
                                    else: entrez_data.write(entrez+'\t'+pathway+'\n')
                    except Exception: null=[]
            ens_data.close()
            try: entrez_data.close()
            except Exception: null=[]
Example no. 8
0
def importDataSimple(filename, input_type):
    id_db = {}
    fn = filepath(filename)
    x = 0
    for line in open(fn, 'rU').xreadlines():
        data = cleanUpLine(line)
        t = string.split(data, '\t')
        if data[0] == '#': x = 0
        elif x == 0:
            column_headers = t[1:]
            if input_type != 'LineageProfiler':
                column_headers = t[
                    2]  ### exclude the ID, system code and p-value column headers
            x = 1
        else:
            if x == 1 and input_type != 'LineageProfiler':
                ### get system conversions
                system_code = t[1]
                import GO_Elite
                import OBO_import
                system_codes, source_types, mod_types = GO_Elite.getSourceData(
                )
                source_data = system_codes[system_code]
                if source_data == mod:
                    source_is_mod = True
                else:
                    source_is_mod = False
                    mod_source = mod + '-' + source_data + '.txt'
                    gene_to_source_id = gene_associations.getGeneToUid(
                        species_code, ('hide', mod_source))
                    source_to_gene = OBO_import.swapKeyValues(
                        gene_to_source_id)
            if input_type != 'LineageProfiler':
                if source_is_mod == True:
                    id_db[t[0]] = float(t[2])
                elif t[0] in source_to_gene:
                    mod_ids = source_to_gene[t[0]]
                    for mod_id in mod_ids:
                        id_db[mod_id] = float(
                            t[2]
                        )  ### If multiple Ensembl IDs in dataset, only record the last associated fold change
            else:
                id_db[t[0]] = map(float, t[1:])  ### Applies to LineageProfiler
            x += 1
    return id_db, column_headers
Example no. 9
0
def justConvertFilenames(species,outputdir):
    import gene_associations
    gene_to_symbol = gene_associations.getGeneToUid(species,('hide','Ensembl-Symbol'))
    import OBO_import; symbol_to_gene = OBO_import.swapKeyValues(gene_to_symbol)
    
    for filename in os.listdir(outputdir):
        if '.pdf' in filename or '.png' in filename:
            fn = string.replace(filename,'.pdf','')
            fn = string.replace(fn,'.png','')
            newname=string.split(fn,'__')

            if newname[0] in gene_to_symbol:
                new_filename = str(filename)
                if '__' in filename:
                    new_filename = string.split(filename,'__')[1]
                elif '\\' in filename:
                    new_filename = string.split(filename,'\\')[1]
                elif '/' in filename:
                    new_filename = string.split(filename,'/')[1]
                nnname=gene_to_symbol[newname[0]][0]+'-SashimiPlot_'+new_filename
                try: os.rename(os.path.join(outputdir, filename), os.path.join(outputdir,nnname))
                except Exception: pass
            else:
                continue
Example no. 10
0
def CreateFilesMonocle(filename, rawExpressionFile, species='Hs'):
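    ### Build the sample, gene-annotation and expression-data input files expected by Monocle
    ### from a clustered expression file plus the raw expression matrix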
    try:
        import gene_associations
        gene_to_symbol = gene_associations.getGeneToUid(
            species, ('hide', 'Ensembl-Symbol'))
    except Exception:
        gene_to_symbol = {}

    #Create the files for Monocle
    setWorkingDirectory(findParentDir(filename)[:-1])
    try:
        os.mkdir(findParentDir(filename)[:-1])
    except Exception:
        None
    #filename=self.File()
    x = 0
    data_name = findParentDir(filename) + '/data.txt'
    gene_name = findParentDir(filename) + '/gene.txt'
    sample_name = findParentDir(filename) + '/sample.txt'
    gene_names = []
    gene_list = []
    dat = []

    export_cdt = open(sample_name, 'w')
    export_gene = open(gene_name, 'w')
    for line in open(filename, 'rU').xreadlines():
        data = cleanUpLine(line)
        headers = string.split(data, '\t')
        dat.append(line)
        if data[0] != '#':
            if x == 1:
                gen = headers[0]
                gen = (gen.split(" "))
                ge_lt = gen[0]
                gene = string.join(gen, '\t')
                gene_names.append(gene)
                gene_list.append(ge_lt)
            if x == 0:
                array_names = []
                array_linker_db = {}
                d = 0
                for entry in headers[1:]:
                    if '::' in entry:
                        a = (entry.split("::"))
                    else:
                        a = (entry.split(":"))
                    a = reversed(a)
                    ent = string.join(a, '\t')
                    if (ent[0].isdigit()):
                        ent = 'X' + ent[0:]
                        #print j
                    array_names.append(ent)
                x = 1

    i = 0
    eheader = string.join(
        [''] + ['Group'],
        '\t') + '\n'  ### format column-flat-clusters for export
    export_cdt.write(eheader)
    for row in array_names:
        export_cdt.write(row + '\n')
        i += 1
    export_cdt.close()
    gheader = string.join(
        [''] + ['gene_short_name'],
        '\t') + '\n'  ### format column-flat-clusters for export
    export_gene.write(gheader)

    export_object = open(data_name, 'w')
    """
    for row in array_names:
        group=string.split(row,'\t')
        export_object.write('\t'+group[0])
        #print group[0]
    export_object.write('\n')
    """
    firstRow = True
    for line in open(rawExpressionFile, 'rU').xreadlines():
        data = cleanUpLine(line)
        t = string.split(data, '\t')
        id = t[0]
        nid = id
        proceed = False
        if firstRow:
            new_headers = []
            headers = t[1:]
            for i in headers:
                i = string.replace(i, ':', '-')
                new_headers.append(i)
            export_object.write(
                string.join(['UID'] + new_headers, '\t') + '\n')
            firstRow = False
        else:
            if id in gene_list:
                proceed = True
            else:
                if id in gene_to_symbol:
                    symbol = gene_to_symbol[id][0]
                    if symbol in gene_list:
                        nid = symbol
                        proceed = True
                if proceed:
                    k = gene_list.index(nid)
                    export_object.write(line)
                    export_gene.write(id + '\n')
                    #export_gene.write(gene_list[k]+'\n')
    export_object.close()
    export_gene.close()
Example no. 11
0
def sashmi_plot_list(bamdir,
                     eventsToVisualizeFilename,
                     PSIFilename,
                     events=None):
    try:
        import gene_associations
        gene_to_symbol = gene_associations.getGeneToUid(
            species, ('hide', 'Ensembl-Symbol'))
        from import_scripts import OBO_import
        symbol_to_gene = OBO_import.swapKeyValues(gene_to_symbol)
    except Exception:
        symbol_to_gene = {}

    if events == None:
        splicing_events, expandedSearch = importSplicingEventsToVisualize(
            eventsToVisualizeFilename)
    else:
        ### Replace any ":" from the input events
        #for i in range(len(events)): events[i] = string.replace(events[i],':','__')
        expandedSearch = True

        for i in range(len(events)):
            gene = string.split(events[i], '__')[0]
            if gene in gene_to_symbol:
                symbol = gene_to_symbol[gene][0]
            elif 'ENS' not in gene or 'G0000' in gene:
                if gene in symbol_to_gene:
                    ensID = symbol_to_gene[gene][0]
                    symbol = gene
                    events[
                        i] = ensID  ### translate this ID to an Ensembl gene ID for proper SashimiPlot lookup
        splicing_events = events  ### optionally get from supplied variable

    if len(splicing_events) == 0:
        print eventsToVisualizeFilename
        forceNoCompatibleEventsInFile

    print 'Exporting plots',

    ### Determine Groups for Coloring
    groups_file = 'None'
    dir_list = unique.read_directory(root_dir + '/ExpressionInput')

    for file in dir_list:
        if 'groups.' in file:
            groups_file = root_dir + '/ExpressionInput/' + file

    if groups_file != None:
        try:
            import ExpressionBuilder
            sample_group_db = ExpressionBuilder.simplerGroupImport(groups_file)
            groups = []
            for sample in sample_group_db:
                if sample_group_db[sample] not in groups:
                    groups.append(sample_group_db[sample]
                                  )  ### create an ordered list of unique groups
        except Exception:
            groups = ['None']
            #print traceback.format_exc()
            pass

    processed_events = formatAndSubmitSplicingEventsToSashimiPlot(
        PSIFilename, bamdir, splicing_events, sample_group_db, groups, False)
    mopup_events = getMopUpEvents(splicing_events, processed_events)

    ### Do the same for supplied gene queries or junctions that didn't map above using the gene expression values as a guide
    #print len(splicing_events),len(processed_events),len(mopup_events)
    processed_events = formatAndSubmitSplicingEventsToSashimiPlot(
        steady_state_exp_file, bamdir, mopup_events, sample_group_db, groups,
        expandedSearch)
    if len(processed_events) > 0:
        mopup_events = getMopUpEvents(mopup_events, processed_events)
        processed_events = formatAndSubmitSplicingEventsToSashimiPlot(
            PSIFilename, bamdir, mopup_events, sample_group_db, groups, True)
    return gene_to_symbol
def importDataSimple(filename, input_type, MOD=None, Species=None):
    id_db = {}
    fn = filepath(filename)
    x = 0
    for line in open(fn, 'rU').xreadlines():
        data = cleanUpLine(line)
        t = string.split(data, '\t')
        if data[0] == '#' and x == 0: x = 0
        elif x == 0:
            column_headers = t[1:]
            if input_type != 'LineageProfiler':
                try:
                    column_headers = t[
                        2]  ### exclude the ID, system code and p-value column headers
                except Exception:
                    column_headers = 'NA'
            x = 1
        else:
            if x == 1 and input_type != 'LineageProfiler':
                ### get system conversions
                system_code = t[1]
                import GO_Elite
                import OBO_import
                system_codes, source_types, mod_types = GO_Elite.getSourceData(
                )
                source_data = system_codes[system_code]
                try:
                    Mod = mod  ### global established in upstream functions
                    speciescode = species_code
                except Exception:
                    Mod = MOD
                    speciescode = Species
                if source_data == Mod:
                    source_is_mod = True
                else:
                    source_is_mod = False
                    mod_source = Mod + '-' + source_data + '.txt'
                    gene_to_source_id = gene_associations.getGeneToUid(
                        speciescode, ('hide', mod_source))
                    source_to_gene = OBO_import.swapKeyValues(
                        gene_to_source_id)
            if input_type != 'LineageProfiler':
                if source_is_mod == True:
                    try:
                        id_db[t[0]] = float(t[2])
                    except Exception:
                        id_db[t[0]] = 'NA'
                elif t[0] in source_to_gene:
                    mod_ids = source_to_gene[t[0]]
                    for mod_id in mod_ids:
                        try:
                            value = t[2]
                        except Exception:
                            value = 'NA'
                        if value == '+': value = 1
                        elif value == '-': value = -1
                        try:
                            id_db[mod_id] = float(
                                value
                            )  ### If multiple Ensembl IDs in dataset, only record the last associated fold change
                        except Exception:
                            id_db[mod_id] = 'NA'
                        break
            else:
                id_db[t[0]] = map(float, t[1:])  ### Applies to LineageProfiler
            x += 1
    #print len(id_db),column_headers
    return id_db, column_headers
Example no. 13
0
def generateMAPPFinderScores(species_title, species_id, source, mod_db,
                             system_Codes, permute, resources_to_analyze,
                             file_dirs, parent_root):
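    ### Run over-representation analysis (ORA) for every input gene list against the available
    ### Ontologies and GeneSets for the selected species and MOD, writing results to the ORA
    ### output directory and returning the genes associated with each term and gene-set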
    global mappfinder_output_dir
    global custom_sets_folder
    global root
    root = parent_root
    global mapp_to_mod_genes
    global ontology_to_mod_genes
    global system_codes
    system_codes = system_Codes
    criterion_input_folder, criterion_denom_folder, output_dir, custom_sets_folder = file_dirs
    previous_denominator_file_dir = ''
    ontology_to_mod_genes = {}
    mapp_to_mod_genes = {}
    global test
    test = 'no'
    program_type, database_dir = unique.whatProgramIsThis()
    if resources_to_analyze == 'Gene Ontology':
        resources_to_analyze = 'GeneOntology'

    if len(output_dir) == 0: mappfinder_output_dir = 'input/MAPPFinder'
    else:
        mappfinder_output_dir = output_dir + '/GO-Elite_results/CompleteResults/ORA'

    global source_data
    source_data = source
    global mod
    mod = mod_db
    global species_code
    species_code = species_id
    global species_name
    species_name = species_title
    global gene_to_mapp
    global permutations
    permutations = permute
    global eliminate_redundant_genes
    eliminate_redundant_genes = 'yes'
    global permuted_z_scores
    global ontology_annotations
    global original_ontology_z_score_data
    global original_mapp_z_score_data
    global input_gene_list
    global denominator_gene_list
    global gene_file
    global denom_file_status
    global input_count
    global denom_count
    global gene_annotations
    global source_to_gene
    global use_FET
    if permutations == "FisherExactTest":
        use_FET = 'yes'  ### Use Fisher's Exact test instead of permutation-based p-values
        permutations = 0
    else:
        use_FET = 'no'

    start_time = time.time()

    gene_annotations = gene_associations.importGeneData(species_code, mod)

    OBO_date = importVersionData('OBO/')
    if len(criterion_input_folder) == 0:
        import_dir = '/input/GenesToQuery/' + species_code
        import_dir_alt = import_dir[1:]
    else:
        import_dir = criterion_input_folder
        import_dir_alt = criterion_input_folder
    m = GrabFiles()
    m.setdirectory(import_dir)
    try:
        dir_list = readDirText(
            import_dir
        )  #send a sub_directory to a function to identify all files in a directory
    except Exception:
        print_out = 'Warning! Input directory location is not a valid folder. Exiting GO-Elite.'
        ForceCriticalError(print_out)
    try:
        denom_dir_list = readDirText(criterion_denom_folder)
    except Exception:
        print_out = 'Warning! Denominator directory location is not a valid folder. Exiting GO-Elite.'
        ForceCriticalError(print_out)
    if len(dir_list) == 0:
        error_message = 'No files with the extension ".txt" found in the input directory.'
        ForceCriticalError(error_message)
    if len(denom_dir_list) == 0:
        error_message = 'No files with the extension ".txt" found in the denominator directory.'
        ForceCriticalError(error_message)

    inputs_analyzed = 0
    for mappfinder_input in dir_list:  #loop through each file in the directory
        permuted_z_scores = {}
        original_ontology_z_score_data = {}
        original_mapp_z_score_data = {}
        print 'Performing over-representation analysis (ORA) on', mappfinder_input
        gene_file_dir, gene_file = m.searchdirectory(mappfinder_input)
        ###Import Input gene/source-id lists
        input_gene_list, source_data_input, error_message = gene_associations.importUIDsForMAPPFinderQuery(
            import_dir_alt + '/' + gene_file, system_codes, 'no')
        input_count = len(input_gene_list)
        if 'WARNING!!!' in error_message:  ### Warn the user about SwissProt issues when importing the denominator
            ForceCriticalError(error_message)
        if len(criterion_denom_folder) == 0:
            denom_folder = '/input/GenesToQuery/' + species_code + '/DenominatorGenes'
        else:
            denom_folder = criterion_denom_folder
        error_warning = "\nThe directory\n" + '[' + denom_folder + ']' + "\nwas not found. Please create the directory\nand place an appropriate denominator file\nor files in it."
        denominator_file_dir = identifyGeneFiles(
            denom_folder,
            gene_file)  ###input is in input\Genes, denominator in
        try:
            denominator_file_dir = identifyGeneFiles(
                denom_folder,
                gene_file)  ###input is in input\Genes, denominator in
            denominator_file = string.split(denominator_file_dir, '/')[-1]
            print 'Using:', denominator_file, 'for the denominator.'
        except Exception:
            print_out = "WARNING: No denominator file included in\nthe Denominator directory.\nTo proceed, place all denominator\nIDs in a file in that directory."
            ForceCriticalError(print_out)
        if denominator_file_dir == previous_denominator_file_dir:
            denom_file_status = 'old'
        else:
            denom_file_status = 'new'
        if denom_file_status == 'new':
            previous_denominator_file_dir = denominator_file_dir
            denominator_gene_list, source_data_denom, error_message = gene_associations.importUIDsForMAPPFinderQuery(
                denominator_file_dir, system_codes, 'no')
            denom_count = len(denominator_gene_list)
            if 'SwissProt' in error_message and 'WARNING!!!' not in error_message:
                if len(input_gene_list) == 0:
                    error_message += '\nNo valid input IDs found. Exiting GO-Elite.'
                    try:
                        UI.WarningWindow(
                            error_message, 'Warning!!! Identifier Error'
                        )  ### Only warn, don't force an exit (if SwissProt full IDs are present)
                    except Exception:
                        None
                    sys.exit()
                else:
                    try:
                        UI.WarningWindow(
                            error_message, 'Warning!!! Identifier Error'
                        )  ### Only warn, don't force an exit (if SwissProt full IDs are present)
                    except Exception:
                        None
            elif len(error_message) > 0:
                ForceCriticalError(error_message)
            if len(denominator_gene_list) == len(input_gene_list):
                print_out = 'Input and Denominator lists have identical counts.\nPlease load a proper denominator set (containing\nthe input list with all assayed gene IDs) before proceeding.'
                ForceCriticalError(print_out)
            original_denominator_gene_list = []
            for id in denominator_gene_list:
                original_denominator_gene_list.append(
                    id
                )  ###need this to be a valid list not dictionary for permutation analysis
        if len(source_data_input) > 0:
            source_data = source_data_input  ###over-ride source_data if a source was identified from the input file
        if source_data != mod:
            if denom_file_status == 'new':
                mod_source = mod + '-' + source_data + '.txt'
                #checkDenominatorMatchesInput(input_gene_list,denominator_gene_list,gene_file) ###This is checked for the source IDs not associated MOD IDs
                try:
                    gene_to_source_id = gene_associations.getGeneToUid(
                        species_code, mod_source)
                    print mod_source, 'imported'
                except Exception:
                    try:
                        if mod == 'EntrezGene': mod = 'Ensembl'
                        else: mod = 'EntrezGene'
                        print 'The primary system (MOD) has been switched from', mod_db, 'to', mod, '\n(' + mod_db, 'not supported for the %s ID system).' % source_data
                        mod_source = mod + '-' + source_data + '.txt'
                        gene_to_source_id = gene_associations.getGeneToUid(
                            species_code, mod_source)
                    except Exception:
                        print_out = "WARNING: The primary gene ID system '" + mod + "'\ndoes not support relationships with '" + source_data + "'.\nRe-run using a supported primary ID system."
                        ForceCriticalError(print_out)
                source_to_gene = OBO_import.swapKeyValues(gene_to_source_id)
                denominator_gene_list = associateInputSourceWithGene(
                    source_to_gene, denominator_gene_list)
                ### Introduced the below method in version 1.21 to improve permutation speed (no longer need to search all source IDs)
                ### Only includes source ID to gene relationships represented in the denominator file (needed for Affymetrix)
                source_to_gene = OBO_import.swapKeyValues(
                    denominator_gene_list)
            ###Replace input lists with corresponding MOD IDs
            input_gene_list = associateInputSourceWithGene(
                source_to_gene, input_gene_list)
        checkDenominatorMatchesInput(
            input_gene_list, denominator_gene_list,
            gene_file)  ###This is for only the associated MOD IDs

        gd = GrabFiles()
        gd.setdirectory('/' + database_dir + '/' + species_code + '/gene-mapp')
        available_genesets = reorganizeResourceList(gd.getAllFiles(mod))
        od = GrabFiles()
        od.setdirectory('/' + database_dir + '/' + species_code + '/gene-go')
        available_ontologies = reorganizeResourceList(od.getAllFiles(mod))

        input_gene_count = len(
            input_gene_list
        )  ###Count number of genes associated with source input IDs
        if len(input_gene_list) == 0 or len(denominator_gene_list) == 0:
            if len(input_gene_list) == 0:
                print_out = 'WARNING!!!! None of the input IDs provided map to genes for ' + mappfinder_input + '. Check to make sure the selected species is correct.'
                print_out += '\nSelected species: ' + species_name
                print_out += '\nInput ID system: ' + str(source_data_input)
                print_out += '\nPrimary ID system (MOD): ' + str(mod)
                ForceCriticalError(print_out)
            if len(denominator_gene_list) == 0:
                print_out = 'WARNING!!!! None of the denominator IDs provided map to genes for ' + denominator_file_dir + '. Check to make sure the selected species is correct.'
                print_out += '\nSelected species: ' + species_name
                print_out += '\nDenominator ID system: ' + str(source)
                print_out += '\nPrimary ID system (MOD):' + str(mod)
                ForceCriticalError(print_out)
        elif len(available_ontologies) == 0 and len(available_genesets) == 0:
            print_out = 'WARNING!!!! No Ontology or GeneSets appear to be available for this species. Please supply and re-analyze.'
            ForceCriticalError(print_out)
        else:
            """ Perform permutation analysis and ORA on available GeneSets or Ontologies"""
            inputs_analyzed += 1

            global permute_inputs
            permute_inputs = []
            if permutations != 0 or use_FET == 'no':
                buildPermutationDatabase(original_denominator_gene_list,
                                         input_count)

            run_status = 0
            ### Analyzed ontologies
            if len(available_ontologies) > 0:
                print '    Analyzing input ID list with available ontologies'
            for ontology_dir in available_ontologies:
                ontology_type = getResourceType(ontology_dir)
                permuted_z_scores = {}
                original_ontology_z_score_data = {}
                #print ontology_type, resources_to_analyze
                if resources_to_analyze == ontology_type or resources_to_analyze == 'all':
                    ontology_annotations = importOntologyAnnotations(
                        species_code, ontology_type)
                    if ontology_annotations != None:  ### Occurs when the files are named or formatted correctly
                        status, ontology_to_mod_genes = performOntologyORA(
                            ontology_dir)
                        run_status += status

            ### Analyzed gene-sets
            if len(available_genesets) > 0:
                print '    Analyzing input ID list with available gene-sets'
            for geneset_dir in available_genesets:
                geneset_type = getResourceType(geneset_dir)
                permuted_z_scores = {}
                original_mapp_z_score_data = {}
                if resources_to_analyze == geneset_type or resources_to_analyze == 'all':
                    status, mapp_to_mod_genes = performGeneSetORA(geneset_dir)
                    run_status += status
            if len(custom_sets_folder) > 0:
                ### Hence - Analyze User Supplied GeneSets
                permuted_z_scores = {}
                original_mapp_z_score_data = {}
                run_status += performGeneSetORA('UserSuppliedAssociations')[0]

            permute_inputs = []
            permute_mapp_inputs = []
            ontology_input_gene_count = []
            mapp_input_gene_count = []

            if run_status == 0:
                ### Returns the number of successfully analyzed gene-set databases
                program_type, database_dir = unique.whatProgramIsThis()
                print_out = "Warning!!! Either the MOD you have selected: " + mod + "\nis missing the appropriate relationship files necessary to run GO-Elite\nor you have selected an invalid resource to analyze.  Either replace\nthe missing MOD files in " + database_dir + '/' + species_code + ' sub-directories or\nselect a different MOD at run-time.'
                ForceCriticalError(print_out)

    end_time = time.time()
    time_diff = formatTime(start_time, end_time)
    print 'ORA analyses finished in %s seconds' % time_diff
    return ontology_to_mod_genes, mapp_to_mod_genes  ###Return the MOD genes associated with each GO term and MAPP
Example no. 14
0
def importGeneExpressionValues(filename,tissue_specific_db,translation_db,useLog=False,previouslyRun=False,species=None):
    ### Import gene-level expression raw values           
    fn=filepath(filename); x=0; genes_added={}; gene_expression_db={}
    dataset_name = export.findFilename(filename)
    max_val=0
    print 'importing:',dataset_name
    
    try:
        import gene_associations, OBO_import
        gene_to_symbol = gene_associations.getGeneToUid(species,('hide','Ensembl-Symbol'))
        symbol_to_gene = OBO_import.swapKeyValues(gene_to_symbol)
    except Exception: symbol_to_gene={}
    
    for line in open(fn,'rU').xreadlines():
        data = cleanUpLine(line)
        t = string.split(data,'\t')
        
        if x==0:
            if '#' not in data:
                for i in t[1:]: sample_headers.append(i)
                x=1
        else:
            gene = t[0]
            try: gene = string.split(t[0],'|')[0]
            except Exception: pass
            #if '-' not in gene and ':E' in gene: print gene;sys.exit()
            if analysis_type == 'AltExon':
                try: ens_gene,exon = string.split(gene,'-')[:2]
                except Exception: exon = gene
                gene = exon
            if keyed_by == 'translation': ### alternative value is 'primaryID'
                """if gene == 'ENSMUSG00000025915-E19.3':
                    for i in translation_db: print [i], len(translation_db); break
                    print gene, [translation_db[gene]];sys.exit()"""
                try: gene = translation_db[gene] ### Ensembl annotations
                except Exception: pass
            try: gene = symbol_to_gene[gene][0] ### If RNASeq is the selected platform and Symbol is the uid
            except Exception: pass
            if gene in tissue_specific_db:
                index,tissue_exp=tissue_specific_db[gene]
                try: genes_added[gene]+=1
                except Exception: genes_added[gene]=1
                proceed=True
                try:
                    exp_vals = t[1:]
                    if '' in exp_vals:
                        ### If missing values present (PSI values)
                        exp_vals = ['0.000101' if i=='' else i for i in exp_vals]
                        useLog = False
                    exp_vals = map(float, exp_vals)
                    if platform == 'RNASeq':
                        if max(exp_vals)>max_val: max_val = max(exp_vals)
                        #if max(exp_vals)<3: proceed=False
                        if useLog==False:
                            exp_vals = map(lambda x: math.log(x+1,2),exp_vals)
                    if value_type == 'calls': ### Hence, this is a DABG or RNA-Seq expression
                        exp_vals = produceDetectionCalls(exp_vals,targetPlatform) ### 0 or 1 calls
                    if proceed:
                        gene_expression_db[gene] = [index,exp_vals]
                except Exception:
                    print 'Non-numeric values detected:'
                    x = 5
                    print t[:x]
                    while x < len(t):
                        print t[x:x+5]
                        x+=5
                    print 'Formatting error encountered in:',dataset_name; forceError ### referencing the undefined name 'forceError' raises an error to halt on formatting problems
            """else:
                for gene in tissue_specific_db:
                    if 'Ndufa9:ENSMUSG00000000399:I2.1-E3.1' in gene:
                        print gene, 'dog';sys.exit()
                print gene;kill"""
        
    print len(gene_expression_db), 'matching genes in the dataset and tissue compendium database'
    
    for gene in genes_added:
        if genes_added[gene]>1:
            del gene_expression_db[gene] ### delete entries that are present in the input set multiple times (not trustworthy)
        else: expession_subset.append(gene_expression_db[gene]) ### These contain the rank order and expression
    #print len(expession_subset);sys.exit()
    expession_subset.sort() ### This order now matches that of 
    gene_expression_db=[]
    
    if max_val<20 and platform == 'RNASeq' and previouslyRun==False: ### Only allow to happen once
        importGeneExpressionValues(filename,tissue_specific_db,translation_db,useLog=True,previouslyRun=True,species=species)
Example no. 15
0
def performEventEnrichment(output_dir, eventDir, species):
    """Import significant splicing events from metaDataAnalysis.py comparisons and test for their
    statistical enrichment relative to the Splicing Factor correlated events."""
    import collections
    import mappfinder
    event_db = collections.OrderedDict()
    import UI
    ### Import the splice-ICGS significant splicing events per signature
    files = UI.read_directory(eventDir)
    for file in files:
        if '.txt' in file and 'PSI.' in file:
            ls = []
            event_db[
                file[:-4]] = ls  ### This list is subsequently updated below
            fn = eventDir + '/' + file
            firstLine = True
            for line in open(fn, 'rU').xreadlines():
                data = line.rstrip()
                t = string.split(data, '\t')
                if firstLine:
                    event_index = t.index('Event-Direction')
                    firstLine = False
                    continue
                uid = t[0]
                if 'U2AF1-like' in file:
                    if t[1] == "inclusion":
                        ls.append(uid)  #ls.append((uid,t[event_index]))
                else:
                    ls.append(uid)  #ls.append((uid,t[event_index]))

    ### Import the splicing-factor correlated splicing events to identify associated signatures
    splicing_factor_correlated_scores = {}
    gene_to_symbol = None
    files = UI.read_directory(output_dir)
    for file in files:
        if '.txt' in file and '_' in file:
            R_ls = []
            if 'ENS' in file:
                splicing_factor = file[:-4]
                if gene_to_symbol == None:  ### Import only once
                    import gene_associations
                    gene_to_symbol = gene_associations.getGeneToUid(
                        species, ('hide', 'Ensembl-Symbol'))
                sf = 'ENS' + string.split(splicing_factor, 'ENS')[1]
                splicing_factor = string.split(sf, '_')[0]
                if splicing_factor in gene_to_symbol:
                    splicing_factor = gene_to_symbol[splicing_factor][0]
            else:
                splicing_factor = string.split(file[:-4], '_')[0]
            fn = output_dir + '/' + file
            firstLine = True
            for line in open(fn, 'rU').xreadlines():
                data = line.rstrip()
                t = string.split(data, '\t')
                event = t[0]
                R_ls.append(event)
            R = len(R_ls)
            N = 80000
            for signature in event_db:
                n_ls = event_db[signature]
                n = len(n_ls)
                r_ls = set(R_ls).intersection(n_ls)
                r = len(r_ls)
                ### Calculate a Z-score
                try:
                    z = Zscore(r, n, N, R)
                except ZeroDivisionError:
                    z = 0.0000
                ### Calculate a Z-score assuming zero matching entries
                try:
                    null_z = Zscore(0, n, N, R)
                except ZeroDivisionError:
                    null_z = 0.000
                ### Calculate a Fisher's Exact P-value
                pval = mappfinder.FishersExactTest(r, n, R, N)
                ### Store these data in an object
                zsd = mappfinder.ZScoreData(signature, r, n, z, null_z, n)
                zsd.SetP(pval)
                zsd.setAssociatedIDs(r_ls)
                #print splicing_factor,'\t', signature,'\t', z, pval;sys.exit()
                if splicing_factor in splicing_factor_correlated_scores:
                    signature_db = splicing_factor_correlated_scores[
                        splicing_factor]
                    signature_db[
                        signature] = zsd  ### Necessary format for the permutation function
                else:
                    signature_db = {signature: zsd}
                    splicing_factor_correlated_scores[
                        splicing_factor] = signature_db

    results_dir = output_dir + '/SFEnrichmentResults'
    result_file = results_dir + '/SF-correlated_SignatureScores.txt'
    try:
        os.mkdir(results_dir)
    except:
        pass
    eo = open(result_file, 'w')
    eo.write(
        string.join([
            'Splicing Factor', 'Signature', 'Number Changed',
            'Number Measured', 'Z-score', 'FisherExactP', 'AdjustedP'
        ], '\t') + '\n')  #'Events'

    ### Perform a permutation analysis to get BH adjusted p-values
    for splicing_factor in splicing_factor_correlated_scores:
        sorted_results = []
        signature_db = splicing_factor_correlated_scores[splicing_factor]
        ### Updates the adjusted p-value instances
        mappfinder.adjustPermuteStats(signature_db)
        for signature in signature_db:
            zsd = signature_db[signature]
            if float(zsd.ZScore()) > 1.96 and float(
                    zsd.Changed()) > 2 and float(zsd.PermuteP()) < 0.05:
                enriched_SFs = {}
                results = [
                    splicing_factor, signature,
                    zsd.Changed(),
                    zsd.Measured(),
                    zsd.ZScore(),
                    zsd.PermuteP(),
                    zsd.AdjP()
                ]  #string.join(zsd.AssociatedIDs(),'|')
                sorted_results.append([float(zsd.PermuteP()), results])
        sorted_results.sort()  ### Sort by p-value
        for (p, values) in sorted_results:
            eo.write(string.join(values, '\t') + '\n')
        if len(sorted_results) == 0:
            eo.write(
                string.join([
                    splicing_factor, 'NONE', 'NONE', 'NONE', 'NONE', 'NONE',
                    'NONE'
                ], '\t') + '\n')
    eo.close()
Example no. 16
0
def CreateFilesMonocle(filename, rawExpressionFile, species="Hs"):
    first_row = True
    key_db = {}
    key_list = []
    fn = filepath(filename)
    offset = 0
    nonNumericsPresent = False
    try:
        import gene_associations

        gene_to_symbol = gene_associations.getGeneToUid(species, ("hide", "Ensembl-Symbol"))
    except Exception:
        print "gene symbols not available"
        gene_to_symbol = {}
    setWorkingDirectory(findParentDir(filename)[:-1])
    try:
        os.mkdir(findParentDir(filename) + "/Monocle")
    except Exception:
        None
    # filename=self.File()
    x = 0
    data_name = findParentDir(filename) + "/Monocle/expressionFile.txt"
    gene_name = findParentDir(filename) + "/Monocle/geneAnnotations.txt"
    sample_name = findParentDir(filename) + "/Monocle/sampleGroups.txt"
    gene_names = []
    gene_list = []
    dat = []
    export_cdt = open(sample_name, "w")
    export_gene = open(gene_name, "w")
    for line in open(fn, "rU").xreadlines():
        data = cleanUpLine(line)
        t = string.split(data, "\t")
        if first_row == True:
            if "row_clusters-flat" in t and "row_clusters-flat" not in t[0]:
                headers = string.join(t[2:], "\t") + "\n"
                offset = 1
            else:
                headers = string.join(t[1:], "\t") + "\n"

            first_row = False
        else:
            key = t[0]
            if key != "column_clusters-flat":
                key_list.append(key)

                try:
                    s = map(float, t[offset + 1 :])
                except Exception:
                    nonNumericsPresent = True
                key_db[key] = t

    for key in key_list:
        t = key_db[key]
        s = [key]
        if offset == 1:
            s.append("")
        temp = []
        for value in t[offset + 1 :]:
            try:
                temp.append(float(value))
            except Exception:
                pass
        min1 = min(temp)

        for value in t[offset + 1 :]:
            try:
                s.append(str(float(value) - min1))
            except Exception:
                s.append("0.000101")
        key_db[key] = s
    export_object = open(data_name, "w")
    export_object.write("" + "\t" + headers)  ### Header is the same for each file
    for key in key_list:
        t = key_db[key]
        if offset > 0:
            t = [t[0]] + t[1 + offset :]
        export_object.write(string.join(t, "\t") + "\n")  ### Write z-score values and row names
    export_object.close()
    print "File written..."
    # return input_file

    array_names = []
    array_linker_db = {}
    d = 0
    for entry in headers.split("\t"):

        entry = cleanUpLine(entry)
        if "::" in entry:
            a = entry.split("::")
        else:
            a = entry.split(":")

        # entry=string.join(a,'.')

        ent = entry + "\t" + a[0]
        # if(ent[0].isdigit()):
        #    ent='X'+ent[0:]

        # if '-' in ent:
        #   ent=string.replace(ent,'-','.')
        # if '+' in ent:
        #   ent=string.replace(ent,'+','.')
        # print j
        array_names.append(ent)
    i = 0
    eheader = string.join([""] + ["Group"], "\t") + "\n"  ### format column-flat-clusters for export
    export_cdt.write(eheader)
    for row in array_names:
        export_cdt.write(row + "\n")
        i += 1
    export_cdt.close()
    gheader = string.join([""] + ["gene_short_name"], "\t") + "\n"  ### format column-flat-clusters for export
    export_gene.write(gheader)

    for key in key_list:
        proceed = False
        if key in gene_to_symbol:
            symbol = gene_to_symbol[key][0]  ### gene symbol for this row ID
            if symbol in gene_list:
                nid = symbol
                proceed = True
            if proceed:
                k = gene_list.index(nid)
                export_object.write(line)
                export_gene.write(key + "\n")
        else:
            export_gene.write(key + "\t" + key + "\n")

    export_object.close()
    export_gene.close()
Example no. 17
0
def reformatPolyAdenylationCoordinates(species,force):
    """ PolyA annotations are currently only available from UCSC for human, but flat file
    annotations from 2003-2006 are available for multiple species. Convert these to BED format"""
    version={}
    version['Rn'] = '2003(rn3)'
    version['Dr'] = '2003(zv4)'
    version['Gg'] = '2004(galGal2)'
    version['Hs'] = '2006(hg8)'
    version['Mm'] = '2004(mm5)'

    print 'Exporting polyADB_2 coordinates as BED for',species
    ### Obtain the necessary database files
    url = 'http://altanalyze.org/archiveDBs/all/polyAsite.txt'
    output_dir = 'AltDatabase/ucsc/'+species + '/'
    if force == 'yes':
        filename, status = update.download(url,output_dir,'')
    else: filename = output_dir+'polyAsite.txt'

    ### Import the refseq to Ensembl information
    import gene_associations; import OBO_import; import EnsemblImport; import export
    try:
        ens_unigene = gene_associations.getGeneToUid(species,'Ensembl-UniGene')
        print len(ens_unigene),'Ensembl-UniGene entries imported'
        external_ensembl = OBO_import.swapKeyValues(ens_unigene); use_entrez='no'
    except Exception:
        ens_entrez = gene_associations.getGeneToUid(species,'Ensembl-EntrezGene')
        print len(ens_entrez),'Ensembl-EntrezGene entries imported'
        external_ensembl = OBO_import.swapKeyValues(ens_entrez); use_entrez='yes'
    gene_location_db = EnsemblImport.getEnsemblGeneLocations(species,'RNASeq','key_by_array')
    
    export_bedfile = output_dir+species+'_polyADB_2_predictions.bed'
    print 'exporting',export_bedfile
    export_data = export.ExportFile(export_bedfile)
    header = '#'+species+'\t'+'polyADB_2'+'\t'+version[species]+'\n'
    export_data.write(header)
    
    fn=filepath(filename); x=0; not_found={}
    for line in open(fn,'rU').xreadlines():
        data = cleanUpLine(line)
        if x==0: x=1
        else:
            siteid,llid,chr,sitenum,position,supporting_EST,cleavage = string.split(data,'\t')
            if chr == 'chrM': chr = 'chrMT' ### MT is the Ensembl convention whereas M is the Affymetrix and UCSC convention
            if chr == 'M': chr = 'MT' ### MT is the Ensembl convention whereas M is the Affymetrix and UCSC convention
            if species in siteid:
                if 'NA' not in chr: chr = 'chr'+chr
                strand = '+'; geneid = siteid
                pos_start = str(int(position)-1); pos_end = position
                if use_entrez=='no':
                    external_geneid = string.join(string.split(siteid,'.')[:2],'.')
                else: external_geneid=llid
                if external_geneid in external_ensembl:
                    ens_geneid = external_ensembl[external_geneid][0]
                    geneid += '-'+ens_geneid
                    chr,strand,start,end = gene_location_db[ens_geneid]
                else:
                    not_found[external_geneid]=[]
                    bed_format = string.join([chr,pos_start,pos_end,geneid,'0','-'],'\t')+'\n' ### We don't know the strand, so write out both strands
                    export_data.write(bed_format)
                bed_format = string.join([chr,pos_start,pos_end,geneid,'0',strand],'\t')+'\n'
                export_data.write(bed_format)
    export_data.close()   
Example no. 18
0
def CreateFilesMonocle(filename,rawExpressionFile,species='Hs'):
    first_row = True
    key_db={}
    key_list=[]
    fn=filepath(filename)
    offset=0
    nonNumericsPresent=False
    try:
        import gene_associations
        gene_to_symbol = gene_associations.getGeneToUid(species,('hide','Ensembl-Symbol'))
    except Exception:
        print "gene symbols not available"
        gene_to_symbol={}
    setWorkingDirectory(findParentDir(filename)[:-1])
    try: os.mkdir(findParentDir(filename)+'/Monocle')
    except Exception: pass
    #filename=self.File() 
    x = 0
    data_name=findParentDir(filename)+'/Monocle/expressionFile.txt'
    gene_name=findParentDir(filename)+'/Monocle/geneAnnotations.txt'
    sample_name=findParentDir(filename)+'/Monocle/sampleGroups.txt'
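    ### Three Monocle inputs are written below: the expression matrix, per-gene annotations and sample-to-group assignments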
    gene_names = []
    gene_list = []
    dat = []
    export_cdt = open(sample_name,'w')
    export_gene=open(gene_name,'w')
    for line in open(fn,'rU').xreadlines():
        data = cleanUpLine(line)
        t = string.split(data,'\t')
        if first_row == True:
            if 'row_clusters-flat' in t and 'row_clusters-flat' not in t[0]:
                headers = string.join(t[2:],'\t')+'\n'
                offset = 1
            else:
                headers = string.join(t[1:],'\t')+'\n'
                
            first_row = False
        else:
            key = t[0]
            if key!='column_clusters-flat':
                key_list.append(key)
                
                try: s = map(float,t[offset+1:])
                except Exception:
                    nonNumericsPresent=True
                key_db[key]=t
            else:
                clusters = map(str,t[offset+1:])
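    ### Rescale each cached row so its minimum becomes zero; non-numeric cells fall back to the placeholder 0.000101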
    for key in key_list:
            t = key_db[key]
            s=[key]
            if offset ==1: s.append('')
            temp=[]
            for value in t[offset+1:]:
                try: temp.append(float(value))
                except Exception: pass
            min1=min(temp)
        
            for value in t[offset+1:]:
                try: s.append(str(float(value)-min1))
                except Exception: s.append('0.000101')
            key_db[key]=s
    export_object = open(data_name,'w') 
    export_object.write(''+'\t'+headers) ### Header is the same for each file
    for key in key_list:
            t = key_db[key]
            if offset > 0:
                t = [t[0]]+t[1+offset:]
            export_object.write(string.join(t,'\t')+'\n') ### Write z-score values and row names
    export_object.close()
    print 'File written...'
    #return input_file

    
    array_names = []; array_linker_db = {}; d = 0; i = 0
    for entry in headers.split('\t'):
                
                entry=cleanUpLine(entry)
                if '::' in entry:
                    a = (entry.split("::"))
                elif ':' in entry:
                    a = (entry.split(":"))
                else:
                    a = (clusters[i],entry)
                #entry=string.join(a,'.')
              
                ent=entry+'\t'+a[0]
                #if(ent[0].isdigit()):
                #    ent='X'+ent[0:]
                
                #if '-' in ent:
                 #   ent=string.replace(ent,'-','.')
                #if '+' in ent:
                 #   ent=string.replace(ent,'+','.')
                    #print j
                array_names.append(ent)
                i+=1
        
    i=0
    eheader = string.join(['']+['Group'],'\t')+'\n' ### format column-flat-clusters for export
    export_cdt.write(eheader)
    for row in array_names:
        export_cdt.write(row+'\n')
        i+=1
    export_cdt.close()
    gheader = string.join(['']+ ['gene_short_name'],'\t')+'\n' ### format column-flat-clusters for export
    export_gene.write(gheader)
   
    for key in key_list:
        if key in gene_to_symbol:
            symbol = gene_to_symbol[key][0]
            export_gene.write(key+'\t'+symbol+'\n')
        else:
            export_gene.write(key+'\t'+key+'\n')
 
            
    export_object.close() 
    export_gene.close()
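For orientation, when the header entries follow the 'group:sample' convention this parser expects, the two helper files come out as plain two-column tables. The values below are hypothetical and only illustrate the layout:

sampleGroups.txt
	Group
GroupA:Sample_1	GroupA
GroupB:Sample_2	GroupB

geneAnnotations.txt
	gene_short_name
ENSG00000000001	Symbol1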
Esempio n. 19
0
def CreateFilesMonocle(filename, rawExpressionFile, species='Hs'):
    first_row = True
    key_db = {}
    key_list = []
    fn = filepath(filename)
    offset = 0
    nonNumericsPresent = False
    try:
        import gene_associations
        gene_to_symbol = gene_associations.getGeneToUid(
            species, ('hide', 'Ensembl-Symbol'))
    except Exception:
        print "gene_symbols present"
        gene_to_symbol = {}
    setWorkingDirectory(findParentDir(filename)[:-1])
    try:
        os.mkdir(findParentDir(filename) + '/Monocle')
    except Exception:
        pass
    #filename=self.File()
    x = 0
    data_name = findParentDir(filename) + '/Monocle/expressionFile.txt'
    gene_name = findParentDir(filename) + '/Monocle/geneAnnotations.txt'
    sample_name = findParentDir(filename) + '/Monocle/sampleGroups.txt'
    gene_names = []
    gene_list = []
    dat = []
    export_cdt = open(sample_name, 'w')
    export_gene = open(gene_name, 'w')
    for line in open(fn, 'rU').xreadlines():
        data = cleanUpLine(line)
        t = string.split(data, '\t')
        if first_row == True:
            if 'row_clusters-flat' in t and 'row_clusters-flat' not in t[0]:
                headers = string.join(t[2:], '\t') + '\n'
                offset = 1
            else:
                headers = string.join(t[1:], '\t') + '\n'

            first_row = False
        else:
            key = t[0]
            if key != 'column_clusters-flat':
                key_list.append(key)

                try:
                    s = map(float, t[offset + 1:])
                except Exception:
                    nonNumericsPresent = True
                key_db[key] = t

    for key in key_list:
        t = key_db[key]
        s = [key]
        if offset == 1: s.append('')
        temp = []
        for value in t[offset + 1:]:
            try:
                temp.append(float(value))
            except Exception:
                pass
        min1 = min(temp)

        for value in t[offset + 1:]:
            try:
                s.append(str(float(value) - min1))
            except Exception:
                s.append('0.000101')
        key_db[key] = s
    export_object = open(data_name, 'w')
    export_object.write('' + '\t' +
                        headers)  ### Header is the same for each file
    for key in key_list:
        t = key_db[key]
        if offset > 0:
            t = [t[0]] + t[1 + offset:]
        export_object.write(string.join(t, '\t') +
                            '\n')  ### Write z-score values and row names
    export_object.close()
    print 'File written...'
    #return input_file

    array_names = []
    array_linker_db = {}
    d = 0
    for entry in headers.split('\t'):

        entry = cleanUpLine(entry)
        if '::' in entry:
            a = (entry.split("::"))
        else:
            a = (entry.split(":"))

        #entry=string.join(a,'.')

        ent = entry + '\t' + a[0]
        #if(ent[0].isdigit()):
        #    ent='X'+ent[0:]

        #if '-' in ent:
        #   ent=string.replace(ent,'-','.')
        #if '+' in ent:
        #   ent=string.replace(ent,'+','.')
        #print j
        array_names.append(ent)
    i = 0
    eheader = string.join(
        [''] + ['Group'],
        '\t') + '\n'  ### format column-flat-clusters for export
    export_cdt.write(eheader)
    for row in array_names:
        export_cdt.write(row + '\n')
        i += 1
    export_cdt.close()
    gheader = string.join(
        [''] + ['gene_short_name'],
        '\t') + '\n'  ### format column-flat-clusters for export
    export_gene.write(gheader)

    for key in key_list:
        if key in gene_to_symbol:
            symbol = gene_to_symbol[key][0]
            export_gene.write(key + '\t' + symbol + '\n')
        else:
            export_gene.write(key + '\t' + key + '\n')

    export_object.close()
    export_gene.close()
Esempio n. 20
0
def importGeneExpressionValues(filename,tissue_specific_db,translation_db,useLog=False,previouslyRun=False,species=None):
    ### Import gene-level expression raw values           
    fn=filepath(filename); x=0; genes_added={}; gene_expression_db={}
    dataset_name = export.findFilename(filename)
    max_val=0
    print 'importing:',dataset_name
    
    try:
        import gene_associations, OBO_import
        gene_to_symbol = gene_associations.getGeneToUid(species,('hide','Ensembl-Symbol'))
        symbol_to_gene = OBO_import.swapKeyValues(gene_to_symbol)
    except Exception: symbol_to_gene={}
    
    for line in open(fn,'rU').xreadlines():
        data = cleanUpLine(line)
        t = string.split(data,'\t')
        
        if x==0:
            if '#' not in data:
                for i in t[1:]: sample_headers.append(i)
                x=1
        else:
            gene = t[0]
            #if '-' not in gene and ':E' in gene: print gene;sys.exit()
            if analysis_type == 'AltExon':
                try: ens_gene,exon = string.split(gene,'-')[:2]
                except Exception: exon = gene
                gene = exon
            if keyed_by == 'translation': ### alternative value is 'primaryID'
                """if gene == 'ENSMUSG00000025915-E19.3':
                    for i in translation_db: print [i], len(translation_db); break
                    print gene, [translation_db[gene]];sys.exit()"""
                try: gene = translation_db[gene] ### Ensembl annotations
                except Exception: pass
            try: gene = symbol_to_gene[gene][0] ### If RNASeq is the selected platform and Symbol is the uid
            except Exception: pass
            if gene in tissue_specific_db:
                index,tissue_exp=tissue_specific_db[gene]
                try: genes_added[gene]+=1
                except Exception: genes_added[gene]=1
                proceed=True
                try:
                    exp_vals = map(float, t[1:])
                    if platform == 'RNASeq':
                        if max(exp_vals)>max_val: max_val = max(exp_vals)
                        #if max(exp_vals)<3: proceed=False
                        if useLog==False:
                            exp_vals = map(lambda x: math.log(x+1,2),exp_vals)
                    if value_type == 'calls': ### Hence, this is a DABG or RNA-Seq expression
                        exp_vals = produceDetectionCalls(exp_vals,targetPlatform) ### 0 or 1 calls
                    if proceed:
                        gene_expression_db[gene] = [index,exp_vals]
                except Exception:
                    print 'Non-numeric values detected:'
                    x = 5
                    print t[:x]
                    while x < len(t):
                        print t[x:x+5]
                        x+=5
                    print 'Formatting error encountered in:',dataset_name; forceError

    print len(gene_expression_db), 'matching genes in the dataset and tissue compendium database'
    
    for gene in genes_added:
        if genes_added[gene]>1: del gene_expression_db[gene] ### delete entries that are present in the input set multiple times (not trustworthy)
        else: expession_subset.append(gene_expression_db[gene]) ### These contain the rank order and expression
    #print len(expession_subset);sys.exit()
    expession_subset.sort() ### This order now matches that of 
    gene_expression_db=[]
    
    if max_val<20 and platform == 'RNASeq' and previouslyRun==False: ### Only allow to happen once
        importGeneExpressionValues(filename,tissue_specific_db,translation_db,useLog=True,previouslyRun=True,species=species)
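The RNASeq branch above applies a log2(x+1) transform unless useLog is set, and when the largest input value stays below 20 it re-imports the file once with useLog=True, presumably because such values are already log-scaled. A minimal standalone sketch of the transform itself, on made-up values:

import math
exp_vals = [0.0, 7.0, 255.0]  ### hypothetical raw expression values
exp_vals = map(lambda x: math.log(x+1,2), exp_vals)  ### same call as in the loop above
print exp_vals  ### ~[0.0, 3.0, 8.0]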
Esempio n. 21
0
def importPAZARAssociations():
    pazar_files = unique.read_directory('/BuildDBs/PAZAR')
    species_db={}
    tf_to_target={}
    for file in pazar_files:
        if '.csv' in file:
            name = string.join(string.split(file,'_')[1:-1],'_')
            fn = filepath('BuildDBs/PAZAR/'+file)
            for line in open(fn,'rU').xreadlines():
                data = cleanUpLine(line)
                try:
                    ### Each line contains the following 11 tab-delim fields:
                    ### Fields are: <PAZAR TF ID>  <TF Name>  <PAZAR Gene ID>  <ensembl gene accession>  <chromosome>  <gene start coordinate>  <gene end coordinate>  <species>  <project name>  <PMID>  <analysis method> 
                    pazar_tf_id, tf_name, pazar_geneid, ens_gene, chr, gene_start,gene_end,species,project,pmid,analysis_method = string.split(data,'\t')
                    genus,species_name = string.split(species,' ')
                    species = genus[0]+species_name[0] ### e.g., 'Homo sapiens' -> 'Hs'
                    tft=TFTargetInfo(tf_name,ens_gene,project,pmid,analysis_method)
                    try: tf_to_target[species,tf_name].append(tft)
                    except Exception: tf_to_target[species,tf_name] = [tft]
                    species_db[species]=[]
                except Exception:
                    pass ### Occurs due to file formatting issues (during an update?)

    determine_tf_geneids = 'no'
    if determine_tf_geneids == 'yes':
        """ The below code is probably most useful for creation of complex regulatory inference networks in Cytoscape """
        uniprot_ensembl_db = importUniProtAnnotations(species_db)
        missing=[]
        tf_to_target_ens={}
        for (species,tf_name) in tf_to_target:
            original_tf_name = tf_name
            try:
                ens_gene = uniprot_ensembl_db[species,tf_name]
                tf_to_target_ens[ens_gene]=tf_to_target[species,tf_name]
            except Exception:
                try:
                    tf_name = string.split(tf_name,'_')[0]
                    ens_gene = uniprot_ensembl_db[species,tf_name]
                    tf_to_target_ens[ens_gene]=tf_to_target[species,original_tf_name]
                except Exception:
                    try:
                        tf_names=[]
                        if '/' in tf_name:
                            tf_names = string.split(tf_name,'/')
                        elif ' ' in tf_name:
                            tf_names = string.split(tf_name,' ')
                        for tf_name in tf_names:
                            ens_gene = uniprot_ensembl_db[species,tf_name]
                            tf_to_target_ens[ens_gene]=tf_to_target[species,original_tf_name]          
                    except Exception: missing.append((tf_name,species))
        print 'Ensembl IDs found for UniProt Transcription factor names:',len(tf_to_target_ens),'and missing:', len(missing)
        #print missing[:20]
        
    ### Translate all species data to gene symbol to export for all species
    species_tf_targets={}
    for (species,tf_name) in tf_to_target:
        try:
            tf_db = species_tf_targets[species]
            tf_db[tf_name] = tf_to_target[species,tf_name]
        except Exception:
            tf_db = {}
            tf_db[tf_name] = tf_to_target[species,tf_name]
            species_tf_targets[species] = tf_db
        
    tf_dir = 'BuildDBs/PAZAR/symbol/tf-target.txt'
    tf_data = export.ExportFile(tf_dir)
    tf_to_symbol={}
    #print 'Exporting:',tf_dir
    #print len(species_tf_targets)
    for species in species_tf_targets:
        try: gene_to_source_id = gene_associations.getGeneToUid(species,('hide','Ensembl-Symbol'))
        except Exception: gene_to_source_id={}
        tf_db = species_tf_targets[species]
        for tf_name in tf_db:
            for tft in tf_db[tf_name]:
                try:
                    for symbol in gene_to_source_id[tft.Ensembl()]:
                        symbol = string.lower(symbol)
                        tf_id = tf_name+'(Source:'+tft.Project()+'-PAZAR'+')'
                        tf_data.write(tf_id+'\t'+symbol+'\n')
                        try: tf_to_symbol[tf_id].append(symbol)
                        except Exception: tf_to_symbol[tf_id] = [symbol]
                except Exception: null=[]; 
    tf_data.close()
    tf_to_symbol = gene_associations.eliminate_redundant_dict_values(tf_to_symbol)
    return tf_to_symbol
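The parser above stores each PAZAR row in a TFTargetInfo object and later calls its Ensembl() and Project() accessors. That class is defined elsewhere in the module; a minimal stand-in consistent with how it is used here might look like this:

class TFTargetInfo:
    ### Sketch of a container for one PAZAR TF-target association
    def __init__(self,tf_name,ens_gene,project,pmid,analysis_method):
        self.tf_name = tf_name; self.ens_gene = ens_gene
        self.project = project; self.pmid = pmid
        self.analysis_method = analysis_method
    def Ensembl(self): return self.ens_gene  ### target gene accessor used above
    def Project(self): return self.project  ### project accessor used above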
Esempio n. 22
0
def reformatPolyAdenylationCoordinates(species, force):
    """ PolyA annotations are currently only available from UCSC for human, but flat file
    annotations from 2003-2006 are available for multiple species. Convert these to BED format"""
    version = {}
    version['Rn'] = '2003(rn3)'
    version['Dr'] = '2003(zv4)'
    version['Gg'] = '2004(galGal2)'
    version['Hs'] = '2006(hg8)'
    version['Mm'] = '2004(mm5)'

    print 'Exporting polyADB_2 coordinates as BED for', species
    ### Obtain the necessary database files
    url = 'http://altanalyze.org/archiveDBs/all/polyAsite.txt'
    output_dir = 'AltDatabase/ucsc/' + species + '/'
    if force == 'yes':
        filename, status = update.download(url, output_dir, '')
    else:
        filename = output_dir + 'polyAsite.txt'

    ### Import the refseq to Ensembl information
    import gene_associations
    import OBO_import
    import EnsemblImport
    import export
    try:
        ens_unigene = gene_associations.getGeneToUid(species,
                                                     'Ensembl-UniGene')
        print len(ens_unigene), 'Ensembl-UniGene entries imported'
        external_ensembl = OBO_import.swapKeyValues(ens_unigene)
        use_entrez = 'no'
    except Exception:
        ens_entrez = gene_associations.getGeneToUid(species,
                                                    'Ensembl-EntrezGene')
        print len(ens_entrez), 'Ensembl-EntrezGene entries imported'
        external_ensembl = OBO_import.swapKeyValues(ens_entrez)
        use_entrez = 'yes'
    gene_location_db = EnsemblImport.getEnsemblGeneLocations(
        species, 'RNASeq', 'key_by_array')

    export_bedfile = output_dir + species + '_polyADB_2_predictions.bed'
    print 'exporting', export_bedfile
    export_data = export.ExportFile(export_bedfile)
    header = '#' + species + '\t' + 'polyADB_2' + '\t' + version[species] + '\n'
    export_data.write(header)

    fn = filepath(filename)
    x = 0
    not_found = {}
    for line in open(fn, 'rU').xreadlines():
        data = cleanUpLine(line)
        if x == 0: x = 1
        else:
            siteid, llid, chr, sitenum, position, supporting_EST, cleavage = string.split(
                data, '\t')
            if chr == 'chrM':
                chr = 'chrMT'  ### MT is the Ensembl convention whereas M is the Affymetrix and UCSC convention
            if chr == 'M':
                chr = 'MT'  ### MT is the Ensembl convention whereas M is the Affymetrix and UCSC convention
            if species in siteid:
                if 'NA' not in chr: chr = 'chr' + chr
                strand = '+'
                geneid = siteid
                pos_start = str(int(position) - 1)
                pos_end = position
                if use_entrez == 'no':
                    external_geneid = string.join(
                        string.split(siteid, '.')[:2], '.')
                else:
                    external_geneid = llid
                if external_geneid in external_ensembl:
                    ens_geneid = external_ensembl[external_geneid][0]
                    geneid += '-' + ens_geneid
                    chr, strand, start, end = gene_location_db[ens_geneid]
                else:
                    not_found[external_geneid] = []
                    bed_format = string.join(
                        [chr, pos_start, pos_end, geneid, '0', '-'], '\t'
                    ) + '\n'  ### We don't know the strand, so write out both strands
                    export_data.write(bed_format)
                bed_format = string.join(
                    [chr, pos_start, pos_end, geneid, '0', strand],
                    '\t') + '\n'
                export_data.write(bed_format)
    export_data.close()
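A minimal calling sketch, assuming this runs inside the same module so that filepath, cleanUpLine, update and the AltDatabase directory layout resolve as in the examples above:

reformatPolyAdenylationCoordinates('Hs','no')   ### reuse a previously downloaded polyAsite.txt
reformatPolyAdenylationCoordinates('Mm','yes')  ### force a fresh download before exporting the BED file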