def compareProteinFeatures(protein_ft,neg_coding_seq,pos_coding_seq):
    ###Parse out ft-information. Generate ft-fragment sequences for querying
    ###This is a modification of the original script from FeatureAlignment but simplified for exon analysis
    protein_ft_unique=[]; new_ft_list = []
    for ft_data in protein_ft:
        ft_name = ft_data.PrimaryAnnot(); domain_seq = ft_data.DomainSeq(); annotation = ft_data.SecondaryAnnot()
        protein_ft_unique.append((ft_name,annotation,domain_seq))
    ###Redundant entries that are class objects can't be eliminated, so save to a new list and eliminate redundant entries
    protein_ft_unique = unique.unique(protein_ft_unique)
    for (ft_name,annotation,domain_seq) in protein_ft_unique:
        ft_length = len(domain_seq)
        new_ft_data = 'null',domain_seq,ft_name,annotation
        new_ft_list.append(new_ft_data)
    new_ft_list = unique.unique(new_ft_list)
    pos_ft = []; neg_ft = []; all_fts = []
    for (pos,seq,ft_name,annot) in new_ft_list:
        if seq in pos_coding_seq:
            pos_ft.append([pos,seq,ft_name,annot]); all_fts.append([pos,seq,ft_name,annot])
        if seq in neg_coding_seq:
            neg_ft.append([pos,seq,ft_name,annot]); all_fts.append([pos,seq,ft_name,annot])
    all_fts = unique.unique(all_fts)
    pos_ft_missing=[]; neg_ft_missing=[]
    for entry in all_fts:
        if entry not in pos_ft: pos_ft_missing.append(entry)
        if entry not in neg_ft: neg_ft_missing.append(entry)
    pos_ft_missing2=[]; neg_ft_missing2=[]
    for entry in pos_ft_missing: entry[1] = ''; pos_ft_missing2.append(entry)
    for entry in neg_ft_missing: entry[1] = ''; neg_ft_missing2.append(entry)
        
    pos_ft_missing2 = unique.unique(pos_ft_missing2)
    neg_ft_missing2 = unique.unique(neg_ft_missing2)  
    return neg_ft_missing2,pos_ft_missing2
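A minimal, self-contained sketch of the comparison above, using plain (name, annotation, sequence) tuples in place of the feature objects and a Python set in place of unique.unique; the helper name is hypothetical:

def compare_features_sketch(features, neg_coding_seq, pos_coding_seq):
    ### features: iterable of (ft_name, annotation, domain_seq) tuples
    pos_missing = []; neg_missing = []
    for ft_name, annotation, domain_seq in set(features):
        in_pos = domain_seq in pos_coding_seq ### simple substring membership, as above
        in_neg = domain_seq in neg_coding_seq
        if in_neg and not in_pos: pos_missing.append((ft_name, annotation)) ### absent from the positive isoform
        if in_pos and not in_neg: neg_missing.append((ft_name, annotation)) ### absent from the negative isoform
    return neg_missing, pos_missing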
def grabRNAIdentifiers(mrna_assignment):
    ensembl_ids=[]; mRNA_ids=[]
    mRNA_entries = string.split(mrna_assignment,' /// ')
    for entry in mRNA_entries:
        mRNA_info = string.split(entry,' // '); mrna_ac = mRNA_info[0]
        if 'ENS' in mrna_ac: ensembl_ids.append(mrna_ac)
        else:
            try: int(mrna_ac[-3:]); mRNA_ids.append(mrna_ac)
            except ValueError: continue
    ensembl_ids = unique.unique(ensembl_ids)
    mRNA_ids = unique.unique(mRNA_ids)
    return ensembl_ids, mRNA_ids
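A hypothetical call: entries in the Affymetrix-style assignment string are separated by ' /// ', fields within an entry by ' // ', and accessions whose last three characters are digits are treated as mRNA IDs (values below are illustrative):

assignment = 'ENSG00000139618 // cdna:known // chr13 /// NM_000059 // RefSeq // chr13'
ensembl_ids, mRNA_ids = grabRNAIdentifiers(assignment)
### ensembl_ids -> ['ENSG00000139618']; mRNA_ids -> ['NM_000059']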
def getFeatureIsoformGenomePositions(species,protein_ft_db,mRNA_protein_seq_db,gene_transcript_db,coordinate_type):
    """ Adapted from compareProteinFeatures but for one isoform and returns genomic coordinates for each feature
    This function is designed to export all unique isoforms rather than just comparison isoforms """
    
    import export
    export_file = 'AltDatabase/ensembl/'+species+'/ProteinFeatureIsoform_complete.txt'                
    export_data = export.ExportFile(export_file)

    failed = 0
    worked = 0
    failed_ac=[]
    for gene in protein_ft_db:
        transcript_feature_db={}
        for ft in protein_ft_db[gene]:
            try:
                ft_name = ft.PrimaryAnnot(); annotation = ft.SecondaryAnnot()
                for (mRNA,type) in gene_transcript_db[gene]:
                    try:
                        protein,protein_seq = mRNA_protein_seq_db[mRNA]
                        error = False
                    except Exception:
                        failed_ac.append(mRNA)
                        error = True
                    if error == False:
                        if ft.DomainSeq() in protein_seq:
                            ### this variant exports both coordinate systems (the coordinate_type conditional is disabled)
                            pos1_genomic = ft.GenomicStart(); pos2_genomic = ft.GenomicStop()
                            pos1 = str(ft.DomainStart()); pos2 = str(ft.DomainEnd())
    
                            ### There are often many features that overlap within a transcript, so consistently pick just one
                            if mRNA in transcript_feature_db:
                                db = transcript_feature_db[mRNA]
                                if (pos1,pos2) in db:
                                    db[pos1, pos2].append([pos1_genomic, pos2_genomic, protein,ft_name,annotation])
                                else:
                                    db[pos1, pos2]=[[pos1_genomic, pos2_genomic, protein,ft_name,annotation]]
                            else:
                                db={}
                                db[pos1, pos2]=[[pos1_genomic, pos2_genomic, protein,ft_name,annotation]]
                                transcript_feature_db[mRNA] = db
                                
                            #values = [mRNA, protein, pos1, pos2,ft_name,annotation]; unique_entries.append(values)
                            worked+=1
            except IOError:
                failed+=1

        for transcript in transcript_feature_db:
            db = transcript_feature_db[transcript]
            for (pos1,pos2) in db:
                db[pos1,pos2].sort() ### Pick the alphabetically listed first feature
                pos1_genomic, pos2_genomic, protein,ft_name,annotation = db[pos1,pos2][0]
                values = [transcript, protein, pos1, pos2,pos1_genomic, pos2_genomic, ft_name,annotation]
                export_data.write(string.join(values,'\t')+'\n')
                
    export_data.close()
    print failed,'features failed to have corresponding aligned genomic locations out of', worked+failed
    failed_ac = unique.unique(failed_ac)
    print len(failed_ac),'mRNAs without identified/in silico derived proteins'  ### Appear to be ncRNAs without ATGs
    print failed_ac[:20]
Example #5
    def FindAssociatedExonsBlocks(self):
        """ Indentify matching exon/intron regions for the circRNA coordinates """

        chr, cstart, cend = self.coordinates
        circRNA_coords = [cstart, cend]
        circRNA_coords.sort()
        if self.GeneID() in gene_to_exons:
            search_blocks = gene_to_exons[self.GeneID()]
            aligned_blocks = []
            for exon_block in search_blocks:
                chr, strand, start, end = exon_block_coordinates[exon_block]
                block_coords = [start, end]
                block_coords.sort()
                coords = circRNA_coords + block_coords
                coords.sort()
                if len(unique.unique(coords)) == 3:
                    if exon_block not in aligned_blocks:
                        aligned_blocks.append(exon_block)
                elif coords[:2] == circRNA_coords or coords[
                        -2:] == circRNA_coords:
                    pass
                else:
                    if exon_block not in aligned_blocks:
                        aligned_blocks.append(exon_block)
            self.aligned_blocks = aligned_blocks
        else:
            self.aligned_blocks = []
        return self.aligned_blocks
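A worked illustration of the interval test above, with hypothetical coordinates: sorting the four endpoints together reveals whether the circRNA and the exon block interleave.

circRNA_coords = [100, 500]                    ### sorted circRNA start/end
block_coords = [400, 900]                      ### sorted exon-block start/end
coords = sorted(circRNA_coords + block_coords) ### [100, 400, 500, 900]
### coords[:2] != circRNA_coords and coords[-2:] != circRNA_coords, so the
### intervals interleave and the block is kept as aligned. With
### block_coords = [600, 900], coords[:2] == circRNA_coords, so the circRNA
### lies entirely before the block and it is skipped; a shared endpoint
### (only three unique values among the four) also counts as aligned.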
Example #6
def eliminate_redundant_dict_values(database):
    db1 = {}
    for key in database:
        list = unique.unique(database[key])
        list.sort()
        db1[key] = list
    return db1
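A hypothetical call, assuming unique.unique returns a de-duplicated list:

db = {'gene1': ['E1', 'E2', 'E1'], 'gene2': ['E3']}
db = eliminate_redundant_dict_values(db)
### -> {'gene1': ['E1', 'E2'], 'gene2': ['E3']}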
Example #8
def reformatHeatmapFile(input_file):
    import unique

    export_file = string.replace(input_file, "Clustering-", "Input-")
    eo = export.ExportFile(export_file)
    first_row = True
    fn = filepath(input_file)
    for line in open(fn, "rU").xreadlines():
        data = cleanUpLine(line)
        t = string.split(data, "\t")
        if first_row == True:
            if "column_clusters-flat" not in t:
                array_names = []
                for i in t[2:]:
                    array_names.append(string.replace(i, ":", "-"))
                    # array_names.append(i)
            elif "column_clusters-flat" in t:
                array_clusters = t[2:]
                unique_clusters = unique.unique(array_clusters)
                ind = 0
                headers = []
                for c in array_clusters:
                    headers.append(c + "::" + array_names[ind])
                    ind += 1
                headers = string.join(["uid"] + headers, "\t") + "\n"
                eo.write(headers)
                first_row = False
        else:
            values = string.join([t[0]] + t[2:], "\t") + "\n"
            eo.write(values)
    return export_file, len(unique_clusters)
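An illustrative round trip for the reformatting above (file contents hypothetical): the sample names from the header row and the column_clusters-flat row are merged into cluster::sample column names, and the row-cluster column is dropped.

### 'Clustering-' input (tab-delimited):
###   uid                   row_clusters-flat   S1:groupA   S2:groupB
###   column_clusters-flat  -                   1           2
###   GeneX                 1                   5.2         0.3
### resulting 'Input-' file:
###   uid            1::S1-groupA   2::S2-groupB
###   GeneX          5.2            0.3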
Example #9
def eliminateRedundant(database):
    db1 = {}
    for key in database:
        list = unique.unique(database[key])
        list.sort()
        db1[key] = list
    return db1
Example #10
def reformatHeatmapFile(input_file):
    import unique
    export_file = string.replace(input_file, 'Clustering-', 'Input-')
    eo = export.ExportFile(export_file)
    first_row = True
    fn = filepath(input_file)
    for line in open(fn, 'rU').xreadlines():
        data = cleanUpLine(line)
        t = string.split(data, '\t')
        if first_row == True:
            if 'column_clusters-flat' not in t:
                array_names = []
                for i in t[2:]:
                    array_names.append(string.replace(i, ':', '-'))
                    #array_names.append(i)
            elif 'column_clusters-flat' in t:
                array_clusters = t[2:]
                unique_clusters = unique.unique(array_clusters)
                ind = 0
                headers = []
                for c in array_clusters:
                    headers.append(c + '::' + array_names[ind])
                    ind += 1
                headers = string.join(['uid'] + headers, '\t') + '\n'
                eo.write(headers)
                first_row = False
        else:
            values = string.join([t[0]] + t[2:], '\t') + '\n'
            eo.write(values)
    return export_file, len(unique_clusters)
def getFeatureIsoformGenomePositions(species,protein_ft_db,mRNA_protein_seq_db,gene_transcript_db,coordinate_type):
    """ Adapted from compareProteinFeatures but for one isoform and returns genomic coordinates for each feature
    This function is designed to export all unique isoforms rather than just comparison isoforms """
    
    import export
    export_file = 'AltDatabase/ensembl/'+species+'/ProteinFeatureIsoform_complete.txt'                
    export_data = export.ExportFile(export_file)

    failed = 0
    worked = 0
    failed_ac=[]
    for gene in protein_ft_db:
        transcript_feature_db={}
        for ft in protein_ft_db[gene]:
            try:
                ft_name = ft.PrimaryAnnot(); annotation = ft.SecondaryAnnot()
                for (mRNA,type) in gene_transcript_db[gene]:
                    try:
                        protein,protein_seq = mRNA_protein_seq_db[mRNA]
                        error = False
                    except Exception:
                        failed_ac.append(mRNA)
                        error = True
                    if error == False:
                        if ft.DomainSeq() in protein_seq:
                            if coordinate_type == 'genomic':
                                pos1 = ft.GenomicStart(); pos2 = ft.GenomicStop()
                            else:
                                pos1 = str(ft.DomainStart()); pos2 = str(ft.DomainEnd())
    
                            ### There are often many features that overlap within a transcript, so consistently pick just one
                            if mRNA in transcript_feature_db:
                                db = transcript_feature_db[mRNA]
                                if (pos1,pos2) in db:
                                    db[pos1, pos2].append([protein,ft_name,annotation])
                                else:
                                    db[pos1, pos2]=[[protein,ft_name,annotation]]
                            else:
                                db={}
                                db[pos1, pos2]=[[protein,ft_name,annotation]]
                                transcript_feature_db[mRNA] = db
                                
                            #values = [mRNA, protein, pos1, pos2,ft_name,annotation]; unique_entries.append(values)
                            worked+=1
            except IOError:
                failed+=1

        for transcript in transcript_feature_db:
            db = transcript_feature_db[transcript]
            for (pos1,pos2) in db:
                db[pos1,pos2].sort() ### Pick the alphabetically listed first feature
                protein,ft_name,annotation = db[pos1,pos2][0]
                values = [transcript, protein, pos1, pos2,ft_name,annotation]
                export_data.write(string.join(values,'\t')+'\n')
                
    export_data.close()
    print failed,'features failed to have corresponding aligned genomic locations out of', worked+failed
    failed_ac = unique.unique(failed_ac)
    print len(failed_ac),'mRNAs without identified/in silico derived proteins'  ### Appear to be ncRNAs without ATGs
    print failed_ac[:20]
Example #12
def eliminateRedundant(database):
    db1 = {}
    for key in database:
        list = unique.unique(database[key])
        list.sort()
        db1[key] = list
    return db1
Example #13
def write_subj_list(ls_dir, test, subj_ls_stem, subjects, studies):
    ls_prep = '8100_20180806_'
    outdata = np.array([subjects, studies])
    outdata = outdata.T.tolist()

    outdir = ls_dir + '/subjects/per_test/' + test
    mkdir_p(outdir)
    outcsv = outdir + '/' + ls_prep + subj_ls_stem + '_SubjStudy.csv'
    outtxt = outdir + '/' + ls_prep + subj_ls_stem + '_UniqSubj.txt'

    with open(outcsv, 'w') as f:
        wr = csv.writer(f)
        wr.writerows(outdata)

    f = open(outtxt, 'w')
    strout = '\n'.join(unique(subjects))
    f.write(strout + '\n')
    f.close()

    # per race
    races = ['1', '2', '3', '4', '5', '6', 'DK']
    for race in races:
        race_ls = ls_dir + '/subjects/per_race/' + ls_prep + subj_ls_stem + '_subj_' + race + '.txt'
        f = open(race_ls, 'r')
        subj_race = f.read()
        subj_race = subj_race.split()
        subj_keep_race = np.intersect1d(subjects, subj_race)

        outtxt = outdir + '/' + ls_prep + subj_ls_stem + '_subj_' + race + '.txt'
        f = open(outtxt, 'w')
        strout = '\n'.join(subj_keep_race)
        if strout:
            strout = strout + '\n'
        f.write(strout)
        f.close()
def polynomials2matrix(polynomial):
    p = eqsize(polynomial)
    nt = sum(nterms(p))
    nv = nvars(p[0])

    M = zeros((nv, nt), dtype=int32)
    inds = [None] * p.size
    k = 0
    for i in range(0, p.size):
        inds[i] = list(range(k, k + nterms(p[i])))  # MATLAB k:k+n-1 maps to Python range(k, k+n)
        M[:, k:k + nterms(p[i])] = monomials(p[i])
        k = k + nterms(p[i])

    # graded-reverse ordering key: negative total degree, then flipped exponent rows
    neg_M_sum = -1 * M.sum(axis=0)
    M_trans = M.conj().T
    neg_M_sum_trans = neg_M_sum.conj().T
    M_fliplr = fliplr(M_trans)

    new_grev_M = concatenate((neg_M_sum_trans.reshape(-1, 1), M_fliplr), axis=1)

    # assumes a MATLAB-style unique-rows helper returning (values, first indices ia, inverse ib)
    _, ia, ib = unique(new_grev_M)

    M = M[:, ia].astype(float)

    mon = zeros((M.shape[1], 1), dtype=object)
    for i in range(M.shape[1] - 1, -1, -1):
        mon[i, 0] = Multipol.multipol(1, M[:, i])

    C = zeros((p.size, M.shape[1]))
    for i in range(0, p.size):
        ind = ib[inds[i]]
        C[i, ind] = coeffs(p[i])

    return C, mon
Example #15
def reformatHeatmapFile(input_file):
    import unique
    export_file=string.replace(input_file,'Clustering-','Input-')
    eo = export.ExportFile(export_file)
    first_row = True
    fn=filepath(input_file)
    for line in open(fn,'rU').xreadlines():
        data = cleanUpLine(line)
        t = string.split(data,'\t')
        if first_row == True:
            if 'column_clusters-flat' not in t:
                array_names = []
                for i in t[2:]:
                    array_names.append(string.replace(i,':','-'))
                    #array_names.append(i)
            elif 'column_clusters-flat' in t:
                array_clusters = t[2:]
                unique_clusters = unique.unique(array_clusters)
                ind=0; headers=[]
                for c in array_clusters:
                    headers.append(c+'::'+array_names[ind])
                    ind+=1
                headers = string.join(['uid']+headers,'\t')+'\n'
                eo.write(headers)
                first_row = False
        else:
            values = string.join([t[0]]+t[2:],'\t')+'\n'
            eo.write(values)
    return export_file, len(unique_clusters)
Example #16
def importSplicingEventsToVisualize(eventsToVisualizeFilename):
    splicing_events = []
    ### Import the splicing events to visualize from an external text file (multiple formats supported)
    type = None
    expandedSearch = False
    firstLine = True
    for line in open(eventsToVisualizeFilename, 'rU').xreadlines():
        line = cleanUpLine(line)
        t = string.split(line, '\t')
        if firstLine:
            if 'junctionID-1' in t:
                j1i = t.index('junctionID-1')
                j2i = t.index('junctionID-2')
                type = 'ASPIRE'
                expandedSearch = True
            if 'ANOVA' in t:
                type = 'PSI'
            elif 'independent confirmation' in t:
                type = 'confirmed'
                expandedSearch = True
            elif 'ANOVA' in eventsToVisualizeFilename:
                type = 'ANOVA'
            firstLine = False
        if '|' in t[0]:
            type = 'ANOVA'
        if ' ' in t[0] and ':' in t[0]:
            splicing_events.append(t[0])
        elif type == 'ASPIRE':
            splicing_events.append(t[j1i] + ' ' + t[j2i])
            splicing_events.append(t[j2i] + ' ' + t[j1i])
        elif type == 'ANOVA':
            try:
                a, b = string.split(t[0], '|')
                a = string.split(a, ':')
                a = string.join(a[1:], ':')
                splicing_events.append(a + ' ' + b)
                splicing_events.append(b + ' ' + a)
            except Exception:
                pass
        elif type == 'PSI':
            try:
                j1, j2 = string.split(t[0], '|')
                a, b, c = string.split(j1, ':')
                j1 = b + ':' + c
                splicing_events.append(j1 + ' ' + j2)
                splicing_events.append(j2 + ' ' + j1)
            except Exception:
                #print traceback.format_exc();sys.exit()
                pass
        elif type == 'confirmed':
            try:
                event_pair1 = string.split(t[1], '|')[0]
                a, b, c, d = string.split(event_pair1, '-')
                splicing_events.append(a + '-' + b + ' ' + c + '-' + d)
                splicing_events.append(c + '-' + d + ' ' + a + '-' + b)
            except Exception:
                pass
    splicing_events = unique.unique(splicing_events)
    return splicing_events, expandedSearch
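Illustrative first-column values for each supported format (all IDs hypothetical); every recognized row yields a two-junction event string joined by a space, appended in both orders:

### PSI rows, 'symbol:gene:junction1|gene:junction2' in column 1:
###   'SYM:ENSG0000001:E2.1-E3.1|ENSG0000001:E2.1-E4.1'
###     -> 'ENSG0000001:E2.1-E3.1 ENSG0000001:E2.1-E4.1' (plus the reverse)
### ASPIRE rows take the pair from the 'junctionID-1'/'junctionID-2' columns;
### 'confirmed' rows split an 'a-b-c-d' event pair in column 2 into 'a-b c-d';
### rows whose first column already contains a space and a colon are used verbatim.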
Example #17
def importSplicingEventsToVisualize(eventsToVisualizeFilename):
    splicing_events = []
    ### Import the splicing events to visualize from an external text file (multiple formats supported)
    type = None
    expandedSearch = False
    firstLine = True
    for line in open(eventsToVisualizeFilename, "rU").xreadlines():
        line = cleanUpLine(line)
        t = string.split(line, "\t")
        if firstLine:
            if "junctionID-1" in t:
                j1i = t.index("junctionID-1")
                j2i = t.index("junctionID-2")
                type = "ASPIRE"
                expandedSearch = True
            if "ANOVA" in t:
                type = "PSI"
            elif "independent confirmation" in t:
                type = "confirmed"
                expandedSearch = True
            elif "ANOVA" in eventsToVisualizeFilename:
                type = "ANOVA"
            firstLine = False
        if "|" in t[0]:
            type = "ANOVA"
        if " " in t[0] and ":" in t[0]:
            splicing_events.append(t[0])
        elif type == "ASPIRE":
            splicing_events.append(t[j1i] + " " + t[j2i])
            splicing_events.append(t[j2i] + " " + t[j1i])
        elif type == "ANOVA":
            try:
                a, b = string.split(t[0], "|")
                a = string.split(a, ":")
                a = string.join(a[1:], ":")
                splicing_events.append(a + " " + b)
                splicing_events.append(b + " " + a)
            except Exception:
                pass
        elif type == "PSI":
            try:
                j1, j2 = string.split(t[0], "|")
                a, b, c = string.split(j1, ":")
                j1 = b + ":" + c
                splicing_events.append(j1 + " " + j2)
                splicing_events.append(j2 + " " + j1)
            except Exception:
                # print traceback.format_exc();sys.exit()
                pass
        elif type == "confirmed":
            try:
                event_pair1 = string.split(t[1], "|")[0]
                a, b, c, d = string.split(event_pair1, "-")
                splicing_events.append(a + "-" + b + " " + c + "-" + d)
                splicing_events.append(c + "-" + d + " " + a + "-" + b)
            except Exception:
                pass
    splicing_events = unique.unique(splicing_events)
    return splicing_events, expandedSearch
def sangerImport(parse_sequences):
    """"Sanger center (miRBase) sequence was provided as a custom (requested) dump of their v5 target predictions
    (http://microrna.sanger.ac.uk/targets/v5/), containing Ensembl gene IDs, microRNA names, and putative target
    sequences, specific for either mouse or human. Mouse was requested in late 2005 whereas human in late 2007.
    These same annotation files, missing the actual target sequence but containing an ENS transcript and coordinate
    locations for that build (allowing seqeunce extraction with the appropriate Ensembl build) exist at:
    http://microrna.sanger.ac.uk/cgi-bin/targets/v5/download.pl"""
    
    if species == 'Hs': filename = 'AltDatabase/miRBS/'+species+'/'+'mirbase-v5_homo_sapiens.mirna.txt'; prefix = 'hsa-'
    if species == 'Rn': filename = 'AltDatabase/miRBS/'+species+'/'+'sanger_miR_target_predictions.txt'; prefix = 'rno-'
    if species == 'Mm': filename = 'AltDatabase/miRBS/'+species+'/'+'sanger_miR_target_predictions.txt'; prefix = 'mmu-'    

    print 'parsing', filename; count=0
    fn=filepath(filename); x=1; mir_sequences=[]
    verifyFile(filename,species) ### Makes sure file is local and if not downloads.
    for line in open(fn,'rU').xreadlines():         
        data = cleanUpLine(line)
        t = string.split(data,'\t')
        if x==0: x=1
        else:
            ensembl_geneids=[]
            if species == 'Hs':
                try:
                    mir = t[1]; ens_transcript = t[2]; ensembl_geneid = t[17]; mir_sequences = string.upper(t[14])
                    ensembl_geneids.append(ensembl_geneid)
                except IndexError: print line;kill ### 'kill' is intentionally undefined; referencing it aborts parsing with a NameError
            elif species == 'Mm':
                ens_transcript,mir,mir_sequences = t
                if ens_transcript in ens_gene_to_transcript:
                    ensembl_geneids = ens_gene_to_transcript[ens_transcript]; ensembl_geneid = ensembl_geneids[0]
            elif species == 'Rn':
                ensembl_geneid,mir,mir_sequences = t
                mir_sequences = string.lower(mir_sequences); mir = string.replace(mir,'hsa','rno'); mir = string.replace(mir,'mmu','rno')
                ensembl_geneids=[ensembl_geneid]
            geneid_ls=[]
            #mir_sequences = string.replace(mir_sequences,'-',''); mir_sequences = string.replace(mir_sequences,'=','')
            #mir_sequences = string.upper(mir_sequences)
            #if 'GGCTCCTGTCACCTGGGTCCGT' in mir_sequences:
            #print ensembl_geneid, mir; sys.exit()
            for ensembl_geneid in ensembl_geneids:
                if ensembl_geneid in redundant_ensembl_by_build: ###Thus there are redundant geneids
                    geneid_ls += redundant_ensembl_by_build[ensembl_geneid]+[ensembl_geneid]
                else: geneid_ls += [ensembl_geneid]
                if species == 'Hs':
                    if ens_transcript in ens_gene_to_transcript: geneid_ls+= ens_gene_to_transcript[ens_transcript] 
            geneid_ls = unique.unique(geneid_ls)
            if len(geneid_ls) == 1 and geneid_ls[0]=='': null =[] ###not a valid gene
            elif prefix in mir:
                for ensembl_geneid in geneid_ls:
                    if parse_sequences == 'yes':
                        if (mir,ensembl_geneid) in combined_results:
                            mir_sequences = string.replace(mir_sequences,'-',''); mir_sequences = string.replace(mir_sequences,'=',''); count+=1
                            combined_results[(mir,ensembl_geneid)].append(string.upper(mir_sequences))
                    else:
                        if prefix in mir:
                            y = MicroRNATargetData(ensembl_geneid,'',mir,mir_sequences,'mirbase'); count+=1
                            try: microRNA_target_db[mir].append(y)
                            except KeyError: microRNA_target_db[mir] = [y]
    print count, 'miRNA-target relationships added for mirbase'
Example #19
def importSplicingEventsToVisualize(eventsToVisualizeFilename):
    splicing_events=[]
    ### Import the splicing events to visualize from an external text file (multiple formats supported)
    type = None
    expandedSearch = False
    firstLine = True
    for line in open(eventsToVisualizeFilename,'rU').xreadlines():
        line = cleanUpLine(line)
        t = string.split(line,'\t')
        if firstLine:
            if 'junctionID-1' in t:
                j1i = t.index('junctionID-1')
                j2i = t.index('junctionID-2')
                type='ASPIRE'
                expandedSearch = True
            if 'ANOVA' in t:
                type='PSI'
            elif 'independent confirmation' in t:
                type='confirmed'
                expandedSearch = True
            elif 'ANOVA' in eventsToVisualizeFilename:
                type = 'ANOVA'
            firstLine=False
        if '|' in t[0]:
            type = 'ANOVA'
        if ' ' in t[0] and ':' in t[0]:
            splicing_events.append(t[0])
        elif type=='ASPIRE':
            splicing_events.append(t[j1i] +' '+ t[j2i])
            splicing_events.append(t[j2i] +' '+ t[j1i])
        elif type=='ANOVA':
            try:
                a,b = string.split(t[0],'|')
                a = string.split(a,':')
                a = string.join(a[1:],':')
                splicing_events.append(a +' '+ b)
                splicing_events.append(b +' '+ a)
            except Exception: pass
        elif type=='PSI':
            try:
                j1,j2 = string.split(t[0],'|')
                a,b,c = string.split(j1,':')
                j1 = b+':'+c
                splicing_events.append(j1 +' '+ j2)
                splicing_events.append(j2 +' '+ j1)
            except Exception:
                #print traceback.format_exc();sys.exit()
                pass
        elif type=='confirmed':
            try:
                event_pair1 = string.split(t[1],'|')[0]
                a,b,c,d = string.split(event_pair1,'-')
                splicing_events.append(a+'-'+b +' '+ c+'-'+d)
                splicing_events.append(c+'-'+d +' '+ a+'-'+b)
            except Exception: pass
        else:
            splicing_events.append(t[0])
    splicing_events = unique.unique(splicing_events)
    return splicing_events,expandedSearch
Example #20
def writeIsoformFile(isoform_junctions,o):
    for coord in isoform_junctions:
        isoform_junctions[coord] = unique.unique(isoform_junctions[coord])
        if '+' in coord:
            print coord, isoform_junctions[coord] ### debug output for forward-strand coordinates

    if '+' in coord:
        sys.exit() ### debug exit; aborts if the last coordinate seen is forward-strand
def grabRNAIdentifiers(mrna_assignment):
    ensembl_ids = []
    mRNA_ids = []
    mRNA_entries = string.split(mrna_assignment, ' /// ')
    for entry in mRNA_entries:
        mRNA_info = string.split(entry, ' // ')
        mrna_ac = mRNA_info[0]
        if 'ENS' in mrna_ac: ensembl_ids.append(mrna_ac)
        else:
            try:
                int(mrna_ac[-3:])
                mRNA_ids.append(mrna_ac)
            except ValueError:
                continue
    ensembl_ids = unique.unique(ensembl_ids)
    mRNA_ids = unique.unique(mRNA_ids)
    return ensembl_ids, mRNA_ids
Example #22
def writeIsoformFile(isoform_junctions, o):
    for coord in isoform_junctions:
        isoform_junctions[coord] = unique.unique(isoform_junctions[coord])
        if '+' in coord:
            print coord, isoform_junctions[coord]

    if '+' in coord:
        sys.exit()
Example #23
def eliminateRedundant(database):
    for key in database:
        try:
            list = makeUnique(database[key])
            list.sort()
        except Exception: list = unique.unique(database[key])
        database[key] = list
    return database
Example #24
 def Coordinates(self):
     x=0; coords=[]
     for i in self.start_set:
         coord = self.Chr()+':'+str(i)+'-'+str(self.end_set[x])
         coords.append(coord)
         x+=1
     coords = unique.unique(coords)
     coords = string.join(coords,'|') ###If multiple coordinates
     return coords
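A minimal sketch of the string this method builds, with hypothetical start/end sets:

start_set = [100, 200]; end_set = [150, 250]; chromosome = 'chr1'
coords = ['%s:%d-%d' % (chromosome, s, e) for s, e in zip(start_set, end_set)]
coords = '|'.join(coords) ### -> 'chr1:100-150|chr1:200-250'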
Example #25
def get_subjects(csv_dir, ls_dir, test, subj_ls_stem):
    if 'Penn_CNP' in test:
        csv = csv_dir + '/8100_' + 'Penn_CNP_(12-18-13)' + '_20180806.csv'
    else:
        csv = csv_dir + '/8100_' + test + '_20180806.csv'
    pheno_ls = ls_dir + '/phenotypes/' + test + '.txt'
    subj_ls = ls_dir + '/subjects/8100_20180806_' + subj_ls_stem + '_subj.txt'
    outdir = ls_dir + '/subjects/per_test/' + test
    mkdir_p(outdir)

    subj_hdr = 'ID'
    study_hdr = "SUB_STUDY"
    f = open(pheno_ls, 'r')
    hdr_str = f.read()
    f.close()
    hdr_ls = hdr_str.split()
    hdr_ls.insert(0, study_hdr)
    hdr_ls.insert(0, subj_hdr)

    df = gs.JWL_parse_delimited_file(fname=csv, keep_hdrs=hdr_ls, rowfilter_hdr=subj_hdr, \
     rowfilter_list=subj_ls, delim=',')
    df.index = range(len(df))
    tot_Nsubj, subj_keep, row_idx = gs.JWL_count_subj_nonempty_intersect(
        df, subj_hdr, hdr_ls[1:], out_txt=None)

    subj_keep = subj_keep.to_list()
    subj_uniq = unique(subj_keep)
    Nsubj_uniq = len(subj_uniq)

    subj_multi_studies = []
    for s in subj_uniq:
        Nstudy_subj = subj_keep.count(s)
        if Nstudy_subj > 1:
            subj_multi_studies.append(s)

    studies = df.loc[row_idx, study_hdr]
    studies = studies.to_list()
    studies_uniq = unique(studies)
    Nsubj_per_study = []
    for study in studies_uniq:
        Nsubj_study = studies.count(study)
        Nsubj_per_study.append(Nsubj_study)
    return tot_Nsubj, Nsubj_uniq, subj_keep, subj_multi_studies, studies, studies_uniq, Nsubj_per_study
Example #26
def grabNestedOntologyIDs():
    nested_ontology_tree={}
    for path in path_dictionary:
        parent_ontology_id = path_ontology_db[path]
        child_ontology_list=[]
        for child_path in path_dictionary[path]:
            child_ontology_id = path_ontology_db[child_path]; child_ontology_list.append(child_ontology_id)
        child_ontology_list = unique.unique(child_ontology_list)
        nested_ontology_tree[parent_ontology_id] = child_ontology_list
    return nested_ontology_tree
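The function reads two module-level dictionaries; a hypothetical miniature of the transformation it performs:

### path_dictionary  = {(1,): [(1, 2), (1, 3)]}   ### parent path -> child paths
### path_ontology_db = {(1,): 'GO:A', (1, 2): 'GO:B', (1, 3): 'GO:C'}
### grabNestedOntologyIDs() -> {'GO:A': ['GO:B', 'GO:C']}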
Example #27
def grabNestedOntologyIDs():
    nested_ontology_tree={}
    for path in path_dictionary:
        parent_ontology_id = path_ontology_db[path]
        child_ontology_list=[]
        for child_path in path_dictionary[path]:
            child_ontology_id = path_ontology_db[child_path]; child_ontology_list.append(child_ontology_id)
        child_ontology_list = unique.unique(child_ontology_list)
        nested_ontology_tree[parent_ontology_id] = child_ontology_list
    return nested_ontology_tree
Example #28
def findAvailableOntologies(species,mod_types):
    program_type,database_dir = unique.whatProgramIsThis()
    c = GrabFiles(); c.setdirectory('/'+database_dir+'/'+species+'/gene-go'); file_dirs=[]
    for mod in mod_types:
        file_dirs+= c.searchdirectory(mod+'-')
    available_ontologies=[]
    for filedir in file_dirs:
        ontology_type = string.split(filedir,'-')[-1][:-4] ### remove the .txt extension
        available_ontologies.append(ontology_type)
    available_ontologies = unique.unique(available_ontologies)
    return available_ontologies
Example #29
def findAvailableOntologies(species,mod_types):
    program_type,database_dir = unique.whatProgramIsThis()
    c = GrabFiles(); c.setdirectory('/'+database_dir+'/'+species+'/gene-go'); file_dirs=[]
    for mod in mod_types:
        file_dirs+= c.searchdirectory(mod+'-')
    available_ontologies=[]
    for filedir in file_dirs:
        ontology_type = string.split(filedir,'-')[-1][:-4] ### remove the .txt extension
        available_ontologies.append(ontology_type)
    available_ontologies = unique.unique(available_ontologies)
    return available_ontologies
def mirandaImport(parse_sequences,force):
    """Miranda data is avaialble from two file types from different websites. The first is human-centric with multi-species
    target alignment information organized by Ensembl gene ID (http://cbio.mskcc.org/research/sander/data/miRNA2003/mammalian/index.html).
    A larger set of associations was also pulled from species specific files (http://www.microrna.org/microrna/getDownloads.do),
    where gene symbol was related to Ensembl gene. Both files provided target microRNA sequence."""
    
    ### Then download the latest annotations and sequences
    if parse_sequences == 'coordinates':
        export_object = export.ExportFile('miRanda/'+species+'/miRanda.txt')
        print "Exporting to:"+'miRanda/'+species+'/miRanda.txt'
    verify, filename = verifyExternalDownload('miRanda')
    if verify == 'no': filename = downloadFile('miRanda')
    print 'parsing', filename; count=0; null_count=[]
    fn=filepath(filename); x=1; mir_sequences=[]
    verifyFile(filename,species) ### Makes sure file is local and if not downloads.
    for line in open(fn,'rU').xreadlines():         
        data = cleanUpLine(line)
        t = string.split(data,'\t')
        if x==0: x=1
        else:
            symbol = string.upper(t[3]); mir = t[1]; entrez_gene = t[2]; mir_sequences = string.upper(t[8])
            mir_sequences = string.replace(mir_sequences,'-',''); mir_sequences = string.replace(mir_sequences,'=','')
            mir_sequences = string.replace(mir_sequences,'U','T')
            #if 'GGCTCCTGTCACCTGGGTCCGT' in mir_sequences:
            #print symbol, mir; sys.exit()
            ensembl_gene_ids = []
            if symbol in symbol_ensembl_current:
                ensembl_gene_ids = symbol_ensembl_current[symbol]
            else: ensembl_gene_ids=[]; null_count.append(symbol)

            if len(ensembl_gene_ids) > 0:
                for ensembl_geneid in ensembl_gene_ids:
                    if parse_sequences == 'yes':
                        if (mir,ensembl_geneid) in combined_results:
                            combined_results[(mir,ensembl_geneid)].append(string.upper(mir_sequences)); count+=1
                    else:
                        y = MicroRNATargetData(ensembl_geneid,'',mir,mir_sequences,'miRanda'); count+=1
                        try: microRNA_target_db[mir].append(y)
                        except KeyError: microRNA_target_db[mir] = [y]
                        if parse_sequences == 'coordinates':
                            """
                            genome_coord = string.split(t[13],':')[1:]; chr = 'chr'+ genome_coord[0]
                            strand = genome_coord[-1]; start, stop = string.split(genome_coord[1],'-')
                            """
                            genome_coord = t[13][1:-1]
                            align_score = t[15]
                            y.setScore(align_score); y.setCoordinates(genome_coord)
                            export_object.write(y.Output())
    print count, 'miRNA-target relationships added for miRanda'
    null_count = unique.unique(null_count)
    print len(null_count), 'missing symbols',null_count[:10]
    if parse_sequences == 'coordinates': export_object.close()
Example #31
def count_subj_categorical(fcsv, subj_ls, subj_hdr, cat_hdr, delim=','):
    # Given the header of a categorical column in the "fcsv" file, this function
    # counts the number of subjects for each unique value of that column
    #
    # fcsv:
    #     string, abs path of csv file containing the necessary phenotypical information
    # subj_ls:
    #     list of subject IDs. Each line corresponds to one subject
    # subj_hdr:
    #     header name of subject ID column
    # cat_hdr:
    #     header name of the categorical column
    # delim (optional):
    #     delimiter of fcsv. Default is ','
    #
    # ------ Outputs ------ #
    # uniq:
    #     list, unique values of the "cat_hdr" column
    # sgroups:
    #     list of lists. Each sub-list contains the subjects with the same value in
    #     "cat_hdr" column
    # idx:
    #     list of lists. Each sub-list contains the index of the corresponding subjects
    #     in "sgroups".
    # counts:
    #     list, the number of subjects per group

    X_keep = JWL_parse_delimited_file(fname=fcsv,
                                      keep_hdrs=None,
                                      rowfilter_hdr=subj_hdr,
                                      rowfilter_list=subj_ls)

    # get unique values of the given categorical column
    cat_col = X_keep[cat_hdr]
    cat_list = cat_col.tolist()
    uniq = unique(cat_list)

    # find which subject corresponds to which unique value
    subj_col = X_keep[subj_hdr]
    subj_list = subj_col.tolist()
    idx = []
    sgroups = []
    counts = []
    for x in uniq:
        curr_idx = [i for i in range(len(cat_list)) if cat_list[i] == x]
        idx.append(curr_idx)
        sgroups.append(np.array(subj_list)[curr_idx])
        counts.append(len(curr_idx))

    return uniq, sgroups, idx, counts
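A hypothetical call (file name, subject IDs, and headers are illustrative): group the listed subjects by a categorical 'SEX' column.

uniq, sgroups, idx, counts = count_subj_categorical(
    'pheno.csv', ['S001', 'S002', 'S003'], 'ID', 'SEX')
### uniq -> e.g. ['F', 'M']; counts -> e.g. [2, 1]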
Example #32
 def escape_inline_syntax( self, text ):
   if not self.escape_inline_syntax_regexp:
     # alternation of all inline-syntax patterns; the original's Ruby-isms
     # (a String(...).from_xs() wrapper and a mangled "(#{r})" interpolation,
     # ported as "(#" + r + ")") are replaced with a plain group per pattern
     self.escape_inline_syntax_regexp = re.compile(
       '|'.join(
         map(
           lambda r: "(" + r + ")",
           unique(self.inline_regexps)
         )
       )
     )
   return re.sub(
     self.escape_inline_syntax_regexp,
     lambda md: "`" + md.group(0) + "`",
     text
   )
Example #33
def predictSplicingEventTypes(junction1, junction2):
    if 'I' not in junction1 and '_' in junction1:
        junction1 = string.replace(
            junction1, '_',
            '')  ### allows this to be seen as an alternative splice site
    if 'I' not in junction2 and '_' in junction2:
        junction2 = string.replace(
            junction2, '_',
            '')  ### allows this to be seen as an alternative splice site
    if 'I' in junction1:
        forceError ### intentionally undefined name; raises a NameError for intron-containing junctions
    if 'I' in junction2:
        forceError ### intentionally undefined name; raises a NameError for intron-containing junctions

    j1a, j1b = string.split(junction1, '-')
    j2a, j2b = string.split(junction2, '-')
    j1a = string.split(j1a, ':')[1][1:]
    j2a = string.split(j2a, ':')[1][1:]

    j1a, r1a = string.split(j1a, '.')
    j1b, r1b = string.split(j1b[1:], '.')

    j2a, r2a = string.split(j2a, '.')
    j2b, r2b = string.split(j2b[1:], '.')

    ### convert to integers
    j1a, r1a, j1b, r1b, j2a, r2a, j2b, r2b = map(
        lambda x: int(float(x)), [j1a, r1a, j1b, r1b, j2a, r2a, j2b, r2b])

    splice_event = []
    if j1a == j2a and j1b == j2b:  ### alt-splice site
        if r1a == r2a: splice_event.append("alt-3'")
        else: splice_event.append("alt-5'")
    elif j1a == j2a: splice_event.append("cassette-exon")
    elif j1b == j2b:
        if 'E1.' in junction1: splice_event.append("altPromoter")
        else: splice_event.append("cassette-exon")
    elif 'E1.' in junction1 or 'E1.1' in junction2:
        splice_event.append("altPromoter")
    else:
        splice_event.append("cassette-exon")
    splice_event = unique.unique(splice_event)
    splice_event.sort()
    splice_event = string.join(splice_event, '|')

    return splice_event
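Two traced calls with hypothetical junction IDs in the gene:Eblock.region-Eblock.region notation assumed above:

predictSplicingEventTypes('ENSG0001:E2.1-E3.1', 'ENSG0001:E2.1-E4.1')
### shared upstream exon block (2), different downstream blocks (3 vs 4) -> 'cassette-exon'
predictSplicingEventTypes('ENSG0001:E2.1-E3.1', 'ENSG0001:E2.2-E3.1')
### same exon blocks on both ends, different upstream region (r1a != r2a) -> "alt-5'"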
Example #34
    def match(self, lines):
        if not self.block_children:
            self.block_children = re.compile(
                '^(' + '|'.join(
                    unique(
                        map(
                            lambda rule: rule.regexp.pattern,
                            filter(lambda rule: not rule.is_inline(),
                                   self.children)))) + ')', re.MULTILINE)

        if 0 == len(lines):
            return

        self.cache = [lines.popleft()]
        while len(lines) > 0 and self.block_children.search(lines[0]):
            self.cache.append(lines.popleft())

        return self.cache
Example #35
def importPhenotypeOntologyGeneAssociations():
    x=0
    pheno_symbol={}; phen=[]
    fn = filepath('BuildDBs/Pheno/HMD_HumanPhenotype.rpt')
    for line in open(fn,'rU').xreadlines():
        data = cleanUpLine(line)
        if x==0: x=1
        else:
            t = string.split(data,'\t')
            hs_symbol=t[0]; hs_entrez=t[1]; mm_symbol=t[2]; mgi=t[3]; pheno_ids=t[4]
            hs_symbol = string.lower(hs_symbol)
            mm_symbol = string.lower(mm_symbol)
            symbols = [mm_symbol,hs_symbol]
            pheno_ids = string.split(pheno_ids,' '); phen+=pheno_ids
            for pheno_id in pheno_ids:
                if len(pheno_id)>0:
                    for symbol in symbols:
                        try: pheno_symbol[pheno_id].append(symbol)
                        except Exception: pheno_symbol[pheno_id]=[symbol]
    phen = unique.unique(phen)
    pheno_symbol = gene_associations.eliminate_redundant_dict_values(pheno_symbol)
    return pheno_symbol
Example #36
def parse_affymetrix_annotations(filename):
    temp_affy_db = {}
    x = 0
    fn = filepath(filename)
    for line in open(fn, "rU").xreadlines():
        probeset_data, null = string.split(line, "\n")  # remove endline
        affy_data = string.split(probeset_data[1:-1], '","')  # remove endline
        if x == 0:
            if probeset_data[0] == "#":
                continue
            x += 1
            affy_headers = affy_data
        else:
            x += 1
            probesets = affy_data[0]
            temp_affy_db[probesets] = affy_data[1:]
    for header in affy_headers:  # note: the loop variable is unused; the while loop below rescans all headers
        x = 0
        eg = ""
        gs = ""
        while x < len(affy_headers):
            if "rocess" in affy_headers[x]:
                gb = x - 1
            if "omponent" in affy_headers[x]:
                gc = x - 1
            if "olecular" in affy_headers[x]:
                gm = x - 1
            if "athway" in affy_headers[x]:
                gp = x - 1
            if "Gene Symbol" in affy_headers[x]:
                gs = x - 1
            if "Ensembl" in affy_headers[x]:
                eg = x - 1
            x += 1
        ###Below code used if human exon array parsed
        global analyze_human_exon_data
        analyze_human_exon_data = "no"
        if eg == "":
            x = 0
            while x < len(affy_headers):
                if "mrna_assignment" in affy_headers[x]:
                    eg = x - 1
                    analyze_human_exon_data = "yes"
                x += 1
    for probeset in temp_affy_db:
        affy_data = temp_affy_db[probeset]
        try:
            go_bio = affy_data[gb]
        except IndexError:
            ###Occurs due to a new line error
            continue
        go_com = affy_data[gc]
        go_mol = affy_data[gm]
        genmapp = affy_data[gp]
        if gs == "":
            symbol = ""
        else:
            symbol = affy_data[gs]
        if analyze_human_exon_data == "no":
            ensembl = affy_data[eg]
        else:
            ensembl_data = affy_data[eg]
            ensembl = ""
            try:
                if "gene:ENSMUSG" in ensembl_data:
                    ensembl_data = string.split(ensembl_data, "gene:ENSMUSG")
                    ensembl_data = string.split(ensembl_data[1], " ")
                    ensembl = "ENSMUSG" + ensembl_data[0]
                if "gene:ENSG" in ensembl_data:
                    ensembl_data = string.split(ensembl_data, "gene:ENSG")
                    ensembl_data = string.split(ensembl_data[1], " ")
                    ensembl = "ENSG" + ensembl_data[0]
            except IndexError:
                continue
        goa = []

        goa = merge_go_annoations(go_bio, goa)
        goa = merge_go_annoations(go_com, goa)
        goa = merge_go_annoations(go_mol, goa)
        goa = merge_go_annoations(genmapp, goa)

        goa = unique.unique(goa)
        goa.sort()
        goa = string.join(goa, "")
        try:
            ensembl = string.split(ensembl, " /// ")
        except ValueError:
            ensembl = [ensembl]
        for ensembl_id in ensembl:
            if len(goa) > 10:
                go_annotations[ensembl_id] = goa, symbol
 def Ensembl(self):
     ens_list = unique.unique(self._ensembl)
     ens_str = string.join(ens_list, ',')
     return ens_str
Example #38
def identifyPutativeSpliceEvents(exon_db, constituitive_probeset_db,
                                 array_id_db, agglomerate_inclusion_probesets,
                                 onlyAnalyzeJunctions):
    exon_dbase = {}
    probeset_comparison_db = {}
    x = 0
    y = 0
    ### Grab all probesets where we can identify a potential exon inclusion/exclusion event
    if len(array_id_db) == 0:
        array_id_db = exon_db  ### Used when exporting all comparative junction data

    for probeset in array_id_db:
        if probeset in exon_db:
            affygene = exon_db[probeset].GeneID(
            )  #exon_db[probeset] = affygene,exons,ensembl,block_exon_ids,block_structure,comparison_info
            exons = exon_db[probeset].ExonID()  #get rid of last pipe
            if probeset not in constituitive_probeset_db:
                #thus, there is a 'gene' probeset for that gene, but we don't want to look at the gene probesets
                if '|' not in exons:  #get rid of any block exons or ambiguities
                    try:
                        x += 1
                        probeset_comparison_db[affygene].append(exons)
                    except KeyError:
                        x += 1
                        probeset_comparison_db[affygene] = [exons]
            exon_dbase[affygene, exons] = probeset

    print "Number of putative probeset comparisons:", x

    probe_level_db = {}
    for affygene in probeset_comparison_db:
        for exon_probeset1 in probeset_comparison_db[affygene]:
            for exon_probeset2 in probeset_comparison_db[affygene]:
                if exon_probeset1 != exon_probeset2:
                    if '-' in exon_probeset1:  #get both pair-wise possibilities with this, to grab junctions
                        e1a, e1b = string.split(exon_probeset1, '-')
                        e1 = e1a, e1b
                        try:
                            e2a, e2b = string.split(exon_probeset2, '-')
                            e2 = e2a, e2b
                        except ValueError:
                            e2 = exon_probeset2
                        try:
                            probe_level_db[affygene, e1].append(e2)
                        except KeyError:
                            probe_level_db[affygene, e1] = [e2]
                    else:  ### Required when exon_probeset1 is a single exon rather than a junction
                        if '-' in exon_probeset2:
                            e2a, e2b = string.split(exon_probeset2, '-')
                            e2 = e2a, e2b
                            e1 = exon_probeset1
                            try:
                                probe_level_db[affygene, e2].append(e1)
                            except KeyError:
                                probe_level_db[affygene, e2] = [e1]
    #print "Looking for exon events defined by probeset exon associations"
    alt_junction_db, critical_exon_db = independently_rank_analyze_junction_sets(
        probe_level_db, onlyAnalyzeJunctions)
    #print "Associations Built\n"

    ### Rearrange alt_junction_db and agglomerate data for inclusion probesets
    exon_inclusion_db = {}
    exon_inclusion_event_db = {}
    alt_junction_db_collapsed = {}
    if agglomerate_inclusion_probesets == 'yes':
        for affygene in alt_junction_db:
            alt_junction_db[affygene].sort(
            )  ### Should be no need to sort later if we do this
            for event in alt_junction_db[affygene]:
                ### event = [('ei', 'E16-E17'), ('ex', 'E16-E18')]
                event1 = event[0][0]
                exon_set1 = event[0][1]
                exon_set2 = event[1][1]
                probeset1 = exon_dbase[affygene, exon_set1]
                probeset2 = exon_dbase[affygene, exon_set2]
                if event1 == 'ei':
                    ###First generate the original fold values for export summary, then the adjusted
                    try:
                        exon_inclusion_db[probeset2].append(probeset1)
                    except KeyError:
                        exon_inclusion_db[probeset2] = [probeset1]
                    try:
                        exon_inclusion_event_db[(affygene, probeset2,
                                                 event[1])].append(event)
                    except KeyError:
                        exon_inclusion_event_db[(affygene, probeset2,
                                                 event[1])] = [event]
                else:  ### Store all the missing mutual exclusive splicing events
                    try:
                        alt_junction_db_collapsed[affygene].append(event)
                    except KeyError:
                        alt_junction_db_collapsed[affygene] = [event]

        ###Create a new alt_junction_db with merged inclusion events
        for key in exon_inclusion_event_db:
            affygene = key[0]
            excl_probeset = key[1]
            excl_event = key[2]
            ###Collect critical exon information from each inclusion exon-set to agglomerate and delete old entries
            new_critical_exon_list = []
            incl_exon_sets = []
            for event in exon_inclusion_event_db[key]:
                incl_exon_set = event[0][1]
                incl_exon_sets.append(
                    incl_exon_set
                )  ### Don't sort since this will throw off probeset relationships: incl_exon_sets.sort()
                if len(exon_inclusion_event_db[key]
                       ) > 1:  ###If the original list of events > 1
                    critical_exon_list = critical_exon_db[affygene,
                                                          tuple(event)][1]
                    for exon in critical_exon_list:
                        new_critical_exon_list.append(exon)
                    #del critical_exon_db[affygene,tuple(event)]
            new_critical_exon_list = unique.unique(new_critical_exon_list)
            new_critical_exon_list.sort()
            new_critical_exon_list = [1, new_critical_exon_list]
            incl_exon_sets_str = string.join(incl_exon_sets,
                                             '|')  ### New inclusion exon group
            event = [('ei', incl_exon_sets_str),
                     excl_event]  ### Store new inclusion exon group
            try:
                alt_junction_db_collapsed[affygene].append(event)
            except KeyError:
                alt_junction_db_collapsed[affygene] = [event]
            ###Replace exon_dbase entries with new combined probeset IDs
            incl_probesets = exon_inclusion_db[excl_probeset]
            incl_probesets_str = string.join(incl_probesets, '|')
            if len(
                    incl_exon_sets
            ) > 1:  ###Often there will be only a single inclusion probeset
                """for exons in incl_exon_sets:
                    key = affygene,exons
                    try: del exon_dbase[key] ###delete individual inclusion exons and replace with a single inclusion agglomerate
                    except KeyError: continue ###Can occur more than once, if an exon participates in more than one splicing event
                """
                exon_dbase[affygene, incl_exon_sets_str] = incl_probesets_str
                critical_exon_db[affygene,
                                 tuple(event)] = new_critical_exon_list
                ###Create a new probeset entry in exon_db for the agglomerated probesets
                new_block_exon_ids = [
                ]  #exon_db[probeset] = affygene,exons,ensembl,block_exon_ids,block_structure
                for probeset in incl_probesets:
                    edat = exon_db[probeset]
                    ensembl = edat.ExternalGeneID()
                    block_exon_ids = edat.SecondaryExonID()
                    block_structure = edat.GeneStructure()
                    new_block_exon_ids.append(block_exon_ids)
                new_block_exon_ids = string.join(new_block_exon_ids, '')
                edat = exon_db[incl_probesets[0]]
                edat1 = edat
                edat1.setDisplayExonID(
                    incl_exon_sets_str
                )  #; edat1.setExonID(edat.ExonID()) ### Use the first inclusion probeset instance for storing all instance data
                edat1.setSecondaryExonID(new_block_exon_ids)
                edat1.setProbeset(incl_probesets[0])
                exon_db[incl_probesets_str] = edat1
        print "Length of original splice event database:", len(alt_junction_db)
        print "Length of agglomerated splice event database:", len(
            alt_junction_db_collapsed)
        alt_junction_db = alt_junction_db_collapsed  ### Replace with agglomerated database
        ### End Rearrangement

    return alt_junction_db, critical_exon_db, exon_dbase, exon_inclusion_db, exon_db
Example #39
def get_true_count(data):
    count = unique(data, axis=1).shape[1]
    return count
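A quick check of the column counting above, assuming unique is numpy's (axis=1 keeps unique columns):

import numpy as np
data = np.array([[1, 1, 2],
                 [3, 3, 4]])
np.unique(data, axis=1).shape[1] ### -> 2 unique columns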
Example #40
 def EnsExons(self):
     exonsets_unique = unique.unique(self.exon_sets)
     return string.join(exonsets_unique,'|')
def TargetScanImport(parse_sequences,force):
    """The TargetScan data is currently extracted from a cross-species conserved family file. This file only contains
    gene symbol, microRNA name and 3'UTR seed locations."""
    if species == 'Mm': tax = '10090'; prefix = 'mmu-'
    elif species == 'Hs': tax = '9606'; prefix = 'hsa-'
    elif species == 'Rn': tax = '10116'; prefix = 'rno-'
    else: prefix = 'hsa-'

    import AltAnalyze
    ###Get taxid annotations from the GO-Elite config
    species_annot_db=AltAnalyze.importGOEliteSpeciesInfo(); tax_db={}
    for species_full in species_annot_db:
        if species==species_annot_db[species_full].SpeciesCode():
            tax = species_annot_db[species_full].TaxID()
            
    global l
    
    ### See if the files are already there
    verifyTSG, target_scan_target_file = verifyExternalDownload('TargetScanGenes')
    verifyTSS, target_scan_sequence_file = verifyExternalDownload('TargetScanSequences')

    if verifyTSG == 'no' or verifyTSS == 'no': ### used to be - if force == 'yes'
        if parse_sequences == 'no':
            ### Then download the latest annotations and sequences
            target_scan_target_file = downloadFile('TargetScanGenes')
            target_scan_sequence_file = downloadFile('TargetScanSequences')

    ### Cross-species TargetScan file with UTR sequences for all genes with reported targets in the conserved family file
    ### Although this file includes valid sequence data that appears to match up to the target file, the target file
    ### appears to only list the seed sequence location (UTR start and stop) and not the full binding sequence and thus
    ### is not amenable to probe set alignment.
    print 'parsing', target_scan_sequence_file
    fn=filepath(target_scan_sequence_file); x=0; target_scan_gene_utr_seq={}
    for line in open(fn,'rU').xreadlines():         
        data = cleanUpLine(line)
        t = string.split(data,'\t')
        if x==0: x=1
        else:
            symbol = string.upper(t[2]); tax_id = t[3]; utr_seq = t[4]
            if tax_id == tax:
                utr_seq_no_gaps = string.replace(utr_seq,'-','')
                utr_seq_no_gaps = string.replace(utr_seq_no_gaps,'U','T')
                if symbol in symbol_ensembl_current and len(utr_seq_no_gaps)>0:
                    target_scan_gene_utr_seq[symbol] = utr_seq_no_gaps
    print 'UTR sequence for',len(target_scan_gene_utr_seq),'TargetScan genes stored in memory.'
        
    mir_sequences = []; count=0
    print 'parsing', target_scan_target_file
    #verifyFile(target_scan_target_file,species) ### Makes sure file is local and if not downloads.
    fn=filepath(target_scan_target_file); x=0; k=[]; l=[]
    for line in open(fn,'rU').xreadlines():         
        data = cleanUpLine(line)
        t = string.split(data,'\t')
        if x==0:
            x=1
            data = string.lower(data)
            t = string.split(data,'\t')
            i=0
            for value in t:
                if 'mir' in value: m = i
                elif 'gene id' in value: g = i
                elif 'gene symbol' in value: s = i
                elif 'transcript' in value: r = i
                elif 'species id' in value: txi = i
                elif 'utr start' in value: us = i
                elif 'utr end' in value: ue = i
                i+=1
        else:
            mir = t[m]; geneid = t[g]; gene_symbol = string.upper(t[s]); taxid = t[txi]; utr_start = int(t[us]); utr_end  = int(t[ue])
            ### Old format
            #mir = t[0]; gene_symbol = string.upper(t[1]); taxid = t[2]; utr_start = t[3]; utr_end = t[4]
            if '/' in mir:
                mir_list=[]
                mirs = string.split(mir,'/')
                for mirid in mirs[1:]:
                    mirid = 'miR-'+mirid
                    mir_list.append(mirid)
                mir_list.append(mirs[0])
            else: mir_list = [mir]

            if taxid == tax: ### matches the taxid of the current species
                #target_scan_gene_utr_seq[symbol] = utr_seq_no_gaps
                if gene_symbol in symbol_ensembl_current: ensembl_geneids = symbol_ensembl_current[gene_symbol]; proceed = 'yes'; k.append(gene_symbol)
                else: proceed = 'no'; l.append(gene_symbol)
                if gene_symbol in target_scan_gene_utr_seq:
                    ### TargetScan provides the core seed match, while processed miRNAs are typically 22nt - extending the window seems to approximate other databases better
                    adj_start = utr_start-15
                    if adj_start < 0: adj_start=0
                    mir_sequences = target_scan_gene_utr_seq[gene_symbol][adj_start:utr_end+1]
                    #if string.lower(gene_symbol) == 'tns3' and mir == 'miR-182': print mir,gene_symbol,taxid,utr_start,utr_end,mir_sequences
                else: mir_sequences=[]
                ###Already multiple geneids associated with each symbol so don't need to worry about redundancy
                if proceed == 'yes':
                    for ensembl_geneid in ensembl_geneids:
                        for mir in mir_list:
                            #if ensembl_geneid == 'ENSG00000137815' and mir == 'miR-214': print mir,gene_symbol,taxid,utr_start,utr_end,mir_sequences,target_scan_gene_utr_seq[gene_symbol];sys.exit()
                            if parse_sequences == 'yes':
                                if (prefix+mir,ensembl_geneid) in combined_results:
                                    combined_results[(prefix+mir,ensembl_geneid)].append(mir_sequences); count+=1
                            else:
                                #if ensembl_geneid == 'ENSMUSG00000029467': print mir
                                y = MicroRNATargetData(ensembl_geneid,gene_symbol,mir_sequences,prefix+mir,'TargetScan')
                                count+=1
                                try: microRNA_target_db[prefix+mir].append(y)
                                except KeyError: microRNA_target_db[prefix+mir] = [y]
    k = unique.unique(k); l = unique.unique(l)
    print 'ensembls-found:',len(k),', not found:',len(l)
    print l[:10]
    print count, 'miRNA-target relationships added for TargetScan'
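
### Illustrative sketch (added; the sequence and coordinates are hypothetical).
### TargetScan reports only the seed-match coordinates, so the importer above
### widens the window 15 nt upstream (clipped at 0) to approximate a full
### ~22 nt miRNA binding site.
utr_seq = 'A' * 40 + 'GCACTTTG' + 'T' * 40  # seed match at UTR positions 41-48
utr_start, utr_end = 41, 48
adj_start = utr_start - 15
if adj_start < 0: adj_start = 0
print(len(utr_seq[adj_start:utr_end + 1]))  # 23 nt window around the seed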
Example #42
def parse_affymetrix_annotations(filename):
    temp_affy_db = {}
    x = 0
    fn = filepath(filename)
    for line in open(fn, 'rU').xreadlines():
        probeset_data, null = string.split(line, '\n')  #remove endline
        affy_data = string.split(probeset_data[1:-1], '","')  #strip the flanking quotes and split the quoted fields
        if x == 0:
            if probeset_data[0] == '#':
                continue
            x += 1
            affy_headers = affy_data
        else:
            x += 1
            probesets = affy_data[0]
            temp_affy_db[probesets] = affy_data[1:]
    for header in affy_headers:
        x = 0
        eg = ''
        gs = ''
        while x < len(affy_headers):
            if 'rocess' in affy_headers[x]: gb = x - 1
            if 'omponent' in affy_headers[x]: gc = x - 1
            if 'olecular' in affy_headers[x]: gm = x - 1
            if 'athway' in affy_headers[x]: gp = x - 1
            if 'Gene Symbol' in affy_headers[x]: gs = x - 1
            if 'Ensembl' in affy_headers[x]: eg = x - 1
            x += 1
        ###Below code used if human exon array parsed
        global analyze_human_exon_data
        analyze_human_exon_data = 'no'
        if eg == '':
            x = 0
            while x < len(affy_headers):
                if 'mrna_assignment' in affy_headers[x]:
                    eg = x - 1
                    analyze_human_exon_data = 'yes'
                x += 1
    for probeset in temp_affy_db:
        affy_data = temp_affy_db[probeset]
        try:
            go_bio = affy_data[gb]
        except IndexError:
            ###Occurs due to a new line error
            continue
        go_com = affy_data[gc]
        go_mol = affy_data[gm]
        genmapp = affy_data[gp]
        if gs == '': symbol = ''
        else: symbol = affy_data[gs]
        if analyze_human_exon_data == 'no':
            ensembl = affy_data[eg]
        else:
            ensembl_data = affy_data[eg]
            ensembl = ''
            try:
                if 'gene:ENSMUSG' in ensembl_data:
                    ensembl_data = string.split(ensembl_data, 'gene:ENSMUSG')
                    ensembl_data = string.split(ensembl_data[1], ' ')
                    ensembl = 'ENSMUSG' + ensembl_data[0]
                if 'gene:ENSG' in ensembl_data:
                    ensembl_data = string.split(ensembl_data, 'gene:ENSG')
                    ensembl_data = string.split(ensembl_data[1], ' ')
                    ensembl = 'ENSG' + ensembl_data[0]
            except IndexError:
                continue
        goa = []

        goa = merge_go_annoations(go_bio, goa)
        goa = merge_go_annoations(go_com, goa)
        goa = merge_go_annoations(go_mol, goa)
        goa = merge_go_annoations(genmapp, goa)

        goa = unique.unique(goa)
        goa.sort()
        goa = string.join(goa, '')
        try:
            ensembl = string.split(ensembl, ' /// ')
        except ValueError:
            ensembl = [ensembl]
        for ensembl_id in ensembl:
            if len(goa) > 10:
                go_annotations[ensembl_id] = goa, symbol
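
### Parsing sketch (added; the annotation row is hypothetical). Affymetrix CSV
### rows are fully quoted, so stripping the outer quotes and splitting on '","'
### recovers the fields without a real CSV parser, as done above.
line = '"10001_at","Tns3","GO:0005515 // protein binding"\n'
probeset_data, null = line.split('\n')          # remove endline
affy_data = probeset_data[1:-1].split('","')    # strip outer quotes, split fields
print(affy_data)  # ['10001_at', 'Tns3', 'GO:0005515 // protein binding']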
def importAndReformatEnsemblJunctionAnnotations(species, array_type,
                                                nonconstitutive_junctions):
    filename = 'AltDatabase/' + species + '/' + array_type + '/' + species + '_Ensembl_' + array_type + '_probesets.txt'
    export_filepath = 'AltDatabase/' + species + '/' + array_type + '/' + species + '_Ensembl_probesets.txt'
    efn = filepath(export_filepath)
    export_data = open(efn, 'w')

    fn = filepath(filename)
    x = 0
    ensembl_exon_db = {}
    left = {}
    right = {}
    exon_gene_db = {}
    nonjunction_aligning = {}
    for line in open(fn, 'rU').xreadlines():
        data = cleanUpLine(line)
        if x == 0:
            x = 1
            export_data.write(data + '\n')
        else:
            t = string.split(data, '\t')
            probeset, exon_id, ensembl_gene_id, transcript_cluster_id, chr, strand, probeset_start, probeset_stop, affy_class, constitutitive_probeset, ens_exon_ids, exon_annotations, regionid, r_start, r_stop, splice_event, splice_junctions = t
            if len(regionid) < 1:
                regionid = exon_id
                t[12] = exon_id
            if chr == 'chrM':
                chr = 'chrMT'  ### MT is the Ensembl convention whereas M is the Affymetrix and UCSC convention
            if chr == 'M':
                chr = 'MT'  ### MT is the Ensembl convention whereas M is the Affymetrix and UCSC convention
            tc, probeset = string.split(probeset, ':')
            regionid = string.replace(regionid, '-', '.')
            original_region_id = regionid
            r_starts = string.split(r_start, '|')
            r_stops = string.split(r_stop, '|')
            ed = EnsemblImport.ExonStructureData(ensembl_gene_id, chr, strand,
                                                 probeset_start, probeset_stop,
                                                 constitutitive_probeset,
                                                 ens_exon_ids, [])
            ed.reSetExonID(regionid)
            if '|5' in probeset:
                left[probeset[:-2]] = ed, t
                if strand == '+':  ### If the junction probesets DO NOT align to the region coordinates, then the probeset maps to a junction outside the database
                    if probeset_stop not in r_stops:
                        nonjunction_aligning[probeset[:-2]] = original_region_id + '_' + probeset_stop, 'left'
                elif probeset_start not in r_starts:
                    nonjunction_aligning[probeset[:-2]] = original_region_id + '_' + probeset_start, 'left'
            elif '|3' in probeset:
                right[probeset[:-2]] = ed, t
                if strand == '+':
                    if probeset_start not in r_starts:
                        nonjunction_aligning[probeset[:-2]] = original_region_id + '_' + probeset_start, 'right'
                elif probeset_stop not in r_stops:
                    nonjunction_aligning[probeset[:-2]] = original_region_id + '_' + probeset_stop, 'right'
            else:
                t[0] = probeset
                ensembl_exon_db[probeset] = ed
                export_data.write(string.join(t, '\t') + '\n')
                regionids = string.split(regionid, '|')
                for regionid in regionids:
                    exon_gene_db[ensembl_gene_id, regionid] = probeset

    for probeset in left:
        if probeset in right:
            l, pl = left[probeset]
            r, pr = right[probeset]
            if l.Constitutive() != r.Constitutive():
                l.setConstitutive('no')  ### used to determine if a junction is alternative or constitutive
            if probeset in nonconstitutive_junctions: l.setConstitutive('no')
            l.setJunctionCoordinates(l.ExonStart(), l.ExonStop(),
                                     r.ExonStart(), r.ExonStop())
            ens_exon_idsl = pl[10]
            ens_exon_idsr = pr[10]
            exon_idl = pl[1]
            exon_idr = pr[1]
            regionidl = pl[12]
            regionidr = pr[12]
            splice_junctionsl = pl[-1]
            splice_junctionsr = pr[-1]
            exon_idl = string.replace(exon_idl, '-', '.')
            exon_idr = string.replace(exon_idr, '-', '.')
            regionidl_block = string.split(regionidl, '-')[0]
            regionidr_block = string.split(regionidr, '-')[0]

            if regionidl_block != regionidr_block:  ### Otherwise, the junction is probing a single exon block and thus is not informative
                regionidl = string.replace(regionidl, '-', '.')
                regionidr = string.replace(regionidr, '-', '.')
                exon_id = exon_idl + '-' + exon_idr
                regionid = regionidl + '-' + regionidr

                if probeset in nonjunction_aligning:
                    new_region_id, side = nonjunction_aligning[probeset]
                    regionid = renameJunction(regionid, side, new_region_id)

                l.reSetExonID(regionid)
                ensembl_exon_db[probeset] = l

                splice_junctionsl += splice_junctionsr
                ens_exon_idsl = string.split(ens_exon_idsl, '|')
                ens_exon_idsr = string.split(ens_exon_idsr, '|')
                ens_exon_ids = string.join(
                    unique.unique(ens_exon_idsl + ens_exon_idsr), '|')
                pl[10] = ens_exon_ids
                pl[12] = regionid
                pl[1] = exon_id
                pl[-1] = splice_junctionsl
                pl[13] = l.ExonStart() + '|' + l.ExonStop()
                pl[14] = r.ExonStart() + '|' + r.ExonStop()
                strand = pl[5]
                if strand == '+':
                    pl[6] = l.ExonStop()
                    pl[7] = r.ExonStart()  ### junction splice-sites
                else:
                    pl[6] = l.ExonStart()
                    pl[7] = r.ExonStop()  ### junction splice-sites

                pl[0] = probeset
                pl[9] = l.Constitutive()

                pl = string.join(pl, '\t') + '\n'
                export_data.write(pl)

    export_data.close()
    return ensembl_exon_db, exon_gene_db
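
### Naming sketch (added; the probeset ID is hypothetical). Junction probesets
### arrive as paired 5' and 3' halves sharing a base ID; stripping the two-character
### '|5'/'|3' suffix recovers the shared key used for the left/right pairing above.
probeset = 'JUC0400123|5'
print(probeset[:-2])  # JUC0400123 -- matches its '|3' partner's key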
Example #44
def importHopachOutput(filename):
    """ Import the ID order information """
    db = {}  ### Used to store the cluster data
    hopach_clusters = []
    cluster_level = []
    cluster_level2 = []
    hopach_db = {}
    cluster_db = {}
    level2_level1 = {}
    firstLine = True
    fn = filepath(filename)
    for line in open(fn, "rU").xreadlines():
        data = cleanUpLine(line)
        if firstLine:
            firstLine = False
        else:
            index, uid, cluster_number, cluster_label, cluster_level_order, final_label, final_level_order = string.split(
                data, "\t"
            )
            try:
                l2 = str(int(round(float(cluster_label), 0)))[:2]
            except Exception:
                l2 = int(cluster_label[0])
            hopach_clusters.append(
                (int(final_level_order), int(index) - 1)
            )  ### Need to order according to the original index, sorted by the clustered order
            cluster_level.append(int(cluster_label[0]))  ### This is the root cluster number
            cluster_level2.append(l2)  ### Additional cluster levels
            hopach_db[uid] = cluster_label
            level2_level1[l2] = int(cluster_label[0])
            try:
                cluster_db[int(float(cluster_label[0]))].append(uid)
            except Exception:
                cluster_db[int(cluster_label[0])] = [uid]
            try:
                cluster_db[l2].append(uid)
            except Exception:
                cluster_db[l2] = [uid]

    split_cluster = []
    for cluster in cluster_db:
        # print cluster,len(cluster_db[cluster]),(float(len(cluster_db[cluster]))/len(hopach_db))
        if len(cluster_db[cluster]) > 100 and (float(len(cluster_db[cluster])) / len(hopach_db)) > 0.3:
            # print cluster
            if cluster < 10:
                split_cluster.append(cluster)
    import unique

    levels1 = unique.unique(cluster_level)
    if len(split_cluster) > 0:
        print "Splitting large hopach clusters:", split_cluster
        i = 0
        for l2 in cluster_level2:
            l1 = level2_level1[l2]
            if l1 in split_cluster:
                cluster_level[i] = l2
            i += 1

    else:
        if len(cluster_level) > 50:  ### Decide to use different hopach levels
            if len(levels1) < 3:
                cluster_level = cluster_level2
        if len(cluster_level) > 200:
            if len(levels1) < 4:
                cluster_level = cluster_level2

    hopach_clusters.sort()
    hopach_clusters = map(lambda x: x[1], hopach_clusters)  ### Store the original file indexes in order based on the final cluster order
    db["leaves"] = hopach_clusters  ### This mimics Scipy's cluster output data structure
    db["level"] = cluster_level
    return db
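
### Consumer sketch (added; the file path is hypothetical). Downstream code can
### treat the returned dictionary like SciPy cluster output: 'leaves' holds the
### original row indexes in clustered order, 'level' one flat cluster label per row.
db = importHopachOutput('hopach/row_clusters.txt')
print(len(db['leaves']) == len(db['level']))  # True -- one label per reordered leaf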
def getJunctionComparisonsFromExport(species,array_type):
    type = 'standard'
    gene_junction_db = importEnsemblUCSCAltJunctions(species,type)
    
    ### Retrieve probesets with exon-junctions associated - these are critical exons
    filename = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_Ensembl_'+array_type+'_probesets.txt'
    gene_probeset_db = ExonArrayEnsemblRules.reimportEnsemblProbesetsForSeqExtraction(filename,'junctions',{})
    left={}; right={}; gene_db={}; gene_exon_db={}; nonjunction_aligning={}
    for gene in gene_probeset_db:
        for (probe_data,ed) in gene_probeset_db[gene]:
            probeset, strand, probeset_start, probeset_stop = probe_data
            region_id = string.replace(ed.RegionNumber(),'-','.')
            original_region_id = region_id
            region_ids = string.split(region_id,'|')
            gene_db[probeset[:-2]]=gene
            #ed.AssociatedSplicingJunctions()
            r_starts=string.split(ed.ExonStart(),'|'); r_stops=string.split(ed.ExonStop(),'|')
            for region_id in region_ids:
                if '|5' in probeset:
                    try: left[probeset[:-2]].append(region_id)
                    except Exception: left[probeset[:-2]]=[region_id]
                    if strand == '+': ### If the junction probesets DO NOT align to the region coordinates, then the probeset maps to a junction outside the database
                        if probeset_stop not in r_stops: nonjunction_aligning[probeset[:-2]] = original_region_id+'_'+probeset_stop,'left'
                    elif probeset_start not in r_starts: nonjunction_aligning[probeset[:-2]] = original_region_id+'_'+probeset_start,'left'
                elif '|3' in probeset:
                    try: right[probeset[:-2]].append(region_id)
                    except Exception: right[probeset[:-2]]=[region_id]
                    if strand == '+':
                        if probeset_start not in r_starts: nonjunction_aligning[probeset[:-2]] = original_region_id+'_'+probeset_start,'right'
                    elif probeset_stop not in r_stops: nonjunction_aligning[probeset[:-2]] = original_region_id+'_'+probeset_stop,'right'
                else:
                    if '_' in region_id: print killer  ### deliberate NameError to halt on malformed region IDs
                    try: gene_exon_db[gene,region_id].append(probeset)
                    except Exception: gene_exon_db[gene,region_id] = [probeset]

    print 'len(nonjunction_aligning)',len(nonjunction_aligning)
    gene_exon_db = eliminateRedundant(gene_exon_db)            
    junction_db={} ### Get the exon-region IDs for an exon-junction
    for probeset in left:
        gene = gene_db[probeset]
        if probeset in right:
            for region1 in left[probeset]:
                for region2 in right[probeset]:
                    junction = region1+'-'+region2
                    try: junction_db[gene,junction].append(probeset)
                    except Exception: junction_db[gene,junction] = [probeset]

    probeset_junction_export = 'AltDatabase/' + species + '/'+array_type+'/'+ species + '_junction_comps.txt'
    
    fn=filepath(probeset_junction_export); data = open(fn,'w')
    print "Exporting",probeset_junction_export
    title = 'gene'+'\t'+'critical_exon'+'\t'+'exclusion_junction_region'+'\t'+'inclusion_junction_region'+'\t'+'exclusion_probeset'+'\t'+'inclusion_probeset'+'\t'+'data_source'+'\n'
    data.write(title); temp_list=[]
    
    for (gene,critical_exon,incl_junction,excl_junction) in gene_junction_db:
        if (gene,incl_junction) in junction_db:
            incl_junction_probesets = junction_db[gene,incl_junction]
            if (gene,excl_junction) in junction_db:
                excl_junction_probesets = junction_db[gene,excl_junction]
                for incl_junction_probeset in incl_junction_probesets:
                    for excl_junction_probeset in excl_junction_probesets:
                        try:
                            for incl_exon_probeset in gene_exon_db[gene,critical_exon]:
                                if incl_junction_probeset in nonjunction_aligning or excl_junction_probeset in nonjunction_aligning: null=[]
                                else: ### Ensure the probeset DOES map to the annotated junctions
                                    temp_list.append(string.join([gene,critical_exon,excl_junction,critical_exon,excl_junction_probeset,incl_exon_probeset,'AltAnalyze'],'\t')+'\n')
                        except Exception: null=[]
                        if incl_junction_probeset in nonjunction_aligning:
                            new_region_id, side = nonjunction_aligning[incl_junction_probeset]
                            incl_junction = renameJunction(incl_junction,side,new_region_id)
                        if excl_junction_probeset in nonjunction_aligning:
                            new_region_id, side = nonjunction_aligning[excl_junction_probeset]
                            excl_junction = renameJunction(excl_junction,side,new_region_id)
                        if excl_junction_probeset!=incl_junction_probeset:
                            temp_list.append(string.join([gene,critical_exon,excl_junction,incl_junction,excl_junction_probeset,incl_junction_probeset,'AltAnalyze'],'\t')+'\n')
    temp_list = unique.unique(temp_list)
    for i in temp_list: data.write(i)
    data.close()
    print 'Number of compared junctions exported', len(temp_list)
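
### Junction-naming sketch (added; region IDs are hypothetical). A junction ID is
### the 5' exon-region joined to the 3' exon-region with a dash, so regions 'E2.1'
### and 'E4.1' from the two halves of one probeset become the junction below.
region1, region2 = 'E2.1', 'E4.1'
print(region1 + '-' + region2)  # E2.1-E4.1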
    def Ensembl(self):  ### class method excerpted from its class
        ens_list = unique.unique(self._ensembl)
        ens_str = string.join(ens_list,',')
        return ens_str
Example #47
def associateQueryGenesWithInteractions(query_db,query_interactions,dir_file):
    suffix=''
    if dir_file!=None:
        if len(dir_file)!=0:
            suffix='-'+intNameShort+'_'+export.findFilename(dir_file)[:-4]
    if len(suffix)==0:
        try: suffix = '_'+FileName
        except Exception: None
    file_name = 'AltAnalyze-network'+suffix
    
    query_interactions_unique={}
    interacting_genes={}
    connections = 1
    primary=0
    secondary=0
    terciary=0
    for ensemblGene in query_db:
        if ensemblGene in interaction_db:
            for interacting_ensembl in interaction_db[ensemblGene]:
                if interacting_ensembl not in blackList:
                    ###Only allow direct interactions found in query
                    if interacting_ensembl in query_db:
                        try: query_interactions[ensemblGene].append(interacting_ensembl)
                        except KeyError: query_interactions[ensemblGene] = [interacting_ensembl]
                        try: query_interactions[interacting_ensembl].append(ensemblGene)
                        except KeyError: query_interactions[interacting_ensembl] = [ensemblGene]
                        primary+=1
                    if degrees == 2 or degrees == 'indirect':
                        try: interacting_genes[interacting_ensembl].append(ensemblGene)
                        except KeyError: interacting_genes[interacting_ensembl] = [ensemblGene]
                    elif degrees == 'allInteracting' or degrees == 'all possible':
                        try: query_interactions[ensemblGene].append(interacting_ensembl)
                        except KeyError: query_interactions[ensemblGene] = [interacting_ensembl]
                    if interacting_ensembl in secondaryQueryIDs: ### IDs in the expression file
                        secondary+=1 ### When indirect degrees selected, no additional power added by this (only for direct or shortest path)
                        try: query_interactions[ensemblGene].append(interacting_ensembl)
                        except KeyError: query_interactions[ensemblGene] = [interacting_ensembl]    
        if ensemblGene in second_degree_obligatory:
            for interacting_ensembl in second_degree_obligatory[ensemblGene]:
                try: interacting_genes[interacting_ensembl].append(ensemblGene)
                except KeyError: interacting_genes[interacting_ensembl] = [ensemblGene]

    ### Include indirect interactions to secondaryQueryIDs from the expression file
    if degrees == 2 or degrees == 'indirect':
        for ensemblGene in secondaryQueryIDs:
            if ensemblGene in interaction_db:
                for interacting_ensembl in interaction_db[ensemblGene]:
                    if interacting_ensembl not in blackList:
                        try:
                            interacting_genes[interacting_ensembl].append(ensemblGene)
                            terciary+=1#; print interacting_ensembl
                        except KeyError: None ### Only increase the interacting_genes count if the interacting partner is present from the primary query list
    #print primary,secondary,terciary
    
    ### Report the number of unique interacting genes
    for interacting_ensembl in interacting_genes:
        if len(interacting_genes[interacting_ensembl])==1:
            interacting_genes[interacting_ensembl] = 1
        else:
            unique_interactions = unique.unique(interacting_genes[interacting_ensembl])
            interacting_genes[interacting_ensembl] = len(unique_interactions)
    
    query_indirect_interactions={}; indirect_interacting_gene_list=[]; interacting_gene_list=[]; added=[] 
    if degrees=='shortestPath' or degrees=='shortest path': ### Typically identifying the single smallest path(s) between two nodes.
        query_indirect_interactions, indirect_interacting_gene_list, interacting_gene_list = evaluateShortestPath(query_db,interaction_db,10)
        
    else:
        if degrees==2 or degrees=='indirect' or len(secondDegreeObligatoryCategories)>0:
            for ensembl in interacting_genes:
                if interacting_genes[ensembl] > connections:
                    if ensembl in interaction_db: ### Only nodes removed due to promiscuity will not be found
                        for interacting_ensembl in interaction_db[ensembl]:
                            if interacting_ensembl in query_db or interacting_ensembl in secondaryQueryIDs:
                                try: query_indirect_interactions[interacting_ensembl].append(ensembl)
                                except KeyError: query_indirect_interactions[interacting_ensembl] = [ensembl]
                        ###Record the highest linked nodes
                        indirect_interacting_gene_list.append((interacting_genes[ensembl],ensembl)) 
        if len(obligatory_interactions)>0: ### Include always
            all_reported_genes = combineDBs(query_interactions,query_indirect_interactions) ### combinesDBs and returns a unique list of genes
            for ensemblGene in all_reported_genes: ###This only includes genes in the original input list
                if ensemblGene in obligatory_interactions:
                    for interacting_ensembl in obligatory_interactions[ensemblGene]:
                        #symbol = ensembl_symbol_db[ensemblGene]                    
                        try: query_interactions[ensemblGene].append(interacting_ensembl)
                        except KeyError: query_interactions[ensemblGene] = [interacting_ensembl]
    
    z = dict(query_interactions.items() + query_indirect_interactions.items())
    interaction_restricted_db={}
    for ensembl in z:
        interacting_nodes = z[ensembl]
        for node in interacting_nodes:
            if ensembl in interaction_restricted_db:
                db = interaction_restricted_db[ensembl]
                db[node] = 1
            else: interaction_restricted_db[ensembl] = {node:1}

            if node in interaction_restricted_db:
                db = interaction_restricted_db[node]
                db[ensembl] = 1
            else: interaction_restricted_db[node] = {ensembl:1}
            
    if degrees==2 or degrees=='indirect': ### get rid of non-specific interactions
        query_indirect_interactions, indirect_interacting_gene_list, interacting_gene_list = evaluateShortestPath(query_db,interaction_restricted_db,4)
        
    ###Record the highest linked nodes
    for ensembl in query_interactions:
        linked_nodes = len(unique.unique(query_interactions[ensembl]))
        interacting_gene_list.append((linked_nodes,ensembl))
    interacting_gene_list.sort(); interacting_gene_list.reverse()
    indirect_interacting_gene_list.sort();  indirect_interacting_gene_list.reverse()
    
    print "Length of query_interactions:",len(query_interactions)
    query_interactions_unique=[]
    for gene1 in query_interactions:
        for gene2 in query_interactions[gene1]:
            temp = []; temp.append(gene2); temp.append(gene1)#; temp.sort()
            if gene1 == gene2: interaction_type = 'self'
            else: interaction_type = 'distinct'
            temp.append(interaction_type); temp.reverse()
            query_interactions_unique.append(temp)
    for gene1 in query_indirect_interactions:
        for gene2 in query_indirect_interactions[gene1]:
            temp = []; temp.append(gene2); temp.append(gene1)#; temp.sort()
            if gene1 == gene2: interaction_type = 'self'
            else: interaction_type = 'indirect'
            temp.append(interaction_type); temp.reverse()
            query_interactions_unique.append(temp)
    query_interactions_unique = unique.unique(query_interactions_unique)
    query_interactions_unique.sort()
    

    ###Write out nodes linked to many other nodes
    new_file = outputDir+'/networks/'+file_name+ '-interactions_'+str(degrees)+'_degrees_summary.txt'
    data = export.ExportFile(new_file)
    for (linked_nodes,ensembl) in interacting_gene_list:
        try: symbol = query_db[ensembl]
        except KeyError: symbol = ensembl_symbol_db[ensembl]
        data.write(str(linked_nodes)+'\t'+ensembl+'\t'+symbol+'\t'+'direct'+'\n')
    for (linked_nodes,ensembl) in indirect_interacting_gene_list:
        try: symbol = query_db[ensembl]
        except KeyError:
            try: symbol = ensembl_symbol_db[ensembl]
            except KeyError: symbol = ensembl
            if 'HMDB' in symbol:
                try: symbol = hmdb_symbol_db[ensembl]
                except Exception: pass
        data.write(str(linked_nodes)+'\t'+ensembl+'\t'+symbol+'\t'+'indirect'+'\n')
    data.close()

    regulated_gene_db = query_db    
    sif_export,symbol_pair_unique = exportInteractionData(file_name,query_interactions_unique,regulated_gene_db)
    return sif_export,symbol_pair_unique
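
### Equivalent idiom (added for illustration; the IDs are hypothetical). The
### repeated try/except KeyError append pattern above is exactly what
### collections.defaultdict(list) provides in one step.
import collections
interactions = collections.defaultdict(list)
interactions['ENSG00000137815'].append('ENSG00000268612')
print(interactions['ENSG00000137815'])  # ['ENSG00000268612']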
Example #49
def processBarcodes(viral_barcode_file, cell_cluster_file, reference_48mers):
    eo = export.ExportFile(viral_barcode_file[:-4] + '-cleaned.txt')
    parent = export.findParentDir(viral_barcode_file)
    eom = export.ExportFile(parent + '/MultiLin-cells.txt')
    ### Import a file with the sample names in the groups file in the correct order
    viral_barcodes = {}
    repair = {}
    short = {}
    cluster_header = []

    cell_clusters = {}
    for line in open(cell_cluster_file, 'rU').xreadlines():
        data = cleanUpLine(line)
        cell, cluster, cluster_name = string.split(data, '\t')
        cell_clusters[cell] = cluster_name
        if cluster_name not in cluster_header:
            cluster_header.append(cluster_name)

    cells_with_virus = {}
    for line in open(viral_barcode_file, 'rU').xreadlines():
        data = cleanUpLine(line)
        cellular, viral = string.split(data, '\t')
        if cellular in cell_clusters:
            try:
                if viral not in cells_with_virus[cellular]:
                    cells_with_virus[cellular].append(viral)
            except Exception:
                cells_with_virus[cellular] = [viral]
            if len(viral) < 48:
                #if len(viral)<38:
                if viral not in repair:
                    repair[viral] = [cellular]
                else:
                    if cellular not in repair[viral]:
                        repair[viral].append(cellular)
            else:
                #short[viral[:35]]=viral
                try:
                    if cellular not in viral_barcodes[viral]:
                        viral_barcodes[viral].append(cellular)
                except Exception:
                    viral_barcodes[viral] = [cellular]

    ### Repair the short sequences
    for viral_short in repair:
        cellular_barcodes = repair[viral_short]
        if viral_short[:35] in short:
            viral = short[viral_short[:35]]
            for cellular in cellular_barcodes:
                try:
                    if cellular not in viral_barcodes[viral]:
                        viral_barcodes[viral].append(cellular)
                except Exception:
                    viral_barcodes[viral] = [cellular]
    print len(viral_barcodes), 'unique viral barcodes present'

    #print cells_with_virus['ACGCCGATCTGTTGAG']
    #print cells_with_virus['CAGAATCCAAACTGCT']
    #sys.exit()

    if reference_48mers != None:
        valid_barcodes = 0
        for viral in viral_barcodes:
            if viral in reference_48mers:
                valid_barcodes += 1
        print valid_barcodes, 'unique valid viral barcodes present'

    #"""
    ### If the viral barcodes have frequent errors - associate the error with the reference in a cell-specific manner
    ### Only one virus per cell should be present unless it is a doublet
    print len(cells_with_virus), 'cells with viral barcodes'
    doublet_cell = {}
    mismatch_to_match = {}
    cells_with_valid_barcodes = 0
    viral_barcodes_overide = {}
    cellular_barcodes_overide = {}
    for cellular in cells_with_virus:
        cell_5prime = {}
        cell_3prime = {}
        ref_sequences = []
        if len(cells_with_virus[cellular]) > 1:
            for i in cells_with_virus[cellular]:
                try:
                    cell_5prime[i[:10]].append(i)
                except Exception:
                    cell_5prime[i[:10]] = [i]
                try:
                    cell_3prime[i[-10:]].append(i)
                except Exception:
                    cell_3prime[i[-10:]] = [i]
                if reference_48mers == None:
                    ref_sequences.append(i)
                elif i in reference_48mers:
                    ref_sequences.append(i)
            if len(ref_sequences) > 0:
                cells_with_valid_barcodes += 1  ### Determine how many cells have valid viral barcodes
            cell_5prime_ls = []
            cell_3prime_ls = []
            for i in cell_5prime:
                cell_5prime_ls.append([len(cell_5prime[i]), i])
            for i in cell_3prime:
                cell_3prime_ls.append([len(cell_3prime[i]), i])
            cell_5prime_ls.sort()
            cell_3prime_ls.sort()

            for seq in ref_sequences:
                if cell_5prime_ls[-1][1] in seq and cell_3prime_ls[-1][1] in seq:
                    ref_seq = seq
            try:
                viral_barcodes_overide[ref_seq].append(cellular)
            except:
                viral_barcodes_overide[ref_seq] = [cellular]
            cellular_barcodes_overide[cellular] = [ref_seq]
            for y in cell_5prime[cell_5prime_ls[-1][1]]:
                mismatch_to_match[y] = ref_seq
            for y in cell_3prime[cell_3prime_ls[-1][1]]:
                mismatch_to_match[y] = ref_seq

        else:
            for i in cells_with_virus[cellular]:
                if reference_48mers == None:
                    cells_with_valid_barcodes += 1
                elif i in reference_48mers:
                    cells_with_valid_barcodes += 1  ### Determine how many cells have valid viral barcodes
                try:
                    viral_barcodes_overide[i].append(cellular)
                except:
                    viral_barcodes_overide[i] = [cellular]

    viral_barcodes = viral_barcodes_overide
    cells_with_virus = cellular_barcodes_overide

    ### Update the viral_barcodes dictionary
    viral_barcodes2 = {}
    cells_with_virus2 = {}
    for v in viral_barcodes:
        cell_barcodes = viral_barcodes[v]
        proceed = False
        if v in mismatch_to_match:
            v = mismatch_to_match[v]
            proceed = True
        elif reference_48mers == None:
            proceed = True
        elif v in reference_48mers:
            proceed = True
        if proceed:
            if v in viral_barcodes2:
                for c in cell_barcodes:
                    if c not in viral_barcodes2:
                        viral_barcodes2[v].append(c)
            else:
                viral_barcodes2[v] = cell_barcodes

    print cells_with_valid_barcodes, 'cells with valid viral barcodes.'
    viral_barcodes = viral_barcodes2
    ### Update the cells_with_virus dictionary
    for v in viral_barcodes:
        cell_barcodes = viral_barcodes[v]
        for c in cell_barcodes:
            if c in cells_with_virus2:
                if v not in cells_with_virus2[c]:
                    cells_with_virus2[c].append(v)
            else:
                cells_with_virus2[c] = [v]
    cells_with_virus = cells_with_virus2

    for c in cells_with_virus:
        if len(cells_with_virus[c]) > 1:
            doublet_cell[c] = []
    print len(doublet_cell), 'doublets'
    #print cells_with_virus['ACGCCGATCTGTTGAG']
    #print cells_with_virus['CAGAATCCAAACTGCT']
    #sys.exit()

    print len(cells_with_virus), 'updated cells with virus'
    print len(viral_barcodes), 'updated unique viral barcodes'
    #"""

    #reference_48mers={}

    multi_cell_mapping = 0
    unique_cells = {}
    multiMappingFinal = {}
    import collections
    import unique
    event_db = collections.OrderedDict()
    for cluster in cluster_header:
        event_db[cluster] = '0'
    k_value = 1
    import unique
    cluster_hits_counts = {}
    cluster_pairs = {}
    custom = []
    cells_per_pattern = {}
    for viral in viral_barcodes:
        clusters = []
        k = len(unique.unique(viral_barcodes[viral]))
        if k > k_value:
            proceed = True
            if reference_48mers == None:
                proceed = True
            elif len(reference_48mers) > 0:
                if viral in reference_48mers:
                    proceed = True
                else:
                    proceed = False
            if proceed:
                viral_cluster_db = copy.deepcopy(event_db)  ### copy this
                multi_cell_mapping += 1
                cell_tracker = []
                multilin = []
                all_cells = []
                for cell in viral_barcodes[viral]:
                    #if cell not in doublet_cell:
                    cell_tracker.append(cell)
                    try:
                        unique_cells[cell].append(viral)
                    except:
                        unique_cells[cell] = [viral]
                    if cell in cell_clusters:
                        cluster = cell_clusters[cell]
                        if 'Multi-Lin' == cluster:
                            multilin.append(cell)
                        all_cells.append(cell)
                        viral_cluster_db[cluster] = '1'
                        clusters.append(cluster)
                c1 = unique.unique(clusters)
                c2 = string.join(c1, '|')
                try:
                    cells_per_pattern[c2] += all_cells
                except:
                    cells_per_pattern[c2] = all_cells
                #if c1 == ['Multi-Lin c4-Mast']:
                #if c1 == ['MultiLin','MEP','Myelo-1'] or  c1 == ['MultiLin','MEP','Myelo-2'] or  c1 == ['MultiLin','MEP','Myelo-4']:
                #if 'Multi-Lin c4-Mast' in c1 and ('ERP-primed' not in c1 and 'MEP' not in c1 and 'MKP-primed' not in c1 and 'MKP' not in c1 and 'ERP' not in c1) and 'Monocyte' not in c1 and 'e-Mono' not in c1 and ('Gran' in c1 or 'Myelo-1' in c1 or 'Myelo-2' in c1 and 'Myelo-3' in c1 and 'Myelo-4' in c1):
                #if 'Multi-Lin' in c1 and ('e-Mono' in c1 or 'Monocyte' in c1) and ('ERP-primed' in c1 or 'MEP' in c1 or 'MKP-primed' in c1 or 'MKP' in c1) and ('Gran' in c1 or 'Myelo-4' in c1 or 'Myelo-1' in c1 or 'Myelo-2' in c1 or 'Myelo-3' in c1):
                if 'Multi-Lin' in c1:
                    for cell in multilin:
                        eom.write(
                            string.join(c1, '|') + '\t' + cell + '\t' + viral +
                            '\n')
                    custom += viral_barcodes[viral]
                    #print 'custom:',custom

                multiMappingFinal[viral] = viral_cluster_db

        ### Count the number of cluster pairs to make a weighted network
        for c1 in clusters:
            for c2 in clusters:
                if c1 != c2:
                    try:
                        cx = cluster_pairs[c1]
                        try:
                            cx[c2] += 1
                        except:
                            cx[c2] = 1
                    except:
                        cx = {}
                        cx[c2] = 1
                        cluster_pairs[c1] = cx
        clusters = string.join(unique.unique(clusters), '|')
        try:
            cluster_hits_counts[clusters] += 1
        except Exception:
            cluster_hits_counts[clusters] = 1
    #sys.exit()
    #print custom

    for cluster in cluster_pairs:
        cluster_counts = []
        cx = cluster_pairs[cluster]
        for c2 in cx:
            count = cx[c2]
            cluster_counts.append([count, c2])
        cluster_counts.sort()
        cluster_counts.reverse()
        #print cluster, cluster_counts
    print len(multiMappingFinal)

    final_ranked_cluster_hits = []
    for clusters in cluster_hits_counts:
        final_ranked_cluster_hits.append(
            [cluster_hits_counts[clusters], clusters])
    final_ranked_cluster_hits.sort()
    final_ranked_cluster_hits.reverse()
    for (counts, clusters) in final_ranked_cluster_hits:
        try:
            print str(counts) + '\t' + clusters + '\t' + str(
                len(unique.unique(cells_per_pattern[clusters])))
            #print cells_per_pattern[clusters];sys.exit()
        except:
            print str(counts) + '\t' + clusters

    eo.write(string.join(['UID'] + cluster_header, '\t') + '\n')
    for viral_barcode in multiMappingFinal:
        cluster_db = multiMappingFinal[viral_barcode]
        hits = []
        for cluster in cluster_db:
            hits.append(cluster_db[cluster])
        eo.write(string.join([viral_barcode] + hits, '\t') + '\n')
    eo.close()

    eo = export.ExportFile(viral_barcode_file[:-4] + '-cells-' + str(k_value) +
                           '.txt')
    for cell in unique_cells:
        #eo.write(cell+'\t1\t1\t'+str(len(unique_cells[cell]))+'\t'+string.join(unique_cells[cell],'|')+'\n')
        eo.write(cell + '\t1\t1\t\n')
    eo.close()
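
### Error-correction sketch (added; barcodes are hypothetical). Within one cell,
### variant reads of the same virus are matched through their most frequent first
### and last 10-mers, mirroring the cell_5prime/cell_3prime voting above.
reads = ['ACGTACGTAC' + 'T' * 28 + 'GGCCGGCCGG',
         'ACGTACGTAC' + 'T' * 27 + 'GGCCGGCCGG']  # second read carries a deletion
print(reads[0][:10] == reads[1][:10] and reads[0][-10:] == reads[1][-10:])  # True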
Example #50
    def test_unique(self):  ### class method excerpted from its test case
        self.assertEqual(unique('programmare'), 4)
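
### Interpretation sketch (added; an assumption, since this `unique` is unrelated
### to the unique.unique module used elsewhere). The expected value 4 matches the
### count of characters occurring exactly once in 'programmare' (p, o, g, e).
def unique(s):
    return sum(1 for c in set(s) if s.count(c) == 1)
print(unique('programmare'))  # 4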
Example #51
def importEnsemblTranscriptSequence(Species, Array_type, probeset_seq_db):
    global species
    global array_type
    species = Species
    array_type = Array_type
    start_time = time.time()

    import_dir = "/AltDatabase/" + species + "/SequenceData"  ### Multi-species file
    g = GrabFiles()
    g.setdirectory(import_dir)
    seq_files = g.searchdirectory("cdna.all")
    seq_files.sort()
    filename = seq_files[-1]

    output_file = "AltDatabase/" + species + "/SequenceData/output/" + array_type + "_Ens-mRNA_alignments.txt"
    dataw = export.ExportFile(output_file)

    output_file = "AltDatabase/" + species + "/SequenceData/output/sequences/" + array_type + "_Ens_mRNA_seqmatches.txt"
    datar = export.ExportFile(output_file)

    print "Begining generic fasta import of", filename
    fn = filepath(filename)
    sequence = ""
    x = 0
    count = 0
    global gene_not_found
    gene_not_found = []
    genes_found = {}
    for line in open(fn, "rU").xreadlines():
        exon_start = 1
        exon_stop = 1
        try:
            data, newline = string.split(line, "\n")
        except ValueError:
            continue
        try:
            if data[0] == ">":
                if len(sequence) > 0:
                    gene_found = "no"
                    count += 1
                    if ensembl_id in probeset_seq_db:
                        genes_found[ensembl_id] = []
                        seq_type = "full-length"
                        probeset_seq_data = probeset_seq_db[ensembl_id]
                        cDNA_seq = sequence[1:]
                        mRNA_length = len(cDNA_seq)
                        results = simpleSeqMatchProtocol(probeset_seq_data, cDNA_seq)
                        for (call, probeset) in results:
                            dataw.write(string.join([probeset, str(call), transid], "\t") + "\n")
                        ###Save all sequences to the disk rather than store these in memory. Just select the optimal sequences later.
                        values = [transid, cDNA_seq]
                        values = string.join(values, "\t") + "\n"
                        datar.write(values)
                        x += 1
                    else:
                        gene_not_found.append(ensembl_id)
                t = string.split(data[1:], ":")
                sequence = ""
                transid_data = string.split(t[0], " ")
                transid = transid_data[0]
                ensembl_id = t[-1]
                ind = 0
                # >ENST00000593546 cdna:known chromosome:GRCh37:HG27_PATCH:26597180:26600278:1 gene:ENSG00000268612 gene_biotype:protein_coding transcript_biotype:protein_coding
                for item in t:
                    if "gene" in item and "gene_" not in item:
                        ensembl_id = string.split(t[ind + 1], " ")[0]  ### In the following field
                    ind += 1
                """
                    if 'gene' in t[-3]:
                        ensembl_id = string.split(t[-2],' ')[0] ### Case in Zm for plant and probably other cDNA files (different fields here!!!)
                    elif 'gene' not in t[-2]: ### After Ensembl version 64
                        for entry in t:
                            if 'gene_biotype' in entry: ensembl_id = string.split(entry,' ')[0]"""
        except IndexError:
            continue
        try:
            if data[0] != ">":
                sequence = sequence + data
        except IndexError:
            continue

    datar.close()
    dataw.close()
    end_time = time.time()
    time_diff = int(end_time - start_time)
    gene_not_found = unique.unique(gene_not_found)
    print len(genes_found), "genes associated with reciprocal Ensembl junctions"
    print len(gene_not_found), "genes not found in the reciprocal junction database (should be there unless a conflict is present - or few alternative genes were predicted during junction array design)"
    print gene_not_found[0:10], "not found examples"
    if len(genes_found) < 10:
        print "\n\nWARNING!!!!! Ensembl appears to have changed the formatting of this file, preventing propper import!!!!!!\n\n"
    print "Ensembl transcript sequences analyzed in %d seconds" % time_diff
def associateQueryGenesWithInteractions(query_db,query_interactions,dir_file):
    suffix=''
    if dir_file!=None:
        if len(dir_file)!=0:
            suffix='-'+intNameShort+'_'+export.findFilename(dir_file)[:-4]
    if len(suffix)==0:
        try: suffix = '_'+FileName
        except Exception: None
    file_name = 'AltAnalyze-network'+suffix
    
    query_interactions_unique={}
    interacting_genes={}
    connections = 1
    primary=0
    secondary=0
    terciary=0
    for ensemblGene in query_db:
        if ensemblGene in interaction_db:
            for interacting_ensembl in interaction_db[ensemblGene]:
                if interacting_ensembl not in blackList:
                    ###Only allow direct interactions found in query
                    if interacting_ensembl in query_db:
                        try: query_interactions[ensemblGene].append(interacting_ensembl)
                        except KeyError: query_interactions[ensemblGene] = [interacting_ensembl]
                        try: query_interactions[interacting_ensembl].append(ensemblGene)
                        except KeyError: query_interactions[interacting_ensembl] = [ensemblGene]
                        primary+=1
                    if degrees == 2 or degrees == 'indirect':
                        try: interacting_genes[interacting_ensembl].append(ensemblGene)
                        except KeyError: interacting_genes[interacting_ensembl] = [ensemblGene]
                    elif degrees == 'allInteracting' or degrees == 'all possible':
                        try: query_interactions[ensemblGene].append(interacting_ensembl)
                        except KeyError: query_interactions[ensemblGene] = [interacting_ensembl]
                    if interacting_ensembl in secondaryQueryIDs: ### IDs in the expression file
                        secondary+=1 ### When indirect degrees selected, no additional power added by this (only for direct or shortest path)
                        try: query_interactions[ensemblGene].append(interacting_ensembl)
                        except KeyError: query_interactions[ensemblGene] = [interacting_ensembl]    
        if ensemblGene in second_degree_obligatory:
            for interacting_ensembl in second_degree_obligatory[ensemblGene]:
                try: interacting_genes[interacting_ensembl].append(ensemblGene)
                except KeyError: interacting_genes[interacting_ensembl] = [ensemblGene]

    ### Include indirect interactions to secondaryQueryIDs from the expression file
    if degrees == 2 or degrees == 'indirect':
        for ensemblGene in secondaryQueryIDs:
            if ensemblGene in interaction_db:
                for interacting_ensembl in interaction_db[ensemblGene]:
                    if interacting_ensembl not in blackList:
                        try:
                            interacting_genes[interacting_ensembl].append(ensemblGene)
                            tertiary+=1#; print interacting_ensembl
                        except KeyError: pass ### Only increase the interacting_genes count if the interacting partner is present from the primary query list
    #print primary,secondary,tertiary
    
    ### Report the number of unique interacting genes
    for interacting_ensembl in interacting_genes:
        if len(interacting_genes[interacting_ensembl])==1:
            interacting_genes[interacting_ensembl] = 1
        else:
            unique_interactions = unique.unique(interacting_genes[interacting_ensembl])
            interacting_genes[interacting_ensembl] = len(unique_interactions)
    
    query_indirect_interactions={}; indirect_interacting_gene_list=[]; interacting_gene_list=[]; added=[] 
    if degrees=='shortestPath' or degrees=='shortest path': ### Typically identifies the shortest path(s) between two nodes.
        query_indirect_interactions, indirect_interacting_gene_list, interacting_gene_list = evaluateShortestPath(query_db,interaction_db,10)
        
    else:
        if degrees==2 or degrees=='indirect' or len(secondDegreeObligatoryCategories)>0:
            for ensembl in interacting_genes:
                if interacting_genes[ensembl] > connections:
                    if ensembl in interaction_db: ### Only nodes removed due to promiscuity will not be found
                        for interacting_ensembl in interaction_db[ensembl]:
                            if interacting_ensembl in query_db or interacting_ensembl in secondaryQueryIDs:
                                try: query_indirect_interactions[interacting_ensembl].append(ensembl)
                                except KeyError: query_indirect_interactions[interacting_ensembl] = [ensembl]
                        ###Record the highest linked nodes
                        indirect_interacting_gene_list.append((interacting_genes[ensembl],ensembl)) 
        if len(obligatory_interactions)>0: ### Always include obligatory interactions
            all_reported_genes = combineDBs(query_interactions,query_indirect_interactions) ### combines the DBs and returns a unique list of genes
            for ensemblGene in all_reported_genes: ###This only includes genes in the original input list
                if ensemblGene in obligatory_interactions:
                    for interacting_ensembl in obligatory_interactions[ensemblGene]:
                        #symbol = ensembl_symbol_db[ensemblGene]                    
                        try: query_interactions[ensemblGene].append(interacting_ensembl)
                        except KeyError: query_interactions[ensemblGene] = [interacting_ensembl]
    
    z = dict(query_interactions.items() + query_indirect_interactions.items())
    interaction_restricted_db={}
    for ensembl in z:
        interacting_nodes = z[ensembl]
        for node in interacting_nodes:
            if ensembl in interaction_restricted_db:
                db = interaction_restricted_db[ensembl]
                db[node] = 1
            else: interaction_restricted_db[ensembl] = {node:1}

            if node in interaction_restricted_db:
                db = interaction_restricted_db[node]
                db[ensembl] = 1
            else: interaction_restricted_db[node] = {ensembl:1}
            
    if degrees==2 or degrees=='indirect': ### get rid of non-specific interactions
        query_indirect_interactions, indirect_interacting_gene_list, interacting_gene_list = evaluateShortestPath(query_db,interaction_restricted_db,4)
        
    ###Record the highest linked nodes
    for ensembl in query_interactions:
        linked_nodes = len(unique.unique(query_interactions[ensembl]))
        interacting_gene_list.append((linked_nodes,ensembl))
    interacting_gene_list.sort(); interacting_gene_list.reverse()
    indirect_interacting_gene_list.sort();  indirect_interacting_gene_list.reverse()
    
    print "Length of query_interactions:",len(query_interactions)
    query_interactions_unique=[]
    for gene1 in query_interactions:
        for gene2 in query_interactions[gene1]:
            temp = []; temp.append(gene2); temp.append(gene1)#; temp.sort()
            if gene1 == gene2: interaction_type = 'self'
            else: interaction_type = 'distinct'
            temp.append(interaction_type); temp.reverse()
            query_interactions_unique.append(temp)
    for gene1 in query_indirect_interactions:
        for gene2 in query_indirect_interactions[gene1]:
            temp = []; temp.append(gene2); temp.append(gene1)#; temp.sort()
            if gene1 == gene2: interaction_type = 'self'
            else: interaction_type = 'indirect'
            temp.append(interaction_type); temp.reverse()
            query_interactions_unique.append(temp)
    query_interactions_unique = unique.unique(query_interactions_unique)
    query_interactions_unique.sort()
    

    ###Write out nodes linked to many other nodes
    new_file = outputDir+'/networks/'+file_name+ '-interactions_'+str(degrees)+'_degrees_summary.txt'
    data = export.ExportFile(new_file)
    for (linked_nodes,ensembl) in interacting_gene_list:
        try: symbol = query_db[ensembl]
        except KeyError: symbol = ensembl_symbol_db[ensembl]
        data.write(str(linked_nodes)+'\t'+ensembl+'\t'+symbol+'\t'+'direct'+'\n')
    for (linked_nodes,ensembl) in indirect_interacting_gene_list:
        try: symbol = query_db[ensembl]
        except KeyError:
            try: symbol = ensembl_symbol_db[ensembl]
            except KeyError: symbol = ensembl
            if 'HMDB' in symbol:
                try: symbol = hmdb_symbol_db[ensembl]
                except Exception: pass
        data.write(str(linked_nodes)+'\t'+ensembl+'\t'+symbol+'\t'+'indirect'+'\n')
    data.close()

    regulated_gene_db = query_db    
    sif_export,symbol_pair_unique = exportInteractionData(file_name,query_interactions_unique,regulated_gene_db)
    return sif_export,symbol_pair_unique
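### interaction_restricted_db above stores every edge in both directions so that
### evaluateShortestPath can walk the graph from either endpoint. A minimal
### sketch of that symmetric-adjacency pattern, with a hypothetical add_edge()
### helper that is not part of this module:
def add_edge(adjacency, a, b):
    # record the edge under both nodes; values mirror the {node: 1} flags above
    adjacency.setdefault(a, {})[b] = 1
    adjacency.setdefault(b, {})[a] = 1

# adjacency = {}
# add_edge(adjacency, 'ENSG_A', 'ENSG_B')
# add_edge(adjacency, 'ENSG_A', 'ENSG_C')
# adjacency -> {'ENSG_A': {'ENSG_B': 1, 'ENSG_C': 1}, 'ENSG_B': {'ENSG_A': 1}, 'ENSG_C': {'ENSG_A': 1}}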
Example #53
def parse_affymetrix_annotations(filename):
    temp_affy_db = {}
    x=0
    fn=filepath(filename)
    for line in open(fn,'rU').xreadlines():             
        probeset_data,null = string.split(line,'\n')  #remove endline
        affy_data = string.split(probeset_data[1:-1],'","')  #split the quoted fields, dropping the outer quotes
        if x==0:
            if probeset_data[0] == '#':
                continue
            x +=1
            affy_headers = affy_data
        else:
            x +=1
            probesets = affy_data[0]
            temp_affy_db[probesets] = affy_data[1:]
    x = 0; eg = ''; gs = ''
    while x < len(affy_headers):
        if 'rocess' in affy_headers[x]: gb = x - 1
        if 'omponent' in affy_headers[x]: gc = x - 1
        if 'olecular' in affy_headers[x]: gm = x - 1
        if 'athway' in affy_headers[x]: gp = x - 1
        if 'Gene Symbol' in affy_headers[x]: gs = x - 1
        if 'Ensembl' in affy_headers[x]: eg = x - 1
        x += 1
    ###Below code used if human exon array parsed
    global analyze_human_exon_data
    analyze_human_exon_data = 'no'
    if eg == '':
        x = 0
        while x < len(affy_headers):
            if 'mrna_assignment' in affy_headers[x]:
                eg = x - 1
                analyze_human_exon_data = 'yes'
            x += 1
    for probeset in temp_affy_db:
        affy_data = temp_affy_db[probeset]
        try:
            go_bio = affy_data[gb]
        except IndexError:
            ###Occurs due to a newline error
            continue
        go_com = affy_data[gc]
        go_mol = affy_data[gm]
        genmapp = affy_data[gp]
        if gs == '': symbol = ''
        else: symbol = affy_data[gs]
        if analyze_human_exon_data == 'no':
            ensembl = affy_data[eg]
        else:
            ensembl_data = affy_data[eg]
            ensembl=''
            try:
                if 'gene:ENSMUSG' in ensembl_data:
                    ensembl_data = string.split(ensembl_data,'gene:ENSMUSG')
                    ensembl_data = string.split(ensembl_data[1],' ')
                    ensembl = 'ENSMUSG'+ ensembl_data[0]
                if 'gene:ENSG' in ensembl_data:
                    ensembl_data = string.split(ensembl_data,'gene:ENSG')
                    ensembl_data = string.split(ensembl_data[1],' ')
                    ensembl = 'ENSG'+ ensembl_data[0]
            except IndexError:
                continue
        goa=[]
      
        goa = merge_go_annoations(go_bio,goa)
        goa = merge_go_annoations(go_com,goa)
        goa = merge_go_annoations(go_mol,goa)
        goa = merge_go_annoations(genmapp,goa)

        goa=unique.unique(goa); goa.sort(); 
        goa = string.join(goa,'')
        try:
            ensembl = string.split(ensembl,' /// ')
        except ValueError:
            ensembl = [ensembl]
        for ensembl_id in ensembl:
            if len(goa)>10:
                go_annotations[ensembl_id] = goa, symbol
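### The column lookup above matches header substrings (e.g. 'rocess' for
### 'GO Biological Process') and subtracts 1 because temp_affy_db stores each
### row without its leading probeset ID column. A minimal sketch with a
### hypothetical find_column() helper, not part of this module:
def find_column(headers, fragment):
    for i, header in enumerate(headers):
        if fragment in header:
            return i - 1   # data rows are stored without the probeset column
    return ''              # mirrors the eg == '' sentinel used above

# headers = ['Probe Set ID', 'Gene Symbol', 'Ensembl', 'GO Biological Process']
# find_column(headers, 'Ensembl') -> 1; find_column(headers, 'rocess') -> 2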
def analyzeCommonProteinClassesAndCompartments(sm,kw,ft_call,ft_string,rc,de,go):
    ### Used to assign "Common Protein Classes" annotations to Gene Expression summary file (ExpressionOutput folder)
    class_def=[]; annotation=[]; cellular_components = []
    if 'DNA-binding domain' in sm or 'Transcription' in go or 'Transcription regulation' in kw: class_def.append('transcription regulator')
    if 'protein kinase superfamily' in sm or 'Kinase' in go: class_def.append('kinase')
    if 'mRNA splicing' in kw or 'mRNA processing' in kw: class_def.append('splicing regulator')

    if 'G-protein coupled receptor' in sm or 'LU7TM' in sm:
        g_type = []
        if ('adenylate cyclase' in ft_call) or ('adenylyl cyclase' in ft_call):
            ###if both stimulatory and inhibitory terms occur
            if (('stimulat' in ft_call) or ('activat' in ft_call)) and ('inhibit' in ft_call):
                if 'inhibit aden' in ft_call: g_type.append('Gi')
                if 'stimulate aden' in ft_call or 'activate aden' in ft_call: g_type.append('Gs')
            elif ('stimulat' in ft_call) or ('activat' in ft_call): g_type.append('Gs')
            elif ('inhibit' in ft_call): g_type.append('Gi')
        if ('cAMP' in ft_call):
            if ('stimulat' in ft_call) or ('activat' in ft_call): g_type.append('Gs')
            if ('inhibit' in ft_call): g_type.append('Gi')
        if ('G(s)' in ft_call): g_type.append('Gs')
        if ('G(i)' in ft_call): g_type.append('Gi')
        if ('pertussis' in ft_call and 'insensitive' not in ft_call): g_type.append('Gi')
        if ('G(i/0)' in ft_call) or ('G(i/o)' in ft_call): g_type.append('Gi')
        if ('G(o)' in ft_call): g_type.append('Go')
        if ('G(alpha)q' in ft_call): g_type.append('Gq')
        if ('G(11)' in ft_call): g_type.append('G11')
        if ('G(12)' in ft_call): g_type.append('G12')
        if ('G(13)' in ft_call): g_type.append('G13')
        if ('mobiliz' in ft_call and 'calcium' in ft_call and 'without formation' not in ft_call): g_type.append('Gq')
        if ('phosphatidyl' in ft_call and 'inositol' in ft_call) or ('G(q)' in ft_call) or ('phospholipase C' in ft_call):
            g_type.append('Gq')
        if ('inositol phos' in ft_call) or ('phosphoinositide' in ft_call) or ('PKC' in ft_call) or ('PLC' in ft_call):
            g_type.append('Gq')
        if ('intracellular' in ft_call and 'calcium' in ft_call) and 'nor induced' not in ft_call: g_type.append('Gq')
        if 'G-alpha-11' in ft_call: g_type.append('G11')
        if 'Orphan' in ft_call or 'orphan' in ft_call: g_type.append('orphan')
        if 'taste' in ft_call or 'Taste' in ft_call: g_type.append('taste')
        if 'vision' in ft_call or 'Vision' in ft_call: g_type.append('vision')
        if 'odorant' in ft_call or 'Odorant' in ft_call: g_type.append('odorant')
        if 'influx of extracellular calcium' in ft_call: g_type.append('Gq')
        if 'pheromone receptor' in ft_call or 'Pheromone receptor' in ft_call: g_type.append('pheromone')
        g_protein_list = unique.unique(g_type); g_protein_str = string.join(g_protein_list,'|')
        class_def.append('GPCR(%s)' % g_protein_str)
    elif 'receptor' in sm or 'Receptor' in go: class_def.append('receptor')
    if len(ft_string)>0: ### Add cellular component annotations
        if 'ecreted' in sm: annotation.append('extracellular')
        if 'Extracellular space' in sm: annotation.append('extracellular')
        if 'ecreted' in go: annotation.append('extracellular')
        if 'xtracellular' in go: annotation.append('extracellular')
        if 'Membrane' in sm: annotation.append('transmembrane')
        if 'TRANSMEM' in ft_string: annotation.append('transmembrane')
        if 'integral to membrane' in go: annotation.append('transmembrane')
        if 'Nucleus' in sm: annotation.append('nucleus')
        if 'nucleus' in go: annotation.append('nucleus')
        if 'Cytoplasm' in sm: annotation.append('cytoplasm')
        if 'Mitochondrion' in sm: annotation.append('mitochondrion')
        if 'SIGNAL' in ft_string: annotation.append('signal')
        ###Generate 'probably secreted' annotations
        if 'signal' in annotation and 'transmembrane' not in annotation:
            for entry in annotation:
                if entry != 'signal': cellular_components.append(entry)
            cellular_components.append('extracellular');annotation = cellular_components
        elif 'signal' in annotation and 'transmembrane' in annotation:
            for entry in annotation:
                if entry != 'signal': cellular_components.append(entry)
            annotation = cellular_components
            
    cellular_components = string.join(annotation,'|')
    class_def = string.join(class_def,'|')
    return class_def, cellular_components
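### The compartment post-processing above treats a SIGNAL feature without a
### TRANSMEM feature as evidence of secretion: 'signal' is replaced by
### 'extracellular'; with a TRANSMEM feature, 'signal' is simply dropped. A
### minimal sketch, using a hypothetical resolve_signal_annotation() helper
### that is not part of this module:
def resolve_signal_annotation(annotation):
    if 'signal' not in annotation:
        return annotation
    resolved = [entry for entry in annotation if entry != 'signal']
    if 'transmembrane' not in annotation:
        resolved.append('extracellular')   # probably secreted
    return resolved

# resolve_signal_annotation(['nucleus', 'signal']) -> ['nucleus', 'extracellular']
# resolve_signal_annotation(['signal', 'transmembrane']) -> ['transmembrane']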
def importAndReformatEnsemblJunctionAnnotations(species,array_type,nonconstitutive_junctions):
    filename = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_Ensembl_'+array_type+'_probesets.txt'
    export_filepath = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_Ensembl_probesets.txt'
    efn=filepath(export_filepath); export_data = open(efn,'w')
    
    fn=filepath(filename); x = 0; ensembl_exon_db={}; left={}; right={}; exon_gene_db={}; nonjunction_aligning={}
    for line in open(fn,'rU').xreadlines():             
        data = cleanUpLine(line)
        if x == 0: x=1; export_data.write(data+'\n')
        else:
            t = string.split(data,'\t')
            probeset, exon_id, ensembl_gene_id, transcript_cluster_id, chr, strand, probeset_start, probeset_stop, affy_class, constitutive_probeset, ens_exon_ids, exon_annotations,regionid,r_start,r_stop,splice_event,splice_junctions = t
            if len(regionid)<1: regionid = exon_id; t[12] = exon_id     
            if chr == 'chrM': chr = 'chrMT' ### MT is the Ensembl convention whereas M is the Affymetrix and UCSC convention
            if chr == 'M': chr = 'MT' ### MT is the Ensembl convention whereas M is the Affymetrix and UCSC convention
            tc,probeset=string.split(probeset,':'); regionid = string.replace(regionid,'-','.'); original_region_id = regionid
            r_starts=string.split(r_start,'|'); r_stops=string.split(r_stop,'|')
            ed = EnsemblImport.ExonStructureData(ensembl_gene_id, chr, strand, probeset_start, probeset_stop, constitutive_probeset, ens_exon_ids, []); ed.reSetExonID(regionid)
            if '|5' in probeset:
                left[probeset[:-2]] = ed,t
                if strand == '+': ### If the junction probesets DO NOT align to the region coordinates, then the probeset maps to a junction outside the database
                    if probeset_stop not in r_stops: nonjunction_aligning[probeset[:-2]] = original_region_id+'_'+probeset_stop,'left'
                elif probeset_start not in r_starts: nonjunction_aligning[probeset[:-2]] = original_region_id+'_'+probeset_start,'left'
            elif '|3' in probeset:
                right[probeset[:-2]] = ed,t
                if strand == '+':
                    if probeset_start not in r_starts: nonjunction_aligning[probeset[:-2]] = original_region_id+'_'+probeset_start,'right'
                elif probeset_stop not in r_stops: nonjunction_aligning[probeset[:-2]] = original_region_id+'_'+probeset_stop,'right'
            else:
                t[0] = probeset
                ensembl_exon_db[probeset] = ed
                export_data.write(string.join(t,'\t')+'\n')
                regionids = string.split(regionid,'|')
                for regionid in regionids: exon_gene_db[ensembl_gene_id,regionid] = probeset
                
    for probeset in left:
        if probeset in right:
            l,pl = left[probeset]; r,pr = right[probeset]
            if l.Constitutive() != r.Constitutive(): l.setConstitutive('no') ### used to determine if a junction is alternative or constitutive
            if probeset in nonconstitutive_junctions: l.setConstitutive('no')
            l.setJunctionCoordinates(l.ExonStart(),l.ExonStop(),r.ExonStart(),r.ExonStop())            
            ens_exon_idsl = pl[10]; ens_exon_idsr = pr[10]; exon_idl = pl[1]; exon_idr = pr[1]
            regionidl = pl[12]; regionidr = pr[12]; splice_junctionsl = pl[-1]; splice_junctionsr = pr[-1]
            exon_idl = string.replace(exon_idl,'-','.'); exon_idr = string.replace(exon_idr,'-','.')
            regionidl_block = string.split(regionidl,'-')[0]; regionidr_block = string.split(regionidr,'-')[0]
                            
            if regionidl_block != regionidr_block: ### Otherwise, the junction is probing a single exon block and thus is not informative     
                regionidl = string.replace(regionidl,'-','.'); regionidr = string.replace(regionidr,'-','.')
                exon_id = exon_idl+'-'+exon_idr; regionid = regionidl+'-'+regionidr
                
                if probeset in nonjunction_aligning:
                    new_region_id, side = nonjunction_aligning[probeset]
                    regionid = renameJunction(regionid,side,new_region_id)
                
                l.reSetExonID(regionid); ensembl_exon_db[probeset] = l
                
                splice_junctionsl+=splice_junctionsr
                ens_exon_idsl = string.split(ens_exon_idsl,'|'); ens_exon_idsr = string.split(ens_exon_idsr,'|')
                ens_exon_ids=string.join(unique.unique(ens_exon_idsl+ens_exon_idsr),'|')
                pl[10] = ens_exon_ids; pl[12] = regionid; pl[1] = exon_id; pl[-1] = splice_junctionsl
                pl[13] = l.ExonStart()+'|'+l.ExonStop(); pl[14] = r.ExonStart()+'|'+r.ExonStop()
                strand = pl[5]
                if strand == '+':
                    pl[6] = l.ExonStop(); pl[7] = r.ExonStart() ### junction splice-sites
                else:
                    pl[6] = l.ExonStart(); pl[7] = r.ExonStop() ### junction splice-sites

                pl[0] = probeset; pl[9] = l.Constitutive()
            
                pl = string.join(pl,'\t')+'\n'
                export_data.write(pl)

    export_data.close()
    return ensembl_exon_db,exon_gene_db
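### The pairing above relies on the naming convention that the two halves of a
### junction probeset share a base ID and end in '|5' (left) or '|3' (right),
### so probeset[:-2] recovers the shared key. A minimal sketch with a
### hypothetical split_junction_halves() helper, not part of this module:
def split_junction_halves(probeset_ids):
    left = {}; right = {}
    for probeset in probeset_ids:
        if '|5' in probeset: left[probeset[:-2]] = probeset
        elif '|3' in probeset: right[probeset[:-2]] = probeset
    # only probesets with both halves present form a usable junction
    return [(left[key], right[key]) for key in left if key in right]

# split_junction_halves(['JUC0001|5', 'JUC0001|3', 'JUC0002|5'])
# -> [('JUC0001|5', 'JUC0001|3')]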
Example #56
def importEnsemblTranscriptSequence(Species, Array_type, probeset_seq_db):
    global species
    global array_type
    species = Species
    array_type = Array_type
    start_time = time.time()

    import_dir = '/AltDatabase/' + species + '/SequenceData'  ### Multi-species file
    g = GrabFiles()
    g.setdirectory(import_dir)
    seq_files = g.searchdirectory('cdna.all')
    seq_files.sort()
    filename = seq_files[-1]

    output_file = 'AltDatabase/' + species + '/SequenceData/output/' + array_type + '_Ens-mRNA_alignments.txt'
    dataw = export.ExportFile(output_file)

    output_file = 'AltDatabase/' + species + '/SequenceData/output/sequences/' + array_type + '_Ens_mRNA_seqmatches.txt'
    datar = export.ExportFile(output_file)

    print "Begining generic fasta import of", filename
    fn = filepath(filename)
    sequence = ''
    x = 0
    count = 0
    global gene_not_found
    gene_not_found = []
    genes_found = {}
    for line in open(fn, 'rU').xreadlines():
        exon_start = 1
        exon_stop = 1
        try:
            data, newline = string.split(line, '\n')
        except ValueError:
            continue
        try:
            if data[0] == '>':
                if len(sequence) > 0:
                    gene_found = 'no'
                    count += 1
                    if ensembl_id in probeset_seq_db:
                        genes_found[ensembl_id] = []
                        seq_type = 'full-length'
                        probeset_seq_data = probeset_seq_db[ensembl_id]
                        cDNA_seq = sequence[1:]
                        mRNA_length = len(cDNA_seq)
                        results = simpleSeqMatchProtocol(
                            probeset_seq_data, cDNA_seq)

                        for (call, probeset) in results:
                            dataw.write(
                                string.join(
                                    [probeset, str(call), transid], '\t') +
                                '\n')
                        ###Save all sequences to the disk rather than store these in memory. Just select the optimal sequences later.
                        values = [transid, cDNA_seq]
                        values = string.join(values, '\t') + '\n'
                        datar.write(values)
                        x += 1
                    else:
                        gene_not_found.append(ensembl_id)
                t = string.split(data[1:], ':')
                sequence = ''
                transid_data = string.split(t[0], ' ')
                transid = transid_data[0]
                ensembl_id = t[-1]
                if '.' in transid:
                    transid = string.split(
                        transid,
                        '.')[0]  ### versioned IDs will cause matching issues
                ind = 0
                #>ENST00000593546 cdna:known chromosome:GRCh37:HG27_PATCH:26597180:26600278:1 gene:ENSG00000268612 gene_biotype:protein_coding transcript_biotype:protein_coding
                for item in t:
                    if 'gene_biotype' in item:
                        ensembl_id = string.split(
                            item, ' ')[0]  ### In the following field
                        break
                    elif 'gene' in item and 'gene_' not in item:
                        ensembl_id = string.split(
                            t[ind + 1], ' ')[0]  ### In the following field
                    ind += 1
                if '.' in ensembl_id:
                    ensembl_id = string.split(
                        ensembl_id,
                        '.')[0]  ### versioned IDs will cause matching issues
                """
                    if 'gene' in t[-3]:
                        ensembl_id = string.split(t[-2],' ')[0] ### Case in Zm for plant and probably other cDNA files (different fields here!!!)
                    elif 'gene' not in t[-2]: ### After Ensembl version 64
                        for entry in t:
                            if 'gene_biotype' in entry: ensembl_id = string.split(entry,' ')[0]"""
        except IndexError:
            continue
        try:
            if data[0] != '>': sequence = sequence + data
        except IndexError:
            continue

    datar.close()
    dataw.close()
    end_time = time.time()
    time_diff = int(end_time - start_time)
    gene_not_found = unique.unique(gene_not_found)
    print len(
        genes_found), 'genes associated with reciprocal Ensembl junctions'
    print len(
        gene_not_found
    ), "genes not found in the reciprocal junction database (should be there unless a conflict is present - or few alternative genes were predicted during junction array design)"
    print gene_not_found[0:10], 'not found examples'
    if len(genes_found) < 10:
        print '\n\nWARNING!!!!! Ensembl appears to have changed the formatting of this file, preventing proper import!!!!!!\n\n'
    print "Ensembl transcript sequences analyzed in %d seconds" % time_diff
def identifyPutativeSpliceEvents(exon_db,constituitive_probeset_db,array_id_db,agglomerate_inclusion_probesets,onlyAnalyzeJunctions):
    exon_dbase = {}; probeset_comparison_db = {}; x = 0; y = 0
    ### Grab all probesets where we can identify a potential exon inclusion/exclusion event
    if len(array_id_db) == 0: array_id_db = exon_db ### Used when exporting all comparative junction data
        
    for probeset in array_id_db:
        if probeset in exon_db:
            affygene = exon_db[probeset].GeneID() #exon_db[probeset] = affygene,exons,ensembl,block_exon_ids,block_structure,comparison_info    
            exons = exon_db[probeset].ExonID() #get rid of last pipe
            if probeset not in constituitive_probeset_db:
              #thus, there is a 'gene' probeset for that gene, but we don't want to look at the gene probesets
              if '|' not in exons: #get rid of any block exons or ambiguities
                try: x += 1; probeset_comparison_db[affygene].append(exons)
                except KeyError: x += 1; probeset_comparison_db[affygene] = [exons]
            exon_dbase[affygene,exons] = probeset

    print "Number of putative probeset comparisons:",x
    
    probe_level_db = {}
    for affygene in probeset_comparison_db:
        for exon_probeset1 in probeset_comparison_db[affygene]:
            for exon_probeset2 in probeset_comparison_db[affygene]:
                if exon_probeset1 != exon_probeset2:
                    if '-' in exon_probeset1: #get both pair-wise possibilities with this, to grab junctions
                        e1a,e1b = string.split(exon_probeset1,'-')
                        e1 = e1a,e1b
                        try:
                            e2a,e2b = string.split(exon_probeset2,'-')
                            e2 = e2a,e2b
                        except ValueError: e2 = exon_probeset2
                        try: probe_level_db[affygene,e1].append(e2)
                        except KeyError: probe_level_db[affygene,e1] = [e2]
                    else: ### Required when exon_probeset1 is a single exon rather than a junction
                        if '-' in exon_probeset2:
                            e2a,e2b = string.split(exon_probeset2,'-')
                            e2 = e2a,e2b
                            e1 = exon_probeset1
                            try: probe_level_db[affygene,e2].append(e1)
                            except KeyError: probe_level_db[affygene,e2] = [e1]
    #print "Looking for exon events defined by probeset exon associations"
    alt_junction_db,critical_exon_db = independently_rank_analyze_junction_sets(probe_level_db,onlyAnalyzeJunctions)
    #print "Associations Built\n"

    ### Rearrange alt_junction_db and agglomerate data for inclusion probesets
    exon_inclusion_db={}; exon_inclusion_event_db={}; alt_junction_db_collapsed={}
    if agglomerate_inclusion_probesets == 'yes':
        for affygene in alt_junction_db:
            alt_junction_db[affygene].sort() ### Should be no need to sort later if we do this
            for event in alt_junction_db[affygene]:
                ### event = [('ei', 'E16-E17'), ('ex', 'E16-E18')]
                event1 = event[0][0]; exon_set1 = event[0][1]; exon_set2 = event[1][1]            
                probeset1 = exon_dbase[affygene,exon_set1]; probeset2 = exon_dbase[affygene,exon_set2]
                if event1 == 'ei':
                    ###First generate the original fold values for export summary, then the adjusted
                    try: exon_inclusion_db[probeset2].append(probeset1)
                    except KeyError: exon_inclusion_db[probeset2] = [probeset1]
                    try: exon_inclusion_event_db[(affygene, probeset2, event[1])].append(event)
                    except KeyError: exon_inclusion_event_db[(affygene, probeset2, event[1])] = [event]
                else: ### Store all the missing mutual exclusive splicing events
                    try: alt_junction_db_collapsed[affygene].append(event)
                    except KeyError: alt_junction_db_collapsed[affygene] = [event]
        
        ###Create a new alt_junction_db with merged inclusion events
        for key in exon_inclusion_event_db:
            affygene = key[0]; excl_probeset=key[1]; excl_event = key[2]
            ###Collect critical exon information from each inclusion exon-set to agglomerate and delete old entries
            new_critical_exon_list=[]; incl_exon_sets=[]
            for event in exon_inclusion_event_db[key]:
                incl_exon_set = event[0][1]; incl_exon_sets.append(incl_exon_set) ### Don't sort since this will throw off probeset relationships: incl_exon_sets.sort()
                if len(exon_inclusion_event_db[key])>1:  ###If the original list of events > 1
                    critical_exon_list = critical_exon_db[affygene,tuple(event)][1]
                    for exon in critical_exon_list: new_critical_exon_list.append(exon)
                    #del critical_exon_db[affygene,tuple(event)]
            new_critical_exon_list = unique.unique(new_critical_exon_list); new_critical_exon_list.sort()
            new_critical_exon_list = [1,new_critical_exon_list]
            incl_exon_sets_str = string.join(incl_exon_sets,'|') ### New inclusion exon group
            event = [('ei',incl_exon_sets_str),excl_event] ### Store new inclusion exon group
            try: alt_junction_db_collapsed[affygene].append(event)
            except KeyError: alt_junction_db_collapsed[affygene] = [event]
            ###Replace exon_dbase entries with new combined probeset IDs
            incl_probesets = exon_inclusion_db[excl_probeset]
            incl_probesets_str = string.join(incl_probesets,'|')
            if len(incl_exon_sets)>1: ###Often there will be only a single inclusion probeset
                """for exons in incl_exon_sets:
                    key = affygene,exons
                    try: del exon_dbase[key] ###delete individual inclusion exons and replace with a single inclusion agglomerate
                    except KeyError: continue ###Can occur more than once, if an exon participates in more than one splicing event
                """
                exon_dbase[affygene,incl_exon_sets_str] = incl_probesets_str
                critical_exon_db[affygene,tuple(event)] = new_critical_exon_list
                ###Create a new probeset entry in exon_db for the agglomerated probesets
                new_block_exon_ids=[] #exon_db[probeset] = affygene,exons,ensembl,block_exon_ids,block_structure
                for probeset in incl_probesets:
                    edat = exon_db[probeset]; ensembl = edat.ExternalGeneID(); block_exon_ids = edat.SecondaryExonID(); block_structure = edat.GeneStructure()
                    new_block_exon_ids.append(block_exon_ids)
                new_block_exon_ids = string.join(new_block_exon_ids,'')
                edat = exon_db[incl_probesets[0]]; edat1 = edat; edat1.setDisplayExonID(incl_exon_sets_str) #; edat1.setExonID(edat.ExonID()) ### Use the first inclusion probeset instance for storing all instance data
                edat1.setSecondaryExonID(new_block_exon_ids); edat1.setProbeset(incl_probesets[0])
                exon_db[incl_probesets_str] = edat1
        print "Length of original splice event database:",len(alt_junction_db)
        print "Length of agglomerated splice event database:",len(alt_junction_db_collapsed)
        alt_junction_db = alt_junction_db_collapsed  ### Replace with agglomerated database
        ### End Rearrangement
        
    return alt_junction_db,critical_exon_db,exon_dbase,exon_inclusion_db,exon_db
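### The putative-event search above compares every junction for a gene against
### every other exon set for that gene, keying junctions by their (5' exon,
### 3' exon) tuple. A minimal single-gene sketch of that pairwise step, using
### a hypothetical pair_junctions() helper (the real code also keys by gene
### and additionally anchors single exons against junctions):
def pair_junctions(exon_sets):
    probe_level = {}
    for set1 in exon_sets:
        if '-' not in set1: continue   # only junctions anchor an event here
        e1 = tuple(set1.split('-'))
        for set2 in exon_sets:
            if set1 == set2: continue
            e2 = tuple(set2.split('-')) if '-' in set2 else set2
            probe_level.setdefault(e1, []).append(e2)
    return probe_level

# pair_junctions(['E16-E17', 'E16-E18', 'E17'])
# -> {('E16', 'E17'): [('E16', 'E18'), 'E17'], ('E16', 'E18'): [('E16', 'E17'), 'E17']}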