# Example no. 1
# 0
def getParametersAndExecute(probeset_seq_file,array_type,species,data_type):
    """Build and return the probeset sequence database for `data_type`.

    probeset_seq_file -- sequence file handed to importProbesetSeqeunces
        (only used when data_type is 'critical-exons').
    array_type -- platform name used to build the annotation path; 'RNASeq'
        selects the Ensembl exon file instead of the array probeset file.
    species -- species code used in the AltDatabase path.
    data_type -- 'critical-exons' or 'junctions'.

    Returns whatever the selected import function returns and prints the
    number of seconds the sequence import took.
    Raises ValueError for any other data_type (this previously surfaced as
    an UnboundLocalError when the elapsed time was printed).
    """
    if data_type == 'critical-exons':
        if array_type == 'RNASeq':
            annotation_suffix = '_Ensembl_exons.txt'
        else:
            annotation_suffix = '_Ensembl_'+array_type+'_probesets.txt'
        probeset_annotations_file = 'AltDatabase/'+species+'/'+array_type+'/'+species+annotation_suffix
        ### Import probe-level associations (deliberately outside the timed section, as before)
        exon_db = ExonSeqModule.importSplicingAnnotationDatabase(probeset_annotations_file,array_type)
        start_time = time.time()
        ### Do this locally with a function that works on tab-delimited as opposed to fasta sequences (exon array)
        probeset_seq_db = importProbesetSeqeunces(probeset_seq_file,exon_db,species)
    elif data_type == 'junctions':
        biotype = 'gene' ### Indicates whether to store information at the level of genes or probesets
        start_time = time.time()
        probeset_seq_db = importSplicingAnnotationDatabaseAndSequence(species,array_type,biotype)
    else:
        raise ValueError("Unsupported data_type: %r (expected 'critical-exons' or 'junctions')" % (data_type,))
    time_diff = int(time.time()-start_time)
    ### Single parenthesised argument: prints the same text under the Python 2 print statement
    print("Analyses finished in %d seconds" % time_diff)
    return probeset_seq_db
def annotateJunctionIDsAsExon(species, array_type):
    """Write a lookup table from junction-platform probesets to exon array probesets.

    Loads the annotations for `array_type` (the '_Ensembl_exons.txt' file when
    array_type is 'RNASeq', otherwise the filtered junction probeset file) and
    the exon array annotations for `species`, pairs entries that share a
    (gene ID, exon region ID) key, and writes the pairs as tab-delimited text
    with a header line to
    AltDatabase/<species>/<array_type>/<species>_<array_type>-exon_probesets.txt.

    Returns None; the only result is the written file.
    """
    from build_scripts import ExonSeqModule
    array_dir = 'AltDatabase/' + species + '/' + array_type + '/'
    if array_type == 'RNASeq':
        probeset_annotations_file = array_dir + species + '_Ensembl_exons.txt'
    else:
        probeset_annotations_file = array_dir + species + '_Ensembl_junction_probesets-filtered.txt'
    junction_exon_db = ExonSeqModule.importSplicingAnnotationDatabase(
        probeset_annotations_file, array_type)

    ### The exon array file is imported with the caller's array_type, exactly as before
    exon_annotations_file = 'AltDatabase/' + species + '/exon/' + species + '_Ensembl_probesets.txt'
    exon_db = ExonSeqModule.importSplicingAnnotationDatabase(
        exon_annotations_file, array_type)

    unique_exon_regions = _indexExonRegions(exon_db)
    junction_to_exonids = _mapJunctionsToExonProbesets(
        junction_exon_db, unique_exon_regions)

    output_file = array_dir + species + '_' + array_type + '-exon_probesets.txt'
    fn = filepath(output_file)
    ### 'with' guarantees the file is closed even if a write fails
    with open(fn, 'w') as data:
        data.write(array_type + '_probeset\texon_probeset\n')
        for probeset in junction_to_exonids:
            data.write(probeset + '\t' + junction_to_exonids[probeset] + '\n')


def _indexExonRegions(exon_db):
    """Return {(gene ID, exon region ID): annotation object} built from exon_db."""
    unique_exon_regions = {}
    multiple_exon_regions = {}
    for probeset in exon_db:
        y = exon_db[probeset]
        geneid = y.GeneID()
        region_id = y.ExonRegionID()
        if '|' in region_id:
            ### One probeset annotated to several '|'-separated regions
            for exonid in region_id.split('|'):
                multiple_exon_regions[geneid, exonid] = y
        else:
            unique_exon_regions[geneid, region_id] = y
    ### Add missing exons to unique; single-region entries take precedence
    for uid in multiple_exon_regions:
        unique_exon_regions.setdefault(uid, multiple_exon_regions[uid])
    return unique_exon_regions


def _mapJunctionsToExonProbesets(junction_exon_db, unique_exon_regions):
    """Return {junction probeset ID: exon probeset ID} for entries whose region key is indexed."""
    junction_to_exonids = {}
    for probeset in junction_exon_db:
        y = junction_exon_db[probeset]
        geneid = y.GeneID()
        region_id = y.ExonRegionID()
        if '|' in region_id:
            candidates = [(geneid, exonid) for exonid in region_id.split('|')]
        else:
            ### NOTE(review): only single-region IDs get '.' rewritten to '-' before the
            ### lookup; multi-region IDs are looked up unchanged -- confirm this is intended
            candidates = [(geneid, region_id.replace('.', '-'))]
        ### When several regions match, the last listed match is the one kept
        for uid in candidates:
            if uid in unique_exon_regions:
                junction_to_exonids[probeset] = unique_exon_regions[uid].Probeset()
    return junction_to_exonids