Esempio n. 1
0
def formatAttributeForExport(attribute_db,filename):
    from build_scripts import IdentifyAltIsoforms
    export_db={}
    for (gene,probeset) in attribute_db:
        attribute_list = attribute_db[(gene,probeset)]; attribute_list2=[]
        for (attribute,direction) in attribute_list:
            attribute = string.replace(attribute,'|',' ')
            attribute_list2.append(attribute+'|'+direction)
        export_db[probeset]=attribute_list2
    print 'Exporting:',filename
    IdentifyAltIsoforms.exportSimple(export_db,filename,'')
Esempio n. 2
0
def executeParameters(species,array_type,force,genomic_build,update_uniprot,update_ensembl,update_probeset_to_ensembl,update_domain,update_miRs,update_all,update_miR_seq,ensembl_version):    
    """Run the database build steps selected by the 'yes'-valued update flags.

    Every update_* argument is compared against the string 'yes'; any other
    value skips that step. The steps run in a fixed order: Ensembl, UniProt,
    probeset-to-Ensembl, domain/protein analysis, microRNA, followed by
    array-type specific post-processing and file verification.

    Parameters:
        species: species code; 'Mm', 'Hs' and 'Rn' are tested explicitly below.
        array_type: platform name ('AltMouse', 'junction', 'RNASeq', 'exon',
            'gene' are tested below); may carry a sub-type as 'type|subtype'.
        force: passed through unchanged to the build helpers.
        genomic_build: passed to the AltMouse and junction annotation builders.
        update_uniprot, update_ensembl, update_probeset_to_ensembl,
        update_domain, update_miRs: 'yes' to run the corresponding step.
        update_all: 'yes' switches on the five flags listed above
            (update_miR_seq is left as supplied).
        update_miR_seq: 'yes' to also run MatchMiRTargetPredictions within
            the microRNA step.
        ensembl_version: passed to the EnsemblSQL calls.

    Calls sys.exit() when the junction or RNASeq post-processing raises IOError.
    """
    if '|' in array_type: array_type, specific_array_type = string.split(array_type,'|') ### To distinguish between array sub-types, like the HJAY and hGlue
    else: specific_array_type = array_type
    
    ### 'update_all' is a shortcut that enables every major step (it does not touch update_miR_seq)
    if update_all == 'yes':
        update_uniprot='yes'; update_ensembl='yes'; update_probeset_to_ensembl='yes'; update_domain='yes'; update_miRs = 'yes'
        
    if update_ensembl == 'yes':
        from build_scripts import EnsemblSQL; reload(EnsemblSQL)

        """ Used to grab all essential Ensembl annotations previously obtained via BioMart"""        
        configType = 'Advanced'; analysisType = 'AltAnalyzeDBs'; externalDBName = ''
        EnsemblSQL.buildEnsemblRelationalTablesFromSQL(species,configType,analysisType,externalDBName,ensembl_version,force)
        
        """ Used to grab Ensembl-to-External gene associations"""
        configType = 'Basic'; analysisType = 'ExternalOnly'; externalDBName = 'Uniprot/SWISSPROT'
        EnsemblSQL.buildEnsemblRelationalTablesFromSQL(species,configType,analysisType,externalDBName,ensembl_version,force)
        
        """ Used to grab Ensembl full gene sequence plus promoter and 3'UTRs """
        if array_type == 'AltMouse' or array_type == 'junction' or array_type == 'RNASeq':
            EnsemblSQL.getFullGeneSequences(ensembl_version,species)
            
    if update_uniprot == 'yes':            
        ### Might need to delete the existing versions of downloaded databases or force download
        buildUniProtFunctAnnotations(species,force)
                
    ### Build probeset/junction-to-Ensembl annotations with the builder matching the platform
    if update_probeset_to_ensembl == 'yes':
        if species == 'Mm' and array_type == 'AltMouse':
            buildAltMouseExonAnnotations(species,array_type,force,genomic_build)
        elif array_type == 'junction':
            buildJunctionExonAnnotations(species,array_type,specific_array_type,force,genomic_build)
        elif array_type == 'RNASeq':
            ### NOTE(review): RNASeq is imported as a top-level module, unlike the build_scripts imports used elsewhere in this function - confirm this is intended
            import RNASeq; test_status = 'no'; data_type = 'mRNA'
            RNASeq.getEnsemblAssociations(species,data_type,test_status,force)
        else: buildExonArrayExonAnnotations(species,array_type,force)

    if update_domain == 'yes':
        if array_type == 'RNASeq':
            only_rely_on_coordinate_mapping = True ### This will provide more accurate results as many junctions have missing sequences
        else:
            only_rely_on_coordinate_mapping = False

        from build_scripts import FeatureAlignment
        from build_scripts import JunctionArray
        from build_scripts import mRNASeqAlign
        from build_scripts import IdentifyAltIsoforms
        
        ### Get UCSC associations for all Ensembl linked genes (download databases if necessary)
        mRNA_Type = 'mrna'; run_from_scratch = 'yes'
        export_all_associations = 'yes' ### YES only for protein prediction analysis
        buildUCSCAnnoationFiles(species,mRNA_Type,export_all_associations,run_from_scratch,force)

        if (species == 'Mm' and array_type == 'AltMouse'):
            """Imports and re-exports array-Ensembl annotations"""
            ### The return value is discarded immediately; the call is made for its side effects only
            null = JunctionArray.importArrayAnnotations(species,array_type); null={}
        if (species == 'Mm' and array_type == 'AltMouse') or array_type == 'junction' or array_type == 'RNASeq':
            if only_rely_on_coordinate_mapping == False:
                """Performs probeset sequence aligment to Ensembl and UCSC transcripts. To do: Need to setup download if files missing"""
                analysis_type = 'reciprocal'
                mRNASeqAlign.alignProbesetsToTranscripts(species,array_type,analysis_type,force)
    
        run_seqcomp = 'no'
        ### Sequence-based pass; skipped entirely for RNASeq, where only_rely_on_coordinate_mapping is True
        if only_rely_on_coordinate_mapping == False:
            IdentifyAltIsoforms.runProgram(species,array_type,'null',force,run_seqcomp)
            FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'null')
        
        if array_type == 'junction' or array_type == 'RNASeq':
            if only_rely_on_coordinate_mapping == False:
                ### For junction probeset sequences from mRNASeqAlign(), find and assess alternative proteins - export to the folder 'junction'
                mRNASeqAlign.alignProbesetsToTranscripts(species,array_type,'single',force)
                IdentifyAltIsoforms.runProgram(species,array_type,'junction',force,run_seqcomp)
                FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'junction')
                ### For exon probesets (and junction exons) align and assess alternative proteins - export to the folder 'exon'
                IdentifyAltIsoforms.runProgram(species,array_type,'exon',force,run_seqcomp)
                FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'exon') # not needed
            
            """ Repeat above with CoordinateBasedMatching = True """ 
            ### Perform coordinate based junction mapping to transcripts (requires certain sequence files built in IdentifyAltIsoforms)
            analysis_type = 'reciprocal'
            mRNASeqAlign.alignProbesetsToTranscripts(species,array_type,analysis_type,force,CoordinateBasedMatching = True)
            IdentifyAltIsoforms.runProgram(species,array_type,'null',force,run_seqcomp)
            FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'null')
            mRNASeqAlign.alignProbesetsToTranscripts(species,array_type,'single',force,CoordinateBasedMatching = True)
            IdentifyAltIsoforms.runProgram(species,array_type,'junction',force,run_seqcomp)
            FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'junction')
            IdentifyAltIsoforms.runProgram(species,array_type,'exon',force,run_seqcomp)
            if array_type == 'RNASeq':
                JunctionArray.combineExonJunctionAnnotations(species,array_type)
                
    if update_miRs == 'yes':
        if update_miR_seq == 'yes':
            from build_scripts import MatchMiRTargetPredictions; only_add_sequence_to_previous_results = 'no'
            MatchMiRTargetPredictions.runProgram(species,force,only_add_sequence_to_previous_results)

        ### Each sequence module is run twice: once with 'strict' and once with 'lax' stringency
        if array_type == 'exon' or array_type == 'gene':        
            from build_scripts import ExonSeqModule
            stringency = 'strict'; process_microRNA_predictions = 'yes'; mir_source = 'multiple'
            ExonSeqModule.runProgram(species,array_type,process_microRNA_predictions,mir_source,stringency)
            stringency = 'lax'
            ExonSeqModule.runProgram(species,array_type,process_microRNA_predictions,mir_source,stringency)
            ### NOTE(review): ExonArray is not imported inside this function - presumably a module-level import; confirm
            ExonArray.exportMetaProbesets(array_type,species) ### Export metaprobesets for this build
        else:
            from build_scripts  import JunctionSeqModule
            stringency = 'strict'; mir_source = 'multiple'
            JunctionSeqModule.runProgram(species,array_type,mir_source,stringency,force)
            stringency = 'lax'
            JunctionSeqModule.runProgram(species,array_type,mir_source,stringency,force)

    ### Junction-array post-processing; runs regardless of the update flags and exits the program on IOError
    if array_type == 'junction':
        try:
            from build_scripts import JunctionArray; from build_scripts import JunctionArrayEnsemblRules
            JunctionArray.filterForCriticalExons(species,array_type)
            JunctionArray.overRideExonEntriesWithJunctions(species,array_type)
            JunctionArrayEnsemblRules.annotateJunctionIDsAsExon(species,array_type)
            ### NOTE(review): ExonArray is not imported inside this function - presumably a module-level import; only IOError is caught here
            ExonArray.exportMetaProbesets(array_type,species) ### Export metaprobesets for this build
        except IOError: print 'No built junction files to analyze';sys.exit()
    if array_type == 'RNASeq' and (species == 'Hs' or species == 'Mm' or species == 'Rn'):
        from build_scripts import JunctionArray; from build_scripts import JunctionArrayEnsemblRules
        try: JunctionArrayEnsemblRules.annotateJunctionIDsAsExon(species,array_type)
        except IOError: print 'No Ensembl_exons.txt file to analyze';sys.exit()
    
    ### Best-effort copy of the combined miR target file into the 'ensembl/<species>' location; any failure is silently ignored
    try:
        filename = 'AltDatabase/'+species+'/SequenceData/miRBS-combined_gene-targets.txt'; ef=filepath(filename)
        er = string.replace(ef,species+'/SequenceData/miRBS-combined_gene-targets.txt','ensembl/'+species+'/'+species+'_microRNA-Ensembl.txt')
        import shutil; shutil.copyfile(ef,er)
    except Exception: null=[]
    if array_type != 'RNASeq':
        ### Get the probeset-probe relationships from online - needed for FIRMA analysis
        filename = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_probeset-probes.txt'
        ### The substring test 'lue' selects junction sub-types whose name contains it (presumably hGlue, per the server folder used below)
        if array_type == 'junction' and 'lue' in specific_array_type:
            server_folder = 'junction/hGlue'
            verifyFile(filename,server_folder) ### Will force download if missing
            verifyFile('AltDatabase/'+species+'/'+array_type+'/platform.txt',server_folder) ### Will force download if missing
        elif array_type != 'AltMouse': verifyFile(filename,array_type) ### Will force download if missing
        if (array_type == 'exon' or array_type == 'AltMouse') and species != 'Rn':
            try:
                ### Available for select exon-arrays and AltMouse
                probeset_to_remove_file = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_probes_to_remove.txt'
                verifyFile(probeset_to_remove_file,array_type)
            ### The file is optional, so any verification failure is deliberately ignored
            except Exception: null=[]
Esempio n. 3
0
def reAnalyzeRNAProbesetMatches(align_files, species, array_type,
                                pairwise_probeset_combinations):
    """Import matching and non-matching probesets and export the valid comparisons"""
    align_files2 = []
    for file in align_files:
        if array_type in file: align_files2.append(file)
    align_files = align_files2

    matching = {}
    not_matching = {}
    for filename in align_files:
        print 'Reading', filename
        start_time = time.time()
        fn = filepath(filename)
        for line in open(fn, 'rU').xreadlines():
            values = string.replace(line, '\n', '')
            probeset, call, accession = string.split(values, '\t')
            if call == '1':
                try:
                    matching[probeset].append(accession)
                except KeyError:
                    matching[probeset] = [accession]
            else:
                try:
                    not_matching[probeset].append(accession)
                except KeyError:
                    not_matching[probeset] = [accession]

    probeset_matching_pairs = {}
    matching_in_both = 0
    match_and_null = 0
    no_matches = 0
    no_nulls = 0
    for (probeset1, probeset2) in pairwise_probeset_combinations:
        if probeset1 in matching and probeset2 in matching:
            matching[probeset1].sort()
            matching[probeset2].sort()
            match1 = string.join(matching[probeset1], '|')
            match2 = string.join(matching[probeset2], '|')
            if match1 != match2:
                probeset_matching_pairs[probeset1 + '|' +
                                        probeset2] = [match1, match2]
            matching_in_both += 1
        else:
            if probeset1 in matching and probeset1 in not_matching:
                match = string.join(matching[probeset1], '|')
                null_match = string.join(
                    filterNullMatch(not_matching[probeset1],
                                    matching[probeset1]), '|')
                probeset_matching_pairs[probeset1] = [match, null_match]
                match_and_null += 1
            elif probeset2 in matching and probeset2 in not_matching:
                match = string.join(matching[probeset2], '|')
                null_match = string.join(
                    filterNullMatch(not_matching[probeset2],
                                    matching[probeset2]), '|')
                probeset_matching_pairs[probeset2] = [match, null_match]
                match_and_null += 1
            elif probeset1 in matching or probeset2 in matching:
                no_nulls += 1
            else:
                no_matches += 1
                #if no_matches<10: print probeset1,probeset2

    print matching_in_both, "probeset pairs with matching isoforms for both recipricol probesets."
    print match_and_null, "probeset pairs with a match for one and null for that one."
    print no_nulls, "probeset pairs with only one match."
    print no_matches, "probeset pairs with no matches."

    from build_scripts import IdentifyAltIsoforms
    export_file = 'AltDatabase/' + species + '/' + array_type + '/' + species + '_all-transcript-matches.txt'
    if analysis_type == 'single':
        export_file = 'AltDatabase/' + species + '/' + array_type + '/junction/' + species + '_all-transcript-matches.txt'
    IdentifyAltIsoforms.exportSimple(probeset_matching_pairs, export_file, '')