Example #1
	def runLizsSet(PredictionSet, ProtocolID):
		raise colortext.Exception("Do you really want to run this?")
		colortext.printf("\nAdding Liz's data set to %s prediction set." % PredictionSet, "lightgreen")
		KeepHETATMLines = False
		FilterTester.openDB()

		# Filter by the LizKellogg set of experiments
		er1 = ExperimentResultSet(ddGdb)
		ef1 = ExperimentFilter()
		ef1.setSource(ExperimentFilter.LizKellogg)
		er1.addFilter(ef1)
		FilterTester.test(er1)

		experimentIDs = sorted(list(er1.getFilteredIDs()))
		colortext.message("\nThe number of unique experiments is %d.\n" % len(experimentIDs))
		ddG_connection = db.ddG()
		count = 0
		for experimentID in experimentIDs:
			ddG_connection.addPrediction(experimentID, PredictionSet, ProtocolID, KeepHETATMLines, StoreOutput = True)
			count += 1
			if count >= 10:
				colortext.write(".")
				colortext.flush()
				count = 0
		print("")
Example #2
	def addLinsJobs(PredictionSet, ProtocolID):
		raise colortext.Exception("Do you really want to run this?")
		colortext.printf("\nAdding Lin's mutations to %s prediction set." % PredictionSet, "lightgreen")
		KeepHETATMLines = False
		FilterTester.openDB()

		# Filter by the DummySource set of experiments
		er1 = ExperimentResultSet(ddGdb)
		ef1 = ExperimentFilter()
		ef1.setSource(ExperimentFilter.DummySource)
		er1.addFilter(ef1)

		# Filter by the particular PDB
		sr = StructureResultSet(ddGdb, 'WHERE PDB_ID="3K0NB_lin"')
		er1 = ExperimentResultSet.fromIDs(ddGdb, er1.getFilteredIDs()).filterBySet(sr)
		FilterTester.test(er1)

		experimentIDs = sorted(list(er1.getFilteredIDs()))
		colortext.message("\nThe number of unique experiments is %d.\n" % len(experimentIDs))
		ddG_connection = db.ddG()
		count = 0
		for experimentID in experimentIDs:
			ddG_connection.addPrediction(experimentID, PredictionSet, ProtocolID, KeepHETATMLines, StoreOutput = True)
			count += 1
			if count >= 10:
				colortext.write(".")
				colortext.flush()
				count = 0
		print("")
Example #3
def check_JSON_dataset(dataset_ID):
    # PDB IDs get substituted elsewhere, so this function does a quick sanity check that the mutations still look okay (it does not guarantee that they are correct)

    colortext.message('Reading PDB IDs...')
    PDB_ids = set([record['PDBFileID'] for record in JSON_datasets[dataset_ID]['data']])

    colortext.message('Loading %s PDBs...' % len(PDB_ids))
    for PDB_id in PDB_ids:
        if not(cached_pdbs.get(PDB_id)):
            print('Reading %s' % PDB_id)
            colortext.write('.', 'yellow')
            sys.stdout.flush()
            cached_pdbs[PDB_id] = PDB(ddGdb.execute_select('SELECT Content FROM PDBFile WHERE ID=%s', parameters=(PDB_id,))[0]['Content'])
    print('')

    count = 0
    for record in JSON_datasets[dataset_ID]['data']:
        pdb_id = record['PDBFileID']
        p = cached_pdbs[pdb_id]
        #colortext.printf('pdb_id', color='cyan')
        #pprint.pprint(record)
        #pprint.pprint(record['Mutations'])
        for m in record['Mutations']:
            chain_id = m['Chain']
            residue_id = m['ResidueID']
            residue_aa = m['WildTypeAA']
            padded_id = ChainResidueID2String(chain_id, residue_id)
            if p.atom_sequences[chain_id][padded_id].ResidueAA != residue_aa:
                print(pdb_id, chain_id, residue_id, residue_aa)
                print(p.atom_sequences[chain_id][padded_id].ResidueAA, residue_aa)
            assert(p.atom_sequences[chain_id][padded_id].ResidueAA == residue_aa)
        count += 1
    print('Successfully checked %d datapoints.' % count)
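The cached_pdbs dictionary above acts as a simple load-on-miss cache keyed by PDB ID. A standalone sketch of the same pattern, with a hypothetical load_pdb_content function standing in for the database query:

    cached_pdbs = {}

    def load_pdb_content(pdb_id):
        # Hypothetical loader; the real code selects Content from the PDBFile table.
        return 'placeholder PDB content for %s' % pdb_id

    def get_pdb(pdb_id):
        # Return the cached entry, loading and storing it on a miss.
        if pdb_id not in cached_pdbs:
            cached_pdbs[pdb_id] = load_pdb_content(pdb_id)
        return cached_pdbs[pdb_id]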
Example #4
	def showAllEligibleProTherm(PredictionSet, ProtocolID, KeepHETATMLines):
		#inserter = JobInserter()
		colortext.printf("\nAdding ProTherm mutations to %s prediction set." % PredictionSet, "lightgreen")
		#ddGdb = dbi.ddGDatabase()
		
		MAX_RESOLUTION = 2.1
		MAX_NUMRES_PROTHERM = 350
		MAX_STANDARD_DEVIATION = 1.0

		FilterTester.openDB()
		
		if False:
			t1 = time.time()
			er1 = ExperimentResultSet(ddGdb)
			er1.addFilter(ExperimentFilter.OnSource(ExperimentFilter.ProTherm))
			er1.addFilter(ExperimentFilter.NumberOfMutations(1, 1))
			er1.addFilter(ExperimentFilter.NumberOfChains(1, 1))
			er1.addFilter(ExperimentFilter.StandardDeviation(None, MAX_STANDARD_DEVIATION))
			er1.addFilter(StructureFilter.Resolution(None, MAX_RESOLUTION))
			er1.addFilter(StructureFilter.Techniques(StructureFilter.XRay))
			FilterTester.test(er1)
			t2 = time.time()
			print(t2 - t1)
		
		# This method usually takes around 65% of the time taken by the method above
		t1 = time.time()
		ef1 = ExperimentFilter()
		ef1.setSource(ExperimentFilter.ProTherm)
		er1 = ExperimentResultSet(ddGdb)
		er1.addFilter(ExperimentFilter.OnSource(ExperimentFilter.ProTherm))
		FilterTester.test(er1)
		ef1.setNumberOfMutations(1, 1)
		ef1.setNumberOfChains(1, 1)
		ef1.setStandardDeviation(None, MAX_STANDARD_DEVIATION)
		sf1 = StructureFilter()
		sf1.setResolution(None, MAX_RESOLUTION)
		sf1.setTechniques(StructureFilter.XRay)
		er1 = ExperimentResultSet(ddGdb)
		er1.addFilter(ef1)
		er1.addFilter(sf1)
		FilterTester.test(er1)
		t2 = time.time()
		print(t2 - t1)
		
		experimentIDs = sorted(list(er1.getFilteredIDs()))
		colortext.message("\nThe number of unique ProTherm experiments with:\n\t- one mutation;\n\t- structures solved by X-ray diffraction and with <= %d residues;\n\t- a maximum standard deviation in experimental results of <= %0.2f;\n\t- and a resolution of <= %0.2f Angstroms.\nis %d.\n" % (MAX_NUMRES_PROTHERM, MAX_STANDARD_DEVIATION, MAX_RESOLUTION, len(experimentIDs)))
		ddG_connection = db.ddG()
		count = 0
		sys.exit(0)
		print("")
		for experimentID in experimentIDs:
			ddG_connection.addPrediction(experimentID, PredictionSet, ProtocolID, KeepHETATMLines, StoreOutput = True)
			count += 1
			if count >= 10:
				colortext.write(".")
				colortext.flush()
				count = 0
		print("")
Example #5
    def __init__(self, UniProtAC, XML = None, cache_dir = None, silent = True):
        if cache_dir and not(os.path.exists(cache_dir)):
            raise Exception("The cache directory %s does not exist." % cache_dir)

        self.UniProtAC = UniProtAC
        self.silent = silent

        # Get XML
        cached_filepath = None
        if XML == None:
            protein_xml = None
            if cache_dir:
                cached_filepath = os.path.join(cache_dir, '%s.xml' % UniProtAC)
            if cached_filepath and os.path.exists(cached_filepath):
                protein_xml = read_file(cached_filepath)
            else:
                if not silent:
                    colortext.write("Retrieving %s\n" % UniProtAC, "cyan")
                url = 'http://www.uniprot.org/uniprot/%s.xml' % UniProtAC
                protein_xml = http_get(url)
                if not(protein_xml.strip()):
                    raise EmptyUniProtACXMLException('The file %s is empty.' % UniProtAC)
                if cached_filepath:
                    write_file(cached_filepath, protein_xml)
            self.XML = protein_xml
        else:
            self.XML = XML

        self.recommended_name = None
        self.submitted_names = []
        self.alternative_names = []

        # Get DOM
        try:
            self._dom = parseString(self.XML)
        except:
            if cached_filepath:
                raise Exception("The UniProtAC XML for '%s' was invalid. The cached file is located at %s. Check this file - if it is not valid XML then delete the file and rerun the script." % (UniProtAC, cached_filepath))
            else:
                raise Exception("The UniProtAC XML for '%s' was invalid." % UniProtAC)
        main_tags = self._dom.getElementsByTagName("uniprot")
        assert(len(main_tags) == 1)
        entry_tags = main_tags[0].getElementsByTagName("entry")
        assert(len(entry_tags) == 1)
        self.entry_tag = entry_tags[0]

        self._parse_evidence_tag()
        self._parse_sequence_tag()
        self._parse_protein_tag()
        self._parse_organism_tag()
        self._parse_subsections()
        self._parse_PDB_mapping()
Example #6
def test_pdbml_speed():

    test_cases = [
        '1WSY',
        '1YGV',
        '487D',
        '1HIO',
        '1H38',
        '3ZKB',
    ]
    for test_case in test_cases:
        print("\n")

        colortext.message("Creating PDBML object for %s" % test_case)
        #PDBML.retrieve(test_case, cache_dir = cache_dir)

        print("")
        colortext.printf("Using the old minidom class", color = 'cyan')
        t1 = time.clock()
        p_minidom = PDBML_slow.retrieve(test_case, cache_dir = cache_dir)
        t2 = time.clock()
        colortext.message("Done in %0.2fs!" % (t2 - t1))

        print("")
        colortext.printf("Using the new sax class", color = 'cyan')
        t1 = time.clock()
        p_sax = PDBML.retrieve(test_case, cache_dir = cache_dir)
        t2 = time.clock()
        colortext.message("Done in %0.2fs!" % (t2 - t1))

        colortext.write("\nEquality test: ", color = 'cyan')
        try:
            assert(p_minidom.atom_to_seqres_sequence_maps.keys() == p_sax.atom_to_seqres_sequence_maps.keys())
            for c, s_1 in p_minidom.atom_to_seqres_sequence_maps.iteritems():
                s_2 = p_sax.atom_to_seqres_sequence_maps[c]
                assert(str(s_1) == str(s_2))
            colortext.message("passed\n")
        except:
            colortext.error("failed\n")
Example #7
    def _get_XML(self):
        uparc_xml = None
        cached_filepath = None
        if self.cache_dir:
            cached_filepath = os.path.join(self.cache_dir, '%s.xml' % self.UniParcID)
        if cached_filepath and os.path.exists(cached_filepath):
            uparc_xml = read_file(cached_filepath)
        else:
            if not self.silent:
                colortext.write("Retrieving %s\n" % self.UniParcID, "cyan")
            url = 'http://www.uniprot.org/uniparc/%s.xml' % self.UniParcID
            uparc_xml = http_get(url)
            if cached_filepath:
                write_file(cached_filepath, uparc_xml)
        self.XML = uparc_xml

        # Get DOM
        self._dom = parseString(uparc_xml)
        main_tags = self._dom.getElementsByTagName("uniparc")
        assert(len(main_tags) == 1)
        entry_tags = main_tags[0].getElementsByTagName("entry")
        assert(len(entry_tags) == 1)
        self.entry_tag = entry_tags[0]
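The read-from-cache-or-fetch-and-cache pattern used in _get_XML (and in the UniProtACEntry constructor above) can be sketched on its own; fetch_url below is a hypothetical stand-in for the http_get helper these examples rely on:

    import os

    def fetch_url(url):
        # Hypothetical stand-in for the http_get helper used in these examples.
        raise NotImplementedError('plug in your HTTP client here')

    def get_cached_or_fetch(url, cached_filepath = None):
        # Return the cached file contents if present; otherwise fetch and cache them.
        if cached_filepath and os.path.exists(cached_filepath):
            with open(cached_filepath) as f:
                return f.read()
        content = fetch_url(url)
        if cached_filepath:
            with open(cached_filepath, 'w') as f:
                f.write(content)
        return content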
def main(prediction_ids = None, memory_free='3.0G', cfg = None):
    # This uses the version of Rosetta from your cluster template settings file
    settings = parse_settings.get_dict()
    rosetta_scripts_path = settings['local_rosetta_installation_path'] + '/source/bin/' + 'rosetta_scripts' + settings['local_rosetta_binary_type']
    ppi_api = get_interface_with_config_file(rosetta_scripts_path = rosetta_scripts_path, rosetta_database_path = '/home/kyleb/rosetta/working_branches/alascan/database')

    t1, t2 = None, None

    # Read the keep_hetatm_lines optional setting
    keep_hetatm_lines = False
    keep_all_lines = False
    try: keep_hetatm_lines = cfg.keep_hetatm_lines
    except: colortext.warning('Note: keep_hetatm_lines is not specified in {0}. Defaulting to {1}.'.format(sys.argv[1], keep_hetatm_lines))
    try: keep_all_lines = cfg.keep_all_lines
    except: colortext.warning('Note: keep_all_lines is not specified in {0}. Defaulting to {1}.'.format(sys.argv[1], keep_all_lines))

    prediction_set_id = cfg.prediction_set_id

    if prediction_ids == None:
        assert( len(sys.argv) > 1 )
        cfg = importlib.import_module(sys.argv[1], package=None)

        protocol_name = cfg.protocol_name

        suppress_warnings = True

        if not ppi_api.prediction_set_exists(prediction_set_id):
            print 'Creating new prediction set:', prediction_set_id
            t1 = time.time()
            ppi_api.add_prediction_set(prediction_set_id, halted = True, priority = 7, allow_existing_prediction_set = False, description = cfg.prediction_set_description)

            # Populate the prediction set with jobs from a (tagged subset of a) user dataset
            print 'Created PredictionSet:', prediction_set_id
            ppi_api.add_prediction_run(prediction_set_id, cfg.user_dataset_name, keep_all_lines = keep_all_lines, keep_hetatm_lines = keep_hetatm_lines, tagged_subset = cfg.tagged_subset, extra_rosetta_command_flags = '-ignore_zero_occupancy false -ignore_unrecognized_res', show_full_errors = True, suppress_warnings = suppress_warnings)
            t2 = time.time()

        existing_job = False
        end_job_name  = '%s_%s' % (getpass.getuser(), prediction_set_id)
        if not os.path.exists(job_output_directory):
            os.makedirs(job_output_directory)

        for d in os.listdir(job_output_directory):
            if os.path.isdir(os.path.join(job_output_directory, d)) and end_job_name in d:
                print 'Found existing job:', d
                job_name = d
                existing_job = True
        if not existing_job:
            job_name = '%s-%s' % (time.strftime("%y%m%d"), end_job_name)

            ppi_api.add_development_protocol_command_lines(
                prediction_set_id, protocol_name, 'minimize_with_cst', ''
            )
            # 2x because bugs
            ppi_api.add_development_protocol_command_lines(
                prediction_set_id, protocol_name, 'minimize_with_cst', ''
            )

        prediction_ids = sorted(ppi_api.get_prediction_ids(prediction_set_id))
        output_dir = os.path.join(job_output_directory, job_name )
    else:
        # Prediction_ids passed in
        job_name = '%s-%s_%s-rerun' % (time.strftime("%y%m%d"), getpass.getuser(), prediction_set_id)

        output_dir = os.path.join(job_output_directory, job_name )
        if os.path.isdir(output_dir):
            shutil.rmtree(output_dir)
        existing_job = False

    settings['scriptname'] = prediction_set_id + '_run'
    settings['tasks_per_process'] = 5
    settings['mem_free'] = memory_free
    settings['output_dir'] = output_dir
    settings['rosetta_args_list'] = [
        '-in:file:fullatom',
        '-ignore_zero_occupancy false',
        '-ignore_unrecognized_res',
        '-fa_max_dis 9.0',
        '-ddg::harmonic_ca_tether 0.5',
        '-ddg::constraint_weight 1.0',
        '-ddg::out_pdb_prefix min_cst_0.5',
        '-ddg::sc_min_only false',
    ]
    settings['rosetta_args_list'].extend(cfg.extra_flags)
    print settings['rosetta_args_list']

    # Now get run settings from database and save to pickle file
    job_dict = {}
    output_data_dir = os.path.join(settings['output_dir'], 'data')

    if not os.path.isdir(output_data_dir):
        os.makedirs(output_data_dir)

    if t1 != None and t2 != None and len(prediction_ids) != 0:
        print('Time taken for {0} predictions: {1}s ({2}s per prediction).'.format(len(prediction_ids), t2-t1, (t2-t1)/len(prediction_ids)))
    print('File cache statistics:')
    pprint.pprint(ppi_api.get_file_content_cache_stats())
    settings['numjobs'] = len(prediction_ids)
    app_name = 'minimize_with_cst'
    settings['appname'] = app_name

    print('')

    t1 = time.time()

    # Progress counter setup
    colortext.message('Creating input data for %d predictions.' % (len(prediction_ids)))
    count, records_per_dot = 0, 50
    print("|" + ("*" * (int(len(prediction_ids)/records_per_dot)-2)) + "|")
    for prediction_id in prediction_ids:
        # Progress counter
        count += 1
        if count % records_per_dot == 0: colortext.write(".", "cyan", flush = True)

        # Check if job already ran
        prediction_id_dir = os.path.join(output_dir, str(prediction_id))
        if existing_job:
            if os.path.isdir( prediction_id_dir ):
                pdb_output_files = [x for x in os.listdir( prediction_id_dir ) if '.pdb' in x]
            else:
                pdb_output_files = []
            if len(pdb_output_files) >= 1:
                print 'Skipping', prediction_id
                settings['numjobs'] = settings['numjobs'] - 1
                continue
            if os.path.isdir(prediction_id_dir):
                print 'Job directory %s already exists, deleting' % prediction_id_dir
                shutil.rmtree(prediction_id_dir)
            # else:
            #     print 'Creating new job directory %s' % prediction_id_dir

        job_data_dir = os.path.join(output_data_dir, str(prediction_id))

        # Allow us to resume from an interrupted setup
        truncate_content = None
        all_files_exist = os.path.exists(job_data_dir) and os.path.exists(os.path.join(job_data_dir, '.ready'))
        if all_files_exist:
            truncate_content = 0

        job_details = ppi_api.get_job_details(prediction_id, truncate_content = truncate_content)
        file_tuples = [] # List of (file name, file contents) tuples
        for file_info in job_details['Files']['Input']:
            file_tuples.append( (file_info['Filename'], file_info['Content']) )
        substitution_parameters = json.loads(job_details['JSONParameters'])

        # Scrub the folder
        if not all_files_exist:
            if os.path.isdir(job_data_dir):
                shutil.rmtree(job_data_dir)
            os.makedirs(job_data_dir)

        files_dict = {} # Maps file name to its path relative to the output directory
        for file_name, file_contents in file_tuples:
            new_file_location = os.path.join(job_data_dir, file_name)
            if not all_files_exist:
                if '.pdb' in file_name:
                    if keep_hetatm_lines or keep_all_lines:
                        write_file(new_file_location, file_contents)
                    else:
                        write_file(new_file_location, '\n'.join([l for l in file_contents.split('\n') if l.startswith('ATOM')]))
                else:
                    with open(new_file_location, 'w') as f:
                        f.write(file_contents)
            files_dict[file_name] = os.path.relpath(new_file_location, settings['output_dir'])
        if not all_files_exist:
            write_file(os.path.join(job_data_dir, '.ready'), '')

        argdict = {
            'input_file_list' : [files_dict[substitution_parameters['%%input_pdb%%']]],
        }
        for file_name, file_location in files_dict.iteritems():
            if 'params' in file_name:
                argdict['-extra_res_fa'] = file_location
        job_dict[prediction_id] = argdict
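        # At this point argdict maps option names to input paths, e.g. (hypothetical filenames):
        #   {'input_file_list': ['data/12345/input.pdb'], '-extra_res_fa': 'data/12345/ligand.params'}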


    t2 = time.time()

    print('')
    if count != 0:
        print('Time taken for {0} predictions: {1}s ({2}s per prediction).'.format(count, t2-t1, (t2-t1)/count))
    print('File cache statistics:')
    pprint.pprint(ppi_api.get_file_content_cache_stats())

    print('')
    if len(job_dict) > 0:
        write_run_file(settings, database_run = False, job_dict = job_dict)
        print 'Job files written to directory:', os.path.abspath(output_dir)
    else:
        print 'No tasks to process, not writing job files'
    pprint.pprint(ppi_api.get_file_content_cache_stats())
    settings['numjobs'] = len(prediction_ids)
    settings['appname'] = None

    print('')

    t1 = time.time()

    # Progress counter setup
    colortext.message('Creating input data for %d predictions.' % (len(prediction_ids)))
    count, records_per_dot = 0, 50
    print("|" + ("*" * (int(len(prediction_ids)/records_per_dot)-2)) + "|")
    for prediction_id in prediction_ids:
        # Progress counter
        count += 1
        if count % records_per_dot == 0: colortext.write(".", "cyan", flush = True)

        # Check if job already ran
        prediction_id_dir = os.path.join(output_dir, str(prediction_id))
        if existing_job:
            if os.path.isdir( prediction_id_dir ):
                pdb_output_files = [x for x in os.listdir( prediction_id_dir ) if '.pdb' in x]
            else:
                pdb_output_files = []
            if len(pdb_output_files) >= 1:
                print 'Skipping', prediction_id
                settings['numjobs'] = settings['numjobs'] - 1
                continue
            if os.path.isdir(prediction_id_dir):
                print 'Job directory %s already exists, deleting' % prediction_id_dir
                shutil.rmtree(prediction_id_dir)
Example #10
    def __init__(self, UniParcID, UniProtACs = None, UniProtIDs = None, cache_dir = None, silent = False):
        if cache_dir and not(os.path.exists(os.path.abspath(cache_dir))):
            raise Exception("The cache directory %s does not exist." % os.path.abspath(cache_dir))
        self.UniParcID = UniParcID
        self.cache_dir = cache_dir
        self.recommended_name = None
        self.silent = silent

        # Get AC mapping
        if not UniProtACs or UniParcID=='UPI0000047CA3': # todo: is this UPI0000047CA3 special handling necessary?
            mapping = uniprot_map('UPARC', 'ACC', [UniParcID], cache_dir = cache_dir, silent = silent)[UniParcID]
            self.UniProtACs = mapping
        else:
            self.UniProtACs = UniProtACs

        # Get ID mapping
        if not UniProtIDs:
            mapping = uniprot_map('UPARC', 'ID', [UniParcID], cache_dir = cache_dir, silent = silent)[UniParcID]
            self.UniProtIDs = mapping
        else:
            self.UniProtIDs = UniProtIDs

        # Get FASTA
        cached_filepath = None
        if cache_dir:
            cached_filepath = os.path.join(cache_dir, '%s.fasta' % UniParcID)
        if cached_filepath and os.path.exists(cached_filepath):
            fasta = read_file(cached_filepath)
        else:
            if not silent:
                print("Getting FASTA file")
            url = 'http://www.uniprot.org/uniparc/%s.fasta' % UniParcID
            fasta = http_get(url)
            if cached_filepath:
                write_file(cached_filepath, fasta)

        # Get sequence
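        # UniParc FASTA files have a single header line of the form ">UPIxxxxxxxxxx status=...", followed by the wrapped sequence.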
        header = fasta.split("\n")[0].split()
        assert(len(header) == 2)
        assert(header[0] == ">%s" % UniParcID)
        assert(header[1].startswith("status="))
        sequence = "".join(map(string.strip, fasta.split("\n")[1:]))
        self.sequence = sequence

        # Get atomic mass (and sequence again)
        self.atomic_mass = None
        self.CRC64Digest = None
        recommended_names = []
        alternative_names = []
        submitted_names = []

        self.AC_entries = {}
        subsections = ProteinSubsectionHolder(len(sequence))

        for UniProtAC in self.UniProtACs:
            #colortext.write("%s\n" % UniProtAC, 'cyan')
            try:
                AC_entry = UniProtACEntry(UniProtAC, cache_dir = self.cache_dir, silent = silent)
            except EmptyUniProtACXMLException:
                continue
            self.AC_entries[UniProtAC] = AC_entry

            # Mass sanity check
            if self.atomic_mass != None:
                assert(self.atomic_mass == AC_entry.atomic_mass)
            self.atomic_mass = AC_entry.atomic_mass

            # Sequence sanity check
            assert(self.sequence == AC_entry.sequence)
            # CRC 64 sanity check
            if self.CRC64Digest != None:
                assert(self.CRC64Digest == AC_entry.CRC64Digest)
            self.CRC64Digest = AC_entry.CRC64Digest
            assert(CRC64.CRC64digest(self.sequence) == self.CRC64Digest)

            if AC_entry.recommended_name:
                found = False
                for n in recommended_names:
                    if n[0] == AC_entry.recommended_name:
                        n[1] += 1
                        found = True
                        break
                if not found:
                    recommended_names.append([AC_entry.recommended_name, 1])

            for alternative_name in AC_entry.alternative_names:
                found = False
                for n in alternative_names:
                    if n[0] == alternative_name:
                        n[1] += 1
                        found = True
                        break
                if not found:
                    alternative_names.append([alternative_name, 1])

            for submitted_name in AC_entry.submitted_names:
                found = False
                for n in submitted_names:
                    if n[0] == submitted_name:
                        n[1] += 1
                        found = True
                        break
                if not found:
                    submitted_names.append([submitted_name, 1])

            subsections += AC_entry.subsections
        self.subsections = subsections

        assert(len(set(UniParcMergedRecommendedNamesRemap.keys()).intersection(set(UniParcMergedSubmittedNamesRemap.keys()))) == 0)
        if UniParcID in UniParcMergedRecommendedNamesRemap:
            recommended_names = [[UniParcMergedRecommendedNamesRemap[UniParcID], 1]]
        elif UniParcID in UniParcMergedSubmittedNamesRemap:
            recommended_names = [[UniParcMergedSubmittedNamesRemap[UniParcID], 1]]

        if not silent:
            colortext.write('Subsections\n', 'orange')
        #print(subsections)

        if len(recommended_names) == 0 and len(alternative_names) == 0 and len(submitted_names) == 0:
            raise UniParcEntryStandardizationException("UniParcID %s has no recommended names." % UniParcID)
        elif len(recommended_names) == 0:
            s = ["UniParcID %s has no recommended names.\n" % UniParcID]
            if alternative_names:
                s.append("It has the following alternative names:")
                for tpl in sorted(alternative_names, key=lambda x:-x[1]):
                    s.append("\n  count=%s: %s" % (str(tpl[1]).ljust(5), tpl[0]['Name']))
                    if tpl[0]['Short names']:
                        s.append(" (short names: %s)" % ",".join(tpl[0]['Short names']))
                    if tpl[0]['EC numbers']:
                        s.append(" (EC numbers: %s)" % ",".join(tpl[0]['EC numbers']))
            if submitted_names:
                s.append("It has the following submitted names:")
                for tpl in sorted(submitted_names, key=lambda x:-x[1]):
                    s.append("\n  count=%s: %s" % (str(tpl[1]).ljust(5), tpl[0]['Name']))
                    if tpl[0]['Short names']:
                        s.append(" (short names: %s)" % ",".join(tpl[0]['Short names']))
                    if tpl[0]['EC numbers']:
                        s.append(" (EC numbers: %s)" % ",".join(tpl[0]['EC numbers']))
            #raise UniParcEntryStandardizationException("".join(s))
        elif len(recommended_names) > 1:
            s = ["UniParcID %s has multiple recommended names: " % UniParcID]
            for tpl in sorted(recommended_names, key=lambda x:-x[1]):
                s.append("\n  count=%s: %s" % (str(tpl[1]).ljust(5), tpl[0]['Name']))
                if tpl[0]['Short names']:
                    s.append(" (short names: %s)" % ",".join(tpl[0]['Short names']))
                if tpl[0]['EC numbers']:
                    s.append(" (EC numbers: %s)" % ",".join(tpl[0]['EC numbers']))
            raise UniParcEntryStandardizationException("".join(s))

        #assert(len(recommended_names) == 1) # todo: this is not always available
        #print(recommended_names)
        self.recommended_name = None
        if len(recommended_names) == 1:
            self.recommended_name = recommended_names[0][0]
        self.get_organisms()
Example #11
def pdb_to_uniparc(pdb_ids, silent = True, cache_dir = None, manual_additions = {}):
    ''' Returns a mapping {PDB ID -> List(UniParcEntry)}
        The UniParcEntry objects have a to_dict() method which may be useful.
    '''

    # Map PDB IDs to UniProtKB AC
    if not silent:
        colortext.write("Retrieving PDB to UniProtKB AC mapping: ", 'cyan')
    pdb_ac_mapping = uniprot_map('PDB_ID', 'ACC', pdb_ids, cache_dir = cache_dir, silent = silent)

    for k, v in manual_additions.iteritems():
        if k in pdb_ids:
            if pdb_ac_mapping.get(k):
                pdb_ac_mapping[k].extend(v)
                pdb_ac_mapping[k] = list(set(pdb_ac_mapping[k]))
            else:
                pdb_ac_mapping[k] = v

    if not silent:
        colortext.write("done\n", 'green')

    # Get a list of AC_IDs
    if not silent:
        colortext.write("Retrieving UniProtKB AC to UniProtKB ID mapping: ", 'cyan')
    AC_IDs = set()
    for k, v in pdb_ac_mapping.iteritems():
        AC_IDs = AC_IDs.union(set(v))
    AC_IDs = list(AC_IDs)
    if not silent:
        colortext.write("done\n", 'green')

    # Map UniProtKB ACs to UniParc IDs
    if not silent:
        colortext.write("Retrieving UniProtKB AC to UniParc ID mapping: ", 'cyan')
    ac_uniparc_mapping = uniprot_map('ACC', 'UPARC', AC_IDs, cache_dir = cache_dir, silent = silent)
    for k, v in ac_uniparc_mapping.iteritems():
        assert(len(v) == 1)
        ac_uniparc_mapping[k] = v[0]
    if not silent:
        colortext.write("done\n", 'green')

    # Map UniProtKB ACs to UniProtKB IDs
    ac_id_mapping = uniprot_map('ACC', 'ID', AC_IDs, cache_dir = cache_dir, silent = silent)

    for k, v in ac_id_mapping.iteritems():
        assert(len(v) == 1)
        ac_id_mapping[k] = v[0]

    # Create mapping from PDB IDs to UniParcEntry objects
    m = {}
    if not silent:
        colortext.message("\nRetrieving FASTA sequences for the %d PDB IDs." % len(pdb_ids))
    for pdb_id, ACs in pdb_ac_mapping.iteritems():
        if not silent:
            colortext.write("%s: " % pdb_id, "orange")
        m[pdb_id] = []
        for AC in ACs:
            entry = UniParcEntry(ac_uniparc_mapping[AC], [AC], [ac_id_mapping[AC]], cache_dir = cache_dir)
            m[pdb_id].append(entry)
            if not silent:
                colortext.write(".", "green")
        if not silent:
            print("")
    return m
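A hedged usage sketch for pdb_to_uniparc; '1UBQ' is just an example PDB ID and the cache path is a placeholder:

    mapping = pdb_to_uniparc(['1UBQ'], silent = False, cache_dir = '/tmp/uniprot_cache')
    for pdb_id, entries in mapping.iteritems():
        # Each UniParcEntry exposes a to_dict() method (see the docstring above).
        print('%s: %d UniParc entries' % (pdb_id, len(entries)))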
Example #12
from klab import colortext

# Test
chars = 'A'
count = 0
for name, data in colortext.colors.iteritems():
    colortext.write(name, name)
    for effect in colortext.EFFECTS_:
        colortext.write(name, color = name, bgcolor = 'lightblue', effect = effect)
    print("")
colortext.rainbowprint("Rainbow test")
colortext.printf("\ntest1", color = 'red')
colortext.printf("test2")
colortext.bar('blue', 9, suffix = "\n")
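This test is written for Python 2 (colortext.colors.iteritems()). On Python 3, assuming colortext.colors is a plain dict, the loop header would become:

    for name, data in colortext.colors.items():
        colortext.write(name, name)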