def run_refinement_pipeline(in_mtz, ref_pdb, out_dir, program='dimple'):
    """Run refinement of the input MTZ file against a reference PDB file"""

    print '*************************'
    print '*** Running Pipelines ***'
    print '*************************'

    if program == 'dimple':
        # Define output files
        out_pdb = os.path.join(out_dir, 'final.pdb')
        out_mtz = os.path.join(out_dir, 'final.mtz')
        # Create command manager for dimple
        cmd = CommandManager('dimple')
        cmd.add_command_line_arguments(['--jelly', '5', in_mtz, ref_pdb, out_dir])
    else:
        raise ValueError('Unrecognised refinement program: {!s}'.format(program))

    print_run_and_raise_error_maybe(cmd)

    if not os.path.exists(out_pdb):
        raise Failure('running refinement with {} has failed -- {} does not exist'.format(program, out_pdb))
    if not os.path.exists(out_mtz):
        raise Failure('running refinement with {} has failed -- {} does not exist'.format(program, out_mtz))

    return out_pdb, out_mtz
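# Example usage (a minimal sketch -- the paths below are hypothetical, and it
# assumes dimple is installed and on the PATH):
#
#   out_pdb, out_mtz = run_refinement_pipeline(in_mtz='datasets/x001/x001.mtz',
#                                              ref_pdb='reference/reference.pdb',
#                                              out_dir='datasets/x001/dimple')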
def transfer_rfree_flags(in_mtz, out_mtz, reference_mtz,
                         input_free_r_flag, output_free_r_flag,
                         delete_tmp_files=True):
    """Copy R-free flags from reference mtz"""

    print '*********************************'
    print '*** Transferring R-free flags ***'
    print '*********************************'

    tmp_mtz = splice_ext(path=out_mtz, new='step1-transfer')

    # Stage 1 - transfer R-free from reference
    cmd = CommandManager('cad')
    cmd.add_command_line_arguments(['hklin1', in_mtz, 'hklin2', reference_mtz, 'hklout', tmp_mtz])
    cmd.add_standard_input([
        'labin file_number 1 ALL',
        'labin file_number 2 E1={}'.format(input_free_r_flag),
        'labout file_number 2 E1={}'.format(output_free_r_flag),
        'END',
    ])

    print_run_and_raise_error_maybe(cmd)

    if not os.path.exists(tmp_mtz):
        raise Failure('transfer of R-free flags has failed -- {} does not exist'.format(tmp_mtz))

    print '*******************************'
    print '*** Completing R-free flags ***'
    print '*******************************'

    # Stage 2 - populate missing R-free values
    cmd = CommandManager('freerflag')
    cmd.add_command_line_arguments(['hklin', tmp_mtz, 'hklout', out_mtz])
    cmd.add_standard_input(['COMPLETE FREE={}'.format(output_free_r_flag), 'END'])

    print_run_and_raise_error_maybe(cmd)

    if not os.path.exists(out_mtz):
        raise Failure('expanding of R-free flags has failed -- {} does not exist'.format(out_mtz))

    if delete_tmp_files:
        os.remove(tmp_mtz)
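# Example usage (a minimal sketch with hypothetical paths -- 'FreeR_flag' is
# the conventional CCP4 column label, but check the labels in your own files):
#
#   transfer_rfree_flags(in_mtz='datasets/x001/x001.mtz',
#                        out_mtz='datasets/x001/x001.free.mtz',
#                        reference_mtz='reference/reference.mtz',
#                        input_free_r_flag='FreeR_flag',
#                        output_free_r_flag='FreeR_flag')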
def select_reference_dataset(self, datasets, method='resolution', max_rfree=0.4, min_resolution=5):
    """Select dataset to act as the reference - scaling, aligning etc"""

    assert method in ['resolution', 'rfree'], \
        'METHOD FOR SELECTING THE REFERENCE DATASET NOT RECOGNISED: {!s}'.format(method)

    # ==============================>
    # Get the potential reference datasets
    # ==============================>
    # filtered_datasets = self.datasets.mask(mask_name='valid - all')
    filtered_datasets = [d for dtag, d in datasets.items()]
    if not filtered_datasets:
        raise Failure("Can't select a reference dataset - NO SUITABLE (NON-REJECTED) DATASETS REMAINING")

    # ==============================>
    # Select by either R-free or Resolution
    # ==============================>
    if method == 'rfree':
        # Get R-frees of datasets (set to dummy value of 999 if the resolution is worse than the cutoff, so that the dataset is not selected)
        r_frees = [
            d.model.input.get_r_rfree_sigma().r_free
            if (d.data.mtz_object().max_min_resolution()[1] < min_resolution) else 999
            for d in filtered_datasets
        ]
        if min(r_frees) == 999:
            raise Exception('NO DATASETS BELOW RESOLUTION CUTOFF {!s}A - CANNOT SELECT REFERENCE DATASET'.format(min_resolution))
        ref_dataset_index = r_frees.index(min(r_frees))
    elif method == 'resolution':
        # Get resolutions of datasets (set to dummy value of 999 if the r-free is above the cutoff, so that the dataset is not selected)
        resolns = [
            d.data.mtz_object().max_min_resolution()[1]
            if (d.model.input.get_r_rfree_sigma().r_free < max_rfree) else 999
            for d in filtered_datasets
        ]
        if min(resolns) == 999:
            raise Exception('NO DATASETS BELOW RFREE CUTOFF {!s} - CANNOT SELECT REFERENCE DATASET'.format(max_rfree))
        ref_dataset_index = resolns.index(min(resolns))

    # ==============================>
    # Report and return
    # ==============================>
    reference = filtered_datasets[ref_dataset_index]
    return reference.model.filename, reference.data.filename
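# Example usage (a minimal sketch -- assumes `self` is the containing pandda
# object and `datasets` is a {dtag: dataset} dict as used elsewhere here):
#
#   ref_pdb, ref_mtz = self.select_reference_dataset(datasets,
#                                                    method='resolution',
#                                                    max_rfree=0.4)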
def __init__(self, mtz_file, pdb_file, f_label=None):
    scores, command = score_with_edstats_to_dict(mtz_file=mtz_file,
                                                 pdb_file=pdb_file,
                                                 f_label=f_label)
    self.scores = pandas.DataFrame.from_dict(scores)
    self._command = command

    if self.scores.empty and self._command.error:
        raise Failure('EDSTATS has failed to run (error message below)\n=========>\n{!s}\n=========>'.format(self._command.error))
def raise_cmd_output_and_error(cmd):
    """Raise a Failure containing the STDOUT and STDERR from a command object"""
    err_msg = ''
    err_msg += '============================>\n'
    err_msg += 'Program returned with an error ({})\n'.format(' '.join(cmd.program))
    err_msg += '============================>\n'
    err_msg += str(cmd) + '\n'
    err_msg += '============================>\n'
    err_msg += cmd.output + '\n'
    err_msg += '============================>\n'
    err_msg += cmd.error + '\n'
    err_msg += '============================>\n'
    raise Failure(err_msg)
def reindex_mtz_to_reference(in_mtz, out_mtz, reference_mtz, tolerance):
    """Reindex the data in one mtz to a reference mtz"""

    print '**************************'
    print '*** Running reindexing ***'
    print '**************************'

    cmd = CommandManager('pointless')
    cmd.add_command_line_arguments(['hklin', in_mtz, 'hklref', reference_mtz, 'hklout', out_mtz])
    cmd.add_standard_input(['tolerance {}'.format(tolerance)])

    print_run_and_raise_error_maybe(cmd)

    if not os.path.exists(out_mtz):
        raise Failure('reindexing has failed -- {} does not exist'.format(out_mtz))
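# Example usage (a minimal sketch with hypothetical paths -- `tolerance` is
# passed straight to pointless's TOLERANCE keyword):
#
#   reindex_mtz_to_reference(in_mtz='datasets/x001/x001.mtz',
#                            out_mtz='datasets/x001/x001.reindexed.mtz',
#                            reference_mtz='reference/reference.mtz',
#                            tolerance=5)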
def run(params):
    assert params.input.mtz, 'No MTZs given for comparison'
    assert not os.path.exists(params.output.merged_mtz), \
        'The output file ({}) already exists. Please delete it before re-running.'.format(params.output.merged_mtz)
    assert not os.path.exists(params.output.phasematch_mtz), \
        'The output file ({}) already exists. Please delete it before re-running.'.format(params.output.phasematch_mtz)

    merge = CommandManager('giant.mtz.merge')
    merge.add_command_line_arguments('label_suffix=incremental')
    merge.add_command_line_arguments(params.input.mtz)
    merge.add_command_line_arguments('output.mtz=' + params.output.merged_mtz)
    merge.add_command_line_arguments(params.settings.f_obs.split(','))
    merge.add_command_line_arguments(params.settings.f_calc.split(','))
    merge.run()
    merge.write_output(params.output.merging_log)

    if not os.path.exists(params.output.merged_mtz):
        raise Failure('giant.mtz.merge has failed to merge the mtz files')

    fo1, fo2 = params.settings.f_obs.split(',')
    fc1, fc2 = params.settings.f_calc.split(',')

    # Run cphasematch on each unordered pair of datasets (i_1 < i_2)
    for i_2 in range(1, len(params.input.mtz) + 1):
        for i_1 in range(1, len(params.input.mtz) + 1):
            if i_1 == i_2:
                break
            match = CommandManager('cphasematch')
            match.add_command_line_arguments(['-mtzin', params.output.merged_mtz])
            match.add_command_line_arguments(['-mtzout', params.output.phasematch_mtz])
            match.add_command_line_arguments(['-colin-fo', '{}-{},{}-{}'.format(fo1, i_1, fo2, i_1)])
            match.add_command_line_arguments(['-colin-fc-1', '{}-{},{}-{}'.format(fc1, i_1, fc2, i_1)])
            match.add_command_line_arguments(['-colin-fc-2', '{}-{},{}-{}'.format(fc1, i_2, fc2, i_2)])
            match.run()
            match.write_output(params.output.phase_log_template + '-{}-{}.log'.format(i_1, i_2))
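# The nested loop above visits each unordered pair of datasets exactly once
# (all i_1 < i_2), because the inner loop breaks as soon as i_1 reaches i_2.
# A minimal sketch of the same pattern for three datasets:
#
#   >>> [(i_1, i_2) for i_2 in range(1, 4) for i_1 in range(1, i_2)]
#   [(1, 2), (1, 3), (2, 3)]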
def extract_residue_group_scores(self, residue_group, data_table=None, rg_label=None, column_suffix=''):
    """Extract density quality metrics for a residue group from precalculated edstats scores"""

    rg = residue_group

    # Set defaults
    if rg_label is None:
        rg_label = (rg.unique_resnames()[0] + '-' +
                    rg.parent().id + '-' +
                    rg.resseq + rg.icode).replace(' ', '')
    if data_table is None:
        data_table = pandas.DataFrame(index=[rg_label], columns=[])

    # Check validity
    if len(rg.unique_resnames()) != 1:
        raise Failure(rg_label + ': More than one residue name associated with residue group -- cannot process')

    # Extract residue scores
    ed_scores = self.scores[(rg.unique_resnames()[0],
                             rg.parent().id,
                             rg.resseq_as_int(),
                             rg.icode)]

    # Append scores to data_table
    data_table.set_value(index=rg_label, col='RSCC' + column_suffix, value=ed_scores['CCSa'])
    data_table.set_value(index=rg_label, col='RSR' + column_suffix, value=ed_scores['Ra'])
    data_table.set_value(index=rg_label, col='B_AV' + column_suffix, value=ed_scores['BAa'])
    data_table.set_value(index=rg_label, col='RSZO' + column_suffix, value=ed_scores['ZOa'])
    data_table.set_value(index=rg_label, col='RSZD' + column_suffix, value=ed_scores['ZDa'])

    return data_table
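# Example usage (a minimal sketch -- assumes `edstats` is an instance of the
# scoring class above and `rg` is an iotbx.pdb residue_group, e.g. a ligand):
#
#   scores_table = edstats.extract_residue_group_scores(residue_group=rg)
#   print(scores_table.loc[:, ['RSCC', 'RSZD']])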
def __call__(self, reference=None):
    """Generate the grid objects for the analysis"""

    # ============================================================================>
    #####
    # Create Sampling Grid (for generated maps)
    #####
    # Create reference grid based on the reference structure
    # ============================================================================>
    if self.grid is None:
        # Which dataset to be used to mask the grid
        if bool(self.mask_pdb):
            mask_dataset = PanddaReferenceDataset.from_file(
                model_filename=self.mask_pdb).label(tag='masking')
            if self.align_mask_to_reference:
                try:
                    mask_dataset.model.alignment = None
                    mask_dataset.model.align_to(other_hierarchy=reference.model.hierarchy,
                                                method=self.alignment_method,
                                                require_hierarchies_identical=False)
                except Exception:
                    msg = traceback.format_exc()
                    msg += '\n------------------>>>'
                    msg += '\n\nFailed to align masking pdb ({}) to the reference structure.'.format(self.mask_pdb)
                    msg += '\nIf the masking structure does not need alignment, rerun with params.masks.align_mask_to_reference=False'
                    raise Failure(msg)
            else:
                mask_dataset.set_origin_shift((0.0, 0.0, 0.0))
        else:
            mask_dataset = reference.copy()

        # Create the grid using the masking dataset (for determining size and extent of grid)
        self.create_reference_grid(dataset=mask_dataset,
                                   grid_spacing=self.grid_spacing,
                                   reference=reference)
        self.mask_reference_grid(dataset=mask_dataset,
                                 selection=self.mask_selection_string)

        # Store the transformation to shift the reference dataset to the "grid frame", where the grid origin is (0,0,0)
        reference.set_origin_shift([-1.0 * a for a in self.grid.cart_origin()])

        # Partition the grid with the reference dataset (which grid points use which coordinate transformations)
        self.partition_reference_grid(dataset=reference)

    return self.grid
def transfer_residue_groups_from_other(acceptor_hierarchy, donor_hierarchy, in_place=False, verbose=False):
    """Transfer atom_groups from donor_hierarchy to matching residue_groups in acceptor_hierarchy, creating new chains and residue groups only where necessary"""

    if not in_place:
        acceptor_hierarchy = acceptor_hierarchy.deep_copy()

    # Sort all residues (by chain then id) for the acceptor hierarchy
    accept_model = acceptor_hierarchy.only_model()
    accept_dict = {c.id: {} for c in accept_model.chains()}
    for rg in accept_model.residue_groups():
        accept_dict.get(rg.parent().id).setdefault(rg.resid(), []).append(rg)

    # Dictionary to link matching chains (allows multiple chain As to be linked uniquely to multiple chain As)
    link_dict = {}
    # Residues that don't have a matching partner in the old hierarchy
    tricky_rgs = []

    # Iterate through donor chains
    for donor_ch in donor_hierarchy.only_model().chains():
        # If chain not in hierarchy, simply copy across
        if accept_dict.get(donor_ch.id, None) is None:
            if verbose:
                print 'Transferring whole chain: {}'.format(Labeller.format(donor_ch))
            accept_model.append_chain(donor_ch.detached_copy())
            continue
        # Chain present, copy by residue_group
        for donor_rg in donor_ch.residue_groups():
            # Find equivalent residue groups in the other hierarchy
            accept_rg = accept_dict.get(donor_ch.id).get(donor_rg.resid(), [])
            if len(accept_rg) > 1:
                # Should only be one...
                raise Exception('More than one residue group in hierarchy with the same residue_id and chain_id')
            elif len(accept_rg) == 1:
                accept_rg = accept_rg[0]
                # Record the links between these chains
                link_dict.setdefault(donor_rg.parent(), accept_rg.parent())
                # Transfer atom groups to this residue_group
                if verbose:
                    print 'Transferring atom groups: {} > {}'.format(Labeller.format(donor_rg), Labeller.format(accept_rg))
                for donor_ag in donor_rg.atom_groups():
                    accept_rg.append_atom_group(donor_ag.detached_copy())
            else:
                # Have the possibility of multiple chains with the same id, so at the moment, store for later
                tricky_rgs.append(donor_rg)

    # Transfer residues that have chain matches, but don't have residue matches in the acceptor structures
    for donor_rg in tricky_rgs:
        # Get chain from link_dict (keyed by the donor chain object, not its id)
        accept_ch = link_dict.get(donor_rg.parent(), None)
        # If the chain isn't linked:
        if accept_ch is None:
            # If there's only one chain with the same ID, choose this one
            possible_chains = [c for c in accept_model.chains() if c.id == donor_rg.parent().id]
            if len(possible_chains) == 1:
                accept_ch = possible_chains[0]
            else:
                raise Failure("Don't know how to transfer {} to the output model".format(Labeller.format(donor_rg)))
        # Simply append to chain
        if verbose:
            print 'Transferring residue group: {} > {}'.format(Labeller.format(donor_rg), Labeller.format(accept_ch))
        accept_ch.append_residue_group(donor_rg.detached_copy())

    return acceptor_hierarchy
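# Example usage (a minimal sketch -- assumes two iotbx.pdb hierarchies, e.g.
# copying a modelled ligand from one structure into an apo structure):
#
#   merged = transfer_residue_groups_from_other(acceptor_hierarchy=apo_hierarchy,
#                                               donor_hierarchy=ligand_hierarchy,
#                                               in_place=False, verbose=True)
#   merged.write_pdb_file(file_name='merged.pdb')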
def __call__(self, mcd, reference_dataset=None):
    """Align each structure to the reference structure"""

    assert self.method in ['local', 'global'], 'METHOD NOT DEFINED: {!s}'.format(self.method)

    # ==============================>
    # Select the datasets for alignment
    # ==============================>
    datasets_for_alignment = [d for dtag, d in mcd.datasets.items()]

    # ==============================>
    # Delete alignments
    # ==============================>
    for d in datasets_for_alignment:
        d.model.alignment = None

    # ==============================>
    # Generate the alignments for each structure
    # ==============================>
    arg_list = [
        DatasetAligner(model=d.model, other=reference_dataset.model, method=self.method, id=d.tag)
        for d in datasets_for_alignment
    ]
    dataset_alignments = jl.Parallel(n_jobs=self.cpus, verbose=15)(
        jl.delayed(wrapper_run)(arg) for arg in arg_list)
    # dataset_alignments = easy_mp.pool_map(func=wrapper_run, args=arg_list, processes=self.cpus)

    # ==============================>
    # Catch errors and print at end
    # ==============================>
    errors = []
    for dataset, alignment in zip(datasets_for_alignment, dataset_alignments):
        # If errored, print and record
        if isinstance(alignment, str):
            errors.append((dataset, alignment))
            self.alignments[dataset.tag] = False
            print(alignment)
            continue
        # Attach alignment to dataset
        assert dataset.tag == alignment.id
        self.alignments[alignment.id] = True
        dataset.model.alignment = alignment
        # Output an aligned copy of the structure
        aligned_struc = dataset.model.hierarchy.deep_copy()
        aligned_struc.atoms().set_xyz(
            dataset.model.alignment.nat2ref(
                coordinates=dataset.model.hierarchy.atoms().extract_xyz()))
        # TODO: restore this functionality?
        # aligned_struc.write_pdb_file(file_name=os.path.join(self.file_manager.get_dir('aligned_structures'),
        #                                                     '{!s}-aligned.pdb'.format(dataset.tag)))
        # aligned_struc.write_pdb_file(
        #     file_name=splice_ext(dataset.file_manager.get_file('aligned_model'), 'ref', position=-1))
        # Write alignment summary to log

    # ==============================>
    # Report Errors
    # ==============================>
    if errors:
        for dataset, err_msg in errors:
            print('Failed to align dataset {}'.format(dataset.tag))
            print(err_msg)
        raise Failure('Failed to align {} datasets. Error messages printed above.'.format(len(errors)))

    # ==============================>
    # Make new dataset
    # ==============================>
    new_datasets = {d.tag: d for d in datasets_for_alignment}
    new_dataset = mcd.new_from_datasets(datasets=new_datasets)
    return new_dataset
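# Example usage (a minimal sketch -- assumes `align` is an instance of the
# aligner class above, `mcd` is a multi-crystal dataset container, and `ref`
# is the reference dataset selected earlier):
#
#   aligned_mcd = align(mcd=mcd, reference_dataset=ref)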
def check_programs_are_available(self, programs):
    ni = not_installed(programs)
    if ni:
        raise Failure('The following programs are not available/installed:\n\t{}'.format('\n\t'.join(ni)))
def align_structures_flexible(mov_hierarchy, ref_hierarchy, altlocs=['', 'A'],
                              cutoff_radius=15, sequence_identity_threshold=0.95,
                              one_to_one_mapping=True, require_hierarchies_identical=True,
                              verbose=False):
    """
    Perform a flexible alignment on two hierarchies. Alignments are performed on a
    chain-by-chain basis. Each chain of mov_hierarchy is aligned to the first chain
    of ref_hierarchy whose sequence identity exceeds sequence_identity_threshold.
    """

    # List of the alignments for each chain
    local_alignments = []

    # Trim to protein only
    mov_hierarchy = backbone(mov_hierarchy, copy=True)
    ref_hierarchy = backbone(ref_hierarchy, copy=True)

    # Check the structures only have one model
    try:
        mov_hierarchy.only_model()
        ref_hierarchy.only_model()
    except Exception:
        raise Exception('Structures for alignment can only have one model!')

    # Check the structures are identical
    if require_hierarchies_identical:
        assert mov_hierarchy.is_similar_hierarchy(ref_hierarchy), \
            'Structures for alignment must have the same atoms (although atomic parameters can vary)'

    # Extract the chains from the structures
    c_mov = list(mov_hierarchy.chains())
    c_ref = list(ref_hierarchy.chains())

    # Match chains in the two structures (c_mov is first so the array is first indexed by the chains in mov)
    chn_sim = pairwise_chain_sequence_identity(c_mov, c_ref, seq_identity_threshold=None)

    # Create strings for use in case of errors/verbose printing
    s = 'Chain and sequences for alignment:'
    s += '\n{} chains in mov_hierarchy:'.format(len(c_mov))
    for c in c_mov:
        s += '\n\t{}: {}'.format(c.id, ''.join(c.as_sequence()))
    s += '\n{} chains in ref_hierarchy:'.format(len(c_ref))
    for c in c_ref:
        s += '\n\t{}: {}'.format(c.id, ''.join(c.as_sequence()))
    s += '\nPairwise chain-by-chain sequence identities:'
    s += '\n    REF'
    s += '\nMOV {}'.format(' '.join(['{:4}'.format(c.id) for c in c_ref]))
    for i, i_c in enumerate(c_mov):
        s += '\n{:3} {}'.format(i_c.id, ' '.join(['{:4}'.format(v) for v in chn_sim[i]]))

    # Report to be returned in case of error
    report = Report(s, verbose=verbose)

    # Make the array boolean at the threshold value
    chn_sim = (chn_sim > sequence_identity_threshold).astype(int)

    # Report
    s = 'Pairwise chain-by-chain sequence identities (thresholded at {}%):'.format(100 * sequence_identity_threshold)
    s += '\n    REF'
    s += '\nMOV {}'.format(' '.join(['{:4}'.format(c.id) for c in c_ref]))
    for i, i_c in enumerate(c_mov):
        s += '\n{:3} {}'.format(i_c.id, ' '.join(['{:4}'.format(v) for v in chn_sim[i]]))
    report(s)

    # Iterate through and align the chains
    for i, chn_mov in enumerate(c_mov):
        # Skip if not protein
        if not chn_mov.is_protein():
            continue
        # Find the first chain in the reference structure that's "alignable"
        try:
            idx_ref = list(chn_sim[i]).index(1)
            chn_ref = c_ref[idx_ref]
            report('Aligning chain {} of mov_hierarchy to chain {} in ref_hierarchy'.format(chn_mov.id, chn_ref.id))
            if one_to_one_mapping:
                report('Removing chain {} of ref_hierarchy from the pool of alignment chains (one_to_one_mapping is turned on)'.format(chn_ref.id))
                chn_sim[:, idx_ref] = 0
        except ValueError:
            raise Failure('Error raised during alignment.\n'
                          'Unable to align chain {} from mov_hierarchy: there is no suitable chain in ref_hierarchy.\n'
                          'This might be fixed by setting one_to_one_mapping to False or decreasing sequence_identity_threshold.\n'.format(chn_mov.id) +
                          str(report))
        # Align the selected chains
        l_ali = align_chains_flexible(chn_mov=chn_mov, chn_ref=chn_ref,
                                      altlocs=altlocs, cutoff_radius=cutoff_radius)
        # Record which chains were aligned on the LocalAlignment object
        l_ali.id = 'chain {} to chain {}'.format(chn_mov.id, chn_ref.id)
        l_ali.mov_id = chn_mov.id
        l_ali.ref_id = chn_ref.id
        l_ali.seq_ali = align_sequences_default(seq_a=chn_ref.as_sequence(), seq_b=chn_mov.as_sequence())
        # Append to the alignments
        local_alignments.append(l_ali)

    # Print which chains were aligned to which
    report('\n'.join(['Alignment finished:'] +
                     ['\t(mov) chain {} aligned to (ref) chain {}'.format(l_ali.mov_id, l_ali.ref_id)
                      for l_ali in local_alignments]))

    # Combine all of the local alignments
    return MultipleLocalAlignment(local_alignments=local_alignments)
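# Example usage (a minimal sketch -- assumes two iotbx.pdb hierarchies of
# related structures; the nat2ref call assumes the returned object exposes the
# same interface used by the dataset aligner above):
#
#   alignment = align_structures_flexible(mov_hierarchy=dataset_hierarchy,
#                                         ref_hierarchy=reference_hierarchy,
#                                         require_hierarchies_identical=False,
#                                         verbose=True)
#   ref_xyz = alignment.nat2ref(coordinates=dataset_hierarchy.atoms().extract_xyz())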
def fill_missing_reflections(in_mtz, out_mtz, fill_resolution_low, fill_resolution_high, delete_tmp_files=True):
    """Complete the set of miller indices in an MTZ file"""

    print '***************************'
    print '*** Filling reflections ***'
    print '***************************'

    tmp_mtz_1 = splice_ext(path=out_mtz, new='step1-truncate')
    tmp_mtz_2 = splice_ext(path=out_mtz, new='step2-uniquify')
    tmp_mtz_3 = splice_ext(path=out_mtz, new='step3-remerged')

    # Stage 1 - truncate dataset, fill missing reflections, change column name
    cmd = CommandManager('cad')
    cmd.add_command_line_arguments(['hklin1', in_mtz, 'hklout', tmp_mtz_1])
    cmd.add_standard_input([
        'monitor BRIEF',
        'labin file_number 1 ALL',
        'resolution file 1 {} {}'.format(fill_resolution_low, fill_resolution_high),
    ])
    print_run_and_raise_error_maybe(cmd)
    if not os.path.exists(tmp_mtz_1):
        raise Failure('filling of missing reflections has failed -- {} does not exist'.format(tmp_mtz_1))

    print '-------------------'

    # Stage 2 - uniqueify the file
    cmd = CommandManager('uniqueify')
    cmd.add_command_line_arguments(['-p', '0.05', tmp_mtz_1, tmp_mtz_2])
    print_run_and_raise_error_maybe(cmd)
    if not os.path.exists(tmp_mtz_2):
        raise Failure('filling of missing reflections has failed -- {} does not exist'.format(tmp_mtz_2))

    print '-------------------'

    # Stage 3 - remerge the two files
    cmd = CommandManager('cad')
    cmd.add_command_line_arguments(['hklin1', in_mtz, 'hklin2', tmp_mtz_2, 'hklout', tmp_mtz_3])
    cmd.add_standard_input([
        'monitor BRIEF',
        'labin file_number 1 ALL',
        'labin file_number 2 E1=FreeR_flag',
        'labout file_number 2 E1=dummy',
    ])
    print_run_and_raise_error_maybe(cmd)
    if not os.path.exists(tmp_mtz_3):
        raise Failure('filling of missing reflections has failed -- {} does not exist'.format(tmp_mtz_3))

    print '-------------------'

    # Stage 4 - remove the dummy column
    cmd = CommandManager('mtzutils')
    cmd.add_command_line_arguments(['hklin1', tmp_mtz_3, 'hklout', out_mtz])
    cmd.add_standard_input(['HEADER BRIEF', 'EXCLUDE 1 dummy', 'ONEFILE', 'END'])
    print_run_and_raise_error_maybe(cmd)
    if not os.path.exists(out_mtz):
        raise Failure('filling of missing reflections has failed -- {} does not exist'.format(out_mtz))

    if delete_tmp_files:
        os.remove(tmp_mtz_1)
        os.remove(tmp_mtz_2)
        os.remove(tmp_mtz_3)
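# Example usage (a minimal sketch with hypothetical paths and limits -- the
# resolution limits are in angstroms, low then high, and should match the
# dataset being filled):
#
#   fill_missing_reflections(in_mtz='datasets/x001/x001.mtz',
#                            out_mtz='datasets/x001/x001.filled.mtz',
#                            fill_resolution_low=999.0,
#                            fill_resolution_high=1.8)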