Example #1
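PRIME isoform clustering: run() reads integration pickles, then either assigns each image to the nearest user-supplied reference mtz set, or clusters a sampled subset with the Brehm & Diederichs algorithm, merges each cluster into a reference set, and assigns the remaining images against those references.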
    def run(self, args):
        # read inputs
        from prime.postrefine.mod_input import process_input, read_pickles
        iparams, txt_out_input = process_input(args)
        print(txt_out_input)
        with open(os.path.join(iparams.run_no, self.module_name, 'log.txt'),
                  'w') as f:
            f.write(txt_out_input)
        # read all integration pickles
        frame_files = read_pickles(iparams.data)
        n_frames = len(frame_files)
        if n_frames == 0:
            print("No integration pickles found. Exiting.")
            return None, iparams
        # if reference mtz files are given, assign images against them
        if iparams.isoform_cluster.isorefin:
            # collect a reference miller array per isoform reference file
            mxh = mx_handler()
            miller_array_ref_set = []
            for isorefin in iparams.isoform_cluster.isorefin:
                flag_ref_found, miller_array_ref = mxh.get_miller_array_from_reflection_file(
                    isorefin)
                if flag_ref_found:
                    miller_array_ref_set.append(miller_array_ref)
            # get the observation list
            frame_files_sel, obs_list = self.get_observation_set(
                iparams, frame_files, n_frames)
            if miller_array_ref_set:
                frames = [(i, frame_files_sel[i], obs_list[i], iparams,
                           miller_array_ref_set) for i in range(len(obs_list))]
                cc_results = pool_map(iterable=frames,
                                      func=solve_with_mtz_mproc,
                                      processes=iparams.n_processors)
                sol_pickle = {}
                for result in cc_results:
                    pickle_filename, cluster_id = result
                    sol_pickle[pickle_filename] = cluster_id
                write_out_solutions(iparams, sol_pickle)
                txt_out = "Cluster images with given " + str(
                    len(miller_array_ref_set)
                ) + " mtz files completed. Use cluster_0.lst - cluster_k.lst (for k clusters) for merging.\n"
                print txt_out
                with open(
                        os.path.join(iparams.run_no, self.module_name,
                                     'log.txt'), 'a') as f:
                    f.write(txt_out)
            return

        #*************************************************
        # solve with Brehm & Diederichs: embed a sample of n_sample_frames, then bootstrap the rest
        txt_out = "Cluster images with the Brehm & Diederichs algorithm.\n"
        frame_files_sel, obs_list = self.get_observation_set(
            iparams, frame_files, iparams.isoform_cluster.n_sample_frames)
        frames = [(i, frame_files_sel[i], obs_list[i], obs_list)
                  for i in range(len(frame_files_sel))]
        # calculate the pairwise correlation matrix R
        print("Calculating R")
        calc_r_results = pool_map(iterable=frames,
                                  func=calculate_r_mproc,
                                  processes=iparams.n_processors)
        frame_files_sel = []
        r_matrix = []
        obs_list = []
        for result in calc_r_results:
            if result:
                pickle_filename, r_set, obs = result
                frame_files_sel.append(pickle_filename)
                obs_list.append(obs)
                if len(r_matrix) == 0:
                    r_matrix = r_set
                else:
                    r_matrix = np.append(r_matrix, r_set, axis=0)
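        # r_matrix now holds one row of pairwise correlations per kept frame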
        # keep frames with the best mean correlation
        print("Selecting frames with best R")
        i_mean_r = np.argsort(np.mean(r_matrix, axis=1))[::-1]
        r_matrix_sorted = r_matrix[i_mean_r]
        frame_files_sorted = np.array(frame_files_sel)[i_mean_r]
        obs_list_sorted = np.array(obs_list)[i_mean_r]
        frame_files_sel = []
        obs_sel = []
        for frame_file, r_set, obs in zip(frame_files_sorted, r_matrix_sorted,
                                          obs_list_sorted):
            if frame_file not in frame_files_sel:
                frame_files_sel.append(frame_file)
                obs_sel.append(obs)
                print(frame_file, np.mean(r_set))
                if len(frame_files_sel) >= iparams.isoform_cluster.n_selected_frames:
                    print('Found all %d good frames' % len(frame_files_sel))
                    break
        # recalculate R for the newly selected frames
        frames = [(i, frame_files_sel[i], obs_sel[i], obs_sel)
                  for i in range(len(frame_files_sel))]
        print("Re-calculating R")
        calc_r_results = pool_map(iterable=frames,
                                  func=calculate_r_mproc,
                                  processes=iparams.n_processors)
        frame_files_sel = []
        r_matrix = []
        obs_list = []
        for result in calc_r_results:
            if result:
                pickle_filename, r_set, obs = result
                frame_files_sel.append(pickle_filename)
                obs_list.append(obs)
                if len(r_matrix) == 0:
                    r_matrix = r_set
                else:
                    r_matrix = np.append(r_matrix, r_set, axis=0)
        print "Minimizing frame distance"
        isoch = isoform_cluster_handler()
        x_set = isoch.optimize(r_matrix, flag_plot=iparams.flag_plot)
        print "Clustering results"
        kmh = kmeans_handler()
        k = iparams.isoform_cluster.n_clusters
        centroids, labels = kmh.run(x_set, k, flag_plot=iparams.flag_plot)
        print "Get solution pickle and cluster files list"
        sol_pickle, cluster_files = isoch.assign_cluster(frame_files_sel, labels, k, \
            os.path.join(iparams.run_no,self.module_name))
        # if more frames remain, merge the sample frames of each cluster into a
        # reference set that can be used to assign the rest.
        if n_frames > iparams.isoform_cluster.n_selected_frames:
            print("Assigning cluster_id to the remaining images.")
            old_iparams_data = iparams.data[:]
            miller_array_ref_set = []
            from prime.command_line.postrefine import scale_frames, merge_frames
            for i in range(k):
                # generate a reference set from the solved frames
                with open(cluster_files[i]) as f:
                    frame_files_processed = f.read().split('\n')[:-1]
                scaled_pres_set = scale_frames(
                    range(len(frame_files_processed)), frame_files_processed,
                    iparams)
                mdh, txt_merge_out = merge_frames(
                    scaled_pres_set, iparams,
                    mtz_out_prefix=os.path.join(self.module_name, 'cluster_' + str(i)))
                miller_array_ref_set.append(mdh.miller_array_merge)
                txt_out += txt_merge_out
            # set up the list of remaining frames
            frame_files_remain = [
                frame for frame in frame_files if frame not in sol_pickle
            ]
            frame_files_remain_sel, obs_remain_sel_list = self.get_observation_set(
                iparams, frame_files_remain, len(frame_files_remain))
            frames = [(i, frame_files_remain_sel[i], obs_remain_sel_list[i],
                       iparams, miller_array_ref_set)
                      for i in range(len(obs_remain_sel_list))]
            cc_results = pool_map(iterable=frames,
                                  func=solve_with_mtz_mproc,
                                  processes=iparams.n_processors)
            for result in cc_results:
                if result:
                    pickle_filename, cluster_id = result
                    sol_pickle[pickle_filename] = cluster_id
            iparams.data = old_iparams_data[:]
        # write out solution pickle
        write_out_solutions(iparams, sol_pickle)
        # write out text output
        txt = "Clustering images completed. Use cluster_0.lst - cluster_k.lst (for k clusters) for merging.\n"
        txt_out += txt
        print(txt)
        with open(os.path.join(iparams.run_no, self.module_name, 'log.txt'),
                  'a') as f:
            f.write(txt_out)
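
Both examples share the same numeric core: build a frame-by-frame correlation matrix R, embed the frames as points whose dot products reproduce R (the Brehm & Diederichs target minimized by isoch.optimize / idah.optimize above), then k-means the embedding. Below is a minimal, self-contained sketch of that step, assuming numpy, scipy, and scikit-learn are available; embed_frames and the toy data are illustrative, not PRIME's API.

import numpy as np
from scipy.optimize import minimize
from sklearn.cluster import KMeans

def embed_frames(r_matrix, n_dim=2, seed=0):
    # hypothetical helper: find points x_i minimizing sum_ij (x_i . x_j - r_ij)^2,
    # a Brehm & Diederichs-style embedding of the correlation matrix
    n = r_matrix.shape[0]
    rng = np.random.RandomState(seed)

    def target(flat_x):
        x = flat_x.reshape(n, n_dim)
        return np.sum((x.dot(x.T) - r_matrix) ** 2)

    res = minimize(target, rng.rand(n * n_dim), method='L-BFGS-B')
    return res.x.reshape(n, n_dim)

# toy R matrix: two isoforms, correlated within a group, anti-correlated across
labels_true = np.array([0] * 10 + [1] * 10)
r = np.where(labels_true[:, None] == labels_true[None, :], 0.8, -0.8)
np.fill_diagonal(r, 1.0)

x_set = embed_frames(r)
labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(x_set)
print(labels)  # frames of the same isoform share a label

In the real code R is generally rectangular (selected frames vs. the sampled set) and the embedding dimension and cluster count come from iparams; the square symmetric toy above is only meant to make the target function concrete.
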
Example #2
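PRIME indexing-ambiguity solver: run() short-circuits on a given solution pickle or reference mtz; otherwise it samples frames, builds the pairwise correlation matrix R over alternative index bases, embeds and k-means-clusters the frames (Brehm & Diederichs), merges the solved sample into a reference set, and breaks the ambiguity for the remaining images against it.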
 def run(self, args):
     import time
     start = time.time()
     # read inputs
     from prime.postrefine.mod_input import process_input, read_pickles
     iparams, txt_out_input = process_input(args)
     print(txt_out_input)
     with open(iparams.run_no + '/log.txt', 'w') as f:
         f.write(txt_out_input)
     # if a solution pickle is given, load and return it
     if iparams.indexing_ambiguity.index_basis_in is not None:
         if iparams.indexing_ambiguity.index_basis_in.endswith('.pickle'):
             with open(iparams.indexing_ambiguity.index_basis_in, 'rb') as f:
                 sol_pickle = pickle.load(f)
             return sol_pickle, iparams
     # read all integration pickles
     frame_files = read_pickles(iparams.data)
     n_frames = len(frame_files)
     if n_frames == 0:
         print("No integration pickles found. Exiting.")
         return None, iparams
     # exit if there is no ambiguity to solve
     if self.should_terminate(iparams, frame_files[0]):
         print("No indexing ambiguity problem. Set index_ambiguity.mode = Forced and assigned_basis = list of basis formats to solve a pseudo-twinning problem.")
         return None, iparams
     # continue with (Auto - alt>1, find solution), (Auto - alt>1, mtz),
     # (Forced - assigned_basis, mtz), (Forced - assigned_basis, find solution)
     #*************************************************
     # if an mtz file is given, use it to solve the problem
     sol_fname = iparams.run_no + '/index_ambiguity/solution_pickle.pickle'
     if iparams.indexing_ambiguity.index_basis_in is not None:
         if iparams.indexing_ambiguity.index_basis_in.endswith('.mtz'):
             mxh = mx_handler()
             flag_ref_found, miller_array_ref = mxh.get_miller_array_from_reflection_file(
                 iparams.indexing_ambiguity.index_basis_in)
             if not flag_ref_found:
                 print("Reference mtz file not found. Set indexing_ambiguity.index_basis_in = None to auto-generate the solutions.")
                 return None, iparams
             else:
                 frames = [(i, frame_files[i], iparams, miller_array_ref)
                           for i in range(n_frames)]
                 cc_results = parallel_map(iterable=frames,
                                           func=solve_with_mtz_mproc,
                                           processes=iparams.n_processors)
                 sol_pickle = {}
                 for result in cc_results:
                     pickle_filename, index_basis = result
                     sol_pickle[pickle_filename] = index_basis
                 with open(sol_fname, 'wb') as f:
                     pickle.dump(sol_pickle, f)
                 return sol_pickle, iparams
     #*************************************************
     # solve with Brehm & Diederichs: sample n_sample_frames, then bootstrap the rest
     frames = [(i, frame_files[i], iparams) for i in random.sample(
         range(n_frames), iparams.indexing_ambiguity.n_sample_frames)]
     # get the observations list
     print("Reading observations")
     alt_dict_results = parallel_map(iterable=frames,
                                     func=get_obs_mproc,
                                     processes=iparams.n_processors)
     frame_dup_files = []
     frame_keys = []
     obs_list = []
     for result in alt_dict_results:
         alt_dict, pickle_filename = result
         if alt_dict is not None:
             for key in alt_dict.keys():
                 frame_dup_files.append(pickle_filename)
                 frame_keys.append(key)
                 obs_list.append(alt_dict[key])
     frames = [(i, frame_dup_files[i], frame_keys[i], obs_list[i], obs_list)
               for i in range(len(frame_dup_files))]
     # calculate the pairwise correlation matrix R
     print("Calculating R")
     calc_r_results = parallel_map(iterable=frames,
                                   func=calculate_r_mproc,
                                   processes=iparams.n_processors)
     frame_dup_files = []
     frame_keys = []
     r_matrix = []
     for result in calc_r_results:
         if result is not None:
             pickle_filename, index_basis, r_set = result
             frame_dup_files.append(pickle_filename)
             frame_keys.append(index_basis)
             if len(r_matrix) == 0:
                 r_matrix = r_set
             else:
                 r_matrix = np.append(r_matrix, r_set, axis=0)
     # keep frames with the best mean correlation
     print("Selecting frames with best R")
     i_mean_r = np.argsort(np.mean(r_matrix, axis=1))[::-1]
     r_matrix_sorted = r_matrix[i_mean_r]
     frame_dup_files_sorted = np.array(frame_dup_files)[i_mean_r]
     frame_keys_sorted = np.array(frame_keys)[i_mean_r]
     frame_dup_files_sel = []
     for frame_file, frame_key, r_set in zip(frame_dup_files_sorted,
                                             frame_keys_sorted,
                                             r_matrix_sorted):
         if frame_file not in frame_dup_files_sel:
             frame_dup_files_sel.append(frame_file)
             print(frame_file, frame_key, np.mean(r_set))
             if len(frame_dup_files_sel) >= iparams.indexing_ambiguity.n_selected_frames:
                 print('Found all %d good frames' % len(frame_dup_files_sel))
                 break
     # rebuild observations and r_matrix for the selected frames
     frames = [(i, frame_dup_files_sel[i], iparams)
               for i in range(len(frame_dup_files_sel))]
     # get the observations list again
     print("Re-reading observations")
     alt_dict_results = parallel_map(iterable=frames,
                                     func=get_obs_mproc,
                                     processes=iparams.n_processors)
     frame_dup_files = []
     frame_keys = []
     obs_list = []
     for result in alt_dict_results:
         alt_dict, pickle_filename = result
         if alt_dict is not None:
             for key in alt_dict.keys():
                 frame_dup_files.append(pickle_filename)
                 frame_keys.append(key)
                 obs_list.append(alt_dict[key])
     frames = [(i, frame_dup_files[i], frame_keys[i], obs_list[i], obs_list)
               for i in range(len(frame_dup_files))]
     # recalculate R
     print("Re-calculating R")
     calc_r_results = parallel_map(iterable=frames,
                                   func=calculate_r_mproc,
                                   processes=iparams.n_processors)
     frame_dup_files = []
     frame_keys = []
     r_matrix = []
     for result in calc_r_results:
         if result is not None:
             pickle_filename, index_basis, r_set = result
             frame_dup_files.append(pickle_filename)
             frame_keys.append(index_basis)
             if len(r_matrix) == 0:
                 r_matrix = r_set
             else:
                 r_matrix = np.append(r_matrix, r_set, axis=0)
     print "Minimizing frame distance"
     idah = indamb_handler()
     x_set = idah.optimize(r_matrix, flag_plot=iparams.flag_plot)
     x_pickle = {'frame_dup_files':frame_dup_files, 'frame_keys':frame_keys, \
       'r_matrix':r_matrix, 'x_set':x_set}
     pickle.dump(x_pickle,
                 open(iparams.run_no + '/index_ambiguity/x.out', "wb"))
     print "Clustering results"
     kmh = kmeans_handler()
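     # number of clusters follows from the alternative index bases of the first frame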
     k = 2**(len(idah.get_observations(frame_dup_files[0], iparams)) - 1)
     centroids, labels = kmh.run(x_set, k, flag_plot=iparams.flag_plot)
     print "Get solution pickle"
     sample_fname = iparams.run_no + '/index_ambiguity/sample.lst'
     sol_pickle = idah.assign_basis(frame_dup_files, frame_keys, labels, k,
                                    sample_fname)
     pickle.dump(sol_pickle, open(sol_fname, "wb"))
     # if more frames remain, merge the sample frames to get a reference set
     # that can be used for breaking the ambiguity.
     txt_merge_out = None
     if n_frames > iparams.indexing_ambiguity.n_selected_frames:
         print("Breaking the indexing ambiguity for the remaining images.")
         old_iparams_data = iparams.data[:]
         iparams.indexing_ambiguity.index_basis_in = sol_pickle
         # generate a reference set from the solved frames
         with open(sample_fname) as f:
             frame_files_processed = f.read().split('\n')[:-1]
         from prime.command_line.postrefine import scale_frames, merge_frames
         scaled_pres_set = scale_frames(range(len(frame_files_processed)),
                                        frame_files_processed, iparams)
         mdh, txt_merge_out = merge_frames(
             scaled_pres_set, iparams, mtz_out_prefix='index_ambiguity/ref')
         miller_array_ref = mdh.miller_array_merge
         # set up the list of remaining frames
         frame_files_remain = [
             frame for frame in frame_files if frame not in sol_pickle
         ]
         frames = [(i, frame_files_remain[i], iparams, miller_array_ref)
                   for i in range(len(frame_files_remain))]
         cc_results = parallel_map(iterable=frames,
                                   func=solve_with_mtz_mproc,
                                   processes=iparams.n_processors)
         for result in cc_results:
             pickle_filename, index_basis = result
             sol_pickle[pickle_filename] = index_basis
         iparams.data = old_iparams_data[:]
     # write out the solution pickle
     with open(sol_fname, 'wb') as f:
         pickle.dump(sol_pickle, f)
     # write out text output
     txt_out = "Solving indexing ambiguity complete. Solution file saved to " + sol_fname + "\n"
     if txt_merge_out:
         txt_out += "Reference set used to solve the indexing ambiguity problem:\n" + txt_merge_out
     with open(iparams.run_no + '/log.txt', 'a') as f:
         f.write(txt_out)
     print "Indexing Ambiguity Solver Elapsed Time (s) %10.2s" % (
         time.time() - start)
     return sol_pickle, iparams
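
The bootstrap stage of both examples comes down to a per-image test: re-index the observations under each alternative basis and keep the basis whose intensities correlate best with the merged reference set. Below is a hedged, numpy-only sketch of that decision on {hkl: intensity} dicts; pick_basis and correlation_to_ref are hypothetical helpers, not PRIME's API.

import numpy as np

def correlation_to_ref(obs, ref):
    # correlation over the reflections the image shares with the reference
    common = sorted(set(obs) & set(ref))
    if len(common) < 3:
        return -1.0  # too few common reflections to judge
    a = np.array([obs[hkl] for hkl in common])
    b = np.array([ref[hkl] for hkl in common])
    return np.corrcoef(a, b)[0, 1]

def pick_basis(obs_by_basis, ref):
    # return the index basis whose re-indexed observations best match the reference
    return max(obs_by_basis,
               key=lambda basis: correlation_to_ref(obs_by_basis[basis], ref))

# toy data: a merged reference and one image re-indexed under two bases
ref = {(1, 0, 0): 10.0, (0, 1, 0): 5.0, (0, 0, 1): 1.0, (1, 1, 0): 7.0}
obs_by_basis = {
    'h,k,l':   {(1, 0, 0): 9.5, (0, 1, 0): 5.2, (0, 0, 1): 1.1, (1, 1, 0): 6.8},
    '-h,-k,l': {(1, 0, 0): 1.2, (0, 1, 0): 9.8, (0, 0, 1): 5.1, (1, 1, 0): 2.0},
}
print(pick_basis(obs_by_basis, ref))  # -> 'h,k,l'

In the real code this comparison runs per image inside solve_with_mtz_mproc over the parallel map, with the reference coming either from user-supplied mtz files or from merging the frames solved in the sampling stage.
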