def get_expression_profile(expression_level, genes, expression_bins, input_format,
                           output_format, species, tmp, symmetric_expression):
    df = pd.DataFrame({'genes': genes, 'expression_level': expression_level})
    df = df[df.iloc[:, 1].notna()]
    df = df.sort_values(by=df.columns[1])
    expression_level = np.array(df.iloc[:, 1])
    if symmetric_expression:
        # discretize negative and non-negative values separately, then shift the
        # non-negative bin indices so the two halves do not overlap
        left = MI.discretize(expression_level[expression_level < 0],
                             expression_bins // 2)
        right = MI.discretize(expression_level[expression_level >= 0],
                              expression_bins // 2 + expression_bins % 2)
        right += expression_bins // 2
        expression_profile = np.concatenate((left, right))
    else:
        expression_profile = MI.discretize(expression_level, expression_bins)
    genes = list(df.iloc[:, 0])
    genes = [gene.split('.')[0] for gene in genes]  # strip version suffixes from accessions
    if input_format and output_format and input_format != output_format:
        genes = change_accessions(genes, input_format, output_format, species, tmp)
    gene_dict = dict(zip(genes, expression_profile))
    expression_profile = np.array(
        [gene_dict[gene] for gene in gene_dict.keys() if gene != '-'])
    genes = [gene for gene in gene_dict.keys() if gene != '-']
    return expression_profile, genes
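# Illustrative sketch (not part of the original pipeline): the symmetric branch
# above bins negative and non-negative values separately and then offsets the
# non-negative bin indices, so zero always falls on a bin boundary. A minimal
# numpy-only analogue, with a rank-based equal-frequency binner standing in for
# MI.discretize (an assumption about its behavior):
def _symmetric_binning_demo():
    values = np.array([-3., -1., -0.5, 0.2, 0.9, 2.1])
    bins = 4
    # assign equal-frequency bin indices by rank (stand-in for MI.discretize)
    equal_freq = lambda v, b: np.floor(np.argsort(np.argsort(v)) * b / len(v)).astype(int)
    left = equal_freq(values[values < 0], bins // 2)
    right = equal_freq(values[values >= 0], bins // 2 + bins % 2) + bins // 2
    profile = np.concatenate((left, right))
    # negative values land in the lower bins, non-negative in the upper ones
    assert profile.min() >= 0 and profile.max() < bins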
def prepare(bags, class_prior, L, U, T):
    """
    Parameters
    ----------
    bags        : original dataset
    class_prior : the ratio of positive samples
    L           : the number of labeled samples in the output dataset
    U           : the number of unlabeled samples in the output dataset
    T           : the number of test samples in the output dataset
    """
    # original data
    p_bags = MI.extract_bags(bags, 1, with_label=True)
    n_bags = MI.extract_bags(bags, -1, with_label=True)
    random.shuffle(p_bags)
    random.shuffle(n_bags)
    P = len(p_bags)
    N = len(n_bags)
    retry_count = 0
    while retry_count < 5:
        try:
            return _prepare(p_bags, n_bags, P, N, class_prior, L, U, T)
        except Exception:
            # if the obtained split is invalid, try sampling again
            sys.stderr.write("Warning: retrying train-test split "
                             "(consider changing the splitting numbers)\n")
            retry_count += 1
            continue
def min_CI_normalized_test(counter, accepted_seeds_list, profiles_passed,
                           discr_exp_profile, nbins, index_array,
                           min_ratio, do_print=False):
    profile_full = profiles_passed[counter]
    profile_being_analyzed = profile_full[index_array]
    for i in range(len(accepted_seeds_list)):
        ith_accepted_profile_full = profiles_passed[accepted_seeds_list[i]]
        ith_accepted_profile = ith_accepted_profile_full[index_array]
        cond_inf = MI.cond_mut_info(profile_being_analyzed, discr_exp_profile,
                                    ith_accepted_profile,
                                    x_bins=2, y_bins=nbins, z_bins=2)
        mut_inf = MI.mut_info(profile_being_analyzed, ith_accepted_profile,
                              x_bins=2, y_bins=2)
        if np.isclose(mut_inf, 0., atol=1e-16):
            mut_inf = 1e-16  # avoid division by zero
        ratio = cond_inf / mut_inf
        if do_print:
            print("Comparing seed #%d to an existing seed #%d. The ratio is %.2f"
                  % (counter, i, ratio))
        if ratio < min_ratio:
            # return the index of the accepted seed that is similar to the current one
            return False, i
    return True, 0
def train(bags, s, l, args):
    P = np.vstack(MI.extract_bags(bags, 0))
    Q = np.vstack(MI.extract_bags(bags, 1))
    n = len(P)
    m = len(Q)
    X = np.vstack((P, Q))
    KP = np.exp(-(r(P**2) - 2 * P.dot(X.T) + r(X**2).T) / (2 * s**2))
    KQ = np.exp(-(r(Q**2) - 2 * Q.dot(X.T) + r(X**2).T) / (2 * s**2))
    # initialization step
    L = np.r_[np.c_[l * np.eye(n + m), np.zeros((n + m, n)), np.zeros((n + m, m))],
              np.c_[np.zeros((n, n + m)), np.zeros((n, n)), np.zeros((n, m))],
              np.c_[np.zeros((m, n + m)), np.zeros((m, n)), np.zeros((m, m))]]
    k = np.r_[np.zeros((n + m, 1)),
              np.ones((n, 1)) / n,
              np.ones((m, 1)) / m]
    G = np.r_[np.c_[np.zeros((n, n + m)), -np.eye(n), np.zeros((n, m))],
              np.c_[KP, -np.eye(n), np.zeros((n, m))],
              np.c_[np.zeros((m, n + m)), np.zeros((m, n)), -np.eye(m)],
              np.c_[KQ, np.zeros((m, n)), -np.eye(m)]]
    h = np.r_[np.zeros((n, 1)),
              -np.ones((n, 1)),
              np.zeros((m, 1)),
              np.ones((m, 1))]
    result = cvxopt.solvers.qp(matrix(L), matrix(k), matrix(G), matrix(h))
    a = np.array(result['x'])[:n + m]
    T = 10
    for t in range(T):
        # tighten the upper bound
        b = KP.dot(a) >= 1
        c = KQ.dot(a) >= -1
        # minimize the upper bound
        k = np.r_[-KP.T.dot(b) / n - KQ.T.dot(c) / m,
                  np.ones((n, 1)) / n,
                  np.ones((m, 1)) / m]
        result = cvxopt.solvers.qp(matrix(L), matrix(k), matrix(G), matrix(h))
        a = np.array(result['x'])[:n + m]

    def classifier(x):
        x = x.reshape(1, -1)
        return a.T.dot(
            np.exp(-(r(X**2) - 2 * X.dot(x.T) + r(x**2).T) / (2 * s**2)))

    return lambda X: np.max([classifier(x) for x in X])
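# Illustrative check (hypothetical data): the kernel matrices above rely on the
# expansion ||p - x||^2 = ||p||^2 - 2 p.x + ||x||^2 applied row-wise, where r(.)
# is assumed to return row sums reshaped to a column vector so that broadcasting
# produces the full pairwise squared-distance matrix.
def _pairwise_sqdist_demo():
    rng = np.random.RandomState(0)
    P, X = rng.rand(4, 3), rng.rand(5, 3)
    row_sums = lambda A: A.sum(axis=1).reshape(-1, 1)  # assumed behavior of r(.)
    D = row_sums(P**2) - 2 * P.dot(X.T) + row_sums(X**2).T
    brute = np.array([[((p - x)**2).sum() for x in X] for p in P])
    assert np.allclose(D, brute)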
def prediction_error(bags, model, theta):
    N1 = len(MI.extract_bags(bags, 1))
    N0 = len(MI.extract_bags(bags, 0))
    error = nc_risk(theta, N1, N0, zero_one_loss)
    return sum(
        float(error(model(B.data()),
                    Variable(np.array([[B.label()]]).astype(np.float32))).data)
        for B in bags) - theta
def _class_prior(bags, basis, r):
    # cf. (du Plessis et al., 2014)
    p_bags = MI.extract_bags(bags, 1)
    u_bags = MI.extract_bags(bags, 0)
    n1 = len(p_bags)
    n0 = len(u_bags)
    H = 1. / n1 * np.sum([np.outer(basis(B), basis(B).T) for B in p_bags], axis=0)
    h = 1. / n0 * np.sum([basis(B) for B in u_bags], axis=0)
    G = H + r * np.eye(n1 + n0)
    G_ = np.linalg.inv(G)
    return (2 * h.T.dot(G_.dot(h)) - h.T.dot(G_.dot(H.dot(G_.dot(h)))))**(-1)
def validation_error(validation_set, training_set, s, l, t):
    X = np.vstack((np.vstack(MI.extract_bags(training_set, 1)),
                   np.vstack(MI.extract_bags(training_set, 0))))
    d = X.shape[1]
    P = np.vstack(MI.extract_bags(validation_set, 1))
    Q = np.vstack(MI.extract_bags(validation_set, 0))
    H = (np.pi * s**2)**(d / 2) * np.exp(
        -(r(X**2) - 2 * X.dot(X.T) + r(X**2).T) / (4 * s**2))
    h = np.exp(-(r(X**2) - 2 * X.dot(P.T) + r(P**2).T) / (2 * s**2)).mean(axis=1) \
        - np.exp(-(r(X**2) - 2 * X.dot(Q.T) + r(Q**2).T) / (2 * s**2)).mean(axis=1)
    return t.dot(H.dot(t)) - 2 * h.T.dot(t)
def train(bags, width, reg, args):
    P = np.vstack(MI.extract_bags(bags, 1))
    Q = np.vstack(MI.extract_bags(bags, 0))
    t = LSDD(P, Q, width, reg)
    X = np.vstack((P, Q))

    def classifier(x):
        x = x.reshape(1, -1)
        return t.T.dot(np.exp(-(r(X**2) - 2 * X.dot(x.T) + r(x**2).T)
                              / (2 * width**2)))

    return lambda X: np.max([classifier(x) for x in X])
def train_lsdd(data, args):
    widths = [1.0e-2, 1.0e-4, 1.0e-6]
    regs = [1.0, 1.0e-03, 1.0e-06]

    def train(data, width, reg, measure_time=False):
        if measure_time:
            t_start = time.time()
        model = MI.UU.LSDD.train(data, width, reg, args)
        metadata = {'width': width, 'reg': reg}
        if measure_time:
            t_end = time.time()
            print("# elapsed time = {}".format(t_end - t_start))
        return model, metadata

    # cross validation
    best_param = {}
    best_error = np.inf
    if args.verbose:
        print("# *** Cross Validation ***")
    for width, reg in itertools.product(widths, regs):
        errors = []
        for data_train, data_val in MI.cross_validation(data, 5):
            t = MI.UU.LSDD.LSDD(np.vstack(MI.extract_bags(data_train, 1)),
                                np.vstack(MI.extract_bags(data_train, 0)),
                                width, reg)
            e = MI.UU.LSDD.validation_error(data_val, data_train, width, reg, t)
            errors.append(e)
        error = np.mean(errors)
        if args.verbose:
            print("# width = {:.3e} / reg = {:.3e} / error = {:.3e}".format(
                width, reg, error))
        if error < best_error:
            best_error = error
            best_param = {'width': width, 'reg': reg}
    if args.verbose:
        print("# {}".format('-' * 80))
    model, metadata = train(data, best_param['width'], best_param['reg'],
                            measure_time=True)
    return model, best_param
def main():
    import_modules()
    args = handler()
    index_array, values_array = IO.unpack_mask_file(args.exp_mask_file)
    discr_exp_profile = MI.discretize_exp_profile(index_array, values_array,
                                                  nbins=args.nbins)
    seeds_passed = IO.read_motif_file(args.combined_seeds_filename)
    profiles_passed = IO.unpack_profiles_file(args.combined_profiles_filename)
    classification_array, N_families = filter_CMI(
        seeds_passed, profiles_passed, discr_exp_profile, index_array,
        args.nbins, args.min_ratio, do_print=args.do_print)
    MI_values_array = calculate_MIs_all_seeds(profiles_passed, discr_exp_profile,
                                              index_array, args.nbins)
    seeds_unique, profiles_unique = choose_best_reps_for_families(
        seeds_passed, profiles_passed, classification_array, N_families,
        MI_values_array, do_print=args.do_print)
    IO.write_list_of_seeds(seeds_unique, args.unique_seeds_filename)
    IO.write_array_of_profiles(profiles_unique, args.unique_profiles_filename)
    IO.write_classification_array(classification_array,
                                  args.families_classification_filename)
def MI_get_pvalue_and_zscore(active_profile, discr_exp_profile, nbins,
                             current_MI, n_permutations):
    shuffled_MI_values = np.zeros(n_permutations, dtype=np.float64)
    for i in range(n_permutations):
        shuffled_expr = np.random.permutation(discr_exp_profile)
        shuffled_MI_values[i] = MI.mut_info(active_profile, shuffled_expr,
                                            x_bins=2, y_bins=nbins)
    shuffled_MI_values.sort()
    if current_MI < shuffled_MI_values[0]:
        # shortcut: if the current MI is below the minimal permuted MI,
        # every permutation beats it
        value_undiv = n_permutations
    else:
        # walk from right to left while the shuffled score is >= the real one
        j = n_permutations - 1
        while j >= 0 and current_MI <= shuffled_MI_values[j]:
            j -= 1
        value_undiv = n_permutations - j - 1
    pvalue = value_undiv / float(n_permutations)
    z_score = (current_MI - np.mean(shuffled_MI_values)) / np.std(shuffled_MI_values)
    return pvalue, z_score
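# Equivalent formulation (illustrative, not from the original source): since
# shuffled_MI_values is sorted, the right-to-left walk above simply counts the
# permuted values that are >= current_MI, which np.searchsorted gives directly.
def _permutation_pvalue_demo():
    shuffled = np.sort(np.array([0.1, 0.3, 0.2, 0.5]))
    current = 0.25
    count_ge = len(shuffled) - np.searchsorted(shuffled, current, side='left')
    assert count_ge == 2                             # 0.3 and 0.5 are >= 0.25
    assert count_ge / float(len(shuffled)) == 0.5    # the permutation p-value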
def main():
    # only import things when this script is run directly: the relative imports
    # are resolved based on the current working directory, so otherwise the
    # package would have to be installed for them to work
    import_modules()
    args = handler()
    # get the mapping of task ids to input files
    mapping_dict = sge.parse_task_mapping_file(args.task_mapping_file)
    # get the task id
    env_variables_dict = sge.get_env_variables()
    # get the names of the input and output files
    profiles_filename_full, MI_values_filename_full, rna_bin_filename = \
        get_current_in_out_filenames(args, env_variables_dict, mapping_dict)
    decompressed_profiles_array, index_array, values_array = \
        IO.unpack_profiles_and_mask(profiles_filename_full, args.exp_mask_file,
                                    do_print=True)
    discr_exp_profile = MI.discretize_exp_profile(index_array, values_array,
                                                  args.nbins)
    MI_values_array = calculate_MI_for_seeds(decompressed_profiles_array,
                                             index_array, discr_exp_profile,
                                             args.nbins, args.min_occurences,
                                             do_print=True)
    IO.write_MI_values(MI_values_array, args.nbins, MI_values_filename_full)
    if args.print_qstat == 'y':
        sge.print_qstat_proc(env_variables_dict, args.path_to_qstat)
def main():
    import_modules()
    args = handler()
    n_seqs_list = read_sequences(args.rna_bin_file)
    index_array, values_array = IO.unpack_mask_file(args.exp_mask_file)
    discr_exp_profile = MI.discretize_exp_profile(index_array, values_array,
                                                  nbins=args.nbins)
    seeds_initial = IO.read_motif_file(args.unique_seeds_filename)
    profiles_initial = IO.unpack_profiles_file(args.unique_profiles_filename)
    seqs_of_interest = [n_seqs_list[x] for x in range(index_array.shape[0])
                        if index_array[x]]
    # get the task id
    env_variables_dict = sge.get_env_variables()
    seed_chunks, profiles_chunks = chunk_up_input_files(
        seeds_initial, profiles_initial, args.size_of_chunks)
    seed_right_chunk, profiles_right_chunk = pick_one_chunk(
        seed_chunks, profiles_chunks, env_variables_dict)
    seeds_filename_full, profiles_filename_full, \
        char_filename_full, robustness_filename_full = \
        make_output_filenames(env_variables_dict, args)
    seeds_optimized, profiles_optimized, \
        seed_charact_array, robustness_array = optimize_motifs(
            seed_right_chunk, profiles_right_chunk, discr_exp_profile,
            args.nbins, index_array, seqs_of_interest, args, do_print=True)
    IO.write_list_of_seeds(seeds_optimized, seeds_filename_full)
    IO.write_array_of_profiles(profiles_optimized, profiles_filename_full)
    IO.write_np_array(seed_charact_array, char_filename_full)
    IO.write_np_array(robustness_array, robustness_filename_full)
def calculate_MI_for_seeds(decompressed_profiles_array, index_array,
                           discr_exp_profile, nbins, min_occurences,
                           do_print=False):
    MI_values_array = np.zeros(decompressed_profiles_array.shape[0],
                               dtype=np.float32)
    for i, profile in enumerate(decompressed_profiles_array):
        active_profile = profile[index_array]
        if active_profile.sum() <= min_occurences:
            # mask out seeds that bind too few transcripts
            MI_values_array[i] = MASK_OUT_SEED_VALUE
            continue
        MI_values_array[i] = MI.mut_info(active_profile, discr_exp_profile,
                                         x_bins=2, y_bins=nbins)
        if do_print and i % 1000 == 0 and i > 0:
            print("Profile number %d has been calculated" % i)
    # make sure all elements are of the same type
    MI_values_array = np.array(MI_values_array, dtype=np.float64)
    return MI_values_array
def affinity(clf, conf, bags, uidx, nidx):
    # evaluate the F-score on the unlabeled set, regarding the "reliable negative
    # bags" as the negative set and all other bags as the positive set
    pidx = list(set(uidx) - set(nidx))
    pred = np.array([clf(bags[i], conf[i]) for i in pidx + nidx])
    true = np.r_[np.ones(len(pidx)), -1 * np.ones(len(nidx))]
    return MI.f_score(pred, true)
def test():
    x = asarray([gauss(0, 1) for i in range(1000)])
    y1 = asarray([int(e > 0) for e in x])
    y2 = asarray([randint(0, 1) for e in x])
    hx, bx = histogram(x, bins=x.size // 10, density=True)
    dx = digitize(x, bx)
    print "X ~ N(0,1)"
    print "y1 = 1 <=> x > 0"
    print "y2 = 1 with probability 0.5"
    print
    print "I(y1;x) = H(X) - H(X|Y1) = %.02f" % mi.mutual_information(x, y1)
    print "I(y1;x) = H(Y1) - H(Y1|X) = %.02f" % mi.mutual_information(y1, dx)
    print
    print "I(y2;x) = H(X) - H(X|Y2) = %.02f" % mi.mutual_information(x, y2)
    print "I(y2;x) = H(Y2) - H(Y2|X) = %.02f" % mi.mutual_information(y2, dx)
def train_sl(bags, basis, bdim, theta, r, args):
    p_bags = MI.extract_bags(bags, 1)
    u_bags = MI.extract_bags(bags, 0)
    N1 = len(p_bags)
    N0 = len(u_bags)
    N = N1 + N0
    P1 = np.array([np.r_[1, basis(B)].T for B in p_bags])
    P0 = np.array([np.r_[1, basis(B)].T for B in u_bags])
    param = np.linalg.inv(0.5 / N0 * P0.T.dot(P0) + r * np.eye(bdim + 1)).dot(
        theta / N1 * P1.T.dot(np.ones((N1, 1)))
        - 0.5 / N0 * P0.T.dot(np.ones((N0, 1))))
    alpha = param[1:]
    beta = float(param[:1])
    clf = lambda X: alpha.T.dot(basis(X)) + beta
    return clf
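# Side note (illustrative, not part of the original source): the closed form
# above inverts the regularized matrix explicitly; np.linalg.solve computes the
# same product with better numerical conditioning. A minimal self-contained check:
def _ridge_solve_demo():
    rng = np.random.RandomState(0)
    A = rng.rand(5, 5)
    A = A.T.dot(A) + 1e-2 * np.eye(5)  # symmetric positive definite, like the Gram term
    b = rng.rand(5, 1)
    assert np.allclose(np.linalg.inv(A).dot(b), np.linalg.solve(A, b))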
def calculate(pdf, variable_time, variables_state, dt, sample_N=1, sample_T=1,
              logbase="log2"):
    """
    Input:
        pdf              joint pdf class
        variable_time    variable identifying the time series (time 0 must be
                         included). Must be ordered, since the first index is
                         used to calculate the entropy used to intersect the
                         entropy curve.
        variables_state  variables representing the state. Can match multiple
                         labels (e.g. "var" matches "var_1", "var_2", "var_3", ...)
        dt               number of timesteps between time series
        sample_N         sample percentage for the variables
        sample_T         sample percentage for the time variable
        logbase          base for the logarithm ("log2", "log", "log10")

    Returns: information integration.
    """
    assert np.isscalar(variable_time), "Only one time variable can be specified"
    assert logbase in ["log2", "log", "log10"], \
        'Logbase parameter must be one of ("log2", "log", "log10")'

    # sample variables
    labels_state = pdf.get_labels(variables_state)
    sampled_pdf = pdf.sample_variables(labels_state, sample_N)
    sampled_labels_state = sampled_pdf.get_labels(variables_state)

    # sample time
    sampled_pdf = sampled_pdf.sample_values([variable_time], [sample_T])
    num_time_series = sampled_pdf.get_num_bins_of(variable_time)

    # calculate information integration for each element (sample)
    II = np.ndarray((num_time_series, len(sampled_labels_state)))
    for i, l_i in enumerate(sampled_labels_state):
        for t in xrange(num_time_series):
            # calculate I(Si^T : {Sj^0}_j): create a joint pdf with the whole
            # initial state joined into a single variable
            state_vars = sampled_labels_state[:]
            state_vars.remove(l_i)
            joint_i_sj_pdf = sampled_pdf.join_dimensions(state_vars, "initial_state")
            MI_i = MI.calculate(joint_i_sj_pdf, l_i, "initial_state", logbase)
            # accumulator for the pairwise terms I(Si^T : Sj^0)
            MI_i_tAcc = 0
            for j, l_j in enumerate(sampled_labels_state):
                MI_i_tAcc += MI.calculate(sampled_pdf, l_i, l_j, logbase)
            II[t, i] = MI_i - MI_i_tAcc
    return II
def minimax_basis(bags, degree=1):
    """
    Build a basis function based on the minimax kernel.

    Parameters
    ----------
    degree : degree of the polynomial kernel.
    """
    degree = int(degree)
    p_bags = MI.extract_bags(bags, 1)
    u_bags = MI.extract_bags(bags, 0)
    n_bags = MI.extract_bags(bags, -1)
    bags = p_bags + u_bags + n_bags
    # represent each bag by its per-feature minima and maxima
    stat = lambda X: np.r_[X.min(axis=0), X.max(axis=0)]
    poly_kern = lambda X, Y: (stat(X).dot(stat(Y)) + 1)**degree
    return lambda X: np.array([poly_kern(X, B) for B in bags])
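# Illustrative sketch (hypothetical data): the minimax statistic maps a bag of
# instances to the concatenation of its per-feature minima and maxima, so bags
# of different sizes become fixed-length vectors before the polynomial kernel
# is applied.
def _minimax_demo():
    bag = np.array([[0., 2.], [1., -1.], [0.5, 0.]])  # 3 instances, 2 features
    stat = np.r_[bag.min(axis=0), bag.max(axis=0)]
    assert np.allclose(stat, [0., -1., 1., 2.])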
def optimize_motifs(seeds_initial, profiles_initial, discr_exp_profile, nbins,
                    index_array, seqs_of_interest, args, do_print=True):
    seeds_optimized = copy.deepcopy(seeds_initial)
    profiles_optimized = np.zeros((len(seeds_initial), discr_exp_profile.shape[0]),
                                  dtype=bool)
    # seed_charact_array keeps MI values, p-values and z-scores
    seed_charact_array = np.zeros((len(seeds_initial), 3), dtype=np.float64)
    robustness_array = np.zeros(len(seeds_initial), dtype=bool)

    for i, motif in enumerate(seeds_initial):
        profile = profiles_initial[i]
        active_profile = profile[index_array]
        n_bestmotif = type_conversions.w_to_n_motif(seeds_initial[i])
        # initial MI value
        init_best_MI = MI.mut_info(active_profile, discr_exp_profile,
                                   x_bins=2, y_bins=nbins)
        lastmyfreq = active_profile.sum() / float(active_profile.shape[0])
        if do_print:
            w_bestmotif = type_conversions.n_to_w_motif(n_bestmotif)
            print("Optimizing the sequence of motif %d (sequence is %s). Initial MI = %.5f"
                  % (i, w_bestmotif.print_sequence(return_string=True), init_best_MI))
        bestmi, lastmyfreq, n_bestmotif = optimize_motif_sequence(
            n_bestmotif, init_best_MI, seqs_of_interest, discr_exp_profile, nbins,
            lastmyfreq, args, do_print=do_print, random_noseed=args.random_noseed)
        if do_print:
            print("Elongating motif %d" % i)
        bestmi, lastmyfreq, n_bestmotif = elongate_motif(
            n_bestmotif, bestmi, seqs_of_interest, discr_exp_profile, nbins,
            lastmyfreq, args, do_print=do_print)
        w_bestmotif = type_conversions.n_to_w_motif(n_bestmotif)
        bestmotif_profile, bestmotif_mi, pvalue, z_score = get_characteristics(
            n_bestmotif, seqs_of_interest, discr_exp_profile, nbins, args,
            do_print=do_print)
        if do_print:
            print("Checking robustness of the optimized motif %d (sequence %s)"
                  % (i, w_bestmotif.print_sequence(return_string=True)))
        is_robust = check_robustness(bestmotif_profile, discr_exp_profile, nbins,
                                     args, do_print=do_print)
        seeds_optimized[i] = w_bestmotif
        profiles_optimized[i] = bestmotif_profile.values
        seed_charact_array[i, :] = np.array([bestmotif_mi, pvalue, z_score],
                                            dtype=np.float64)
        robustness_array[i] = is_robust

    return seeds_optimized, profiles_optimized, seed_charact_array, robustness_array
def get_characteristics(n_bestmotif, seqs_of_interest, discr_exp_profile, nbins,
                        args, do_print=False):
    bestmotif_profile, _time = matchmaker.calculate_profile_one_motif(
        n_bestmotif, seqs_of_interest, is_degenerate=True)
    bestmotif_mi = MI.mut_info(bestmotif_profile.values, discr_exp_profile,
                               x_bins=2, y_bins=nbins)
    pvalue, z_score = statistic_tests.MI_get_pvalue_and_zscore(
        bestmotif_profile.values, discr_exp_profile, nbins, bestmotif_mi,
        args.n_permutations)
    if do_print:
        print("The final p-value is: %.4f, z-score is: %.3f" % (pvalue, z_score))
    return bestmotif_profile, bestmotif_mi, pvalue, z_score
def calculate_MIs_all_seeds(profiles_passed, discr_exp_profile, index_array, nbins):
    MI_values_array = np.zeros(profiles_passed.shape[0], dtype=np.float32)
    for i, profile in enumerate(profiles_passed):
        active_profile = profile[index_array]
        MI_values_array[i] = MI.mut_info(active_profile, discr_exp_profile,
                                         x_bins=2, y_bins=nbins)
    return MI_values_array
def nsk_basis(bags, width=1.0e-01):
    """
    Build a basis function based on the normalized set kernel.
    """
    ins_kern = lambda x, c: np.exp(-width * np.linalg.norm(x - c)**2)
    p_bags = MI.extract_bags(bags, 1)
    u_bags = MI.extract_bags(bags, 0)
    n_bags = MI.extract_bags(bags, -1)
    bags = p_bags + u_bags + n_bags
    # (un-normalized) set kernel: sum of instance kernels over all pairs
    usk = lambda S0, S1: sum(ins_kern(s0, s1)
                             for s0, s1 in itertools.product(S0, S1))
    # normalized set kernel
    nsk = lambda S0, S1: usk(S0, S1) / np.sqrt(usk(S0, S0) * usk(S1, S1))
    return lambda X: np.array([nsk(X, B) for B in bags])
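# Illustrative check (hypothetical data): by construction, the normalized set
# kernel of a bag with itself is exactly 1, which is the point of dividing by
# sqrt(usk(S0, S0) * usk(S1, S1)) above.
def _nsk_demo():
    bag = [np.array([0.0, 1.0]), np.array([1.0, -1.0])]
    ins_kern = lambda x, c: np.exp(-0.1 * np.linalg.norm(x - c)**2)
    usk = lambda S0, S1: sum(ins_kern(s0, s1)
                             for s0, s1 in itertools.product(S0, S1))
    nsk = lambda S0, S1: usk(S0, S1) / np.sqrt(usk(S0, S0) * usk(S1, S1))
    assert np.isclose(nsk(bag, bag), 1.0)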
def get_current_statistics(index, MI_values_array, profiles_array, index_array,
                           discr_exp_profile, args):
    profile = profiles_array[index]
    active_profile = profile[index_array]
    current_MI = MI_values_array[index]
    if current_MI == -1:
        # masked-out seed: return values guaranteed to fail the thresholds
        return args.max_pvalue + 0.1, args.min_zscore - 0.1
    assert np.isclose(current_MI,
                      MI.mut_info(active_profile, discr_exp_profile),
                      rtol=1e-10)
    pvalue, z_score = statistic_tests.MI_get_pvalue_and_zscore(
        active_profile, discr_exp_profile, current_MI, args.n_permutations)
    return pvalue, z_score
def jackknife_test(active_profile, discr_exp_profile, nbins, n_permutations,
                   max_pvalue, n_samples, fraction_retain, min_fraction_passed,
                   do_print=False):
    total_number_passed = 0
    for j in range(n_samples):
        full_indices_array = np.arange(active_profile.shape[0])
        how_many_keep = int(fraction_retain * active_profile.shape[0])
        subsampl_index_array = np.random.choice(full_indices_array,
                                                size=how_many_keep, replace=False)
        curr_profile = active_profile[subsampl_index_array]
        curr_exp_profile = discr_exp_profile[subsampl_index_array]
        curr_MI = MI.mut_info(curr_profile, curr_exp_profile,
                              x_bins=2, y_bins=nbins)
        # test the subsampled profile against the matching subsampled
        # expression values
        pvalue, z_score = MI_get_pvalue_and_zscore(curr_profile, curr_exp_profile,
                                                   nbins, curr_MI, n_permutations)
        if do_print:
            print("Iteration %d. p-value: %.5f; max_pvalue: %.5f, z-score: %.2f"
                  % (j, pvalue, max_pvalue, z_score))
        if pvalue < max_pvalue:
            total_number_passed += 1
    fraction_passed = total_number_passed / float(n_samples)
    if do_print:
        print("%.2f subsamples passed the test; required fraction is %.2f"
              % (fraction_passed, min_fraction_passed))
    if fraction_passed >= min_fraction_passed:
        if do_print:
            print("Passed robustness test")
        return True
    else:
        if do_print:
            print("Did not pass robustness test")
        return False
def main():
    args = handler()
    # read occurrence profiles and the expression profile
    profiles_array, index_array, values_array = IO.unpack_profiles_and_mask(
        args, do_print=False)
    # read precalculated MI values
    MI_values_array, nbins = IO.read_MI_values(args.MI_values_file)
    # find the threshold
    discr_exp_profile = MI.discretize_exp_profile(index_array, values_array, nbins)
    determine_mi_threshold(MI_values_array, discr_exp_profile, profiles_array,
                           index_array, args, do_print=True)
def main():
    # only import things when this script is run directly: the relative imports
    # are resolved based on the current working directory, so otherwise the
    # package would have to be installed for them to work
    import_modules()
    args = handler()
    # get the mapping of task ids to input files
    mapping_dict = sge.parse_task_mapping_file(args.task_mapping_file)
    # get the task id
    env_variables_dict = sge.get_env_variables()
    # get the names of the input and output files
    profiles_filename_full, MI_values_filename_full, \
        passed_seed_filename_full, passed_profiles_filename, \
        seed_filename_full, \
        rna_bin_filename, exp_mask_filename = get_current_in_out_filenames(
            args, env_variables_dict, mapping_dict)
    # read motifs, their profiles and MI values
    profiles_array, index_array, values_array = IO.unpack_profiles_and_mask(
        profiles_filename_full, exp_mask_filename, do_print=True)
    w_motifs_list = IO.read_motif_file(seed_filename_full)
    MI_values_array, nbins = IO.read_MI_values(MI_values_filename_full)
    # find the threshold
    discr_exp_profile = MI.discretize_exp_profile(index_array, values_array, nbins)
    last_positive_seed = determine_mi_threshold(
        MI_values_array, discr_exp_profile, nbins, profiles_array, index_array,
        args, do_print=True)
    write_seeds_passed(last_positive_seed, MI_values_array, w_motifs_list,
                       passed_seed_filename_full)
    write_profiles_passed(last_positive_seed, MI_values_array, profiles_array,
                          passed_profiles_filename)
    if args.print_qstat == 'y':
        sge.print_qstat_proc(env_variables_dict, args.path_to_qstat)
def are_there_better_motifs(n_modified_motifs, seqs_of_interest, discr_exp_profile,
                            nbins, bestmi, n_bestmotif, lastmyfreq, args,
                            do_print=True):
    for curr_motif in n_modified_motifs:
        current_profile, time_spent = matchmaker.calculate_profile_one_motif(
            curr_motif, seqs_of_interest, is_degenerate=True)
        myfreq = current_profile.values.sum() / float(len(seqs_of_interest))
        tempmi = MI.mut_info(current_profile.values, discr_exp_profile,
                             x_bins=2, y_bins=nbins)
        if tempmi > bestmi and current_profile.values.sum() > args.min_occurences \
                and (myfreq < args.maxfreq or myfreq < lastmyfreq):
            n_bestmotif = structures.copy_n_motif(curr_motif)
            w_bestmotif = type_conversions.n_to_w_motif(n_bestmotif)
            bestmi = tempmi
            lastmyfreq = myfreq
            if do_print:
                print("New motif (MI = %.4f): %s"
                      % (bestmi, w_bestmotif.print_sequence(return_string=True)))
    return bestmi, lastmyfreq, n_bestmotif
def individual(pdf, variable_time, variables_idt, dt, sample_N=1, sample_T=1,
               logbase="log2"):
    """
    Input:
        pdf            joint pdf class
        variable_time  variable identifying the time series (time 0 must be
                       included). Must be ordered, since the first index is
                       used to calculate the entropy used to intersect the
                       entropy curve.
        variables_idt  variables to calculate the IDT for. Can match multiple
                       labels (e.g. "var" matches "var_1", "var_2", "var_3", ...)
        dt             number of timesteps between time series
        sample_N       sample percentage for the IDT variables
        sample_T       sample percentage for the time variable
        logbase        base for the logarithm ("log2", "log", "log10")

    Returns: IDT calculated on the variables_idt.
    """
    assert np.isscalar(variable_time), "Only one time variable can be specified"

    # sample variables
    labels_idt = pdf.get_labels(variables_idt)
    sampled_pdf = pdf.sample_variables(labels_idt, sample_N)
    sampled_labels_idt = sampled_pdf.get_labels(variables_idt)

    # before sampling time, calculate the entropy at time 0
    joint_time_0 = sampled_pdf.filter_joint_probabilities([variable_time], [0])
    joint_time_0.normalize()
    h_initial = shannon.calculate(joint_time_0, sampled_labels_idt)

    # sample time
    sampled_pdf = sampled_pdf.sample_values([variable_time], [sample_T], True)
    num_time_series = sampled_pdf.get_num_bins_of(variable_time)

    # maximum value for IDT when there is not enough decay
    IDT_max = num_time_series * dt
    num_variables = len(sampled_labels_idt)
    IDT_var = np.ndarray((num_variables))
    IDT_var[:] = IDT_max

    # calculate IDT for each element
    for i, l_i in enumerate(sampled_labels_idt):
        # target decay limit
        h_target = h_initial[i] / 2
        # maximum mutual information per time step
        max_I = np.ndarray((num_time_series))
        # time series mutual information
        for t in xrange(num_time_series - 1):
            # filter the combination with time t
            pdf_t = sampled_pdf.filter_joint_probabilities([variable_time],
                                                           [[t + 1]])
            pdf_t.normalize()
            mi = np.ndarray((num_variables))
            # mutual information at time t, between var i at the initial state
            # and each var j at time t
            for j, l_j in enumerate(sampled_labels_idt):
                mi[j] = MI.calculate(pdf_t, l_i, l_j, logbase)
            max_I[t] = np.amax(mi)
        # find t crossing the target decay
        for t in xrange(num_time_series - 1):
            # interpolate when found
            if max_I[t + 1] - h_target < 0:
                t1 = t
                t2 = t + 1
                h1 = max_I[t1]
                h2 = max_I[t2]
                if h2 - h1 == 0:
                    IDT_var[i] = 0
                else:
                    IDT_var[i] = (t1 + (t2 - t1) * (h_target - h1) / (h2 - h1)) * dt
                break
    return IDT_var
def individual(initial, times, dt, bin_values, continuous_bins,
               sample_N1, sample_N2, sample_t, logbase="log2"):
    """
    IDT individual metric

    Input:
        initial          initial data, NxP (N = elements, P = population)
        times            time state data, TxNxP (T = time series, N = elements,
                         P = population)
        dt               number of timesteps between time series
        bin_values       values of the bins
        continuous_bins  true if the values of the bins are continuous
        sample_N1        percentage of elements to sample for state 0
        sample_N2        percentage of elements to sample for state t
        sample_t         percentage of elements to sample for the time series
        logbase          base for the logarithm ("log2", "log", "log10")

    Returns:
        IDT, shape N (N = elements)
    """
    assert logbase in ["log2", "log", "log10"], \
        'Logbase parameter must be one of ("log2", "log", "log10")'
    assert 0 < sample_N1 <= 1, "Sample for N1 must be within (0, 1]"
    assert 0 < sample_N2 <= 1, "Sample for N2 must be within (0, 1]"
    assert 0 < sample_t <= 1, "Sample for time must be within (0, 1]"

    number_of_bins = len(bin_values)
    if continuous_bins:
        number_of_bins = number_of_bins - 1

    # sampling the input data
    sample_elements_1 = np.arange(len(initial))
    sample_elements_2 = np.arange(len(initial))
    sample_time = np.arange(len(times))
    np.random.shuffle(sample_elements_1)
    np.random.shuffle(sample_elements_2)
    np.random.shuffle(sample_time)
    sample_elements_1 = sample_elements_1[:int(len(initial) * sample_N1)]
    sample_elements_2 = sample_elements_2[:int(len(initial) * sample_N2)]
    sample_time = sample_time[:int(len(times) * sample_t)]
    sample_time = np.sort(sample_time)
    times_sampled = times[sample_time]
    initial_sampled = initial[sample_elements_1]
    initial_sampled_2 = initial[sample_elements_2]
    initial_sampled_len = len(initial_sampled)
    times_sampled_len = len(times_sampled)

    # maximum value for IDT when there is not enough decay
    IDT_max = len(times) * dt

    # initial marginal pdfs
    pdf_initial = PDF.single(initial_sampled, bin_values, continuous_bins)
    pdf_initial_2 = PDF.single(initial_sampled_2, bin_values, continuous_bins)

    # initial entropy
    h_initial = shannon.calculate(pdf_initial, logbase)

    # temporal marginal pdfs
    pdf_t = np.ndarray((len(sample_time), len(sample_elements_2), number_of_bins),
                       dtype="float")
    for t in xrange(len(sample_time)):
        pdf_t[t] = PDF.single(times_sampled[t][sample_elements_2, ...],
                              bin_values, continuous_bins)

    # calculate IDT for each element (sample)
    IDT_var = np.ndarray(initial_sampled_len, dtype="float")
    init = time.clock()
    for i in xrange(initial_sampled_len):
        # target decay limit
        h_target = h_initial[i] / 2
        # maximum mutual information per time step
        max_I = np.ndarray(times_sampled_len + 1, dtype="float")
        initial_sampled_i_len = len(initial_sampled[i])
        found = False

        # initial mutual information
        mi_init = np.ndarray(len(initial_sampled_2), dtype="float")
        for j in xrange(len(initial_sampled_2)):
            initial_sampled_i_len_2 = len(initial_sampled_2[j])
            # joint pdf of element i and element j, both at the initial state
            pdf_joint = PDF.joint(
                initial_sampled[i].reshape(1, initial_sampled_i_len),
                bin_values, continuous_bins,
                initial_sampled_2[j].reshape(1, initial_sampled_i_len_2),
                bin_values, continuous_bins,
            )
            mi_init[j] = MI.calculate(
                pdf_initial[i].reshape(1, number_of_bins),
                pdf_initial_2[j].reshape(1, number_of_bins),
                pdf_joint, logbase,
            )
        max_I[0] = np.amax(mi_init)

        # time series mutual information
        for t in xrange(times_sampled_len):
            mi = np.ndarray(len(initial_sampled_2), dtype="float")
            # mutual information at time t
            for j in xrange(len(initial_sampled_2)):
                # joint pdf of element i at the initial state and element j at time t
                pdf_joint = PDF.joint(
                    initial_sampled[i].reshape(1, initial_sampled_i_len),
                    bin_values, continuous_bins,
                    times_sampled[t][sample_elements_2][j].reshape(
                        1, len(times_sampled[t][sample_elements_2][j])),
                    bin_values, continuous_bins,
                )
                mi[j] = MI.calculate(
                    pdf_initial[i].reshape(1, number_of_bins),
                    pdf_t[t, j, :].reshape(1, number_of_bins),
                    pdf_joint, logbase,
                )
            max_I[t + 1] = np.amax(mi)

        # find t crossing the target decay
        for t in xrange(times_sampled_len):
            # interpolate when found
            if max_I[t + 1] - h_target < 0:
                t1 = t
                t2 = t + 1
                found = True
                h1 = max_I[t1]
                h2 = max_I[t2]
                if h2 - h1 == 0:
                    IDT_var[i] = 0
                else:
                    IDT_var[i] = (t1 + (t2 - t1) * (h_target - h1) / (h2 - h1)) * dt
                break

        # set the maximum IDT value when the decay point was not found
        if not found:
            IDT_var[i] = IDT_max

    return IDT_var
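# Illustrative check (hypothetical numbers): the interpolation above finds where
# the mutual-information curve crosses the half-entropy target between two
# consecutive time steps t1 and t2 = t1 + 1, then scales by dt.
def _idt_interpolation_demo():
    t1, t2, dt = 3, 4, 0.5
    h1, h2, h_target = 0.8, 0.4, 0.6  # the curve drops through the target
    idt = (t1 + (t2 - t1) * (h_target - h1) / (h2 - h1)) * dt
    assert abs(idt - 1.75) < 1e-12    # crossing halfway between t1 and t2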
def calculateMetric(metric_name, param_vals):
    if metric_name == 'count':
        if len(param_vals) != 1:
            print 'ERROR: Error in', metric_name, ': incorrect number of parameters. It must be count(data)'
            raise Exception()
        return red.count(*param_vals)
    elif metric_name == 'pdf':
        if len(param_vals) != 3:
            print 'ERROR: Error in', metric_name, ': incorrect number of parameters. It must be pdf(data, bin_values, continuous_bins)'
            raise Exception()
        return PDF.single(*param_vals)
    elif metric_name == 'deft':
        if len(param_vals) < 2 or len(param_vals) > 3:
            print 'ERROR: Error in', metric_name, ': incorrect number of parameters. It must be deft(data, g, alpha)'
            raise Exception()
        return deft.deft(*param_vals)
    elif metric_name == 'pdf_joint':
        if len(param_vals) != 6:
            print 'ERROR: Error in', metric_name, ': incorrect number of parameters. It must be pdf_joint(dataA, bin_valuesA, continuous_binsA, dataB, bin_valuesB, continuous_binsB)'
            raise Exception()
        return PDF.joint(*param_vals)
    elif metric_name == 'mutual_information':
        if len(param_vals) < 3 or len(param_vals) > 4:
            print 'ERROR: Error in', metric_name, ': incorrect number of parameters. It must be mutual_information(pdfA, pdfB, joint_pdf, logbase="log2")'
            raise Exception()
        return MI.calculate(*param_vals)
    elif metric_name == 'shannon':
        if len(param_vals) < 1 or len(param_vals) > 2:
            print 'ERROR: Error in', metric_name, ': incorrect number of parameters. It must be shannon(pdf, logbase="log2")'
            raise Exception()
        return shannon.calculate(*param_vals)
    elif metric_name == 'kullback-leibler':
        if len(param_vals) < 2 or len(param_vals) > 3:
            print 'ERROR: Error in', metric_name, ': incorrect number of parameters. It must be kullback-leibler(pdf_p, pdf_q, logbase="log2")'
            raise Exception()
        return kullback.calculate(*param_vals)
    elif metric_name == 'fisher':
        if len(param_vals) < 2 or len(param_vals) > 3:
            print 'ERROR: Error in', metric_name, ': incorrect number of parameters. It must be fisher(pdf, eps, logbase="log2")'
            raise Exception()
        return fis.calculate(*param_vals)
    elif metric_name == 'hellinger-distance':
        if len(param_vals) != 2:
            print 'ERROR: Error in', metric_name, ': incorrect number of parameters. It must be hellinger-distance(pdf_p, pdf_q)'
            raise Exception()
        return hellinger.calculate(*param_vals)
    elif metric_name == 'surprise':
        if len(param_vals) != 1:
            print 'ERROR: Error in', metric_name, ': incorrect number of parameters. It must be surprise(prob)'
            raise Exception()
        return surprise.calculate(*param_vals)
    elif metric_name == 'idt':
        if len(param_vals) < 6 or len(param_vals) > 7:
            print 'ERROR: Error in', metric_name, ': incorrect number of parameters. It must be idt(initial, time_series, epsilon, dt, bin_values, continuous_bins, logbase="log2")'
            raise Exception()
        return IDT.system(*param_vals)
    elif metric_name == 'idt_individual':
        if len(param_vals) < 8 or len(param_vals) > 9:
            print 'ERROR: Error in', metric_name, ': incorrect number of parameters. It must be idt_individual(initial, time_series, dt, bin_values, continuous_bins, sample_state_0, sample_state_t, sample_time, logbase="log2")'
            raise Exception()
        return IDT.individual(*param_vals)
    elif metric_name == 'information_integration':
        if len(param_vals) < 9 or len(param_vals) > 10:
            print 'ERROR: Error in', metric_name, ': incorrect number of parameters. It must be information_integration(initial, group, dt, bin_values, continuous_bins, sample_N1, sample_N2, sample_G, sample_t, logbase="log2")'
            raise Exception()
        return II.calculate(*param_vals)
    elif metric_name == 'multi_information':
        if len(param_vals) < 6 or len(param_vals) > 7:
            print 'ERROR: Error in', metric_name, ': incorrect number of parameters. It must be multi_information(data, bin_values, continuous_bins, sample_var, sample_elems, sample_pop, logbase="log2")'
            raise Exception()
        return multi.calculate(*param_vals)
    elif metric_name == 'swap_axes':
        if len(param_vals) != 3:
            print 'ERROR: Error in', metric_name, ': incorrect number of parameters. It must be swap_axes(data, axis0, axis1)'
            raise Exception()
        return np.swapaxes(*param_vals)
    elif metric_name == 'add_dimension':
        if len(param_vals) != 2:
            print 'ERROR: Error in', metric_name, ': incorrect number of parameters. It must be add_dimension(data, dimNumber)'
            raise Exception()
        return np.expand_dims(*param_vals)
    elif metric_name == 'join_dimensions':
        if len(param_vals) != 3:
            print 'ERROR: Error in', metric_name, ': incorrect number of parameters. It must be join_dimensions(data, dimNumberA, dimNumberB)'
            raise Exception()
        return red.join(*param_vals)
    else:
        # fall back to a numpy function of the same name
        try:
            func = getattr(np, metric_name)
        except AttributeError:
            print 'ERROR: Metric', metric_name, 'does not exist'
            raise Exception()
        return func(*param_vals)
def calculateMetric(metric_name, param_vals):
    '''
    Calculates a metric.

    Input:
        metric_name  metric name
        param_vals   metric parameters

    Returns: the result of the metric
    '''
    if metric_name == 'count':
        if len(param_vals) != 1:
            print 'ERROR: Error in', metric_name, ': incorrect number of parameters. It must be count(data)'
            raise Exception()
        return red.count(*param_vals)
    elif metric_name == 'pdf':
        if len(param_vals) != 3:
            print 'ERROR: Error in', metric_name, ': incorrect number of parameters. It must be pdf(data, bin_values, continuous_bins)'
            raise Exception()
        return PDF.single(*param_vals)
    elif metric_name == 'deft':
        if len(param_vals) < 4 or len(param_vals) > 5:
            print 'ERROR: Error in', metric_name, ': incorrect number of parameters. It must be deft(data, g, minLimit, maxLimit, alpha=2)'
            raise Exception()
        return deft.deft(*param_vals)
    elif metric_name == 'deft_joint':
        if len(param_vals) < 7 or len(param_vals) > 8:
            print 'ERROR: Error in', metric_name, ': incorrect number of parameters. It must be deft_joint(dataA, dataB, g, minLimitA, maxLimitA, minLimitB, maxLimitB, alpha=2)'
            raise Exception()
        return deft.deft(*param_vals)
    elif metric_name == 'pdf_joint':
        if len(param_vals) != 6:
            print 'ERROR: Error in', metric_name, ': incorrect number of parameters. It must be pdf_joint(dataA, bin_valuesA, continuous_binsA, dataB, bin_valuesB, continuous_binsB)'
            raise Exception()
        return PDF.joint(*param_vals)
    elif metric_name == 'mutual_information':
        if len(param_vals) < 3 or len(param_vals) > 4:
            print 'ERROR: Error in', metric_name, ': incorrect number of parameters. It must be mutual_information(pdfA, pdfB, joint_pdf, logbase="log2")'
            raise Exception()
        return MI.calculate(*param_vals)
    elif metric_name == 'shannon':
        if len(param_vals) < 1 or len(param_vals) > 2:
            print 'ERROR: Error in', metric_name, ': incorrect number of parameters. It must be shannon(pdf, logbase="log2")'
            raise Exception()
        return shannon.calculate(*param_vals)
    elif metric_name == 'kullback-leibler':
        if len(param_vals) < 2 or len(param_vals) > 3:
            print 'ERROR: Error in', metric_name, ': incorrect number of parameters. It must be kullback-leibler(pdf_p, pdf_q, logbase="log2")'
            raise Exception()
        return kullback.calculate(*param_vals)
    elif metric_name == 'fisher':
        if len(param_vals) < 2 or len(param_vals) > 3:
            print 'ERROR: Error in', metric_name, ': incorrect number of parameters. It must be fisher(pdf, eps, logbase="log2")'
            raise Exception()
        return fis.calculate(*param_vals)
    elif metric_name == 'hellinger-distance':
        if len(param_vals) != 2:
            print 'ERROR: Error in', metric_name, ': incorrect number of parameters. It must be hellinger-distance(pdf_p, pdf_q)'
            raise Exception()
        return hellinger.calculate(*param_vals)
    elif metric_name == 'surprise':
        if len(param_vals) != 1:
            print 'ERROR: Error in', metric_name, ': incorrect number of parameters. It must be surprise(prob)'
            raise Exception()
        return surprise.calculate(*param_vals)
    elif metric_name == 'idt':
        if len(param_vals) < 6 or len(param_vals) > 7:
            print 'ERROR: Error in', metric_name, ': incorrect number of parameters. It must be idt(initial, time_series, epsilon, dt, bin_values, continuous_bins, logbase="log2")'
            raise Exception()
        return IDT.system(*param_vals)
    elif metric_name == 'idt_individual':
        if len(param_vals) < 8 or len(param_vals) > 9:
            print 'ERROR: Error in', metric_name, ': incorrect number of parameters. It must be idt_individual(initial, time_series, dt, bin_values, continuous_bins, sample_state_0, sample_state_t, sample_time, logbase="log2")'
            raise Exception()
        return IDT.individual(*param_vals)
    elif metric_name == 'information_integration':
        if len(param_vals) < 9 or len(param_vals) > 10:
            print 'ERROR: Error in', metric_name, ': incorrect number of parameters. It must be information_integration(initial, group, dt, bin_values, continuous_bins, sample_N1, sample_N2, sample_G, sample_t, logbase="log2")'
            raise Exception()
        return II.calculate(*param_vals)
    elif metric_name == 'multi_information':
        if len(param_vals) < 6 or len(param_vals) > 7:
            print 'ERROR: Error in', metric_name, ': incorrect number of parameters. It must be multi_information(data, bin_values, continuous_bins, sample_var, sample_elems, sample_pop, logbase="log2")'
            raise Exception()
        return multi.calculate(*param_vals)
    elif metric_name == 'early_warning_difference':
        if len(param_vals) < 4 or len(param_vals) > 5:
            print 'ERROR: Error in', metric_name, ': incorrect number of parameters. It must be early_warning_difference(time_series_ref, time_series_comp, change_values, warning_values, histogram_limit=50)'
            raise Exception()
        return ew.early_warning_difference(*param_vals)
    elif metric_name == 'early_warning_flips':
        if len(param_vals) != 2:
            print 'ERROR: Error in', metric_name, ': incorrect number of parameters. It must be early_warning_flips(time_series, change_values)'
            raise Exception()
        return ew.early_warning_flips(*param_vals)
    elif metric_name == 'add_dimension':
        if len(param_vals) != 2:
            print 'ERROR: Error in', metric_name, ': incorrect number of parameters. It must be add_dimension(data, dimNumber)'
            raise Exception()
        return np.expand_dims(*param_vals)
    elif metric_name == 'join_dimensions':
        if len(param_vals) != 3:
            print 'ERROR: Error in', metric_name, ': incorrect number of parameters. It must be join_dimensions(data, dimNumberA, dimNumberB)'
            raise Exception()
        return red.join(*param_vals)
    else:
        # fall back to a numpy function of the same name
        try:
            func = getattr(np, metric_name)
        except AttributeError:
            print 'ERROR: Metric', metric_name, 'does not exist'
            raise Exception()
        return func(*param_vals)
                    action='store', default=180, type=int,
                    help='the number of unlabeled data')
parser.add_argument('-v', '--verbose', action='store_true', default=False,
                    help='verbose output')
parser.add_argument('--aucplot', action='store_true', default=False,
                    help='output prediction score and true label for AUC plot')
args = parser.parse_args()

print("# {}".format('-' * 80))
print("# *** Experimental Setting ***")
print("# model : LSDD")
print("# {}".format('-' * 80))

bags_train, bags_test, metadata = MI.datasets.load_dataset(
    args.dataset, args.prior, args.np, args.nu)
clf, best_param = train_lsdd(bags_train, args)
print("# width = {:.3e} / reg = {:.3e}".format(best_param['width'],
                                               best_param['reg']))
MI.print_evaluation_result(clf, bags_test, args)
Entropy.entropy(list_1)
print "2-8gram entropy generated, at " + str(time.clock()) + "s"

# compare entropy values across chunk lengths
MaxEntropy.maxEntropy()
print "2-8gram maxEntropy generated, at " + str(time.clock()) + "s"

word_dic = {}
for j in xrange(1, 9):
    # store the frequencies of 1- to n-grams in the word_dic dictionary,
    # for use by MI / FAI / LL
    with open('%dgramindexed.txt' % j, 'r') as f:
        gram_content = f.readlines()
    for i in xrange(len(gram_content)):
        word_dic[gram_content[i].split('\t')[0]] = gram_content[i].split('\t')[1]
print "word_dic generated, at " + str(time.clock()) + "s"

# compute MI
MI.mi(word_dic, n)
print "2-8gram mi generated, at " + str(time.clock()) + "s"

# compare MI across chunk lengths
MaxMi.maxMi()
print "2-8gram maxMi generated, at " + str(time.clock()) + "s"

# compute the Fai (phi-square) statistic
FaiSquare.fai(word_dic, n)
print "2-8gram Fai generated, at " + str(time.clock()) + "s"

print "total time is " + str(time.clock()) + "s"
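# Illustrative sketch (not from the original source): a pointwise mutual
# information score for a bigram from raw frequencies, the kind of quantity
# MI.mi presumably derives from word_dic. All names here are hypothetical.
def _pmi_demo(freq_xy, freq_x, freq_y, total):
    # PMI(x, y) = log2( p(x, y) / (p(x) * p(y)) )
    import math
    p_xy = float(freq_xy) / total
    p_x = float(freq_x) / total
    p_y = float(freq_y) / total
    return math.log(p_xy / (p_x * p_y), 2)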
kp1_location = []
kp2_location = []
kp1_angle = []
kp2_angle = []
for i in range(len(kp1)):
    kp1_location.append(kp1[i].pt)
    kp1_angle.append(kp1[i].angle)
for i in range(len(kp2)):
    kp2_location.append(kp2[i].pt)
    kp2_angle.append(kp2[i].angle)

good_kp1, good_kp2 = match.match(kp1_location, kp2_location, des1, des2, sift_ratio)
img_good = display.display(img1, img2, good_kp1, good_kp2)
better_kp1, better_kp2 = ransac.ransac(good_kp1, good_kp2, error_threshold)
solution, rmse = ransac.least_square(better_kp1, better_kp2)
img_better = display.display(img1, img2, better_kp1, better_kp2)
sift_fusion = image_fusion.image_fusion(img1, img2, solution)
common1, common2 = image_fusion.common_region(gray1, gray2, solution)
mi = MI.MI(common1, common2)
print mi

cv2.imshow("sift1 image good match", img_good)      # show matches
cv2.imshow("sift1 image better match", img_better)  # show matches
cv2.imshow("sift1 image fusion", sift_fusion)       # show fusion
cv2.waitKey(0)
def train_dh(bags, basis, bdim, theta, r, args):
    if _SOLVER == 'cvxopt':
        import cvxopt
        from cvxopt import matrix
        from cvxopt.solvers import qp
        cvxopt.solvers.options['show_progress'] = False
    elif _SOLVER == 'openopt':
        from openopt import QP
        import warnings
        warnings.simplefilter(action="ignore", category=FutureWarning)
    elif _SOLVER == 'gurobi':
        import sys
        sys.path.append(
            "/home/local/bin/gurobi650/linux64/lib/python3.4_utf32/gurobipy")
        import gurobipy
        from MI.gurobi_helper.helper import quadform, dot, mvmul

    p_bags = MI.extract_bags(bags, 1)
    u_bags = MI.extract_bags(bags, 0)
    N1 = len(p_bags)
    N0 = len(u_bags)
    N = N1 + N0
    d = bdim
    P1 = np.array([basis(B).T for B in p_bags])
    P0 = np.array([basis(B).T for B in u_bags])
    H = np.r_[np.c_[r * np.eye(d), np.zeros((d, 1)), np.zeros((d, N0))],
              np.c_[np.zeros((1, d)), 0, np.zeros((1, N0))],
              np.c_[np.zeros((N0, d)), np.zeros((N0, 1)), np.zeros((N0, N0))]]
    f = np.r_[-theta / N1 * P1.T.sum(axis=1).reshape((-1, 1)),
              [[-theta]],
              1. / N0 * np.ones((N0, 1))]
    L = np.r_[np.c_[0.5 * P0, 0.5 * np.ones((N0, 1)), -np.eye(N0)],
              np.c_[P0, np.ones((N0, 1)), -np.eye(N0)],
              np.c_[np.zeros((N0, d)), np.zeros((N0, 1)), -np.eye(N0)]]
    k = np.r_[-0.5 * np.ones((N0, 1)),
              np.zeros((N0, 1)),
              -np.zeros((N0, 1))]

    if _SOLVER == 'cvxopt':
        result = qp(matrix(H), matrix(f), matrix(L), matrix(k))
        gamma = np.array(result['x'])
    elif _SOLVER == 'openopt':
        problem = QP(H + 1e-3 * np.eye(H.shape[0]), f, A=L, b=k)
        result = problem.solve('qlcp')
        gamma = result.xf
    elif _SOLVER == 'gurobi':
        # model and target variables
        m = gurobipy.Model('qp')
        m.setParam('OutputFlag', False)
        opt_dim = H.shape[0]
        x = [m.addVar(lb=-gurobipy.GRB.INFINITY, name='x{}'.format(i))
             for i in range(opt_dim)]
        m.update()
        # objective function and constraints
        obj = 0.5 * quadform(H.tolist(), x) + dot(f.reshape(-1).tolist(), x)
        constrs = [lhs <= rhs for lhs, rhs in zip(mvmul(L.tolist(), x), k)]
        # solve
        m.setObjective(obj)
        for i, constr in enumerate(constrs):
            m.addConstr(constr, 'c{}'.format(i))
        try:
            m.optimize()
            gamma = np.array([v.x for v in m.getVars()])
        except gurobipy.GurobiError:
            raise ValueError()

    alpha = gamma[:d]
    beta = gamma[d]
    clf = lambda X: alpha.T.dot(basis(X)) + beta
    return clf