Code Example #1
File: preprocess.py Project: artemy-bakulin/ipage
def get_expression_profile(expression_level, genes, expression_bins,
                           input_format, output_format, species, tmp,
                           symmetric_expression):
    df = pd.DataFrame({'genes': genes, 'expression_level': expression_level})
    df = df[df.iloc[:, 1].notna()]
    df = df.sort_values(by=df.columns[1])
    expression_level = np.array(df.iloc[:, 1])
    if symmetric_expression:
        left = MI.discretize(expression_level[expression_level < 0],
                             expression_bins // 2)
        right = MI.discretize(expression_level[expression_level >= 0],
                              expression_bins // 2 + expression_bins % 2)
        right += expression_bins // 2
        expression_profile = np.concatenate((left, right))
    else:
        expression_profile = MI.discretize(expression_level, expression_bins)

    genes = list(df.iloc[:, 0])
    genes = [gene.split('.')[0] for gene in genes]
    if input_format and output_format and input_format != output_format:
        genes = change_accessions(genes, input_format, output_format, species,
                                  tmp)
        gene_dict = dict(zip(genes, expression_profile))
        expression_profile = np.array(
            [gene_dict[gene] for gene in gene_dict.keys() if gene != '-'])
        genes = [gene for gene in gene_dict.keys() if gene != '-']
    return expression_profile, genes
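Note: MI.discretize above is a project-internal helper not shown here. As a rough, self-contained sketch, an equal-frequency binning routine with the same call shape might look like the following (hypothetical stand-in, not the project's implementation):

import numpy as np

def discretize_sketch(values, bins):
    # Equal-frequency binning sketch: rank the values and map ranks to
    # bin labels 0..bins-1 so each bin holds roughly the same count.
    values = np.asarray(values)
    n = values.shape[0]
    labels = np.empty(n, dtype=int)
    labels[np.argsort(values)] = np.arange(n) * bins // n
    return labels

profile = discretize_sketch(np.random.randn(100), 5)
assert profile.min() == 0 and profile.max() == 4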
Code Example #2
File: __init__.py Project: gear/pumil
def prepare(bags, class_prior, L, U, T):
  """
  Parameters
  ----------
  bags        : original dataset
  class_prior : the ratio of positive samples
  L           : the number of labeled samples in output dataset
  U           : the number of unlabeled samples in output dataset
  T           : the number of test samples in output dataset
  """
  # original data
  p_bags = MI.extract_bags(bags,  1, with_label = True)
  n_bags = MI.extract_bags(bags, -1, with_label = True)
  random.shuffle(p_bags)
  random.shuffle(n_bags)
  P = len(p_bags)
  N = len(n_bags)

  retry_count = 0
  while retry_count < 5:
    try:
      return _prepare(p_bags, n_bags, P, N, class_prior, L, U, T)
    except Exception:
      # if the obtained split is invalid, try sampling again
      sys.stderr.write("Warning: Retry train-test-split (recommend to change the splitting number)\n")
      retry_count += 1
  # all retries exhausted: fail loudly instead of silently returning None
  raise RuntimeError("train-test-split failed after 5 retries")
Code Example #3
def min_CI_normalized_test(counter,
                           accepted_seeds_list,
                           profiles_passed,
                           discr_exp_profile,
                           nbins,
                           index_array,
                           min_ratio,
                           do_print=False):
    profile_full = profiles_passed[counter]
    profile_being_analyzed = profile_full[index_array]

    for i in range(len(accepted_seeds_list)):
        ith_accepted_profile_full = profiles_passed[accepted_seeds_list[i]]
        ith_accepted_profile = ith_accepted_profile_full[index_array]

        cond_inf = MI.cond_mut_info(profile_being_analyzed,
                                    discr_exp_profile,
                                    ith_accepted_profile,
                                    x_bins=2,
                                    y_bins=nbins,
                                    z_bins=2)
        mut_inf = MI.mut_info(profile_being_analyzed,
                              ith_accepted_profile,
                              x_bins=2,
                              y_bins=2)
        if np.isclose(mut_inf, 0., atol=1e-16):
            mut_inf = 1e-16
        ratio = cond_inf / mut_inf

        print("Comparing seed #%d to an existing seed #%d. The ratio is %.2f" %
              (counter, i, ratio))

        if ratio < min_ratio:
            return False, i  # return index of accepted seed that is similar to the current one
    return True, 0
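MI.cond_mut_info and MI.mut_info are project-internal estimators. A minimal histogram-based sketch of the conditional mutual information being ratioed here, assuming discrete inputs (hypothetical stand-in, not the project's code):

import numpy as np

def cond_mut_info_sketch(x, y, z, x_bins, y_bins, z_bins):
    # I(X;Y|Z) = H(X,Z) + H(Y,Z) - H(X,Y,Z) - H(Z), estimated from a
    # 3D joint histogram over the discretized variables.
    joint, _ = np.histogramdd(np.c_[x, y, z], bins=(x_bins, y_bins, z_bins))
    p = joint / joint.sum()

    def entropy(sum_axes):
        marg = p.sum(axis=sum_axes) if sum_axes else p
        marg = marg[marg > 0]
        return -(marg * np.log2(marg)).sum()

    # Sum out y for H(X,Z); sum out x for H(Y,Z); sum out x,y for H(Z).
    return entropy((1,)) + entropy((0,)) - entropy(()) - entropy((0, 1))

x = np.random.randint(0, 2, 500)
z = np.random.randint(0, 2, 500)
y = np.random.randint(0, 10, 500)
print(cond_mut_info_sketch(x, y, z, 2, 10, 2))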
Code Example #4
File: DSDD.py Project: gear/pumil
def train(bags, s, l, args):
    P = np.vstack(MI.extract_bags(bags, 0))
    Q = np.vstack(MI.extract_bags(bags, 1))

    n = len(P)
    m = len(Q)

    X = np.vstack((P, Q))
    KP = np.exp(-(r(P**2) - 2 * P.dot(X.T) + r(X**2).T) / (2 * s**2))
    KQ = np.exp(-(r(Q**2) - 2 * Q.dot(X.T) + r(X**2).T) / (2 * s**2))

    # initialization step
    L = np.r_[
        np.c_[l * np.eye(n + m), np.zeros((n + m, n)), np.zeros((n + m, m))],
        np.c_[np.zeros((n, n + m)), np.zeros((n, n)), np.zeros((n, m))],
        np.c_[np.zeros((m, n + m)), np.zeros((m, n)), np.zeros((m, m))],
    ]
    k = np.r_[np.zeros((n + m, 1)), np.ones((n, 1)) / n, np.ones((m, 1)) / m]
    G = np.r_[
        np.c_[np.zeros((n, n + m)), -np.eye(n), np.zeros((n, m))],
        np.c_[KP, -np.eye(n), np.zeros((n, m))],
        np.c_[np.zeros((m, n + m)), np.zeros((m, n)), -np.eye(m)],
        np.c_[KQ, np.zeros((m, n)), -np.eye(m)],
    ]
    h = np.r_[np.zeros((n, 1)), -np.ones((n, 1)),
              np.zeros((m, 1)), np.ones((m, 1))]

    result = cvxopt.solvers.qp(matrix(L), matrix(k), matrix(G), matrix(h))
    a = np.array(result['x'])[:n + m]

    T = 10
    for t in range(T):
        # tighten the upper-bound
        b = KP.dot(a) >= 1
        c = KQ.dot(a) >= -1

        # minimize the upper-bound
        k = np.r_[-KP.T.dot(b) / n - KQ.T.dot(c) / m,
                  np.ones((n, 1)) / n,
                  np.ones((m, 1)) / m, ]

        result = cvxopt.solvers.qp(matrix(L), matrix(k), matrix(G), matrix(h))
        a = np.array(result['x'])[:n + m]

    def classifier(x):
        x = x.reshape(1, -1)
        return a.T.dot(
            np.exp(-(r(X**2) - 2 * X.dot(x.T) + r(x**2).T) / (2 * s**2)))

    return lambda X: np.max([classifier(x) for x in X])
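The helper r used in the kernel matrices above is not shown in this snippet. Judging from the expression r(P**2) - 2 * P.dot(X.T) + r(X**2).T, which is the standard expansion of pairwise squared Euclidean distances, r is presumably a row-wise sum reshaped into a column vector. A sketch under that assumption:

import numpy as np

# Presumed helper: row-wise sum as a column vector. With it,
# r(P**2) - 2 * P.dot(X.T) + r(X**2).T is the matrix of squared
# Euclidean distances between rows of P and rows of X.
r = lambda A: A.sum(axis=1).reshape(-1, 1)

P = np.random.randn(4, 3)
X = np.random.randn(5, 3)
D2 = r(P**2) - 2 * P.dot(X.T) + r(X**2).T
ref = ((P[:, None, :] - X[None, :, :])**2).sum(axis=2)
assert np.allclose(D2, ref)  # cross-check against direct computation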
Code Example #5
File: loss.py Project: gear/pumil
def prediction_error(bags, model, theta):
    N1 = len(MI.extract_bags(bags, 1))
    N0 = len(MI.extract_bags(bags, 0))
    error = nc_risk(theta, N1, N0, zero_one_loss)
    return sum(
        float(error(model(B.data()),
                    Variable(np.array([[B.label()]]).astype(np.float32))).data)
        for B in bags) - theta
Code Example #6
File: __init__.py Project: gear/pumil
def _class_prior(bags, basis, r):
  # cf. (du Plessis et al., 2014)
  p_bags = MI.extract_bags(bags, 1)
  u_bags = MI.extract_bags(bags, 0)
  n1 = len(p_bags)
  n0 = len(u_bags)
  H = 1./n1 * np.sum([np.outer(basis(B), basis(B).T) for B in p_bags], axis=0)
  h = 1./n0 * np.sum(list(map(lambda B: basis(B), u_bags)), axis=0)
  G = H + r * np.eye(n1 + n0)
  G_ = np.linalg.inv(G)
  return (2*h.T.dot(G_.dot(h))-h.T.dot(G_.dot(H.dot(G_.dot(h)))))**(-1)
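Numerically, the explicit inverse of G can be avoided; an equivalent formulation of the same estimator using linear solves (a sketch of the same math as above, not the project's code):

import numpy as np

def class_prior_sketch(H, h, r):
    # Same estimator as _class_prior above, but with np.linalg.solve
    # instead of forming G^{-1} explicitly (better conditioned).
    G = H + r * np.eye(H.shape[0])
    Gh = np.linalg.solve(G, h)              # G^{-1} h
    GHGh = np.linalg.solve(G, H.dot(Gh))    # G^{-1} H G^{-1} h
    return 1.0 / (2 * h.T.dot(Gh) - h.T.dot(GHGh))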
Code Example #7
File: LSDD.py Project: gear/pumil
def validation_error(validation_set, training_set, s, l, t):
  X = np.vstack((
    np.vstack(MI.extract_bags(training_set, 1)),
    np.vstack(MI.extract_bags(training_set, 0))))
  d = X.shape[1]
  P = np.vstack(MI.extract_bags(validation_set, 1))
  Q = np.vstack(MI.extract_bags(validation_set, 0))
  H = (np.pi * s**2)**(d/2) * np.exp(- (r(X**2) - 2*X.dot(X.T) + r(X**2).T) / (4*s**2))
  h = np.exp(- (r(X**2) - 2*X.dot(P.T) + r(P**2).T) / (2*s**2)).mean(axis=1) \
    - np.exp(- (r(X**2) - 2*X.dot(Q.T) + r(Q**2).T) / (2*s**2)).mean(axis=1)
  return t.dot(H.dot(t)) - 2*h.T.dot(t)
Code Example #8
File: LSDD.py Project: gear/pumil
def train(bags, width, reg, args):
  P = np.vstack(MI.extract_bags(bags, 1))
  Q = np.vstack(MI.extract_bags(bags, 0))

  t = LSDD(P, Q, width, reg)
  X = np.vstack((P, Q))

  def classifier(x):
    x = x.reshape(1, -1)
    return t.T.dot(np.exp(- (r(X**2) - 2*X.dot(x.T) + r(x**2).T) / (2*width**2)))

  return lambda X: np.max([classifier(x) for x in X])
Code Example #9
def train_lsdd(data, args):
    widths = [1.0e-2, 1.0e-4, 1.0e-6]
    regs = [1.0, 1.0e-03, 1.0e-06]

    def train(data, width, reg, measure_time=False):
        if measure_time:
            t_start = time.time()

        model = MI.UU.LSDD.train(data, width, reg, args)
        metadata = {'width': width, 'reg': reg}

        if measure_time:
            t_end = time.time()
            print("#  elapsed time = {}".format(t_end - t_start))

        return model, metadata

    # cross validation
    best_param = {}
    best_error = np.inf
    if args.verbose:
        print("# *** Cross Validation ***")
    for width, reg in itertools.product(widths, regs):
        errors = []
        for data_train, data_val in MI.cross_validation(data, 5):
            t = MI.UU.LSDD.LSDD(np.vstack(MI.extract_bags(data_train, 1)),
                                np.vstack(MI.extract_bags(data_train, 0)),
                                width, reg)
            e = MI.UU.LSDD.validation_error(data_val, data_train, width, reg,
                                            t)
            errors.append(e)

        error = np.mean(errors)

        if args.verbose:
            print("#  width = {:.3e} / reg = {:.3e} / error = {:.3e}".format(
                width, reg, error))

        if error < best_error:
            best_error = error
            best_param = {'width': width, 'reg': reg}

    if args.verbose:
        print("# {}".format('-' * 80))

    model, metadata = train(data,
                            best_param['width'],
                            best_param['reg'],
                            measure_time=True)

    return model, best_param
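MI.cross_validation is project-internal; a plausible stand-in that yields k (train, validation) splits of a list of bags might look like this (hypothetical sketch, names assumed):

def cross_validation_sketch(data, k):
    # Hypothetical stand-in for MI.cross_validation: yield k
    # (train, validation) splits of a list of bags.
    data = list(data)
    folds = [data[i::k] for i in range(k)]
    for i in range(k):
        validation = folds[i]
        train = [b for j, fold in enumerate(folds) if j != i for b in fold]
        yield train, validation

for train, val in cross_validation_sketch(range(10), 5):
    assert len(train) == 8 and len(val) == 2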
Code Example #10
def main():
    import_modules()
    args = handler()

    index_array, values_array = IO.unpack_mask_file(args.exp_mask_file)
    discr_exp_profile = MI.discretize_exp_profile(index_array,
                                                  values_array,
                                                  nbins=args.nbins)
    seeds_passed = IO.read_motif_file(args.combined_seeds_filename)
    profiles_passed = IO.unpack_profiles_file(args.combined_profiles_filename)

    classification_array, N_families = filter_CMI(seeds_passed,
                                                  profiles_passed,
                                                  discr_exp_profile,
                                                  index_array,
                                                  args.nbins,
                                                  args.min_ratio,
                                                  do_print=args.do_print)

    MI_values_array = calculate_MIs_all_seeds(profiles_passed,
                                              discr_exp_profile, index_array,
                                              args.nbins)

    seeds_unique, profiles_unique = choose_best_reps_for_families(
        seeds_passed,
        profiles_passed,
        classification_array,
        N_families,
        MI_values_array,
        do_print=args.do_print)

    IO.write_list_of_seeds(seeds_unique, args.unique_seeds_filename)
    IO.write_array_of_profiles(profiles_unique, args.unique_profiles_filename)
    IO.write_classification_array(classification_array,
                                  args.families_classification_filename)
Code Example #11
def MI_get_pvalue_and_zscore(active_profile, discr_exp_profile, nbins,
                             current_MI, n_permutations):
    shuffled_MI_values = np.zeros(n_permutations, dtype=np.float64)

    for i in range(n_permutations):
        shuffled_expr = np.random.permutation(discr_exp_profile)
        ith_MI = MI.mut_info(active_profile,
                             shuffled_expr,
                             x_bins=2,
                             y_bins=nbins)

        shuffled_MI_values[i] = ith_MI

    shuffled_MI_values.sort()

    if current_MI < shuffled_MI_values[0]:
        # shortcut: if current MI is less than the minimal permuted MI, exit
        value_undiv = n_permutations
    else:
        # go from right to left while the shuffled score is higher than the real one
        j = n_permutations - 1
        while (j >= 0) and (current_MI <= shuffled_MI_values[j]):
            j -= 1
        value_undiv = n_permutations - j - 1

    pvalue = value_undiv / float(n_permutations)
    z_score = (current_MI -
               np.mean(shuffled_MI_values)) / np.std(shuffled_MI_values)

    # print(shuffled_MI_values)
    # print(current_MI)
    return pvalue, z_score
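The right-to-left scan above counts permuted MI values that are greater than or equal to the observed one. An equivalent vectorized formulation of the same p-value (a sketch, not the project's code):

import numpy as np

def permutation_pvalue_sketch(current_MI, shuffled_MI_values):
    shuffled = np.sort(shuffled_MI_values)
    n = shuffled.shape[0]
    # searchsorted(side='left') counts values < current_MI, so n minus
    # that count is the number of permuted values >= current_MI -- the
    # same quantity the explicit scan computes.
    value_undiv = n - np.searchsorted(shuffled, current_MI, side='left')
    return value_undiv / float(n)

vals = np.random.rand(1000)
assert permutation_pvalue_sketch(2.0, vals) == 0.0   # observed beats all
assert permutation_pvalue_sketch(-1.0, vals) == 1.0  # observed below all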
Code Example #12
def main():
    # I only import things if I run this script itself
    # do relative import based on current working directory
    # otherwise I have to install the package for relative import to work
    import_modules()

    args = handler()

    # get mapping of task ids to input files
    mapping_dict = sge.parse_task_mapping_file(args.task_mapping_file)
    # get the task id
    env_variables_dict = sge.get_env_variables()
    # get the names of input and output files
    profiles_filename_full, MI_values_filename_full, rna_bin_filename = get_current_in_out_filenames(
        args, env_variables_dict, mapping_dict)

    decompressed_profiles_array, index_array, values_array = IO.unpack_profiles_and_mask(
        profiles_filename_full, args.exp_mask_file, do_print=True)

    discr_exp_profile = MI.discretize_exp_profile(index_array, values_array,
                                                  args.nbins)

    MI_values_array = calculate_MI_for_seeds(decompressed_profiles_array,
                                             index_array,
                                             discr_exp_profile,
                                             args.nbins,
                                             args.min_occurences,
                                             do_print=True)
    IO.write_MI_values(MI_values_array, args.nbins, MI_values_filename_full)

    if args.print_qstat == 'y':
        sge.print_qstat_proc(env_variables_dict, args.path_to_qstat)
Code Example #13
def main():
    import_modules()
    args = handler()

    n_seqs_list = read_sequences(args.rna_bin_file)
    index_array, values_array = IO.unpack_mask_file(args.exp_mask_file)
    discr_exp_profile = MI.discretize_exp_profile(index_array, values_array, nbins=args.nbins)
    seeds_initial = IO.read_motif_file(args.unique_seeds_filename)
    profiles_initial = IO.unpack_profiles_file(args.unique_profiles_filename)
    seqs_of_interest = [n_seqs_list[x] for x in range(index_array.shape[0]) if index_array[x]]

    # get the task id
    env_variables_dict = sge.get_env_variables()
    seed_chunks, profiles_chunks = chunk_up_input_files(seeds_initial, profiles_initial, args.size_of_chunks)
    seed_right_chunk, profiles_right_chunk = pick_one_chunk(seed_chunks, profiles_chunks, env_variables_dict)

    seeds_filename_full, profiles_filename_full, \
    char_filename_full, robustness_filename_full = make_output_filenames(env_variables_dict, args)


    seeds_optimized, profiles_optimized, \
    seed_charact_array, robustness_array  = optimize_motifs(seed_right_chunk, profiles_right_chunk,
                                            discr_exp_profile, args.nbins, index_array, seqs_of_interest,
                                            args, do_print=True)

    IO.write_list_of_seeds(seeds_optimized, seeds_filename_full)
    IO.write_array_of_profiles(profiles_optimized, profiles_filename_full)
    IO.write_np_array(seed_charact_array, char_filename_full)
    IO.write_np_array(robustness_array, robustness_filename_full)
Code Example #14
def calculate_MI_for_seeds(decompressed_profiles_array,
                           index_array,
                           discr_exp_profile,
                           nbins,
                           min_occurences,
                           do_print=False):
    MI_values_array = np.zeros(decompressed_profiles_array.shape[0],
                               dtype=np.float32)

    for i, profile in enumerate(decompressed_profiles_array):
        active_profile = profile[index_array]

        if active_profile.sum() <= min_occurences:
            MI_values_array[i] = MASK_OUT_SEED_VALUE
            # print("The seed number %d binds only %d transcripts" % (i, active_profile.sum()))
            continue

        MI_values_array[i] = MI.mut_info(active_profile,
                                         discr_exp_profile,
                                         x_bins=2,
                                         y_bins=nbins)

        if do_print:
            if i % 1000 == 0 and i > 0:
                print("Profile number %d has been calculated" % i)

    MI_values_array = np.array(
        MI_values_array,
        dtype=np.float64)  # make sure all elements are of the same size
    return MI_values_array
Code Example #15
File: pumil.py Project: gear/pumil
def affinity(clf, conf, bags, uidx, nidx):
    # evaluate F-score on unlabeled set
    # regard "reliable negative bags" as negative set, and the other bags as positive set
    pidx = list(set(uidx) - set(nidx))
    pred = np.array([clf(bags[i], conf[i]) for i in pidx + nidx])
    true = np.r_[np.ones(len(pidx)), -1 * np.ones(len(nidx))]

    return MI.f_score(pred, true)
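MI.f_score is not shown in these snippets; assuming it is a standard F1 score over {-1, +1} labels with real-valued predictions thresholded at zero, a stand-in could look like this (hypothetical sketch):

import numpy as np

def f_score_sketch(pred, true):
    # Hypothetical stand-in for MI.f_score: F1 over {-1, +1} labels,
    # thresholding real-valued predictions at zero.
    pred_sign = np.where(np.asarray(pred).ravel() >= 0, 1, -1)
    true = np.asarray(true).ravel()
    tp = np.sum((pred_sign == 1) & (true == 1))
    fp = np.sum((pred_sign == 1) & (true == -1))
    fn = np.sum((pred_sign == -1) & (true == 1))
    if tp == 0:
        return 0.0
    precision = tp / float(tp + fp)
    recall = tp / float(tp + fn)
    return 2 * precision * recall / (precision + recall)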
Code Example #16
File: MItest.py Project: FelixSeol/ML
def test():
    x = asarray([gauss(0, 1) for i in range(1000)])
    y1 = asarray([int(e > 0) for e in x])
    y2 = asarray([randint(0, 1) for e in x])

    hx, bx = histogram(x, bins=x.size / 10, density=True)
    dx = digitize(x, bx)

    print "X ~ N(0,1)"
    print "y1 = 1 <=> x > 0"
    print "y2 = 1 con probabilidad 0.5"
    print
    print "I(y1;x) = H(X) - H(X|Y1) = %.02f" % (mi.mutual_information(x, y1))
    print "I(y1;x) = H(Y1) - H(Y1|X) = %.02f" % (mi.mutual_information(y1, dx))
    print
    print "I(y2;x) = H(X) - H(X|Y2) = %.02f" % (mi.mutual_information(x, y2))
    print "I(y2;x) = H(Y2) - H(Y2|X) = %.02f" % (mi.mutual_information(y2, dx))
Code Example #17
def train_sl(bags, basis, bdim, theta, r, args):
    p_bags = MI.extract_bags(bags, 1)
    u_bags = MI.extract_bags(bags, 0)
    N1 = len(p_bags)
    N0 = len(u_bags)
    N = N1 + N0
    P1 = np.array([np.r_[1, basis(B)].T for B in p_bags])
    P0 = np.array([np.r_[1, basis(B)].T for B in u_bags])

    param = np.linalg.inv(0.5 / N0 * P0.T.dot(P0) + r * np.eye(bdim + 1)).dot( \
        theta / N1 * P1.T.dot(np.ones((N1, 1))) - 0.5 / N0 * P0.T.dot(np.ones((N0, 1)))
    )

    alpha = param[1:]
    beta = float(param[:1])
    clf = lambda X: alpha.T.dot(basis(X)) + beta

    return clf
Code Example #18
def calculate(pdf, variable_time, variables_state, dt, sample_N=1, sample_T=1, logbase="log2"):
    """
    Input:
        pdf                     joint pdf class
        variable_time           variable identifying the time series (time 0 must be included). 
                                Must be ordered, since the first index is used to calculate the entropy used to intersect the entropy curve.
        variables_state         variables representing the state. Could fit multiple labels (e.g. "var" fits "var_1", "var_2", "var_3",...)
        dt                      Number of timesteps between time series
        sample_N                sample percentage for the variables
        sample_T                sample percentage for the time variable
        logbase                 Base for the logarithm ("log2", "log", "log10")

        returns                 information integration.
    """
    assert np.isscalar(variable_time), "Only one time variable can be specified"
    assert logbase in ["log2", "log", "log10"], "Logbase parameter must be one of (\"log2\", \"log\", \"log10\")"

    """Sample variables"""
    labels_state = pdf.get_labels(variables_state)
    sampled_pdf = pdf.sample_variables(labels_state, sample_N)
    sampled_labels_state = sampled_pdf.get_labels(variables_state)
    """Sample time"""
    sampled_pdf = sampled_pdf.sample_values([variable_time], [sample_T])
    num_time_series = sampled_pdf.get_num_bins_of(variable_time)

    """Calculate IDT in each element (sample)"""
    II = np.ndarray((num_time_series, len(sampled_labels_state)))
    for i, l_i in enumerate(sampled_labels_state):
        for t in xrange(num_time_series):
            """Calculate I(Si^T:{Sj^0}j)"""
            """Create joint pdf class with the initial state as a variable"""
            state_vars = sampled_labels_state[:]
            state_vars.remove(l_i)
            joint_i_sj_pdf = sampled_pdf.join_dimensions(state_vars, "initial_state")
            MI_i = MI.calculate(joint_i_sj_pdf, l_i, "initial_state", logbase)

            """I(Si^T:Sj^0) accumulator"""
            MI_i_tAcc = 0
            for j, l_j in enumerate(sampled_labels_state):
                MI_i_tAcc += MI.calculate(sampled_pdf, l_i, l_j, logbase)

            II[t, i] = MI_i - MI_i_tAcc

    return II
Code Example #19
File: kernel.py Project: gear/pumil
def minimax_basis(bags, degree=1):
    """
    Build basis function based on minimax kernel.

    Parameters
    ----------
    degree : Degree of polynomial kernel.
    """
    degree = int(degree)

    p_bags = MI.extract_bags(bags, 1)
    u_bags = MI.extract_bags(bags, 0)
    n_bags = MI.extract_bags(bags, -1)
    bags = p_bags + u_bags + n_bags

    stat = lambda X: np.r_[X.min(axis=0), X.max(axis=0)]
    poly_kern = lambda X, Y: (stat(X).dot(stat(Y)) + 1)**degree

    return lambda X: np.array([poly_kern(X, B) for B in bags])
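The minimax statistic summarizes a bag by the feature-wise minima and maxima of its instances, and the basis is a polynomial kernel between those summaries. A self-contained illustration of the two lambdas above:

import numpy as np

# A bag is summarized by the feature-wise minima and maxima of its
# instances; the kernel is a polynomial kernel between those summaries.
stat = lambda X: np.r_[X.min(axis=0), X.max(axis=0)]
poly_kern = lambda X, Y, degree=1: (stat(X).dot(stat(Y)) + 1)**degree

bag_a = np.random.randn(7, 3)   # 7 instances, 3 features
bag_b = np.random.randn(4, 3)
print(poly_kern(bag_a, bag_b))  # scalar similarity between the two bags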
Code Example #20
def optimize_motifs(seeds_initial, profiles_initial,
                    discr_exp_profile, nbins, index_array, seqs_of_interest,
                    args, do_print = True):
    seeds_optimized = copy.deepcopy(seeds_initial)
    profiles_optimized = np.zeros((len(seeds_initial), discr_exp_profile.shape[0]), dtype=bool)
    # seed_charact_array keeps MI values, p-values and z-scores
    seed_charact_array = np.zeros((len(seeds_initial), 3), dtype=np.float64)
    robustness_array = np.zeros(len(seeds_initial), dtype=bool)

    for i, motif in enumerate(seeds_initial):
        profile = profiles_initial[i]
        active_profile = profile[index_array]
        n_bestmotif = type_conversions.w_to_n_motif(seeds_initial[i])

        # initial mi value
        init_best_MI = MI.mut_info(active_profile, discr_exp_profile, x_bins=2, y_bins=nbins)
        lastmyfreq = active_profile.sum() / float(active_profile.shape[0])

        if do_print:
            w_bestmotif = type_conversions.n_to_w_motif(n_bestmotif)
            print("Optimzing the sequence of motif %d (sequence is %s). Initial MI = %.5f" %
                            (i, w_bestmotif.print_sequence(return_string=True), init_best_MI))
            #print("Initial frequency: %.4f" % lastmyfreq)

        bestmi, lastmyfreq, n_bestmotif = optimize_motif_sequence(n_bestmotif, init_best_MI, seqs_of_interest,
                            discr_exp_profile, nbins, lastmyfreq, args, do_print = do_print,
                            random_noseed = args.random_noseed)

        if do_print:
            print("Elongating motif %d" % i)

        bestmi, lastmyfreq, n_bestmotif = elongate_motif(n_bestmotif, bestmi, seqs_of_interest,
                            discr_exp_profile, nbins, lastmyfreq, args, do_print = do_print)

        w_bestmotif = type_conversions.n_to_w_motif(n_bestmotif)
        bestmotif_profile, bestmotif_mi, pvalue, z_score = get_characteristics(
                                                            n_bestmotif, seqs_of_interest,
                                                            discr_exp_profile, nbins, args,
                                                            do_print=do_print)

        if do_print:
            print("Checking robustness of the optimized motif %d (sequence %s)" %
                  (i, w_bestmotif.print_sequence(return_string=True)))

        is_robust = check_robustness(bestmotif_profile,
                                    discr_exp_profile, nbins, args,
                                    do_print = do_print)

        seeds_optimized[i] = w_bestmotif
        profiles_optimized[i] = bestmotif_profile.values
        seed_charact_array[i, : ] = np.array([bestmotif_mi, pvalue, z_score], dtype=np.float64)
        robustness_array[i] = is_robust

    return seeds_optimized, profiles_optimized, \
           seed_charact_array, robustness_array
Code Example #21
def get_characteristics(n_bestmotif, seqs_of_interest,
                        discr_exp_profile, nbins, args,
                        do_print = False):
    bestmotif_profile, _time = matchmaker.calculate_profile_one_motif(n_bestmotif, seqs_of_interest,
                                                                      is_degenerate = True)
    bestmotif_mi = MI.mut_info(bestmotif_profile.values, discr_exp_profile, x_bins=2, y_bins=nbins)
    pvalue, z_score = statistic_tests.MI_get_pvalue_and_zscore(bestmotif_profile.values, discr_exp_profile, nbins,
                                                               bestmotif_mi, args.n_permutations)
    if do_print:
        print("The final p-value is: %.4f, z-score is: %.3f" % (pvalue, z_score))
    return bestmotif_profile, bestmotif_mi, pvalue, z_score
Code Example #22
def calculate_MIs_all_seeds(profiles_passed, discr_exp_profile, index_array,
                            nbins):
    MI_values_array = np.zeros(profiles_passed.shape[0], dtype=np.float32)

    for i, profile in enumerate(profiles_passed):
        active_profile = profile[index_array]
        MI_values_array[i] = MI.mut_info(active_profile,
                                         discr_exp_profile,
                                         x_bins=2,
                                         y_bins=nbins)

    return MI_values_array
Code Example #23
File: kernel.py Project: gear/pumil
def nsk_basis(bags, width=1.0e-01):
    """
    Build basis function based on normalized set kernel.
    """

    ins_kern = lambda x, c: np.exp(-width * np.linalg.norm(x - c)**2)

    p_bags = MI.extract_bags(bags, 1)
    u_bags = MI.extract_bags(bags, 0)
    n_bags = MI.extract_bags(bags, -1)
    bags = p_bags + u_bags + n_bags

    # (un-normalized) set kernel
    usk = lambda S0, S1: sum(ins_kern(s0, s1)
                             for s0, s1 in itertools.product(S0, S1))

    # normalized set kernel
    nsk = lambda S0, S1: usk(S0, S1) / np.sqrt(usk(S0, S0) * usk(S1, S1))

    return lambda X: np.array([nsk(X, B) for B in bags])
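A quick standalone check of the normalization: nsk(S, S) is exactly 1, and Cauchy-Schwarz bounds nsk between any two bags by 1 (same lambdas as above, made self-contained):

import itertools
import numpy as np

width = 1.0e-01
ins_kern = lambda x, c: np.exp(-width * np.linalg.norm(x - c)**2)
usk = lambda S0, S1: sum(ins_kern(s0, s1)
                         for s0, s1 in itertools.product(S0, S1))
nsk = lambda S0, S1: usk(S0, S1) / np.sqrt(usk(S0, S0) * usk(S1, S1))

S = list(np.random.randn(5, 2))
T = list(np.random.randn(3, 2))
assert np.isclose(nsk(S, S), 1.0)  # self-similarity is 1 by construction
assert nsk(S, T) <= 1.0 + 1e-12    # Cauchy-Schwarz bounds the kernel by 1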
Code Example #24
def get_current_statistics(index, MI_values_array, profiles_array, index_array,
                           discr_exp_profile, args):
    profile = profiles_array[index]
    active_profile = profile[index_array]
    current_MI = MI_values_array[index]

    if current_MI == -1:
        return args.max_pvalue + 0.1, args.min_zscore - 0.1

    # other examples pass the bin counts explicitly; do the same here
    assert np.isclose(current_MI,
                      MI.mut_info(active_profile, discr_exp_profile,
                                  x_bins=2, y_bins=args.nbins),
                      rtol=1e-10)

    pvalue, z_score = statistic_tests.MI_get_pvalue_and_zscore(
        active_profile, discr_exp_profile, args.nbins, current_MI,
        args.n_permutations)
    return pvalue, z_score
Code Example #25
def jackknife_test(active_profile,
                   discr_exp_profile,
                   nbins,
                   n_permutations,
                   max_pvalue,
                   n_samples,
                   fraction_retain,
                   min_fraction_passed,
                   do_print=False):
    total_number_passed = 0

    for j in range(n_samples):
        full_indices_array = np.arange(active_profile.shape[0])
        how_many_keep = int(fraction_retain * active_profile.shape[0])
        subsampl_index_array = np.random.choice(full_indices_array,
                                                size=how_many_keep,
                                                replace=False)
        curr_profile = active_profile[subsampl_index_array]
        curr_exp_profile = discr_exp_profile[subsampl_index_array]
        curr_MI = MI.mut_info(curr_profile,
                              curr_exp_profile,
                              x_bins=2,
                              y_bins=nbins)
        # test the subsampled motif profile against the matching subsampled
        # expression profile (the full one has a different length)
        pvalue, z_score = MI_get_pvalue_and_zscore(curr_profile,
                                                   curr_exp_profile, nbins,
                                                   curr_MI, n_permutations)
        if do_print:
            print(
                "Iteration %d. p-value: %.5f; max_pvalue: %.5f, z-score: %.2f"
                % (j, pvalue, max_pvalue, z_score))
        if pvalue < max_pvalue:
            total_number_passed += 1

    fraction_passed = total_number_passed / float(n_samples)
    if do_print:
        print("%.2f subsamples passed the test; required fraction is %.2f" %
              (fraction_passed, min_fraction_passed))
    if fraction_passed >= min_fraction_passed:
        if do_print:
            print("Passed robustness test")
        return True
    else:
        if do_print:
            print("Did not pass robustness test")
        return False
Code Example #26
def main():
    args = handler()

    # read occurence profiles and expression profile
    profiles_array, index_array, values_array = IO.unpack_profiles_and_mask(
        args, do_print=False)

    # read precalculated MI values
    MI_values_array, nbins = IO.read_MI_values(args.MI_values_file)

    # find the threshold
    discr_exp_profile = MI.discretize_exp_profile(index_array, values_array,
                                                  nbins)
    determine_mi_threshold(MI_values_array,
                           discr_exp_profile,
                           profiles_array,
                           index_array,
                           args,
                           do_print=True)
Code Example #27
def main():
    # I only import things if I run this script itself
    # do relative import based on current working directory
    # otherwise I have to install the package for relative import to work
    import_modules()

    args = handler()

    # get mapping of task ids to input files
    mapping_dict = sge.parse_task_mapping_file(args.task_mapping_file)
    # get the task id
    env_variables_dict = sge.get_env_variables()

    # get the names of input and output files
    profiles_filename_full, MI_values_filename_full, \
    passed_seed_filename_full, passed_profiles_filename, \
    seed_filename_full, \
    rna_bin_filename, exp_mask_filename = get_current_in_out_filenames(args, env_variables_dict, mapping_dict)

    # read motifs, their profiles and MI values
    profiles_array, index_array, values_array = IO.unpack_profiles_and_mask(
        profiles_filename_full, exp_mask_filename, do_print=True)
    w_motifs_list = IO.read_motif_file(seed_filename_full)
    MI_values_array, nbins = IO.read_MI_values(MI_values_filename_full)

    # find the threshold
    discr_exp_profile = MI.discretize_exp_profile(index_array, values_array,
                                                  nbins)
    last_positive_seed = determine_mi_threshold(MI_values_array,
                                                discr_exp_profile,
                                                nbins,
                                                profiles_array,
                                                index_array,
                                                args,
                                                do_print=True)

    write_seeds_passed(last_positive_seed, MI_values_array, w_motifs_list,
                       passed_seed_filename_full)
    write_profiles_passed(last_positive_seed, MI_values_array, profiles_array,
                          passed_profiles_filename)

    if args.print_qstat == 'y':
        sge.print_qstat_proc(env_variables_dict, args.path_to_qstat)
Code Example #28
def are_there_better_motifs(n_modified_motifs, seqs_of_interest, discr_exp_profile, nbins,
                            bestmi, n_bestmotif, lastmyfreq, args, do_print = True):

    for curr_motif in n_modified_motifs:
        current_profile, time_spent = matchmaker.calculate_profile_one_motif(curr_motif,
                                                                             seqs_of_interest,
                                                                            is_degenerate = True)
        myfreq = current_profile.values.sum() / float(len(seqs_of_interest))
        tempmi = MI.mut_info(current_profile.values, discr_exp_profile, x_bins=2, y_bins=nbins)

        if tempmi > bestmi and current_profile.sum() > args.min_occurences and (myfreq < args.maxfreq or myfreq < lastmyfreq):
            n_bestmotif = structures.copy_n_motif(curr_motif)
            w_bestmotif = type_conversions.n_to_w_motif(n_bestmotif)
            bestmi = tempmi
            lastmyfreq = myfreq
            if do_print:
                print("New motif (MI = %.4f): %s" % (bestmi, w_bestmotif.print_sequence(return_string=True)))
                # w_bestmotif.print()
                # w_bestmotif.print_linear()
                #print("Current frequency: %.4f" % lastmyfreq)
    return bestmi, lastmyfreq, n_bestmotif
Code Example #29
def individual(pdf, variable_time, variables_idt, dt, sample_N=1, sample_T=1, logbase="log2"):
    """
    Input:
        pdf                     joint pdf class
        variable_time           variable identifying the time series (time 0 must be included). 
                                Must be ordered, since the first index is used to calculate the entropy used to intersect the entropy curve.
        variables_idt           variables to calculate their idt. Could fit multiple labels (e.g. "var" fits "var_1", "var_2", "var_3",...)
        dt                      Number of timesteps between time series
        sample_N                sample percentage for the idt variables
        sample_T                sample percentage for the time variable
        logbase                 Base for the logarithm ("log2", "log", "log10")

        returns                 idt calculated on the variables_idt.
    """
    assert np.isscalar(variable_time), "Only one time variable can be specified"

    """Sample variables"""
    labels_idt = pdf.get_labels(variables_idt)
    sampled_pdf = pdf.sample_variables(labels_idt, sample_N)
    sampled_labels_idt = sampled_pdf.get_labels(variables_idt)

 #   labels_initial_idt = pdf_initial.get_labels(variables_idt)
 #   sampled_initial_pdf = pdf_initial.sample_variables(labels_initial_idt, sample_N)

    """Before sampling time, calculate entropy at time 0"""
    joint_time_0 = sampled_pdf.filter_joint_probabilities([variable_time], [0])
    joint_time_0.normalize()
    h_initial = shannon.calculate(joint_time_0, sampled_labels_idt)

    """Sample time"""
    sampled_pdf = sampled_pdf.sample_values([variable_time], [sample_T], True)
    num_time_series = sampled_pdf.get_num_bins_of(variable_time)

    """Maximum value for IDT when there is no enough decay"""
    IDT_max = num_time_series * dt

    num_variables = len(sampled_labels_idt)
    IDT_var = np.ndarray((num_variables))
    IDT_var[:] = IDT_max
    """Calculate IDT for each element"""
    for i, l_i in enumerate(sampled_labels_idt):
        """Target decay limit"""
        h_target = h_initial[i]/2

        """Maximum mutual information variable"""
        max_I = np.ndarray((num_time_series))

        """Time series mutual information"""
        for t in xrange(num_time_series - 1):
            """Filter combination with time t"""
            pdf_t = sampled_pdf.filter_joint_probabilities([variable_time], [[t+1]])
            pdf_t.normalize()
            mi = np.ndarray((num_variables))
            """Mutual information in time t"""
            for j, l_j in enumerate(sampled_labels_idt):
                """Joint pdf with var i from initial state and var j from time t"""
                """Mutual information"""
                mi[j] = MI.calculate(pdf_t, l_i, l_j, logbase)
            max_I[t] = np.amax(mi)
        """Find t crossing target decay"""
        for t in xrange(num_time_series - 1):
            """Interpolate when found"""
            if max_I[t + 1] - h_target < 0:
                t1 = t
                t2 = t + 1
                found = True
                h1 = max_I[t1]
                h2 = max_I[t2]

                if h2 - h1 == 0:
                    IDT_var[i] = 0
                else:
                    IDT_var[i] = (t1 + (t2-t1)*(h_target - h1) /(h2-h1)) * dt
                break
    
    return IDT_var
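The crossing time is obtained by linear interpolation between the two samples that bracket the half-entropy target h_target. A standalone check of the interpolation formula used above (sketch):

import numpy as np

def interp_crossing(t1, t2, h1, h2, h_target, dt):
    # Linear interpolation of the time at which the decaying curve
    # crosses h_target between (t1, h1) and (t2, h2), scaled by dt.
    if h2 - h1 == 0:
        return 0.0
    return (t1 + (t2 - t1) * (h_target - h1) / (h2 - h1)) * dt

# If the curve falls from 1.0 to 0.0 between t=0 and t=1, it crosses
# 0.5 exactly halfway.
assert np.isclose(interp_crossing(0, 1, 1.0, 0.0, 0.5, dt=1.0), 0.5)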
Code Example #30
def individual(initial, times, dt, bin_values, continuous_bins, sample_N1, sample_N2, sample_t, logbase="log2"):
    """
    IDT individual metric

    Input:
        initial         Initial data NxP
                            N = elements
                            P = population
        times           Time state data TxNxP
                            T = time series 
                            N = elements
                            P = population
        dt              Number of timesteps between time series
        bin_values      values of the bins
        continuous_bins true if the values of the bins are continuous
        sample_N1       percentage of elements to choose as a sample for state 0
        sample_N2       percentage of elements to choose as a sample for state t
        sample_t        percentage of elements to choose as a sample for time series
        logbase         Base for the logarithm ("log2", "log", "log10")
    Returns:
                        IDT N
                            N = elements
    """
    assert logbase in ["log2", "log", "log10"], 'Logbase parameter must be one of ("log2", "log", "log10")'
    assert 0 < sample_N1 <= 1, "Sample for N1 must be within (0, 1]"
    assert 0 < sample_N2 <= 1, "Sample for N2 must be within (0, 1]"
    assert 0 < sample_time <= 1, "Sample for time must be within (0, 1]"

    number_of_bins = len(bin_values)
    if continuous_bins:
        number_of_bins = number_of_bins - 1
    # Sampling input data
    sample_elements_1 = np.arange(len(initial))
    sample_elements_2 = np.arange(len(initial))
    sample_time = np.arange(len(times))
    np.random.shuffle(sample_elements_1)
    np.random.shuffle(sample_elements_2)
    np.random.shuffle(sample_time)
    sample_elements_1 = sample_elements_1[: int(len(initial) * sample_N1)]
    sample_elements_2 = sample_elements_2[: int(len(initial) * sample_N2)]
    sample_time = sample_time[: int(len(times) * sample_t)]
    sample_time = np.sort(sample_time)
    times_sampled = times[sample_time]
    initial_sampled = initial[sample_elements_1]
    initial_sampled_2 = initial[sample_elements_2]
    initial_sampled_len = len(initial_sampled)
    initial_sampled_len_2 = len(initial_sampled_2)
    times_sampled_len = len(times_sampled)

    # Maximum value for IDT when there is not enough decay
    IDT_max = len(times) * dt

    # Initial marginals pdf
    pdf_initial = PDF.single(initial_sampled, bin_values, continuous_bins)
    pdf_initial_2 = PDF.single(initial_sampled_2, bin_values, continuous_bins)
    # Initial entropy
    h_initial = shannon.calculate(pdf_initial, logbase)
    # Temporal marginals pdf
    pdf_t = np.ndarray((len(sample_time), len(sample_elements_2), number_of_bins), dtype="float")
    for t in xrange(len(sample_time)):
        pdf_t[t] = PDF.single(times_sampled[t][sample_elements_2, ...], bin_values, continuous_bins)

    # Calculate IDT for each element (sample)
    IDT_var = np.ndarray(initial_sampled_len, dtype="float")
    init = time.clock()
    for i in xrange(initial_sampled_len):
        # Target decay limit
        h_target = h_initial[i] / 2

        # Maximum mutual information
        max_I = np.ndarray(times_sampled_len + 1, dtype="float")
        initial_sampled_i_len = len(initial_sampled[i])

        found = False
        # Initial mutual information
        mi_init = np.ndarray((len(times_sampled[t][sample_elements_2])), dtype="float")
        for j in xrange(len(times_sampled[t][sample_elements_2])):
            initial_sampled_i_len_2 = len(initial_sampled_2[j])
            # Calculate joint pdf from initial state
            pdf_joint = PDF.joint(
                initial_sampled[i].reshape(1, initial_sampled_i_len),
                bin_values,
                continuous_bins,
                initial_sampled_2[j].reshape(1, initial_sampled_i_len_2),
                bin_values,
                continuous_bins,
            )
            # Mutual information
            mi_init[j] = MI.calculate(
                pdf_initial[i].reshape(1, number_of_bins),
                pdf_initial_2[j].reshape(1, number_of_bins),
                pdf_joint,
                logbase,
            )
        max_I[0] = np.amax(mi_init)

        # Time series mutual information
        for t in xrange(times_sampled_len):
            mi = np.ndarray((len(times_sampled[t][sample_elements_2])), dtype="float")
            # Mutual information in time t
            for j in xrange(len(times_sampled[t][sample_elements_2])):
                # Calculate joint pdf from initial state and time t
                pdf_joint = PDF.joint(
                    initial_sampled[i].reshape(1, initial_sampled_i_len),
                    bin_values,
                    continuous_bins,
                    times_sampled[t][sample_elements_2][j].reshape(1, len(times_sampled[t][sample_elements_2][j])),
                    bin_values,
                    continuous_bins,
                )
                # Mutual information
                mi[j] = MI.calculate(
                    pdf_initial[i].reshape(1, number_of_bins),
                    pdf_t[t, j, :].reshape(1, number_of_bins),
                    pdf_joint,
                    logbase,
                )
            max_I[t + 1] = np.amax(mi)

        # Find t crossing target decay
        for t in xrange(times_sampled_len):
            # Interpolate when found
            if max_I[t + 1] - h_target < 0:
                t1 = t
                t2 = t + 1
                found = True
                h1 = max_I[t1]
                h2 = max_I[t2]

                if h2 - h1 == 0:
                    IDT_var[i] = 0
                else:
                    IDT_var[i] = (t1 + (t2 - t1) * (h_target - h1) / (h2 - h1)) * dt
                break
        # Setting maximum IDT value when not found
        if not found:
            IDT_var[i] = IDT_max

    return IDT_var
Code Example #31
def calculateMetric(metric_name, param_vals):
	if metric_name == 'count':
		if len(param_vals)!= 1:
			print 'ERROR:Error in ', metric_name, ', number of parameters incorrect. It must be count(data)'
			raise Exception()
		return red.count(*param_vals)
	elif metric_name == 'pdf':
		if len(param_vals)!= 3:
			print 'ERROR:Error in ', metric_name, ', number of parameters incorrect. It must be pdf(data, bin_values, continuous_bins)'
			raise Exception()
		return PDF.single(*param_vals)
	elif metric_name == 'deft':
		if len(param_vals) < 2 or len(param_vals) > 3:
			print 'ERROR:Error in ', metric_name, ', number of parameters incorrect. It must be deft(data, g, alpha)'
			raise Exception()
		return deft.deft(*param_vals)
	elif metric_name == 'pdf_joint':
		if len(param_vals)!= 6:
			print 'ERROR:Error in ', metric_name, ', number of parameters incorrect. It must be pdf_joint(dataA, bin_valuesA, continuous_binsA, dataB, bin_valuesB, continuous_binsB)'
			raise Exception()
		return PDF.joint(*param_vals)
	elif metric_name == 'mutual_information':
		if len(param_vals) < 3 or len(param_vals) > 4:
			print 'ERROR:Error in ', metric_name, ', number of parameters incorrect. It must be mutual_information(pdfA, pdfB, joint_pdf, logbase="log2")'
			raise Exception()
		return MI.calculate(*param_vals)
	elif metric_name == 'shannon':
		if len(param_vals) < 1 or len(param_vals) > 2:
			print 'ERROR:Error in ', metric_name, ', number of parameters incorrect. It must be shannon(pdf, logbase="log2")'
			raise Exception()
		return shannon.calculate(*param_vals)
	elif metric_name == 'kullback-leibler':
		if len(param_vals) < 2 or len(param_vals) > 3:
			print 'ERROR:Error in ', metric_name, ', number of parameters incorrect. It must be kullback-leibler(pdf_p, pdf_q, logbase="log2")'
			raise Exception()
		return kullback.calculate(*param_vals)
	elif metric_name == 'fisher':
		if len(param_vals) < 2 or len(param_vals) > 3:
			print 'ERROR:Error in ', metric_name, ', number of parameters incorrect. It must be fisher(pdf, eps, logbase="log2")'
			raise Exception()
		return fis.calculate(*param_vals)
	elif metric_name == 'hellinger-distance':
		if len(param_vals) != 2:
			print 'ERROR:Error in ', metric_name, ', number of parameters incorrect. It must be hellinger-distance(pdf_p, pdf_q)'
			raise Exception()
		return hellinger.calculate(*param_vals)
	elif metric_name == 'surprise':
		if len(param_vals)!= 1:
			print 'ERROR:Error in ', metric_name, ', number of parameters incorrect. It must be surprise(prob)'
			raise Exception()
		return surprise.calculate(*param_vals)
	elif metric_name == 'idt':
		if len(param_vals) < 6 or len(param_vals) > 7:
			print 'ERROR:Error in ', metric_name, ', number of parameters incorrect. It must be idt(initial, time_series, epsilon, dt, bin_values, continuous_bins, logbase="log2")'
			raise Exception()
		return IDT.system(*param_vals)
	elif metric_name == 'idt_individual':
		if len(param_vals) < 8 or len(param_vals) > 9:
			print 'ERROR:Error in ', metric_name, ', number of parameters incorrect. It must be idt_individual(initial, time_series, dt, bin_values, continuous_bins, sample_state_0, sample_state_t, sample_time, logbase="log2")'
			raise Exception()
		return IDT.individual(*param_vals)
	elif metric_name == 'information_integration':
		if len(param_vals) < 9 or len(param_vals) > 10:
			print 'ERROR:Error in ', metric_name, ', number of parameters incorrect. It must be information_integration(initial, group, dt, bin_values, continuous_bins, sample_N1, sample_N2, sample_G, sample_t, logbase="log2")'
			raise Exception()
		return II.calculate(*param_vals)
	elif metric_name == 'multi_information':
		if len(param_vals) < 6 or len(param_vals) > 7:
			print 'ERROR:Error in ', metric_name, ', number of parameters incorrect. It must be multi_information(data, bin_values, continuous_bins, sample_var, sample_elems, sample_pop, logbase="log2")'
			raise Exception()
		return multi.calculate(*param_vals)
	elif metric_name == 'swap_axes':
		if len(param_vals)!= 3:
			print 'ERROR:Error in ', metric_name, ', number of parameters incorrect. It must be swap_axes(data, axis0, axis1)'
			raise Exception()
		return np.swapaxes(*param_vals)
	elif metric_name == 'add_dimension':
		if len(param_vals)!= 2:
			print 'ERROR:Error in ', metric_name, ', number of parameters incorrect. It must be add_dimension(data, dimNumber)'
			raise Exception()
		return  np.expand_dims(*param_vals)
	elif metric_name == 'join_dimensions':
		if len(param_vals)!= 3:
			print 'ERROR:Error in ', metric_name, ', number of parameters incorrect. It must be join_dimensions(data, dimNumberA, dimNumberB)'
			raise Exception()
		return  red.join(*param_vals)
	else :
		# Try to get a numpy function
		try :
			func = getattr(np, metric_name)
			return func(*param_vals)
		except:
			print 'ERROR:Metric ', metric_name, ' does not exist'
			raise Exception()
Code Example #32
def calculateMetric(metric_name, param_vals):
    '''
    Calculates a metric.

    Input:
        metric_name     metric name
        param_vals      metric parameters
    Returns:
                        result of the metric
    '''
    if metric_name == 'count':
        if len(param_vals)!= 1:
            print 'ERROR:Error in ', metric_name, ', number of parameters incorrect. It must be count(data)'
            raise Exception()
        return red.count(*param_vals)
    elif metric_name == 'pdf':
        if len(param_vals)!= 3:
            print 'ERROR:Error in ', metric_name, ', number of parameters incorrect. It must be pdf(data, bin_values, continuous_bins)'
            raise Exception()
        return PDF.single(*param_vals)
    elif metric_name == 'deft':
        if len(param_vals) < 4 or len(param_vals) > 5:
            print 'ERROR:Error in ', metric_name, ', number of parameters incorrect. It must be deft(data, g, minLimit, maxLimit, alpha=2)'
            raise Exception()
        return deft.deft(*param_vals)
    elif metric_name == 'deft_joint':
        if len(param_vals) < 7 or len(param_vals) > 8:
            print 'ERROR:Error in ', metric_name, ', number of parameters incorrect. It must be deft_joint(dataA, dataB, g, minLimitA, maxLimitA, minLimitB, maxLimitB, alpha=2)'
            raise Exception()
        return deft.deft(*param_vals)
    elif metric_name == 'pdf_joint':
        if len(param_vals)!= 6:
            print 'ERROR:Error in ', metric_name, ', number of parameters incorrect. It must be pdf_joint(dataA, bin_valuesA, continuous_binsA, dataB, bin_valuesB, continuous_binsB)'
            raise Exception()
        return PDF.joint(*param_vals)
    elif metric_name == 'mutual_information':
        if len(param_vals) < 3 or len(param_vals) > 4:
            print 'ERROR:Error in ', metric_name, ', number of parameters incorrect. It must be mutual_information(pdfA, pdfB, joint_pdf, logbase="log2")'
            raise Exception()
        return MI.calculate(*param_vals)
    elif metric_name == 'shannon':
        if len(param_vals) < 1 or len(param_vals) > 2:
            print 'ERROR:Error in ', metric_name, ', number of parameters incorrect. It must be shannon(pdf, logbase="log2")'
            raise Exception()
        return shannon.calculate(*param_vals)
    elif metric_name == 'kullback-leibler':
        if len(param_vals) < 2 or len(param_vals) > 3:
            print 'ERROR:Error in ', metric_name, ', number of parameters incorrect. It must be kullback-leibler(pdf_p, pdf_q, logbase="log2")'
            raise Exception()
        return kullback.calculate(*param_vals)
    elif metric_name == 'fisher':
        if len(param_vals) < 2 or len(param_vals) > 3:
            print 'ERROR:Error in ', metric_name, ', number of parameters incorrect. It must be fisher(pdf, eps, logbase="log2")'
            raise Exception()
        return fis.calculate(*param_vals)
    elif metric_name == 'hellinger-distance':
        if len(param_vals) != 2:
            print 'ERROR:Error in ', metric_name, ', number of parameters incorrect. It must be hellinger-distance(pdf_p, pdf_q)'
            raise Exception()
        return hellinger.calculate(*param_vals)
    elif metric_name == 'surprise':
        if len(param_vals)!= 1:
            print 'ERROR:Error in ', metric_name, ', number of parameters incorrect. It must be surprise(prob)'
            raise Exception()
        return surprise.calculate(*param_vals)
    elif metric_name == 'idt':
        if len(param_vals) < 6 or len(param_vals) > 7:
            print 'ERROR:Error in ', metric_name, ', number of parameters incorrect. It must be idt(initial, time_series, epsilon, dt, bin_values, continuous_bins, logbase="log2")'
            raise Exception()
        return IDT.system(*param_vals)
    elif metric_name == 'idt_individual':
        if len(param_vals) < 8 or len(param_vals) > 9:
            print 'ERROR:Error in ', metric_name, ', number of parameters incorrect. It must be idt_individual(initial, time_series, dt, bin_values, continuous_bins, sample_state_0, sample_state_t, sample_time, logbase="log2")'
            raise Exception()
        return IDT.individual(*param_vals)
    elif metric_name == 'information_integration':
        if len(param_vals) < 9 or len(param_vals) > 10:
            print 'ERROR:Error in ', metric_name, ', number of parameters incorrect. It must be information_integration(initial, group, dt, bin_values, continuous_bins, sample_N1, sample_N2, sample_G, sample_t, logbase="log2")'
            raise Exception()
        return II.calculate(*param_vals)
    elif metric_name == 'multi_information':
        if len(param_vals) < 6 or len(param_vals) > 7:
            print 'ERROR:Error in ', metric_name, ', number of parameters incorrect. It must be multi_information(data, bin_values, continuous_bins, sample_var, sample_elems, sample_pop, logbase="log2")'
            raise Exception()
        return multi.calculate(*param_vals)
    elif metric_name == 'early_warning_difference':
        if len(param_vals) < 4 or len(param_vals) > 5:
            print 'ERROR:Error in ', metric_name, ', number of parameters incorrect. It must be early_warning_difference(time_series_ref, time_series_comp, change_values, warning_values, histogram_limit=50)'
            raise Exception()
        return ew.early_warning_difference(*param_vals)
    elif metric_name == 'early_warning_flips':
        if len(param_vals) != 2:
            print 'ERROR:Error in ', metric_name, ', number of parameters incorrect. It must be early_warning_flips(time_series, change_values)'
            raise Exception()
        return ew.early_warning_flips(*param_vals)
    elif metric_name == 'add_dimension':
        if len(param_vals)!= 2:
            print 'ERROR:Error in ', metric_name, ', number of parameters incorrect. It must be add_dimension(data, dimNumber)'
            raise Exception()
        return  np.expand_dims(*param_vals)
    elif metric_name == 'join_dimensions':
        if len(param_vals)!= 3:
            print 'ERROR:Error in ', metric_name, ', number of parameters incorrect. It must be join_dimensions(data, dimNumberA, dimNumberB)'
            raise Exception()
        return  red.join(*param_vals)
    else :
        # Try to get a numpy function
        try :
            func = getattr(np, metric_name)
            return func(*param_vals)
        except:
            print 'ERROR:Metric ', metric_name, ' does not exist'
            raise Exception()
Code Example #33
                        action='store',
                        default=180,
                        type=int,
                        help='the number of unlabeled data')

    parser.add_argument('-v',
                        '--verbose',
                        action='store_true',
                        default=False,
                        help='verbose output')

    parser.add_argument(
        '--aucplot',
        action='store_true',
        default=False,
        help='output prediction score and true label for AUC plot')

    args = parser.parse_args()

    print("# {}".format('-' * 80))
    print("# *** Experimental Setting ***")
    print("#   model                     : LSDD")
    print("# {}".format('-' * 80))

    bags_train, bags_test, metadata = MI.datasets.load_dataset(
        args.dataset, args.prior, args.np, args.nu)
    clf, best_param = train_lsdd(bags_train, args)
    print("#  width = {:.3e} / reg = {:.3e}".format(best_param['width'],
                                                    best_param['reg']))
    MI.print_evaluation_result(clf, bags_test, args)
Code Example #34
File: Main.py Project: riyueqianjiao/Corpus
Entropy.entropy(list_1)
print "2-8gram entropy generated, at "+str(time.clock())+"s"

# compare the entropy of word chunks of different lengths
MaxEntropy.maxEntropy()
print "2-8gram maxEntropy generated, at "+str(time.clock())+"s"


word_dic={}
for j in xrange(1,9): # store the frequencies of 1-n grams in the word_dic dictionary for use by MI/FAI/LL
    with open('%dgramindexed.txt'%j, 'r') as f:
        gram_content = f.readlines()
    for i in xrange(len(gram_content)):
        word_dic[(gram_content[i].split('\t'))[0]]=(gram_content[i].split('\t'))[1]
print "word_dic generated, at "+str(time.clock())+"s"

# compute MI
MI.mi(word_dic,n)
print "2-8gram mi generated, at "+str(time.clock())+"s"
# compare the MI of word chunks of different lengths
MaxMi.maxMi()
print "2-8gram maxMi generated, at "+str(time.clock())+"s"

# compute Fai (phi-square)
FaiSquare.fai(word_dic,n)
print "2-8gram Fai generated, at "+str(time.clock())+"s"

print "total time is "+str(time.clock())+"s"
Code Example #35
kp1_location = []
kp2_location = []
kp1_angle = []
kp2_angle = []

for i in range(len(kp1)):
    kp1_location.append(kp1[i].pt)
    kp1_angle.append(kp1[i].angle)

for i in range(len(kp2)):
    kp2_location.append(kp2[i].pt)
    kp2_angle.append(kp2[i].angle)

good_kp1, good_kp2 = match.match(kp1_location, kp2_location, des1, des2,
                                 sift_ratio)
img_good = display.display(img1, img2, good_kp1, good_kp2)
better_kp1, better_kp2 = ransac.ransac(good_kp1, good_kp2, error_threshold)

solution, rmse = ransac.least_square(better_kp1, better_kp2)
img_better = display.display(img1, img2, better_kp1, better_kp2)
sift_fusion = image_fusion.image_fusion(img1, img2, solution)

common1, common2 = image_fusion.common_region(gray1, gray2, solution)
mi = MI.MI(common1, common2)
print mi

cv2.imshow("sift1 image good match", img_good)  #show mathces
cv2.imshow("sift1 image better match", img_better)  #show mathces
cv2.imshow("sift1 image fusion", sift_fusion)  #show fusion
cv2.waitKey(0)
Code Example #36
def train_dh(bags, basis, bdim, theta, r, args):
    if _SOLVER == 'cvxopt':
        import cvxopt
        from cvxopt import matrix
        from cvxopt.solvers import qp
        cvxopt.solvers.options['show_progress'] = False

    elif _SOLVER == 'openopt':
        from openopt import QP
        import warnings
        warnings.simplefilter(action="ignore", category=FutureWarning)

    elif _SOLVER == 'gurobi':
        import sys
        sys.path.append(
            "/home/local/bin/gurobi650/linux64/lib/python3.4_utf32/gurobipy")
        import gurobipy
        from MI.gurobi_helper.helper import quadform, dot, mvmul

    p_bags = MI.extract_bags(bags, 1)
    u_bags = MI.extract_bags(bags, 0)
    N1 = len(p_bags)
    N0 = len(u_bags)
    N = N1 + N0
    d = bdim
    P1 = np.array([basis(B).T for B in p_bags])
    P0 = np.array([basis(B).T for B in u_bags])
    H = np.r_[
        np.c_[r * np.eye(d), np.zeros((d, 1)), np.zeros((d, N0))],
        np.c_[np.zeros((1, d)), 0, np.zeros((1, N0))],
        np.c_[np.zeros((N0, d)), np.zeros((N0, 1)), np.zeros((N0, N0))],
    ]
    f = np.r_[-theta / N1 * P1.T.sum(axis=1).reshape((-1, 1)), [[-theta]],
              1. / N0 * np.ones((N0, 1))]
    L = np.r_[
        np.c_[0.5 * P0, 0.5 * np.ones((N0, 1)), -np.eye(N0)],
        np.c_[P0, np.ones((N0, 1)), -np.eye(N0)],
        np.c_[np.zeros((N0, d)), np.zeros((N0, 1)), -np.eye(N0)],
    ]
    k = np.r_[-0.5 * np.ones((N0, 1)), np.zeros((N0, 1)), np.zeros((N0, 1))]

    if _SOLVER == 'cvxopt':
        result = qp(matrix(H), matrix(f), matrix(L), matrix(k))
        gamma = np.array(result['x'])

    elif _SOLVER == 'openopt':
        problem = QP(H + 1e-3 * np.eye(H.shape[0]), f, A=L, b=k)
        result = problem.solve('qlcp')
        gamma = result.xf

    elif _SOLVER == 'gurobi':
        # model and target variables
        m = gurobipy.Model('qp')
        m.setParam('OutputFlag', False)
        opt_dim = H.shape[0]
        x = [
            m.addVar(lb=-gurobipy.GRB.INFINITY, name='x{}'.format(i))
            for i in range(opt_dim)
        ]
        m.update()

        # objective function and constraints
        obj = 0.5 * quadform(H.tolist(), x) + dot(f.reshape(-1).tolist(), x)
        constrs = [lhs <= rhs for lhs, rhs in zip(mvmul(L.tolist(), x), k)]

        # solve
        m.setObjective(obj)
        for i, constr in enumerate(constrs):
            m.addConstr(constr, 'c{}'.format(i))

        try:
            m.optimize()
            gamma = np.array([v.x for v in m.getVars()])

        except gurobipy.GurobiError:
            raise ValueError()

    alpha = gamma[:d]
    beta = gamma[d]
    clf = lambda X: alpha.T.dot(basis(X)) + beta

    return clf