Beispiel #1
0
def bench_gaussian_hmm(size):
    """Time sampling from, fitting, and decoding with a Gaussian HMM.

    A generator model is given the means ``[[42], [24]]`` and covariances
    ``[[1], [1]]`` and asked for ``size`` samples.  A fresh two-component
    model is then fitted to that sample and used to predict its states.
    Each of the three stages runs inside its own ``timed_step`` context.

    Nothing is returned; the banner is printed to stdout.
    """
    banner = "benchmarking Gaussian HMM on a sample of size {0}".format(size)
    print(banner.center(36))

    generator = GaussianHMM()
    generator.means_ = [[42], [24]]
    generator.covars_ = [[1], [1]]

    with timed_step("generating sample"):
        # The second element (the sampled state sequence) is not needed here.
        observations, _ = generator.sample(size)

    with timed_step("fitting"):
        estimator = GaussianHMM(n_components=2)
        fitted = estimator.fit([observations])

    with timed_step("estimating states"):
        fitted.predict(observations)
    def getFinalState(self,globalstatenumber,data,pa,transport,means,convars):
        """Refit a pre-seeded diagonal-covariance Gaussian HMM and decode ``data``.

        The model is built with ``globalstatenumber`` components, up to 1000
        iterations, ``params='stcm'``, ``init_params=''`` and a fixed
        ``random_state`` of 1.  Before fitting, its start probabilities,
        transition matrix, means and covariances are assigned from ``pa``,
        ``transport``, ``means`` and ``convars`` respectively.

        Returns whatever ``predict(data)`` yields on the fitted model.
        """
        hmm_options = dict(
            n_components=globalstatenumber,
            n_iter=1000,
            covariance_type='diag',
            params='stcm',
            init_params='',
            random_state=1,
        )
        estimator = GaussianHMM(**hmm_options)

        # Seed the model; assignment order matches the original code.
        seeds = (
            ('startprob_', pa),
            ('transmat_', transport),
            ('means_', means),
            ('covars_', convars),
        )
        for attribute, value in seeds:
            setattr(estimator, attribute, value)

        estimator.fit(data)
        return estimator.predict(data)
Beispiel #3
0
def create_combined_hmm(model):
    """Merge per-appliance HMMs into a single full-covariance GaussianHMM.

    ``model`` is iterated for its keys and indexed by each key; every value
    must expose ``startprob_``, ``transmat_`` and ``means_``.  The collected
    start probabilities, transition matrices and flattened means are handed
    to ``compute_pi_fhmm``, ``compute_A_fhmm`` and ``combine_means``, and
    their results are installed on (and used as priors for) the new model.

    Returns the combined ``GaussianHMM`` instance.
    """
    start_probs = []
    transition_mats = []
    flat_means = []
    for appliance in model:
        appliance_hmm = model[appliance]
        start_probs.append(appliance_hmm.startprob_)
        transition_mats.append(appliance_hmm.transmat_)
        flat_means.append(appliance_hmm.means_.flatten().tolist())

    pi_combined = compute_pi_fhmm(start_probs)
    A_combined = compute_A_fhmm(transition_mats)
    mean_combined, cov_combined = combine_means(flat_means)

    combined = GaussianHMM(
        n_components=len(pi_combined),
        covariance_type='full',
        startprob_prior=pi_combined,
        transmat_prior=A_combined,
    )
    # Covariances are assigned before means, as in the original ordering.
    combined.covars_ = cov_combined
    combined.means_ = mean_combined
    combined.startprob_ = pi_combined
    combined.transmat_ = A_combined
    return combined
def fit_hmm(
    depth_normed,
    transition_probability,
    variance,
    variance_fixed,
    max_copy_number=12,
    n_iter=0,
    params='st',
    init_params=''
):
    """Assign a copy-number state to every position of a coverage array.

    Parameters
    ----------
    depth_normed
        Normalised coverage array.
    transition_probability
        Probability of moving from one state to any single other state.
    variance
        Variance per copy.
    variance_fixed
        Variance for the zero copy number state.
    max_copy_number
        Maximum copy number to consider in the model.
    n_iter
        Number of iterations to perform when fitting the model.
    params
        Parameters that can be changed through fitting.
    init_params
        Parameters that are initialised from the data.

    Returns
    -------
    The state sequence produced by ``GaussianHMM.predict`` for the
    column-stacked observations.
    """
    # One state per copy number, from 0 up to max_copy_number inclusive.
    lowest_copy_number = 0
    state_count = max_copy_number - lowest_copy_number + 1

    # Uniform off-diagonal transitions; the diagonal takes the remaining
    # probability mass so that each row sums to one.
    stay_probability = 1 - ((state_count - 1) * transition_probability)
    transition_matrix = np.full(
        (state_count, state_count), transition_probability, dtype=float)
    np.fill_diagonal(transition_matrix, stay_probability)

    # State k has mean k and variance `variance * k + variance_fixed`.
    copy_numbers = range(state_count)
    state_means = np.array([[copy_number] for copy_number in copy_numbers])
    state_covars = np.array(
        [[variance * copy_number + variance_fixed]
         for copy_number in copy_numbers])

    hmm = GaussianHMM(
        state_count,
        covariance_type='diag',
        n_iter=n_iter,
        params=params,
        init_params=init_params,
    )
    hmm.means_ = state_means
    hmm.covars_ = state_covars
    hmm.transmat_ = transition_matrix

    observations = np.column_stack([depth_normed])
    hmm.fit(observations)

    return hmm.predict(observations)
def calculate_hmm_g(training_set, test_set, taxonomy, cursor, connection, settings):
    """Build a full-covariance Gaussian HMM from a training set and run it on a test set.

    Start and transition probabilities come from
    ``start_transition_probability_extraction``; per-state means and
    covariances are computed from the features returned by
    ``extract_features_training_set_gaus``.  The parameterised model is then
    passed, together with the extracted test sequences, to ``da_predictions``.

    Nothing is returned.
    """
    da_id_taxonomy = find_da_id(taxonomy, cursor)

    extraction = start_transition_probability_extraction(training_set, taxonomy)
    states, start_probability, transition_probability = extraction

    training_features = extract_features_training_set_gaus(training_set, taxonomy, settings)
    # Feature dimensionality is read off the first feature vector of the first state.
    first_vector = training_features[states[0]][0]
    n_features = len(first_vector)
    state_means = calculate_means(states, training_features, n_features)
    state_covariances = calculate_covariance(states, training_features, n_features)

    hmm = GaussianHMM(n_components=len(states), covariance_type='full')
    hmm.startprob_ = start_probability
    hmm.transmat_ = transition_probability
    hmm.means_ = state_means
    hmm.covars_ = state_covariances

    test_seq, con_pathes = extract_features_test_set_gaus(test_set, taxonomy, settings)
    da_predictions(test_seq, hmm, con_pathes, states, da_id_taxonomy,
                   taxonomy, cursor, connection)
# Install the transition matrix on the model (both names are defined earlier,
# outside this fragment).
model_gaussian.transmat_ = transition_matrix

# Initial state probabilities, one entry per hidden state (three states here)
initial_state_prob = np.array([0.1, 0.4, 0.5])

# Set the initial state probabilities on the model
model_gaussian.startprob_ = initial_state_prob

# Each state emits a 2-D Gaussian, so the mean array has one row per state
# and two columns: shape (n_components, 2)
mean = np.array([[0.0, 0.0],
                 [0.0, 10.0],
                 [10.0, 0.0]])

# Set the means on the model
model_gaussian.means_ = mean

# Because each emission is a 2-D Gaussian, every state needs its own 2x2
# covariance matrix, so the covariances for all states together have shape
# (n_components, 2, 2). Here every state gets 0.5 times the 2x2 identity
# matrix.
covariance = 0.5 * np.tile(np.identity(2), (3, 1, 1))
model_gaussian.covars_ = covariance

# model.sample returns two values: the observations first and the hidden
# states second. Draw 100 samples; Z holds the observations and X the
# hidden states.
Z, X = model_gaussian.sample(100)

# Plotting the observations
plt.plot(Z[:, 0], Z[:, 1], "-o", label="observations",
    def fit_and_predict(self, dataset):
        """Fit an HMM on sliding windows of ``dataset`` and print rolling predictions.

        For each of ``self.num_calib`` window positions a full-covariance
        ``GaussianHMM`` with ``self.states`` components is fitted on
        ``self.time_step`` consecutive rows.  The first window lets the model
        initialise its own parameters (``init_params='stmc'``); every later
        window starts from the parameters fitted on the previous window.

        The prediction for a window is the last row of the window plus the
        row-to-row difference found at the prefix whose log-likelihood is
        closest to that of the whole window.  After every window the MAPE
        against the flipped first 100 rows of ``dataset`` and the predictions
        accumulated so far are printed.

        Nothing is returned.  The process exits with status 1 when a fit
        reports exactly 100 iterations.
        """
        predictions = np.empty([0, dataset.shape[1]])

        for idx in range(self.num_calib):
            window_end = idx + self.time_step
            window = dataset[idx:window_end:]
            # Never read afterwards, but the lookup itself raises when the row
            # following the window does not exist, so it is preserved.
            test_data = dataset[window_end, :]

            is_first_window = idx == 0
            model = GaussianHMM(n_components=self.states,
                                covariance_type='full',
                                verbose=True,
                                n_iter=100,
                                init_params='stmc' if is_first_window else '')
            if not is_first_window:
                # Warm start from the parameters fitted on the previous window.
                model.transmat_ = previous_transmat
                model.startprob_ = previous_startprob
                model.means_ = previous_means
                model.covars_ = previous_covars

            model.fit(window)

            print(model.transmat_)

            # Remember the fitted parameters for the next window.
            previous_transmat = model.transmat_
            previous_startprob = model.startprob_
            previous_means = model.means_
            previous_covars = model.covars_

            if model.monitor_.iter == 100:
                print('Increase number of iterations')
                sys.exit(1)

            # Log-likelihood of the window, then of every prefix from the
            # longest down to a single row.
            curr_likelihood = model.score(window[0:self.time_step, :])
            num_examples = window.shape[0]
            past_likelihood = np.array(
                [model.score(window[0:prefix_len, :])
                 for prefix_len in range(num_examples, 0, -1)])

            closest_idx = np.argmin(
                np.absolute(past_likelihood - curr_likelihood))
            predicted_change = (window[closest_idx, :]
                                - window[closest_idx + 1, :])

            latest_row = dataset[window_end - 1, :]
            predictions = np.vstack(
                (predictions, latest_row + predicted_change))

            mape = calc_mape(predictions,
                             np.flipud(dataset[range(100), :]))
            print('MAPE is ', mape)
            print(predictions)
Beispiel #8
0
def main():
    """
    Main function that performs footprint analysis.

    Parses the command-line options, reads the experimental matrix given as
    the single positional argument, collects the region / DNase / histone
    inputs of every group, builds one GaussianHMM (or a list of them) per
    group, applies the HMM(s) to every extended region, post-processes the
    resulting footprints and writes one BED file (and optionally a bigBed
    file) per group.

    Keyword arguments: None
        
    Return: None
    """

    ###################################################################################################
    # Processing Input Arguments
    ###################################################################################################

    # Initializing ErrorHandler
    error_handler = ErrorHandler()
 
    # Parameters
    current_version = "0.0.1"
    usage_message = ("\n--------------------------------------------------\n"
                     "The 'hint' program predicts TFBSs given open chromatin data.\n"
                     "In order to use this tools, please type: \n\n"
                     "%prog [options] <experiment_matrix>\n\n"
                     "The <experiment matrix> should contain:\n"
                     "- One region file representing the regions in which the HMM\n"
                     "  will be applied. It should contain 'regions' in the type field\n"
                     "- One DNase aligned reads file (bam) file with 'DNASE' in the name field.\n"
                     "- One to Three histone modification aligned reads file (bam).\n\n"

                     "For more information, please refer to:\n"
                     "http://www.regulatory-genomics.org/dnasefootprints/\n"
                     "--------------------------------------------------")
    version_message = "HINT - Regulatory Analysis Toolbox (RGT). Version: "+str(current_version)

    # Initializing Option Parser
    parser = PassThroughOptionParser(usage = usage_message, version = version_message)

    # Optional Input Options
    parser.add_option("--hmm-file", dest = "hmm_file", type = "string", 
                      metavar="FILE_1_1[[,...,FILE_N_1];...;FILE_1_M[,...,FILE_N_M]]", default = None,
                      help = ("List of HMM files separated by comma. If one file only, then this HMM will be "
                              "applied for all histone signals, otherwise, the list must have the same number "
                              "of histone files given. The order of the list should be the order of the "
                              "histones in the input_matrix file. If the argument is not given, then a default HMM "
                              "will be used. In case multiple input groups are used, then "
                              "other lists can be passed using semicolon. The number of group of lists should "
                              "equals the number of input groups."))
    parser.add_option("--bias-table", dest = "bias_table", type = "string",
                      metavar="FILE1_F,FILE1_R[;...;FILEM_F,FILEM_R]", default = None,
                      help = ("List of files (for each input group; separated by semicolon) with all "
                              "possible k-mers (for any k) and their bias estimates. Each input group"
                              "should have two files: one for the forward and one for the negative strand."
                              "Each line should contain a kmer and the bias estimate separated by tab. "
                              "Leave an empty set for histone-only groups. Eg. FILE1;;FILE3."))

    # Parameters Options
    parser.add_option("--organism", dest = "organism", type = "string", metavar="STRING", default = "hg19",
                      help = ("Organism considered on the analysis. Check our full documentation for all available "
                              "options. All default files such as genomes will be based on the chosen organism "
                              "and the data.config file. This option is used only if a bigbed output is asked."))
    parser.add_option("--estimate-bias-correction", dest = "estimate_bias_correction",
                      action = "store_true", default = False,
                      help = ("Applies DNase-seq cleavage bias correction with k-mer bias estimated "
                              "from the given DNase-seq data (SLOW HINT-BC)."))
    parser.add_option("--default-bias-correction", dest = "default_bias_correction",
                      action = "store_true", default = False,
                      help = ("Applies DNase-seq cleavage bias correction with default "
                              "k-mer bias estimates (FAST HINT-BC)."))

    # The options below pass SUPPRESS_HELP as their help text.
    parser.add_option("--dnase-norm-per", dest = "dnase_norm_per", type = "float", metavar="INT", default = 98,
                      help = SUPPRESS_HELP)
    parser.add_option("--dnase-slope-per", dest = "dnase_slope_per", type = "float", metavar="INT", default = 98,
                      help = SUPPRESS_HELP)
    parser.add_option("--dnase-frag-ext", dest = "dnase_frag_ext", type = "int", metavar="INT", default = 1,
                      help = SUPPRESS_HELP)
    parser.add_option("--ext-both-directions", dest = "ext_both_directions", action = "store_true", default = False,
                      help = SUPPRESS_HELP)

    parser.add_option("--histone-norm-per", dest = "histone_norm_per", type = "float", metavar="INT", default = 98,
                      help = SUPPRESS_HELP)
    parser.add_option("--histone-slope-per", dest = "histone_slope_per", type = "float", metavar="INT", default = 98,
                      help = SUPPRESS_HELP)

    # Output Options
    parser.add_option("--output-location", dest = "output_location", type = "string", metavar="PATH", 
                      default = getcwd(),
                      help = ("Path where the output files will be written."))
    parser.add_option("--print-bb", dest = "print_bb", action = "store_true", default = False,
                      help = ("If used, the output will be a bigbed (.bb) file."))

    parser.add_option("--print-wig", dest = "print_wig", type = "string", metavar="PATH", default = None,
                      help = SUPPRESS_HELP)

    # Processing Options
    # Exactly one positional argument (the experimental matrix) is accepted.
    options, arguments = parser.parse_args()
    if(not arguments or len(arguments) > 1): error_handler.throw_error("FP_WRONG_ARGUMENT")

    # Fixed Parameters ################
    region_total_ext = 10000
    fp_limit_size = 50
    fp_limit_size_histone = 2000
    fp_limit_size_ext = 10
    fp_limit_size_ext_histone = 200
    fp_ext = 5
    fp_ext_histone = 50
    tc_ext = 50
    tc_ext_histone = 500
    ###
    dnase_initial_clip = 1000
    dnase_sg_window_size = 9
    dnase_norm_per = options.dnase_norm_per
    dnase_slope_per = options.dnase_slope_per
    dnase_frag_ext = options.dnase_frag_ext
    dnase_ext_both_directions = options.ext_both_directions
    ###
    histone_initial_clip = 1000
    histone_sg_window_size = 201
    histone_norm_per = options.histone_norm_per
    histone_slope_per = options.histone_slope_per
    histone_frag_ext = 200
    ###################################

    # Output wig signal
    # NOTE(review): the `""` inside each command is Python adjacent-string-literal
    # concatenation, not an empty shell argument; the shell receives
    # `... | echo -n  > <file>`. The path is also interpolated into the shell
    # command unquoted.
    if(options.print_wig):
        system("touch "+options.print_wig+"signal.wig | echo -n "" > "+options.print_wig+"signal.wig")
        system("touch "+options.print_wig+"norm.wig | echo -n "" > "+options.print_wig+"norm.wig")
        system("touch "+options.print_wig+"slope.wig | echo -n "" > "+options.print_wig+"slope.wig")

    # Global class initialization
    genome_data = GenomeData(options.organism)
    hmm_data = HmmData()
    
    ###################################################################################################
    # Reading Input Matrix
    ###################################################################################################

    # Reading input argument
    input_matrix = arguments[0]

    # Create experimental matrix
    try:
        exp_matrix = ExperimentalMatrix()
        exp_matrix.read(input_matrix)
    except Exception: error_handler.throw_error("FP_WRONG_EXPMAT")

    ###################################################################################################
    # Reading Input
    ###################################################################################################

    # Group class
    # Plain record holding the inputs and the HMM(s) of one input group.
    class Group:
        def __init__(self):
            self.name = None
            self.original_regions = None
            self.regions = None
            self.dnase_file = None
            self.histone_file_list = []
            self.dnase_only = True
            self.histone_only = True
            self.hmm = []
            self.flag_multiple_hmms = False
            self.bias_table = None

    # Initialization
    name_list = exp_matrix.names
    type_list = exp_matrix.types
    file_dict = exp_matrix.files
    fields_dict = exp_matrix.fieldsDict
    objects_dict = exp_matrix.objectsDict

    # Populating fields dict data
    # Make sure every expected data category has an entry, defaulting to an empty list.
    for e in ["HS", "DNASE", "HISTONE"]:
        try: fields_dict["data"][e]
        except Exception: fields_dict["data"][e] = []

    # Fetching files per group
    group_list = []
    for g in fields_dict["group"].keys():
        group = Group()
        group.name = g
        # NOTE(review): `i` counts the entries of this group but is used to index
        # the global `name_list` - confirm this picks the entries belonging to `g`.
        for i in range(0,len(fields_dict["group"][g])):
            if(name_list[i] in fields_dict["data"]["HS"]):
                group.original_regions = objects_dict[name_list[i]]
                group.regions = deepcopy(group.original_regions)
                group.regions.extend(int(region_total_ext/2),int(region_total_ext/2)) # Extending
                group.regions.merge() # Sort & Merge
            elif(name_list[i] in fields_dict["data"]["DNASE"]):
                group.dnase_file = GenomicSignal(file_dict[name_list[i]])
                group.dnase_file.load_sg_coefs(dnase_sg_window_size)
            elif(name_list[i] in fields_dict["data"]["HISTONE"]):
                group.histone_file_list.append(GenomicSignal(file_dict[name_list[i]]))
                group.histone_file_list[-1].load_sg_coefs(histone_sg_window_size)
            else: pass # TODO Error (Category of data outside "HS, DNASE, HISTONE")
        # Both flags start as True; each is cleared when the other kind of data is present.
        if(group.dnase_file): group.histone_only = False
        if(group.histone_file_list): group.dnase_only = False
        if(group.histone_only and group.dnase_only): pass # TODO ERROR (There is no DNase or histone data)
        if(not group.original_regions): pass # TODO ERROR (There is no HS regions)
        group_list.append(group)

    ###################################################################################################
    # Fetching Bias Table
    ###################################################################################################

    # The three bias-table sources are mutually exclusive: an explicit table wins over
    # estimation, which wins over the default table. Histone-only groups are skipped.
    bias_correction = False
    if(options.bias_table):

        bias_table_group_list = options.bias_table.split(";")
        if(len(bias_table_group_list) != len(group_list)): pass # TODO ERROR
        for g in range(0,len(group_list)):
            group = group_list[g]
            bias_table_list = bias_table_group_list[g].split(",")
            if(group.histone_only): continue
            group.bias_table = BiasTable(table_file_F=bias_table_list[0], table_file_R=bias_table_list[1])
        bias_correction = True

    elif(options.estimate_bias_correction):

        for group in group_list:
            if(group.histone_only): continue
            group.bias_table = BiasTable(regions=group.original_regions,dnase_file_name=group.dnase_file.file_name,
                                         genome_file_name=genome_data.get_genome())
        bias_correction = True

    elif(options.default_bias_correction):

        for group in group_list:
            if(group.histone_only): continue
            group.bias_table = BiasTable(table_file_F=hmm_data.get_default_bias_table_F(),
                                         table_file_R=hmm_data.get_default_bias_table_R())
        bias_correction = True

    ###################################################################################################
    # Creating HMMs
    ###################################################################################################

    # Fetching HMM input
    flag_multiple_hmms = False
    if(options.hmm_file): # Argument is passed

        hmm_group_list = options.hmm_file.split(";")
        if(len(hmm_group_list) != len(group_list)): pass # TODO ERROR
        for g in range(0,len(group_list)):

            group = group_list[g]

            # Fetching list of HMM files
            group.hmm = hmm_group_list[g].split(",")

            # Verifying HMM application mode (one HMM or multiple HMM files)
            # NOTE(review): `histone_file_name_list` is not assigned anywhere in this
            # function (presumably `group.histone_file_list` was meant - verify).
            # This branch also sets the function-level `flag_multiple_hmms`, while
            # the HMM-building loop below tests `group.flag_multiple_hmms`, which is
            # never set to True in this function.
            if(len(group.hmm) == 1):
                group.flag_multiple_hmms = False
                group.hmm = group.hmm[0]
            elif(len(group.hmm) == len(histone_file_name_list)): flag_multiple_hmms = True
            else: error_handler.throw_error("FP_NB_HMMS")

    else: # Argument was not passed

        # Pick the default HMM file that matches the data available in the group
        # and whether bias correction is active.
        for group in group_list:

            group.flag_multiple_hmms = False
            if(group.dnase_only):
                if(bias_correction): group.hmm = hmm_data.get_default_hmm_dnase_bc()
                else: group.hmm = hmm_data.get_default_hmm_dnase()
            elif(group.histone_only):
                group.hmm = hmm_data.get_default_hmm_histone()
            else: 
                if(bias_correction): group.hmm = hmm_data.get_default_hmm_dnase_histone_bc()
                else: group.hmm = hmm_data.get_default_hmm_dnase_histone()

    # Creating scikit HMM list
    # NOTE(review): `hmm_ver` is not assigned in this function; presumably a
    # module-level version string - verify. For versions with major <= 0 and
    # minor <= 1 the transition matrix and start probabilities are passed to the
    # constructor; otherwise they are assigned as attributes afterwards.
    for group in group_list:

        if(group.flag_multiple_hmms):

            hmm_list = []
            for hmm_file_name in group.hmm:

                try:
                    hmm_scaffold = HMM()
                    hmm_scaffold.load_hmm(hmm_file_name)
                    if(int(hmm_ver.split(".")[0]) <= 0 and int(hmm_ver.split(".")[1]) <= 1):
                        scikit_hmm = GaussianHMM(n_components=hmm_scaffold.states, covariance_type="full", 
                                                 transmat=array(hmm_scaffold.A), startprob=array(hmm_scaffold.pi))
                        scikit_hmm.means_ = array(hmm_scaffold.means)
                        scikit_hmm.covars_ = array(hmm_scaffold.covs)
                    else:
                        scikit_hmm = GaussianHMM(n_components=hmm_scaffold.states, covariance_type="full")
                        scikit_hmm.startprob_ = array(hmm_scaffold.pi)
                        scikit_hmm.transmat_ = array(hmm_scaffold.A)
                        scikit_hmm.means_ = array(hmm_scaffold.means)
                        scikit_hmm.covars_ = array(hmm_scaffold.covs)

                except Exception: error_handler.throw_error("FP_HMM_FILES")
                hmm_list.append(scikit_hmm)

            group.hmm = hmm_list

        else:

            scikit_hmm = None
            try:
                hmm_scaffold = HMM()
                hmm_scaffold.load_hmm(group.hmm)
                if(int(hmm_ver.split(".")[0]) <= 0 and int(hmm_ver.split(".")[1]) <= 1):
                    scikit_hmm = GaussianHMM(n_components=hmm_scaffold.states, covariance_type="full", 
                                             transmat=array(hmm_scaffold.A), startprob=array(hmm_scaffold.pi))
                    scikit_hmm.means_ = array(hmm_scaffold.means)
                    scikit_hmm.covars_ = array(hmm_scaffold.covs)
                else:
                    scikit_hmm = GaussianHMM(n_components=hmm_scaffold.states, covariance_type="full")
                    scikit_hmm.startprob_ = array(hmm_scaffold.pi)
                    scikit_hmm.transmat_ = array(hmm_scaffold.A)
                    scikit_hmm.means_ = array(hmm_scaffold.means)
                    scikit_hmm.covars_ = array(hmm_scaffold.covs)


            except Exception: error_handler.throw_error("FP_HMM_FILES")
            group.hmm = scikit_hmm

    ###################################################################################################
    # Main Pipeline
    ###################################################################################################

    # Iterating over groups
    for group in group_list:

        # Initializing result set
        footprints = GenomicRegionSet(group.name)

        # Iterating over regions
        for r in group.regions.sequences:

            ###################################################################################################
            # DNASE ONLY
            ###################################################################################################

            if(group.dnase_only):

                # Fetching DNase signal
                # NOTE(review): in this and every following handler of the pipeline the
                # bare `raise` re-raises the exception immediately, so the
                # `throw_warning` call and the `continue` after it are unreachable.
                try: dnase_norm, dnase_slope = group.dnase_file.get_signal(r.chrom, r.initial, r.final, 
                                               dnase_frag_ext, dnase_initial_clip, dnase_norm_per,
                                               dnase_slope_per, group.bias_table, genome_data.get_genome(),
                                               dnase_ext_both_directions, options.print_wig)
                except Exception:
                    raise
                    error_handler.throw_warning("FP_DNASE_PROC", add_msg="for region ("+",".join([r.chrom, 
                                  str(r.initial), str(r.final)])+"). This iteration will be skipped.")
                    continue

                # Formatting sequence
                # One row per position, columns are (normalized signal, slope).
                try: input_sequence = array([dnase_norm,dnase_slope]).T
                except Exception:
                    raise
                    error_handler.throw_warning("FP_SEQ_FORMAT",add_msg="for region ("+",".join([r.chrom, 
                                  str(r.initial), str(r.final)])+"). This iteration will be skipped.")
                    continue

                # Applying HMM
                if(isinstance(group.hmm,list)): continue # TODO Error
                try: posterior_list = group.hmm.predict(input_sequence)
                except Exception:
                    raise
                    error_handler.throw_warning("FP_HMM_APPLIC",add_msg="in region ("+",".join([r.chrom, 
                                  str(r.initial), str(r.final)])+"). This iteration will be skipped.")
                    continue

                # Formatting results
                # Scan the predicted states for maximal runs of `fp_state_nb`. A run that
                # ends inside the region is kept only if shorter than `fp_limit_size`;
                # a run still open at the region end is added without that size check.
                start_pos = 0
                flag_start = False
                fp_state_nb = 4
                for k in range(r.initial, r.final):
                    curr_index = k - r.initial
                    if(flag_start):
                        if(posterior_list[curr_index] != fp_state_nb):
                            if(k-start_pos < fp_limit_size):
                                fp = GenomicRegion(r.chrom, start_pos, k)
                                footprints.add(fp)
                            flag_start = False
                    else:
                        if(posterior_list[curr_index] == fp_state_nb):
                            flag_start = True
                            start_pos = k
                if(flag_start): 
                    fp = GenomicRegion(r.chrom, start_pos, r.final)
                    footprints.add(fp)

            ###################################################################################################
            # HISTONES
            ###################################################################################################

            else:

                # Fetching DNase signal
                if(not group.histone_only):
                    try:
                        dnase_norm, dnase_slope = group.dnase_file.get_signal(r.chrom, r.initial, r.final, 
                                                  dnase_frag_ext, dnase_initial_clip, dnase_norm_per,
                                                  dnase_slope_per, group.bias_table, genome_data.get_genome(),
                                                  dnase_ext_both_directions, options.print_wig)
                    except Exception:
                        raise
                        error_handler.throw_warning("FP_DNASE_PROC", add_msg="for region ("+",".join([r.chrom, 
                                      str(r.initial), str(r.final)])+"). This iteration will be skipped.")
                        continue

                # Iterating over histone modifications
                for i in range(0,len(group.histone_file_list)):

                    # Fetching histone signal
                    try:
                        histone_file = group.histone_file_list[i]
                        histone_norm, histone_slope = histone_file.get_signal(r.chrom, r.initial, r.final, 
                                                      histone_frag_ext, histone_initial_clip, histone_norm_per,
                                                      histone_slope_per, options.print_wig)
                    except Exception:
                        raise
                        error_handler.throw_warning("FP_HISTONE_PROC",add_msg="for region ("+",".join([r.chrom, 
                                      str(r.initial), str(r.final)])+") and histone modification "+histone_file.file_name+". This iteration will be skipped for this histone.")
                        continue

                    # Formatting sequence
                    # Two columns for histone-only groups, four when DNase data is combined.
                    try:
                        if(group.histone_only): input_sequence = array([histone_norm,histone_slope]).T
                        else: input_sequence = array([dnase_norm,dnase_slope,histone_norm,histone_slope]).T
                    except Exception:
                        raise
                        error_handler.throw_warning("FP_SEQ_FORMAT",add_msg="for region ("+",".join([r.chrom, str(r.initial), str(r.final)])+") and histone modification "+histone_file.file_name+". This iteration will be skipped.")
                        continue

                    # Applying HMM
                    # NOTE(review): this tests the function-level `flag_multiple_hmms`,
                    # not `group.flag_multiple_hmms` - confirm which one is intended.
                    if(flag_multiple_hmms): current_hmm = group.hmm[i]
                    else: current_hmm = group.hmm
                    try: posterior_list = current_hmm.predict(input_sequence)
                    except Exception:
                        raise
                        error_handler.throw_warning("FP_HMM_APPLIC",add_msg="in region ("+",".join([r.chrom, str(r.initial), str(r.final)])+") and histone modification "+histone_file.file_name+". This iteration will be skipped.")
                        continue

                    # Histone-only limit size
                    # NOTE(review): `fp_limit_size` is overwritten here and never reset,
                    # so the histone limit also applies to every group processed later.
                    if(group.histone_only):
                        fp_limit_size = fp_limit_size_histone
                        fp_state_nb = 4
                    else: fp_state_nb = 7

                    # Formatting results
                    start_pos = 0
                    flag_start = False
                    for k in range(r.initial, r.final):
                        curr_index = k - r.initial
                        if(flag_start):
                            if(posterior_list[curr_index] != fp_state_nb):
                                if(k-start_pos < fp_limit_size):
                                    fp = GenomicRegion(r.chrom, start_pos, k)
                                    footprints.add(fp)
                                flag_start = False
                        else:
                            if(posterior_list[curr_index] == fp_state_nb):
                                flag_start = True
                                start_pos = k
                    if(flag_start): 
                        fp = GenomicRegion(r.chrom, start_pos, r.final)
                        footprints.add(fp)

        ###################################################################################################
        # Post-processing
        ###################################################################################################

        # Parameters
        # NOTE(review): in the histone-only branch `fp_ext` and `tc_ext` are overwritten
        # with the histone values, and the self-assignments in the else branch do not
        # restore the DNase defaults, so the histone values persist for later groups.
        if(group.histone_only):
            fp_limit = fp_limit_size_ext_histone
            fp_ext = fp_ext_histone
            tc_ext = tc_ext_histone
            tcsignal = group.histone_file_list[0]
            tcfragext = 1
            tcinitialclip = histone_initial_clip
            tcextboth = False
        else:
            fp_limit = fp_limit_size_ext
            fp_ext = fp_ext
            tc_ext = tc_ext
            tcsignal = group.dnase_file
            tcfragext = 1
            tcinitialclip = dnase_initial_clip
            tcextboth = dnase_ext_both_directions

        # Sorting and Merging
        footprints.merge()

        # Overlapping results with original regions
        footprints = footprints.intersect(group.original_regions,mode=OverlapType.ORIGINAL)

        # Extending footprints
        # Only footprints shorter than `fp_limit` are widened, by `fp_ext` on each
        # side; the start is clamped at 0.
        for f in footprints.sequences:
            if(f.final - f.initial < fp_limit):
                f.initial = max(0,f.initial-fp_ext)
                f.final = f.final+fp_ext

        # Fetching chromosome sizes
        # Each line is split on tabs: first field is the chromosome name, second its size.
        chrom_sizes_file_name = genome_data.get_chromosome_sizes()
        chrom_sizes_file = open(chrom_sizes_file_name,"r")
        chrom_sizes_dict = dict()
        for chrom_sizes_entry_line in chrom_sizes_file:
            chrom_sizes_entry_vec = chrom_sizes_entry_line.strip().split("\t")
            chrom_sizes_dict[chrom_sizes_entry_vec[0]] = int(chrom_sizes_entry_vec[1])
        chrom_sizes_file.close()

        # Evaluating TC
        # Tag count in a window of `tc_ext` on each side of the footprint midpoint,
        # clamped to [0, chromosome size]; any failure yields a count of 0.
        # NOTE(review): `/` yields a float midpoint under Python 3 - confirm the
        # interpreter version this file targets.
        for f in footprints.sequences:
            mid = (f.initial+f.final)/2
            p1 = max(mid - tc_ext,0)
            p2 = min(mid + tc_ext,chrom_sizes_dict[f.chrom])
            try: tag_count = tcsignal.get_tag_count(f.chrom, p1, p2, tcfragext, tcinitialclip, tcextboth)
            except Exception: tag_count = 0
            f.data = str(int(tag_count))

        ###################################################################################################
        # Writing output
        ###################################################################################################

        # Creating output file
        # NOTE(review): the location and the group name are concatenated without a
        # path separator - presumably `output_location` must end with one; verify.
        output_file_name = options.output_location+group.name+".bed"
        footprints.write_bed(output_file_name)

        # Verifying condition to write bb
        if(options.print_bb):

            # Fetching file with chromosome sizes
            
            chrom_sizes_file = genome_data.get_chromosome_sizes()

            # Converting to big bed
            # NOTE(review): no option with dest "footprint_name" is added to the parser
            # in this function - confirm where `options.footprint_name` comes from.
            output_bb_name = options.output_location+options.footprint_name+".bb"
            system(" ".join(["bedToBigBed",output_file_name,chrom_sizes_file,output_bb_name]))
Beispiel #9
0
                   [1759475.42864922, 36552747.45908708],
                   [2724.71340548, 296602.83220848],
                   [63837.66522882, 2867629.16600791],
                   [20513.28086561, 19980338.31462503],
                   [28962.97633114, 520482.13848515],
                   [4315.55389006, 3128607.93648248],
                   [1790.20488976, 123237.84834907]])

# Create a 10-state diagonal-covariance HMM. Its parameters are assigned
# directly from previously estimated values instead of being fitted.
test_model = GaussianHMM(n_components=10, covariance_type="diag")

test_model.startprob_ = startprob
test_model.transmat_ = transmat
test_model.means_ = means
test_model.covars_ = covars

test_hidden_states = test_model.predict(X)
print(test_hidden_states)

# test_result is another name for test_hidden_states (no copy is made), so the
# relabelling below happens in place: states 0, 1, 2 and 4 are mapped to -1.
test_result = test_hidden_states
for i in range(len(test_close_v_real)):
    if test_result[i] in (0, 1, 2, 4):
        test_result[i] = -1
Beispiel #10
0
def MyGaussianHMM():
    """Fit a Gaussian HMM to the index CSV, report and plot it, then hand-build a second model.

    Columns 0-4 of the CSV are used as the observation matrix, column 5 as
    the trade date and column 6 as the closing index.  The function prints
    the fitted parameters, shows one scatter series per hidden state, prints
    a likelihood and a Viterbi decoding, and finally configures a 4-state,
    2-feature model from literal parameters.  It returns None.
    """
    from hmmlearn.hmm import GaussianHMM
    # header=None: the file has no header row.  The former header=-1 spelling
    # is rejected by current pandas releases.
    df = pd.read_csv(
        "/home/ray/Documents/suibe/2017/建模/Modeling_Preparation/dataset/SZIndex.csv",
        header=None)
    X = np.array(df.iloc[:, 0:5])

    # Part 1: model unknown -> learn it from the data (problem 3).
    # covariance_type options (see the hmmlearn documentation):
    #   "spherical": one variance per state shared by all features
    #                (usable when there is too little data to estimate more)
    #   "diag"     : a diagonal covariance matrix per state (usual compromise)
    #   "full"     : an unrestricted covariance matrix per state
    #                (needs enough data to estimate every entry)
    model = GaussianHMM(n_components=6, covariance_type="diag",
                        n_iter=1000)  # diagonal covariance matrices
    model.fit(X)

    # Hidden state of every day; reused for the plot below.
    hidden_states = model.predict(X)
    print("隐含状态为: ", hidden_states)
    print("特征数目 %s" % model.n_features)
    print("隐状态数目 %s" % model.n_components)
    print("起始概率 :", model.startprob_)
    print("隐状态转移矩阵", model.transmat_)
    # Each hidden state's emission distribution is assumed Gaussian, so the
    # means form a matrix with model.n_components rows and model.n_features
    # columns.
    print("混淆矩阵:均值部分", model.means_)
    print("混淆矩阵:方差部分", model.covars_)

    # Plot the closing index, one series per hidden state.
    tradeDate = df.iloc[:, 5].values
    closeIndex = df.iloc[:, 6].values
    plt.figure(figsize=(15, 8))
    for i in range(model.n_components):
        idx = (hidden_states == i)
        plt.plot_date(pd.to_datetime(tradeDate[idx]),
                      closeIndex[idx],
                      '.',
                      label='%dth hidden state' % i,
                      lw=1)
    # Legend and grid only need to be set once, after all series exist.
    plt.legend()
    plt.grid(True)
    plt.show()

    # Part 2: model known -> evaluation and decoding (problems 1 and 2),
    # reusing the model fitted above.
    # Problem 1: score() needs a 2-D (n_samples, n_features) array, so pass
    # the first day as a one-row slice rather than the 1-D row X[0].
    print("某天出现该观测的概率为: %s" % np.exp(model.score(X[:1])))
    # Problem 2: most likely state sequence for the first ten days.
    log_prob, state = model.decode(X[:10], algorithm="viterbi")
    print("只根据前十天,推断出最有可能的隐含状态序列为:", state)

    # Supplying the model parameters by hand:
    # a case with 2 features and 4 hidden states.
    startprob = np.array([0.6, 0.3, 0.1, 0.0])
    # The transition matrix, note that there are no transitions possible
    # between component 1 and 3
    transmat = np.array([[0.7, 0.2, 0.0, 0.1], [0.3, 0.5, 0.2, 0.0],
                         [0.0, 0.3, 0.5, 0.2], [0.2, 0.0, 0.2, 0.6]])
    # The means of each component
    means = np.array([[0.0, 0.0], [0.0, 11.0], [9.0, 10.0], [11.0, -1.0]])
    # The covariance of each component
    covars = .5 * np.tile(np.identity(2), (4, 1, 1))
    model2 = GaussianHMM(n_components=4, covariance_type="full", n_iter=1000)
    model2.startprob_ = startprob
    model2.transmat_ = transmat
    model2.means_ = means
    model2.covars_ = covars
Beispiel #11
0
def predictions_mls(filename, company, dt1, dt2, num_of_states, test_num, days_future, tr_prob):
    """Simulate price paths from a saved HMM and write summary statistics to CSV.

    The model stored in ``filename`` is loaded and used to find the most
    likely hidden state at the end of the ``dt1``..``dt2`` history of
    ``company``.  A sampling model that starts in that state then generates
    ``test_num`` paths of daily price changes, each accumulated on top of the
    first real price returned by ``getrealprice_series``.  The per-day mean,
    real prices, errors, ``tr_prob``, variance and median are saved under
    ``./sims_final``.  Returns None.
    """
    trained = joblib.load(filename)

    real_prices = getrealprice_series(company, dt2, days_future)
    horizon = real_prices.size

    history = quotes_historical_yahoo_ochl(company, dt1, dt2)
    # Dropping the first element keeps these aligned with the differenced
    # closing prices, which are one element shorter.
    dates = np.array([row[0] for row in history], dtype=int)[1:]
    closes = np.array([row[2] for row in history])
    daily_change = np.diff(closes)
    closes = closes[1:]

    observations = np.column_stack([daily_change])

    # Posterior over hidden states for the last observed day.
    latest_posterior = trained.predict_proba(observations)[-1]

    # With several states, force the sampler to start in the most likely one.
    if num_of_states > 1:
        startprob = np.zeros(num_of_states)
        startprob[latest_posterior.argmax()] = 1.0
    else:
        startprob = [ 1.]

    # Sampling model: same dynamics as the trained one, fixed start state.
    model_2_sample = GaussianHMM(n_components=num_of_states, covariance_type="full")
    model_2_sample.startprob_ = startprob
    model_2_sample.transmat_ = trained.transmat_
    model_2_sample.means_ = trained.means_
    model_2_sample.covars_ = trained.covars_

    def draw_path():
        # A fresh random seed per draw keeps the simulated paths different.
        seed = random.randrange(0, max_int_value)
        return model_2_sample.sample(horizon, random_state=seed)

    random.seed()
    sampled, _states = draw_path()

    allpredictions = np.zeros((test_num, horizon))
    for run in range(test_num):
        # Day 0 of every path is the first real price.
        price = real_prices[0]
        allpredictions[run][0] = price

        for day in range(1, horizon):
            price += sampled[day][0]
            allpredictions[run][day] = price

        sampled, _states = draw_path()

    predictions = allpredictions.mean(axis=0)
    predictions_var = allpredictions.var(axis=0)
    predictions_median = np.median(allpredictions, axis=0)

    errors = predictions - real_prices
    tr_prob_vector = np.full((predictions.size), tr_prob)

    data = [predictions, real_prices, errors, tr_prob_vector,
            predictions_var, predictions_median]

    err_final = errors[-1]

    print ("Start Price: ", real_prices[0], "Avg. Prediction: ", str(num_of_states), "states:",
           predictions[-1], " Real Price:", real_prices[-1])
    print (" Error end of predictions:", err_final, "Delta Start-End:",
           real_prices[0] - real_prices[-1], "\n")

    fname = "".join(["Predictions_", str(company), "_States_",
                     str(num_of_states), "_stats.csv"])
    fname = os.path.join('./sims_final', fname)
    np.savetxt(fname, data, delimiter=",")

    return
Beispiel #12
0
best_covars = model.covars_

# Refit several times and keep the parameters of the best-scoring run.
# The running maximum starts at -inf because log-likelihoods can be far
# below a fixed sentinel such as -999, in which case no run would be kept.
max_prop = float("-inf")
for i in range(5):
    model.fit(A)
    temp_prop = model.score(A)
    if temp_prop > max_prop:
        max_prop = temp_prop
        best_startprob_ = model.startprob_
        best_transmat = model.transmat_
        best_means_ = model.means_
        best_covars = model.covars_

model.startprob_ = best_startprob_
model.transmat_ = best_transmat
model.means_ = best_means_
model.covars_ = best_covars


# With the model parameters known, decode the hidden state sequence from the
# observation sequence.
hidden_states = model.predict(A)
print(hidden_states)

# Mark every predicted state on the index curve so the states can be
# compared visually.
plt.figure(figsize=(25, 18))
for i in range(model.n_components):   # one series per hidden state
    pos = (hidden_states==i)
    plt.plot_date(Date[pos],close[pos],'o',label='hidden state %d'%i,lw=2)
# "left" is not a valid legend location; "best" lets matplotlib choose.
# One call after the loop is enough to cover every series.
plt.legend(loc="best")
plt.show()
		stateCovs[i] = covAll * args.fractionBG/args.ploidy; # since if the variance is 0, the probability of observing anything but the mean (0) is 0
	else:
		stateCovs[i] = covAll * float(i)/args.ploidy;
	cnvsToStateIs[i]=i
	statePDFMaxima[i]=np.log(multivariate_normal.pdf(x=stateMeans[i],mean=stateMeans[i],cov=stateCovs[i]))

# Keep references to the mappings under the "0" names.
cnvsToStateIs0 = cnvsToStateIs
stateIsToCNVs0 = stateIsToCNVs
if len(IDs) == 1:
	# Insert an extra axis at position 1 when there is only one ID.
	stateCovs = np.expand_dims(stateCovs, 1)


# Build the HMM and insert the hand-computed emission parameters.
model = GaussianHMM(numStates, covariance_type="full", n_iter=1)
model.means_ = stateMeans
model.covars_ = stateCovs

# Transition matrix: args.transition is a log10 probability applied to every
# off-diagonal (state-changing) entry.
offDiagonal = 1 - np.eye(numStates)
if args.transition <= -100:
	# Write the log-matrix directly instead of exponentiating; the diagonal
	# is left at log-probability 0.
	transitionMatrix = offDiagonal * args.transition * np.log(10)
	model._log_transmat = transitionMatrix
else:
	switchProb = 10**args.transition
	# Diagonal takes the remaining mass so each row sums to one.
	transitionMatrix = np.eye(numStates) * (1 - (numStates - 1) * switchProb) + offDiagonal * switchProb
	model._set_transmat(transitionMatrix)

if args.verbose > 0:
	sys.stderr.write(np.array_str(model._log_transmat) + "\n")

meanNormal = meanAll
normalState = cnvsToStateIs[args.ploidy]
Beispiel #14
0
# Hand-specified parameters of a 4-state, 2-feature Gaussian HMM.
startprob = np.array([0.6, 0.3, 0.1, 0.0])
transmat = np.array([
    [0.7, 0.2, 0.0, 0.1],
    [0.3, 0.5, 0.2, 0.0],
    [0.0, 0.3, 0.5, 0.2],
    [0.2, 0.0, 0.2, 0.6],
])
means = np.array([[0.0, 0.0], [0.0, 11.0], [9.0, 10.0], [11.0, -1.0]])
# One 0.5 * identity covariance matrix per state.
covars = .5 * np.tile(np.identity(2), (4, 1, 1))

# Assign the parameters directly instead of fitting.
model = GaussianHMM(n_components=4, covariance_type="full")
model.startprob_ = startprob
model.transmat_ = transmat
model.means_ = means
model.covars_ = covars

# Draw five observations together with their hidden states.
X, state_sequence = model.sample(n_samples=5)

# Plot the sampled trajectory and label each component at its mean.
plt.plot(
    X[:, 0], X[:, 1], ".-",
    label="observations", ms=6, mfc="orange", alpha=0.7,
)
for number, (mean_x, mean_y) in enumerate(means, start=1):
    plt.text(
        mean_x, mean_y, 'Component %i' % number,
        size=12, horizontalalignment='center',
        bbox=dict(alpha=.7, facecolor='w'),
    )
plt.legend(loc='best')
plt.show()
Beispiel #15
0
# Assign the transition probabilities (transition_matrix is built earlier in
# the script, outside this excerpt).
model_gaussian.transmat_ = transition_matrix

# Initial state probabilities, one entry per hidden state.
initial_state_prob = np.array([0.1, 0.4, 0.5])

# Assign the initial state probabilities.
model_gaussian.startprob_ = initial_state_prob

# The emissions are 2-D Gaussians, so the means array has shape
# (n_components, 2): one 2-D mean per state.
mean = np.array([[0.0, 0.0], [0.0, 10.0], [10.0, 0.0]])

# Assign the means.
model_gaussian.means_ = mean

# Each state's 2-D Gaussian needs a 2x2 covariance matrix, so the
# covariances for all states together have shape (n_components, 2, 2).
# Here every state gets 0.5 * identity.
covariance = 0.5 * np.tile(np.identity(2), (3, 1, 1))
model_gaussian.covars_ = covariance

# sample() returns the observations first and the hidden states second,
# so Z receives the 100 sampled observations and X the hidden states.
Z, X = model_gaussian.sample(100)

# Plotting the observations
plt.plot(Z[:, 0],
def predictions_mls(filename, company, refcompany, dt1, dt2, num_of_states,
                    test_num):
    """Simulate a year of prices for ``company`` from a saved two-feature HMM.

    The model stored in ``filename`` is loaded and evaluated on the daily
    close differences of ``company`` and ``refcompany`` between ``dt1`` and
    ``dt2`` to find the most likely current hidden state.  A sampling model
    started in that state generates ``test_num`` paths; each path is laid out
    in a row of ``allpredictions`` and the per-day mean is saved under
    ``./sims3``.

    Returns a 3-tuple of columns of ``allpredictions`` (one value per
    simulated path) at indices ``days_future - 2``,
    ``(days_future - 2) // 4`` and ``(days_future - 2) // 36``.
    """
    days_future = 365

    model = joblib.load(filename)

    # Close prices of the target company.  Differencing shortens the series
    # by one, so close_v is shifted by one to stay aligned with diff.
    quotes = quotes_historical_yahoo_ochl(company, dt1, dt2)
    close_v = np.array([q[2] for q in quotes])
    diff = np.diff(close_v)
    close_v = close_v[1:]

    # Close-price differences of the reference company.
    quotes2 = quotes_historical_yahoo_ochl(refcompany, dt1, dt2)
    diff2 = np.diff(np.array([q[2] for q in quotes2]))

    # Front-pad the target series with zeros up to the reference length.
    # NOTE(review): only the target series is padded, so this assumes its
    # history is never longer than the reference company's -- confirm.
    delta = abs(diff2.shape[0] - diff.shape[0])
    diff0 = np.pad(diff, (delta, 0), mode='constant', constant_values=0)
    close_v = np.pad(close_v, (delta, 0), mode='constant', constant_values=0)

    X = np.column_stack([diff0, diff2])

    # Posterior over hidden states for the last observed day.
    hidden_probs = model.predict_proba(X)
    lstate_prob = hidden_probs[-1]

    days = int(days_future // total2active)  # 251 open market days in a year
    print(days, strftime("%Y-%m-%d %H:%M:%S", gmtime()))  #debugging purposes

    # With several states, start sampling in the most likely current one.
    if num_of_states > 1:
        startprob = np.zeros(num_of_states)
        startprob[lstate_prob.argmax()] = 1.0
    else:
        startprob = [1.]

    model_2_sample = GaussianHMM(n_components=num_of_states,
                                 covariance_type="full")
    model_2_sample.startprob_ = startprob
    model_2_sample.transmat_ = model.transmat_
    model_2_sample.means_ = model.means_
    model_2_sample.covars_ = model.covars_

    random.seed()

    allpredictions = np.zeros((test_num, yr))
    for test in range(test_num):
        # Draw a fresh path for this run with its own random seed.
        rseed = random.randrange(0, max_int_value)
        X, Z = model_2_sample.sample(days, random_state=rseed)

        final_price = close_v[-1]
        j = 0
        for i in range(days):
            # Apply the sampled change only if the price stays positive.
            if (final_price + X[i][0]) > 0:
                final_price += X[i][0]
            if j > 1 and i % 5 == 0:
                # Every fifth sampled day the price is written three times
                # (presumably to carry it across the two weekend days).
                allpredictions[test][j] = final_price
                allpredictions[test][j + 1] = final_price
                allpredictions[test][j + 2] = final_price
                j = j + 3
            else:
                allpredictions[test][j] = final_price
                j = j + 1

        # Hold the last price for the remaining columns of the row.
        while j < allpredictions.shape[1]:
            allpredictions[test][j] = final_price
            j = j + 1

    predictions_year = allpredictions.mean(axis=0)
    print("Avg. Prediction: ", predictions_year[-1])

    fname = "Year_of_predictions_" + str(company) + "_States_" + str(
        num_of_states) + "_adv.csv"
    fname = os.path.join('./sims3', fname)
    np.savetxt(fname, predictions_year, delimiter=",")

    # Floor division keeps the column indices integral; true division would
    # produce floats, which numpy rejects as indices.
    last_day = days_future - 2
    return (allpredictions[:, last_day],
            allpredictions[:, last_day // 4],
            allpredictions[:, last_day // 36])

def chromas_from_midi(midi):
    """Return one chroma per consecutive 16-element slice of ``midi``.

    Each slice ``midi[start:start + 16]`` is passed to ``chroma_from_slice``;
    the last slice may be shorter than 16 elements.  An empty ``midi`` gives
    an empty list.
    """
    slice_starts = range(0, len(midi), 16)
    return [chroma_from_slice(midi[start:start + 16]) for start in slice_starts]


#%%

#%%

# Start and transition probabilities come from a helper defined elsewhere.
start_probs, transition_matrix = get_hmm_parameters()

# 24-state HMM with full covariance matrices; every parameter is assigned by
# hand below rather than learnt.
markov_model = GaussianHMM(
    n_components=24,
    covariance_type="full",
    init_params="stmc",
)
markov_model.startprob_ = start_probs
markov_model.transmat_ = transition_matrix
markov_model.n_features = 12
markov_model.means_ = Chroma_Templates
markov_model.covars_ = covariance_matrix

# Decode the emissions as one single sequence covering all rows.
path = markov_model.predict(emissions, [len(emissions)])

# Translate each decoded state index into its chord.
chords = list(map(index_to_chord, path))
probable_chords = most_likely_from_midi(notes)
#%%
Beispiel #18
0
transmat = OrderedDict()
means = OrderedDict()
covars = OrderedDict()
model = OrderedDict()

# Convert the stored parameters of every appliance to numpy arrays.
for appliance, learnt in model_appliance.items():
    startprob[appliance] = np.array(learnt['startprob'])
    transmat[appliance] = np.array(learnt['transmat'])
    means[appliance] = np.array(learnt['means'])
    covars[appliance] = np.array(learnt['covars'])

# Build one full-covariance HMM per appliance from those parameters.
for appliance in model_appliance:
    hmm = GaussianHMM(n_components=state_appliances[appliance],
                      covariance_type="full")
    model[appliance] = hmm
    hmm.startprob_ = startprob[appliance]
    hmm.transmat_ = transmat[appliance]
    hmm.means_ = means[appliance]
    hmm.covars_ = covars[appliance]


# Rebuild every model from the parameters returned by
# sort_learnt_parameters.
new_model = OrderedDict()
for appliance in model:
    sorted_params = sort_learnt_parameters(
        startprob[appliance], means[appliance],
        covars[appliance], transmat[appliance])
    startprob_new, means_new, covars_new, transmat_new = sorted_params

    sorted_hmm = GaussianHMM(n_components=startprob_new.size,
                             covariance_type="full")
    new_model[appliance] = sorted_hmm
    sorted_hmm.startprob_ = startprob_new
    sorted_hmm.transmat_ = transmat_new
    sorted_hmm.means_ = means_new
    sorted_hmm.covars_ = covars_new
	
Beispiel #19
0
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)
from hmmlearn.hmm import GaussianHMM
import numpy as np

# Samples: ten observations with two features each.
X = np.array([[-1.03573482, -1.03573482], [6.62721065, 11.62721065],
              [3.19196949, 8.19196949], [0.38798214, 0.38798214],
              [2.56845104, 7.56845104], [5.03699793, 10.03699793],
              [5.87873937, 10.87873937], [4.27000819, -1.72999181],
              [4.02692237, -1.97307763], [5.7222677, 10.7222677]])

# Train a new model on the samples.
model = GaussianHMM(n_components=3, covariance_type="diag").fit(X)

# Create an unfitted model and copy the trained parameters into it.
new_model = GaussianHMM(n_components=3, covariance_type="diag")
new_model.startprob_ = model.startprob_
new_model.transmat_ = model.transmat_
new_model.means_ = model.means_
# NOTE(review): the private _covars_ attribute is copied here, presumably
# because the public covars_ getter returns a different shape for "diag"
# models than the setter accepts -- confirm against the installed hmmlearn
# version before switching to the public attribute.
new_model.covars_ = model._covars_

# Predict the hidden states of X with the copied model.
X_N = new_model.predict(X)

print(X_N)