Example #1
0
def test_2():
    """Compare E-step statistics of GaussianHMMCPUImpl with GaussianHMM.

    Random observations and model parameters are drawn, loaded into both
    implementations, and the sufficient statistics returned by
    ``chmm.do_estep()`` are checked against the ones accumulated by the
    reference model.

    Yields
    ------
    callable
        Zero-argument checks, one per statistic ('trans', 'post', 'obs',
        'obs**2'), each asserting agreement to 3 decimals.
    """
    n_features = 3
    length = 32

    for n_states in (4,):
        # Random inputs; the draw order matches the parameter order below.
        sequence = np.random.randn(length, n_features)
        means = np.random.randn(n_states, n_features)
        variances = np.random.rand(n_states, n_features)
        raw_transmat = np.random.rand(n_states, n_states)
        transmat = raw_transmat / np.sum(raw_transmat, axis=1)[:, None]
        raw_startprob = np.random.rand(n_states)
        startprob = raw_startprob / np.sum(raw_startprob)

        # Implementation under test, fed single-precision parameters.
        chmm = GaussianHMMCPUImpl(n_states, n_features)
        chmm._sequences = [sequence]

        pyhmm = GaussianHMM(n_components=n_states, init_params='',
                            params='', covariance_type='diag')

        chmm.means_ = means.astype(np.float32)
        chmm.vars_ = variances.astype(np.float32)
        chmm.transmat_ = transmat.astype(np.float32)
        chmm.startprob_ = startprob.astype(np.float32)
        clogprob, cstats = chmm.do_estep()

        # Reference model, fed the same parameters without the float32 cast.
        pyhmm.means_ = means
        pyhmm.covars_ = variances
        pyhmm.transmat_ = transmat
        pyhmm.startprob_ = startprob

        framelogprob = pyhmm._compute_log_likelihood(sequence)
        fwdlattice = pyhmm._do_forward_pass(framelogprob)[1]
        bwdlattice = pyhmm._do_backward_pass(framelogprob)
        gamma = fwdlattice + bwdlattice
        # Normalise each row of gamma in log space, then exponentiate.
        posteriors = np.exp(gamma.T - logsumexp(gamma, axis=1)).T
        stats = pyhmm._initialize_sufficient_statistics()
        pyhmm._accumulate_sufficient_statistics(
            stats, sequence, framelogprob, posteriors, fwdlattice,
            bwdlattice, 'stmc')

        # Bind the key as a default so every check keeps its own statistic.
        for key in ('trans', 'post', 'obs', 'obs**2'):
            yield lambda key=key: np.testing.assert_array_almost_equal(
                stats[key], cstats[key], decimal=3)
def get_trained_model(rootpath, condition, n_states, n_iterations, feature, cov_type):
    fname_mean = condition + '-cond-' + feature + '-feat-' + str(n_states) + '-states-' + str(n_iterations) + '-iter-mean.txt'
    fname_cov = condition + '-cond-' + feature + '-feat-' + str(n_states) + '-states-' + str(n_iterations) + '-iter-cov.txt'
    fname_tmat = condition + '-cond-' + feature + '-feat-' + str(n_states) + '-states-' + str(n_iterations) + '-iter-transtion.txt'

    constructed_path_mean = rootpath + condition + '/' + fname_mean
    mean = np.loadtxt(constructed_path_mean)
    iter_list = range(n_states)
    iter_list.reverse()
    deleted_means = []
    for i in iter_list:
        if mean[i][mean[i] > 0.01].shape[0] == 0:
            print 'skipping deleting ith mean:', i, mean[i]
            #mean = np.delete(mean, i, 0)
            #deleted_means.append(i)

    constructed_path_cov = rootpath + condition + '/' + fname_cov
    if cov_type == 'full':
        cov = load_full(constructed_path_cov, n_states, 10)
    else:
        cov = np.loadtxt(constructed_path_cov)
    constructed_path_tmat = rootpath + condition + '/' + fname_tmat
    tmat = np.loadtxt(constructed_path_tmat)
    #fixing tmat if any of the means and covs were deleted
    deleted_means.sort()
    deleted_means.reverse()
    for di in deleted_means:
        tmat = np.delete(tmat, di, 1)
        tmat = np.delete(tmat, di, 0)

    smat = np.zeros(tmat.shape[0])
    smat[0] = 1.0
    sum_fix = np.sum(tmat, axis=1)
    sum_fix = 1.0 / sum_fix
    #print tmat
    for i in range(tmat.shape[0]):
        tmat[i] = tmat[i] * sum_fix[i]
        #print 'corrected\n', tmat
    if n_states != tmat.shape[0]:
        print 'removed some states, n_states now corrected to: ', tmat.shape[0], 'was originaly', n_states
        n_states = tmat.shape[0]
    model = GaussianHMM(n_components=n_states, covariance_type=cov_type, startprob=smat, transmat=tmat, n_iter=0, init_params='mc')
    model.means_ = mean
    model.covars_ = cov
    return model
def test_2():
    """Check GaussianHMMCPUImpl E-step statistics against GaussianHMM.

    Both models receive the same randomly drawn parameters (cast to float32
    for the CPU implementation). The generator then yields one
    zero-argument check per sufficient statistic, each asserting agreement
    to 3 decimals.
    """
    n_features, length = 3, 32
    assert_close = np.testing.assert_array_almost_equal

    for n_states in [4]:
        # Random observation sequence and model parameters.
        obs = np.random.randn(length, n_features)
        means = np.random.randn(n_states, n_features)
        variances = np.random.rand(n_states, n_features)
        transmat = np.random.rand(n_states, n_states)
        transmat = transmat / transmat.sum(axis=1)[:, None]
        startprob = np.random.rand(n_states)
        startprob = startprob / startprob.sum()

        chmm = GaussianHMMCPUImpl(n_states, n_features)
        chmm._sequences = [obs]

        pyhmm = GaussianHMM(
            n_components=n_states,
            init_params='',
            params='',
            covariance_type='diag')

        # Implementation under test.
        chmm.means_ = means.astype(np.float32)
        chmm.vars_ = variances.astype(np.float32)
        chmm.transmat_ = transmat.astype(np.float32)
        chmm.startprob_ = startprob.astype(np.float32)
        clogprob, cstats = chmm.do_estep()

        # Reference implementation.
        pyhmm.means_ = means
        pyhmm.covars_ = variances
        pyhmm.transmat_ = transmat
        pyhmm.startprob_ = startprob

        framelogprob = pyhmm._compute_log_likelihood(obs)
        fwdlattice = pyhmm._do_forward_pass(framelogprob)[1]
        bwdlattice = pyhmm._do_backward_pass(framelogprob)
        gamma = fwdlattice + bwdlattice
        # Row-wise normalisation of gamma carried out in log space.
        log_norm = logsumexp(gamma, axis=1)
        posteriors = np.exp(gamma.T - log_norm).T
        stats = pyhmm._initialize_sufficient_statistics()
        pyhmm._accumulate_sufficient_statistics(stats, obs, framelogprob,
                                                posteriors, fwdlattice,
                                                bwdlattice, 'stmc')

        yield lambda: assert_close(
            stats['trans'], cstats['trans'], decimal=3)
        yield lambda: assert_close(
            stats['post'], cstats['post'], decimal=3)
        yield lambda: assert_close(
            stats['obs'], cstats['obs'], decimal=3)
        yield lambda: assert_close(
            stats['obs**2'], cstats['obs**2'], decimal=3)
Example #4
0
    def predict(self, obs):
        """Find most likely state sequence corresponding to `obs`.

        The start distribution handed to the underlying GaussianHMM is the
        stationary distribution of ``self.transmat_``, i.e. its left
        eigenvector with eigenvalue 1, normalised to sum to one.

        Parameters
        ----------
        obs : np.ndarray, shape=(n_samples, n_features)
            Sequence of n_features-dimensional data points. Each row
            corresponds to a single point in the sequence.

        Returns
        -------
        hidden_states : np.ndarray, shape=(n_samples,)
            Index of the most likely state for each observation
        """
        eigenvalues, vl = scipy.linalg.eig(self.transmat_, left=True,
                                           right=False)
        # eig() returns its eigenvalues in no particular order, so select
        # the left eigenvector whose eigenvalue is closest to 1 instead of
        # assuming it sits in column 0.
        stationary_index = np.argmin(np.abs(eigenvalues - 1.0))
        # eig() yields complex arrays; keep the real part so the start
        # probabilities are real-valued.
        stationary = np.real(vl[:, stationary_index])
        startprob = stationary / np.sum(stationary)

        model = GaussianHMM(n_components=self.n_states, covariance_type='full')
        model.startprob_ = startprob
        model.transmat_ = self.transmat_
        model.means_ = self.means_
        model.covars_ = self.covars_
        return model.predict(obs)
Example #5
0
    def predict(self, obs):
        """Find most likely state sequence corresponding to `obs`.

        A GaussianHMM is assembled from this object's transition matrix,
        means and covariances. Its start distribution is the stationary
        distribution of ``self.transmat_`` (the left eigenvector belonging
        to eigenvalue 1, rescaled to sum to one).

        Parameters
        ----------
        obs : np.ndarray, shape=(n_samples, n_features)
            Sequence of n_features-dimensional data points. Each row
            corresponds to a single point in the sequence.

        Returns
        -------
        hidden_states : np.ndarray, shape=(n_samples,)
            Index of the most likely state for each observation
        """
        w, vl = scipy.linalg.eig(self.transmat_, left=True, right=False)
        # The eigenvalues come back unordered, so locate the one nearest 1
        # rather than trusting the first column to be the stationary vector.
        unit_column = np.argmin(np.abs(w - 1.0))
        # Drop the (numerically zero) imaginary part of the complex output.
        stationary = np.real(vl[:, unit_column])
        startprob = stationary / np.sum(stationary)

        model = GaussianHMM(n_components=self.n_states, covariance_type='full')
        model.startprob_ = startprob
        model.transmat_ = self.transmat_
        model.means_ = self.means_
        model.covars_ = self.covars_
        return model.predict(obs)
def test_2():
    """Compare every E-step stage of GaussianHMMCUDAImpl with GaussianHMM.

    With a fixed random seed, the same parameters are loaded into both
    models. After ``cuhmm.do_estep()`` the frame log-probabilities, forward
    and backward lattices, posteriors and sufficient statistics are each
    compared to the reference values.

    Yields
    ------
    callable
        Zero-argument checks asserting agreement to 3 decimals.
    """
    np.random.seed(42)
    n_features, length = 32, 20
    assert_close = np.testing.assert_array_almost_equal

    # State counts tried previously: 3, 4, 5, 7, 8, 9, 15, 16, 17, 31, 32
    for n_states in [8]:
        obs = np.random.randn(length, n_features)
        means = np.random.randn(n_states, n_features)
        variances = np.random.rand(n_states, n_features)
        transmat = np.random.rand(n_states, n_states)
        transmat = transmat / transmat.sum(axis=1)[:, None]
        startprob = np.random.rand(n_states)
        startprob = startprob / startprob.sum()

        cuhmm = GaussianHMMCUDAImpl(n_states, n_features)
        cuhmm._sequences = [obs]

        pyhmm = GaussianHMM(n_components=n_states, init_params='',
                            params='', covariance_type='diag')

        # Implementation under test.
        cuhmm.means_ = means
        cuhmm.vars_ = variances
        cuhmm.transmat_ = transmat
        cuhmm.startprob_ = startprob
        logprob, custats = cuhmm.do_estep()

        # Reference implementation.
        pyhmm.means_ = means
        pyhmm.covars_ = variances
        pyhmm.transmat_ = transmat
        pyhmm.startprob_ = startprob
        pyhmm._initialize_sufficient_statistics()

        # Stage 1: per-frame log-likelihoods.
        framelogprob = pyhmm._compute_log_likelihood(obs)
        cuframelogprob = cuhmm._get_framelogprob()
        yield lambda: assert_close(framelogprob, cuframelogprob, decimal=3)

        # Stage 2: forward lattice.
        fwdlattice = pyhmm._do_forward_pass(framelogprob)[1]
        cufwdlattice = cuhmm._get_fwdlattice()
        yield lambda: assert_close(fwdlattice, cufwdlattice, decimal=3)

        # Stage 3: backward lattice.
        bwdlattice = pyhmm._do_backward_pass(framelogprob)
        cubwdlattice = cuhmm._get_bwdlattice()
        yield lambda: assert_close(bwdlattice, cubwdlattice, decimal=3)

        # Stage 4: posteriors, normalised row-wise in log space.
        gamma = fwdlattice + bwdlattice
        posteriors = np.exp(gamma.T - logsumexp(gamma, axis=1)).T
        cuposteriors = cuhmm._get_posteriors()
        yield lambda: assert_close(posteriors, cuposteriors, decimal=3)

        # Stage 5: accumulated sufficient statistics.
        stats = pyhmm._initialize_sufficient_statistics()
        pyhmm._accumulate_sufficient_statistics(stats, obs, framelogprob,
                                                posteriors, fwdlattice,
                                                bwdlattice, 'stmc')

        # Debug output: transition counts recomputed from the CUDA lattices
        # next to the counts the CUDA implementation reports.
        print('ref transcounts')
        print(transitioncounts(cufwdlattice, cubwdlattice, cuframelogprob,
                               np.log(transmat)))
        print('cutranscounts')
        print(custats['trans'])

        yield lambda: assert_close(
            stats['trans'], custats['trans'], decimal=3)
        yield lambda: assert_close(
            stats['post'], custats['post'], decimal=3)
        yield lambda: assert_close(
            stats['obs'], custats['obs'], decimal=3)
        yield lambda: assert_close(
            stats['obs**2'], custats['obs**2'], decimal=3)
 list_of_patient_feats, start_stop_idx, list_of_patient_file_paths = string_patient_feats(train_map, condition, overlap, window)
 #sirs_feats_stacked = stack_patient_feats(list_of_sirs_patients)
 feats_as_list = list_patient_feats(list_of_patient_feats)
 #print np.shape(sirs_feats_stacked)
 means, covs = get_initial_states(pre_states, condition, feature, end=False, start=False, cov_type=cov_type)
 print means
 print covs
 if cov_type == 'full':
     for i in range(n_states):
         print 'checking if initial covs are pos-definite'
         np.linalg.cholesky(covs[i])
         print np.linalg.eigvals(covs[i])
 tmat, smat = get_tmat_and_smat(pre_states, end=False, start=False)
 print tmat, smat
 model = GaussianHMM(n_components=n_states, n_iter=n_iter, covariance_type=cov_type, startprob=smat, transmat=tmat, init_params='mc')
 model.means_ = means
 model.covars_ = covs
 sum_inital_ll = 0.0
 sum_initial_score = 0.0
 sum_initial_map = 0.0
 remove_idx = []
 for idx, feat_from_list in enumerate(feats_as_list):
     if np.shape(feat_from_list)[0] > n_states:
         initial_ll, initial_best_seq = model.decode(feat_from_list)
         initial_map, initial_best_sep_map = model.decode(feat_from_list, algorithm='map')
         sum_initial_score += model.score(feat_from_list)
         sum_inital_ll += initial_ll
         sum_initial_map += initial_map
     else:
         remove_idx.append(idx)
         print 'too few samples in file', list_of_patient_file_paths[idx], np.shape(feat_from_list)
Example #8
0
File: HMM.py Project: ranulfo0s/HMM
d = 0.050

# Layout of the 5x5 transition matrix built below, row by row:
#   row0 = [a, c, d, c, d]
#   row1 = [e, a, b, e, e]
#   row2 = [c, d, a, c, d]
#   row3 = [d, c, c, a, d]
#   row4 = [d, c, d, c, a]
# (a, b, c and e are defined before this fragment.)

transitions_prob = np.mat([[a, c, d, c, d], [e, a, b, e, e], [c, d, a, c, d], [d, c, c, a, d], [d, c, d, c, a]])


# Five-state Gaussian HMM with diagonal covariances and the transition
# matrix above.
HMM = GaussianHMM(n_components=5, covariance_type="diag", transmat=transitions_prob)


#
# Always fit on the observation data before changing means_ and covars_:
# the values assigned below replace the fitted ones.
#
HMM.fit([Resul])

# Means: the 5x5 identity matrix.
HMM.means_ = np.identity(5)

# Covariances: 0.2 in every entry of a 5x5 array.
HMM.covars_ = 0.2 * np.ones((5, 5))

# Predict the states for Resul (per the original note, Resul holds the LR
# probabilities -- it is produced before this fragment).
HResul = HMM.predict(Resul)

# Score the HMM predictions against target using comp(), which is defined
# elsewhere.
Hscore = comp(HResul, target)

# print HResul

# Report the HMM score.
print "HMM = "
print Hscore

Example #9
0
def main():
    """
    Main function that performs footprint analysis.

    Parses the command line, reads the experimental matrix (one region set,
    one DNase signal named 'DNASE' and the histone signals), loads one HMM
    per HMM file, applies the HMM to every extended region for every
    histone signal and collects the positions labelled with the footprint
    state. The merged footprints, intersected with the original regions,
    are written as a bed file and optionally converted to bigbed.

    Problems are reported through ErrorHandler.throw_error and
    ErrorHandler.throw_warning.

    Keyword arguments: None
        
    Return: None
    """
    # Local import: only this function needs it for building output paths.
    from os.path import join as path_join

    ###################################################################################################
    # Processing Input Arguments
    ###################################################################################################

    # Initializing ErrorHandler
    error_handler = ErrorHandler()

    # Parameters
    current_version = "0.0.1"
    usage_message = (
        "\n--------------------------------------------------\n"
        "The 'hint' program predicts TFBSs given open chromatin data.\n"
        "In order to use this tools, please type: \n\n"
        "%prog [options] <experiment_matrix>\n\n"
        "The <experiment matrix> should contain:\n"
        "- One region file representing the regions in which the HMM\n"
        "  will be applied. It should contain 'regions' in the type field\n"
        "- One DNase aligned reads file (bam) file with 'DNASE' in the name field.\n"
        "- One to Three histone modification aligned reads file (bam).\n\n"
        "For more information, please refer to:\n"
        "http://www.regulatory-genomics.org/dnasefootprints/\n"
        "--------------------------------------------------")
    version_message = "HINT - Regulatory Analysis Toolbox (RGT). Version: " + str(
        current_version)

    # Initializing Option Parser
    parser = PassThroughOptionParser(usage=usage_message,
                                     version=version_message)

    # Optional Input Options
    # Each fragment of the help text ends with a space so that the
    # concatenated sentence does not run words together.
    parser.add_option(
        "--hmm-file",
        dest="hmm_file",
        type="string",
        metavar="FILE_1[,FILE_2,...,FILE_N]",
        default=None,
        help=
        ("List of HMM files separated by comma. If one file only, then this HMM will be "
         "applied for all histone signals, otherwise, the list must have the same number "
         "of histone files given. The order of the list should be the order of the "
         "histones in the input_matrix file. If the argument is not given, then an HMM "
         "trained with H3K4me3 in K562 will be used."))

    # Parameters Options
    parser.add_option(
        "--organism",
        dest="organism",
        type="string",
        metavar="STRING",
        default="hg19",
        help=
        ("Organism considered on the analysis. Check our full documentation for all available "
         "options. All default files such as genomes will be based on the chosen organism "
         "and the data.config file. This option is used only if a bigbed output is asked."
         ))

    # Output Options
    parser.add_option("--output-location",
                      dest="output_location",
                      type="string",
                      metavar="PATH",
                      default=getcwd(),
                      help=("Path where the output files will be written."))
    parser.add_option("--footprint-name",
                      dest="footprint_name",
                      type="string",
                      metavar="STRING",
                      default="footprints",
                      help=("Name of the footprint file (without extension)."))
    parser.add_option(
        "--print-bb",
        dest="print_bb",
        action="store_true",
        default=False,
        help=("If used, the output will be a bigbed (.bb) file."))

    # Processing Options
    # Exactly one positional argument (the experimental matrix) is accepted.
    options, arguments = parser.parse_args()
    if (not arguments or len(arguments) > 1):
        error_handler.throw_error("FP_WRONG_ARGUMENT")

    # Fixed Parameters ################
    region_total_ext = 10000
    fp_state_nb = 7
    fp_limit_size = 50
    ###
    dnase_initial_clip = 1000
    dnase_sg_window_size = 9
    dnase_norm_per = 98
    dnase_slope_per = 98
    dnase_frag_ext = 1
    ###
    histone_initial_clip = 1000
    histone_sg_window_size = 201
    histone_norm_per = 98
    histone_slope_per = 98
    histone_frag_ext = 200
    ###################################

    ###################################################################################################
    # Reading Input Matrix
    ###################################################################################################

    # Reading input argument
    input_matrix = arguments[0]

    # Create experimental matrix
    try:
        exp_matrix = ExperimentalMatrix()
        exp_matrix.read(input_matrix)
    except Exception:
        error_handler.throw_error("FP_WRONG_EXPMAT")

    ###################################################################################################
    # Reading Regions
    ###################################################################################################

    # Fetching region file
    # No region set is an error; more than one only warns and the first
    # one is used.
    region_set_list = exp_matrix.get_regionsets()
    if (len(region_set_list) == 0): error_handler.throw_error("FP_ONE_REGION")
    elif (len(region_set_list) > 1):
        error_handler.throw_warning("FP_ONE_REGION")
    regions = region_set_list[0]

    # Extending + Sorting + Merging / keeping an original copy
    original_regions = deepcopy(regions)
    regions.extend(int(region_total_ext / 2),
                   int(region_total_ext / 2))  # Extending
    regions.merge()  # Sort & Merge

    ###################################################################################################
    # Reading Signals
    ###################################################################################################

    # Initialization
    name_list = exp_matrix.names
    type_list = exp_matrix.types
    file_dict = exp_matrix.files
    dnase_label = "DNASE"

    # Fetching signal files
    dnase_file = None
    histone_file_list = []
    for i in range(0, len(name_list)):
        if (type_list[i] == "regions"): continue
        if (name_list[i].upper() == dnase_label):  # DNase signal
            if (not dnase_file):
                dnase_file = BamFile(file_dict[name_list[i]])
                dnase_file.load_sg_coefs(dnase_sg_window_size)
            else:
                # Only the first DNase entry is used; extra ones just warn.
                error_handler.throw_warning("FP_MANY_DNASE")
        else:  # Histone signal
            histone_file = BamFile(file_dict[name_list[i]])
            histone_file.load_sg_coefs(histone_sg_window_size)
            histone_file_list.append(histone_file)

    # Handling errors
    if (not dnase_file): error_handler.throw_error("FP_NO_DNASE")
    if (len(histone_file_list) == 0):
        error_handler.throw_error("FP_NO_HISTONE")
    elif (len(histone_file_list) > 3):
        error_handler.throw_warning("FP_MANY_HISTONE")

    ###################################################################################################
    # Creating HMM list
    ###################################################################################################

    # Fetching HMM input
    flag_multiple_hmms = False
    if (options.hmm_file):  # Argument is passed

        # Fetching list of HMM files
        hmm_file_list = options.hmm_file.split(",")

        # Verifying HMM application mode (one HMM or multiple HMM files)
        # The count is compared with the histone files collected above.
        if (len(hmm_file_list) == 1):
            flag_multiple_hmms = False  # One HMM file only
        elif (len(hmm_file_list) == len(histone_file_list)):
            flag_multiple_hmms = True  # One HMM file for each histone
        else:
            error_handler.throw_error("FP_NB_HMMS")

    else:  # Argument was not passed
        flag_multiple_hmms = False
        hmm_data = HmmData()
        hmm_file_list = [hmm_data.get_default_hmm()]

    # Creating scikit HMM list
    hmm_list = []
    for hmm_file_name in hmm_file_list:

        try:
            hmm_scaffold = HMM()
            hmm_scaffold.load_hmm(hmm_file_name)
            scikit_hmm = GaussianHMM(n_components=hmm_scaffold.states,
                                     covariance_type="full",
                                     transmat=array(hmm_scaffold.A),
                                     startprob=array(hmm_scaffold.pi))
            scikit_hmm.means_ = array(hmm_scaffold.means)
            scikit_hmm.covars_ = array(hmm_scaffold.covs)
        except Exception:
            error_handler.throw_error("FP_HMM_FILES")
        hmm_list.append(scikit_hmm)

    ###################################################################################################
    # Main Pipeline
    ###################################################################################################

    # Initializing result set
    footprints = GenomicRegionSet("footprints")

    # Iterating over regions
    for r in regions.sequences:

        # Fetching DNase signal
        try:
            dnase_norm, dnase_slope = dnase_file.get_signal(
                r.chrom, r.initial, r.final, dnase_frag_ext,
                dnase_initial_clip, dnase_norm_per, dnase_slope_per)
        except Exception:
            error_handler.throw_warning(
                "FP_DNASE_PROC",
                add_msg="for region (" +
                ",".join([r.chrom, str(r.initial),
                          str(r.final)]) +
                "). This iteration will be skipped.")
            continue

        # Iterating over histone modifications
        for i in range(0, len(histone_file_list)):

            # Fetching histone signal
            try:
                histone_file = histone_file_list[i]
                histone_norm, histone_slope = histone_file.get_signal(
                    r.chrom, r.initial, r.final, histone_frag_ext,
                    histone_initial_clip, histone_norm_per, histone_slope_per)
            except Exception:
                error_handler.throw_warning(
                    "FP_HISTONE_PROC",
                    add_msg="for region (" +
                    ",".join([r.chrom, str(r.initial),
                              str(r.final)]) + ") and histone modification " +
                    histone_file.file_name +
                    ". This iteration will be skipped for this histone.")
                continue

            # Formatting sequence
            # One row per position, four columns: DNase norm/slope followed
            # by histone norm/slope.
            try:
                input_sequence = array(
                    [dnase_norm, dnase_slope, histone_norm, histone_slope]).T
            except Exception:
                error_handler.throw_warning(
                    "FP_SEQ_FORMAT",
                    add_msg="for region (" +
                    ",".join([r.chrom, str(r.initial),
                              str(r.final)]) + ") and histone modification " +
                    histone_file.file_name +
                    ". This iteration will be skipped.")
                continue

            # Applying HMM
            if (flag_multiple_hmms): current_hmm = hmm_list[i]
            else: current_hmm = hmm_list[0]
            try:
                posterior_list = current_hmm.predict(input_sequence)
            except Exception:
                error_handler.throw_warning(
                    "FP_HMM_APPLIC",
                    add_msg="in region (" +
                    ",".join([r.chrom, str(r.initial),
                              str(r.final)]) + ") and histone modification " +
                    histone_file.file_name +
                    ". This iteration will be skipped.")
                continue

            # Writing results
            # Scan for runs of the footprint state; a run that ends inside
            # the region is kept only if shorter than fp_limit_size.
            start_pos = 0
            flag_start = False
            for k in range(r.initial, r.final):
                curr_index = k - r.initial
                if (flag_start):
                    if (posterior_list[curr_index] != fp_state_nb):
                        if (k - start_pos < fp_limit_size):
                            fp = GenomicRegion(r.chrom, start_pos, k)
                            footprints.add(fp)
                        flag_start = False
                else:
                    if (posterior_list[curr_index] == fp_state_nb):
                        flag_start = True
                        start_pos = k
            # A run still open at the region end is closed at r.final.
            if (flag_start):
                fp = GenomicRegion(r.chrom, start_pos, r.final)
                footprints.add(fp)

    # Sorting and Merging
    footprints.merge()

    # Overlapping results with original regions
    footprints = footprints.intersect(original_regions,
                                      mode=OverlapType.ORIGINAL)

    ###################################################################################################
    # Writing output
    ###################################################################################################

    # Creating output file
    # Join with a path separator: the default output location (getcwd())
    # has no trailing slash, so plain concatenation would glue the file
    # name onto the directory name.
    output_file_name = path_join(options.output_location,
                                 options.footprint_name + ".bed")
    footprints.write_bed(output_file_name)

    # Verifying condition to write bb
    if (options.print_bb):

        # Fetching file with chromosome sizes
        genome_data = GenomeData(options.organism)
        chrom_sizes_file = genome_data.get_chromosome_sizes()

        # Converting to big bed
        output_bb_name = path_join(options.output_location,
                                   options.footprint_name + ".bb")
        try:
            system(" ".join([
                "bedToBigBed", output_file_name, chrom_sizes_file,
                output_bb_name
            ]))
            #remove(output_file_name)
        except Exception:
            error_handler.throw_error("FP_BB_CREATION")
Example #10
0
def main():
    """
    Main function that performs footprint analysis.

    Parses the command line, reads the experimental matrix given as the single
    positional argument, fetches the DNase and histone signals for every
    (extended and merged) input region, applies an HMM to each
    DNase + histone signal combination and writes the regions labelled with
    the footprint state to a BED file. When --print-bb is given the BED file
    is additionally converted with the external 'bedToBigBed' program.

    Keyword arguments: None

    Return: None
    """

    # Function-scope imports: only this function needs them.
    from os.path import join as join_path
    from subprocess import call

    ###################################################################################################
    # Processing Input Arguments
    ###################################################################################################

    # Initializing ErrorHandler
    error_handler = ErrorHandler()

    # Parameters
    current_version = "0.0.1"
    usage_message = ("\n--------------------------------------------------\n"
                     "The 'hint' program predicts TFBSs given open chromatin data.\n"
                     "In order to use this tools, please type: \n\n"
                     "%prog [options] <experiment_matrix>\n\n"
                     "The <experiment matrix> should contain:\n"
                     "- One region file representing the regions in which the HMM\n"
                     "  will be applied. It should contain 'regions' in the type field\n"
                     "- One DNase aligned reads file (bam) file with 'DNASE' in the name field.\n"
                     "- One to Three histone modification aligned reads file (bam).\n\n"

                     "For more information, please refer to:\n"
                     "http://www.regulatory-genomics.org/dnasefootprints/\n"
                     "--------------------------------------------------")
    version_message = "HINT - Regulatory Analysis Toolbox (RGT). Version: " + str(current_version)

    # Initializing Option Parser
    parser = PassThroughOptionParser(usage=usage_message, version=version_message)

    # Optional Input Options
    # (each help fragment ends with a space so the concatenated words do not run together)
    parser.add_option("--hmm-file", dest="hmm_file", type="string",
                      metavar="FILE_1[,FILE_2,...,FILE_N]", default=None,
                      help=("List of HMM files separated by comma. If one file only, then this HMM will be "
                            "applied for all histone signals, otherwise, the list must have the same number "
                            "of histone files given. The order of the list should be the order of the "
                            "histones in the input_matrix file. If the argument is not given, then an HMM "
                            "trained with H3K4me3 in K562 will be used."))

    # Parameters Options
    parser.add_option("--organism", dest="organism", type="string", metavar="STRING", default="hg19",
                      help=("Organism considered on the analysis. Check our full documentation for all available "
                            "options. All default files such as genomes will be based on the chosen organism "
                            "and the data.config file. This option is used only if a bigbed output is asked."))

    # Output Options
    parser.add_option("--output-location", dest="output_location", type="string", metavar="PATH",
                      default=getcwd(),
                      help=("Path where the output files will be written."))
    parser.add_option("--footprint-name", dest="footprint_name", type="string", metavar="STRING",
                      default="footprints",
                      help=("Name of the footprint file (without extension)."))
    parser.add_option("--print-bb", dest="print_bb", action="store_true", default=False,
                      help=("If used, the output will be a bigbed (.bb) file."))

    # Processing Options: exactly one positional argument (the experimental matrix) is accepted
    options, arguments = parser.parse_args()
    if len(arguments) != 1:
        error_handler.throw_error("FP_WRONG_ARGUMENT")

    # Fixed Parameters ################
    region_total_ext = 10000
    fp_state_nb = 7
    fp_limit_size = 50
    ###
    dnase_initial_clip = 1000
    dnase_sg_window_size = 9
    dnase_norm_per = 98
    dnase_slope_per = 98
    dnase_frag_ext = 1
    ###
    histone_initial_clip = 1000
    histone_sg_window_size = 201
    histone_norm_per = 98
    histone_slope_per = 98
    histone_frag_ext = 200
    ###################################

    ###################################################################################################
    # Reading Input Matrix
    ###################################################################################################

    # Reading input argument
    input_matrix = arguments[0]

    # Create experimental matrix
    try:
        exp_matrix = ExperimentalMatrix()
        exp_matrix.read(input_matrix)
    except Exception:
        error_handler.throw_error("FP_WRONG_EXPMAT")

    ###################################################################################################
    # Reading Regions
    ###################################################################################################

    # Fetching region file (only the first region set is used; extra sets trigger a warning)
    region_set_list = exp_matrix.get_regionsets()
    if len(region_set_list) == 0:
        error_handler.throw_error("FP_ONE_REGION")
    elif len(region_set_list) > 1:
        error_handler.throw_warning("FP_ONE_REGION")
    # NOTE(review): this relies on throw_error not returning when the list is empty - confirm
    regions = region_set_list[0]

    # Extending + Sorting + Merging / keeping an original copy
    original_regions = deepcopy(regions)
    half_ext = region_total_ext // 2
    regions.extend(half_ext, half_ext)  # Extending
    regions.merge()  # Sort & Merge

    ###################################################################################################
    # Reading Signals
    ###################################################################################################

    # Initialization
    name_list = exp_matrix.names
    type_list = exp_matrix.types
    file_dict = exp_matrix.files
    dnase_label = "DNASE"

    # Fetching signal files
    dnase_file = None
    histone_file_list = []
    for i in range(len(name_list)):
        if type_list[i] == "regions":
            continue
        signal_name = name_list[i]
        if signal_name.upper() == dnase_label:  # DNase signal
            if not dnase_file:
                dnase_file = BamFile(file_dict[signal_name])
                dnase_file.load_sg_coefs(dnase_sg_window_size)
            else:
                # Only the first DNase entry is used
                error_handler.throw_warning("FP_MANY_DNASE")
        else:  # Histone signal
            histone_file = BamFile(file_dict[signal_name])
            histone_file.load_sg_coefs(histone_sg_window_size)
            histone_file_list.append(histone_file)

    # Handling errors
    if not dnase_file:
        error_handler.throw_error("FP_NO_DNASE")
    if len(histone_file_list) == 0:
        error_handler.throw_error("FP_NO_HISTONE")
    elif len(histone_file_list) > 3:
        error_handler.throw_warning("FP_MANY_HISTONE")

    ###################################################################################################
    # Creating HMM list
    ###################################################################################################

    # Fetching HMM input
    flag_multiple_hmms = False
    if options.hmm_file:  # Argument is passed

        # Fetching list of HMM files
        hmm_file_list = options.hmm_file.split(",")

        # Verifying HMM application mode (one HMM or multiple HMM files)
        if len(hmm_file_list) == 1:
            flag_multiple_hmms = False  # One HMM file only
        elif len(hmm_file_list) == len(histone_file_list):
            # One HMM file for each histone. The count is compared against the
            # histone files collected above (the previous code referenced an
            # undefined name here and raised NameError).
            flag_multiple_hmms = True
        else:
            error_handler.throw_error("FP_NB_HMMS")

    else:  # Argument was not passed
        flag_multiple_hmms = False
        hmm_data = HmmData()
        hmm_file_list = [hmm_data.get_default_hmm()]

    # Creating scikit HMM list
    hmm_list = []
    for hmm_file_name in hmm_file_list:

        try:
            hmm_scaffold = HMM()
            hmm_scaffold.load_hmm(hmm_file_name)
            scikit_hmm = GaussianHMM(n_components=hmm_scaffold.states, covariance_type="full",
                                     transmat=array(hmm_scaffold.A), startprob=array(hmm_scaffold.pi))
            scikit_hmm.means_ = array(hmm_scaffold.means)
            scikit_hmm.covars_ = array(hmm_scaffold.covs)
        except Exception:
            error_handler.throw_error("FP_HMM_FILES")
        hmm_list.append(scikit_hmm)

    ###################################################################################################
    # Main Pipeline
    ###################################################################################################

    def region_label(region):
        # Text used to identify a region inside warning messages
        return ",".join([region.chrom, str(region.initial), str(region.final)])

    # Initializing result set
    footprints = GenomicRegionSet("footprints")

    # Iterating over regions
    for r in regions.sequences:

        # Fetching DNase signal
        try:
            dnase_norm, dnase_slope = dnase_file.get_signal(
                r.chrom, r.initial, r.final,
                dnase_frag_ext, dnase_initial_clip, dnase_norm_per, dnase_slope_per)
        except Exception:
            error_handler.throw_warning(
                "FP_DNASE_PROC",
                add_msg="for region (" + region_label(r) + "). This iteration will be skipped.")
            continue

        # Iterating over histone modifications
        for i, histone_file in enumerate(histone_file_list):

            # Fetching histone signal
            try:
                histone_norm, histone_slope = histone_file.get_signal(
                    r.chrom, r.initial, r.final,
                    histone_frag_ext, histone_initial_clip, histone_norm_per, histone_slope_per)
            except Exception:
                error_handler.throw_warning(
                    "FP_HISTONE_PROC",
                    add_msg="for region (" + region_label(r) + ") and histone modification "
                            + histone_file.file_name + ". This iteration will be skipped for this histone.")
                continue

            # Formatting sequence: one row per position, one column per signal
            try:
                input_sequence = array([dnase_norm, dnase_slope, histone_norm, histone_slope]).T
            except Exception:
                error_handler.throw_warning(
                    "FP_SEQ_FORMAT",
                    add_msg="for region (" + region_label(r) + ") and histone modification "
                            + histone_file.file_name + ". This iteration will be skipped.")
                continue

            # Applying HMM (the i-th HMM when one was given per histone, otherwise the single HMM)
            current_hmm = hmm_list[i] if flag_multiple_hmms else hmm_list[0]
            try:
                posterior_list = current_hmm.predict(input_sequence)
            except Exception:
                error_handler.throw_warning(
                    "FP_HMM_APPLIC",
                    add_msg="in region (" + region_label(r) + ") and histone modification "
                            + histone_file.file_name + ". This iteration will be skipped.")
                continue

            # Writing results: every maximal run of the footprint state becomes a footprint
            start_pos = 0
            flag_start = False
            for k in range(r.initial, r.final):
                in_footprint = (posterior_list[k - r.initial] == fp_state_nb)
                if flag_start:
                    if not in_footprint:
                        # The run ended at k; runs of fp_limit_size or more positions are dropped
                        if k - start_pos < fp_limit_size:
                            footprints.add(GenomicRegion(r.chrom, start_pos, k))
                        flag_start = False
                elif in_footprint:
                    flag_start = True
                    start_pos = k
            if flag_start:
                # A run still open at the region end is added without the size filter
                footprints.add(GenomicRegion(r.chrom, start_pos, r.final))

    # Sorting and Merging
    footprints.merge()

    # Overlapping results with original regions
    footprints = footprints.intersect(original_regions, mode=OverlapType.ORIGINAL)

    ###################################################################################################
    # Writing output
    ###################################################################################################

    # Creating output file. The path is joined (not concatenated) so that an
    # output location without a trailing separator, such as the getcwd()
    # default, still places the file inside that directory.
    output_file_name = join_path(options.output_location, options.footprint_name + ".bed")
    footprints.write_bed(output_file_name)

    # Verifying condition to write bb
    if options.print_bb:

        # Fetching file with chromosome sizes
        genome_data = GenomeData(options.organism)
        chrom_sizes_file = genome_data.get_chromosome_sizes()

        # Converting to big bed. The command is run without a shell and its
        # exit status is checked, so a failed conversion is reported instead of
        # being silently ignored.
        output_bb_name = join_path(options.output_location, options.footprint_name + ".bb")
        try:
            exit_status = call(["bedToBigBed", output_file_name, chrom_sizes_file, output_bb_name])
            #remove(output_file_name)
        except Exception:
            # e.g. the bedToBigBed executable could not be started
            exit_status = -1
        if exit_status != 0:
            error_handler.throw_error("FP_BB_CREATION")
Example #11
0
def test_2():
    """
    Generator test comparing the E-step of GaussianHMMCUDAImpl with GaussianHMM.

    Both models receive the same random diagonal-covariance parameters and the
    same single random sequence. The frame log-probabilities, forward and
    backward lattices, posteriors and accumulated sufficient statistics are
    then each yielded as a separate check, compared to 3 decimals.
    """

    def check_arrays(expected, actual):
        # The arrays are bound here, at yield time, so a check stays valid even
        # if the runner only calls it after the generator has moved on.
        return lambda: np.testing.assert_array_almost_equal(
            expected, actual, decimal=3)

    def check_stat(reference, candidate, key):
        # The key lookup is deferred to call time so that a missing key is
        # reported by the individual check rather than aborting the generator.
        return lambda: np.testing.assert_array_almost_equal(
            reference[key], candidate[key], decimal=3)

    np.random.seed(42)
    n_features = 32
    length = 20

    #for n_states in [3, 4, 5, 7, 8, 9, 15, 16, 17, 31, 32]:
    for n_states in [8]:
        # Random sequence and model parameters; rows of the transition matrix
        # and the start distribution are normalised to sum to one.
        t1 = np.random.randn(length, n_features)
        means = np.random.randn(n_states, n_features)
        variances = np.random.rand(n_states, n_features)
        transmat = np.random.rand(n_states, n_states)
        transmat = transmat / np.sum(transmat, axis=1)[:, None]
        startprob = np.random.rand(n_states)
        startprob = startprob / np.sum(startprob)

        cuhmm = GaussianHMMCUDAImpl(n_states, n_features)
        cuhmm._sequences = [t1]

        pyhmm = GaussianHMM(n_components=n_states,
                            init_params='',
                            params='',
                            covariance_type='diag')
        cuhmm.means_ = means
        cuhmm.vars_ = variances
        cuhmm.transmat_ = transmat
        cuhmm.startprob_ = startprob
        logprob, custats = cuhmm.do_estep()

        pyhmm.means_ = means
        pyhmm.covars_ = variances
        pyhmm.transmat_ = transmat
        pyhmm.startprob_ = startprob
        # NOTE(review): the result of this call is discarded (the statistics
        # are initialised again below) - confirm whether it is needed
        pyhmm._initialize_sufficient_statistics()

        framelogprob = pyhmm._compute_log_likelihood(t1)
        cuframelogprob = cuhmm._get_framelogprob()
        yield check_arrays(framelogprob, cuframelogprob)

        fwdlattice = pyhmm._do_forward_pass(framelogprob)[1]
        cufwdlattice = cuhmm._get_fwdlattice()
        yield check_arrays(fwdlattice, cufwdlattice)

        bwdlattice = pyhmm._do_backward_pass(framelogprob)
        cubwdlattice = cuhmm._get_bwdlattice()
        yield check_arrays(bwdlattice, cubwdlattice)

        gamma = fwdlattice + bwdlattice
        posteriors = np.exp(gamma.T - logsumexp(gamma, axis=1)).T
        cuposteriors = cuhmm._get_posteriors()
        yield check_arrays(posteriors, cuposteriors)

        stats = pyhmm._initialize_sufficient_statistics()
        pyhmm._accumulate_sufficient_statistics(stats, t1, framelogprob,
                                                posteriors, fwdlattice,
                                                bwdlattice, 'stmc')

        # Diagnostic output; single-argument print(...) behaves the same on
        # Python 2 and Python 3.
        print('ref transcounts')
        print(transitioncounts(cufwdlattice, cubwdlattice, cuframelogprob,
                               np.log(transmat)))
        print('cutranscounts')
        print(custats['trans'])

        yield check_stat(stats, custats, 'trans')
        yield check_stat(stats, custats, 'post')
        yield check_stat(stats, custats, 'obs')
        yield check_stat(stats, custats, 'obs**2')