def _encode_tagged_lines(lines, word2idx, tag2idx, split_sequences, grow_vocab):
    """Convert "word tag extra" lines into parallel word-index and tag-index lists.

    Args:
        lines: iterable of text lines. Every non-blank line must hold exactly
            three whitespace-separated fields; the third one is ignored.
        word2idx: word -> index mapping (extended in place when grow_vocab).
        tag2idx: tag -> index mapping (extended in place when grow_vocab).
        split_sequences: when true, blank lines delimit sequences and a list
            of per-sequence lists is returned; otherwise one flat list.
        grow_vocab: when true, unseen words/tags receive the next free index.
            When false, unseen words map to ``len(word2idx)`` and unseen tags
            raise ``KeyError``.

    Returns:
        ``(X, Y)`` with matching structure: word indices and tag indices.
    """
    # Index shared by every out-of-vocabulary word; only read when the
    # vocabulary is frozen, so len(word2idx) cannot change underneath it.
    unknown_idx = len(word2idx)

    sequences_x, sequences_y = [], []
    current_x, current_y = [], []
    for line in lines:
        line = line.rstrip()
        if line:
            word, tag, _ = line.split()
            if grow_vocab:
                word_id = word2idx.setdefault(word, len(word2idx))
                tag_id = tag2idx.setdefault(tag, len(tag2idx))
            else:
                word_id = word2idx.get(word, unknown_idx)
                tag_id = tag2idx[tag]
            current_x.append(word_id)
            current_y.append(tag_id)
        elif split_sequences and current_x:
            # Only flush non-empty sequences: splitting the payload on "\n"
            # yields extra empty strings for a trailing newline or for
            # consecutive blank lines, which must not become empty samples.
            sequences_x.append(current_x)
            sequences_y.append(current_y)
            current_x, current_y = [], []

    if not split_sequences:
        return current_x, current_y

    # Keep the final sequence even if the payload lacks a closing blank line.
    if current_x:
        sequences_x.append(current_x)
        sequences_y.append(current_y)
    return sequences_x, sequences_y


def get_data(split_sequences=False):
    """Load the tagged train/test corpora and encode them as integer indices.

    Reads "pos_train.txt" and "pos_test.txt" through ``get_obj_s3``. Word and
    tag indices are assigned in order of first appearance in the training
    data. Words that occur only in the test data are encoded as
    ``len(word2idx)``, an index that is not itself a key of ``word2idx``.

    Args:
        split_sequences: if true, return one list per blank-line-delimited
            sequence; if false, return a single flat list per corpus.

    Returns:
        ``(Xtrain, Ytrain, Xtest, Ytest, word2idx)``.

    Raises:
        ValueError: a non-blank line does not have exactly three fields.
        KeyError: the test data contains a tag absent from the training data.
    """
    word2idx = {}
    tag2idx = {}

    train_lines = get_obj_s3("pos_train.txt").read().decode("utf-8").split("\n")
    Xtrain, Ytrain = _encode_tagged_lines(
        train_lines, word2idx, tag2idx, split_sequences, grow_vocab=True
    )

    # load and score test data
    test_lines = get_obj_s3("pos_test.txt").read().decode("utf-8").split("\n")
    Xtest, Ytest = _encode_tagged_lines(
        test_lines, word2idx, tag2idx, split_sequences, grow_vocab=False
    )

    return Xtrain, Ytrain, Xtest, Ytest, word2idx
Ejemplo n.º 2
0
def fit_coin(file_key):
    """Train an HMM on the coin-flip sequences stored under ``file_key``.

    Every line of the decoded object becomes one observation sequence in
    which the character "H" is encoded as 1 and any other character as 0.
    Prints the summed log likelihood after fitting, the summed log
    likelihood after overwriting the parameters with fixed reference
    values, and finally the first observation sequence together with the
    state sequence the model returns for it.
    """
    raw_text = get_obj_s3(file_key).read().decode("utf-8")
    flips = [
        [int(symbol == "H") for symbol in row.rstrip()]
        for row in raw_text.strip().split(sep="\n")
    ]

    # HMM constructed with the argument 2, then trained on all sequences
    model = HMM(2)
    model.fit(flips)
    fitted_ll = model.log_likelihood_multi(flips).sum()
    print("Log likelihood with fitted params: ", round(fitted_ll, 3))

    # Replace the learned parameters with the reference ("true") values
    model.pi = np.array([0.5, 0.5])
    model.A = np.array([[0.1, 0.9], [0.8, 0.2]])
    model.B = np.array([[0.6, 0.4], [0.3, 0.7]])
    reference_ll = model.log_likelihood_multi(flips).sum()
    print("Log Likelihood with true params: ", round(reference_ll, 3))

    # Show the first observation sequence, then the decoded state sequence
    first_sequence = flips[0]
    print("Best state sequence:\n", str(first_sequence).replace(",", ""))
    print("", model.get_state_sequence(first_sequence))
Ejemplo n.º 3
0
def fit_coin(file_key):
    """Fit the TensorFlow-backed HMM to coin-flip data read from ``file_key``.

    Each line of the decoded object is encoded as a list of ints ("H" -> 1,
    anything else -> 0). The model is fitted for 5 iterations inside a
    TensorFlow session, and the summed ``get_cost_multi`` value is printed
    once for the fitted parameters and once after loading fixed reference
    parameters.
    """

    def to_log_float32(probabilities):
        # Parameters are handed to hmm.set() as float32 logs of the
        # probabilities (the "pre-softmax" form).
        return np.log(np.array(probabilities)).astype(np.float32)

    payload = get_obj_s3(file_key).read().decode("utf-8")
    flips = [
        [int(symbol == "H") for symbol in row.rstrip()]
        for row in payload.strip().split(sep="\n")
    ]

    model = HMM(2)
    # The whole graph (optimizer variables included) has to exist before
    # the global-variables initializer op is created.
    model.init_random(2)
    initializer = tf.global_variables_initializer()

    with tf.Session() as sess:
        sess.run(initializer)
        model.set_session(sess)
        model.fit(flips, max_iter=5)
        fitted_total = model.get_cost_multi(flips).sum()
        print("LL with fitted params:", fitted_total)

        # Swap in the reference ("true") parameters and score again
        log_pi = to_log_float32([0.5, 0.5])
        log_A = to_log_float32([[0.1, 0.9], [0.8, 0.2]])
        log_B = to_log_float32([[0.6, 0.4], [0.3, 0.7]])
        model.set(log_pi, log_A, log_B)
        reference_total = model.get_cost_multi(flips).sum()
        print("LL with true params:", reference_total)
Ejemplo n.º 4
0
def fit_coin(file_key):
    """Fit an HMM to the coin-flip sequences found under ``file_key``.

    Lines of the decoded object are encoded as lists of ints ("H" -> 1, any
    other character -> 0). After fitting, the summed ``get_cost_multi``
    value is printed; the parameters are then replaced through ``hmm.set``
    with fixed reference values and the summed value is printed again.
    """
    decoded = get_obj_s3(file_key).read().decode("utf-8")
    flips = [
        [int(symbol == "H") for symbol in row.rstrip()]
        for row in decoded.strip().split(sep="\n")
    ]

    # HMM constructed with the argument 2, then trained on all sequences
    model = HMM(2)
    model.fit(flips)
    fitted_total = model.get_cost_multi(flips).sum()
    print("Log likelihood with fitted params: ", round(fitted_total, 3))

    # Score the same data under the reference ("true") parameters
    reference_pi = np.array([0.5, 0.5])
    reference_A = np.array([[0.1, 0.9], [0.8, 0.2]])
    reference_B = np.array([[0.6, 0.4], [0.3, 0.7]])
    model.set(reference_pi, reference_A, reference_B)
    reference_total = model.get_cost_multi(flips).sum()
    print("Log Likelihood with true params: ", round(reference_total, 3))