Example #1
"""
Created on Wed Oct 30 21:47:48 2019

@author: tanma
"""

import matplotlib.pyplot as plt
import numpy as np

from IPython.core.display import HTML
from itertools import chain
from collections import Counter, defaultdict, namedtuple
from helpers import show_model, Dataset
from pomegranate import State, HiddenMarkovModel, DiscreteDistribution

data = Dataset("tags-universal.txt",
               "brown-universal.txt",
               train_test_split=0.8)

assert len(data) == len(data.training_set) + len(data.testing_set), \
       "The number of sentences in the training set + testing set should sum to the number of sentences in the corpus"

assert data.N == data.training_set.N + data.testing_set.N, \
       "The number of training + test samples should sum to the total number of samples"


def pair_counts(sequences_A, sequences_B):
    """Return a dictionary keyed to each unique value in the first sequence list
    that counts the number of occurrences of the corresponding value from the
    second sequences list.

    For example, if sequences_A is tags and sequences_B is the corresponding
    words, then pair_counts[tag][word] counts how often word appears with tag.
    """
    counts = defaultdict(Counter)  # value_A -> Counter of co-occurring value_B
    for a, b in zip(chain(*sequences_A), chain(*sequences_B)):
        counts[a][b] += 1
    return counts
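Example #1 is truncated here. As a hedged sketch of how pair_counts is typically consumed with the pomegranate imports above (assumed, not from the truncated source; training_set.X/Y as word/tag sequences follows the Dataset convention visible in Example #4):

# Assumed usage: turn tag/word co-occurrence counts into one emission
# distribution and one hidden state per tag.
emission_counts = pair_counts(data.training_set.Y, data.training_set.X)

states = {}
for tag, word_counts in emission_counts.items():
    total = sum(word_counts.values())
    distribution = DiscreteDistribution({w: c / total for w, c in word_counts.items()})
    states[tag] = State(distribution, name=tag)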
Example #2
import os
import sys
import random
import logging

import tensorflow as tf  # TF 1.x API (tf.logging, tf.ConfigProto)
# Parser and Dataset are imported from project-local modules in the original
# repository; the listing cut off those import lines.


def evaluate(predictions, val_data):
    # Hypothetical reconstruction: only the last two lines of this function
    # survive in the listing, so the signature and loop are assumptions.
    acc = 0
    for pred, gold in zip(predictions, val_data):
        if pred == gold:
            acc += 1
    return float(acc) / len(val_data)


if __name__ == '__main__':
    tf.logging.set_verbosity(tf.logging.ERROR)
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    argv = sys.argv[1:]
    args, _ = Parser().getParser().parse_known_args(argv)
    random.seed()

    logging.basicConfig(filename=args.log, level=logging.INFO,
                        format='%(message)s')
    logger = logging.getLogger(__name__)

    dataset = Dataset(args.dataset, logger)
    train_data, dev_data = dataset.getdata(args.maxlenth)
    word_vector = dataset.get_wordvector(args.word_vector)

    ### quick-debug option: shrink the datasets when --smalldata is set
    train_text_num = 500
    dev_text_num = 20
    if args.smalldata == 1:
        train_data = train_data[:train_text_num]
        dev_data = dev_data[:dev_text_num]
    print("train_data ", len(train_data))
    print("dev_data", len(dev_data))
    ###

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
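The example ends before the session is created. A minimal hedged sketch of the usual TF 1.x continuation (assumed, not from the truncated source): allow_growth makes the GPU allocator claim memory on demand rather than all at once.

    # Assumed continuation: build the session with the config above and
    # initialize variables before the training loop.
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        # ... training loop over train_data, evaluation on dev_data ...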
Example #3
import matplotlib.pyplot as plt
import numpy as np

from IPython.core.display import HTML
from itertools import chain
from collections import Counter, defaultdict
from helpers import show_model, Dataset
from pomegranate import State, HiddenMarkovModel, DiscreteDistribution


data = Dataset("tags-universal.txt", "brown-universal.txt", train_test_split=0.8)


print("There are {} sentences in the corpus.".format(len(data)))
print("There are {} sentences in the training set.".format(len(data.training_set)))
print("There are {} sentences in the testing set.".format(len(data.testing_set)))


assert len(data) == len(data.training_set) + len(data.testing_set), \
       "The number of sentences in the training set + testing set should sum to the number of sentences in the corpus"


key = 'b100-38532'
print("Sentence: {}".format(key))
print("words:\n\t{!s}".format(data.sentences[key].words))
print("tags:\n\t{!s}".format(data.sentences[key].tags))


print("There are a total of {} samples of {} unique words in the corpus."
      .format(data.N, len(data.vocab)))
print("There are {} samples of {} unique words in the training set."
      .format(data.training_set.N, len(data.training_set.vocab)))
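Example #3 is cut off at this point. In the same spirit, a small assumed snippet showing what the Counter and chain imports at the top are typically used for (training_set.Y holds the tag sequences, as in Example #4):

# Assumed snippet: tally tag unigrams across all training sentences.
tag_unigrams = Counter(chain(*data.training_set.Y))
print("Most common tags:", tag_unigrams.most_common(3))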
Example #4

        # (snippet begins mid-function: caching bigram-sequence counts to CSV)
        res = pd.DataFrame([" ".join(tup) for tup in bigram_sequence_set],
                           columns=["bigram_sequence"])
        res = res.groupby("bigram_sequence").size().reset_index(name="Count")
        res.to_csv(bigram_training_path)

    df = pd.read_csv(bigram_training_path)
    df.drop(columns=["Unnamed: 0"], inplace=True)  # drop the stray CSV index column
    dct = df.set_index("bigram_sequence").T.to_dict("records")[0]
    return {key_to_tuple(k): dct[k] for k in dct}


def tag_aggregate(sequences):
    """Count how often each tag starts and ends a sequence, caching the pairs to CSV."""
    if not os.path.exists(TAG_TRAINING_PATH):
        start_end_frame = [(seq[0], seq[-1]) for seq in sequences]
        df = pd.DataFrame(start_end_frame, columns=["start_type", "end_type"])
        df.to_csv(TAG_TRAINING_PATH)

    df = pd.read_csv(TAG_TRAINING_PATH)
    start_frame = df.start_type.value_counts()
    end_frame = df.end_type.value_counts()
    return start_frame.to_dict(), end_frame.to_dict()


data = Dataset(TAG_PATH, BROWN_PATH, train_test_split=0.8)
tag_starts, tag_end = tag_aggregate(data.training_set.Y)
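The listing ends here. A hedged sketch of what these counts are typically used for (assumed continuation): normalizing them into start/end probabilities that can seed an HMM's transitions out of model.start and into model.end.

# Assumed continuation: normalize raw start/end tag counts into probabilities.
total_starts = sum(tag_starts.values())
total_ends = sum(tag_end.values())
start_prob = {tag: n / total_starts for tag, n in tag_starts.items()}
end_prob = {tag: n / total_ends for tag, n in tag_end.items()}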
Example #5
import matplotlib.pyplot as plt
import numpy as np
from IPython.core.display import HTML
from itertools import chain
from collections import Counter, defaultdict
from helpers import show_model, Dataset
from pomegranate import State, HiddenMarkovModel, DiscreteDistribution

data = Dataset("tags-universal.txt", "brown-universal.txt", train_test_split=0.8)

print("There are {} sentences in the corpus.".format(len(data)))
print("There are {} sentences in the training set.".format(len(data.training_set)))
print("There are {} sentences in the testing set.".format(len(data.testing_set)))

assert len(data) == len(data.training_set) + len(data.testing_set), \
    "The number of sentences in the training set + testing set should sum to the number of sentences in the corpus"

key = 'b100-38532'
print("Sentence: {}".format(key))
print("words:\t{!s}".format(data.sentences[key].words))
print("tags:\t{!s}".format(data.sentences[key].tags))

print("There are a total of {} samples of {} unique words in the copus."
      .format(data.N, len(data.vocab)))
print("There are {} samples of {} unique words in the training set."
      .format(data.training_set.N, len(data.training_set.vocab)))
print("There are {} samples of {} unique words in the testing set."
      .format(data.testing_set.N, len(data.testing_set.vocab)))
print("There are {} words in the test set that are missing in the training set."
      .format(len(data.testing_set.vocab - data.training_set.vocab)))
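The last print highlights out-of-vocabulary words, which would break emission-probability lookups at decoding time. A minimal hedged sketch of one common workaround (replace_unknown is a hypothetical helper, not part of this listing):

# Hypothetical helper: map test words missing from the training vocabulary to
# a placeholder token so every word has an emission probability.
def replace_unknown(sequence, vocab=data.training_set.vocab, placeholder='nan'):
    return [w if w in vocab else placeholder for w in sequence]

print(replace_unknown(data.sentences[key].words))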