Example #1
    def load_and_process_data(self, train_path, dev_path=None):
        # load train and dev data
        logger.info("Loading training data...")
        train_raw = read_conll(train_path)
        logger.info("Done. Read %d sentences", len(train_raw))
        logger.info("Loading dev data...")
        if dev_path is None or not os.path.exists(dev_path):
            # no dev file given; hold out the tail of the training data as the dev set
            dev_seg_size = self.config.dev_seg_size
            dev_raw = train_raw[len(train_raw) - dev_seg_size:]
            train_raw = train_raw[:len(train_raw) - dev_seg_size]
            logger.info("Divided train data. Read %d sentences of train",
                        len(train_raw))
        else:
            dev_raw = read_conll(dev_path)
        logger.info("Done. Read %d sentences", len(dev_raw))

        helper = ModelHelper.build(train_raw, self.config)
        logger.info("Corpus of train max sentence length is %d",
                    helper.max_length)

        # process all the input data
        train_data = helper.vectorize(train_raw, self.config)
        dev_data = helper.vectorize(dev_raw, self.config)

        return helper, train_data, dev_data, train_raw, dev_raw
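
All of these examples consume the output of read_conll, which, per the inline comments in Example #5, yields one (tokens, labels) pair per sentence. A minimal sketch of such a reader, assuming the usual one-token-per-line CoNLL layout with blank lines between sentences (only the function name and return shape come from the examples; the column layout is an assumption):

def read_conll(path):
    """Parse a CoNLL-style file into [(tokens, labels)] -- a sketch only."""
    sentences = []
    tokens, labels = [], []
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line closes the current sentence.
                if tokens:
                    sentences.append((tokens, labels))
                    tokens, labels = [], []
                continue
            parts = line.split()
            tokens.append(parts[0])   # word in the first column
            labels.append(parts[-1])  # NER tag in the last column
    if tokens:  # flush a final sentence with no trailing blank line
        sentences.append((tokens, labels))
    return sentences
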
Example #2
def load_and_preprocess_data(args):
    logger.info("Loading training data...")
    train = read_conll(args.data_train)
    logger.info("Done. Read %d sentences", len(train))
    logger.info("Loading dev data...")
    dev = read_conll(args.data_dev)
    logger.info("Done. Read %d sentences", len(dev))

    helper = ModelHelper.build(train)

    # now process all the input data.
    train_data = helper.vectorize(train)
    dev_data = helper.vectorize(dev)

    return helper, train_data, dev_data, train, dev
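
Every example builds a ModelHelper from the training sentences and then vectorizes with it. A hedged reconstruction of what build() must at least do, grounded in how the examples use the object (tok2id, max_length, and the <s>/</s>/UUUNKKK/CASE:* entries visible in the tok2id dump under Example #6 below); the body is an assumption, not the assignment's code, and Example #1's variant additionally takes a config argument:

class ModelHelper(object):
    """Sketch of the shared helper: a tok2id vocabulary plus the maximum
    training-sentence length."""

    def __init__(self, tok2id, max_length):
        self.tok2id = tok2id
        self.max_length = max_length

    @classmethod
    def build(cls, data):
        # Ids start at 1, matching the dump under Example #6.
        tok2id = {}
        for sentence, _ in data:
            for word in sentence:
                tok2id.setdefault(word.lower(), len(tok2id) + 1)
        for special in ("CASE:aa", "CASE:aA", "CASE:Aa", "CASE:AA",
                        "<s>", "</s>", "UUUNKKK"):
            tok2id.setdefault(special, len(tok2id) + 1)
        max_length = max(len(sentence) for sentence, _ in data)
        return cls(tok2id, max_length)
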
Example #4
def do_evaluate(args):
    config = Config(args)
    helper = ModelHelper.load(args.model_path)
    input_data = read_conll(args.data)
    embeddings = load_embeddings(args, helper)
    config.embed_size = embeddings.shape[1]

    with tf.Graph().as_default():
        logger.info("Building model...",)
        start = time.time()
        model = RNNModel(helper, config, embeddings)

        logger.info("took %.2f seconds", time.time() - start)

        init = tf.global_variables_initializer()
        saver = tf.train.Saver()

        with tf.Session() as session:
            session.run(init)
            saver.restore(session, model.config.model_output)
            probs = None
            for output in model.output(session, input_data, summarize=args.verbose):
                if args.verbose:
                    (sentence, labels, predictions, probs) = output
                else:
                    (sentence, labels, predictions) = output
                predictions = [LBLS[l] for l in predictions]
                print_sentence(args.output, sentence, labels, predictions, probs)
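
do_evaluate leans on two helpers defined elsewhere in the assignment code: LBLS, the list of label names indexed by class id, and print_sentence, which writes one sentence to the output stream. The id-to-label order can be read off the vectorized dump in Example #6 below (PER=0, ORG=1, LOC=2, MISC=3, O=4); the print_sentence body is an assumption, kept to what the two call sites require:

LBLS = ["PER", "ORG", "LOC", "MISC", "O"]  # order inferred from Example #6

def print_sentence(output, sentence, labels, predictions, probs=None):
    # probs is optional so both call sites type-check: Example #4 passes
    # per-token probabilities, Examples #9 and #10 omit them.
    for i, (tok, gold, pred) in enumerate(zip(sentence, labels, predictions)):
        row = "{:<16}{:<8}{:<8}".format(tok, gold, pred)
        if probs is not None:
            row += " ".join("%.2f" % p for p in probs[i])
        output.write(row + "\n")
    output.write("\n")
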
Example #5
def load_and_preprocess_data(args):
    logger.info("Loading training data...")
    train = read_conll(args.data_train)  # [([words of a sentence],[labels])]
    logger.info("Done. Read %d sentences", len(train))
    logger.info("Loading dev data...")
    dev = read_conll(args.data_dev)
    logger.info("Done. Read %d sentences", len(dev))

    helper = ModelHelper.build(train)  # this call also initializes the ModelHelper -- an odd initialization pattern...

    # now process all the input data.
    train_data = helper.vectorize(train)  # [ ( [ [wordid, caseid] ], [label_ids] ) ]
    dev_data = helper.vectorize(dev)

    return helper, train_data, dev_data, train, dev
Example #6
def load_and_preprocess_data(args):
    logger.info("Loading training data...")
    train = read_conll(args.data_train)
    # ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'] ['ORG', 'O', 'MISC', 'O', 'O', 'O', 'MISC', 'O', 'O']
    # ['Peter', 'Blackburn'] ['PER', 'PER']
    # ['The', 'European', 'Commission', 'said', 'on', 'Thursday', 'it', 'disagreed', 'with', 'German', 'advice', 'to', 'consumers', 'to', 'shun', 'British', 'lamb', 'until', 'scientists', 'determine', 'whether', 'mad', 'cow', 'disease', 'can', 'be', 'transmitted', 'to', 'sheep', '.']
    logger.info("Done. Read %d sentences", len(train))
    logger.info("Loading dev data...")
    dev = read_conll(args.data_dev)
    logger.info("Done. Read %d sentences", len(dev))

    helper = ModelHelper.build(train)

    # now process all the input data.
    train_data = helper.vectorize(train)
    dev_data = helper.vectorize(dev)
    """
    Let's look at an example, with args.data_train pointing at the file tiny.conll
    and train truncated to its first five sentences (train = train[0:5]).
    The resulting train:
    [(['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], ['ORG', 'O', 'MISC', 'O', 'O', 'O', 'MISC', 'O', 'O']), 
    (['Peter', 'Blackburn'], ['PER', 'PER']),
     (['BRUSSELS', '1996-08-22'], ['LOC', 'O']), 
     (['The', 'European', 'Commission', 'said', 'on', 'Thursday', 'it', 'disagreed', 'with', 'German', 'advice', 'to', 'consumers', 'to', 'shun', 'British', 'lamb', 'until', 'scientists', 'determine', 'whether', 'mad', 'cow', 'disease', 'can', 'be', 'transmitted', 'to', 'sheep', '.'], ['O', 'ORG', 'ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'MISC', 'O', 'O', 'O', 'O', 'O', 'MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']), 
     (['Germany', "'s", 'representative', 'to', 'the', 'European', 'Union', "'s", 'veterinary', 'committee', 'Werner', 'Zwingmann', 'said', 'on', 'Wednesday', 'consumers', 'should', 'buy', 'sheepmeat', 'from', 'countries', 'other', 'than', 'Britain', 'until', 'the', 'scientific', 'advice', 'was', 'clearer', '.'], ['LOC', 'O', 'O', 'O', 'O', 'ORG', 'ORG', 'O', 'O', 'O', 'PER', 'PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O'])]

    The resulting helper.tok2id:

    {'CASE:aA': 56, 'CASE:AA': 57, '1996-08-22': 14, 'committee': 15, 'be': 38, 'german': 4, 'it': 16, 'boycott': 17, 
    'britain': 18, 'werner': 19, 'determine': 20, 'UUUNKKK': 62, 'eu': 21, 'peter': 22, '</s>': 61, 'lamb': 5, 'disagreed': 24, 
    'said': 6, 'from': 25, 'sheepmeat': 26, 'consumers': 7, 'rejects': 27, 'union': 28, 'veterinary': 29, 'thursday': 30, '.': 2, 
    'zwingmann': 31, 'to': 1, 'other': 33, 'call': 34, 'scientists': 35, 'was': 36, 'until': 8, 'european': 9, 'CASE:aa': 59, 'sheep': 23, 
    'buy': 39, "'s": 10, 'scientific': 37, 'advice': 11, 'clearer': 40, 'wednesday': 41, 'germany': 42, 'mad': 43, 'shun': 44, 'brussels': 45, 
    'with': 46, 'than': 47, 'on': 12, 'disease': 53, 'cow': 48, 'blackburn': 49, 'whether': 50, 'should': 51, 'countries': 52, 'commission': 32, 
    'british': 13, 'CASE:Aa': 58, '<s>': 60, 'transmitted': 54, 'can': 55, 'the': 3, 'representative': 56}

    The resulting train_data:
    [([[21, 57], [27, 59], [4, 58], [34, 59], [1, 59], [17, 59], [13, 58], [5, 59], [2, 56]], [1, 4, 3, 4, 4, 4, 3, 4, 4]), 
    ([[22, 58], [49, 58]], [0, 0]), 
    ([[45, 57], [14, 56]], [2, 4]), 
    ([[3, 58], [9, 58], [32, 58], [6, 59], [12, 59], [30, 58], [16, 59], [24, 59], [46, 59], [4, 58], [11, 59], [1, 59], [7, 59], [1, 59], [44, 59], [13, 58], [5, 59], [8, 59], [35, 59], [20, 59], [50, 59], [43, 59], [48, 59], [53, 59], [55, 59], [38, 59], [54, 59], [1, 59], [23, 59], [2, 56]], [4, 1, 1, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]), 
    ([[42, 58], [10, 59], [56, 59], [1, 59], [3, 59], [9, 58], [28, 58], [10, 59], [29, 59], [15, 59], [19, 58], [31, 58], [6, 59], [12, 59], [41, 58], [7, 59], [51, 59], [39, 59], [26, 59], [25, 59], [52, 59], [33, 59], [47, 59], [18, 58], [8, 59], [3, 59], [37, 59], [11, 59], [36, 59], [40, 59], [2, 56]], [2, 4, 4, 4, 4, 1, 1, 4, 4, 4, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4])]

    """

    return helper, train_data, dev_data, train, dev
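
The dump above makes the vectorization scheme legible: every token becomes a [word_id, case_id] pair, unknown words fall back to UUUNKKK (62), and the case feature distinguishes aa, aA, Aa, and AA (ids 59, 56, 58, and 57 in this run). A minimal sketch of that mapping that reproduces the pairs shown above; the function names are mine, and the rules are inferred from the dump rather than taken from the assignment:

def casing(word):
    # Four-way case feature matching the CASE:* entries in tok2id above.
    if word.islower():
        return "aa"
    if word.isupper():
        return "AA"
    if word[0].isupper():
        return "Aa"
    return "aA"  # mixed case, digits, and punctuation land here

def vectorize_sentence(tok2id, sentence):
    """Map tokens to [word_id, case_id] pairs, as helper.vectorize does.

    Unknown words fall back to the UUUNKKK entry; word keys are assumed
    lowercased, as in the dump above.
    """
    unk = tok2id["UUUNKKK"]
    return [[tok2id.get(word.lower(), unk), tok2id["CASE:" + casing(word)]]
            for word in sentence]
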
Example #7
def load_and_preprocess_data(args):
    logger.info("Loading training data...")
    train = read_conll(args.data_train)
    #print("********************Writing train master data to a file:**********************")
    #with open("F:/Jupyter/NLP/assignment3/train.txt","w") as f:
        #f.write(str(train))
    logger.info("Done. Read %d sentences", len(train))
    logger.info("Loading dev data...")
    dev = read_conll(args.data_dev)
    logger.info("Done. Read %d sentences", len(dev))

    helper = ModelHelper.build(train)

    # now process all the input data.
    train_data = helper.vectorize(train)
    #print("***********************Writing vectorized train data*********************")
    #with open("F:/Jupyter/NLP/assignment3/train_data.txt","w") as f:
        #f.write(str(train_data))
    dev_data = helper.vectorize(dev)

    return helper, train_data, dev_data, train, dev
Example #8
def load_data(args, helper=None):
    train_examples, dev_examples, data_examples = {}, {}, {}
    if hasattr(args, 'data_train'):
        logger.info("Loading training data...")
        train_examples['tokens'] = read_conll(args.data_train)
        logger.info("Done. Read %d sentences", len(train_examples['tokens']))
    if hasattr(args, 'data_dev'):
        logger.info("Loading dev data...")
        dev_examples['tokens'] = read_conll(args.data_dev)
        logger.info("Done. Read %d sentences", len(dev_examples['tokens']))
    if hasattr(args, 'data'):
        logger.info("Loading data...")
        data_examples['tokens'] = read_conll(args.data)
        logger.info("Done. Read %d sentences", len(data_examples['tokens']))

    if hasattr(args, 'data_train') and helper is None:
        helper = ModelHelper.build(train_examples['tokens'])
        if 'tokens' in dev_examples:  # dev data may not have been loaded
            helper.max_length = max(
                max(len(sentence) for sentence, _ in dev_examples['tokens']),
                helper.max_length)

    # now process all the input data.
    if hasattr(args, 'data_train'):
        train_examples['token_indices'] = helper.vectorize(
            train_examples['tokens'])
    if hasattr(args, 'data_dev'):
        dev_examples['token_indices'] = helper.vectorize(
            dev_examples['tokens'])
    if hasattr(args, 'data'):
        data_examples['token_indices'] = helper.vectorize(
            data_examples['tokens'])

    data = {
        'train_examples': train_examples,
        'dev_examples': dev_examples,
        'examples': data_examples
    }
    return helper, data
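
Because load_data branches on hasattr, the caller controls which splits are loaded purely by which attributes it sets on args. A hedged usage sketch (the attribute names come from the checks above; the file paths are placeholders):

from argparse import Namespace

# Training run: loads train and dev, builds a fresh ModelHelper.
args = Namespace(data_train="data/train.conll", data_dev="data/dev.conll")
helper, data = load_data(args)

# Evaluation run: reuse the trained helper so token ids stay consistent.
eval_args = Namespace(data="data/test.conll")
_, eval_data = load_data(eval_args, helper=helper)
predictions_input = eval_data['examples']['token_indices']
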
Example #9
def do_evaluate(args):
    config = Config(args)
    helper = ModelHelper.load(args.model_path)
    input_data = read_conll(args.data)
    embeddings = load_embeddings(args, helper)
    config.embed_size = embeddings.shape[1]

    logger.info("Building model...",)
    start = time.time()
    model = RNNModel(helper, config, embeddings)

    logger.info("took %.2f seconds", time.time() - start)

    for sentence, labels, predictions in model.output(input_data):
        predictions = [LBLS[l] for l in predictions]
        print_sentence(args.output, sentence, labels, predictions)
Example #10
def do_evaluate(args):
    config = Config(args)
    helper = ModelHelper.load(args.model_path)
    input_data = read_conll(args.data)
    embeddings = load_embeddings(args, helper)
    config.embed_size = embeddings.shape[1]

    with tf.Graph().as_default():
        logger.info("Building model...",)
        start = time.time()
        model = RNNModel(helper, config, embeddings)

        logger.info("took %.2f seconds", time.time() - start)

        init = tf.global_variables_initializer()
        saver = tf.train.Saver()

        with tf.Session() as session:
            session.run(init)
            saver.restore(session, model.config.model_output)
            for sentence, labels, predictions in model.output(session, input_data):
                predictions = [LBLS[l] for l in predictions]
                print_sentence(args.output, sentence, labels, predictions)
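
Examples #4, #9, and #10 are written against the TensorFlow 1.x graph API (tf.Graph, tf.Session, tf.train.Saver, tf.global_variables_initializer). Under a TensorFlow 2 install those names live in the compat module; a minimal shim for running the code above unmodified:

import tensorflow.compat.v1 as tf

# Restores graph-mode semantics (sessions, collections, the v1 Saver) so
# the evaluation code above runs on a TensorFlow 2 installation.
tf.disable_v2_behavior()
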