Beispiel #1
0
def init_from_scratch(args, train_exs, dev_exs):
    """Build a fresh DocReader along with its feature, word and char dictionaries.

    Args:
        args: Run configuration. It is handed to the ``utils`` dictionary
            builders and to ``config.get_model_args``; its ``embedding_file``
            and ``char_embedding_file`` attributes are read here.
        train_exs: Training examples; the only input to the feature dictionary.
        dev_exs: Dev examples; joined to ``train_exs`` with ``+`` when building
            the word and char dictionaries.

    Returns:
        The newly constructed ``DocReader``.
    """
    separator = '-' * 100

    def announce(stage):
        # Each stage is introduced by a rule line followed by its title.
        logger.info(separator)
        logger.info(stage)

    # Feature dictionary: derived from the training examples only.
    announce('Generate features')
    features = utils.build_feature_dict(args, train_exs)
    logger.info('Num features = %d' % len(features))
    logger.info(features)

    # Word dictionary: built over the train and dev splits together.
    announce('Build word dictionary')
    words = utils.build_word_dict(args, train_exs + dev_exs)
    logger.info('Num words = %d' % len(words))

    # Char dictionary: built over the same train + dev examples.
    announce('Build char dictionary')
    chars = utils.build_char_dict(args, train_exs + dev_exs)
    logger.info('Num chars = %d' % len(chars))

    # Assemble the model from the three dictionaries.
    model_args = config.get_model_args(args)
    reader = DocReader(model_args, words, chars, features)

    # Pretrained embeddings are optional; each is loaded only when its
    # file option is set to a truthy value.
    if args.embedding_file:
        reader.load_embeddings(words.tokens(), args.embedding_file)
    if args.char_embedding_file:
        reader.load_char_embeddings(chars.tokens(), args.char_embedding_file)

    return reader
Beispiel #2
0
def init_from_scratch(args, train_exs):
    """Create a new TMmodel from vocabularies built on the training examples.

    Progress is reported with ``print``. After construction, the model's
    ``load_word_embedding`` and ``load_char_embedding`` methods are called,
    in that order.

    Args:
        args: Run configuration, forwarded to the vocabulary builders and to
            the ``TMmodel`` constructor.
        train_exs: Training examples the word and char vocabularies are
            built from.

    Returns:
        The constructed ``TMmodel`` after both embedding loaders have run.
    """
    # NOTE: the printed messages are kept verbatim.
    print('init from scrath')

    print('building word vocabulary')
    word_vocab = build_word_dict(args, train_exs)

    print('building char vocabulary')
    char_vocab = build_char_dict(args, train_exs)

    tm_model = TMmodel(args, word_vocab, char_vocab)

    # Word embeddings first, then char embeddings.
    tm_model.load_word_embedding()
    tm_model.load_char_embedding()

    return tm_model