Example #1
0
def init_from_scratch(args, train_exs, dev_exs):
    """Build a brand-new model from raw data: fresh feature, word, and
    (optionally) character dictionaries, with pretrained embeddings loaded
    when an embedding file is supplied."""
    # Feature dictionary is derived from annotations in the training data.
    logger.info('-' * 100)
    logger.info('Generate features')
    feature_dict = utils.build_feature_dict(args, train_exs)
    logger.info('Num features = %d' % len(feature_dict))
    logger.info(feature_dict)

    # Word dictionary covers both the train and dev splits.
    logger.info('-' * 100)
    logger.info('Build dictionary')
    word_dict = utils.build_word_dict(args, train_exs + dev_exs)
    logger.info('Num words = %d' % len(word_dict))

    # Collect the DocReader constructor arguments; a character dictionary
    # is appended only when character embeddings are enabled.
    reader_args = [config.get_model_args(args), word_dict, feature_dict]
    if args.use_char_emb:
        logger.info('-' * 100)
        logger.info('Build character dictionary')
        character_dict = utils.build_character_dict(args, train_exs + dev_exs)
        logger.info('Num character = %d' % len(character_dict))
        reader_args.append(character_dict)
    model = DocReader(*reader_args)

    # Seed embeddings for in-vocabulary words from a pretrained file, if any.
    if args.embedding_file:
        model.load_embeddings(word_dict.tokens(), args.embedding_file)

    return model
Example #2
0
def init_from_scratch(args, train_exs, dev_exs):
    """New model, new data, new dictionary.

    Args:
        args: configuration namespace; must expose ``embedding_file``.
        train_exs: training examples used for the feature dict and vocabulary.
        dev_exs: dev examples, included when building the word dictionary.

    Returns:
        A freshly initialized DocReader, with pretrained word embeddings
        loaded when ``args.embedding_file`` is set.
    """
    # Create a feature dict out of the annotations in the data.
    logger.info('-' * 100)
    logger.info('Generate features')
    feature_dict = utils.build_feature_dict(args, train_exs)
    logger.info('Num features = %d' % len(feature_dict))
    logger.info(feature_dict)

    # Build a dictionary from the data questions + words (train/dev splits).
    logger.info('-' * 100)
    logger.info('Build dictionary')
    word_dict = utils.build_word_dict(args, train_exs + dev_exs)
    logger.info('Num words = %d' % len(word_dict))

    # Initialize model.
    model = DocReader(config.get_model_args(args), word_dict, feature_dict)

    # Load pretrained embeddings for words in dictionary.
    if args.embedding_file:
        model.load_embeddings(word_dict.tokens(), args.embedding_file)

    # NOTE: dead commented-out quantization/tensor-train support removed;
    # recover it from version control if it is ever needed again.
    return model
Example #3
0
def init_from_scratch(args, train_exs, dev_exs):
    """New model, new data, new dictionaries (word and char).

    The word dictionary and embedding weights either come from a previously
    saved model checkpoint (``args.embedding_from_model``) or are rebuilt
    from the train + dev examples. A character dictionary is always rebuilt.

    Args:
        args: configuration namespace; must expose ``embedding_from_model``,
            ``embedding_file`` and ``char_embedding_file``.
        train_exs: training examples.
        dev_exs: dev examples (used for dictionary construction).

    Returns:
        A freshly initialized DocReader with word/char embeddings loaded
        according to the flags above.
    """
    # Create a feature dict out of the annotations in the data.
    logger.info('-' * 100)
    logger.info('Generate features')
    feature_dict = utils.build_feature_dict(args, train_exs)
    logger.info('Num features = %d' % len(feature_dict))
    logger.info(feature_dict)

    # Build a dictionary from the data questions + documents (train/dev splits).
    logger.info('-' * 100)
    logger.info('Build word dictionary')
    if args.embedding_from_model:
        # Reuse the vocabulary and embedding matrix of an existing checkpoint;
        # map_location keeps tensors on CPU regardless of where they were saved.
        sp = torch.load(args.embedding_from_model, map_location=lambda storage, loc: storage)
        word_dict = sp['word_dict']
        embedding_weights = sp['state_dict']['embedding.weight']
    else:
        embedding_weights = None
        word_dict = utils.build_word_dict(args, train_exs + dev_exs)
    logger.info('Num words = %d' % len(word_dict))

    # Build a char dictionary from the data questions + documents (train/dev splits).
    logger.info('-' * 100)
    logger.info('Build char dictionary')
    char_dict = utils.build_char_dict(args, train_exs + dev_exs)
    logger.info('Num chars = %d' % len(char_dict))
    # Initialize model.
    model = DocReader(config.get_model_args(args), word_dict, char_dict, feature_dict)

    # Load pretrained embeddings for words in dictionary. Checkpoint weights
    # take precedence over an embedding file (flattened from a nested
    # else/if to an elif for clarity; behavior unchanged).
    if args.embedding_from_model:
        model.load_emb_weights(embedding_weights)
    elif args.embedding_file:
        model.load_embeddings(word_dict.tokens(), args.embedding_file)
    if args.char_embedding_file:
        model.load_char_embeddings(char_dict.tokens(), args.char_embedding_file)

    return model
Example #4
0
def init_from_scratch(args, train_exs, dev_exs):
    """Construct a DocReader from raw data with freshly built dictionaries."""
    # Feature dictionary derived from the training annotations.
    logger.info('-' * 100)
    logger.info('Generate features')
    feats = utils.build_feature_dict(args, train_exs)
    logger.info('Num features = %d' % len(feats))
    logger.info(feats)

    # Vocabulary built over both the training and dev splits.
    logger.info('-' * 100)
    logger.info('Build dictionary')
    vocab = utils.build_word_dict(args, train_exs + dev_exs)
    logger.info('Num words = %d' % len(vocab))

    # Fresh model instance.
    reader = DocReader(config.get_model_args(args), vocab, feats)

    # Seed word embeddings from a pretrained file when one is given.
    if args.embedding_file:
        reader.load_embeddings(vocab.tokens(), args.embedding_file)

    return reader
Example #5
0
def init_from_scratch(args, train_exs, dev_exs):
    """New model, new data, new dictionary."""

    def _banner(title):
        # Log a section separator followed by the section title.
        logger.info("-" * 100)
        logger.info(title)

    # Feature dict from the annotations in the training data.
    _banner("Generate features")
    feature_dict = utils.build_feature_dict(args, train_exs)
    logger.info("Num features = %d" % len(feature_dict))
    logger.info(feature_dict)

    # Word dictionary from the train + dev splits.
    _banner("Build dictionary")
    word_dict = utils.build_word_dict(args, train_exs + dev_exs)
    logger.info("Num words = %d" % len(word_dict))

    # Build the model, then optionally load pretrained word embeddings.
    model = DocReader(config.get_model_args(args), word_dict, feature_dict)
    if args.embedding_file:
        model.load_embeddings(word_dict.tokens(), args.embedding_file)

    return model