Example #1
def doc_test(log_file):
    """Test the model qualitatively on a document.
    
    Args:
        log_file: where to store the output document.
    """
    if os.path.isfile('./' + log_file + '_doc'):
        raise ValueError('log file already exists')

    # The temp file created here splits the input document into line-by-line
    # representations, making it easier to see how translations were done at
    # the sentence level.
    with tf.Session() as sess, \
            open(log_file + '_doc', 'w+') as log, \
            open(dc.NORMAL_DOC_PATH, 'r+') as doc_file, \
            open('temp_doc', 'w+') as temp:
        doc_to_translate = doc_file.readline()
        sentences = [
            sentence + '.' for sentence in doc_to_translate.split('.')
        ]
        model = create_model(sess, True)
        model.batch_size = 1
        normal_vocab, _ = data_utils.get_vocabulary(dc.NORMAL_VOCAB_PATH)
        _, rev_simple_vocab = data_utils.get_vocabulary(dc.SIMPLE_VOCAB_PATH)
        for sentence in sentences:
            translation = pipe_sentence(sentence, normal_vocab,
                                        rev_simple_vocab, sess, model)
            log.write(translation + '\n')
            temp.write(sentence + '\n')
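Example #1 and Example #4 below rely on the same pair of lookups: a word-to-id mapping for the "normal" vocabulary and a reversed id-to-word list for the "simple" vocabulary. The snippets do not show data_utils.get_vocabulary itself, so the following is only a minimal sketch of the contract those calls appear to assume (one token per line, line number as id); the real implementation may differ.

# Minimal sketch of the assumed vocabulary contract: a token-to-id dict plus
# the reverse id-to-token list. The one-token-per-line file format is an
# assumption, not taken from the snippets above.
def load_vocabulary_sketch(vocab_path):
    with open(vocab_path, 'r') as f:
        rev_vocab = [line.strip() for line in f if line.strip()]
    vocab = {token: idx for idx, token in enumerate(rev_vocab)}
    return vocab, rev_vocab

# Usage mirroring the calls above (paths are placeholders):
# normal_vocab, _ = load_vocabulary_sketch('normal.vocab')
# _, rev_simple_vocab = load_vocabulary_sketch('simple.vocab')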
Example #2
def main():

    args, settings = parse_args_and_settings()
    logger = output_utils.Logger(args)
    logger.shout('python main.py ' + ' '.join(sys.argv[1:]))

    num_threads = 5
    torch.set_num_threads(num_threads)

    # Load entity map (needed for all phases; primarily for loading data):
    entity_idx_to_name, entity_name_to_idx = data_utils.load_entity_map(settings.data.entity_map)

    if args.phase == 'train' or args.phase == 'deploy':

        # Train data is needed for the train phase, and also for deploy if the vocabulary doesn't exist yet:
        train_data = None
        if args.phase == 'train' or not os.path.exists(settings.data.vocabulary):
            train_data = data_utils.load_data(settings.data.dataset, entity_name_to_idx, with_keys=True, logger=logger)

        # Load vocabulary (and extract from train_data if vocabulary doesn't exist yet)
        vocabulary_idx_to_word, vocabulary_word_to_idx = data_utils.get_vocabulary(settings.data.vocabulary,
                                                                                   extract_from=train_data,
                                                                                   logger=logger)

        # Avoid loading/generating google news embeddings in deploy phase:
        if args.phase == 'deploy' and settings.model.token_emb == config_utils.data_paths["embeddings"]["google_news"]:
            settings.model.token_emb = 300
            # Appropriate embeddings will be loaded anyway from saved .pt model file.
            # TODO: This won't generalize when using other embeddings.

        # Load embeddings if needed:
        if isinstance(settings.model.token_emb, str):
            settings.model.token_emb = embedding_loader.load_word_embeddings(settings.model.token_emb,
                                                                             settings.data.dataset, train_data, logger)
        if isinstance(settings.model.speaker_emb, str):
            settings.model.speaker_emb = embedding_loader.load_entity_embeddings(settings.model.speaker_emb,
                                                                                 settings.data.entity_map, logger)
        # Compute and store some dependent parameters for convenience:
        settings.model.vocabulary_size = len(vocabulary_idx_to_word)
        settings.model.num_entities = len(entity_idx_to_name)

    if args.phase == 'train':
        logger.save_config(settings.orig)
        logger.say(output_utils.bcolors.BOLD + 'Training on ' + settings.data.dataset)
        run_training(settings, train_data, vocabulary_idx_to_word, vocabulary_word_to_idx, logger, not args.no_cuda)

    if args.phase == 'deploy':
        logger.say(output_utils.bcolors.BOLD + 'Deploying ' + str(len(args.model)) + ' models (' + (
            args.run_name if len(args.model) > 1 else args.model[0]) + ')...\n   ...on ' + ('folds of ' if not args.no_cv else '') + args.deploy_data)
        args.answer_file, with_keys = run_deploy(args.model, settings, args.deploy_data, vocabulary_idx_to_word, vocabulary_word_to_idx, entity_name_to_idx, args.answers_per_fold, args.no_cv, logger, not args.no_cuda)
        # After deploying, evaluate (unless not desired or data does not contain reference keys):
        if not args.no_eval:
            if with_keys is True:
                args.phase = 'evaluate'
            else:
                logger.shout('Warning: Model predictions will not be evaluated, since the given data does not contain reference labels.')

    if args.phase == 'evaluate':
        logger.say(output_utils.bcolors.BOLD + 'Evaluating ' + ('(not SemEval style) ' if args.no_semeval else '(SemEval style) ') + 'predictions of ' + args.answer_file)
        run_evaluate(args.answer_file, args.deploy_data, entity_name_to_idx, entity_idx_to_name, args.no_semeval, logger)
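A small convention carries the deploy branch above: settings.model.token_emb is either a path string (pretrained vectors to load) or an int (the embedding dimension, with the actual weights restored from the saved .pt checkpoint). A minimal sketch of that resolution step, where load_pretrained is a hypothetical callable standing in for embedding_loader.load_word_embeddings:

# Sketch of the int-or-path convention for token_emb used in main() above.
# load_pretrained is a hypothetical stand-in for the real embedding loader.
def resolve_token_emb(token_emb, load_pretrained):
    if isinstance(token_emb, str):
        # A string is a path: load the pretrained (vocab_size, dim) matrix.
        return load_pretrained(token_emb)
    # An int is just the embedding dimension; weights come from the checkpoint.
    return int(token_emb)

# Dummy usage:
# matrix = resolve_token_emb('vectors.bin', lambda path: [[0.0] * 300])
# dim = resolve_token_emb(300, lambda path: None)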
Example #3
def load_word_embeddings(embeddings_fname,
                         training_datapath,
                         training_data,
                         logger=None):
    """
    :param embeddings_fname: The name of the file containing pre-trained embeddings. 
            E.g., the Google-news w2v embeddings
    :param training_datapath: The name of the file containing the training data for 
            a model which uses word embeddings (loaded from embeddings_fname). 
    """
    # vocab_fname: The name of the file containing the relevant vocabulary.
    #        Each line contains the word idx and the word, separated by tabs ("\t").
    vocab_fname = training_datapath.replace(".conll", ".vocab")
    word_emb_fname = data_utils.get_embeddings_path_for_vocab(
        embeddings_fname, vocab_fname)
    if os.path.exists(word_emb_fname):
        if logger:
            logger.whisper(
                "Loading token embedding from {0}".format(word_emb_fname))
        word_embeddings = np.load(word_emb_fname)
    else:
        vocabulary_idx_to_word, _ = data_utils.get_vocabulary(
            vocab_fname, extract_from=training_data, logger=logger)
        all_word_vectors = load_word2vec_embeddings(embeddings_fname)
        word_embeddings, _, _ = filter_embeddings(all_word_vectors,
                                                  vocabulary_idx_to_word)
        save_word_embeddings(word_embeddings, word_emb_fname)
    return word_embeddings
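filter_embeddings is not shown here; judging from how its result is used, it restricts the full pretrained vector table to the words in the vocabulary. Below is a rough, self-contained numpy sketch of that idea, assuming the pretrained vectors behave like a dict of word -> 1-D array and that out-of-vocabulary words receive small random vectors; it is a guess at the behaviour, not the project's implementation.

import numpy as np

def filter_embeddings_sketch(all_word_vectors, vocabulary_idx_to_word, dim=300, seed=0):
    """Build a (vocab_size, dim) matrix aligned with the vocabulary order.

    all_word_vectors is assumed to be a mapping word -> 1-D numpy array;
    out-of-vocabulary words get small random vectors.
    """
    rng = np.random.RandomState(seed)
    rows = []
    for word in vocabulary_idx_to_word:
        vec = all_word_vectors.get(word)
        rows.append(vec if vec is not None else rng.uniform(-0.1, 0.1, dim))
    return np.vstack(rows)

# Example: filter_embeddings_sketch({'the': np.ones(300)}, ['the', 'some_oov_word'])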
Example #4
def input_test():
    """Input your own sentence and see how the model interprets it."""
    with tf.Session() as sess:
        model = create_model(sess, True)
        model.batch_size = 1

        normal_vocab, _ = data_utils.get_vocabulary(dc.NORMAL_VOCAB_PATH)
        _, rev_simple_vocab = data_utils.get_vocabulary(dc.SIMPLE_VOCAB_PATH)

        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()

        while sentence:
            translation = pipe_sentence(sentence, normal_vocab,
                                        rev_simple_vocab, sess, model)

            print(translation)

            sys.stdout.write("> ")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
Example #5
def get_data(args):
    df = pd.read_csv(PROTO_TSV, sep='\t')

    # Sentences
    sent_ids = set(df['Sentence.ID'].tolist())
    print(f'There are {len(sent_ids)} unique sentences.')
    sents_path = os.path.join(PICKLED_DIR, 'sents.pkl')
    sents = None
    if os.path.exists(sents_path) and 'sents' not in args.init_list:
        with open(sents_path, 'rb') as f:
            sents = pickle.load(f)
    else:
        with open(sents_path, 'wb') as f:
            sents = data_utils.get_nltk_sents(sent_ids)
            pickle.dump(sents, f)

    # Dependency data
    dependencies_path = os.path.join(PICKLED_DIR, 'dependencies.pkl')
    if os.path.exists(dependencies_path) and 'deps' not in args.init_list:
        with open(dependencies_path, 'rb') as f:
            deps, deps_just_tokens = pickle.load(f)
    else:
        with open(dependencies_path, 'wb') as f:
            deps, deps_just_tokens = data_utils.get_dependencies(sent_ids)
            pickle.dump((deps, deps_just_tokens), f)
    sents['dependencies'] = deps
    sents['deps_just_tokens'] = deps_just_tokens

    # Instances
    instances_path = os.path.join(PICKLED_DIR, 'instances.pkl')
    proto_instances = None
    possible = None  # Data to compare to SPRL paper
    if os.path.exists(instances_path) and 'instances' not in args.init_list:
        with open(instances_path, 'rb') as f:
            proto_instances, possible = pickle.load(f)
    else:
        proto_instances, possible = data_utils.build_instance_list(df)
        data_utils.add_pred_args(proto_instances, sents['trees'])
        with open(instances_path, 'wb') as f:
            pickle.dump((proto_instances, possible), f)

    # Matching between raw and dependency data
    if args.model_type != 'lstm':
        data_utils.match_conllu_to_raw(sents['raw'], deps)
        # No corresponding overwrite here, since logreg runs on the local machine.
    else:
        data_utils.match_raw_to_conllu(proto_instances, sents['raw'],
                                       deps_just_tokens)
        with open(instances_path, 'wb') as f:
            pickle.dump((proto_instances, possible), f)

    # Word embedding data
    sent_ids = {}  # Redefining sent_ids for this section
    for split in SPLITS:
        sent_ids[split] = [pt['Sentence.ID'] for pt in proto_instances[split]]
    w2e = None
    glove_path = os.path.join(PICKLED_DIR, f'glove_{args.glove_d}.pkl')
    if os.path.exists(glove_path) and 'glove' not in args.init_list:
        with open(glove_path, 'rb') as f:
            w2e = pickle.load(f)
    else:
        vocab = data_utils.get_vocabulary(deps_just_tokens)
        w2e = data_utils.w2e_from_file(GLOVE_FILE[args.glove_d], vocab=vocab)
        with open(glove_path, 'wb') as f:
            pickle.dump(w2e, f)

    w2i, i2w = None, None
    emb_np = None
    X, y = None, None
    if args.model_type == 'lstm':
        dicts_path = os.path.join(PICKLED_DIR, 'dicts.pkl')
        if os.path.exists(dicts_path) and 'dicts' not in args.init_list:
            with open(dicts_path, 'rb') as f:
                w2i, i2w = pickle.load(f)
        else:
            w2i, i2w = data_utils.build_dicts(sents['deps_just_tokens'],
                                              sent_ids=sent_ids,
                                              glove_vocab=sorted(w2e.keys()))
            with open(dicts_path, 'wb') as f:
                pickle.dump((w2i, i2w), f)

        emb_np_path = os.path.join(PICKLED_DIR, 'emb_np.pkl')
        if os.path.exists(emb_np_path) and 'emb_np' not in args.init_list:
            with open(emb_np_path, 'rb') as f:
                emb_np = pickle.load(f)
        else:
            emb_np = data_utils.build_emb_np(w2e, w2i=w2i, i2w=i2w)
            with open(emb_np_path, 'wb') as f:
                pickle.dump(emb_np, f)

        lstm_data_path = os.path.join(PICKLED_DIR, 'lstm_data.pkl')
        if os.path.exists(lstm_data_path) and 'lstm_data' not in args.init_list:
            with open(lstm_data_path, 'rb') as f:
                X, y = pickle.load(f)
        else:
            # Proto instances modified in-place here
            data_utils.get_arg_head_idx(proto_instances, sents['dependencies'],
                                        sents['deps_just_tokens'])
            with open(instances_path, 'wb') as f:
                pickle.dump((proto_instances, possible), f)

            numericalized = data_utils.numericalize(sents['deps_just_tokens'],
                                                    w2i)
            X = {}
            y = {}
            for split in SPLITS:
                X[split], y[split] = data_utils.get_ins_outs_lstm(
                    proto_instances[split], numericalized)
            with open(lstm_data_path, 'wb') as f:
                pickle.dump((X, y), f)

    num_instances = sum([len(x) for x in proto_instances.values()])
    print(f'There are {num_instances} instances.')

    return {
        'df': df,
        'proto_instances': proto_instances,
        'possible': possible,
        'sents': sents,
        'w2e': w2e,
        'sent_ids': sent_ids,
        'lstm_data': (X, y),
        'dicts': (w2i, i2w),
        'emb_np': emb_np
    }
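get_data repeats the same load-or-build-and-pickle step for each artifact (sents, dependencies, instances, GloVe vectors, dicts, emb_np, lstm_data). As a usage note, that pattern can be factored into one helper; the sketch below is only an illustrative refactor, not part of the original code, and load_or_build / build_fn / force are hypothetical names.

import os
import pickle

def load_or_build(path, build_fn, force=False):
    """Load a pickled artifact if it exists, otherwise build and cache it.

    'force' plays the role of the args.init_list check in get_data above,
    and build_fn produces the object when it has to be (re)built.
    """
    if os.path.exists(path) and not force:
        with open(path, 'rb') as f:
            return pickle.load(f)
    obj = build_fn()
    with open(path, 'wb') as f:
        pickle.dump(obj, f)
    return obj

# e.g. sents = load_or_build(sents_path,
#                            lambda: data_utils.get_nltk_sents(sent_ids),
#                            force='sents' in args.init_list)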