Example #1
def load_data(use_test, model_loc):
    """
    Load the data into a train/val/test set that allows for easy access.

    @return bag-of-word representation of training, validation, test sets (with labels).
    """    

    t_ids = set(list(preprocessor.train_document_ids()))
    te_ids = set(list(preprocessor.test_document_ids()))
    val_ids = set(list(preprocessor.validation_document_ids()))
    train_Xy, inference_vectorizer = preprocessor.get_train_Xy(t_ids, sections_of_interest=None, vocabulary_file=None, include_sentence_span_splits = True)

    # load model
    model = load_model_scan(inference_vectorizer, model_loc)
        
    # create an internal validation set from the training data; use 90% for training and 10% for validation.
    random.shuffle(train_Xy)
    
    if not use_test:
        split_index = int(len(train_Xy) * .9)
        val_Xy = train_Xy[split_index:]
        train_Xy = train_Xy[:split_index]
        test_Xy = preprocessor.get_Xy(set(list(preprocessor.validation_document_ids())), inference_vectorizer, sections_of_interest=None, include_sentence_span_splits = True) 
    else:
        val_Xy = preprocessor.get_Xy(val_ids, inference_vectorizer, sections_of_interest=None, include_sentence_span_splits = True) 
        test_Xy = preprocessor.get_Xy(te_ids, inference_vectorizer, sections_of_interest=None, include_sentence_span_splits = True) 
        
    
    x_train, y_train = reformat(train_Xy, inference_vectorizer, model)
    x_val, y_val     = reformat(val_Xy, inference_vectorizer, model)
    x_test, y_test   = reformat(test_Xy, inference_vectorizer, model)
    return x_train, y_train, x_val, y_val, x_test, y_test
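A minimal usage sketch for load_data above, assuming preprocessor, load_model_scan, and reformat are importable from the surrounding project; the checkpoint path is a placeholder, not a path the repository guarantees:

# Hypothetical call; './scan_net.pth' is an assumed checkpoint location.
x_train, y_train, x_val, y_val, x_test, y_test = load_data(
    use_test=False,              # carve a 10% validation split out of the training data
    model_loc='./scan_net.pth')
print(len(x_train), len(x_val), len(x_test))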
Example #2
def run_scan_net_regression(loc = './scan_net.pth'):
    train_Xy, inference_vectorizer = preprocessor.get_train_Xy(set(list(preprocessor.train_document_ids())), sections_of_interest=None, vocabulary_file=None, include_sentence_span_splits = True)
        
    if not USE_TEST:
        # create an internal validation set from the training data; use 90% for training and 10% for validation.
        split_index = int(len(train_Xy) * .9)
        val_Xy = train_Xy[split_index:]
        train_Xy = train_Xy[:split_index]
        test_Xy = preprocessor.get_Xy(preprocessor.validation_document_ids(), inference_vectorizer, sections_of_interest=None, include_sentence_span_splits = True) 
    else:
        val_Xy = preprocessor.get_Xy(preprocessor.validation_document_ids(), inference_vectorizer, sections_of_interest=None, include_sentence_span_splits = True) 
        test_Xy = preprocessor.get_Xy(preprocessor.test_document_ids(), inference_vectorizer, sections_of_interest=None, include_sentence_span_splits = True) 
        
    train_Xy, val_Xy, test_Xy = train_reformat(train_Xy), scan_reform(val_Xy), scan_reform(test_Xy) 
    

    
    # train with 100 epochs, a batch size of 32, and patience of 5 (early stopping)
    model = train_scan(inference_vectorizer, train_Xy, val_Xy, test_Xy, 100, 32, 5)
    
    torch.save(model.state_dict(), loc)
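Since run_scan_net_regression persists only the state dict, the model has to be re-instantiated before the weights can be restored. A rough sketch, assuming train_scan returns a ScanNet and the same inference_vectorizer is rebuilt from the training ids (use_attention=False is an assumption):

import torch

# Sketch: rebuild the model, then load the saved parameters.
model = ScanNet(inference_vectorizer, use_attention=False)
model.load_state_dict(torch.load('./scan_net.pth'))
model.eval()  # switch to evaluation mode before running inference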
Example #3
def get_data(sections_of_interest=None, mode='experiment', include_sentence_span_splits = False):
    random.seed(177)
    if mode == 'experiment':
        # raise ValueError('implement me!')
        train_docs = list(preprocessor.train_document_ids())
        random.shuffle(train_docs)
        split_index = int(len(train_docs) * .9)
        real_train_docs = train_docs[:split_index]
        real_val_docs = train_docs[split_index:]
        parent_path = abspath(os.path.join(dirname(abspath(__file__)), '..', '..'))
        vocab_f = os.path.join(parent_path, "annotations", "vocab.txt")
        real_train_Xy, inference_vectorizer = preprocessor.get_train_Xy(set(real_train_docs), sections_of_interest=sections_of_interest, vocabulary_file=vocab_f, include_sentence_span_splits = include_sentence_span_splits)
        real_val_Xy = preprocessor.get_Xy(set(real_val_docs), inference_vectorizer, sections_of_interest=sections_of_interest, include_sentence_span_splits = include_sentence_span_splits)

        # in development, our "test" set is our validation ids so we don't cheat.
        real_test_Xy = preprocessor.get_Xy(preprocessor.validation_document_ids(), inference_vectorizer, sections_of_interest, include_sentence_span_splits = include_sentence_span_splits)
        return real_train_Xy, real_val_Xy, real_test_Xy, inference_vectorizer
    elif mode == 'paper':
        parent_path = abspath(os.path.join(dirname(abspath(__file__)), '..', '..'))
        vocab_f = os.path.join(parent_path, "annotations", "vocab.txt")
        train_docs = preprocessor.train_document_ids()
        train_Xy, inference_vectorizer = preprocessor.get_train_Xy(train_docs, sections_of_interest=sections_of_interest, vocabulary_file=vocab_f, include_sentence_span_splits = include_sentence_span_splits)
        val_Xy = preprocessor.get_Xy(preprocessor.validation_document_ids(), inference_vectorizer, sections_of_interest=sections_of_interest, include_sentence_span_splits = include_sentence_span_splits)
        test_Xy = preprocessor.get_Xy(preprocessor.test_document_ids(), inference_vectorizer, sections_of_interest, include_sentence_span_splits = include_sentence_span_splits)
        return train_Xy, val_Xy, test_Xy, inference_vectorizer
    elif mode == 'minimal':
        parent_path = abspath(os.path.join(dirname(abspath(__file__)), '..', '..'))
        vocab_f = os.path.join(parent_path, "annotations", "vocab.txt")
        train_docs = list(preprocessor.train_document_ids())[:5]
        train_Xy, inference_vectorizer = preprocessor.get_train_Xy(train_docs, sections_of_interest=sections_of_interest, vocabulary_file=vocab_f, include_sentence_span_splits = include_sentence_span_splits)
        val_Xy = preprocessor.get_Xy(list(preprocessor.validation_document_ids())[:5], inference_vectorizer, sections_of_interest=sections_of_interest, include_sentence_span_splits = include_sentence_span_splits)
        test_Xy = preprocessor.get_Xy(list(preprocessor.validation_document_ids())[5:10], inference_vectorizer, sections_of_interest, include_sentence_span_splits = include_sentence_span_splits)
        return train_Xy, val_Xy, test_Xy, inference_vectorizer
    else:
        raise ValueError('implement me!')
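The mode argument above chooses between a development split ('experiment', where validation ids stand in for the test set), the published split ('paper'), and a five-document smoke test ('minimal'). A hedged usage sketch:

# Quick smoke test on a handful of documents.
train_Xy, val_Xy, test_Xy, vectorizer = get_data(mode='minimal')
print(len(train_Xy), len(val_Xy), len(test_Xy))

# Full development split, keeping sentence span splits for the scan task.
train_Xy, val_Xy, test_Xy, vectorizer = get_data(mode='experiment',
                                                 include_sentence_span_splits=True)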
Example #4
def run_scan_net_redux(loc='scan_net_redux.pth'):
    parent_path = abspath(os.path.join(dirname(abspath(__file__)), '..', '..'))
    vocab_f = os.path.join(parent_path, "annotations", "vocab.txt")
    train_Xy, inference_vectorizer = preprocessor.get_train_Xy(
        list(preprocessor.train_document_ids()),
        sections_of_interest=None,
        vocabulary_file=vocab_f,
        include_sentence_span_splits=True)

    if not USE_TEST:
        # create an internal validation set from the training data; use 90% for training and 10% for validation.
        split_index = int(len(train_Xy) * .9)
        val_Xy = train_Xy[split_index:]
        train_Xy = train_Xy[:split_index]
        test_Xy = preprocessor.get_Xy(preprocessor.validation_document_ids(),
                                      inference_vectorizer,
                                      sections_of_interest=None,
                                      include_sentence_span_splits=True)
    else:
        val_Xy = preprocessor.get_Xy(preprocessor.validation_document_ids(),
                                     inference_vectorizer,
                                     sections_of_interest=None,
                                     include_sentence_span_splits=True)
        test_Xy = preprocessor.get_Xy(preprocessor.test_document_ids(),
                                      inference_vectorizer,
                                      sections_of_interest=None,
                                      include_sentence_span_splits=True)

    if USE_CUDA:
        se_scn = ScanNet(inference_vectorizer, use_attention=use_attn).cuda()
    else:
        se_scn = ScanNet(inference_vectorizer, use_attention=use_attn)

    # train with 50 epochs, a batch size of 32, and patience of 10 (early stopping)
    train_scan(se_scn, inference_vectorizer, train_Xy, val_Xy, test_Xy, 50, 32,
               10)

    # save to specified path
    torch.save(se_scn.state_dict(), loc)
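The USE_CUDA branch above can also be written device-agnostically with the standard PyTorch idiom; this is a sketch, not the repository's own code:

import torch

# Pick the GPU when available, otherwise fall back to the CPU, and move the model once.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
se_scn = ScanNet(inference_vectorizer, use_attention=use_attn).to(device)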
Example #5
def run_scan_net_ico(loc = "scan_net_ICO_no_attn_test.pth"):
    print("Modules loaded.")
    
    parent_path = abspath(os.path.join(dirname(abspath(__file__)), '..', '..'))
    vocab_f = os.path.join(parent_path, "annotations", "vocab.txt")
    
    
    train_Xy, inference_vectorizer = preprocessor.get_train_Xy(list(preprocessor.train_document_ids()), sections_of_interest=None, 
                                                               vocabulary_file=vocab_f,
                                                               include_sentence_span_splits=True)
        
    print("Train Data Achieved")  
    if not USE_TEST:
        # create an internal validation set from the training data; use 90% for training and 10% for validation.
        split_index = int(len(train_Xy) * .9)
        val_Xy = train_Xy[split_index:]
        train_Xy = train_Xy[:split_index]
        test_Xy = preprocessor.get_Xy(list(preprocessor.validation_document_ids()), inference_vectorizer, sections_of_interest=None, include_sentence_span_splits = True) 
    else:
        val_Xy = preprocessor.get_Xy(preprocessor.validation_document_ids(), inference_vectorizer, sections_of_interest=None, include_sentence_span_splits = True) 
        test_Xy = preprocessor.get_Xy(preprocessor.test_document_ids(), inference_vectorizer, sections_of_interest=None, include_sentence_span_splits = True) 
    
    print("Test Data Achieved")  
    
    if USE_CUDA:
        se_scn = ScanNet(inference_vectorizer, use_attention=use_attn).cuda()
    else:
        se_scn = ScanNet(inference_vectorizer, use_attention=use_attn)
        
    print("Model loaded")
    # train with 50 epochs, a batch size of 32, and patience of 10 (early stopping)
    train_scan(se_scn, inference_vectorizer, train_Xy, val_Xy, 50, 32, 10)
    
    acc, f1, prc, rc, auc = test_model(se_scn, test_Xy, inference_vectorizer)
    
    # save to specified path
    #args = parser.parse_args()
    torch.save(se_scn.state_dict(), loc)
Example #6
def train():
    # train the model -- this assumes access to evidence_inference:
    # https://github.com/jayded/evidence-inference/tree/master/evidence_inference
    # which is not needed in general to load the trained model.
    #
    # if inference_true flag is on, then a model will also be fit that predicts the
    # outcome (sig. decrease, no diff, sig. increase) given punchline snippets.
    from evidence_inference.preprocess.preprocessor import get_Xy, train_document_ids, test_document_ids, validation_document_ids, get_train_Xy

    extractor_model = PunchlineExtractor()

    tr_ids, val_ids, te_ids = train_document_ids(), validation_document_ids(), test_document_ids()
    tr_ids = list(train_document_ids())
    train_Xy, inference_vectorizer = get_train_Xy(
        tr_ids,
        sections_of_interest=None,
        vocabulary_file=None,
        include_sentence_span_splits=False,
        include_raw_texts=True)
    # Create vectors and targets for extraction task
    X_k, y_k = make_Xy(train_Xy, extractor_model.bc)
    print("train data loaded!")

    val_Xy = get_Xy(val_ids, inference_vectorizer, include_raw_texts=True)
    X_kv, y_kv = make_Xy(val_Xy, extractor_model.bc, neg_samples=1)
    print("val data loaded!")

    # Fit the model!
    filepath = "punchline.weights.best.hdf5"
    checkpoint = ModelCheckpoint(filepath,
                                 monitor='val_acc',
                                 verbose=1,
                                 save_best_only=True,
                                 mode='max')
    callbacks_list = [checkpoint]

    with open("punchline_model.json", "w") as outf:
        outf.write(extractor_model.model.to_json())

    print("fitting punchline extractor!")
    extractor_model.model.fit(X_k,
                              y_k,
                              validation_data=(X_kv, y_kv),
                              callbacks=callbacks_list,
                              epochs=50)
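The checkpoint callback above stores only the best weights, and the architecture is serialized separately as JSON, so reloading the punchline extractor later looks roughly like the sketch below. It uses the standalone Keras API (swap in tensorflow.keras if that is what the project imports); the compile arguments are assumptions, not the repository's settings:

from keras.models import model_from_json

# Rebuild the architecture from JSON, then load the best checkpointed weights.
with open("punchline_model.json") as inf:
    model = model_from_json(inf.read())
model.load_weights("punchline.weights.best.hdf5")
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["acc"])  # assumed settings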
Example #7
def train_simple_inference_net(n_epochs=30):
    inf_net = SimpleInferenceNet()
    tr_ids, val_ids, te_ids = train_document_ids(), validation_document_ids(), test_document_ids()
    tr_ids = list(train_document_ids())
    train_Xy, inference_vectorizer = get_train_Xy(
        tr_ids,
        sections_of_interest=None,
        vocabulary_file=None,
        include_sentence_span_splits=False,
        include_raw_texts=True)

    X_k, y_k = make_Xy_inference(train_Xy, inf_net.bc)
    print("train data for inference task loaded!")

    val_Xy = get_Xy(val_ids, inference_vectorizer, include_raw_texts=True)
    X_kv, y_kv = make_Xy_inference(val_Xy, inf_net.bc)
    print("val data loaded!")

    filepath = "inference.weights.best.hdf5"
    checkpoint = ModelCheckpoint(filepath,
                                 monitor='val_acc',
                                 verbose=1,
                                 save_best_only=True,
                                 mode='max')
    callbacks_list = [checkpoint]

    with open("inference_model.json", "w") as outf:
        outf.write(inf_net.model.to_json())

    print("fitting inference model!")
    inf_net.model.fit(X_k,
                      y_k,
                      validation_data=(X_kv, y_kv),
                      callbacks=callbacks_list,
                      epochs=n_epochs)
print("Loading data.")
# get training data
train_Xy, inference_vectorizer = preprocessor.get_train_Xy(
    set(list(preprocessor.train_document_ids())),
    sections_of_interest=None,
    vocabulary_file=None,
    include_sentence_span_splits=True)
print("Training data loaded.")

if not USE_TEST:
    split_index = int(len(train_Xy) * .9)
    val_Xy = train_Xy[split_index:]
    train_Xy = train_Xy[:split_index]
    test_Xy = preprocessor.get_Xy(set(
        list(preprocessor.validation_document_ids())),
                                  inference_vectorizer,
                                  sections_of_interest=None,
                                  include_sentence_span_splits=True)
else:
    val_Xy = preprocessor.get_Xy(set(
        list(preprocessor.validation_document_ids())),
                                 inference_vectorizer,
                                 sections_of_interest=None,
                                 include_sentence_span_splits=True)
    test_Xy = preprocessor.get_Xy(set(list(preprocessor.test_document_ids())),
                                  inference_vectorizer,
                                  sections_of_interest=None,
                                  include_sentence_span_splits=True)

print("Test data loaded.")
Example #9
sys.path.append("../")
from evidence_inference.preprocess.preprocessor import get_Xy, train_document_ids, test_document_ids, validation_document_ids, get_train_Xy
device = torch.device('cuda')

print("loading train docs...")
tr_ids = list(train_document_ids())
train_Xy, inference_vectorizer = get_train_Xy(
    tr_ids[:100],
    sections_of_interest=None,
    vocabulary_file=None,
    include_sentence_span_splits=False,
    include_raw_texts=True)
print("done")

val_ids = list(validation_document_ids())
val_Xy = get_Xy(val_ids, inference_vectorizer, include_raw_texts=True)


def instances_from_article(article_dict, neg_samples=2, max_instances=6):
    def filter_empty(snippets):
        return [s for s in snippets if len(s) > 1]

    evidence_snippets = filter_empty(
        [snippet[1].lower() for snippet in article_dict['y']])
    positive_snippets = evidence_snippets

    if len(positive_snippets) == 0:
        print("no evidence snippets in an article!")
        return ([], [])

    max_pos = max(1, max_instances / (neg_samples + 1))
Example #10
def calculate_entropy(Xy):
    """
    Compute the expected log article length (in tokens), with each article
    weighted by number of prompts for that document.
    """
    tokens = {}  # Map article ids to token
    prompts = {}  # Map article ids to num prompts

    for d in Xy:
        n_tokens = len(d['article'])
        tokens[d['a_id']] = n_tokens

        if d['a_id'] in prompts:
            prompts[d['a_id']] += 1
        else:
            prompts[d['a_id']] = 1

    total_entropy = 0
    for art in prompts.keys():
        total_entropy += np.log(tokens[art]) * prompts[art] / len(Xy)

    return total_entropy


tr_ids, val_ids, te_ids = preprocessor.train_document_ids(), preprocessor.validation_document_ids(), preprocessor.test_document_ids()
train_Xy, inference_vectorizer = preprocessor.get_train_Xy(tr_ids)
val_Xy = preprocessor.get_Xy(val_ids, inference_vectorizer)
test_Xy = preprocessor.get_Xy(te_ids, inference_vectorizer)

print(calculate_entropy(train_Xy))
print(calculate_entropy(val_Xy))
print(calculate_entropy(test_Xy))
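For clarity, the quantity printed above is the expected log article length under the prompt distribution: each prompt contributes log(#tokens in its article) / len(Xy), so articles with more prompts weigh more heavily. A tiny self-contained check with toy data (the dict keys mirror those used in calculate_entropy):

import numpy as np

# Toy Xy: two prompts over a 100-token article, one prompt over a 50-token article.
toy_Xy = [
    {'a_id': 'A', 'article': ['tok'] * 100},
    {'a_id': 'A', 'article': ['tok'] * 100},
    {'a_id': 'B', 'article': ['tok'] * 50},
]
print(calculate_entropy(toy_Xy))              # ~4.374
print((2 * np.log(100) + np.log(50)) / 3)     # same value, computed directly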