def get_data(sections_of_interest=None, mode='experiment', include_sentence_span_splits = False):
    random.seed(177)
    if mode == 'experiment':
        # raise ValueError('implement me!')
        train_docs = list(preprocessor.train_document_ids())
        random.shuffle(train_docs)
        split_index = int(len(train_docs) * .9)
        real_train_docs = train_docs[:split_index]
        real_val_docs = train_docs[split_index:]
        parent_path = abspath(os.path.join(dirname(abspath(__file__)), '..', '..'))
        vocab_f = os.path.join(parent_path, "annotations", "vocab.txt")
        real_train_Xy, inference_vectorizer = preprocessor.get_train_Xy(set(real_train_docs), sections_of_interest=sections_of_interest, vocabulary_file=vocab_f, include_sentence_span_splits = include_sentence_span_splits)
        real_val_Xy = preprocessor.get_Xy(set(real_val_docs), inference_vectorizer, sections_of_interest=sections_of_interest, include_sentence_span_splits = include_sentence_span_splits)

        # in development, our "test" set is our validation ids so we don't cheat.
        real_test_Xy = preprocessor.get_Xy(preprocessor.validation_document_ids(), inference_vectorizer, sections_of_interest=sections_of_interest, include_sentence_span_splits = include_sentence_span_splits)
        return real_train_Xy, real_val_Xy, real_test_Xy, inference_vectorizer
    elif mode == 'paper':
        parent_path = abspath(os.path.join(dirname(abspath(__file__)), '..', '..'))
        vocab_f = os.path.join(parent_path, "annotations", "vocab.txt")
        train_docs = preprocessor.train_document_ids()
        train_Xy, inference_vectorizer = preprocessor.get_train_Xy(train_docs, sections_of_interest=sections_of_interest, vocabulary_file=vocab_f, include_sentence_span_splits = include_sentence_span_splits)
        val_Xy = preprocessor.get_Xy(preprocessor.validation_document_ids(), inference_vectorizer, sections_of_interest=sections_of_interest, include_sentence_span_splits = include_sentence_span_splits)
        test_Xy = preprocessor.get_Xy(preprocessor.test_document_ids(), inference_vectorizer, sections_of_interest=sections_of_interest, include_sentence_span_splits = include_sentence_span_splits)
        return train_Xy, val_Xy, test_Xy, inference_vectorizer
    elif mode == 'minimal':
        parent_path = abspath(os.path.join(dirname(abspath(__file__)), '..', '..'))
        vocab_f = os.path.join(parent_path, "annotations", "vocab.txt")
        train_docs = list(preprocessor.train_document_ids())[:5]
        train_Xy, inference_vectorizer = preprocessor.get_train_Xy(train_docs, sections_of_interest=sections_of_interest, vocabulary_file=vocab_f, include_sentence_span_splits = include_sentence_span_splits)
        val_Xy = preprocessor.get_Xy(list(preprocessor.validation_document_ids())[:5], inference_vectorizer, sections_of_interest=sections_of_interest, include_sentence_span_splits = include_sentence_span_splits)
        test_Xy = preprocessor.get_Xy(list(preprocessor.validation_document_ids())[5:10], inference_vectorizer, sections_of_interest=sections_of_interest, include_sentence_span_splits = include_sentence_span_splits)
        return train_Xy, val_Xy, test_Xy, inference_vectorizer
    else:
        raise ValueError('implement me!')
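A minimal usage sketch for get_data, assuming the evidence_inference preprocessor is on the path; 'minimal' mode loads only five documents per split, which keeps quick checks fast:

# Hedged usage sketch: 'minimal' mode keeps runtimes short for smoke tests.
train_Xy, val_Xy, test_Xy, inference_vectorizer = get_data(mode='minimal')
print(len(train_Xy), len(val_Xy), len(test_Xy))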
Example #2
def load_data(use_test, model_loc):
    """
    Load the data into a train/val/test set that allows for easy access.

    @return bag-of-word representation of training, validation, test sets (with labels).
    """    

    t_ids = set(list(preprocessor.train_document_ids()))
    te_ids = set(list(preprocessor.test_document_ids()))
    val_ids = set(list(preprocessor.validation_document_ids()))
    train_Xy, inference_vectorizer = preprocessor.get_train_Xy(t_ids, sections_of_interest=None, vocabulary_file=None, include_sentence_span_splits = True)

    # load model
    model = load_model_scan(inference_vectorizer, model_loc)
        
    # create an internal validation set from the training data; use 90% for training and 10% for validation.
    random.shuffle(train_Xy)
    
    if not(use_test):
        split_index = int(len(train_Xy) * .9)
        val_Xy = train_Xy[split_index:]
        train_Xy = train_Xy[:split_index]
        test_Xy = preprocessor.get_Xy(set(list(preprocessor.validation_document_ids())), inference_vectorizer, sections_of_interest=None, include_sentence_span_splits = True) 
    else:
        val_Xy = preprocessor.get_Xy(val_ids, inference_vectorizer, sections_of_interest=None, include_sentence_span_splits = True) 
        test_Xy = preprocessor.get_Xy(te_ids, inference_vectorizer, sections_of_interest=None, include_sentence_span_splits = True) 
        
    
    x_train, y_train = reformat(train_Xy, inference_vectorizer, model)
    x_val, y_val     = reformat(val_Xy, inference_vectorizer, model)
    x_test, y_test   = reformat(test_Xy, inference_vectorizer, model)
    return x_train, y_train, x_val, y_val, x_test, y_test
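A hedged call sketch for this loader; the model path below is hypothetical, and use_test=False scores against the validation documents so the real test set stays untouched:

# Hypothetical model path; use_test=False keeps the held-out test set unused.
x_train, y_train, x_val, y_val, x_test, y_test = load_data(
    use_test=False, model_loc='./models/scan_model.pth')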
Example #3
def main():
    # Load in the data
    prompts = np.asarray(preprocessor.read_prompts())
    annotations = np.asarray(preprocessor.read_annotations())
    data = {}
    preds, y_test = [], []

    
    # store data in dictionary
    for p in prompts:
        data[p[0]] = {'xml': p[1], 'outcome': p[2], 'intervention': p[3], 'comparator': p[4], 'answer': '', 'reasoning': ''}
        
    for a in annotations:
        if (a[3]):
            data[a[1]]['answer'] = a[7]
        if (a[4]):
            data[a[1]]['reasoning'] += str(a[6])
       
    test_id = preprocessor.test_document_ids()
    # get predictions and add them to array
    for k in data.keys():
        # try to parse text to remove weird things
        id_   = data[k]['xml']
        
        # if the file is not a test file
        if id_ not in test_id:
            continue 
        
        out   = try_except_parse(data[k]['outcome'])
        inter = try_except_parse(data[k]['intervention'])
        cmp   = try_except_parse(data[k]['comparator'])
        ans   = try_except_parse(data[k]['answer'])
        res   = try_except_parse(data[k]['reasoning'])
        
        if (ans == ''):
            continue # we don't have a valid answer for this one... 
            
        y_test.append(ans)

        # just use the reasoning as our sentence        
        likely_sentence = res
        guess = eval_sentence(likely_sentence, out, inter, cmp)
        
        if (guess == "No significant difference"):
            preds.append(0)
        elif (guess == "Significantly decreased"):
            preds.append(-1)
        else:
            preds.append(1)
         
    acc  = accuracy_score(y_test, preds)
    f1   = f1_score(y_test, preds, average='macro')
    prec = precision_score(y_test, preds, average = 'macro')
    rec  = recall_score(y_test, preds, average = 'macro')
        
    return acc, f1, prec, rec
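For reference, a small self-contained illustration of the macro-averaged metrics computed above, on toy labels drawn from the same {-1, 0, 1} coding:

# Toy illustration of macro averaging over the classes {-1, 0, 1}.
from sklearn.metrics import accuracy_score, f1_score
y_true = [1, 0, -1, 1, 0]
y_pred = [1, 0, 1, 1, -1]
print(accuracy_score(y_true, y_pred))             # 0.6
print(f1_score(y_true, y_pred, average='macro'))  # unweighted mean of per-class F1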
def train():
    # train the model -- this assumes access to evidence_inference:
    # https://github.com/jayded/evidence-inference/tree/master/evidence_inference
    # which is not needed in general to load the trained model.
    #
    # if inference_true flag is on, then a model will also be fit that predicts the
    # outcome (sig. decrease, no diff, sig. increase) given punchline snippets.
    from evidence_inference.preprocess.preprocessor import get_Xy, train_document_ids, test_document_ids, validation_document_ids, get_train_Xy

    extractor_model = PunchlineExtractor()

    tr_ids, val_ids, te_ids = (train_document_ids(), validation_document_ids(),
                               test_document_ids())
    tr_ids = list(tr_ids)
    train_Xy, inference_vectorizer = get_train_Xy(
        tr_ids,
        sections_of_interest=None,
        vocabulary_file=None,
        include_sentence_span_splits=False,
        include_raw_texts=True)
    # Create vectors and targets for extraction task
    X_k, y_k = make_Xy(train_Xy, extractor_model.bc)
    print("train data loaded!")

    val_Xy = get_Xy(val_ids, inference_vectorizer, include_raw_texts=True)
    X_kv, y_kv = make_Xy(val_Xy, extractor_model.bc, neg_samples=1)
    print("val data loaded!")

    # Fit the model!
    filepath = "punchline.weights.best.hdf5"
    checkpoint = ModelCheckpoint(filepath,
                                 monitor='val_acc',
                                 verbose=1,
                                 save_best_only=True,
                                 mode='max')
    callbacks_list = [checkpoint]

    with open("punchline_model.json", "w") as outf:
        outf.write(extractor_model.model.to_json())

    print("fitting punchline extractor!")
    extractor_model.model.fit(X_k,
                              y_k,
                              validation_data=(X_kv, y_kv),
                              callbacks=callbacks_list,
                              epochs=50)
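A hedged sketch of restoring the checkpointed extractor later, using the standard Keras model_from_json / load_weights round trip and the file names written above:

# Rebuild the punchline extractor from the saved architecture and best weights.
from keras.models import model_from_json
with open("punchline_model.json") as inf:
    restored = model_from_json(inf.read())
restored.load_weights("punchline.weights.best.hdf5")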
Example #5
def run_scan_net_regression(loc = './scan_net.pth'):
    train_Xy, inference_vectorizer = preprocessor.get_train_Xy(set(list(preprocessor.train_document_ids())), sections_of_interest=None, vocabulary_file=None, include_sentence_span_splits = True)
        
    if not(USE_TEST):
        # create an internal validation set from the training data; use 90% for training and 10% for validation.
        split_index = int(len(train_Xy) * .9)
        val_Xy = train_Xy[split_index:]
        train_Xy = train_Xy[:split_index]
        test_Xy = preprocessor.get_Xy(preprocessor.validation_document_ids(), inference_vectorizer, sections_of_interest=None, include_sentence_span_splits = True) 
    else:
        val_Xy = preprocessor.get_Xy(preprocessor.validation_document_ids(), inference_vectorizer, sections_of_interest=None, include_sentence_span_splits = True) 
        test_Xy = preprocessor.get_Xy(preprocessor.test_document_ids(), inference_vectorizer, sections_of_interest=None, include_sentence_span_splits = True) 
        
    train_Xy, val_Xy, test_Xy = train_reformat(train_Xy), scan_reform(val_Xy), scan_reform(test_Xy) 
    

    
    # train with 100 epochs, a batch size of 32, and a patience of 5 (early stopping)
    model = train_scan(inference_vectorizer, train_Xy, val_Xy, test_Xy, 100, 32, 5)
    
    torch.save(model.state_dict(), loc)
def run_scan_net_redux(loc='scan_net_redux.pth'):
    parent_path = abspath(os.path.join(dirname(abspath(__file__)), '..', '..'))
    vocab_f = os.path.join(parent_path, "annotations", "vocab.txt")
    train_Xy, inference_vectorizer = preprocessor.get_train_Xy(
        list(preprocessor.train_document_ids()),
        sections_of_interest=None,
        vocabulary_file=vocab_f,
        include_sentence_span_splits=True)

    if not (USE_TEST):
        # create an internal validation set from the training data; use 90% for training and 10% for validation.
        split_index = int(len(train_Xy) * .9)
        val_Xy = train_Xy[split_index:]
        train_Xy = train_Xy[:split_index]
        test_Xy = preprocessor.get_Xy(preprocessor.validation_document_ids(),
                                      inference_vectorizer,
                                      sections_of_interest=None,
                                      include_sentence_span_splits=True)
    else:
        val_Xy = preprocessor.get_Xy(preprocessor.validation_document_ids(),
                                     inference_vectorizer,
                                     sections_of_interest=None,
                                     include_sentence_span_splits=True)
        test_Xy = preprocessor.get_Xy(preprocessor.test_document_ids(),
                                      inference_vectorizer,
                                      sections_of_interest=None,
                                      include_sentence_span_splits=True)

    if USE_CUDA:
        se_scn = ScanNet(inference_vectorizer, use_attention=use_attn).cuda()
    else:
        se_scn = ScanNet(inference_vectorizer, use_attention=use_attn)

    # train with 50 epochs, a batch size of 32, and a patience of 10 (early stopping)
    train_scan(se_scn, inference_vectorizer, train_Xy, val_Xy, test_Xy, 50, 32,
               10)

    # save to specified path
    torch.save(se_scn.state_dict(), loc)
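A hedged sketch of reloading the saved weights via the usual PyTorch state_dict round trip; the constructor arguments mirror the snippet above:

# Reload the saved ScanNet weights for evaluation.
se_scn = ScanNet(inference_vectorizer, use_attention=use_attn)
se_scn.load_state_dict(torch.load('scan_net_redux.pth'))
se_scn.eval()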
Example #7
def run_scan_net_ico(loc = "scan_net_ICO_no_attn_test.pth"):
    print("Modules loaded.")
    
    parent_path = abspath(os.path.join(dirname(abspath(__file__)), '..', '..'))
    vocab_f = os.path.join(parent_path, "annotations", "vocab.txt")
    
    
    train_Xy, inference_vectorizer = preprocessor.get_train_Xy(list(preprocessor.train_document_ids()), sections_of_interest=None, 
                                                               vocabulary_file=vocab_f,
                                                               include_sentence_span_splits=True)
        
    print("Train Data Achieved")  
    if not(USE_TEST):
        # create an internal validation set from the training data; use 90% for training and 10% for validation.
        split_index = int(len(train_Xy) * .9)
        val_Xy = train_Xy[split_index:]
        train_Xy = train_Xy[:split_index]
        test_Xy = preprocessor.get_Xy(list(preprocessor.validation_document_ids()), inference_vectorizer, sections_of_interest=None, include_sentence_span_splits = True) 
    else:
        val_Xy = preprocessor.get_Xy(preprocessor.validation_document_ids(), inference_vectorizer, sections_of_interest=None, include_sentence_span_splits = True) 
        test_Xy = preprocessor.get_Xy(preprocessor.test_document_ids(), inference_vectorizer, sections_of_interest=None, include_sentence_span_splits = True) 
    
    print("Test Data Achieved")  
    
    if USE_CUDA:
        se_scn = ScanNet(inference_vectorizer, use_attention=use_attn).cuda()
    else:
        se_scn = ScanNet(inference_vectorizer, use_attention=use_attn)
        
    print("Model loaded")
    # train with 50 epochs, a batch size of 32, and a patience of 10 (early stopping)
    train_scan(se_scn, inference_vectorizer, train_Xy, val_Xy, 50, 32, 10)
    
    acc, f1, prc, rc, auc = test_model(se_scn, test_Xy, inference_vectorizer)
    
    # save to specified path
    #args = parser.parse_args()
    torch.save(se_scn.state_dict(), loc)
def train_simple_inference_net(n_epochs=30):
    inf_net = SimpleInferenceNet()
    tr_ids, val_ids, te_ids = (train_document_ids(), validation_document_ids(),
                               test_document_ids())
    tr_ids = list(tr_ids)
    train_Xy, inference_vectorizer = get_train_Xy(
        tr_ids,
        sections_of_interest=None,
        vocabulary_file=None,
        include_sentence_span_splits=False,
        include_raw_texts=True)

    X_k, y_k = make_Xy_inference(train_Xy, inf_net.bc)
    print("train data for inference task loaded!")

    val_Xy = get_Xy(val_ids, inference_vectorizer, include_raw_texts=True)
    X_kv, y_kv = make_Xy_inference(val_Xy, inf_net.bc)
    print("val data loaded!")

    filepath = "inference.weights.best.hdf5"
    checkpoint = ModelCheckpoint(filepath,
                                 monitor='val_acc',
                                 verbose=1,
                                 save_best_only=True,
                                 mode='max')
    callbacks_list = [checkpoint]

    with open("inference_model.json", "w") as outf:
        outf.write(inf_net.model.to_json())

    print("fitting inference model!")
    inf_net.model.fit(X_k,
                      y_k,
                      validation_data=(X_kv, y_kv),
                      callbacks=callbacks_list,
                      epochs=n_epochs)
def load_data(use_test, bow=True):
    """
    Load the data into a train/val/test set that allows for easy access.

    @return bag-of-word representation of training, validation, test sets (with labels).
    """

    prompts = preprocessor.read_prompts()
    annotations = preprocessor.read_annotations()

    # filter out prompts for which we do not have annotations for whatever reason
    # this was actually just one case; not sure what was going on there.
    def have_annotations_for_prompt(prompt_id):
        return len(
            annotations[annotations[PROMPT_ID_COL_NAME] == prompt_id]) > 0

    prompts = [
        prompt for row_idx, prompt in prompts.iterrows()
        if have_annotations_for_prompt(prompt[PROMPT_ID_COL_NAME])
    ]
    prompts = pd.DataFrame(prompts)

    # Sort into training and validation by article id
    train_doc_ids = preprocessor.train_document_ids()
    val_doc_ids = preprocessor.validation_document_ids()
    test_doc_ids = preprocessor.test_document_ids()

    # get a dev set randomly
    dev_doc_ids = list(train_doc_ids)
    random.shuffle(dev_doc_ids)
    dev_doc_ids = set(dev_doc_ids[:int(len(dev_doc_ids) * .1)])

    x_train, y_train, x_dev, y_dev, x_val, y_val, x_test, y_test = [], [], [], [], [], [], [], []
    pids = prompts[STUDY_ID_COL].values
    for i in range(len(pids)):
        annotations_for_prompt = annotations[annotations[PROMPT_ID_COL_NAME] ==
                                             prompts["PromptID"].values[i]]
        labels = annotations_for_prompt[[LBL_COL_NAME,
                                         EVIDENCE_COL_NAME]].values
        id_ = pids[i]

        # this is all of the reasonings
        articles = [a[1] for a in labels]

        for article_text in articles:
            # extract i/c/o
            out = prompts["Outcome"].values[i].lower()
            inter = prompts["Intervention"].values[i].lower()
            cmp = prompts["Comparator"].values[i].lower()

            # add to correct pile: train/val/test
            tmp = [article_text, out, inter, cmp]
            loss = stats.mode([l1[0] for l1 in labels])[0][0]

            if id_ in dev_doc_ids and not (use_test):
                x_dev.append(tmp)
                y_dev.append(loss)
            elif id_ in train_doc_ids:
                x_train.append(tmp)
                y_train.append(loss)
            elif id_ in val_doc_ids:
                x_val.append(tmp)
                y_val.append(loss)
            elif id_ in test_doc_ids:
                x_test.append(tmp)
                y_test.append(loss)
            else:
                raise ValueError("Unknown study id {}".format(id_))

    # transform to np.array
    y_test = np.asarray(y_test)

    # if we are removing the test set, use validation as test set.
    if not (use_test):
        x_test = x_val
        y_test = y_val
        x_val = x_dev
        y_val = y_dev

    print("Running bag of words...")
    ret = bag_of_words(
        x_train, y_train, x_val, y_val, x_test,
        y_test) if bow else [x_train, y_train, x_val, y_val, x_test, y_test]
    return ret
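The label for each prompt above is the majority annotation via scipy's stats.mode; a toy illustration (the [0][0] indexing assumes the legacy scipy API that returns arrays):

# Majority label over annotators, as used for `loss` above (legacy scipy API).
from scipy import stats
labels = [1, 0, 1]
majority = stats.mode(labels)[0][0]  # -> 1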
if not (USE_TEST):
    split_index = int(len(train_Xy) * .9)
    val_Xy = train_Xy[split_index:]
    train_Xy = train_Xy[:split_index]
    test_Xy = preprocessor.get_Xy(set(
        list(preprocessor.validation_document_ids())),
                                  inference_vectorizer,
                                  sections_of_interest=None,
                                  include_sentence_span_splits=True)
else:
    val_Xy = preprocessor.get_Xy(set(
        list(preprocessor.validation_document_ids())),
                                 inference_vectorizer,
                                 sections_of_interest=None,
                                 include_sentence_span_splits=True)
    test_Xy = preprocessor.get_Xy(set(list(preprocessor.test_document_ids())),
                                  inference_vectorizer,
                                  sections_of_interest=None,
                                  include_sentence_span_splits=True)

print("Test data loaded.")

# modify training data
train_Xy, val_Xy, test_Xy = train_reformat(train_Xy), scan_reform(
    val_Xy), scan_reform(test_Xy)
bow = Bag_of_words(inference_vectorizer)

print("Data transformed.")

# load the model
model = load_model_scan(inference_vectorizer, './models/')
def load_data(use_test, bow=True):
    """
    Load the data into a train/val/test set that allows for easy access.

    @return bag-of-word representation of training, validation, test sets (with labels).
    """

    prompts = preprocessor.read_prompts()
    annotations = preprocessor.read_annotations()

    # filter out prompts for which we do not have annotations for whatever reason
    # this was actually just one case; not sure what was going on there.
    def have_annotations_for_prompt(prompt_id):
        return len(
            annotations[annotations[PROMPT_ID_COL_NAME] == prompt_id]) > 0

    prompts = [
        prompt for row_idx, prompt in prompts.iterrows()
        if have_annotations_for_prompt(prompt[PROMPT_ID_COL_NAME])
    ]
    prompts = pd.DataFrame(prompts)

    # Sort into training and validation by article id
    train_doc_ids = preprocessor.train_document_ids()
    val_doc_ids = preprocessor.validation_document_ids()
    test_doc_ids = preprocessor.test_document_ids()

    # get a dev set randomly
    dev_doc_ids = list(train_doc_ids)
    random.shuffle(dev_doc_ids)
    dev_doc_ids = set(dev_doc_ids[:int(len(dev_doc_ids) * .1)])

    x_train, y_train, x_dev, y_dev, x_val, y_val, x_test, y_test = [], [], [], [], [], [], [], []
    pids = prompts[STUDY_ID_COL].values
    for i in range(len(pids)):
        id_, data, losses = parse_prompt_id_data(annotations, prompts, pids, i)

        in_training = id_ in (train_doc_ids - dev_doc_ids)
        # get a reasoning from previous/next prompt id
        if (i > 0 and id_ == pids[i - 1] and not (in_training)):
            _, mismatched_data, _ = parse_prompt_id_data(
                annotations, prompts, pids, i - 1)
            # add the mismatched data here
            row = copy.deepcopy(data[0])
            row[1] = mismatched_data[0][1]
            data.append(row)
            losses.append(losses[-1])

        elif (i < len(pids) and id_ == pids[i + 1] and not (in_training)):
            _, mismatched_data, _ = parse_prompt_id_data(
                annotations, prompts, pids, i + 1)
            # add the mismatched data here
            row = copy.deepcopy(data[0])
            row[1] = mismatched_data[0][1]
            data.append(row)
            losses.append(losses[-1])

        # use a separate loop variable so the outer prompt index `i` is not shadowed
        for j in range(len(data)):
            tmp = data[j]
            loss = losses[j]

            # find where to put this section
            if id_ in dev_doc_ids and not (use_test):
                x_dev.append(tmp)
                y_dev.append(loss)
            elif id_ in train_doc_ids:
                x_train.append(tmp)
                y_train.append(loss)
            elif id_ in val_doc_ids:
                x_val.append(tmp)
                y_val.append(loss)
            elif id_ in test_doc_ids:
                x_test.append(tmp)
                y_test.append(loss)
            else:
                raise ValueError("Unknown study id {}".format(id_))

    # if we are removing the test set, use validation as test set.
    if not (use_test):
        x_test = x_val
        y_test = y_val
        x_val = x_dev
        y_val = y_dev

    ret = bag_of_words(
        x_train, y_train, x_val, y_val, x_test, y_test,
        5) if bow else [x_train, y_train, x_val, y_val, x_test, y_test]
    return ret
Example #12
def calculate_entropy(Xy):
    """
    Compute the log article length (in tokens) averaged over prompts,
    weighted by number of prompts for that document.
    """
    tokens = {}  # Map article ids to token
    prompts = {}  # Map article ids to num prompts

    for d in Xy:
        n_tokens = len(d['article'])
        tokens[d['a_id']] = n_tokens

        if d['a_id'] in prompts:
            prompts[d['a_id']] += 1
        else:
            prompts[d['a_id']] = 1

    total_entropy = 0
    for art in prompts.keys():
        total_entropy += np.log(tokens[art]) * prompts[art] / len(Xy)

    return total_entropy


tr_ids, val_ids, te_ids = (preprocessor.train_document_ids(),
                           preprocessor.validation_document_ids(),
                           preprocessor.test_document_ids())
train_Xy, inference_vectorizer = preprocessor.get_train_Xy(tr_ids)
val_Xy = preprocessor.get_Xy(val_ids, inference_vectorizer)
test_Xy = preprocessor.get_Xy(te_ids, inference_vectorizer)

print(calculate_entropy(train_Xy))
print(calculate_entropy(val_Xy))
print(calculate_entropy(test_Xy))
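Assuming one Xy entry per prompt, the printed quantity is the sum over articles of n_prompts * log(n_tokens) divided by len(Xy), i.e. the mean log article length per prompt; a tiny worked check on synthetic data:

# Worked check: two prompts from one 100-token article -> log(100) ~ 4.605.
Xy_toy = [{'a_id': 'A', 'article': ['w'] * 100},
          {'a_id': 'A', 'article': ['w'] * 100}]
print(calculate_entropy(Xy_toy))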
Example #13
    pred = preds[0].data.tolist()[0]
    return pred

print("Loading data...")
# get training data
train_Xy, inference_vectorizer = preprocessor.get_train_Xy(set(list(preprocessor.train_document_ids())), sections_of_interest=None, vocabulary_file="../../annotations/vocab.txt", include_sentence_span_splits = True)
print("Training data loaded...")  

if not(USE_TEST):
    split_index = int(len(train_Xy) * .9)
    val_Xy = train_Xy[split_index:]
    train_Xy = train_Xy[:split_index]
    test_Xy = preprocessor.get_Xy(set(list(preprocessor.validation_document_ids())), inference_vectorizer, sections_of_interest=None, include_sentence_span_splits = True) 
else:
    val_Xy = preprocessor.get_Xy(set(list(preprocessor.validation_document_ids())), inference_vectorizer, sections_of_interest=None, include_sentence_span_splits = True) 
    test_Xy = preprocessor.get_Xy(set(list(preprocessor.test_document_ids())), inference_vectorizer, sections_of_interest=None, include_sentence_span_splits = True) 
        
print("Test data loaded...")  

# modify training data
val_Xy, test_Xy = scan_reform(val_Xy), scan_reform(test_Xy) 
print("Reformatted data.")

# load the model
model = load_model_scan(inference_vectorizer, './models/scan_model_neural.pth')
print("Model loaded...")  

# after loading the model, get all predictions.
instances = val_Xy # validation set for now... 
y_preds = []
y_test  = []