Ejemplo n.º 1
0
def train_model_random_lfs(randomly_sampled_lfs, train_matrix, dev_matrix, dev_labels, test_matrix, regularization_grid):
    hyper_grid_results = defaultdict(dict)
    train_grid_results = defaultdict(dict)
    dev_grid_results = defaultdict(dict)
    test_grid_results = defaultdict(dict)
    for lf_sample in tqdm_notebook(enumerate(randomly_sampled_lfs)):
        for param in regularization_grid:

            label_model = LabelModel(k=2)
            label_model.train_model(
                train_matrix[:,lf_sample[1]], n_epochs=1000, 
                log_train_every=200, seed=100, lr=0.01, l2=param,
                verbose=False
            )
            
            hyper_grid_results[str(param)] = label_model.predict_proba(dev_matrix[:,lf_sample[1]])
       
        best_param = float(max(hyper_grid_results))
        label_model.train_model(
                train_matrix[:,lf_sample[1]], n_epochs=1000, 
                log_train_every=200, seed=50, lr=0.01, l2=best_param,
                verbose=False
        )
        
        key = f'{lf_sample[0]}:{",".join(map(str, lf_sample[1]))}'
        train_grid_results[key] = label_model.predict_proba(train_matrix[:,lf_sample[1]])
        dev_grid_results[key] = label_model.predict_proba(dev_matrix[:,lf_sample[1]])
        test_grid_results[key] = label_model.predict_proba(test_matrix[:,lf_sample[1]])
        
    return train_grid_results, dev_grid_results, test_grid_results
Ejemplo n.º 2
0
def generative_model(L_train, n_epochs=500, print_every=100):
    model = LabelModel(k=2)

    logger.info("Training generative model...")
    model.train_model(L_train, n_epochs=n_epochs, print_every=print_every)
    logger.info("Done.")

    marginals = model.predict_proba(L_train)
    return marginals
Ejemplo n.º 3
0
def generative_model(L_train, n_epochs=500, print_every=100):
    model = LabelModel(k=2)

    logger.info(f"Training generative model for...")
    model.train_model(L_train, n_epochs=n_epochs, print_every=print_every)
    logger.info("Done.")

    marginals = model.predict_proba(L_train)
    plt.hist(marginals[:, TRUE - 1], bins=20)
    plt.savefig(
        os.path.join(os.path.dirname(__file__), f"opamps_marginals.pdf"))
    return marginals
Ejemplo n.º 4
0
 def __init__(
     self,
     positive_label: str,
     class_cardinality: int = 2,
     num_epochs: int = 500,
     log_train_every: int = 50,
     seed: int = 123,
 ):
     self.positive_label = positive_label
     self.class_cardinality = class_cardinality
     self.num_epochs = num_epochs
     self.log_train_every = log_train_every
     self.seed = seed
     self.label_model = LabelModel(k=self.class_cardinality, seed=seed)
Ejemplo n.º 5
0
def apply_labellling_functions(featurizer_output):
    session = featurizer_output['session']
    cands = featurizer_output['candidate_variable']
    labeler = Labeler(session, cands)
    labeler.apply(lfs=[lfs], train=True, parallelism=config.PARALLEL)

    train_cands = []
    train_cands.append(
        session.query(featurizer_output['candidate_variable'][0]).all())
    L_train = labeler.get_label_matrices(train_cands)

    gen_model = LabelModel(k=2)
    gen_model.train_model(L_train[0], n_epochs=300, print_every=100)

    train_marginals = gen_model.predict_proba(L_train[0])

    featurizer_output['train_marginals'] = train_marginals
    return featurizer_output
Ejemplo n.º 6
0
    def test_gpustorage(self):
        # Running basics tutorial problem
        with open("tutorials/data/basics_tutorial.pkl", "rb") as f:
            X, Y, L, D = pickle.load(f)

        Xs, Ys, Ls, Ds = split_data(X,
                                    Y,
                                    L,
                                    D,
                                    splits=[0.8, 0.1, 0.1],
                                    stratify_by=Y,
                                    seed=123)

        label_model = LabelModel(k=2, seed=123)
        label_model.train_model(Ls[0],
                                Y_dev=Ys[1],
                                n_epochs=500,
                                log_train_every=25)
        Y_train_ps = label_model.predict_proba(Ls[0])

        # Creating a really large end model to use lots of memory
        end_model = EndModel([1000, 100000, 2], seed=123, device="cuda")

        # Getting initial GPU storage use
        initial_gpu_mem = GPUtil.getGPUs()[0].memoryUsed

        # Training model
        end_model.train_model(
            (Xs[0], Y_train_ps),
            valid_data=(Xs[1], Ys[1]),
            l2=0.1,
            batch_size=256,
            n_epochs=3,
            log_train_every=1,
            validation_metric="f1",
        )

        # Final GPU storage use
        final_gpu_mem = GPUtil.getGPUs()[0].memoryUsed

        # On a Titan X, this model uses ~ 3 GB of memory
        gpu_mem_difference = final_gpu_mem - initial_gpu_mem

        self.assertGreater(gpu_mem_difference, 1000)
    def load_context(self, context: PythonModelContext) -> None:
        # Configure logging for Fonduer
        init_logging(log_dir="logs")
        logger.info("loading context")

        pyfunc_conf = _get_flavor_configuration(model_path=self.model_path,
                                                flavor_name=pyfunc.FLAVOR_NAME)
        conn_string = pyfunc_conf.get(CONN_STRING, None)
        if conn_string is None:
            raise RuntimeError("conn_string is missing from MLmodel file.")
        self.parallel = pyfunc_conf.get(PARALLEL, 1)
        session = Meta.init(conn_string).Session()

        logger.info("Getting parser")
        self.corpus_parser = self._get_parser(session)
        logger.info("Getting mention extractor")
        self.mention_extractor = self._get_mention_extractor(session)
        logger.info("Getting candidate extractor")
        self.candidate_extractor = self._get_candidate_extractor(session)
        candidate_classes = self.candidate_extractor.candidate_classes

        self.model_type = pyfunc_conf.get(MODEL_TYPE, "discriminative")
        if self.model_type == "discriminative":
            self.featurizer = Featurizer(session, candidate_classes)
            with open(os.path.join(self.model_path, "feature_keys.pkl"),
                      "rb") as f:
                key_names = pickle.load(f)
            self.featurizer.drop_keys(key_names)
            self.featurizer.upsert_keys(key_names)

            disc_model = LogisticRegression()

            # Workaround to https://github.com/HazyResearch/fonduer/issues/208
            checkpoint = torch.load(
                os.path.join(self.model_path, "best_model.pt"))
            disc_model.settings = checkpoint["config"]
            disc_model.cardinality = checkpoint["cardinality"]
            disc_model._build_model()

            disc_model.load(model_file="best_model.pt",
                            save_dir=self.model_path)
            self.disc_model = disc_model
        else:
            self.labeler = Labeler(session, candidate_classes)
            with open(os.path.join(self.model_path, "labeler_keys.pkl"),
                      "rb") as f:
                key_names = pickle.load(f)
            self.labeler.drop_keys(key_names)
            self.labeler.upsert_keys(key_names)

            self.gen_models = [
                LabelModel.load(
                    os.path.join(self.model_path, _.__name__ + ".pkl"))
                for _ in candidate_classes
            ]
Ejemplo n.º 8
0
def train_baseline_model(
    train_matrix, 
    dev_matrix,
    dev_labels,
    test_matrix, 
    lf_indicies, 
    regularization_grid, 
    train_marginal_dir,
    write_file=False
):
    grid_results = {}
    dev_grid_results = {}
    test_grid_results = {}
    for param in regularization_grid:
        label_model = LabelModel(k=2)
        label_model.train_model(
            train_matrix[:,lf_indicies], n_epochs=1000, 
            log_train_every=200, seed=100, lr=0.01, l2=param,
            verbose=False, #Y_dev=dev_labels
        )
        grid_results[str(param)] = label_model.predict_proba(dev_matrix[:,lf_indicies])

    best_param = float(max(grid_results))
    label_model.train_model(
            train_matrix[:,lf_indicies], n_epochs=1000, 
            log_train_every=200, seed=50, lr=0.01, l2=best_param,
            verbose=False, #Y_dev=dev_labels
    )
    if write_file:
        (
            pd.DataFrame(
                label_model.predict_proba(train_matrix[:,lf_indicies]), 
                columns=["pos_class_marginals", "neg_class_marginals"]
            )
            .to_csv(f"{train_marginal_dir}baseline_marginals.tsv.xz", compression="xz", index=False, sep="\t")
        )

    dev_grid_results[best_param] = label_model.predict_proba(dev_matrix[:,lf_indicies])
    test_grid_results[best_param] = label_model.predict_proba(test_matrix[:,lf_indicies])

    return dev_grid_results, test_grid_results
Ejemplo n.º 9
0
    def getTrainedModel1(self):

        # We build a matrix of LF votes for each comment ticket
        LF_matrix = self.make_Ls_matrix(self.LF_set['comments'], self.LFs)

        # Get true labels for LF set
        Y_LF_set = np.array(self.LF_set['resolution'])

        display(
            lf_summary(sparse.csr_matrix(LF_matrix),
                       Y=Y_LF_set,
                       lf_names=self.LF_names.values()))

        print("label coverage: " + label_coverage(LF_matrix))

        mv = MajorityLabelVoter()
        Y_train_majority_votes = mv.predict(LF_matrix)
        print("classification report:\n" +
              classification_report(Y_LF_set, Y_train_majority_votes))

        Ls_train = self.make_Ls_matrix(self.train, self.LFs)

        # You can tune the learning rate and class balance.
        model = LabelModel(k=2, seed=123)
        trainer = model.train_model(Ls_train,
                                    n_epochs=2000,
                                    print_every=1000,
                                    lr=0.0001,
                                    class_balance=np.array([0.2, 0.8]))

        Y_train = model.predict(Ls_train) + Y_LF_set

        print('Trained Label Model Metrics:')
        scores = model.score((Ls_train[1], Y_train[1]),
                             metric=['accuracy', 'precision', 'recall', 'f1'])
        print(scores)

        return trainer, Y_train
Ejemplo n.º 10
0
    def getTrainedModel2(self):
        # Apply the LFs to the unlabeled training data
        applier = PandasLFApplier(self.LFs)
        L_train = applier.apply(self.train['comments'])

        # Train the label model and compute the training labels
        label_model = LabelModel(cardinality=2, verbose=True)
        label_model.fit(L_train, n_epochs=500, log_freq=50, seed=123)
        self.train['resolution'] = label_model.predict(
            L=L_train, tie_break_policy="abstain")
        df_train = self.train[self.train.resolution != self.ABSTAIN]

        train_text = df_train.comments.tolist()
        X_train = CountVectorizer(ngram_range=(1, 2)).fit_transform(train_text)

        clf = LogisticRegression(solver="lbfgs")
        clf.fit(X=X_train, y=df_train.resolution.values)
        prob = clf.predict_proba(self.test)

        if torch.cuda.is_available():
            device = 'cuda'
        else:
            device = 'cpu'
        end_model = EndModel([1000, 10, 2], seed=123, device=device)

        end_model.train_model(
            (self.train['comments'], self.test['comments']),
            valid_data=(self.train['resolution'], self.test['comments']),
            lr=0.01,
            l2=0.01,
            batch_size=256,
            n_epochs=5,
            checkpoint_metric='accuracy',
            checkpoint_metric_mode='max')

        return prob
Ejemplo n.º 11
0
def LabelLearner(cardinalities=2, dependencies=[], **kwargs):
    """
    A generative factory function for data programming (MeTal).
    """
    # Use single task label model if cardinality is an integer,
    # Otherwise use multi task label model
    if isinstance(cardinalities, int):
        logger.info("Using MeTaL single task label model")
        label_model = LabelModel(k=cardinalities, **kwargs)
    else:
        logger.info("Using MeTaL multi task label model")
        task_graph = TaskHierarchy(cardinalities=cardinalities,
                                   edges=dependencies)
        label_model = MTLabelModel(task_graph=task_graph, **kwargs)

    return label_model
Ejemplo n.º 12
0
    def predict_proba(self, L):
        """Returns the task marginals estimated by the model: a t-length list of
        [n,k_t] matrices where the (i,j) entry of the sth matrix represents the
        estimated P((Y_i)_s | \lambda_j(x_i))

        Args:
            L: A t-length list of [n,m] scipy.sparse label matrices with values
                in {0,1,...,k}
        """
        # First, get the estimated probability distribution over the feasible
        # set defined by the TaskGraph
        # This is an [n,k] array, where k = |(feasible set)|
        Y_pf = LabelModel.predict_proba(self, L)
        n, k = Y_pf.shape

        # Now get the per-task marginals
        # TODO: Make this optional, versus just returning the above
        Y_p = [np.zeros((n, k_t)) for k_t in self.task_graph.K]
        for yi, y in enumerate(self.task_graph.feasible_set()):
            for t in range(self.t):
                k_t = int(y[t])
                Y_p[t][:, k_t - 1] += Y_pf[:, yi]
        return Y_p
Ejemplo n.º 13
0
def train_model(args):

    #global args
    #args = parser.parse_args()

    hidden_size = 128
    num_classes = 2
    encode_dim = 1000  # using get_frm_output_size()

    L, Y = load_labels(args)

    # Label Model
    # labelling functions analysis
    print(lf_summary(L["dev"], Y=Y["dev"]))

    # training label model
    label_model = LabelModel(k=num_classes, seed=123)
    label_model.train_model(L["train"],
                            Y["dev"],
                            n_epochs=2000,
                            log_train_every=100)

    # evaluating label model
    print('Trained Label Model Metrics:')
    label_model.score((L["dev"], Y["dev"]),
                      metric=['accuracy', 'precision', 'recall', 'f1'])

    # comparison with majority vote of LFs
    mv = MajorityLabelVoter(seed=123)
    print('Majority Label Voter Metrics:')
    mv.score((L["dev"], Y["dev"]),
             metric=['accuracy', 'precision', 'recall', 'f1'])

    Ytrain_p = label_model.predict_proba(L["train"])
    #print(Ytrain_ps.shape) #(377*50,2)
    #Ydev_p = label_model.predict_proba(L["dev"])

    # test models
    #label_model.score((Ltest,Ytest), metric=['accuracy','precision', 'recall', 'f1'])

    # End Model
    # Create datasets and dataloaders
    train, dev, test = load_dataset(args, Ytrain_p, Y["dev"], Y["test"])
    data_loader = get_data_loader(train, dev, test, args.batch_size,
                                  args.num_workers)
    #print(len(data_loader["train"])) # 18850 / batch_size
    #print(len(data_loader["dev"])) # 1500 / batch_size
    #print(len(data_loader["test"])) # 1000 / batch_size
    #import ipdb; ipdb.set_trace()

    # Define input encoder
    #cnn_encoder = FrameEncoderOC
    cnn_encoder = FrameEncoderOCDense

    if (torch.cuda.is_available()):
        device = 'cuda'
    else:
        device = 'cpu'
    #import ipdb; ipdb.set_trace()

    # Define LSTM module
    lstm_module = LSTMModule(
        encode_dim,
        hidden_size,
        bidirectional=False,
        verbose=False,
        lstm_reduction=args.lstm_reduction,
        encoder_class=cnn_encoder,
        encoder_kwargs={"requires_grad": args.requires_grad})
    '''
	# Define end model
	end_model = EndModel(
		input_module=lstm_module,
		layer_out_dims=[hidden_size, num_classes],
		optimizer="adam",
		#use_cuda=cuda,
		batchnorm=False,
		seed=args.seed,
		verbose=False,
		device = device,
		)
	'''

    init_kwargs = {
        "layer_out_dims": [hidden_size, num_classes],
        "input_module": lstm_module,
        "optimizer": "adam",
        "verbose": False,
        "input_batchnorm": False,
        "use_cuda": cuda,
        'seed': args.seed,
        'device': device
    }

    end_model = EndModel(**init_kwargs)

    if not os.path.exists(args.checkpoint_dir):
        os.mkdir(args.checkpoint_dir)

    with open(args.checkpoint_dir + '/init_kwargs.pickle', "wb") as f:
        pickle.dump(init_kwargs, f, protocol=pickle.HIGHEST_PROTOCOL)

    dropout = 0.4
    # Train end model
    end_model.train_model(
        train_data=data_loader["train"],
        valid_data=data_loader["dev"],
        l2=args.weight_decay,
        lr=args.lr,
        n_epochs=args.n_epochs,
        log_train_every=1,
        verbose=True,
        progress_bar=True,
        loss_weights=[0.55, 0.45],
        input_dropout=0.1,
        middle_dropout=dropout,
        checkpoint_dir=args.checkpoint_dir,
        #writer = "json",
        #writer_config = {
        #"log_dir":  args.log_dir,
        #"run_dir":  args.run_dir,
        #"run_name": args.run_name,
        #"writer_metrics": ['accuracy','precision', 'recall', 'f1','roc-auc','ndcg']
        #},
        #validation_metric='f1',
    )

    # evaluate end model
    print("Dev Set Performance")
    end_model.score(
        data_loader["dev"],
        verbose=True,
        metric=['accuracy', 'precision', 'recall', 'f1', 'roc-auc', 'ndcg'])
    print("Test Set Performance")
    end_model.score(
        data_loader["test"],
        verbose=True,
        metric=['accuracy', 'precision', 'recall', 'f1', 'roc-auc', 'ndcg'])
Ejemplo n.º 14
0
def train_model(args):

    #global args
    #args = parser.parse_args()

	hidden_size = 128 
	num_classes = 2
	encode_dim = 1000 # using get_frm_output_size()

	L,Y = load_labels(args) 

	# Label Model
	# labelling functions analysis
	print(lf_summary(L["dev"], Y = Y["dev"]))

	# training label model
	label_model = LabelModel(k=num_classes, seed=123)
	label_model.train_model(L["train"], Y["dev"], n_epochs = 500, log_train_every = 50)

	# evaluating label model
	print('Trained Label Model Metrics:')
	label_model.score((L["dev"], Y["dev"]), metric=['accuracy','precision', 'recall', 'f1'])

	# comparison with majority vote of LFs
	mv = MajorityLabelVoter(seed=123)
	print('Majority Label Voter Metrics:')
	mv.score((L["dev"], Y["dev"]), metric=['accuracy','precision', 'recall', 'f1'])

	Ytrain_p = label_model.predict_proba(L["train"])
	#print(Ytrain_ps.shape) #(377*50,2)
	#Ydev_p = label_model.predict_proba(L["dev"])

	# test models
	#label_model.score((Ltest,Ytest), metric=['accuracy','precision', 'recall', 'f1'])

	# End Model
	# Create datasets and dataloaders
	train, dev, test = load_dataset(args, Ytrain_p, Y["dev"], Y["test"])
	data_loader = get_data_loader(train, dev, test, args.batch_size, args.num_workers)
	#print(len(data_loader["train"])) # 18850 / batch_size
	#print(len(data_loader["dev"])) # 1500 / batch_size
	#print(len(data_loader["test"])) # 1000 / batch_size 
	#import ipdb; ipdb.set_trace()

	# Define input encoder
	cnn_encoder = FrameEncoderOC

	if(torch.cuda.is_available()):
		device = 'cuda'
	else:
		device = 'cpu'
	#import ipdb; ipdb.set_trace()

	# Define LSTM module
	lstm_module = LSTMModule(
		encode_dim,
		hidden_size,
		bidirectional=False,
		verbose=False,
		lstm_reduction="attention",
		encoder_class=cnn_encoder,
		)

	# Define end model
	end_model = EndModel(
		input_module=lstm_module,
		layer_out_dims=[hidden_size, num_classes],
		optimizer="adam",
		#use_cuda=cuda,
		batchnorm=True,
		seed=123,
		verbose=False,
		device = device,
		)

	#print('Training model')
	#tic = time.time()
	
	dropout = 0.4
	# Train end model
	end_model.train_model(
		train_data=data_loader["train"],
		valid_data=data_loader["dev"],
		l2=args.weight_decay,
		lr=args.lr,
		n_epochs=args.n_epochs,
		log_train_every=1,
		verbose=True,
		progress_bar = True,
		loss_weights = [0.45,0.55],
		batchnorm = 'True',
		input_dropout = dropout,
		middle_dropout = dropout,
		#validation_metric='f1',
		)

	#print('Time taken for training:')
	#print(time.time() - tic)

	# evaluate end model
	end_model.score(data_loader["dev"], verbose=True, metric=['accuracy','precision', 'recall', 'f1'])
Ejemplo n.º 15
0
def test_e2e(caplog):
    """Run an end-to-end test on documents of the hardware domain."""
    caplog.set_level(logging.INFO)

    PARALLEL = 4

    max_docs = 12

    session = Meta.init("postgresql://localhost:5432/" + DB).Session()

    docs_path = "tests/data/html/"
    pdf_path = "tests/data/pdf/"

    doc_preprocessor = HTMLDocPreprocessor(docs_path, max_docs=max_docs)

    corpus_parser = Parser(
        session,
        parallelism=PARALLEL,
        structural=True,
        lingual=True,
        visual=True,
        pdf_path=pdf_path,
    )
    corpus_parser.apply(doc_preprocessor)
    assert session.query(Document).count() == max_docs

    num_docs = session.query(Document).count()
    logger.info(f"Docs: {num_docs}")
    assert num_docs == max_docs

    num_sentences = session.query(Sentence).count()
    logger.info(f"Sentences: {num_sentences}")

    # Divide into test and train
    docs = sorted(corpus_parser.get_documents())
    last_docs = sorted(corpus_parser.get_last_documents())

    ld = len(docs)
    assert ld == len(last_docs)
    assert len(docs[0].sentences) == len(last_docs[0].sentences)

    assert len(docs[0].sentences) == 799
    assert len(docs[1].sentences) == 663
    assert len(docs[2].sentences) == 784
    assert len(docs[3].sentences) == 661
    assert len(docs[4].sentences) == 513
    assert len(docs[5].sentences) == 700
    assert len(docs[6].sentences) == 528
    assert len(docs[7].sentences) == 161
    assert len(docs[8].sentences) == 228
    assert len(docs[9].sentences) == 511
    assert len(docs[10].sentences) == 331
    assert len(docs[11].sentences) == 528

    # Check table numbers
    assert len(docs[0].tables) == 9
    assert len(docs[1].tables) == 9
    assert len(docs[2].tables) == 14
    assert len(docs[3].tables) == 11
    assert len(docs[4].tables) == 11
    assert len(docs[5].tables) == 10
    assert len(docs[6].tables) == 10
    assert len(docs[7].tables) == 2
    assert len(docs[8].tables) == 7
    assert len(docs[9].tables) == 10
    assert len(docs[10].tables) == 6
    assert len(docs[11].tables) == 9

    # Check figure numbers
    assert len(docs[0].figures) == 32
    assert len(docs[1].figures) == 11
    assert len(docs[2].figures) == 38
    assert len(docs[3].figures) == 31
    assert len(docs[4].figures) == 7
    assert len(docs[5].figures) == 38
    assert len(docs[6].figures) == 10
    assert len(docs[7].figures) == 31
    assert len(docs[8].figures) == 4
    assert len(docs[9].figures) == 27
    assert len(docs[10].figures) == 5
    assert len(docs[11].figures) == 27

    # Check caption numbers
    assert len(docs[0].captions) == 0
    assert len(docs[1].captions) == 0
    assert len(docs[2].captions) == 0
    assert len(docs[3].captions) == 0
    assert len(docs[4].captions) == 0
    assert len(docs[5].captions) == 0
    assert len(docs[6].captions) == 0
    assert len(docs[7].captions) == 0
    assert len(docs[8].captions) == 0
    assert len(docs[9].captions) == 0
    assert len(docs[10].captions) == 0
    assert len(docs[11].captions) == 0

    train_docs = set()
    dev_docs = set()
    test_docs = set()
    splits = (0.5, 0.75)
    data = [(doc.name, doc) for doc in docs]
    data.sort(key=lambda x: x[0])
    for i, (doc_name, doc) in enumerate(data):
        if i < splits[0] * ld:
            train_docs.add(doc)
        elif i < splits[1] * ld:
            dev_docs.add(doc)
        else:
            test_docs.add(doc)
    logger.info([x.name for x in train_docs])

    # NOTE: With multi-relation support, return values of getting candidates,
    # mentions, or sparse matrices are formatted as a list of lists. This means
    # that with a single relation, we need to index into the list of lists to
    # get the candidates/mentions/sparse matrix for a particular relation or
    # mention.

    # Mention Extraction
    part_ngrams = MentionNgramsPart(parts_by_doc=None, n_max=3)
    temp_ngrams = MentionNgramsTemp(n_max=2)
    volt_ngrams = MentionNgramsVolt(n_max=1)

    Part = mention_subclass("Part")
    Temp = mention_subclass("Temp")
    Volt = mention_subclass("Volt")

    mention_extractor = MentionExtractor(
        session,
        [Part, Temp, Volt],
        [part_ngrams, temp_ngrams, volt_ngrams],
        [part_matcher, temp_matcher, volt_matcher],
    )

    mention_extractor.apply(docs, parallelism=PARALLEL)

    assert session.query(Part).count() == 299
    assert session.query(Temp).count() == 147
    assert session.query(Volt).count() == 140
    assert len(mention_extractor.get_mentions()) == 3
    assert len(mention_extractor.get_mentions()[0]) == 299
    assert (
        len(
            mention_extractor.get_mentions(
                docs=[session.query(Document).filter(Document.name == "112823").first()]
            )[0]
        )
        == 70
    )

    # Candidate Extraction
    PartTemp = candidate_subclass("PartTemp", [Part, Temp])
    PartVolt = candidate_subclass("PartVolt", [Part, Volt])

    candidate_extractor = CandidateExtractor(
        session, [PartTemp, PartVolt], throttlers=[temp_throttler, volt_throttler]
    )

    for i, docs in enumerate([train_docs, dev_docs, test_docs]):
        candidate_extractor.apply(docs, split=i, parallelism=PARALLEL)

    assert session.query(PartTemp).filter(PartTemp.split == 0).count() == 3684
    assert session.query(PartTemp).filter(PartTemp.split == 1).count() == 72
    assert session.query(PartTemp).filter(PartTemp.split == 2).count() == 448
    assert session.query(PartVolt).count() == 4282

    # Grab candidate lists
    train_cands = candidate_extractor.get_candidates(split=0, sort=True)
    dev_cands = candidate_extractor.get_candidates(split=1, sort=True)
    test_cands = candidate_extractor.get_candidates(split=2, sort=True)
    assert len(train_cands) == 2
    assert len(train_cands[0]) == 3684
    assert (
        len(
            candidate_extractor.get_candidates(
                docs=[session.query(Document).filter(Document.name == "112823").first()]
            )[0]
        )
        == 1496
    )

    # Featurization
    featurizer = Featurizer(session, [PartTemp, PartVolt])

    # Test that FeatureKey is properly reset
    featurizer.apply(split=1, train=True, parallelism=PARALLEL)
    assert session.query(Feature).count() == 225
    assert session.query(FeatureKey).count() == 1179

    # Test Dropping FeatureKey
    # Should force a row deletion
    featurizer.drop_keys(["DDL_e1_W_LEFT_POS_3_[NFP NN NFP]"])
    assert session.query(FeatureKey).count() == 1178

    # Should only remove the part_volt as a relation and leave part_temp
    assert set(
        session.query(FeatureKey)
        .filter(FeatureKey.name == "DDL_e1_LEMMA_SEQ_[bc182]")
        .one()
        .candidate_classes
    ) == {"part_temp", "part_volt"}
    featurizer.drop_keys(["DDL_e1_LEMMA_SEQ_[bc182]"], candidate_classes=[PartVolt])
    assert session.query(FeatureKey).filter(
        FeatureKey.name == "DDL_e1_LEMMA_SEQ_[bc182]"
    ).one().candidate_classes == ["part_temp"]
    assert session.query(FeatureKey).count() == 1178
    # Removing the last relation from a key should delete the row
    featurizer.drop_keys(["DDL_e1_LEMMA_SEQ_[bc182]"], candidate_classes=[PartTemp])
    assert session.query(FeatureKey).count() == 1177
    session.query(Feature).delete()
    session.query(FeatureKey).delete()

    featurizer.apply(split=0, train=True, parallelism=PARALLEL)
    assert session.query(Feature).count() == 6669
    assert session.query(FeatureKey).count() == 4161
    F_train = featurizer.get_feature_matrices(train_cands)
    assert F_train[0].shape == (3684, 4161)
    assert F_train[1].shape == (2985, 4161)
    assert len(featurizer.get_keys()) == 4161

    featurizer.apply(split=1, parallelism=PARALLEL)
    assert session.query(Feature).count() == 6894
    assert session.query(FeatureKey).count() == 4161
    F_dev = featurizer.get_feature_matrices(dev_cands)
    assert F_dev[0].shape == (72, 4161)
    assert F_dev[1].shape == (153, 4161)

    featurizer.apply(split=2, parallelism=PARALLEL)
    assert session.query(Feature).count() == 8486
    assert session.query(FeatureKey).count() == 4161
    F_test = featurizer.get_feature_matrices(test_cands)
    assert F_test[0].shape == (448, 4161)
    assert F_test[1].shape == (1144, 4161)

    gold_file = "tests/data/hardware_tutorial_gold.csv"
    load_hardware_labels(session, PartTemp, gold_file, ATTRIBUTE, annotator_name="gold")
    assert session.query(GoldLabel).count() == 4204
    load_hardware_labels(session, PartVolt, gold_file, ATTRIBUTE, annotator_name="gold")
    assert session.query(GoldLabel).count() == 8486

    stg_temp_lfs = [
        LF_storage_row,
        LF_operating_row,
        LF_temperature_row,
        LF_tstg_row,
        LF_to_left,
        LF_negative_number_left,
    ]

    ce_v_max_lfs = [
        LF_bad_keywords_in_row,
        LF_current_in_row,
        LF_non_ce_voltages_in_row,
    ]

    labeler = Labeler(session, [PartTemp, PartVolt])

    with pytest.raises(ValueError):
        labeler.apply(split=0, lfs=stg_temp_lfs, train=True, parallelism=PARALLEL)

    labeler.apply(
        split=0, lfs=[stg_temp_lfs, ce_v_max_lfs], train=True, parallelism=PARALLEL
    )
    assert session.query(Label).count() == 6669
    assert session.query(LabelKey).count() == 9
    L_train = labeler.get_label_matrices(train_cands)
    assert L_train[0].shape == (3684, 9)
    assert L_train[1].shape == (2985, 9)
    assert len(labeler.get_keys()) == 9

    L_train_gold = labeler.get_gold_labels(train_cands)
    assert L_train_gold[0].shape == (3684, 1)

    L_train_gold = labeler.get_gold_labels(train_cands, annotator="gold")
    assert L_train_gold[0].shape == (3684, 1)

    gen_model = LabelModel(k=2)
    gen_model.train_model(L_train[0], n_epochs=500, print_every=100)

    train_marginals = gen_model.predict_proba(L_train[0])[:, 1]

    disc_model = LogisticRegression()
    disc_model.train(
        (train_cands[0], F_train[0]), train_marginals, n_epochs=20, lr=0.001
    )

    test_score = disc_model.predictions((test_cands[0], F_test[0]), b=0.6)
    true_pred = [test_cands[0][_] for _ in np.nditer(np.where(test_score > 0))]

    pickle_file = "tests/data/parts_by_doc_dict.pkl"
    with open(pickle_file, "rb") as f:
        parts_by_doc = pickle.load(f)

    (TP, FP, FN) = entity_level_f1(
        true_pred, gold_file, ATTRIBUTE, test_docs, parts_by_doc=parts_by_doc
    )

    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")

    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")

    assert f1 < 0.7 and f1 > 0.3

    stg_temp_lfs_2 = [
        LF_to_left,
        LF_test_condition_aligned,
        LF_collector_aligned,
        LF_current_aligned,
        LF_voltage_row_temp,
        LF_voltage_row_part,
        LF_typ_row,
        LF_complement_left_row,
        LF_too_many_numbers_row,
        LF_temp_on_high_page_num,
        LF_temp_outside_table,
        LF_not_temp_relevant,
    ]
    labeler.update(split=0, lfs=[stg_temp_lfs_2, ce_v_max_lfs], parallelism=PARALLEL)
    assert session.query(Label).count() == 6669
    assert session.query(LabelKey).count() == 16
    L_train = labeler.get_label_matrices(train_cands)
    assert L_train[0].shape == (3684, 16)

    gen_model = LabelModel(k=2)
    gen_model.train_model(L_train[0], n_epochs=500, print_every=100)

    train_marginals = gen_model.predict_proba(L_train[0])[:, 1]

    disc_model = LogisticRegression()
    disc_model.train(
        (train_cands[0], F_train[0]), train_marginals, n_epochs=20, lr=0.001
    )

    test_score = disc_model.predictions((test_cands[0], F_test[0]), b=0.6)
    true_pred = [test_cands[0][_] for _ in np.nditer(np.where(test_score > 0))]

    (TP, FP, FN) = entity_level_f1(
        true_pred, gold_file, ATTRIBUTE, test_docs, parts_by_doc=parts_by_doc
    )

    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")

    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")

    assert f1 > 0.7

    # Testing LSTM
    disc_model = LSTM()
    disc_model.train(
        (train_cands[0], F_train[0]), train_marginals, n_epochs=5, lr=0.001
    )

    test_score = disc_model.predictions((test_cands[0], F_test[0]), b=0.6)
    true_pred = [test_cands[0][_] for _ in np.nditer(np.where(test_score > 0))]

    (TP, FP, FN) = entity_level_f1(
        true_pred, gold_file, ATTRIBUTE, test_docs, parts_by_doc=parts_by_doc
    )

    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")

    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")

    assert f1 > 0.7

    # Testing Sparse Logistic Regression
    disc_model = SparseLogisticRegression()
    disc_model.train(
        (train_cands[0], F_train[0]), train_marginals, n_epochs=20, lr=0.001
    )

    test_score = disc_model.predictions((test_cands[0], F_test[0]), b=0.6)
    true_pred = [test_cands[0][_] for _ in np.nditer(np.where(test_score > 0))]

    (TP, FP, FN) = entity_level_f1(
        true_pred, gold_file, ATTRIBUTE, test_docs, parts_by_doc=parts_by_doc
    )

    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")

    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")

    assert f1 > 0.7

    # Testing Sparse LSTM
    disc_model = SparseLSTM()
    disc_model.train(
        (train_cands[0], F_train[0]), train_marginals, n_epochs=5, lr=0.001
    )

    test_score = disc_model.predictions((test_cands[0], F_test[0]), b=0.6)
    true_pred = [test_cands[0][_] for _ in np.nditer(np.where(test_score > 0))]

    (TP, FP, FN) = entity_level_f1(
        true_pred, gold_file, ATTRIBUTE, test_docs, parts_by_doc=parts_by_doc
    )

    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")

    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")

    assert f1 > 0.7
Ejemplo n.º 16
0
class SnorkeMeTalCollator(Collator):
    def __init__(
        self,
        positive_label: str,
        class_cardinality: int = 2,
        num_epochs: int = 500,
        log_train_every: int = 50,
        seed: int = 123,
    ):
        self.positive_label = positive_label
        self.class_cardinality = class_cardinality
        self.num_epochs = num_epochs
        self.log_train_every = log_train_every
        self.seed = seed
        self.label_model = LabelModel(k=self.class_cardinality, seed=seed)

    @classmethod
    def get_snorkel_index(cls, tag: str) -> int:
        if is_positive(tag):
            return 2
        elif is_negative(tag):
            return 1
        else:
            return 0

    def get_tag(self, index: int) -> str:
        if index == 1:
            return self.positive_label
        else:
            return NEGATIVE_LABEL

    def get_index(self, prob: np.ndarray) -> str:
        assert prob.shape == (2, )
        return prob.argmax()

    def collate_np(self,
                   annotations) -> Tuple[np.ndarray, List[str], List[int]]:
        output_arrs: List[np.ndarray] = []
        words_list: List[str] = []
        id_to_labels: Dict[int, Tuple[int, int]] = {}
        num_funcs = len(annotations)
        for i, ann_inst in tqdm(enumerate(zip(*annotations))):
            ids = [inst['id'] for inst in ann_inst]
            inputs = [inst['input'] for inst in ann_inst]
            outputs = [inst['output'] for inst in ann_inst]
            input_len = len(inputs[0])
            entry_id = ids[0]

            # output arr = (sentence x num_labels)
            output_arr = np.zeros((input_len, num_funcs))
            for i, output in enumerate(outputs):
                for j, out_j in enumerate(output):
                    output_arr[j, i] = SnorkeMeTalCollator.get_snorkel_index(
                        out_j)

            label_start = len(words_list)
            for word_i, word in enumerate(inputs[0]):
                words_list.append(word)
            output_arrs.append(output_arr)
            label_end = len(words_list)
            id_to_labels[entry_id] = (label_start, label_end)
        output_res = np.concatenate(output_arrs, axis=0)
        return output_res, words_list, id_to_labels

    def train_label_model(
        self,
        collated_labels: np.ndarray,
        descriptions: Optional[List[str]],
        train_data_np: Optional[np.ndarray],
    ):
        sparse_labels = sparse.csr_matrix(collated_labels)
        if descriptions is not None:
            descriptions = [(i, desc) for i, desc in enumerate(descriptions)]
            logger.warn(f'labeling function order: {descriptions}')
        logger.warn(lf_summary(sparse_labels))
        self.label_model.train_model(
            sparse_labels,
            n_epochs=self.num_epochs,
            log_train_every=self.log_train_every,
            Y_dev=train_data_np,
        )

    def get_probabilistic_labels(self,
                                 collated_labels: np.ndarray) -> np.ndarray:
        sparse_labels = sparse.csr_matrix(collated_labels)
        return self.label_model.predict_proba(sparse_labels)

    def convert_to_tags(
        self,
        train_probs: np.ndarray,
        word_list: List[str],
        id_to_labels: Dict[int, Tuple[int, int]],
    ) -> List[AnnotatedDataType]:
        output = []
        for entry_id, (label_start, label_end) in id_to_labels.items():
            words = word_list[label_start:label_end]
            prob_labels = train_probs[label_start:label_end]
            label_ids = prob_labels.argmax(axis=1)
            labels = [self.get_tag(i) for i in label_ids]
            output.append({
                'id': entry_id,
                'input': words,
                'output': labels,
            })
        return output

    def collate(
            self,
            annotations: List[AnnotatedDataType],
            should_verify: bool = False,
            descriptions: Optional[List[str]] = None,
            train_data: Optional[AnnotatedDataType] = None
    ) -> AnnotatedDataType:
        '''
        args:
            ``annotations``: List[AnnotatedDataType]
                given a series of annotations, collate them into a single
                series of annotations per instance
        '''
        if should_verify:
            # make sure the annotations are in the
            # proper format
            Collator.verify_annotations(annotations)

        train_data_np = None
        if train_data:
            # if train data specified, will be used by Snorkel to estimate class balanc
            train_data_np, word_lists, id_to_labels = self.collate_np(
                [train_data])
            train_data_np = train_data_np.astype(int)
            train_data_np = train_data_np.reshape(-1)
        collate_np, word_lists, id_to_labels = self.collate_np(annotations)
        self.train_label_model(collated_labels=collate_np,
                               descriptions=descriptions,
                               train_data_np=train_data_np)
        y_train_probs = self.get_probabilistic_labels(
            collated_labels=collate_np, )
        tags = self.convert_to_tags(y_train_probs,
                                    word_list=word_lists,
                                    id_to_labels=id_to_labels)
        return tags
Ejemplo n.º 17
0
def train_model(args):

    #global args
    #args = parser.parse_args()

    hidden_size = 128
    num_classes = 2
    encode_dim = 108  # using get_frm_output_size()

    if (torch.cuda.is_available()):
        device = torch.device('cuda:0')
        #device = 'cuda'
    else:
        device = 'cpu'

    #print(device)
    L, Y = load_labels(args)

    # Label Model
    # labelling functions analysis
    print(lf_summary(L["dev"], Y=Y["dev"]))

    # majority vote of LFs
    mv = MajorityLabelVoter(seed=123)
    print('Majority Label Voter Metrics:')
    mv.score((L["dev"], Y["dev"]),
             metric=['accuracy', 'precision', 'recall', 'f1'])

    # training label model - no temporal modelling
    label_model = LabelModel(k=num_classes, seed=123)
    label_model.train_model(L["train"],
                            Y["dev"],
                            n_epochs=500,
                            log_train_every=50)

    # evaluating label model
    print('Trained Label Model Metrics:')
    label_model.score((L["dev"], Y["dev"]),
                      metric=['accuracy', 'precision', 'recall', 'f1'])

    # training label model without temporal modelling
    # naive model
    #print(L["train"].todense().shape) # (18850,5)
    #print(L["dev"].todense().shape) # (1500,5)
    #print(Y["dev"].shape) # (1500,)
    m_per_task = L["train"].todense().shape[1]  # 5
    MRI_data_naive = {
        'Li_train':
        torch.FloatTensor(np.array(L["train"].todense().astype('int_'))),
        'Li_dev':
        torch.FloatTensor(np.array(L["dev"].todense())),
        'R_dev':
        Y["dev"]
    }

    MRI_data_naive['class_balance'] = torch.FloatTensor([0.5, 0.5]).to(device)

    # training naive model
    naive_model = DPLabelModel(
        m=m_per_task,
        T=1,
        edges=[],
        coverage_sets=[[
            0,
        ]] * m_per_task,
        mu_sharing=[[
            i,
        ] for i in range(m_per_task)],
        phi_sharing=[],
        device=device,
        #class_balance=MRI_data_naive['class_balance'],
        seed=0)

    optimize(naive_model,
             L_hat=MRI_data_naive['Li_train'],
             num_iter=300,
             lr=1e-3,
             momentum=0.8,
             clamp=True,
             seed=0)

    # evaluating naive model
    R_pred = naive_model.predict(MRI_data_naive['Li_dev']).data.numpy()
    R_pred = 2 - R_pred
    #print(R_pred)
    #print(MRI_data_naive['R_dev'])

    for metric in ['accuracy', 'f1', 'recall', 'precision']:
        score = metric_score(MRI_data_naive['R_dev'], R_pred, metric)
        print(f"{metric.capitalize()}: {score:.3f}")

    # training label model with temporal modelling
    # reshaping dataset
    num_frames = 50
    n_patients_train = round(L["train"].todense().shape[0] /
                             num_frames)  #(377)
    n_patients_dev = round(L["dev"].todense().shape[0] / num_frames)  #(30)
    Ltrain = np.reshape(np.array(L["train"].todense()),
                        (n_patients_train, num_frames, -1))
    Ldev = np.reshape(np.array(L["dev"].todense()),
                      (n_patients_dev, num_frames, -1))
    Ydev = np.reshape(Y["dev"], (n_patients_dev, num_frames))
    # print(Ltrain.shape) # (377,50,5)
    #print(Ldev.shape) # (30,50,5)
    #print(Ydev.shape) # (30,50)

    # subsampling
    # selecting frames 3,13,23,33,43
    indices = np.linspace(2, 42, 5).astype(int)
    m_per_task = 5
    T = 5

    Ltrain_small = Ltrain[:, indices, :]  # shape (377,5,5)
    Ldev_small = Ldev[:, indices, :]  # shape (30,5,5)
    Ydev_small = Ydev[:, indices]  # shape (30,5)

    Ltrain_small = np.reshape(
        Ltrain_small, ((n_patients_train * T), m_per_task))  # shape (1885,5)
    Ldev_small = np.reshape(
        Ldev_small, ((n_patients_dev * T), m_per_task))  # shape (150,5)
    Ydev_small = np.reshape(Ydev_small,
                            ((n_patients_dev * T), ))  # shape (150,)

    MRI_data_temporal = {
        'Li_train':
        torch.LongTensor(Ltrain_small).view(n_patients_train,
                                            (m_per_task * T)),
        'Li_dev':
        torch.LongTensor(Ldev_small).view(n_patients_dev, (m_per_task * T)),
        'R_dev':
        torch.LongTensor(Ydev_small)[::T] * (2**T - 1),
        'm':
        m_per_task * T,
        'T':
        T
    }

    MRI_data_temporal['class_balance'] = normalize(
        (MRI_data_temporal['R_dev'].unsqueeze(1) == torch.arange(
            2**T, device=device).unsqueeze(0)).sum(0).float(),
        dim=0,
        p=1)

    max_seed = 10
    temporal_models = [
        None,
    ] * max_seed
    for seed in range(max_seed):
        markov_model = DPLabelModel(
            m=m_per_task * T,
            T=T,
            edges=[(i, i + m_per_task) for i in range((T - 1) * m_per_task)],
            coverage_sets=[[
                t,
            ] for t in range(T) for _ in range(m_per_task)],
            mu_sharing=[[t * m_per_task + i for t in range(T)]
                        for i in range(m_per_task)],
            phi_sharing=[[(t * m_per_task + i, (t + 1) * m_per_task + i)
                          for t in range(T - 1)] for i in range(m_per_task)],
            device=device,
            class_balance=MRI_data_temporal['class_balance'],
            seed=seed)
        optimize(markov_model,
                 L_hat=MRI_data_temporal['Li_train'],
                 num_iter=1000,
                 lr=1e-5,
                 momentum=0.8,
                 clamp=True,
                 verbose=False,
                 seed=seed)
        temporal_models[seed] = markov_model

    for seed, model in enumerate(temporal_models):
        R_pred = model.predict(MRI_data_temporal['Li_dev'].cpu())
        F1 = metric_score(MRI_data_temporal['R_dev'].cpu() > 0,
                          R_pred.cpu() > 0, 'f1')
        accuracy = metric_score(MRI_data_temporal['R_dev'].cpu(), R_pred.cpu(),
                                'accuracy')
        print(f"seed={seed}  accuracy={accuracy:.3f}  F1={F1:.3f}")
# In this section, we are going to train a generative model. This model is designed to estimate the best class a training point belongs to given a set of label functions ($P(\hat{Y} \mid \lambda_{1\ldots n})$). This is accomplished by estimating a parameter called mu, which is the probability of a label function emitting a 1 or 2 given the true class. ($P(\lambda_{j} = y | Y=y)$). Once mu is estimated, the final step is to use mu and calculate the above probability ($P(\hat{Y} \mid \lambda_{1\ldots n})$). In order to find the best model, we will use hyperparameter optimization. This process involves trying multiple values keeping the value that provides the best result. The challenge with this process is that the search space is vast and can take non trivial time to navigate. As a consequence we resort to using advance methods called [bayesian optimization](https://en.wikipedia.org/wiki/Bayesian_optimization) to navigate this space in an effective way. The hyperparameter we are tuning is the l2 norm. This penalty term penalizes a machine learning classifier for the magnitude of the weights. By using this parameter we optimize the l2 norm for each label function individually. Luckily, the generative model is pretty fast to train, so this process won't take a significant amount of time.

# # Distant Supervision

# Here in this section we are using the distant superivion paradigm to label our candidate sentences.

# ## Grid Search

# In[18]:

regularization_grid = pd.np.round(pd.np.linspace(0.01, 5, num=15), 2)

# In[19]:

grid_results = {}
label_model = LabelModel(k=2)
for param in tqdm_notebook(regularization_grid):
    label_model.train_model(correct_L[:, 0:7],
                            n_epochs=1000,
                            print_every=200,
                            seed=100,
                            lr=0.01,
                            l2=param)
    grid_results[str(param)] = label_model.predict_proba(correct_L_train[:,
                                                                         0:7])

# In[20]:

acc_results = defaultdict(list)

for key in grid_results:
Ejemplo n.º 19
0
Ls.append(L_test)
Ds.append(D_test)

print(lf_summary(Ls[1], Y=Ys[1]))

balance = sorted(Counter(Y_test).items())
balance2 = Counter(Y_test).values()

new_balance = []
for elem in balance:
    new_balance.append(elem[1] / sum(balance2))
print(sorted(Counter(Y_test).items()))
print(balance)
print(new_balance)

label_model = LabelModel(k=2, seed=123)
label_model.train_model(Ls[0],
                        class_balance=new_balance,
                        n_epochs=500,
                        log_train_every=50)

score = label_model.score((Ls[1], Ys[1]))

print('Trained Label Model Metrics:')
scores = label_model.score((Ls[1], Ys[1]),
                           metric=['accuracy', 'precision', 'recall', 'f1'])

mv = MajorityLabelVoter(seed=123)
print('Majority Label Voter Metrics:')
scores = mv.score((Ls[1], Ys[1]),
                  metric=['accuracy', 'precision', 'recall', 'f1'])
Ejemplo n.º 20
0
              parallelism=PARALLEL)
L_train = labeler.get_label_matrices(train_cands)

L_gold_train = labeler.get_gold_labels(train_cands, annotator="gold")

from metal import analysis

analysis.lf_summary(
    L_train[0],
    lf_names=labeler.get_keys(),
    Y=L_gold_train[0].todense().reshape(-1).tolist()[0],
)

from metal.label_model import LabelModel

gen_model = LabelModel(k=2)
gen_model.train_model(L_train[0], n_epochs=500, verbose=False)

train_marginals = gen_model.predict_proba(L_train[0])

from fonduer.learning import LogisticRegression

disc_model = LogisticRegression()
disc_model.train((train_cands[0], F_train[0]),
                 train_marginals,
                 n_epochs=10,
                 lr=0.001)

from my_fonduer_model import MyFonduerModel

model = MyFonduerModel()
Ejemplo n.º 21
0
    transformed_data = np.zeros(data.shape, dtype=np.int)
    transformed_data[data == -1] = 1
    transformed_data[data == 1] = 2
    return transformed_data


train_ground = remap_labels(loader.train_ground)
val_ground = remap_labels(loader.val_ground)
L_train_sparse = sparse.csc_matrix(
    (remap_labels(L_train_sparse.data), L_train_sparse.indices,
     L_train_sparse.indptr)).T
L_val_sparse = sparse.csc_matrix((remap_labels(L_val_sparse.data),
                                  L_val_sparse.indices, L_val_sparse.indptr)).T

print('\n\n####### Running METAL Label Model ########')
label_model = LabelModel()
label_model.train_model(L_train_sparse,
                        n_epochs=200,
                        print_every=50,
                        seed=123,
                        verbose=False)
train_marginals = label_model.predict_proba(L_train_sparse)
label_model.score((L_train_sparse, train_ground), metric=metrics)

####### METAL with Exact Class Balance ########
print(
    '\n\n####### Running METAL Label Model with exact class balance ########')
train_class_balance = np.array([
    np.sum(train_ground == 1) / loader.train_num,
    np.sum(train_ground == 2) / loader.train_num
])
Ejemplo n.º 22
0
    "DaG_TEXT": dg_text,
    "CtD_DB_TEXT": cd_db + cd_text,
    "CtD_ALL": cd_db + cd_text + cd_bicluster,
    "All_the_jawns": cd_db + cd_text + cd_bicluster + cg_text + dg_text
}

# In[ ]:

L = label_matricies['train'].toarray()
L[L < 0] = 2
L_dev = label_matricies['dev'].toarray()
L_dev[L_dev < 0] = 2
L_test = label_matricies['test'].toarray()
L_test[L_test < 0] = 2

label_model = LabelModel(k=2, seed=100)

# In[ ]:

reg_param_grid = pd.np.round(pd.np.linspace(1e-1, 1, num=30), 3)
grid_results = defaultdict(dict)
for model in tqdm_notebook(model_dict):
    for reg_param in reg_param_grid:
        label_model.train(L[:, model_dict[model]],
                          n_epochs=1000,
                          verbose=False,
                          lr=0.01,
                          l2=reg_param)
        grid_results[model][str(reg_param)] = label_model.predict_proba(
            L_dev[:, model_dict[model]])[:, 0]
Ejemplo n.º 23
0
def train_model(args):

    #global args
    #args = parser.parse_args()

	hidden_size = 128 
	num_classes = 2
	encode_dim = 1000 # using get_frm_output_size()

	L,Y = load_labels(args) 

	# Label Model
	# labelling functions analysis
	print(lf_summary(L["dev"], Y = Y["dev"]))

	# training label model
	label_model = LabelModel(k=num_classes, seed=123)
	label_model.train_model(L["train"], Y["dev"], n_epochs = 500, log_train_every = 50)

	# evaluating label model
	print('Trained Label Model Metrics:')
	label_model.score((L["dev"], Y["dev"]), metric=['accuracy','precision', 'recall', 'f1'])

	# comparison with majority vote of LFs
	mv = MajorityLabelVoter(seed=123)
	print('Majority Label Voter Metrics:')
	mv.score((L["dev"], Y["dev"]), metric=['accuracy','precision', 'recall', 'f1'])

	Ytrain_p = label_model.predict_proba(L["train"])
	#print(Ytrain_ps.shape) #(377*50,2)
	#Ydev_p = label_model.predict_proba(L["dev"])

	# test models
	#label_model.score((Ltest,Ytest), metric=['accuracy','precision', 'recall', 'f1'])

	# End Model
	# Create datasets and dataloaders
	train, dev, test = load_dataset(args, Ytrain_p, Y["dev"], Y["test"])
	data_loader = get_data_loader(train, dev, test, args.batch_size, args.num_workers)
	#print(len(data_loader["train"])) # 18850 / batch_size
	#print(len(data_loader["dev"])) # 1500 / batch_size
	#print(len(data_loader["test"])) # 1000 / batch_size 
	#import ipdb; ipdb.set_trace()

	# Define input encoder
	cnn_encoder = FrameEncoderOC

	if(torch.cuda.is_available()):
		device = 'cuda'
	else:
		device = 'cpu'
	#import ipdb; ipdb.set_trace()

	# Define LSTM module
	lstm_module = LSTMModule(
		encode_dim,
		hidden_size,
		bidirectional=False,
		verbose=False,
		lstm_reduction="attention",
		encoder_class=cnn_encoder,
		)

	train_args = [data_loader["train"]]

	train_kwargs = {
	'seed':args.seed,
	'progress_bar':True,
	'log_train_every':1}

	init_args = [
	[hidden_size, num_classes]
	]

	init_kwargs = {
	"input_module": lstm_module, 
	"optimizer": "adam",
	"verbose": False,
	"input_batchnorm": True,
	"use_cuda":torch.cuda.is_available(),
	'checkpoint_dir':args.checkpoint_dir,
	'seed':args.seed,
	'device':device}
	
	search_space = {
	'n_epochs':[10],
	'batchnorm':[True],
	'dropout': [0.1,0.25,0.4],
	'lr':{'range': [1e-3, 1e-2], 'scale': 'log'}, 
	'l2':{'range': [1e-5, 1e-4], 'scale': 'log'},#[ 1.21*1e-5],
	#'checkpoint_metric':['f1'],
	}	
	
	log_config = {
	"log_dir": "./run_logs", 
	"run_name": 'cnn_lstm_oc'
	}

	max_search = 5
	tuner_config = {"max_search": max_search }

	validation_metric = 'accuracy'

	# Set up logger and searcher
	tuner = RandomSearchTuner(EndModel, 
	**log_config,
	log_writer_class=TensorBoardWriter,
	validation_metric=validation_metric,
	seed=1701)
	
	disc_model = tuner.search(
	search_space,
	valid_data = data_loader["dev"],
	train_args=train_args,
	init_args=init_args,
	init_kwargs=init_kwargs,
	train_kwargs=train_kwargs,
	max_search=tuner_config["max_search"],
	clean_up=False,
	)

	# evaluate end model
	disc_model.score(data_loader["dev"], verbose=True, metric=['accuracy','precision', 'recall', 'f1'])
Ejemplo n.º 24
0
labeler.apply(split=0, lfs=[president_name_pob_lfs], train=True, parallelism=PARALLEL)
L_train = labeler.get_label_matrices(train_cands)

L_gold_train = labeler.get_gold_labels(train_cands, annotator="gold")

from metal import analysis

analysis.lf_summary(
    L_train[0],
    lf_names=labeler.get_keys(),
    Y=L_gold_train[0].todense().reshape(-1).tolist()[0],
)

from metal.label_model import LabelModel

gen_model = LabelModel(k=2)
gen_model.train_model(L_train[0], n_epochs=500, print_every=100)

train_marginals = gen_model.predict_proba(L_train[0])

from fonduer.learning import LogisticRegression

disc_model = LogisticRegression()
disc_model.train((train_cands[0], F_train[0]), train_marginals, n_epochs=10, lr=0.001)

from my_fonduer_model import MyFonduerModel
model = MyFonduerModel()

import fonduer_model
fonduer_model.save_model(
    fonduer_model=model,
Ejemplo n.º 25
0
                       'categorical')
L_test = convert_labels(label_matricies['test'].toarray(), 'plusminus',
                        'categorical')

validation_data = list(
    zip([L[:, :7], L[:, :24], L], [L_dev[:, :7], L_dev[:, :24], L_dev]))
test_data = list(
    zip([L[:, :7], L[:, :24], L], [L_test[:, :7], L_test[:, :24], L_test]))
model_labels = ["Distant Supervision (DS)", "DS+User Defined Rules", "All"]

# In[15]:

model_grid_search = {}
for model_data, model_label in zip(validation_data, model_labels):

    label_model = LabelModel(k=2, seed=100)
    grid_results = {}
    for param in regularization_grid:
        label_model.train_model(model_data[0],
                                n_epochs=1000,
                                verbose=False,
                                lr=0.01,
                                l2=param)
        grid_results[str(param)] = label_model.predict_proba(model_data[1])[:,
                                                                            0]

    model_grid_search[model_label] = pd.DataFrame.from_dict(grid_results)

# In[16]:

model_grid_aucs = {}