Esempio n. 1
0
def train_snorkel_gen_model(L, gte=True):
    """Fit a Snorkel generative model on ``L`` and derive hard labels.

    Args:
        L: Label matrix; anything ``scipy.sparse.csr_matrix`` accepts.
        gte: When true, marginals at or above the threshold become +1 and
            the rest -1. When false, the polarity is inverted (marginals
            strictly below the threshold become +1).

    Returns:
        Tuple ``(gen_model, train_labels, train_marginals)``.
    """
    L_train = sparse.csr_matrix(L)

    gen_model = GenerativeModel()
    gen_model.train(L_train, epochs=100, decay=0.95,
                    step_size=0.01 / L_train.shape[0],
                    reg_param=1e-6)

    train_marginals = gen_model.marginals(L_train)

    # Split at the midpoint of the observed marginal range. The previous
    # formula, (max - min) / 2, is half the *width* of the range, which only
    # coincides with the midpoint when the minimum is exactly 0.
    lowest = train_marginals.min()
    highest = train_marginals.max()
    marginals_threshold = (highest + lowest) / 2

    if gte:
        is_positive_row = train_marginals >= marginals_threshold
    else:
        is_positive_row = train_marginals < marginals_threshold
    # Map the boolean mask {False, True} onto the labels {-1, +1}.
    train_labels = 2 * is_positive_row - 1

    return gen_model, train_labels, train_marginals
Esempio n. 2
0
def apply_GenMod(L_train):
    """
    Fits a generative model on the label matrix, appends the learned
    labeling-function stats to the report and saves the marginals.
    :param L_train: Label matrix
    :return: None
    """
    model = GenerativeModel()
    # Alternative configuration that was tried previously:
    # model.train(L_train, epochs=100, decay=0.95, step_size=0.1 / L_train.shape[0], reg_param=1e-6)
    model.train(L_train, cardinality=3)
    marginals = model.marginals(L_train)

    # Section header first, then the stats table rendered as space-separated text.
    report.append('\n#Gen Model Stats\n')
    lf_stats = model.learned_lf_stats()
    report.append(lf_stats.to_csv(sep=' ', index=False, header=True))

    save_marginals(session, L_train, marginals)
Esempio n. 3
0
class Snorkeller:
    """Wraps a Snorkel ``GenerativeModel`` fed with ranker-derived label matrices."""

    def __init__(self,
                 query_pairwise_bins_by_ranker: QueryPairwiseBinsByRanker):
        """Store the per-ranker pairwise bins and create an untrained model."""
        self.query_pairwise_bins_by_ranker = query_pairwise_bins_by_ranker
        self.snorkel_gm = GenerativeModel()
        # Set to True by ``train``; not checked anywhere inside this class.
        self.is_trained = False

    def train(self, train_ranked_lists_by_ranker: Dict[str, List[List[int]]]):
        """Build the label matrix from the rankings and fit the generative model.

        Dependencies between labeling functions are selected with a
        ``DependencySelector`` (threshold 0.0) and passed to the model.
        """
        L_train = get_L_from_rankings(train_ranked_lists_by_ranker)
        ds = DependencySelector()
        deps = ds.select(L_train, threshold=0.0)
        self.snorkel_gm.train(L_train,
                              deps,
                              epochs=100,
                              decay=0.95,
                              step_size=0.1 / L_train.shape[0],
                              reg_param=1e-6)
        self.is_trained = True

    def calc_marginals(self, target_info: List[TargetInfo]):
        """Return one marginal per entry of ``target_info``, in input order.

        Entries whose fourth field (``info[3]``) is truthy are not sent to
        the generative model and receive a marginal of 1; all other entries
        get the model's marginal.
        NOTE(review): the original local name ``non_rand_target_info``
        suggests ``info[3]`` marks randomly generated targets -- confirm.
        """
        # Evaluate the flag once per entry so both passes below agree.
        skip_model = [bool(info[3]) for info in target_info]
        non_rand_target_info: List[TargetInfo] = [
            info for info, skipped in zip(target_info, skip_model)
            if not skipped
        ]

        # ``all_marginals`` below is laid out as
        # [model marginals..., padding ones...]; ``order`` maps every input
        # position to its slot in that layout. Two running counters replace
        # the former repeated ``list[1:]`` slicing, which was quadratic.
        offset = len(non_rand_target_info)
        order = []
        skipped_ctr = 0
        marginal_ctr = 0
        for skipped in skip_model:
            if skipped:
                order.append(offset + skipped_ctr)
                skipped_ctr += 1
            else:
                order.append(marginal_ctr)
                marginal_ctr += 1
        # Explicit integer dtype: an empty list would otherwise become a
        # float array, which NumPy rejects as an index.
        order = np.array(order, dtype=np.intp)

        L = get_L_from_pairs(self.query_pairwise_bins_by_ranker,
                             non_rand_target_info)
        marginals = self.snorkel_gm.marginals(L)
        # Pad with ones for the entries that bypassed the model.
        all_marginals = np.concatenate([
            marginals,
            np.ones(len(target_info) - len(marginals), dtype=marginals.dtype)
        ])
        return all_marginals[order]
Esempio n. 4
0
def Fitting_Gen_Model(L_train):
    """Train a generative model on ``L_train`` and compute its marginals.

    Prints the learned labeling-function accuracies and the class prior,
    then returns ``(gen_model, train_marginals)``.
    """
    gen_model = GenerativeModel()
    # The step size is scaled by the number of rows in the label matrix.
    row_count = L_train.shape[0]
    gen_model.train(L_train, epochs=100, decay=0.95,
                    step_size=0.1 / row_count, reg_param=1e-6)

    # Show what the model learned.
    print(gen_model.weights.lf_accuracy)
    print(gen_model.weights.class_prior)

    # Apply the generative model to the training candidates to obtain the
    # noise-aware training label set (the "training marginals").
    train_marginals = gen_model.marginals(L_train)
    return gen_model, train_marginals
Esempio n. 5
0
def train_snorkel_gen_model(L, gte=True):
    """Fit a Snorkel generative model on ``L`` and derive hard labels.

    Args:
        L: Label matrix; anything ``scipy.sparse.csr_matrix`` accepts.
        gte: When true, marginals at or above the threshold become +1 and
            the rest -1. When false, the polarity is inverted (marginals
            strictly below the threshold become +1).

    Returns:
        Tuple ``(gen_model, train_labels, train_marginals)``.
    """
    L_train = sparse.csr_matrix(L)

    gen_model = GenerativeModel()
    gen_model.train(L_train,
                    epochs=100,
                    decay=0.95,
                    step_size=0.01 / L_train.shape[0],
                    reg_param=1e-6)

    train_marginals = gen_model.marginals(L_train)

    # Split at the midpoint of the observed marginal range. The previous
    # formula, (max - min) / 2, is half the *width* of the range, which only
    # coincides with the midpoint when the minimum is exactly 0.
    lowest = train_marginals.min()
    highest = train_marginals.max()
    marginals_threshold = (highest + lowest) / 2

    if gte:
        is_positive_row = train_marginals >= marginals_threshold
    else:
        is_positive_row = train_marginals < marginals_threshold
    # Map the boolean mask {False, True} onto the labels {-1, +1}.
    train_labels = 2 * is_positive_row - 1

    return gen_model, train_labels, train_marginals
Esempio n. 6
0
def train_gen_model(predicate_resume, parallelism=8):
    """Train, persist and apply a generative model for one predicate.

    Loads the label matrix for ``predicate_resume``, fits a
    ``GenerativeModel`` on it using ``parallelism`` threads, saves the model
    and finally saves the training marginals. Returns nothing.
    """
    logging.info("Start train gen")
    session = SnorkelSession()
    labeler = _get_labeler(predicate_resume)

    logging.info("Load matrix")
    L_train = _load_matrix(predicate_resume, session, labeler)

    gen_model = GenerativeModel()
    logging.info("Train model")
    # The step size is scaled by the number of rows in the label matrix.
    n_rows = L_train.shape[0]
    thread_count = int(parallelism)
    gen_model.train(L_train, epochs=100, decay=0.95,
                    step_size=0.1 / n_rows, reg_param=1e-6,
                    threads=thread_count)

    logging.info("Save model")
    _save_model(predicate_resume, gen_model)

    # Compute and persist the marginals for the training matrix.
    logging.info("Get marginals")
    marginals = gen_model.marginals(L_train)
    logging.info("Save marginals")
    save_marginals(session, L_train, marginals)
Esempio n. 7
0
def score_gen_model(predicate_resume,
                    session,
                    gen_model_name=None,
                    parallelism=16):
    """Load a saved generative model, retrain it on L_train and log LF stats.

    Args:
        predicate_resume: Mapping that provides at least ``"predicate_name"``
            (when ``gen_model_name`` is omitted) and ``"label_group"``.
        session: Database session handed to the loading helpers.
        gen_model_name: Name of the saved model to load. Defaults to
            ``"G" + predicate_name + "Latest"``.
        parallelism: Accepted for interface compatibility; not used here.

    Returns:
        None.
    """
    # Previously ``model_name`` was only bound in the ``None`` case, so an
    # explicitly passed name raised NameError at ``gen_model.load`` below.
    if gen_model_name is None:
        model_name = "G" + predicate_resume["predicate_name"] + "Latest"
    else:
        model_name = gen_model_name
    logging.info("Stats logging")
    # The next two values are not used further in this function; the calls
    # are kept in case they have side effects -- TODO confirm and remove.
    key_group = predicate_resume["label_group"]
    train_cids_query = get_train_cids_with_span(predicate_resume, session)
    L_train = load_ltrain(predicate_resume, session)
    gen_model = GenerativeModel()
    gen_model.load(model_name)
    gen_model.train(L_train,
                    epochs=100,
                    decay=0.95,
                    step_size=0.1 / L_train.shape[0],
                    reg_param=1e-6)
    logging.info(gen_model.weights.lf_accuracy)
    print(gen_model.weights.lf_accuracy)
    train_marginals = gen_model.marginals(L_train)
    # The figure is only consumed by the disabled histogram code below.
    fig = plt.figure()
    #hist=plt.hist(train_marginals, bins=20)
    #plt.savefig("plt"+strftime("%d-%m-%Y_%H_%M_%S", gmtime())+".png", dpi=fig.dpi)
    gen_model.learned_lf_stats()
Esempio n. 8
0
def score_lfs(predicate_resume,
              L_gold_test,
              session,
              date_time,
              parallelism=8):
    """Evaluate the labeling functions and a generative model on the test set.

    Applies the predicate's labeling functions to the test candidates, trains
    a ``GenerativeModel`` on the resulting label matrix, scores it against
    ``L_gold_test`` and writes several CSV reports under ``./results/``.

    Args:
        predicate_resume: Mapping providing ``"predicate_name"`` and
            ``"label_group"``.
        L_gold_test: Gold labels for the test candidates.
        session: Database session handed to the Snorkel helpers.
        date_time: String appended to every report file name.
        parallelism: Degree of parallelism for applying the labeling functions.

    Returns:
        None.
    """

    def _result_path(prefix):
        # "./results/<prefix><predicate_name><date_time>.csv"
        return ("./results/" + prefix + predicate_resume["predicate_name"] +
                date_time + ".csv")

    def _write_two_row_csv(path, header, row):
        # csv.writer needs a text-mode handle on Python 3; newline='' lets
        # the csv module control line endings itself. The previous 'w+b'
        # mode made writerow() fail with TypeError.
        with open(path, 'w', newline='') as f:
            writer = csv.writer(f, delimiter=',', quoting=csv.QUOTE_MINIMAL)
            writer.writerow(header)
            writer.writerow(row)

    dump_file_path = _result_path("lfs_1_")

    key_group = predicate_resume["label_group"]
    LFs = get_labelling_functions(predicate_resume)
    labeler = LabelAnnotator(lfs=LFs)
    test_cids_query = get_test_cids_with_span(predicate_resume, session)
    L_test = labeler.apply(parallelism=parallelism,
                           cids_query=test_cids_query,
                           key_group=key_group,
                           clear=True,
                           replace_key_set=False)

    # Report 1: empirical labeling-function statistics on the test matrix.
    data_frame = L_test.lf_stats(session)
    print(data_frame)
    logging.info(data_frame)
    data_frame.to_csv(dump_file_path)

    gen_model = GenerativeModel()
    gen_model.train(L_test,
                    epochs=100,
                    decay=0.95,
                    step_size=0.1 / L_test.shape[0],
                    reg_param=1e-6)

    # Report 2: precision / recall / F1 of the generative model.
    p, r, f1 = gen_model.score(L_test, L_gold_test)
    score_line = "Prec: {0:.3f}, Recall: {1:.3f}, F1 Score: {2:.3f}".format(
        p, r, f1)
    print(score_line)
    logging.info(score_line)
    _write_two_row_csv(
        _result_path("test_gen_1_"),
        ["Precision", "Recall", "F1"],
        ["{0:.3f}".format(p), "{0:.3f}".format(r), "{0:.3f}".format(f1)])

    # Only consumed by the disabled plotting code below.
    test_marginals = gen_model.marginals(L_test)

    dump_file_path2 = _result_path("plt_1_")
    #plt.hist(test_marginals, bins=20)
    #plt.savefig(dump_file_path2)
    #plt.show()

    # Report 3: statistics the generative model learned about the LFs.
    data_frame3 = gen_model.learned_lf_stats()
    data_frame3.to_csv(_result_path("gen_2_"))

    # Report 4: sizes of the TP / FP / TN / FN buckets.
    tp, fp, tn, fn = gen_model.error_analysis(session, L_test, L_gold_test)
    _write_two_row_csv(
        _result_path("gen_3_"),
        ["TP", "FP", "TN", "FN"],
        [str(len(tp)), str(len(fp)), str(len(tn)), str(len(fn))])

    # Report 5: LF statistics against gold, alongside the learned accuracies.
    data_frame4 = L_test.lf_stats(session, L_gold_test,
                                  gen_model.learned_lf_stats()['Accuracy'])
    data_frame4.to_csv(_result_path("gen_4_"))
Esempio n. 9
0
                                  split=2)
    L_gold_test = load_gold_labels(session, annotator_name='gold', split=2)

    # Generative model
    ds = DependencySelector()
    deps = ds.select(L_train, threshold=0.1)

    gen_model = GenerativeModel()
    gen_model.train(L_train,
                    epochs=100,
                    decay=0.95,
                    step_size=0.1 / L_train.shape[0],
                    reg_param=1.00e-03,
                    deps=deps)

    train_marginals = gen_model.marginals(L_train)

    # Discriminative model
    featurizer = FeatureAnnotator(f=hybrid_span_mention_ftrs)

    F_train = featurizer.load_matrix(session, split=0)
    F_dev = featurizer.load_matrix(session, split=1)
    F_test = featurizer.load_matrix(session, split=2)

    if F_train.size == 0:
        F_train = featurizer.apply(split=0, parallelism=1)
    if F_dev.size == 0:
        F_dev = featurizer.apply_existing(split=1, parallelism=1)
    if F_test.size == 0:
        F_test = featurizer.apply_existing(split=2, parallelism=1)
# Script fragment: load a saved generative model, compute marginals for the
# evaluation label matrix and build the extractions dictionary.
# `L_eval`, `session` and `extraction_type` are not defined in this fragment;
# they must already exist when it runs.
from snorkel.learning import GenerativeModel
# Creating generative model
gen_model = GenerativeModel()

# Name and directory of the saved weights
model_name = 'Price_Gen_20K'  # this was provided when the model was saved!
save_dir = '/dfs/scratch0/jdunnmon/data/memex-data/extractor_checkpoints/Price_Gen_20K'  # this was provided when the model was saved!

# Loading the saved weights
print("Loading generative model...")
gen_model.load(model_name=model_name, save_dir=save_dir, verbose=True)

# Evaluating the generative model's marginals on the evaluation label matrix
print("Evaluating marginals...")
eval_marginals = gen_model.marginals(L_eval)

# Geocoding
from gm_utils import create_extractions_dict
# Enter a Google Maps API key to get geocodes; leave as None to just use the extracted locations
geocode_key = None
# geocode_key = '<YOUR_GOOGLE_MAPS_API_KEY>'  # never commit a real key to source control
print("Creating extractions dictionary...")
doc_extractions = create_extractions_dict(session,
                                          L_eval,
                                          eval_marginals,
                                          extractions=[extraction_type],
                                          dummy=False,
                                          geocode_key=geocode_key)

# Setting filename
Esempio n. 11
0
class SnorkelCollator(Collator):
    """Collate several annotation sources into one tagging per entry.

    Each annotation source is treated as a labeling function. A Snorkel
    ``GenerativeModel`` is trained on the word-level votes and its marginals
    are converted back into tags.
    """

    def __init__(
        self,
        positive_label: str,
        class_cardinality: int = 2,
        num_epochs: int = 500,
        log_train_every: int = 50,
        seed: int = 123,
        threshold: float = 0.5,
    ):
        """Store the configuration and create the (untrained) Snorkel objects.

        ``num_epochs``, ``log_train_every`` and ``seed`` are stored but not
        read by any method of this class.
        """
        self.positive_label = positive_label
        self.class_cardinality = class_cardinality
        self.num_epochs = num_epochs
        self.log_train_every = log_train_every
        self.seed = seed
        self.ds = DependencySelector()
        self.gen_model = GenerativeModel(lf_propensity=True)
        self.threshold = threshold

    @classmethod
    def get_snorkel_index(cls, tag: str) -> int:
        """Map a tag to 1 (positive), 0 (negative) or -1 (anything else)."""
        if is_positive(tag):
            return 1
        elif is_negative(tag):
            return 0
        else:
            return -1

    def get_tag(self, index: int) -> str:
        """Return the positive label for index 1, ``NEGATIVE_LABEL`` otherwise."""
        if index == 1:
            return self.positive_label
        else:
            return NEGATIVE_LABEL

    def get_index(self, prob: np.ndarray) -> int:
        """Return the position of the larger entry of a 2-element vector."""
        assert prob.shape == (2, )
        return prob.argmax()

    def collate_np(
        self,
        annotations,
    ) -> Tuple[np.ndarray, List[str], Dict[int, Tuple[int, int]]]:
        """Stack all annotation sources into one word-level label matrix.

        Returns:
            A tuple ``(labels, words, id_to_labels)`` where ``labels`` has
            one row per word and one column per annotation source,
            ``words`` is the flat list of words taken from the first source,
            and ``id_to_labels`` maps each entry id (also taken from the
            first source) to its ``(start, end)`` row range.
        """
        output_arrs: List[np.ndarray] = []
        words_list: List[str] = []
        id_to_labels: Dict[int, Tuple[int, int]] = {}
        num_funcs = len(annotations)
        # zip(*annotations) yields, per entry, that entry as seen by every
        # annotation source.
        for ann_inst in tqdm(zip(*annotations)):
            first = ann_inst[0]
            words = first['input']

            # output_arr = (sentence length x number of annotation sources)
            output_arr = np.zeros((len(words), num_funcs))
            # Distinct index names: the old inner loop reused ``i`` and
            # shadowed the outer loop variable.
            for func_idx, inst in enumerate(ann_inst):
                for word_idx, tag in enumerate(inst['output']):
                    output_arr[word_idx, func_idx] = (
                        SnorkelCollator.get_snorkel_index(tag))

            label_start = len(words_list)
            words_list.extend(words)
            output_arrs.append(output_arr)
            id_to_labels[first['id']] = (label_start, len(words_list))
        output_res = np.concatenate(output_arrs, axis=0)
        return output_res, words_list, id_to_labels

    def train_label_model(
        self,
        collated_labels: np.ndarray,
        descriptions: Optional[List[str]],
        train_data_np: Optional[np.ndarray],
    ):
        """Select LF dependencies and train the generative model.

        Args:
            collated_labels: Word-level label matrix (rows are words,
                columns are annotation sources).
            descriptions: Optional names of the annotation sources; when
                given, they are logged together with their column index.
            train_data_np: Accepted for interface compatibility; not used by
                this method.
        """
        sparse_labels = sparse.csr_matrix(collated_labels.astype(int))
        if descriptions is not None:
            descriptions = list(enumerate(descriptions))
            # ``Logger.warn`` is a deprecated alias of ``warning``.
            logger.warning(f'labeling function order: {descriptions}')
        deps = self.ds.select(sparse_labels, threshold=0.05)
        self.gen_model.train(
            sparse_labels,
            deps=deps,
            decay=0.95,
            step_size=0.1 / sparse_labels.shape[0],
            reg_param=0.0,
            cardinality=self.class_cardinality,
        )

    def get_probabilistic_labels(self,
                                 collated_labels: np.ndarray) -> np.ndarray:
        """Return the generative model's marginals for ``collated_labels``."""
        # Cast to int exactly as ``train_label_model`` does, so the model
        # sees the same matrix dtype at training and at inference time.
        sparse_labels = sparse.csr_matrix(collated_labels.astype(int))
        return self.gen_model.marginals(sparse_labels)

    def convert_to_tags(
        self,
        train_probs: np.ndarray,
        word_list: List[str],
        id_to_labels: Dict[int, Tuple[int, int]],
    ) -> List[AnnotatedDataType]:
        """Turn word-level marginals back into per-entry tag sequences.

        For cardinality 2 a word is positive when its marginal exceeds
        ``self.threshold``; otherwise the arg-max class is used.
        """
        output = []
        for entry_id, (label_start, label_end) in id_to_labels.items():
            words = word_list[label_start:label_end]
            prob_labels = train_probs[label_start:label_end]
            if self.class_cardinality == 2:
                # (m, ) marginals in prob labels
                label_ids = (prob_labels > self.threshold).astype(int)
            else:
                # (m, k) marginals in prob labels
                label_ids = prob_labels.argmax(axis=1)
            labels = [self.get_tag(i) for i in label_ids]
            output.append({
                'id': entry_id,
                'input': words,
                'output': labels,
            })
        return output

    def collate(
            self,
            annotations: List[AnnotatedDataType],
            should_verify: bool = False,
            descriptions: Optional[List[str]] = None,
            train_data: Optional[AnnotatedDataType] = None
    ) -> AnnotatedDataType:
        '''
        args:
            ``annotations``: List[AnnotatedDataType]
                given a series of annotations, collate them into a single
                series of annotations per instance
            ``should_verify``: bool
                when true, the annotations are first checked with
                ``Collator.verify_annotations``
            ``descriptions``: Optional[List[str]]
                names of the annotation sources, only used for logging
            ``train_data``: Optional[AnnotatedDataType]
                when given, it is flattened to an int vector and handed to
                ``train_label_model`` (which does not use it)
        '''
        if should_verify:
            # make sure the annotations are in the
            # proper format
            Collator.verify_annotations(annotations)

        train_data_np = None
        if train_data:
            # Only the label matrix of the train data is kept; its word
            # list and id mapping are discarded.
            train_data_np, _, _ = self.collate_np([train_data])
            train_data_np = train_data_np.astype(int)
            train_data_np = train_data_np.reshape(-1)
        collate_np, word_lists, id_to_labels = self.collate_np(annotations)
        self.train_label_model(collated_labels=collate_np,
                               descriptions=descriptions,
                               train_data_np=train_data_np)
        y_train_probs = self.get_probabilistic_labels(
            collated_labels=collate_np, )
        tags = self.convert_to_tags(y_train_probs,
                                    word_list=word_lists,
                                    id_to_labels=id_to_labels)
        return tags