def get_negative_labeling_function(divisor: int) -> LabelingFunction:
    """Get LabelingFunction that abstains unless x0 is divisible by divisor."""

    def f(x):
        # Vote 0 only when x0 is a multiple of `divisor` AND x1 lies at or
        # below x2 + 0.25; every other case abstains (-1).
        if x.x0 % divisor == 0 and x.x1 <= x.x2 + 0.25:
            return 0
        return -1

    return LabelingFunction(f"lf_neg_{divisor}", f)
def make_keyword_lf_NL(keywords, label=NL):
    """Build a keyword-lookup LF (default label NL) named after the first keyword."""
    lf_name = f"keyword_{keywords[0]}_NL"
    return LabelingFunction(
        name=lf_name,
        f=keyword_lookup,
        resources={"keywords": keywords, "label": label},
        # pre=[spacy_preproc]
    )
def make_thresold_lf(thresh, col_name, next_threshold=sys.maxsize):
    """LF factory: fires via `more_than_treshold` when `col_name` exceeds `thresh`.

    NOTE(review): the "thresold"/"treshold" spellings are kept intentionally —
    they match the externally defined helper and any existing callers.
    """
    resources = {
        "thresh": thresh,
        "col_name": col_name,
        "next_threshold": next_threshold,
    }
    return LabelingFunction(
        name=f"more_than_{thresh}_{col_name}",
        f=more_than_treshold,
        resources=resources,
    )
def wrapper(func):
    """Wrap `func` into a Snorkel-backed LF, tagging it with a uuid and targets.

    Relies on enclosing-scope names: `kwargs`, `targets`, `label_encoder`,
    `SnorkelLF`, `snorkel_lf`, `uuid` — presumably supplied by the outer
    decorator factory (not visible here; TODO confirm).
    """
    # set up kwargs for Snorkel's LF
    # a default name that can be overridden
    snorkel_kwargs = {"name": func.__name__}
    snorkel_kwargs.update(kwargs)
    # return value of hover's decorator
    lf = SnorkelLF(f=func, **snorkel_kwargs)
    # additional attributes
    lf.uuid = uuid.uuid1()
    # copy so later mutation of the closure's `targets` doesn't leak in
    lf.targets = targets[:]
    # link a snorkel-style labeling function if applicable
    if label_encoder:
        lf.label_encoder = label_encoder

        def snorkel_style_func(x):
            # translate `func`'s raw output through the encoder mapping
            return lf.label_encoder[func(x)]

        lf.snorkel = snorkel_lf(**kwargs)(snorkel_style_func)
    else:
        lf.label_encoder = None
        lf.snorkel = None
    return lf
def make_worker_lf(worker_id, x_id_field: str = "tweet_id") -> LabelingFunction:
    """Create an LF that replays crowd worker `worker_id`'s labels by example id."""

    def worker_lf(x, worker_dict):
        # Abstain on examples this worker never annotated.
        return worker_dict.get(x[x_id_field], ABSTAIN)

    return LabelingFunction(
        f"worker_{worker_id}",
        f=worker_lf,
        resources={"worker_dict": worker_dicts[worker_id]},
    )
def createAnalysis(final_df, category_names):
    """Summarize each category column of `final_df` as a labeling function.

    A value of 0 in a column is treated as an abstain (-1); everything else
    is kept as-is. Returns the LFAnalysis lf_summary() DataFrame.
    """
    columns = []
    for name in category_names:
        # 0 means "no vote" -> Snorkel abstain (-1)
        columns.append([-1 if v == 0 else v for v in final_df[name].tolist()])
    # Transpose so rows are examples and columns are labeling functions.
    label_matrix = np.array(columns).T
    lfs = [LabelingFunction(name=n, f=None) for n in category_names]
    return LFAnalysis(L=label_matrix, lfs=lfs).lf_summary()
def make_keyword_lf(keyword, label, neg_label, context_len, with_period):
    """LF matching `keyword` within a context window, optionally requiring a period."""
    suffix = "_period" if with_period else ""
    lf_name = "pattern_%s_context:%d%s" % (keyword, context_len, suffix)
    return LabelingFunction(
        name=lf_name,
        f=pattern_match,
        resources={
            "keyword": keyword,
            "label": label,
            "neg_label": neg_label,
            "with_period": with_period,
            "context_len": context_len,
        },
    )
def make_keyword_lf(keywords, label=1):
    """LF emitting `label` when any keyword occurs in x.text (case-insensitive)."""

    def keyword_lookup(x, keywords, label):
        text = x.text.lower()
        for word in keywords:
            if word in text:
                return label
        return -1

    return LabelingFunction(
        name=f"keyword_{keywords[0]}",
        f=keyword_lookup,
        resources={"keywords": keywords, "label": label},
    )
def make_annotator_lf(worker_index, num_annotators=sys.maxsize):
    """Build an LF replaying annotator `worker_index % num_annotators`'s labels.

    Reads the annotator's CSV (header skipped), mapping each row's last field
    to POS when column 1 is "checked", else NEG. The LF name keeps the
    original (un-modded) worker index.

    Fix: the CSV file handle was previously opened and never closed; a
    context manager now releases it deterministically.
    """
    worker_index_old = worker_index
    worker_index = worker_index % num_annotators
    path = "/home/tigunova/PycharmProjects/snorkel_labels/data/hobby/labeling_lf/%d.csv" % (worker_index)
    with open(path) as fh:
        reader = csv.reader(fh)
        next(reader)  # skip header row
        worker_dict = {
            row[-1]: (POS if row[1] == "checked" else NEG) for row in reader
        }
    return LabelingFunction(
        name="worker_%d" % (worker_index_old),
        f=worker_lf,
        resources=dict(worker_dict=worker_dict),
    )
def make_keyword_lf(keywords: List[str], label: str, field: str = "text"):
    """LF emitting `label` when any keyword appears in x[field] (lowercased)."""

    def keyword_lookup(x, keywords, label):
        haystack = x[field].lower()
        return label if any(k in haystack for k in keywords) else ABSTAIN

    return LabelingFunction(
        name=f"keyword_{keywords[0]}",
        f=keyword_lookup,
        resources={"keywords": keywords, "label": label},
    )
def make_annotator_lf(worker_index):
    """Build an LF replaying annotator `worker_index`'s profession labels.

    Reads the annotator's CSV (header skipped), mapping each row's last field
    to POS when column 1 is "checked", else ABSTAIN.

    Fix: the CSV file handle was previously opened and never closed; a
    context manager now releases it deterministically.
    """
    path = "/home/tigunova/PycharmProjects/snorkel_labels/data/profession/labeling_lf/%d.csv" % (worker_index)
    with open(path) as fh:
        reader = csv.reader(fh)
        next(reader)  # skip header row
        worker_dict = {
            row[-1]: (POS if row[1] == "checked" else ABSTAIN) for row in reader
        }
    return LabelingFunction(
        name="worker_%d" % (worker_index),
        f=worker_lf,
        resources=dict(worker_dict=worker_dict),
    )
def make_keyword_lf(keywords: list, label: int = RELEVANT):
    """Create a keyword labeling function for the application area.

    :param keywords: Keywords related to the application area; the first
        keyword names the labeling function
    :type keywords: list
    :param label: Label emitted on a keyword match, defaults to RELEVANT
    :type label: int, optional
    :return: A labeling function backed by `keyword_lookup`
    :rtype: LabelingFunction
    """
    lf_name = f"keyword_{keywords[0]}"
    return LabelingFunction(
        name=lf_name,
        f=keyword_lookup,
        resources={"keywords": keywords, "label": label},
    )
def make_keyword_lf(self, keywords: list, label: int, lf_name: str = None):
    """Generate a labeling function from a keyword list.

    :param keywords: Keywords used by the lookup; when no explicit name is
        given, the first keyword (stripped, spaces replaced by underscores)
        names the labeling function
    :type keywords: list
    :param label: The label to assign to the labeling function
    :type label: int
    :param lf_name: A unique name for the labeling function.
        Fix: this parameter was previously accepted but silently ignored;
        it now overrides the generated name when provided.
    :type lf_name: str, optional
    :return: returns a labeling function which implements `keyword_lookup`
    :rtype: LabelingFunction
    """
    labeling_function_name = (
        lf_name if lf_name else f"keyword_{re.sub(' ', '_', keywords[0].strip())}"
    )
    return LabelingFunction(name=labeling_function_name,
                            f=self.keyword_lookup,
                            resources=dict(keywords=keywords, label=label))
def test_lf_summary(self) -> None:
    """lf_summary with gold labels, without labels, and with learned weights."""
    # Case 1: gold labels supplied -> summary includes Correct / Incorrect /
    # Emp. Acc. alongside the label-free statistics.
    df = self.lfa.lf_summary(self.Y, est_weights=None)
    df_expected = pd.DataFrame(
        {
            "Polarity": [[1, 2], [], [0, 2], [2], [0, 1], [0]],
            "Coverage": [3 / 6, 0, 3 / 6, 2 / 6, 2 / 6, 4 / 6],
            "Overlaps": [3 / 6, 0, 3 / 6, 1 / 6, 2 / 6, 4 / 6],
            "Conflicts": [3 / 6, 0, 2 / 6, 1 / 6, 2 / 6, 3 / 6],
            "Correct": [1, 0, 1, 1, 1, 2],
            "Incorrect": [2, 0, 2, 1, 1, 2],
            "Emp. Acc.": [1 / 3, 0, 1 / 3, 1 / 2, 1 / 2, 2 / 4],
        }
    )
    # round(6) on both sides tolerates float representation noise
    pd.testing.assert_frame_equal(df.round(6), df_expected.round(6))
    # Case 2: no gold labels -> only the label-free statistics appear.
    df = self.lfa.lf_summary(Y=None, est_weights=None)
    df_expected = pd.DataFrame(
        {
            "Polarity": [[1, 2], [], [0, 2], [2], [0, 1], [0]],
            "Coverage": [3 / 6, 0, 3 / 6, 2 / 6, 2 / 6, 4 / 6],
            "Overlaps": [3 / 6, 0, 3 / 6, 1 / 6, 2 / 6, 4 / 6],
            "Conflicts": [3 / 6, 0, 2 / 6, 1 / 6, 2 / 6, 3 / 6],
        }
    )
    pd.testing.assert_frame_equal(df.round(6), df_expected.round(6))
    # Case 3: named LFs + est_weights -> adds the "j" index column and a
    # "Learned Weight" column, with LF names as the frame's index.
    est_weights = [1, 0, 1, 1, 1, 0.5]
    names = list("abcdef")
    lfs = [LabelingFunction(s, f) for s in names]
    lfa = LFAnalysis(np.array(L), lfs)
    df = lfa.lf_summary(self.Y, est_weights=est_weights)
    df_expected = pd.DataFrame(
        {
            "j": [0, 1, 2, 3, 4, 5],
            "Polarity": [[1, 2], [], [0, 2], [2], [0, 1], [0]],
            "Coverage": [3 / 6, 0, 3 / 6, 2 / 6, 2 / 6, 4 / 6],
            "Overlaps": [3 / 6, 0, 3 / 6, 1 / 6, 2 / 6, 4 / 6],
            "Conflicts": [3 / 6, 0, 2 / 6, 1 / 6, 2 / 6, 3 / 6],
            "Correct": [1, 0, 1, 1, 1, 2],
            "Incorrect": [2, 0, 2, 1, 1, 2],
            "Emp. Acc.": [1 / 3, 0, 1 / 3, 1 / 2, 1 / 2, 2 / 4],
            "Learned Weight": [1, 0, 1, 1, 1, 0.5],
        }
    ).set_index(pd.Index(names))
    pd.testing.assert_frame_equal(df.round(6), df_expected.round(6))
def get_lfs(keywords):
    """Build sentence-context LFs from a {label: [keyword, ...]} mapping.

    Each keyword is parsed via `parse_kw`; its 'first' field ('B', 'I', or
    'A') selects the display-name template. Returns (lfs, idx_label_map)
    where idx_label_map maps each LF's integer label index back to its
    label string.

    Fix: an unrecognized `parsed_kw['first']` value previously fell through
    the if/elif chain and raised NameError on `name`; it now raises an
    explicit ValueError.
    """
    lfs = []
    idx_label_map = {}
    for i, (label, kws) in enumerate(keywords.items()):
        idx_label_map[i] = label
        for kw in kws:
            parsed_kw = parse_kw(kw)
            first = parsed_kw['first']
            if first == 'B':
                name = parsed_kw['text'] + f'...[{label}]'
            elif first == 'I':
                name = f'[{parsed_kw["text"]}...{label}]'
            elif first == 'A':
                name = f'[{label}]...{parsed_kw["text"]}'
            else:
                raise ValueError(f"Unknown keyword position marker: {first!r}")
            lfs.append(
                LabelingFunction(name=name,
                                 f=sent_context_lf,
                                 resources=dict(label=i, parsed_kw=parsed_kw)))
    return lfs, idx_label_map
def pos(sample): return 1 if re.search(PWORDS, str(sample)) else -1 NWORDS = r"\b(hell yeah|bribery|not happy|less moral|impeach trump|impeach our president)" def neg(sample): return 0 if re.search(NWORDS, str(sample)) else -1 # In[8]: positive = LabelingFunction(f"positive", f=pos) # In[9]: negative = LabelingFunction(f"negative", f=neg) # In[10]: df_train = dt[0:450000] # In[11]: # Create the labeling functions using the textblob sentiment analyzer @preprocessor(memoize=True) def textblob_polarity(x):
def make_keyword_lf(keywords, label=SPAM):
    """Build a keyword-lookup LF (default label SPAM) named after the first keyword."""
    resources = {"keywords": keywords, "label": label}
    return LabelingFunction(
        name=f"keyword_{keywords[0]}",
        f=keyword_lookup,
        resources=resources,
    )
def test_wrong_number_of_lfs(self) -> None:
    """A mismatch between LF count and label-matrix width raises ValueError."""
    with self.assertRaisesRegex(ValueError, "Number of LFs"):
        two_lfs = [LabelingFunction(name, f) for name in ("a", "b")]
        LFAnalysis(np.array(L), two_lfs)
def make_keyword_lf(lf_name, keywords, label=IRRELEVANT):
    """Build a regex-keyword LF with an explicit name (default label IRRELEVANT)."""
    resources = {"keywords": keywords, "label": label}
    return LabelingFunction(name=lf_name, f=regex_keyword_lookup, resources=resources)
def make_abstract_lf(keywords, name, label=None):
    """Build an abstract-text lookup LF named ``abstract_<name>``."""
    resources = {"keywords": keywords, "label": label}
    return LabelingFunction(name=f"abstract_{name}", f=abstract_lookup, resources=resources)
def make_keyword_lf(keywords, virus, name, label=None):
    """Build a virus-specific keyword-lookup LF named ``keyword_<name>``."""
    resources = {"keywords": keywords, "virus": virus, "label": label}
    return LabelingFunction(name=f"keyword_{name}", f=keyword_lookup, resources=resources)
def make_lfs_list(post_hoc_callables, GI_callables, rule_out_callables, lab_callables):
    """Wrap every provided callable in a LabelingFunction named after it.

    Groups are concatenated in the original append order — post-hoc,
    rule-out, GI, then lab — which intentionally differs from the parameter
    order (rule-out precedes GI). A large block of commented-out
    experimental LF-combination logic was removed as dead code.
    """
    plain_lfs = [
        *post_hoc_callables,
        *rule_out_callables,  # rule-out before GI, matching the original order
        *GI_callables,
        *lab_callables,
    ]
    return [LabelingFunction(name=fn.__name__, f=fn) for fn in plain_lfs]
def make_lexicon_lf(thresh, pref="", previous_threshold=-sys.maxsize):
    """LF firing via `less_than_treshold` when a value is below `thresh`.

    NOTE(review): "treshold" spelling kept — it matches the externally
    defined helper.
    """
    resources = {"thresh": thresh, "previous_threshold": previous_threshold}
    return LabelingFunction(
        name=f"{pref}_less_{thresh}",
        f=less_than_treshold,
        resources=resources,
    )
def make_keyword_lf(name, keywords_pos, keywords_neg):
    """Build a keyword-lookup LF with separate positive and negative keyword lists."""
    resources = {"keywords_pos": keywords_pos, "keywords_neg": keywords_neg}
    return LabelingFunction(name=name, f=keyword_lookup, resources=resources)
def make_expression_lf(name, pre_pos, expression):
    """Build an expression-lookup LF from a POS prefix and an expression to match."""
    resources = {"pre_pos": pre_pos, "expression": expression}
    return LabelingFunction(name=name, f=expression_lookup, resources=resources)
def make_worker_lf(worker_id):
    """Build an LF replaying crowd worker `worker_id`'s labels from `worker_dicts`."""
    return LabelingFunction(
        f"worker_{worker_id}",
        f=worker_lf,
        resources={"worker_dict": worker_dicts[worker_id]},
    )