def do_job_unit(self, event, corpus, unit, **kwargs):
    assert unit == 0
    extractor = kwargs.get('extractor', "goose")
    thresh = kwargs.get('thresh', .8)
    delay = kwargs.get('delay', None)
    topk = kwargs.get('topk', 20)

    # Train on every other event (event 7 is always excluded).
    train_events = [e for e in cuttsum.events.get_events()
                    if e.query_num not in set([event.query_num, 7])]
    res = InputStreamResource()

    y = []
    X = []
    for train_event in train_events:
        y_e = []
        X_e = []
        istream = res.get_dataframes(
            train_event, cuttsum.corpora.get_raw_corpus(train_event),
            extractor, thresh, delay, topk)
        for df in istream:
            # Sentences judged with full confidence but missing nugget
            # probabilities get probability 1 for each matched nugget.
            selector = (df["n conf"] == 1) & (df["nugget probs"].apply(len) == 0)
            df.loc[selector, "nugget probs"] = df.loc[selector, "nuggets"].apply(
                lambda x: {n: 1 for n in x})

            # Regression target is the max nugget probability, 0 when the
            # sentence matches no nuggets.
            df["probs"] = df["nugget probs"].apply(
                lambda x: [val for key, val in x.items()] + [0])
            df["probs"] = df["probs"].apply(lambda x: np.max(x))
            df.loc[(df["n conf"] == 1) & (df["nuggets"].apply(len) == 0),
                   "probs"] = 0

            y_t = df["probs"].values
            y_t = y_t[:, np.newaxis]
            y_e.append(y_t)
            X_t = df[self.cols].values
            X_e.append(X_t)

        y.append(np.vstack(y_e))
        X.append(np.vstack(X_e))

    X = np.vstack(X)
    y = np.vstack(y)

    gbc = GradientBoostingRegressor(
        n_estimators=100, learning_rate=1., max_depth=3, random_state=0)
    print "fitting", event
    gbc.fit(X, y.ravel())
    print event, "SCORE", gbc.score(X, y.ravel())

    model_dir = self.get_model_dir(event)
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    joblib.dump(gbc, self.get_model_path(event), compress=9)
def get_input_stream(event, gold_probs, extractor="goose", thresh=.8, delay=None, topk=20): max_nuggets = 3 corpus = cuttsum.corpora.FilteredTS2015() res = InputStreamResource() df = pd.concat( res.get_dataframes(event, corpus, extractor, thresh, delay, topk)) selector = (df["n conf"] == 1) & (df["nugget probs"].apply(len) == 0) df.loc[selector, "nugget probs"] = df.loc[selector, "nuggets"].apply( lambda x: {n: 1 for n in x}) df["true probs"] = df["nugget probs"].apply( lambda x: [val for key, val in x.items()] + [0]) df["true probs"] = df["true probs"].apply(lambda x: np.max(x)) df.loc[(df["n conf"] == 1) & (df["nuggets"].apply(len) == 0), "true probs"] = 0 if gold_probs is True: df["probs"] = df["true probs"] else: df["probs"] = NuggetRegressor().predict(event, df) df["nuggets"] = df["nugget probs"].apply( lambda x: set([key for key, val in x.items() if val > .9])) nid2time = {} nids = set(matches_df[matches_df["query id"] == event.query_id] ["nugget id"].tolist()) for nid in nids: ts = matches_df[matches_df["nugget id"] == nid]["update id"].apply( lambda x: int(x.split("-")[0])).tolist() ts.sort() nid2time[nid] = ts[0] fltr_nuggets = [] for name, row in df.iterrows(): fltr_nuggets.append( set([ nug for nug in row["nuggets"] if nid2time[nug] <= row["timestamp"] ])) #print df[["nuggets", "timestamp"]].apply(lambda y: print y[0]) # datetime.utcfromtimestamp(int(y["timestamp"]))) #print nids df["nuggets"] = fltr_nuggets df["nuggets"] = df["nuggets"].apply(lambda x: x if len(x) <= max_nuggets else set([])) return df
def get_input_stream(event, gold_probs, extractor="goose", thresh=.8, delay=None, topk=20, max_nuggets=None, is_filter=False): corpus = cuttsum.corpora.get_raw_corpus(event) res = InputStreamResource() df = pd.concat( res.get_dataframes(event, corpus, extractor, thresh, delay, topk)) selector = (df["n conf"] == 1) & (df["nugget probs"].apply(len) == 0) df.loc[selector, "nugget probs"] = df.loc[selector, "nuggets"].apply(lambda x: {n:1 for n in x}) df["true probs"] = df["nugget probs"].apply(lambda x: [val for key, val in x.items()] +[0]) df["true probs"] = df["true probs"].apply(lambda x: np.max(x)) df.loc[(df["n conf"] == 1) & (df["nuggets"].apply(len) == 0), "true probs"] = 0 if gold_probs is True: df["probs"] = df["true probs"] else: df["probs"] = NuggetRegressor().predict(event, df) df["nuggets"] = df["nugget probs"].apply( lambda x: set([key for key, val in x.items() if val > .97])) if max_nuggets is not None: def sortme(x): l = [(key, val) for key, val in x.items() if val > .5] sorted(l, key=lambda y: y[1], reverse=True) return [k for k,v in l[:max_nuggets]] df["nuggets"] = df["nuggets"].apply(lambda x: x if len(x) <= max_nuggets else set([])) #df["nuggets"] = df["nugget probs"].apply(sortme) if is_filter: nid2time = {} nids = set(matches_df[matches_df["query id"] == event.query_id]["nugget id"].tolist()) for nid in nids: ts = matches_df[matches_df["nugget id"] == nid]["update id"].apply(lambda x: int(x.split("-")[0])).tolist() ts.sort() nid2time[nid] = ts[0] #tss = nuggets[nuggets["query id"] == event.query_id]["timestamp"].tolist() #ids = nuggets[nuggets["query id"] == event.query_id]["nugget id"].tolist() #nt = {nid: ts for ts, nid in zip(tss, ids)} fltr_nuggets = [] for name, row in df.iterrows(): fltr_nuggets.append( set([nug for nug in row["nuggets"] if nid2time[nug] <= row["timestamp"]])) #print df[["nuggets", "timestamp"]].apply(lambda y: print y[0]) # datetime.utcfromtimestamp(int(y["timestamp"]))) #print nids df["nuggets"] = fltr_nuggets return df
def get_input_stream(event, gold_probs, extractor="goose", thresh=.8, delay=None, topk=20): max_nuggets = 3 corpus = cuttsum.corpora.get_raw_corpus(event) res = InputStreamResource() df = pd.concat( res.get_dataframes(event, corpus, extractor, thresh, delay, topk)) selector = (df["n conf"] == 1) & (df["nugget probs"].apply(len) == 0) df.loc[selector, "nugget probs"] = df.loc[selector, "nuggets"].apply(lambda x: {n:1 for n in x}) df["true probs"] = df["nugget probs"].apply(lambda x: [val for key, val in x.items()] +[0]) df["true probs"] = df["true probs"].apply(lambda x: np.max(x)) df.loc[(df["n conf"] == 1) & (df["nuggets"].apply(len) == 0), "true probs"] = 0 if gold_probs is True: df["probs"] = df["true probs"] else: df["probs"] = NuggetRegressor().predict(event, df) df["nuggets"] = df["nugget probs"].apply( lambda x: set([key for key, val in x.items() if val > .9])) nid2time = {} nids = set(matches_df[matches_df["query id"] == event.query_id]["nugget id"].tolist()) for nid in nids: ts = matches_df[matches_df["nugget id"] == nid]["update id"].apply(lambda x: int(x.split("-")[0])).tolist() ts.sort() nid2time[nid] = ts[0] fltr_nuggets = [] for name, row in df.iterrows(): fltr_nuggets.append( set([nug for nug in row["nuggets"] if nid2time[nug] <= row["timestamp"]])) #print df[["nuggets", "timestamp"]].apply(lambda y: print y[0]) # datetime.utcfromtimestamp(int(y["timestamp"]))) #print nids df["nuggets"] = fltr_nuggets df["nuggets"] = df["nuggets"].apply(lambda x: x if len(x) <= max_nuggets else set([])) return df
def get_input_stream(event, gold_probs, extractor="goose", thresh=.8, delay=None, topk=20, use_2015F=False, truncate=False): max_nuggets = 3 corpus = cuttsum.corpora.get_raw_corpus(event) if use_2015F is True and event.query_num > 25: corpus = cuttsum.corpora.FilteredTS2015() print event, corpus res = InputStreamResource() df = pd.concat( res.get_dataframes(event, corpus, extractor, thresh, delay, topk)) selector = (df["n conf"] == 1) & (df["nugget probs"].apply(len) == 0) df.loc[selector, "nugget probs"] = df.loc[selector, "nuggets"].apply( lambda x: {n: 1 for n in x}) df["true probs"] = df["nugget probs"].apply( lambda x: [val for key, val in x.items()] + [0]) df["true probs"] = df["true probs"].apply(lambda x: np.max(x)) df.loc[(df["n conf"] == 1) & (df["nuggets"].apply(len) == 0), "true probs"] = 0 if gold_probs is True: df["probs"] = df["true probs"] else: df["probs"] = NuggetRegressor().predict(event, df) df["nuggets"] = df["nugget probs"].apply( lambda x: set([key for key, val in x.items() if val > .9])) nid2time = {} nids = set(matches_df[matches_df["query id"] == event.query_id] ["nugget id"].tolist()) for nid in nids: ts = matches_df[matches_df["nugget id"] == nid]["update id"].apply( lambda x: int(x.split("-")[0])).tolist() ts.sort() nid2time[nid] = ts[0] fltr_nuggets = [] for name, row in df.iterrows(): fltr_nuggets.append( set([ nug for nug in row["nuggets"] if nid2time[nug] <= row["timestamp"] ])) #print df[["nuggets", "timestamp"]].apply(lambda y: print y[0]) # datetime.utcfromtimestamp(int(y["timestamp"]))) #print nids df["nuggets"] = fltr_nuggets df["nuggets"] = df["nuggets"].apply(lambda x: x if len(x) <= max_nuggets else set([])) from cuttsum.pipeline import DedupedArticlesResource ded = DedupedArticlesResource() stats_df = ded.get_stats_df(event, corpus, extractor, thresh) stats_df["stream ids"] = stats_df["stream ids"].apply( lambda x: set(eval(x))) sid2match = {} for _, row in stats_df.iterrows(): for sid in row["stream ids"]: sid2match[sid] = row["match"] all_ts = [] all_docs = [] new_docs = [] for (sid, ts), doc in df.groupby(["stream id", "timestamp"]): #if truncate is True: doc = doc.iloc[0:20] # print sub_doc if len(all_ts) > 0: assert ts >= all_ts[-1] all_ts.append(ts) if sid2match[sid] is True: new_docs.append(doc) all_docs.append(doc) df = pd.concat(new_docs) print len(all_docs), len(new_docs) return df
def get_input_stream( event, gold_probs, extractor="goose", thresh=0.8, delay=None, topk=20, use_2015F=False, truncate=False ): max_nuggets = 3 corpus = cuttsum.corpora.get_raw_corpus(event) if use_2015F is True and event.query_num > 25: corpus = cuttsum.corpora.FilteredTS2015() print event, corpus res = InputStreamResource() df = pd.concat(res.get_dataframes(event, corpus, extractor, thresh, delay, topk)) selector = (df["n conf"] == 1) & (df["nugget probs"].apply(len) == 0) df.loc[selector, "nugget probs"] = df.loc[selector, "nuggets"].apply(lambda x: {n: 1 for n in x}) df["true probs"] = df["nugget probs"].apply(lambda x: [val for key, val in x.items()] + [0]) df["true probs"] = df["true probs"].apply(lambda x: np.max(x)) df.loc[(df["n conf"] == 1) & (df["nuggets"].apply(len) == 0), "true probs"] = 0 if gold_probs is True: df["probs"] = df["true probs"] else: df["probs"] = NuggetRegressor().predict(event, df) df["nuggets"] = df["nugget probs"].apply(lambda x: set([key for key, val in x.items() if val > 0.9])) nid2time = {} nids = set(matches_df[matches_df["query id"] == event.query_id]["nugget id"].tolist()) for nid in nids: ts = matches_df[matches_df["nugget id"] == nid]["update id"].apply(lambda x: int(x.split("-")[0])).tolist() ts.sort() nid2time[nid] = ts[0] fltr_nuggets = [] for name, row in df.iterrows(): fltr_nuggets.append(set([nug for nug in row["nuggets"] if nid2time[nug] <= row["timestamp"]])) # print df[["nuggets", "timestamp"]].apply(lambda y: print y[0]) # datetime.utcfromtimestamp(int(y["timestamp"]))) # print nids df["nuggets"] = fltr_nuggets df["nuggets"] = df["nuggets"].apply(lambda x: x if len(x) <= max_nuggets else set([])) from cuttsum.pipeline import DedupedArticlesResource ded = DedupedArticlesResource() stats_df = ded.get_stats_df(event, corpus, extractor, thresh) stats_df["stream ids"] = stats_df["stream ids"].apply(lambda x: set(eval(x))) sid2match = {} for _, row in stats_df.iterrows(): for sid in row["stream ids"]: sid2match[sid] = row["match"] all_ts = [] all_docs = [] new_docs = [] for (sid, ts), doc in df.groupby(["stream id", "timestamp"]): if truncate is True: doc = doc.iloc[0:5] # print sub_doc if len(all_ts) > 0: assert ts >= all_ts[-1] all_ts.append(ts) if sid2match[sid] is True: new_docs.append(doc) all_docs.append(doc) df = pd.concat(new_docs) print len(all_docs), len(new_docs) return df
def get_input_stream(event, extractor="goose", thresh=0.8, delay=None, topk=20): corpus = cuttsum.corpora.get_raw_corpus(event) res = InputStreamResource() return res.get_dataframes(event, corpus, extractor, thresh, delay, topk)
def get_input_stream(event, gold_probs, extractor="goose", thresh=.8, delay=None, topk=20, max_nuggets=None, is_filter=False): corpus = cuttsum.corpora.get_raw_corpus(event) res = InputStreamResource() df = pd.concat( res.get_dataframes(event, corpus, extractor, thresh, delay, topk)) selector = (df["n conf"] == 1) & (df["nugget probs"].apply(len) == 0) df.loc[selector, "nugget probs"] = df.loc[selector, "nuggets"].apply( lambda x: {n: 1 for n in x}) df["true probs"] = df["nugget probs"].apply( lambda x: [val for key, val in x.items()] + [0]) df["true probs"] = df["true probs"].apply(lambda x: np.max(x)) df.loc[(df["n conf"] == 1) & (df["nuggets"].apply(len) == 0), "true probs"] = 0 if gold_probs is True: df["probs"] = df["true probs"] else: df["probs"] = NuggetRegressor().predict(event, df) df["nuggets"] = df["nugget probs"].apply( lambda x: set([key for key, val in x.items() if val > .97])) if max_nuggets is not None: def sortme(x): l = [(key, val) for key, val in x.items() if val > .5] sorted(l, key=lambda y: y[1], reverse=True) return [k for k, v in l[:max_nuggets]] df["nuggets"] = df["nuggets"].apply( lambda x: x if len(x) <= max_nuggets else set([])) #df["nuggets"] = df["nugget probs"].apply(sortme) if is_filter: nid2time = {} nids = set(matches_df[matches_df["query id"] == event.query_id] ["nugget id"].tolist()) for nid in nids: ts = matches_df[matches_df["nugget id"] == nid]["update id"].apply( lambda x: int(x.split("-")[0])).tolist() ts.sort() nid2time[nid] = ts[0] #tss = nuggets[nuggets["query id"] == event.query_id]["timestamp"].tolist() #ids = nuggets[nuggets["query id"] == event.query_id]["nugget id"].tolist() #nt = {nid: ts for ts, nid in zip(tss, ids)} fltr_nuggets = [] for name, row in df.iterrows(): fltr_nuggets.append( set([ nug for nug in row["nuggets"] if nid2time[nug] <= row["timestamp"] ])) #print df[["nuggets", "timestamp"]].apply(lambda y: print y[0]) # datetime.utcfromtimestamp(int(y["timestamp"]))) #print nids df["nuggets"] = fltr_nuggets return df
def main(learner, training_ids, test_ids, sample_size, n_iters, report_dir_base): extractor = "goose" topk = 20 delay = None threshold = .8 res = InputStreamResource() events = [e for e in cuttsum.events.get_events() if e.query_num in training_ids or e.query_num in test_ids] training_insts = [] test_insts = [] for event in events: print "Loading event", event.fs_name() corpus = cuttsum.corpora.get_raw_corpus(event) # A list of dataframes. Each dataframe is a document with =< 20 sentences. # This is the events document stream. dataframes = res.get_dataframes(event, corpus, extractor, threshold, delay, topk) if event.query_num in training_ids: training_insts.append((event, dataframes)) if event.query_num in test_ids: test_insts.append((event, dataframes)) # Init l2s task. vw = pyvw.vw("--search 0 --csoaa_ldf m --search_task hook --ring_size 1024 --quiet --search_no_caching") #task = vw.init_search_task(UpdateSummarizer) if learner == "PerfectOracle": task = vw.init_search_task(PerfectOracle) elif learner == "LessPerfectOracle": task = vw.init_search_task(LessPerfectOracle) elif learner == "SelectLexNextOracle": task = vw.init_search_task(SelectLexNextOracle) elif learner == "SelectLexNextLex": task = vw.init_search_task(SelectLexNextLex) elif learner == "SelectLexNextLexCache": task = vw.init_search_task(SelectLexNextLexCache) elif learner == "SelectLexGenericNextOracle": task = vw.init_search_task(SelectLexGenericNextOracle) elif learner == "SelectBasicNextBias": task = vw.init_search_task(SelectBasicNextBias) elif learner == "SelectBasicNextBiasDocAvg": task = vw.init_search_task(SelectBasicNextBiasDocAvg) for n_iter in range(n_iters): print "iter", n_iter + 1 ds = downsample(training_insts, size=sample_size) task.learn(ds) all_train_df = [df for inst in training_insts for df in inst[1]] feature_weights = task.get_feature_weights(all_train_df) write_model(feature_weights, report_dir_base, n_iter) for event, dataframes in training_insts: # Predict a sequence for this training examples and see if it is sensible. print "PREDICTING", event.fs_name() sequence, scores = task.predict_with_scores((event, dataframes)) print sequence make_report(event, dataframes, sequence, scores, "train", n_iter, report_dir_base) for event, dataframes in test_insts: # Predict a sequence for this training examples and see if it is sensible. print "PREDICTING", event.fs_name() sequence, scores = task.predict_with_scores((event, dataframes)) print sequence make_report(event, dataframes, sequence, scores, "test", n_iter, report_dir_base)
def main(learner, training_ids, test_ids, sample_size, n_iters, report_dir_base): extractor = "goose" topk = 20 delay = None threshold = .8 res = InputStreamResource() events = [ e for e in cuttsum.events.get_events() if e.query_num in training_ids or e.query_num in test_ids ] training_insts = [] test_insts = [] for event in events: print "Loading event", event.fs_name() corpus = cuttsum.corpora.get_raw_corpus(event) # A list of dataframes. Each dataframe is a document with =< 20 sentences. # This is the events document stream. dataframes = res.get_dataframes(event, corpus, extractor, threshold, delay, topk) if event.query_num in training_ids: training_insts.append((event, dataframes)) if event.query_num in test_ids: test_insts.append((event, dataframes)) # Init l2s task. vw = pyvw.vw( "--search 0 --csoaa_ldf m --search_task hook --ring_size 1024 --quiet --search_no_caching" ) #task = vw.init_search_task(UpdateSummarizer) if learner == "PerfectOracle": task = vw.init_search_task(PerfectOracle) elif learner == "LessPerfectOracle": task = vw.init_search_task(LessPerfectOracle) elif learner == "SelectLexNextOracle": task = vw.init_search_task(SelectLexNextOracle) elif learner == "SelectLexNextLex": task = vw.init_search_task(SelectLexNextLex) elif learner == "SelectLexNextLexCache": task = vw.init_search_task(SelectLexNextLexCache) elif learner == "SelectLexGenericNextOracle": task = vw.init_search_task(SelectLexGenericNextOracle) elif learner == "SelectBasicNextBias": task = vw.init_search_task(SelectBasicNextBias) elif learner == "SelectBasicNextBiasDocAvg": task = vw.init_search_task(SelectBasicNextBiasDocAvg) for n_iter in range(n_iters): print "iter", n_iter + 1 ds = downsample(training_insts, size=sample_size) task.learn(ds) all_train_df = [df for inst in training_insts for df in inst[1]] feature_weights = task.get_feature_weights(all_train_df) write_model(feature_weights, report_dir_base, n_iter) for event, dataframes in training_insts: # Predict a sequence for this training examples and see if it is sensible. print "PREDICTING", event.fs_name() sequence, scores = task.predict_with_scores((event, dataframes)) print sequence make_report(event, dataframes, sequence, scores, "train", n_iter, report_dir_base) for event, dataframes in test_insts: # Predict a sequence for this training examples and see if it is sensible. print "PREDICTING", event.fs_name() sequence, scores = task.predict_with_scores((event, dataframes)) print sequence make_report(event, dataframes, sequence, scores, "test", n_iter, report_dir_base)
def get_input_stream(event, extractor="goose", thresh=.8, delay=None, topk=20): corpus = cuttsum.corpora.get_raw_corpus(event) res = InputStreamResource() return res.get_dataframes(event, corpus, extractor, thresh, delay, topk)