def get_input_stream(event, gold_probs, extractor="goose", thresh=.8,
                     delay=None, topk=20):
    """Load the input-stream dataframe for `event` and derive its
    probability and nugget columns.

    Frames are produced by `InputStreamResource.get_dataframes` against a
    `FilteredTS2015` corpus and concatenated into a single dataframe.

    Columns written:
      * `nugget probs` -- rows with `n conf` == 1 and an empty mapping are
        given a mapping that assigns 1 to each of the row's nuggets.
      * `true probs` -- maximum over the row's `nugget probs` values
        together with 0; forced to 0 where `n conf` == 1 and the row has
        no nuggets.
      * `probs` -- `true probs` when `gold_probs is True`, otherwise the
        output of `NuggetRegressor().predict(event, df)`.
      * `nuggets` -- ids whose `nugget probs` value is above .9, keeping
        only ids whose earliest matched update id (leading integer field)
        is not later than the row's `timestamp`; a row left with more
        than 3 ids gets an empty set.

    Reads the module-level `matches_df`.

    Args:
        event: object exposing `query_id`; passed through to the resource
            and the regressor.
        gold_probs: only the literal `True` selects the gold probabilities.
        extractor, thresh, delay, topk: forwarded unchanged to
            `get_dataframes`.

    Returns:
        The concatenated dataframe with the columns above updated.
    """
    nugget_cap = 3
    ts2015 = cuttsum.corpora.FilteredTS2015()
    stream_res = InputStreamResource()
    frames = stream_res.get_dataframes(
        event, ts2015, extractor, thresh, delay, topk)
    df = pd.concat(frames)

    # Backfill missing probabilities for rows with "n conf" == 1.
    full_conf = df["n conf"] == 1
    backfill = full_conf & (df["nugget probs"].apply(len) == 0)
    df.loc[backfill, "nugget probs"] = df.loc[backfill, "nuggets"].apply(
        lambda nugs: {nug: 1 for nug in nugs})

    # The appended 0 keeps np.max defined for empty mappings.
    df["true probs"] = df["nugget probs"].apply(
        lambda probs: np.max(list(probs.values()) + [0]))
    no_nuggets = df["nuggets"].apply(len) == 0
    df.loc[full_conf & no_nuggets, "true probs"] = 0

    df["probs"] = (
        df["true probs"] if gold_probs is True
        else NuggetRegressor().predict(event, df))

    df["nuggets"] = df["nugget probs"].apply(
        lambda probs: {nid for nid, p in probs.items() if p > .9})

    # Earliest leading-integer field among the update ids matched to each
    # nugget of this query.
    event_matches = matches_df[matches_df["query id"] == event.query_id]
    first_seen = {}
    for nid in set(event_matches["nugget id"].tolist()):
        update_ids = matches_df[matches_df["nugget id"] == nid]["update id"]
        stamps = update_ids.apply(
            lambda uid: int(uid.split("-")[0])).tolist()
        first_seen[nid] = sorted(stamps)[0]

    # Drop nuggets whose first match lies after the row's timestamp.
    visible = [
        {nug for nug in row["nuggets"]
         if first_seen[nug] <= row["timestamp"]}
        for _, row in df.iterrows()
    ]
    df["nuggets"] = visible
    df["nuggets"] = df["nuggets"].apply(
        lambda nugs: nugs if len(nugs) <= nugget_cap else set())
    return df
def get_input_stream(event, gold_probs, extractor="goose", thresh=.8, delay=None, topk=20, use_2015F=False, truncate=False): max_nuggets = 3 corpus = cuttsum.corpora.get_raw_corpus(event) if use_2015F is True and event.query_num > 25: corpus = cuttsum.corpora.FilteredTS2015() print event, corpus res = InputStreamResource() df = pd.concat( res.get_dataframes(event, corpus, extractor, thresh, delay, topk)) selector = (df["n conf"] == 1) & (df["nugget probs"].apply(len) == 0) df.loc[selector, "nugget probs"] = df.loc[selector, "nuggets"].apply( lambda x: {n: 1 for n in x}) df["true probs"] = df["nugget probs"].apply( lambda x: [val for key, val in x.items()] + [0]) df["true probs"] = df["true probs"].apply(lambda x: np.max(x)) df.loc[(df["n conf"] == 1) & (df["nuggets"].apply(len) == 0), "true probs"] = 0 if gold_probs is True: df["probs"] = df["true probs"] else: df["probs"] = NuggetRegressor().predict(event, df) df["nuggets"] = df["nugget probs"].apply( lambda x: set([key for key, val in x.items() if val > .9])) nid2time = {} nids = set(matches_df[matches_df["query id"] == event.query_id] ["nugget id"].tolist()) for nid in nids: ts = matches_df[matches_df["nugget id"] == nid]["update id"].apply( lambda x: int(x.split("-")[0])).tolist() ts.sort() nid2time[nid] = ts[0] fltr_nuggets = [] for name, row in df.iterrows(): fltr_nuggets.append( set([ nug for nug in row["nuggets"] if nid2time[nug] <= row["timestamp"] ])) #print df[["nuggets", "timestamp"]].apply(lambda y: print y[0]) # datetime.utcfromtimestamp(int(y["timestamp"]))) #print nids df["nuggets"] = fltr_nuggets df["nuggets"] = df["nuggets"].apply(lambda x: x if len(x) <= max_nuggets else set([])) from cuttsum.pipeline import DedupedArticlesResource ded = DedupedArticlesResource() stats_df = ded.get_stats_df(event, corpus, extractor, thresh) stats_df["stream ids"] = stats_df["stream ids"].apply( lambda x: set(eval(x))) sid2match = {} for _, row in stats_df.iterrows(): for sid in row["stream ids"]: sid2match[sid] = 
row["match"] all_ts = [] all_docs = [] new_docs = [] for (sid, ts), doc in df.groupby(["stream id", "timestamp"]): #if truncate is True: doc = doc.iloc[0:20] # print sub_doc if len(all_ts) > 0: assert ts >= all_ts[-1] all_ts.append(ts) if sid2match[sid] is True: new_docs.append(doc) all_docs.append(doc) df = pd.concat(new_docs) print len(all_docs), len(new_docs) return df
def get_input_stream(event, gold_probs, extractor="goose", thresh=.8,
                     delay=None, topk=20, max_nuggets=None, is_filter=False):
    """Load the input-stream dataframe for `event` and derive its
    probability and nugget columns.

    Frames are produced by `InputStreamResource.get_dataframes` against
    `cuttsum.corpora.get_raw_corpus(event)` and concatenated.

    Columns written:
      * `nugget probs` -- rows with `n conf` == 1 and an empty mapping are
        given a mapping that assigns 1 to each of the row's nuggets.
      * `true probs` -- maximum over the row's `nugget probs` values
        together with 0; forced to 0 where `n conf` == 1 and the row has
        no nuggets.
      * `probs` -- `true probs` when `gold_probs is True`, otherwise the
        output of `NuggetRegressor().predict(event, df)`.
      * `nuggets` -- ids whose `nugget probs` value is above .97, then
        optionally capped and time-filtered (see Args).

    Reads the module-level `matches_df` when `is_filter` is truthy.

    Args:
        event: object exposing `query_id`; passed through to the corpus
            lookup, the resource and the regressor.
        gold_probs: only the literal `True` selects the gold probabilities.
        extractor, thresh, delay, topk: forwarded unchanged to
            `get_dataframes`.
        max_nuggets: when not None, a row holding more than this many
            nugget ids gets an empty set instead.
        is_filter: when truthy, keep only nugget ids whose earliest
            matched update id (leading integer field) is not later than
            the row's `timestamp`.

    Returns:
        The concatenated dataframe with the columns above updated.
    """
    corpus = cuttsum.corpora.get_raw_corpus(event)
    res = InputStreamResource()
    df = pd.concat(
        res.get_dataframes(event, corpus, extractor, thresh, delay, topk))

    # Backfill missing probabilities for rows with "n conf" == 1.
    full_conf = df["n conf"] == 1
    selector = full_conf & (df["nugget probs"].apply(len) == 0)
    df.loc[selector, "nugget probs"] = df.loc[selector, "nuggets"].apply(
        lambda nugs: {nug: 1 for nug in nugs})

    # The appended 0 keeps np.max defined for empty mappings.
    df["true probs"] = df["nugget probs"].apply(
        lambda probs: np.max(list(probs.values()) + [0]))
    df.loc[full_conf & (df["nuggets"].apply(len) == 0), "true probs"] = 0

    if gold_probs is True:
        df["probs"] = df["true probs"]
    else:
        df["probs"] = NuggetRegressor().predict(event, df)

    df["nuggets"] = df["nugget probs"].apply(
        lambda probs: {nid for nid, p in probs.items() if p > .97})

    if max_nuggets is not None:

        def sortme(x):
            """Return up to `max_nuggets` ids with value > .5, highest
            value first."""
            # sorted() returns a new list; the original discarded it and
            # sliced the unsorted pairs.
            ranked = sorted(
                ((key, val) for key, val in x.items() if val > .5),
                key=lambda y: y[1], reverse=True)
            return [k for k, v in ranked[:max_nuggets]]

        df["nuggets"] = df["nuggets"].apply(
            lambda x: x if len(x) <= max_nuggets else set())
        # Disabled alternative: keep the top-ranked ids instead of
        # blanking oversized rows.
        #df["nuggets"] = df["nugget probs"].apply(sortme)

    if is_filter:
        # Earliest leading-integer field among the update ids matched to
        # each nugget of this query.
        event_matches = matches_df[matches_df["query id"] == event.query_id]
        nid2time = {}
        for nid in set(event_matches["nugget id"].tolist()):
            update_ids = matches_df[
                matches_df["nugget id"] == nid]["update id"]
            stamps = update_ids.apply(
                lambda uid: int(uid.split("-")[0])).tolist()
            nid2time[nid] = sorted(stamps)[0]

        # Drop nuggets whose first match lies after the row's timestamp.
        df["nuggets"] = [
            {nug for nug in row["nuggets"]
             if nid2time[nug] <= row["timestamp"]}
            for _, row in df.iterrows()
        ]

    return df