Ejemplo n.º 1
0
def get_input_stream(event,
                     gold_probs,
                     extractor="goose",
                     thresh=.8,
                     delay=None,
                     topk=20):
    """Build the input-stream dataframe for ``event`` (FilteredTS2015 corpus).

    The frames returned by ``InputStreamResource.get_dataframes`` are
    concatenated and the following columns are added or rewritten:

    * ``"nugget probs"`` -- rows with ``"n conf" == 1`` and an empty mapping
      get probability 1 for every entry of their ``"nuggets"`` value.
    * ``"true probs"`` -- the largest value in ``"nugget probs"`` (0 when the
      mapping is empty); forced to 0 for rows with ``"n conf" == 1`` and no
      nuggets.
    * ``"probs"`` -- a copy of ``"true probs"`` when ``gold_probs is True``,
      otherwise the output of ``NuggetRegressor().predict(event, df)``.
    * ``"nuggets"`` -- the set of nugget ids whose probability exceeds 0.9,
      restricted to nuggets whose first-seen value is not later than the
      row's ``"timestamp"``; rows left with more than 3 nuggets get an empty
      set instead.

    Reads the module-level ``matches_df`` table.

    Args:
        event: object exposing ``query_id``; also forwarded to the resource
            and to the regressor.
        gold_probs: only the exact value ``True`` selects the gold
            probabilities.
        extractor, thresh, delay, topk: forwarded unchanged to
            ``InputStreamResource.get_dataframes``.

    Returns:
        The concatenated dataframe with the columns described above.

    Raises:
        KeyError: if a retained nugget id has no entry in ``matches_df`` for
            this event's query.
    """
    max_nuggets = 3
    corpus = cuttsum.corpora.FilteredTS2015()
    frames = InputStreamResource().get_dataframes(
        event, corpus, extractor, thresh, delay, topk)
    df = pd.concat(frames)

    # Rows flagged "n conf" == 1 that carry no per-nugget probabilities:
    # give each of their listed nuggets probability 1.
    no_probs = (df["n conf"] == 1) & (df["nugget probs"].apply(len) == 0)
    df.loc[no_probs, "nugget probs"] = df.loc[no_probs, "nuggets"].apply(
        lambda nugs: {n: 1 for n in nugs})

    # The trailing 0 keeps np.max defined when the mapping is empty.
    df["true probs"] = df["nugget probs"].apply(
        lambda probs: np.max(list(probs.values()) + [0]))
    no_nuggets = (df["n conf"] == 1) & (df["nuggets"].apply(len) == 0)
    df.loc[no_nuggets, "true probs"] = 0

    if gold_probs is True:
        df["probs"] = df["true probs"]
    else:
        df["probs"] = NuggetRegressor().predict(event, df)

    df["nuggets"] = df["nugget probs"].apply(
        lambda probs: {nid for nid, p in probs.items() if p > .9})

    # Map every nugget id matched for this query to the smallest integer
    # prefix found in its "update id" values (presumably a timestamp --
    # TODO confirm against the matches table schema).
    event_matches = matches_df[matches_df["query id"] == event.query_id]
    nid2time = {}
    for nid in set(event_matches["nugget id"].tolist()):
        update_ids = matches_df[matches_df["nugget id"] == nid]["update id"]
        nid2time[nid] = sorted(
            int(uid.split("-")[0]) for uid in update_ids)[0]

    # Keep a nugget on a row only if its first-seen value is not later than
    # the row's own timestamp.
    df["nuggets"] = [
        {nid for nid in nugs if nid2time[nid] <= ts}
        for nugs, ts in zip(df["nuggets"], df["timestamp"])
    ]

    # Rows that still hold more than max_nuggets nuggets are blanked out.
    df["nuggets"] = df["nuggets"].apply(
        lambda nugs: nugs if len(nugs) <= max_nuggets else set())

    return df
Ejemplo n.º 2
0
def get_input_stream(event,
                     gold_probs,
                     extractor="goose",
                     thresh=.8,
                     delay=None,
                     topk=20,
                     use_2015F=False,
                     truncate=False):
    """Build the input-stream dataframe for ``event``, keeping matched docs.

    The frames returned by ``InputStreamResource.get_dataframes`` are
    concatenated and the following columns are added or rewritten:

    * ``"nugget probs"`` -- rows with ``"n conf" == 1`` and an empty mapping
      get probability 1 for every entry of their ``"nuggets"`` value.
    * ``"true probs"`` -- the largest value in ``"nugget probs"`` (0 when the
      mapping is empty); forced to 0 for rows with ``"n conf" == 1`` and no
      nuggets.
    * ``"probs"`` -- a copy of ``"true probs"`` when ``gold_probs is True``,
      otherwise the output of ``NuggetRegressor().predict(event, df)``.
    * ``"nuggets"`` -- the set of nugget ids whose probability exceeds 0.9,
      restricted to nuggets whose first-seen value is not later than the
      row's ``"timestamp"``; rows left with more than 3 nuggets get an empty
      set instead.

    The rows are then grouped by ``("stream id", "timestamp")``. Each group
    is cut to its first 20 rows, and only groups whose stream id maps to a
    ``"match"`` value that is exactly ``True`` in the stats frame from
    ``DedupedArticlesResource.get_stats_df`` are returned.

    Reads the module-level ``matches_df`` table and prints two progress
    lines to stdout.

    Args:
        event: object exposing ``query_id`` and ``query_num``; also
            forwarded to the resources and to the regressor.
        gold_probs: only the exact value ``True`` selects the gold
            probabilities.
        extractor, thresh, delay, topk: forwarded unchanged to
            ``InputStreamResource.get_dataframes`` (``extractor`` and
            ``thresh`` also to ``get_stats_df``).
        use_2015F: when exactly ``True`` and ``event.query_num > 25``, the
            ``FilteredTS2015`` corpus replaces the event's raw corpus.
        truncate: currently ignored -- every document is cut to 20 rows
            regardless of this flag.

    Returns:
        The concatenation of the retained (truncated) document frames.

    Raises:
        KeyError: if a retained nugget id has no entry in ``matches_df`` for
            this event's query, or a stream id is absent from the stats
            frame.
        AssertionError: if the grouped timestamps are not non-decreasing.
        ValueError: raised by ``pd.concat`` when no document is retained.
    """
    max_nuggets = 3

    corpus = cuttsum.corpora.get_raw_corpus(event)
    if use_2015F is True and event.query_num > 25:
        corpus = cuttsum.corpora.FilteredTS2015()
    # Single-argument print call: emits the same text as the Python 2
    # statement form while remaining valid syntax under Python 3.
    print("%s %s" % (event, corpus))

    frames = InputStreamResource().get_dataframes(
        event, corpus, extractor, thresh, delay, topk)
    df = pd.concat(frames)

    # Rows flagged "n conf" == 1 that carry no per-nugget probabilities:
    # give each of their listed nuggets probability 1.
    no_probs = (df["n conf"] == 1) & (df["nugget probs"].apply(len) == 0)
    df.loc[no_probs, "nugget probs"] = df.loc[no_probs, "nuggets"].apply(
        lambda nugs: {n: 1 for n in nugs})

    # The trailing 0 keeps np.max defined when the mapping is empty.
    df["true probs"] = df["nugget probs"].apply(
        lambda probs: np.max(list(probs.values()) + [0]))
    no_nuggets = (df["n conf"] == 1) & (df["nuggets"].apply(len) == 0)
    df.loc[no_nuggets, "true probs"] = 0

    if gold_probs is True:
        df["probs"] = df["true probs"]
    else:
        df["probs"] = NuggetRegressor().predict(event, df)

    df["nuggets"] = df["nugget probs"].apply(
        lambda probs: {nid for nid, p in probs.items() if p > .9})

    # Map every nugget id matched for this query to the smallest integer
    # prefix found in its "update id" values (presumably a timestamp --
    # TODO confirm against the matches table schema).
    event_matches = matches_df[matches_df["query id"] == event.query_id]
    nid2time = {}
    for nid in set(event_matches["nugget id"].tolist()):
        update_ids = matches_df[matches_df["nugget id"] == nid]["update id"]
        nid2time[nid] = min(int(uid.split("-")[0]) for uid in update_ids)

    # Keep a nugget on a row only if its first-seen value is not later than
    # the row's own timestamp.
    df["nuggets"] = [
        {nid for nid in nugs if nid2time[nid] <= ts}
        for nugs, ts in zip(df["nuggets"], df["timestamp"])
    ]

    # Rows that still hold more than max_nuggets nuggets are blanked out.
    df["nuggets"] = df["nuggets"].apply(
        lambda nugs: nugs if len(nugs) <= max_nuggets else set())

    # Imported here, as in the original, rather than at module level.
    from cuttsum.pipeline import DedupedArticlesResource
    stats_df = DedupedArticlesResource().get_stats_df(
        event, corpus, extractor, thresh)
    # NOTE(review): eval() executes whatever text is stored in the
    # "stream ids" column. That is only safe if the stats file is produced
    # by trusted code; consider ast.literal_eval if the stored values are
    # plain literals -- confirm the on-disk format before switching.
    stats_df["stream ids"] = stats_df["stream ids"].apply(
        lambda x: set(eval(x)))
    sid2match = {}
    for _, row in stats_df.iterrows():
        for sid in row["stream ids"]:
            sid2match[sid] = row["match"]

    n_docs = 0
    prev_ts = None
    new_docs = []
    for (sid, ts), doc in df.groupby(["stream id", "timestamp"]):
        # Sanity check: documents must come out in timestamp order.
        if prev_ts is not None:
            assert ts >= prev_ts
        prev_ts = ts
        n_docs += 1
        if sid2match[sid] is True:
            # Always capped at 20 rows; the ``truncate`` flag is not
            # consulted.
            new_docs.append(doc.iloc[0:20])

    df = pd.concat(new_docs)
    print("%s %s" % (n_docs, len(new_docs)))
    return df
Ejemplo n.º 3
0
def get_input_stream(event,
                     gold_probs,
                     extractor="goose",
                     thresh=.8,
                     delay=None,
                     topk=20,
                     max_nuggets=None,
                     is_filter=False):
    """Build the input-stream dataframe for ``event`` from its raw corpus.

    The frames returned by ``InputStreamResource.get_dataframes`` are
    concatenated and the following columns are added or rewritten:

    * ``"nugget probs"`` -- rows with ``"n conf" == 1`` and an empty mapping
      get probability 1 for every entry of their ``"nuggets"`` value.
    * ``"true probs"`` -- the largest value in ``"nugget probs"`` (0 when the
      mapping is empty); forced to 0 for rows with ``"n conf" == 1`` and no
      nuggets.
    * ``"probs"`` -- a copy of ``"true probs"`` when ``gold_probs is True``,
      otherwise the output of ``NuggetRegressor().predict(event, df)``.
    * ``"nuggets"`` -- the set of nugget ids whose probability exceeds 0.97,
      optionally post-processed as described under ``max_nuggets`` and
      ``is_filter``.

    Args:
        event: object exposing ``query_id``; also forwarded to the resource
            and to the regressor.
        gold_probs: only the exact value ``True`` selects the gold
            probabilities.
        extractor, thresh, delay, topk: forwarded unchanged to
            ``InputStreamResource.get_dataframes``.
        max_nuggets: when not ``None``, rows holding more than this many
            nuggets get an empty nugget set.
        is_filter: when truthy, nuggets whose first-seen value (read from
            the module-level ``matches_df``) is later than the row's
            ``"timestamp"`` are removed.

    Returns:
        The concatenated dataframe with the columns described above.

    Raises:
        KeyError: with ``is_filter`` set, if a retained nugget id has no
            entry in ``matches_df`` for this event's query.
    """
    corpus = cuttsum.corpora.get_raw_corpus(event)
    frames = InputStreamResource().get_dataframes(
        event, corpus, extractor, thresh, delay, topk)
    df = pd.concat(frames)

    # Rows flagged "n conf" == 1 that carry no per-nugget probabilities:
    # give each of their listed nuggets probability 1.
    no_probs = (df["n conf"] == 1) & (df["nugget probs"].apply(len) == 0)
    df.loc[no_probs, "nugget probs"] = df.loc[no_probs, "nuggets"].apply(
        lambda nugs: {n: 1 for n in nugs})

    # The trailing 0 keeps np.max defined when the mapping is empty.
    df["true probs"] = df["nugget probs"].apply(
        lambda probs: np.max(list(probs.values()) + [0]))
    no_nuggets = (df["n conf"] == 1) & (df["nuggets"].apply(len) == 0)
    df.loc[no_nuggets, "true probs"] = 0

    if gold_probs is True:
        df["probs"] = df["true probs"]
    else:
        df["probs"] = NuggetRegressor().predict(event, df)

    df["nuggets"] = df["nugget probs"].apply(
        lambda probs: {nid for nid, p in probs.items() if p > .97})

    if max_nuggets is not None:

        def sortme(x):
            # Rank nuggets with probability above 0.5 from most to least
            # probable and keep the ids of the top ``max_nuggets``.
            # The original called sorted() and discarded its result, so the
            # slice below was taken from an unsorted list.
            # NOTE: this helper is currently unused; its call site below is
            # disabled.
            ranked = sorted(
                [(key, val) for key, val in x.items() if val > .5],
                key=lambda y: y[1],
                reverse=True)
            return [k for k, v in ranked[:max_nuggets]]

        # Rows holding more than max_nuggets nuggets are blanked out.
        df["nuggets"] = df["nuggets"].apply(
            lambda x: x if len(x) <= max_nuggets else set())
        # Disabled alternative: keep the top-ranked nuggets instead.
        #df["nuggets"] = df["nugget probs"].apply(sortme)

    if is_filter:
        # Map every nugget id matched for this query to the smallest
        # integer prefix found in its "update id" values (presumably a
        # timestamp -- TODO confirm against the matches table schema).
        event_matches = matches_df[matches_df["query id"] == event.query_id]
        nid2time = {}
        for nid in set(event_matches["nugget id"].tolist()):
            update_ids = matches_df[
                matches_df["nugget id"] == nid]["update id"]
            nid2time[nid] = min(
                int(uid.split("-")[0]) for uid in update_ids)

        # Keep a nugget on a row only if its first-seen value is not later
        # than the row's own timestamp.
        df["nuggets"] = [
            {nid for nid in nugs if nid2time[nid] <= ts}
            for nugs, ts in zip(df["nuggets"], df["timestamp"])
        ]
    return df