def main():
    results = []
    res = ArticlesResource()
    for ext in ["gold", "goose"]:
        for event in cuttsum.events.get_2013_events():
            if event.query_id.startswith("TS13"):
                corpus = cuttsum.corpora.EnglishAndUnknown2013()
            else:
                raise Exception("Bad query id: {}".format(event.query_id))

            min_hour = datetime.datetime(datetime.MAXYEAR, 1, 1)
            max_hour = datetime.datetime(datetime.MINYEAR, 1, 1)
            total = 0
            for hour, path, si in res.streamitem_iter(event, corpus, ext):
                if hour < min_hour:
                    min_hour = hour
                if hour > max_hour:
                    max_hour = hour
                total += 1
            if total == 0:
                continue

            results.append({
                "event": event.fs_name(),
                "event start": event.list_event_hours()[0],
                "event stop": event.list_event_hours()[-1],
                "article start": min_hour,
                "article stop": max_hour,
                "total": total,
                "annotator": ext,
            })

    df = pd.DataFrame(
        results,
        columns=["event", "annotator", "event start", "event stop",
                 "article start", "article stop", "total"])
    print df
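# Illustrative alternative (not part of the original script): the same
# min/max/count bookkeeping can be done by collecting the hours and letting
# pandas aggregate them. Assumes the same (hour, path, si) iterator as above.
def article_hour_stats(res, event, corpus, ext):
    hours = [hour for hour, path, si in res.streamitem_iter(event, corpus, ext)]
    if len(hours) == 0:
        return None
    s = pd.Series(hours)
    return {"article start": s.min(), "article stop": s.max(), "total": len(s)}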
def cmd_readstream(args, t, active_events):
    import textwrap
    from goose import Goose, Configuration

    config = Configuration()
    config.enable_image_fetching = False
    g = Goose(config)

    raw_stream = True
    for arg in args:
        if arg == "articles":
            raw_stream = False

    for event in active_events:
        print event
        if event.query_id.startswith("TS13"):
            corpus = cuttsum.corpora.EnglishAndUnknown2013()
        elif event.query_id.startswith("TS14"):
            corpus = cuttsum.corpora.SerifOnly2014()
        else:
            raise Exception("Bad query id: {}".format(event.query_id))

        if raw_stream is True:
            from cuttsum.trecdata import SCChunkResource
            si_iter = SCChunkResource().streamitem_iter(event, corpus)
        else:
            from cuttsum.pipeline import ArticlesResource
            si_iter = ArticlesResource().streamitem_iter(event, corpus)

        for hour, path, si in si_iter:
            if si.body.clean_visible is not None:
                print si.stream_id
                try:
                    text_height = t.height - 4
                    article = g.extract(raw_html=si.body.clean_html)
                    lines = textwrap.wrap(article.cleaned_text)
                    idx = 0
                    while True:
                        print t.clear
                        print "hour:", hour
                        print "title:", article.title
                        print "article:"
                        print "\n".join(lines[idx:idx + text_height])
                        with t.cbreak():
                            char = t.inkey()
                        if char == "i" and idx > 0:
                            idx -= 1
                        elif char == "k" and idx + text_height < len(lines):
                            idx += 1
                        elif char == "l":
                            break
                except Exception, e:
                    print e
                    continue
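# Minimal, standalone use of the Goose extraction step performed above
# (illustrative only; the HTML string here is made up).
from goose import Goose, Configuration

def extract_article(raw_html):
    config = Configuration()
    config.enable_image_fetching = False
    g = Goose(config)
    article = g.extract(raw_html=raw_html)
    return article.title, article.cleaned_text

title, text = extract_article(
    u"<html><head><title>Example</title></head>"
    u"<body><p>Some article text.</p></body></html>")
print title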
def do_job_unit(self, event, corpus, unit, **kwargs):
    if unit != 0:
        raise Exception("Job unit {} out of range".format(unit))

    res = ArticlesResource()
    thresh = kwargs.get("dedupe-sim-threshold", .8)
    extractor = kwargs.get("extractor", "goose")
    hasher = FeatureHasher(input_type="pair", non_negative=True)
    si_iter = res.streamitem_iter(event, corpus, extractor)

    def to_df(all_ids, all_times, all_matches):
        d = []
        for ids, times, match in izip(all_ids, all_times, all_matches):
            times.sort()
            d.append({
                "stream ids": ids,
                "hits": len(ids),
                "match": match,
                "earliest": times[0],
                "latest": times[-1],
                "second": times[1] if len(times) >= 2 else None,
                "third": times[2] if len(times) >= 3 else None,
            })
        return pd.DataFrame(
            d, columns=["stream ids", "match", "hits", "earliest", "latest",
                        "second", "third"])

    def query_in_top20(event, df):
        text = u"\n".join(df["sent text"].tolist()[:20])
        for query in event.query:
            if not re.search(query, text, flags=re.I | re.UNICODE):
                return False
        return True

    def make_time(df):
        return df["timestamp"].tolist()[0]

    def make_counts(df, slimit=20):
        counts = defaultdict(int)
        for words in df["words"].tolist()[:slimit]:
            for word in words:
                counts[word.lower()] += 1
        return counts

    def next_chunk_file(chunk_file_num):
        deduped_path_fmt = self.get_deduped_path_fmt(
            event, corpus, extractor, threshold=thresh)
        deduped_path = deduped_path_fmt.format(chunk_file_num)
        deduped_dir = os.path.dirname(deduped_path)
        if not os.path.exists(deduped_dir):
            os.makedirs(deduped_dir)
        if os.path.exists(deduped_path):
            os.remove(deduped_path)
        return sc.Chunk(path=deduped_path, mode="wb", message=corpus.sc_msg())

    X = None
    chunk_file_num = 1
    chunk = next_chunk_file(chunk_file_num)
    for hour, path, si in si_iter:
        df = si2df(si, extractor=extractor)
        counts = make_counts(df)
        x = hasher.transform([counts.items()])
        x.shape = (1, hasher.n_features)
        if X is None:
            X = x
            times = [[make_time(df)]]
            ids = [[si.stream_id]]
            matches = [query_in_top20(event, df)]
            chunk.add(si)
        else:
            # Compare the new document to every kept document; ravel so the
            # similarities index as a flat vector.
            K = cosine_similarity(X, x).ravel()
            k_argmax = K.argmax()
            if K[k_argmax] < thresh:
                X = vstack([X, x])
                times.append([make_time(df)])
                ids.append([si.stream_id])
                matches.append(query_in_top20(event, df))
                if X.shape[0] % 1000 == 0:
                    chunk.close()
                    chunk_file_num += 1
                    chunk = next_chunk_file(chunk_file_num)
                chunk.add(si)
            else:
                times[k_argmax].append(make_time(df))
                ids[k_argmax].append(si.stream_id)

    chunk.close()
    df = to_df(ids, times, matches)
    print df
    stats_path = self.get_stats_path(event, corpus, extractor, thresh)
    with open(stats_path, "w") as f:
        df.to_csv(f, index=False, sep="\t")
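# Self-contained sketch of the near-duplicate test used in do_job_unit above:
# hash bag-of-words counts into a fixed-width sparse vector and keep a
# document only if its cosine similarity to every kept document is below the
# threshold. (Illustrative toy data; the real pipeline hashes the first 20
# sentences of each article.)
from scipy.sparse import vstack
from sklearn.feature_extraction import FeatureHasher
from sklearn.metrics.pairwise import cosine_similarity

hasher = FeatureHasher(input_type="pair", non_negative=True)
docs = [
    {"quake": 3, "rescue": 2, "coast": 1},
    {"quake": 3, "rescue": 2},          # near duplicate of the first
    {"election": 4, "vote": 2},
]
thresh = .8
X, kept = None, []
for i, counts in enumerate(docs):
    x = hasher.transform([counts.items()])
    if X is None or cosine_similarity(X, x).max() < thresh:
        X = x if X is None else vstack([X, x])
        kept.append(i)
print kept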
def do_job_unit(self, event, corpus, unit, **kwargs):
    if unit != 0:
        raise Exception("unit of work out of bounds!")

    extractor = kwargs.get("extractor", "gold")
    soft_match = kwargs.get("soft_match", False)
    budget = kwargs.get("budget", 25)
    output_path_prefix = self.get_path_prefix(
        event, corpus, extractor, budget, soft_match)

    ### Set up summarizer ###
    # This is the monotone submodular objective function (basically nugget
    # coverage).
    def f_of_A(system, A, V_min_A, e, input_df, ndarray_data):
        return len(
            set([nugget
                 for nuggets in input_df.ix[A, "nuggets"].tolist()
                 for nugget in nuggets]))

    system = sumpy.system.MonotoneSubmodularBasic(f_of_A=f_of_A, k=budget)

    # Get gold matchings for oracle.
    articles = ArticlesResource()
    all_matches = cuttsum.judgements.get_merged_dataframe()
    matches = all_matches[all_matches["query id"] == event.query_id]

    # Set up soft matching if we are using it.
    if soft_match is True:
        from cuttsum.classifiers import NuggetClassifier
        classify_nuggets = NuggetClassifier().get_classifier(event)
        if event.query_id.startswith("TS13"):
            judged = cuttsum.judgements.get_2013_updates()
            judged = judged[judged["query id"] == event.query_id]
            judged_uids = set(judged["update id"].tolist())
        else:
            raise Exception("Bad corpus!")

    # All sentences containing nuggets will go in all_df.
    all_df = []

    # Pull out articles with nuggets.
    for hour, path, si in articles.streamitem_iter(event, corpus, extractor):

        # Convert stream item to dataframe and add gold label nuggets.
        df = si2df(si, extractor=extractor)
        df["nuggets"] = df["update id"].apply(
            lambda x: set(
                matches[matches["update id"] == x]["nugget id"].tolist()))

        # Perform soft nugget matching on unjudged sentences.
        if soft_match is True:
            ### NOTA BENE: getting an array of indices to index unjudged
            # sentences so I can force pandas to return a view and not a
            # copy.
            I = np.where(
                df["update id"].apply(lambda x: x not in judged_uids))[0]
            unjudged = df[
                df["update id"].apply(lambda x: x not in judged_uids)]
            unjudged_sents = unjudged["sent text"].tolist()
            assert len(unjudged_sents) == I.shape[0]
            df.loc[I, "nuggets"] = classify_nuggets(unjudged_sents)

        # Add sentences with nuggets to the final set for summarizing.
        df = df[df["nuggets"].apply(len) > 0]
        all_df.append(df)

    # Collect all dataframes into one and reset the index (ALWAYS RESET
    # THE INDEX because pandas hates me.)
    all_df = pd.concat(all_df)
    all_df.reset_index(inplace=True)

    summary = system.summarize(all_df)
    F_of_S = len(
        set(n for ns in summary._df["nuggets"].tolist() for n in ns))

    sum_nuggets = list(
        set(n for ns in summary._df["nuggets"].tolist() for n in ns))
    sum_nuggets.sort()
    print sum_nuggets

    possible_nuggets = list(
        set(n for ns in all_df["nuggets"].tolist() for n in ns))
    possible_nuggets.sort()
    print possible_nuggets
    print len(possible_nuggets)

    event_nuggets = set(matches["nugget id"].tolist())
    total_nuggets = len(event_nuggets)

    timestamp = int(si.stream_id.split("-")[0])
    output_df = pd.DataFrame(
        [{"Cum. F(S)": F_of_S,
          "F(S)": F_of_S,
          "UB no const.": len(possible_nuggets),
          "budget": budget,
          "Tot. Updates": len(summary._df),
          "event title": event.fs_name(),
          "timestamp": timestamp,
          "query id": event.query_id}],
        columns=["timestamp", "query id", "event title", "Cum. F(S)",
                 "F(S)", "UB no const.", "Tot. Updates", "budget"])

    parent = os.path.dirname(output_path_prefix)
    if not os.path.exists(parent):
        os.makedirs(parent)

    stats_path = output_path_prefix + ".stats.tsv"
    updates_path = output_path_prefix + ".updates.tsv"

    with open(stats_path, "w") as f:
        output_df.to_csv(f, sep="\t", index=False)

    summary._df["sent text"] = summary._df["sent text"].apply(
        lambda x: x.encode("utf-8"))
    with open(updates_path, "w") as f:
        summary._df[["timestamp", "update id", "sent text"]].sort(
            ["update id"]).to_csv(f, sep="\t", index=False)
def do_job_unit(self, event, corpus, unit, **kwargs):
    if unit != 0:
        raise Exception("unit of work out of bounds!")

    extractor = kwargs.get("extractor", "gold")
    soft_match = kwargs.get("soft_match", False)
    budget = kwargs.get("budget", 25)
    output_path_prefix = self.get_path_prefix(
        event, corpus, extractor, budget, soft_match)

    ### Set up summarizer ###
    def f_of_A(system, A, V_min_A, e, input_df, ndarray_data):
        return len(
            set([nugget
                 for nuggets in input_df.ix[A, "nuggets"].tolist()
                 for nugget in nuggets]))

    system = sumpy.system.MonotoneSubmodularBasic(f_of_A=f_of_A, k=budget)

    # Collect all previously collected nuggets here.
    nugget_cache = set()

    # Get gold matchings for oracle.
    articles = ArticlesResource()
    all_matches = cuttsum.judgements.get_merged_dataframe()
    matches = all_matches[all_matches["query id"] == event.query_id]

    # Set up soft matching if we are using it.
    if soft_match is True:
        from cuttsum.classifiers import NuggetClassifier
        classify_nuggets = NuggetClassifier().get_classifier(event)
        if event.query_id.startswith("TS13"):
            judged = cuttsum.judgements.get_2013_updates()
            judged = judged[judged["query id"] == event.query_id]
            judged_uids = set(judged["update id"].tolist())
        else:
            raise Exception("Bad corpus!")

    # Collect stats for each document here.
    stats = []

    # Aggregate summaries in summary_df.
    summary_df = []

    cum_F_of_S = 0
    all_seen_nuggets = set()
    total_updates = 0

    # Pull out articles with nuggets.
    for hour, path, si in articles.streamitem_iter(event, corpus, extractor):
        print hour, si.stream_id

        # Convert stream item to dataframe and add gold label nuggets.
        df = si2df(si, extractor=extractor)
        df["nuggets"] = df["update id"].apply(
            lambda x: set(
                matches[matches["update id"] == x]["nugget id"].tolist()))

        # Perform soft nugget matching on unjudged sentences.
        if soft_match is True:
            ### NOTA BENE: getting an array of indices to index unjudged
            # sentences so I can force pandas to return a view and not a
            # copy.
            I = np.where(
                df["update id"].apply(lambda x: x not in judged_uids))[0]
            unjudged = df[
                df["update id"].apply(lambda x: x not in judged_uids)]
            unjudged_sents = unjudged["sent text"].tolist()
            assert len(unjudged_sents) == I.shape[0]
            df.loc[I, "nuggets"] = classify_nuggets(unjudged_sents)

        # Remove nuggets from the dataframe if we have already collected them
        # in the cache. The scoring function should ignore these.
        df = df[df["nuggets"].apply(len) > 0]
        all_seen_nuggets.update(
            set(n for ns in df["nuggets"].tolist() for n in ns))
        df["nuggets"] = df["nuggets"].apply(
            lambda x: x.difference(nugget_cache))
        if len(df) == 0:
            continue

        # Run summarizer on current document and update stats about it.
        summary = system.summarize(df)
        summary_nuggets = set(
            n for ns in summary._df["nuggets"].tolist() for n in ns)
        nugget_cache.update(summary_nuggets)
        system.k -= len(summary._df)

        F_of_S = len(summary_nuggets)
        cum_F_of_S += F_of_S
        total_updates += len(summary._df)
        timestamp = int(si.stream_id.split("-")[0])

        stats.append({
            "Cum. F(S)": cum_F_of_S,
            "F(S)": F_of_S,
            "UB no const.": len(all_seen_nuggets),
            "budget": budget,
            "Tot. Updates": total_updates,
            "event title": event.fs_name(),
            "timestamp": timestamp,
            "query id": event.query_id,
        })
        summary_df.append(summary._df)

        if system.k <= 0:
            print "Budget exceeded!"
            break

    output_df = pd.DataFrame(
        stats,
        columns=["timestamp", "query id", "event title", "Cum. F(S)",
                 "F(S)", "UB no const.", "Tot. Updates", "budget"])

    # Write stats and updates to file.
    parent = os.path.dirname(output_path_prefix)
    if not os.path.exists(parent):
        os.makedirs(parent)

    stats_path = output_path_prefix + ".stats.tsv"
    updates_path = output_path_prefix + ".updates.tsv"

    with open(stats_path, "w") as f:
        output_df.to_csv(f, sep="\t", index=False)

    summary_df = pd.concat(summary_df)
    summary_df["sent text"] = summary_df["sent text"].apply(
        lambda x: x.encode("utf-8"))
    with open(updates_path, "w") as f:
        summary_df[["timestamp", "update id", "sent text"]].sort(
            ["update id"]).to_csv(f, sep="\t", index=False)
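# Sketch of the per-sentence nugget lookup both oracles perform (toy data;
# the real `matches` frame comes from cuttsum.judgements). A groupby builds
# the update-id -> nugget-id map once instead of re-scanning `matches` for
# every sentence, which is what the lambda above does.
import pandas as pd
matches = pd.DataFrame([
    {"update id": "u1", "nugget id": "n1"},
    {"update id": "u1", "nugget id": "n2"},
    {"update id": "u2", "nugget id": "n3"},
])
uid2nuggets = matches.groupby("update id")["nugget id"].apply(set).to_dict()
sentences = pd.DataFrame({"update id": ["u1", "u2", "u3"]})
sentences["nuggets"] = sentences["update id"].apply(
    lambda x: uid2nuggets.get(x, set()))
print sentences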
import cuttsum.events
import cuttsum.corpora
from cuttsum.pipeline import ArticlesResource
import numpy as np
from scipy import linalg
from sklearn.feature_extraction import DictVectorizer

event = cuttsum.events.get_events()[0]
corpus = cuttsum.corpora.EnglishAndUnknown2013()
extractor = "gold"
res = ArticlesResource()


def extract_sentence_features_gold(df, vec):
    all_nuggets = set(n for ns in df["nuggets"].tolist() for n in ns)
    X_dict = df["nuggets"].apply(
        lambda x: {key: 1 if key in x else -1 for key in all_nuggets}).tolist()
    X = vec.fit_transform(X_dict)
    return X, vec


w = None
k = 5
m = 10
cache = {}
found_nuggets = set()
possible_nuggets = set([
    'VMTS13.01.064', 'VMTS13.01.058', 'VMTS13.01.056', 'VMTS13.01.055',
    'VMTS13.01.054', 'VMTS13.01.052', 'VMTS13.01.051', 'VMTS13.01.050',
    'VMTS13.01.062', 'VMTS13.01.088', 'VMTS13.01.077', 'VMTS13.01.065',
    'VMTS13.01.078', 'VMTS13.01.086', 'VMTS13.01.069', 'VMTS13.01.099',
    'VMTS13.01.060', 'VMTS13.01.092', 'VMTS13.01.091', 'VMTS13.01.090',
    'VMTS13.01.097', 'VMTS13.01.059', 'VMTS13.01.071', 'VMTS13.01.070',
    'VMTS13.01.073', 'VMTS13.01.061', 'VMTS13.01.075', 'VMTS13.01.067',
    'VMTS13.01.076', 'VMTS13.01.104', 'VMTS13.01.068', 'VMTS13.01.106',
    'VMTS13.01.103', 'VMTS13.01.063', 'VMTS13.01.084', 'VMTS13.01.085',
    'VMTS13.01.087', 'VMTS13.01.081', 'VMTS13.01.074'])
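# Toy run of extract_sentence_features_gold above: every sentence becomes a
# +/-1 vector over the nuggets present in the document (data here is made up).
import pandas as pd
toy = pd.DataFrame({"nuggets": [set(["n1"]), set(["n1", "n2"]), set()]})
X, vec = extract_sentence_features_gold(toy, DictVectorizer(sparse=False))
print vec.get_feature_names()
print X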
import pandas as pd
from datetime import datetime, timedelta
from collections import defaultdict

import matplotlib.pylab as plt
plt.style.use('ggplot')
pd.set_option('display.width', 200)

import locale
locale.setlocale(locale.LC_ALL, 'en_US.UTF8')


def unix_time(dt):
    epoch = datetime.utcfromtimestamp(0)
    delta = dt - epoch
    return int(delta.total_seconds())


articles_res = ArticlesResource()
#ded_articles_res = DedupedArticlesResource()
data = []
events = []
for event in cuttsum.events.get_events():
    if event.query_num == 7:
        continue
    if event.query_num > 25:
        continue
    events.append(event)
    corpus = cuttsum.corpora.get_raw_corpus(event)
    hours = event.list_event_hours()
    hour2goose = defaultdict(int)
    for hour in hours:
        path = articles_res.get_chunk_path(event, "goose", hour)
        if path is None: