path = articles_res.get_chunk_path(event, "goose", hour, corpus) if path is None: continue # print path fname = os.path.split(path)[1] num_goose = int(fname.split("-")[0]) hour2goose[hour] = num_goose # goose_df = articles_res.get_stats_df(event, "goose") # if goose_df is not None: # for _, row in goose_df.iterrows(): # dt = datetime.utcfromtimestamp(row["hour"]) # hour = datetime(dt.year, dt.month, dt.day, dt.hour) # hour2goose[hour] = row["goose articles"] for hour in hours: raw_chunks = chunk_res.get_chunks_for_hour(hour, corpus, event) num_raw_si = 0 for chunk in raw_chunks: fname = os.path.split(chunk)[1] num_raw_si += int(fname.split("-")[1]) # num_fltr_si = len(articles_res.get_si(event, corpus, "goose", hour)) data.append( { "event": event.query_id, "title": event.title, "hour": hour, "raw articles": num_raw_si, "goose articles": hour2goose[hour], "deduped articles": hour2ded[hour], "deduped match articles": hour2ded_fltr[hour],
def do_job_unit(self, event, corpus, unit, **kwargs):
    """Extract one hour of stream items for ``event`` and write them to a chunk.

    ``unit`` is used as an index into a list of hours; the stream items
    selected for that hour are sorted by ``stream_id`` and written to a single
    output chunk, replacing any chunk already present at the output path.

    Args:
        event: Event object; this method reads ``event.query_id`` and calls
            ``event.fs_name()`` and ``event.list_event_hours()``.
        corpus: Corpus object; ``corpus.sc_msg()`` is passed as the chunk
            message type when reading and writing chunks.
        unit: Integer index of the hour to process. For "gold" it indexes the
            sorted hours derived from the matched update ids; for "goose" it
            indexes ``event.list_event_hours()``.
        **kwargs: ``extractor`` selects the extraction mode, either "gold"
            (the default) or "goose".

    Raises:
        OSError: If the data directory cannot be created for a reason other
            than it already existing.
        Exception: If ``extractor`` is not a supported value, or (goose mode)
            a stream item has neither "serif" nor "lingpipe" sentences.
    """
    extractor = kwargs.get("extractor", "gold")
    data_dir = os.path.join(self.dir_, extractor, event.fs_name())
    chunks_resource = SCChunkResource()

    if not os.path.exists(data_dir):
        try:
            os.makedirs(data_dir)
        except OSError as exc:
            # Another worker may have created the directory between the
            # exists() check and makedirs(); only that case is benign.
            if not (exc.errno == errno.EEXIST and os.path.isdir(data_dir)):
                raise

    if extractor == "gold":
        import cuttsum.judgements

        all_matches = cuttsum.judgements.get_matches()
        matches = all_matches[all_matches["query id"] == event.query_id]
        update_ids = matches["update id"].tolist()

        # An update id is "<stream id>-<suffix>"; dropping the last dash
        # separated field leaves the stream id.
        stream_ids = set(
            "-".join(update_id.split("-")[:-1]) for update_id in update_ids)

        # The first dash separated field is parsed as an epoch timestamp and
        # truncated to the hour.
        hours = sorted(set(
            datetime.utcfromtimestamp(
                int(update_id.split("-")[0])).replace(minute=0, second=0)
            for update_id in update_ids))
        hour = hours[unit]

        output_path = self.get_chunk_path(event, extractor, hour, corpus)

        gold_si = []
        for path in chunks_resource.get_chunks_for_hour(hour, corpus, event):
            with sc.Chunk(path=path, mode="rb",
                          message=corpus.sc_msg()) as chunk:
                for si in chunk:
                    if si.stream_id in stream_ids:
                        gold_si.append(si)

        gold_si.sort(key=lambda x: x.stream_id)
        # Single-argument print(...) prints the same text when parsed as a
        # Python 2 print statement.
        for si in gold_si:
            print(si.stream_id)

        # Remove the stale *output* chunk. The previous code removed ``path``,
        # i.e. the last input chunk read above, destroying source data.
        if os.path.exists(output_path):
            os.remove(output_path)
        with sc.Chunk(path=output_path, mode="wb",
                      message=corpus.sc_msg()) as chunk:
            for si in gold_si:
                chunk.add(si)

    elif extractor == "goose":
        import nltk
        from nltk.tokenize import WordPunctTokenizer
        sent_tok = nltk.data.load('tokenizers/punkt/english.pickle')
        word_tok = WordPunctTokenizer()

        from goose import Goose, Configuration
        config = Configuration()
        config.enable_image_fetching = False
        g = Goose(config)

        hour = event.list_event_hours()[unit]
        # The template is later formatted with the number of kept items.
        output_path_tmp = self.get_chunk_template(
            event, extractor, hour, corpus)

        good_si = []
        for path in chunks_resource.get_chunks_for_hour(hour, corpus, event):
            try:
                with sc.Chunk(path=path, mode="rb",
                              message=corpus.sc_msg()) as chunk:
                    for si in chunk:
                        if si.body.clean_visible is None:
                            continue

                        article_text = self._get_goose_text(g, si)
                        if article_text is None:
                            continue
                        if not self._contains_query(event, article_text):
                            continue

                        art_pretty = sent_tok.tokenize(article_text)
                        art_sents = [word_tok.tokenize(sent)
                                     for sent in art_pretty]

                        df = si2df(si)
                        # I[i_goose] is used below as the index of the
                        # stream-item sentence for goose sentence i_goose.
                        I = self._map_goose2streamitem(
                            art_sents, df["words"].tolist())

                        if "serif" in si.body.sentences:
                            si_sentences = si.body.sentences["serif"]
                        elif "lingpipe" in si.body.sentences:
                            si_sentences = si.body.sentences["lingpipe"]
                        else:
                            raise Exception("Bad sentence annotator.")

                        # NOTE(review): ``ann`` is built but never attached to
                        # the stream item; kept as-is pending confirmation
                        # that it can be dropped.
                        ann = sc.Annotator()
                        ann.annotator_id = "goose"

                        # One (initially empty) goose sentence per original
                        # sentence, so indices line up with ``si_sentences``.
                        si.body.sentences["goose"] = [
                            sc.Sentence() for _ in si_sentences]
                        for i_goose, i_si in enumerate(I):
                            tokens = [sc.Token(token=token.encode("utf-8"))
                                      for token in art_sents[i_goose]]
                            si.body.sentences["goose"][i_si].tokens.extend(
                                tokens)
                        good_si.append(si)
            except TypeError:
                # Best effort: a TypeError abandons the rest of this chunk;
                # items already collected from it are kept.
                continue

        output_path = output_path_tmp.format(len(good_si))
        odir = os.path.dirname(output_path)
        if not os.path.exists(odir):
            os.makedirs(odir)

        good_si.sort(key=lambda x: x.stream_id)
        for si in good_si:
            print(si.stream_id)

        # Only the stale output is removed. The previous code additionally
        # removed ``path`` (the last input chunk) when the output existed.
        if os.path.exists(output_path):
            os.remove(output_path)

        print("Writing to %s" % output_path)
        with sc.Chunk(path=output_path, mode="wb",
                      message=corpus.sc_msg()) as chunk:
            for si in good_si:
                chunk.add(si)

    else:
        raise Exception("extractor: {} not implemented!".format(extractor))
path = articles_res.get_chunk_path(event, "goose", hour, corpus) if path is None: continue #print path fname = os.path.split(path)[1] num_goose = int(fname.split("-")[0]) hour2goose[hour] = num_goose # goose_df = articles_res.get_stats_df(event, "goose") # if goose_df is not None: # for _, row in goose_df.iterrows(): # dt = datetime.utcfromtimestamp(row["hour"]) # hour = datetime(dt.year, dt.month, dt.day, dt.hour) # hour2goose[hour] = row["goose articles"] for hour in hours: raw_chunks = chunk_res.get_chunks_for_hour(hour, corpus, event) num_raw_si = 0 for chunk in raw_chunks: fname = os.path.split(chunk)[1] num_raw_si += int(fname.split("-")[1]) #num_fltr_si = len(articles_res.get_si(event, corpus, "goose", hour)) data.append({ "event": event.query_id, "title": event.title, "hour": hour, "raw articles": num_raw_si, "goose articles": hour2goose[hour], "deduped articles": hour2ded[hour], "deduped match articles": hour2ded_fltr[hour], })