def test_aligner(tmpdir, request, test_data_dir, third_dir):
    si = make_hyperlink_labeled_test_stream_item(test_data_dir)
    assert len(si.body.clean_visible) > 200
    #for x in si.body.labels['author']:
    #    print x.offsets[OffsetType.BYTES].first, x.offsets[OffsetType.BYTES].value, x.target.target_id
    c_path = str(tmpdir.join('chunk.sc'))
    chunk = streamcorpus.Chunk(c_path, mode='wb')
    chunk.add(si)
    chunk.close()
    lp = lingpipe(config={
        'tmp_dir_path': str(tmpdir),
        'exit_code_on_out_of_memory': 1,
        'third_dir_path': third_dir,
        'path_in_third': 'lingpipe-4.10',
        'offset_types': ['BYTES'],
        'offset_debugging': True,
        'cleanup_tmp_files': False,
        'align_labels_by': 'byte_offset_labels',
        'aligner_data': {
            'annotator_id': 'author',
            'tagger_id': 'lingpipe',
        },
    })
    lp.process_path(c_path)
    ## these are only present if cleanup_tmp_files is False
    assert tmpdir.join('chunk.sc-clean_visible.xml').read()
    assert tmpdir.join('chunk.sc-ner.xml').read()
    si = list(streamcorpus.Chunk(c_path))[0]
    assert len(si.body.clean_visible) > 200
    assert len(si.body.sentences['lingpipe']) == 41
def worker(args):
    event, event_title, query, hours, rel_dir, c_dir, log_file = args
    msg = sc.StreamItem_v0_2_0
    with open(log_file, 'w') as lgf:
        for hour in hours:
            total_docs = 0
            total_rel = 0
            hdir = os.path.join(c_dir, hour)
            opath = str(os.path.join(rel_dir, '{}.sc.gz'.format(hour)))
            if not os.path.exists(hdir):
                continue
            if os.path.exists(opath):
                os.remove(opath)
            print hdir
            ochunk = sc.Chunk(path=opath, message=msg, mode='wb')
            for cname in os.listdir(hdir):
                path = str(os.path.join(hdir, cname))
                for si in sc.Chunk(path=path, message=msg):
                    total_docs += 1
                    if si.body.clean_visible is None:
                        continue
                    elif re.search(query, si.body.clean_visible, re.I):
                        total_rel += 1
                        ochunk.add(si)
            ochunk.close()
            lgf.write('{}\t{}\t{}\n'.format(hour, total_rel, total_docs))
            lgf.flush()
def worker(args):
    rc_dir, out_dir, hours, event, ad_dir, log_file = args
    vct_pkl = os.path.join(ad_dir, 'article_vectorizer.pkl')
    clf_pkl = os.path.join(ad_dir, 'article_clf.pkl')
    artcl_detect = ArticleDetector(vct_pkl, clf_pkl, event)
    lgf = open(log_file, 'w')
    n_hours = len(hours)
    for h, hour in enumerate(hours, 1):
        n_docs = 0
        n_sents = 0
        n_rel_docs = 0
        n_rel_sents = 0
        #print u'({}/{}) hour: {}'.format(h, n_hours, hour)
        chunk = os.path.join(rc_dir, '{}.sc.gz'.format(hour))
        opath = str(os.path.join(out_dir, '{}.sc.gz'.format(hour)))
        ochunk = sc.Chunk(path=opath, mode='wb')
        try:
            for si_idx, si in enumerate(sc.Chunk(path=chunk)):
                n_docs += 1
                if u'serif' in si.body.sentences:
                    annotator = u'serif'
                elif u'lingpipe' in si.body.sentences:
                    annotator = u'lingpipe'
                else:
                    continue
                n_sents += len(si.body.sentences[annotator])
                sent_idxs = artcl_detect.find_articles(si, annotator)
                n_idxs = len(sent_idxs)
                if n_idxs > 0:
                    n_rel_docs += 1
                    n_rel_sents += n_idxs
                    rel_sents = []
                    for sent_idx in sent_idxs:
                        rel_sents.append(
                            si.body.sentences[annotator][sent_idx])
                    si.body.sentences['article-clf'] = rel_sents
                    ochunk.add(si)
            ochunk.close()
            lgf.write('{}\t{}\t{}\t{}\t{}\n'.format(
                hour, n_docs, n_sents, n_rel_docs, n_rel_sents))
            lgf.flush()
        except IOError, e:
            print str(e)
def ids_and_clean_visible_from_streamcorpus_chunk_path(corpus_path):
    '''converts a streamcorpus.Chunk file into the structure that is
    passed by the search engine to find_soft_selectors
    '''
    ch = clean_html(clean_html.default_config)
    cv = clean_visible(clean_visible.default_config)
    ids_and_clean_visible = []
    for si in streamcorpus.Chunk(path=corpus_path):
        if not si.body.clean_visible:
            ## attempt to make clean_visible
            if not si.body.raw:
                logger.critical('no raw content, so skipping: %r', si.abs_url)
                continue
            abs_url = si.abs_url
            si = ch(si, {})
            if not si:
                logger.critical(
                    'failed to make clean_html, so skipping: %r', abs_url)
                continue
            si = cv(si, {})
            if not si or not si.body.clean_visible:
                logger.critical(
                    'failed to make clean_visible, so skipping: %r', abs_url)
                continue
        rec = (si.stream_id, si.body.clean_visible.decode('utf8'), {})
        ids_and_clean_visible.append(rec)
    return ids_and_clean_visible
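# A minimal usage sketch (not part of the original source): the chunk path
# below is hypothetical; the loop only illustrates the shape of the records
# returned above, i.e. (stream_id, clean_visible decoded to unicode, empty dict).
if __name__ == '__main__':
    recs = ids_and_clean_visible_from_streamcorpus_chunk_path('/tmp/example-chunk.sc')
    for stream_id, clean_visible, metadata in recs:
        print stream_id, len(clean_visible)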
def __call__(self, i_str):
    backoff = 0.1
    start_time = time.time()
    tries = 0
    max_retries = int(self.config.get('max_retries', 1))
    last_exc = None
    while tries < max_retries:
        try:
            message = _message_versions[self.config['streamcorpus_version']]
            logger.debug('reading from %r' % i_str)
            chunk = streamcorpus.Chunk(path=i_str, mode='rb', message=message)
            return chunk
        except IOError, exc:
            if exc.errno == errno.ENOENT:
                logger.critical('File is missing? Assume is slow NFS, try %d more times',
                                max_retries - tries)
                backoff *= 2
                tries += 1
                elapsed = time.time() - start_time
                if elapsed > self.config['max_backoff']:
                    ## give up after five minutes of retries
                    logger.critical('File %r not found after %d retries',
                                    i_str, tries)
                    raise
                time.sleep(backoff)
                last_exc = exc
            else:
                logger.critical('failed loading %r', i_str, exc_info=True)
                raise
def read_stream_id(fn):
    ids = set()
    for si in streamcorpus.Chunk(path=fn, mode='rb',
                                 message=streamcorpus.StreamItem_v0_3_0):
        ids.add(si.stream_id)
    return ids
def worker(args):
    msg = sc.StreamItem_v0_2_0
    chunk_dir, hours, counts_dir, pid = args
    nhours = len(hours)
    for i, hour in enumerate(hours, 1):
        hdir = os.path.join(chunk_dir, hour)
        chunks = [os.path.join(hdir, fname) for fname in os.listdir(hdir)]
        ofile = os.path.join(counts_dir, '{}.txt.gz'.format(hour))
        print '{}) {} -- {}/{}'.format(pid, hour, i, nhours)
        print '--> {}'.format(ofile)
        counts = defaultdict(int)
        doc_counts = defaultdict(int)
        for chunk in chunks:
            for si in sc.Chunk(path=chunk, message=msg):
                doc_words = set()
                for sentence in si.body.sentences['lingpipe']:
                    for token in sentence.tokens:
                        t = token.token.decode('utf-8')
                        counts[t] += 1
                        doc_words.add(t)
                for word in doc_words:
                    doc_counts[word] += 1
        with gzip.open(ofile, 'wb') as f:
            for token, count in counts.iteritems():
                doc_count = doc_counts[token]
                f.write(token.encode('utf-8'))
                f.write('\t')
                f.write(str(count))
                f.write('\t')
                f.write(str(doc_count))
                f.write('\n')
def worker(args):
    chunk_dir, hours, wcounts_dir, dcounts_dir, log_file = args
    nhours = len(hours)
    with open(log_file, 'w') as lf:
        for i, hour in enumerate(hours, 1):
            hdir = os.path.join(chunk_dir, hour)
            chunks = [os.path.join(hdir, fname) for fname in os.listdir(hdir)]
            wcfile = os.path.join(wcounts_dir, '{}.txt.gz'.format(hour))
            dcfile = os.path.join(dcounts_dir, '{}.txt'.format(hour))
            lf.write('Counting hour {} ({}/{})\n'.format(hour, i, nhours))
            lf.flush()
            if os.path.exists(wcfile) and os.path.exists(dcfile):
                continue
            counts = defaultdict(int)
            doc_counts = defaultdict(int)
            num_docs = 0
            for chunk in chunks:
                for si in sc.Chunk(path=chunk):
                    if 'serif' not in si.body.sentences:
                        continue
                    num_docs += 1
                    doc_words = set()
                    for sentence in si.body.sentences['serif']:
                        for token in sentence.tokens:
                            t = token.token.decode('utf-8')
                            counts[t] += 1
                            doc_words.add(t)
                    for word in doc_words:
                        doc_counts[word] += 1
            if len(counts) == 0:
                lf.write(u'Warning: {} contained no words.\n'.format(chunk))
                lf.flush()
                continue

            # Write doc counts for this hour
            with open(dcfile, 'w') as f:
                f.write(str(num_docs))
                f.flush()

            # Write word counts for this hour
            with gzip.open(wcfile, 'wb') as f:
                for token, count in counts.iteritems():
                    doc_count = doc_counts[token]
                    f.write(token.encode('utf-8'))
                    f.write('\t')
                    f.write(str(count))
                    f.write('\t')
                    f.write(str(doc_count))
                    f.write('\n')
def streamitem_iter(self, event, corpus):
    for hour in event.list_event_hours():
        for chunk_path in self.get_chunks_for_hour(hour, corpus, event):
            with sc.Chunk(path=chunk_path, mode="rb",
                          message=corpus.sc_msg()) as chunk:
                for si in chunk:
                    yield hour, chunk_path, si
def test_protection(test_data_dir):
    with pytest.raises(streamcorpus.VersionMismatchError):  # pylint: disable=E1101
        for si in streamcorpus.Chunk(os.path.join(
                test_data_dir,
                'test/MAINSTREAM_NEWS-15-9d6218f0aa7c9585cda12a10d642a8b3-41600ffca7703f7914102da5256233ce.sc.xz'
                ), message=streamcorpus.StreamItem):
            pass
def worker(args):
    corpus, path = args
    locations = set()
    for si in sc.Chunk(path=path, message=corpus.sc_msg()):
        for sentence in si.body.sentences[u'article-clf']:
            for loc_seq in cuttsum.geo.get_loc_sequences(sentence):
                locations.add(loc_seq)
    return tuple(locations)
def _maybe_run_post_batch_incremental_transforms(self, t_path):
    ## Run post batch incremental (pbi) transform stages.
    ## These exist because certain batch transforms have
    ## to run before certain incremental stages.
    if self.pbi_stages:
        t_path2 = os.path.join(
            self.tmp_dir_path,
            'trec-kba-pipeline-tmp-%s' % str(uuid.uuid1()))
        # open destination for _run_incremental_transforms to write to
        self.t_chunk = streamcorpus.Chunk(path=t_path2, mode='wb')
        input_t_chunk = streamcorpus.Chunk(path=t_path, mode='rb')
        for si in input_t_chunk:
            self._run_incremental_transforms(si, self.pbi_stages)
        self.t_chunk.close()
        os.rename(t_path2, t_path)
def test_upgrade_streamcorpus_v0_3_0(test_data_dir):
    up = upgrade_streamcorpus_v0_3_0(config={})
    count = 0
    for si in streamcorpus.Chunk(get_test_chunk_path(test_data_dir),
                                 message=streamcorpus.StreamItem_v0_2_0):
        count += 1
        si3 = up(si)
        assert si3.version == streamcorpus.Versions._NAMES_TO_VALUES['v0_3_0']
        if count > 10:
            break
def sentencestring_worker_(job_queue, result_queue, **kwargs):
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    corpus = kwargs.get(u'corpus')
    cnlp = corenlp.server.CoreNLPClient()
    while not job_queue.empty():
        try:
            chunk_path, tsv_path = job_queue.get(block=False)
            sent_string_data = []
            for si in sc.Chunk(path=chunk_path, message=corpus.sc_msg()):
                sentences = corpus.get_sentences(si)
                str2idx = {}
                for idx, sentence in enumerate(sentences):
                    key = stringify_streamcorpus_sentence(sentence)
                    str2idx[key] = idx
                for sentence in si.body.sentences[u'article-clf']:
                    sc_string = stringify_streamcorpus_sentence(sentence)
                    idx = str2idx[sc_string]
                    #print idx, ")", sc_string
                    doc = cnlp.annotate(sc_string)
                    locs = get_loc_sequences(doc)
                    if len(locs) > 0:
                        locs_string = (u','.join(locs)).encode(u'utf-8')
                    else:
                        locs_string = 'nan'
                    cnlp_string = stringify_corenlp_doc(doc)
                    #print cnlp_string
                    sent_string_data.append({
                        u'stream id': si.stream_id,
                        u'sentence id': idx,
                        u'streamcorpus': sc_string,
                        u'corenlp': cnlp_string,
                        u'locations': locs_string
                    })
            if len(sent_string_data) > 0:
                df = pd.DataFrame(sent_string_data,
                                  columns=[u'stream id', u'sentence id',
                                           u'streamcorpus', u'corenlp',
                                           u'locations'])
                with gzip.open(tsv_path, u'w') as f:
                    df.to_csv(f, sep='\t', index=False, index_label=False)
            result_queue.put(None)
        except Queue.Empty:
            pass
def streamitem_iter(self, event, corpus, extractor):
    for hour in event.list_event_hours():
        path = self.get_chunk_path(event, extractor, hour, corpus)
        if os.path.exists(path):
            print path
            try:
                with sc.Chunk(path=path, mode="rb",
                              message=corpus.sc_msg()) as chunk:
                    for si in chunk:
                        yield hour, path, si
            except IOError, msg:
                print msg
def get_chunk(self, key):
    tries = 0
    while 1:
        fh = StringIO()
        key.get_contents_to_file(fh)
        data = fh.getvalue()
        _errors, data = decrypt_and_uncompress(
            data,
            self.config.get('gpg_decryption_key_path'),  ## how should this get into the config...?
            tmp_dir=self.config['tmp_dir_path'],
        )
        logger.info('\n'.join(_errors))
        if self.config['input_format'] == 'streamitem' and \
                self.config['streamcorpus_version'] == 'v0_1_0':
            i_content_md5 = key.key.split('.')[-3]
        else:
            ## go past {sc,protostream}.xz.gpg
            parts = key.key.split('.')
            ## split('.') strips the dots, so compare against 'gpg'
            if parts[-1] == 'gpg':
                parts.pop()
            i_content_md5 = parts[-3][-32:]

        ## verify the data matches expected md5
        f_content_md5 = hashlib.md5(data).hexdigest()  # pylint: disable=E1101
        if i_content_md5 != f_content_md5:
            msg = 'FAIL(%d): %s --> %s != %s' % (
                tries, key.key, i_content_md5, f_content_md5)
            logger.critical(msg)
            tries += 1
            if tries > self.config['tries']:
                ## indicate complete failure to pipeline so it
                ## gets recorded in task_queue
                raise FailedExtraction(msg)
            else:
                continue

        if self.config['input_format'] == 'spinn3r':
            ## convert the data from spinn3r's protostream format
            return _generate_stream_items(data)
        elif self.config['input_format'] == 'streamitem':
            message = _message_versions[self.config['streamcorpus_version']]
            return streamcorpus.Chunk(data=data, message=message)
        else:
            sys.exit('Invalid config: input_format = %r' %
                     self.config['input_format'])
def add(self, si):
    '''puts `si` into the currently open chunk, which it creates if
    necessary.  If this item causes the chunk to reach chunk_max, then
    the chunk is closed after adding.
    '''
    if self.o_chunk is None:
        if os.path.exists(self.t_path):
            os.remove(self.t_path)
        self.o_chunk = streamcorpus.Chunk(self.t_path, mode='wb')
    self.o_chunk.add(si)
    logger.debug('added %d-th item to chunk', len(self.o_chunk))
    if len(self.o_chunk) == self.chunk_max:
        self.close()
def next_chunk_file(chunk_file_num):
    deduped_path_fmt = self.get_deduped_path_fmt(
        event, corpus, extractor, threshold=thresh)
    deduped_path = deduped_path_fmt.format(chunk_file_num)
    deduped_dir = os.path.dirname(deduped_path)
    if not os.path.exists(deduped_dir):
        os.makedirs(deduped_dir)
    if os.path.exists(deduped_path):
        os.remove(deduped_path)
    return sc.Chunk(path=deduped_path, mode="wb", message=corpus.sc_msg())
def _article_resource_worker(job_queue, result_queue, **kwargs):
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    event = kwargs.get(u'event')
    corpus = kwargs.get(u'corpus')
    while not job_queue.empty():
        try:
            opath, chunk_paths = job_queue.get(block=False)
            artcl_detect = ArticleDetector(event)
            patt = event.regex_pattern()
            with sc.Chunk(path=opath, mode='wb',
                          message=corpus.sc_msg()) as ochunk:
                for path in chunk_paths:
                    for si in sc.Chunk(path=path, message=corpus.sc_msg()):
                        if si.body.clean_visible is None:
                            continue
                        # flags like re.I must be set when the pattern is
                        # compiled; a compiled pattern's second positional
                        # argument is a start position, not flags
                        elif patt.search(si.body.clean_visible):
                            #if corpus.annotator() not in si.body.sentences:
                            #    continue
                            sentences = corpus.get_sentences(si)
                            sent_idxs = artcl_detect.find_articles(sentences)
                            if len(sent_idxs) > 0:
                                rel_sents = []
                                for sent_idx in sent_idxs:
                                    #for token in sentences[sent_idx].tokens:
                                    #    print token.token,
                                    #print
                                    rel_sents.append(sentences[sent_idx])
                                si.body.sentences[u'article-clf'] = rel_sents
                                ochunk.add(si)
            result_queue.put(None)
        except Queue.Empty:
            pass
def streamitem_iter(self, event, corpus, extractor, threshold=.8):
    df = self.get_stats_df(event, corpus, extractor, threshold)
    if df is None:
        return
    import math
    num_chunks = int(math.ceil(len(df) / 1000.))
    tmp = self.get_deduped_path_fmt(event, corpus, extractor, threshold)
    for i in xrange(1, num_chunks + 1):
        path = tmp.format(i)
        if os.path.exists(path):
            with sc.Chunk(path=path, mode="rb",
                          message=corpus.sc_msg()) as chunk:
                for si in chunk:
                    yield si
def _idf_resource_worker(job_queue, result_queue, **kwargs):
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    corpus = kwargs.get(u'corpus')
    while not job_queue.empty():
        try:
            mpath, paths = job_queue.get(block=False)
            n_docs = 0
            counts = defaultdict(int)
            for path in paths:
                for si in sc.Chunk(path=path, mode='rb',
                                   message=corpus.sc_msg()):
                    sentences = corpus.get_sentences(si)
                    if len(sentences) == 0:
                        continue
                    n_docs += 1
                    unique_words = set()
                    for sentence in sentences:
                        for token in sentence.tokens:
                            unique_words.add(
                                token.token.decode(u'utf-8').lower())
                    for word in unique_words:
                        counts[word] += 1
            n_docs = float(n_docs)
            words = counts.keys()
            idfs = [tuple([np.log(n_docs / value) + 1, value])
                    for value in counts.values()]
            trie = marisa_trie.RecordTrie("<dd", zip(words, idfs))
            with gzip.open(mpath, u'wb') as f:
                trie.write(f)
            result_queue.put(None)
        except Queue.Empty:
            pass
def test_kvlayer_index_with_source(configurator, test_data_dir):
    overlay = {
        'streamcorpus_pipeline': {
            'to_kvlayer': {
                'indexes': ['with_source'],
            },
        },
    }
    with chunks(configurator, test_data_dir, overlay) as (path, client):
        # We should not have written the doc_id_epoch_ticks index at all
        for k, v in client.scan('stream_items_doc_id_epoch_ticks'):
            assert False, 'epoch_ticks present! k={!r}'.format(k)
        # Every item in the ...with_source index should match a real item
        for k, v in client.scan('stream_items_with_source'):
            assert v == 'WEBLOG'  # by inspection
            for kk, sixz in client.get('stream_items', k):
                errs, sibytes = streamcorpus.decrypt_and_uncompress(sixz)
                assert errs == []
                for si in streamcorpus.Chunk(data=sibytes):
                    assert si.source == v
def test_upgrade_streamcorpus_v0_3_0_check_mention_ids(test_data_dir):
    up = upgrade_streamcorpus_v0_3_0(config={})
    all_mention_ids = set()
    for si in streamcorpus.Chunk(os.path.join(
            test_data_dir,
            'test/MAINSTREAM_NEWS-15-9d6218f0aa7c9585cda12a10d642a8b3-41600ffca7703f7914102da5256233ce.sc.xz'
            ), message=streamcorpus.StreamItem_v0_2_0):
        si3 = up(si)
        assert si3.version == streamcorpus.Versions._NAMES_TO_VALUES['v0_3_0']
        mention_ids = set()
        for sentence in si3.body.sentences['lingpipe']:
            sentence_mention_ids = set()
            for token in sentence.tokens:
                if token.mention_id not in [None, -1]:
                    sentence_mention_ids.add(token.mention_id)
            assert mention_ids.intersection(sentence_mention_ids) == set()
            mention_ids.update(sentence_mention_ids)
            all_mention_ids.update(sentence_mention_ids)
    assert len(all_mention_ids) > 0
def keys_and_values():
    for si in streamcorpus.Chunk(t_path):
        key1 = uuid.UUID(int=si.stream_time.epoch_ticks)
        key2 = uuid.UUID(hex=si.doc_id)
        data = streamcorpus.serialize(si)
        errors, data = streamcorpus.compress_and_encrypt(data)
        assert not errors, errors
        yield (key1, key2), data
        for ndx in indexes:
            if ndx == 'doc_id_epoch_ticks':
                kvp = ((key2, key1), r'')
            elif ndx == 'with_source':
                ## si.source can be None but we can't write None blobs to kvlayer
                if si.source:
                    kvp = ((key1, key2), si.source)
                else:
                    continue
            else:
                assert False, ('invalid index type ' + ndx)
            indexes[ndx].append(kvp)
def __init__(self, filenames, **kwargs):
    super(StreamCorpusDataset, self).__init__(kwargs)
    filenames = getfiles(filenames)
    for filename in filenames:
        for si in sc.Chunk(path=filename):
            if si.body.clean_visible is None:
                continue
            did = si.stream_id
            try:
                sentences = si.body.sentences["serif"]
            except KeyError:
                sentences = si.body.sentences["lingpipe"]
            for sind, sentence in enumerate(sentences):
                sid = make_sid(did, sind)
                self.add_sentence(sid, sentence)
    self.build_dictionary()
def test_kvlayer_reader_and_writer(configurator, test_data_dir):
    with chunks(configurator, test_data_dir) as (path, client):
        ## check that index table was created
        all_doc_ids = set()
        all_epoch_ticks = set()
        for (doc_id, epoch_ticks), empty_data in client.scan(
                'stream_items_doc_id_epoch_ticks'):
            all_doc_ids.add(doc_id)
            all_epoch_ticks.add(epoch_ticks)
        all_doc_ids = sorted(all_doc_ids)
        all_epoch_ticks = sorted(all_epoch_ticks)
        logger.info('%d doc_ids', len(all_doc_ids))

        ## make a reader
        config = yakonfig.get_global_config('streamcorpus_pipeline',
                                            'from_kvlayer')
        reader = from_kvlayer(config)

        ## test it with different i_str inputs:
        for i_str in ['',
                      '0,,%d,' % 10**10,
                      '%d,%s,%d,%s' % (all_epoch_ticks[0], all_doc_ids[0],
                                       all_epoch_ticks[-1], all_doc_ids[-1])]:
            stream_ids = []
            for si in reader(i_str):
                stream_ids.append(si.stream_id)
            _input_chunk_ids = [si.stream_id for si in streamcorpus.Chunk(path)]
            input_chunk_ids = list(set(_input_chunk_ids))
            logger.info('%d inserts, %d unique',
                        len(_input_chunk_ids), len(input_chunk_ids))
            input_chunk_ids.sort()
            stream_ids.sort()
            assert len(input_chunk_ids) == len(stream_ids)
            assert input_chunk_ids == stream_ids
def main():
    event_file, rc_dir, event_title, ofile = parse_args()
    event = load_event(event_title, event_file)
    hours = [dth for dth in gen_dates(event.start, event.end)]
    num_hours = len(hours)
    meta_data = []
    bow_dicts = []
    for h, hour in enumerate(hours, 1):
        path = os.path.join(rc_dir, '{}.sc.gz'.format(hour))
        for si in sc.Chunk(path=path):
            uni2id = {}
            for sid, sentence in enumerate(si.body.sentences[u'serif'], 0):
                uni2id[sentence_uni(sentence)] = sid
            for sent in si.body.sentences[u'article-clf']:
                bow_dict = {}
                for token in sent.tokens:
                    t = token.token.decode(u'utf-8').lower()
                    bow_dict[t] = 1
                bow_dicts.append(bow_dict)
                uni = sentence_uni(sent)
                sent_id = uni2id[uni]
                meta_data.append((hour, si.stream_id, sent_id, uni))
    vctr = DictVectorizer()
    X = vctr.fit_transform(bow_dicts)
    with codecs.open(ofile, 'w', 'utf-8') as f:
        for i, (hour, stream_id, sent_id, uni) in enumerate(meta_data):
            uni = uni.replace(u'\n', u' ').replace(u'\t', u' ')
            f.write(u'{}\t{}\t{}\t{}\t'.format(hour, stream_id, sent_id, uni))
            x = u' '.join([unicode(col) for col in X[i, :].indices])
            f.write(x)
            f.write(u'\n')
            f.flush()
def worker(args):
    rc_dir, nuggets, hours, event, doc_freqs, word_freqs = args
    msg = sc.StreamItem_v0_2_0
    for hour in hours:
        active_nuggets = get_active_nuggets(hour, nuggets)
        if len(active_nuggets) == 0:
            continue
        hour_m5 = get_previous_hour(hour, 5)
        hour_m10 = get_previous_hour(hour, 10)
        num_docs = doc_count(doc_freqs, hour)
        num_m5_docs = doc_count(doc_freqs, hour_m5)
        num_m10_docs = doc_count(doc_freqs, hour_m10)
        hour_wc = read_tfdf(word_freqs, hour)
        hour_m5_wc = read_tfdf(word_freqs, hour_m5)
        hour_m10_wc = read_tfdf(word_freqs, hour_m10)
        chunk = os.path.join(rc_dir, '{}.sc.gz'.format(hour))
        for si in sc.Chunk(path=chunk, message=msg):
            doc_wc = make_doc_wordcounts(si)
            for sentence in si.body.sentences['lingpipe']:
                avg_tfidf = compute_avg_tfidf(sentence, doc_wc,
                                              hour_wc, num_docs)
                avg_m5_tfidf = compute_avg_tfidf(sentence, None,
                                                 hour_m5_wc, num_m5_docs)
                avg_m10_tfidf = compute_avg_tfidf(sentence, None,
                                                  hour_m10_wc, num_m10_docs)
                delta_m5_tfidf = avg_tfidf - avg_m5_tfidf
                delta_m10_tfidf = avg_tfidf - avg_m10_tfidf
                tokens = [token.token for token in sentence.tokens]
                print avg_tfidf, avg_m5_tfidf, avg_m10_tfidf, ' '.join(tokens)
            print hour, hour_m5, num_m5_docs, hour_m10
            sys.exit()
def _get_streamitem(self):
    for si in streamcorpus.Chunk(EXPORT_SC_FILENAME):
        return si
    return None
def main():
    event_file, rc_dir, event_title, ofile, ports, cnts_dirs = parse_args()
    wc_dir, dc_dir = cnts_dirs
    event = load_event(event_title, event_file)
    hours = [dth for dth in gen_dates(event.start, event.end)]

    print "Connecting lm clients..."
    dm_lm_score = lm_client_init(ports[0])
    bg_lm3_score = lm_client_init(ports[1][0])
    bg_lm4_score = lm_client_init(ports[1][1])
    bg_lm5_score = lm_client_init(ports[1][2])

    print "Query words:", event.query
    query_matcher = query_term_match_init(event.query)
    wn_terms = wn_synset_terms(event.type)
    print "WordNet synset terms:", wn_terms
    synset_matcher = query_term_match_init(wn_terms)

    tfidfers = []
    preroll = [get_previous_hour(hours[0], i) for i in range(1, 6)]
    for hour in preroll:
        tfidfers.append(init_tfidfers(wc_dir, dc_dir, hour, lower=True))
    tfidfers.append(None)

    of = open(ofile, 'w')
    header = "hour\tstream-id\tsent-id\t" \
        + "avg-tfidf\tavg-tfidf-m1\tavg-tfidf-m5\t" \
        + "dm-logprob\tdm-avg-logprob\tbg3-logprob\tbg3-avg-logprob\t" \
        + "bg4-logprob\tbg4-avg-logprob\tbg5-logprob\tbg5-avg-logprob\t" \
        + "query-matches\tsynset-matches\tnum-tokens\tarticle-position\t" \
        + "article-position-rel\tcapsrate\n"
    of.write(header)
    of.flush()

    num_hours = len(hours)
    for h, hour in enumerate(hours, 1):
        tfidfers = [init_tfidfers(wc_dir, dc_dir, hour, lower=True)] \
            + tfidfers[0:-1]
        print "({}/{}) {}".format(h, num_hours, hour)
        path = os.path.join(rc_dir, '{}.sc.gz'.format(hour))
        for si in sc.Chunk(path=path):
            ticks = float(si.stream_time.epoch_ticks)
            si_datetime = datetime.utcfromtimestamp(ticks)
            tdelta = si_datetime - event.start
            uni2id = {}
            doc_word_counts = defaultdict(int)
            for sid, sentence in enumerate(si.body.sentences[u'serif'], 0):
                uni2id[sentence_uni(sentence)] = sid
                for token in sentence.tokens:
                    t = token.token.decode(u'utf-8').lower()
                    doc_word_counts[t] += 1
            nsents = len(si.body.sentences[u'article-clf'])
            for apos, sent in enumerate(si.body.sentences[u'article-clf'], 1):
                tf_dict = {}
                for token in sent.tokens:
                    t = token.token.decode(u'utf-8').lower()
                    tf_dict[t] = doc_word_counts[t]
                tfidfs_now = tfidfers[0](tf_dict)
                tfidfs_m1 = tfidfers[1](tf_dict)
                tfidfs_m5 = tfidfers[5](tf_dict)
                scores = compute_tfidfs(tfidfs_now, tfidfs_m1, tfidfs_m5)
                avg_tfidf, avg_tfidf_m1, avg_tfidf_m5 = scores
                uni = sentence_uni(sent)
                sent_id = uni2id[uni]
                apos_rel = apos / float(nsents)
                num_tokens = len(sent.tokens)
                caps_rate = get_caps_rate(sent)
                dm_lp, dm_alp = dm_lm_score(uni)
                bg3_lp, bg3_alp = bg_lm3_score(uni)
                bg4_lp, bg4_alp = bg_lm4_score(uni)
                bg5_lp, bg5_alp = bg_lm5_score(uni)
                query_matches = query_matcher(uni)
                synset_matches = synset_matcher(uni)
                # print dm_lp, dm_alp, bg3_lp, bg3_alp, bg4_lp, bg4_alp, bg5_lp, bg5_alp
                dstr = ('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}'
                        + '\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n').format(
                    hour, si.stream_id, sent_id,
                    avg_tfidf, avg_tfidf_m1, avg_tfidf_m5,
                    dm_lp, dm_alp, bg3_lp, bg3_alp,
                    bg4_lp, bg4_alp, bg5_lp, bg5_alp,
                    query_matches, synset_matches, num_tokens,
                    apos, apos_rel, caps_rate)
                of.write(dstr)
                of.flush()
    of.close()