class DataGoogle(Data):
    name = 'Google'
    orgn = ['News.txt']
    seq_len = 10
    topic_num = 152
    w_verify_func = du.word_verify(None, None, 0.0, None)
    wf_flt_func = lambda word, freq: freq >= 0
    doc_flt_func = lambda d: len(d.tokens) >= 3 and d.topic is not None
    topic_flt_func = lambda rank, freq: True

    def filter_into_temp(self):
        twarr = iu.load_array(self.orgn_file)
        print(len(twarr), type(twarr[0]))
        docarr = du.make_docarr(
            [[tw[k] for k in ('tweetId', 'clusterNo', 'textCleaned')] for tw in twarr])
        du.dump_docarr(self.temp_file, docarr)
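# A minimal, hypothetical sketch of the record shape that DataGoogle.filter_into_temp
# assumes: every entry of News.txt is a dict carrying at least the three keys read
# above, and du.make_docarr is fed (doc_id, topic, text) triples built from them.
_example_google_record = {
    'tweetId': '1234567890',                    # document id
    'clusterNo': 17,                            # gold topic / cluster label
    'textCleaned': 'some preprocessed tweet',   # text used for modelling
}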
class DataEvent(Data):
    name = 'Event'
    orgn = ['Terrorist']
    seq_len = 14
    topic_num = 69
    w_verify_func = du.word_verify(2, 16, 0.8, pu.nltk_stop_words)
    wf_flt_func = lambda word, freq: freq >= 3
    doc_flt_func = lambda d: len(d.tokens) >= 3 and d.topic is not None
    topic_flt_func = lambda rank, freq: True

    def filter_into_temp(self):
        file_list = iu.list_children(self.orgn_file, full_path=True)
        twarr_list = [iu.load_array(file) for file in file_list]
        doclist = list()
        for topic_id, twarr in enumerate(twarr_list):
            for tw in twarr:
                doclist.append((str(tw['id']), topic_id, tw['text'].replace('#', '')))
        docarr = du.make_docarr(doclist)
        du.dump_docarr(self.temp_file, docarr)
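# Hypothetical layout assumed by DataEvent.filter_into_temp: the 'Terrorist' directory
# holds one file per topic, each containing records with at least 'id' and 'text';
# the topic label is simply the file's position in the directory listing.
_example_event_record = {'id': 987654321, 'text': 'raw tweet text with #hashtags'}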
class DataTREC(Data):
    name = 'TREC'
    orgn = ['Tweets.txt']
    topic_num = 128
    w_verify_func = du.word_verify(3, 14, 0.8, pu.my_stop_words)
    wf_flt_func = lambda word, freq: freq >= 3
    doc_flt_func = lambda d: len(d.tokens) >= 5 and d.topic is not None
    topic_flt_func = lambda rank, freq: 10 <= freq

    def filter_into_temp(self):
        twarr = iu.load_array(self.orgn_file)
        outrows = list()
        for tw in twarr:
            # skip tweets with a relevance score above 1
            if tw['relevance'] > 1:
                continue
            docid, topic, text = tw['tweetId'], tw['clusterNo'], tw['text']
            # discard texts whose tokenized form is 10 characters or shorter
            if len(' '.join(pu.tokenize(text, pu.tokenize_pattern))) <= 10:
                continue
            outrows.append([docid, topic, text])
        topics = Counter([r[1] for r in outrows])
        print('get {} rows'.format(len(outrows)))
        print('{} topics, {}'.format(len(topics), topics))
        du.dump_docarr(self.temp_file, du.make_docarr(outrows))
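# Hypothetical record shape assumed by DataTREC.filter_into_temp: besides the id,
# cluster label and text, each tweet carries a 'relevance' judgement used for filtering.
_example_trec_record = {
    'tweetId': '1234567890',
    'clusterNo': 42,
    'text': 'raw tweet text',
    'relevance': 1,    # tweets with relevance > 1 are skipped
}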
class DataReuters(Data):
    name = 'Reuters'
    orgn = ['segments']
    seq_len = 100
    topic_num = 31
    w_verify_func = du.word_verify(3, 16, 0.8, pu.nltk_stop_words)
    wf_flt_func = lambda word, freq: freq >= 3
    doc_flt_func = lambda d: len(d.tokens) >= 3 and d.topic is not None
    topic_flt_func = lambda rank, freq: freq >= 20

    def filter_into_temp(self):
        from bs4 import BeautifulSoup
        files = iu.list_children(self.orgn_file, full_path=True)
        array = list()
        for fidx, file in enumerate(files):
            print(fidx)
            tree = BeautifulSoup(''.join(iu.read_lines(file)), "html.parser")
            for article in tree.find_all("reuters"):
                # keep only articles labelled with exactly one topic
                topics = list(article.topics.children)
                if not len(topics) == 1:
                    continue
                # drop non-ascii characters; decode back to str so the b'...' repr is not kept
                topic = topics[0].text.encode('ascii', 'ignore').decode()
                text = article.find('text')
                if text is None or text.body is None:
                    continue
                title = text.title.text if text.title is not None else ''
                title = ' '.join(pu.tokenize(title, pu.tokenize_pattern))
                body = ' '.join(pu.tokenize(text.body.text, pu.tokenize_pattern))
                array.append((topic, '{}, {}'.format(title, body)))
        docarr = du.make_docarr(
            [(idx, topic, body) for idx, (topic, body) in enumerate(array)])
        print(len(docarr))
        print(Counter([d.topic for d in docarr]))
        print(len(sorted(set([d.topic for d in docarr]))))
        du.dump_docarr(self.temp_file, docarr)
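# A sketch of the SGML fragment that DataReuters.filter_into_temp expects inside each
# segment file (Reuters-21578 style); the fragment below is illustrative only. Articles
# whose <TOPICS> element does not contain exactly one child are skipped.
_example_reuters_article = '''
<REUTERS>
  <TOPICS><D>earn</D></TOPICS>
  <TEXT>
    <TITLE>Example headline</TITLE>
    <BODY>Example article body.</BODY>
  </TEXT>
</REUTERS>
'''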
class DataR10K(Data):
    name = 'R10K'
    orgn = ['data']
    topic_num = 4
    w_verify_func = du.word_verify(3, 16, 0.6, None)
    wf_flt_func = lambda word, freq: freq >= 5
    doc_flt_func = lambda d: len(d.tokens) >= 10
    topic_flt_func = lambda rank, freq: True

    def filter_into_temp(self):
        from os.path import join
        did_to_cat = dict()
        cat_list = ['CCAT', 'GCAT', 'MCAT', 'ECAT']
        data_dir = _data_base.format(self.name, self.special[0])
        # map document id -> list of top-level categories
        with open(join(data_dir, 'rcv1-v2.topics.qrels')) as fin:
            for line in fin.readlines():
                line = line.strip().split(' ')
                cat, did = line[0], int(line[1])
                if cat in cat_list:
                    did_to_cat[did] = did_to_cat.get(did, []) + [cat]
        # keep only documents labelled with exactly one top-level category
        for did in list(did_to_cat.keys()):
            if len(did_to_cat[did]) > 1:
                del did_to_cat[did]
        dat_list = [
            'lyrl2004_tokens_test_pt0.dat', 'lyrl2004_tokens_test_pt1.dat',
            'lyrl2004_tokens_test_pt2.dat', 'lyrl2004_tokens_test_pt3.dat',
            'lyrl2004_tokens_train.dat'
        ]
        data, target, docarr = list(), list(), list()
        cat_to_cid = {'CCAT': 0, 'GCAT': 1, 'MCAT': 2, 'ECAT': 3}
        did, doc = None, ''
        for dat in dat_list:
            with open(join(data_dir, dat)) as fin:
                for line in fin.readlines():
                    if line.startswith('.I'):
                        # flush the previous document before starting a new one
                        if did is not None:
                            assert doc != ''
                            if did in did_to_cat:
                                data.append(doc)
                                target.append(cat_to_cid[did_to_cat[did][0]])
                                docarr.append((did, cat_to_cid[did_to_cat[did][0]], doc))
                        did = int(line.strip().split(' ')[1])
                        doc = ''
                    elif line.startswith('.W'):
                        assert doc == ''
                    else:
                        doc += line
        # flush the last document of the last file
        if did is not None and did in did_to_cat:
            data.append(doc)
            target.append(cat_to_cid[did_to_cat[did][0]])
            docarr.append((did, cat_to_cid[did_to_cat[did][0]], doc))
        print((len(data), 'and', len(did_to_cat)))
        print(data[0])
        assert len(data) == len(did_to_cat)
        print(len(docarr))
        du.dump_docarr(self.temp_file, du.make_docarr(docarr[:20000]))
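if __name__ == '__main__':
    # A usage sketch, assuming the Data subclasses can be instantiated without arguments
    # and that the base class derives orgn_file / temp_file from `name` and `orgn`.
    # The filter lambdas are called the way a downstream preprocessing step presumably
    # would; the loop itself is illustrative, not part of the pipeline.
    for data_cls in (DataGoogle, DataEvent, DataTREC, DataReuters, DataR10K):
        data = data_cls()
        data.filter_into_temp()                          # raw corpus -> (id, topic, text) temp file
        keep_word = data_cls.wf_flt_func('example', 5)   # word kept if frequent enough
        keep_topic = data_cls.topic_flt_func(0, 100)     # topic kept if frequent enough
        print(data_cls.name, keep_word, keep_topic)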