def parse(self, response):
    youbi_cd = date.today().weekday()
    for idx, bangumi_list in enumerate(response.css('td[valign="top"]')):
        # Skip the columns for every day of the week except today
        if idx != youbi_cd:
            continue
        # Extract the summary text of target programs only
        for bangumi in bangumi_list.css('table.new_day'):
            bangumi_name = bangumi.css(
                'span.prog_name a.bangumiDetailOpen::text').extract_first()
            if bangumi_name is None:
                continue
            target_name = get_target_bangumi_name(bangumi_name.strip(),
                                                  TARGET_BANGUMI_DICT)
            if target_name is None:
                continue
            print(bangumi_name)
            text = bangumi.css(
                'span.expo_org a.bangumiDetailOpen::text').extract_first()
            if text is None:
                continue
            text = remove_words(text,
                                TARGET_BANGUMI_DICT[target_name]['rm_word'],
                                KYOKU_SEP_DICT['asahi'])
            if text != '':
                # Create a fresh item per program so later iterations do not
                # mutate an item that has already been yielded
                item = NewsItem()
                item['text'] = text
                yield item
def parse(self, response):
    for bangumi in response.css('tbody td'):
        item = NewsItem()
        oa = bangumi.css('p.oa::text').extract_first()
        if oa is None:
            continue
        # Only cells whose on-air field looks like "HH:MM" hold a program
        if re.match(r'\d{2}:\d{2}', oa.strip()):
            bangumi_name = bangumi.css('h3::text').extract_first().split()
            # Check whether this program is a scraping target
            target_name = get_target_bangumi_name(bangumi_name,
                                                  TARGET_BANGUMI_DICT)
            if target_name is None:
                continue
            # Get the program summary
            text = bangumi.css('p::text').extract()[1]
            if text is None:
                continue
            text = remove_words(text.strip(),
                                TARGET_BANGUMI_DICT[target_name]['rm_word'],
                                KYOKU_SEP_DICT['ntv'])
            # Follow the detail page; the partially filled item travels in meta
            desc_url = bangumi.css('a::attr(href)').extract_first().strip()
            yield response.follow(
                desc_url,
                callback=self.parse_desc_page,
                meta={'item': item, 'text': text, 'target_name': target_name}
            )
def parse_desc_page(self, response):
    item = response.meta['item']
    target_name = response.meta['target_name']
    text = response.css('div.copy-box').css('p::text').extract_first()
    if text is not None:
        text = remove_words(text.strip(),
                            TARGET_BANGUMI_DICT[target_name]['rm_word'],
                            KYOKU_SEP_DICT['tbs'])
        if text != '':
            item['text'] = text
            yield item
def parse_desc_page(self, response):
    item = response.meta['item']
    text = response.meta['text']
    target_name = response.meta['target_name']
    text_tmp = ''
    # '詳細' is the "Details" heading of the section holding the full summary
    for info in response.css('div.program'):
        if info.css('h2::text').extract_first() == '詳細':
            text_tmp = info.css('p::text').extract_first()
            break
    # extract_first() may return None, so check truthiness, not just ''
    if text_tmp:
        text_tmp = remove_words(text_tmp.strip(),
                                TARGET_BANGUMI_DICT[target_name]['rm_word'],
                                KYOKU_SEP_DICT['ntv'])
        text = text_tmp
    if text != '':
        item['text'] = text
        yield item
def parse(self, response):
    for bangumi in response.css('div#wrap').css('td.info'):
        # '報道・情報' is the "news / information" genre label on the page
        if bangumi.css('span.inform::text').extract_first() == '報道・情報':
            bangumi_name = bangumi.css('a::text').extract_first()
            # Check whether this program is a scraping target
            target_name = get_target_bangumi_name(bangumi_name,
                                                  TARGET_BANGUMI_DICT)
            if target_name is None:
                continue
            # Get the program summary
            text = bangumi.css('p.tx_pad::text').extract_first()
            if text is None:
                continue
            text = remove_words(text.strip(),
                                TARGET_BANGUMI_DICT[target_name]['rm_word'],
                                KYOKU_SEP_DICT['fuji'])
            if text != '':
                item = NewsItem()
                item['text'] = text
                yield item
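# NewsItem is not defined in this section. Given that every spider above only
# ever populates item['text'], a minimal Scrapy item consistent with that
# usage would be the sketch below (an assumption, not the project's actual file).
import scrapy

class NewsItem(scrapy.Item):
    text = scrapy.Field()  # program summary text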
def extract_target_data(f, contents_dict):
    '''Extract the required fields and write them to the output file.'''
    for content in contents_dict['list']['g1']:
        # Check whether this program is a scraping target
        target_name = get_target_bangumi_name(content['title'],
                                              TARGET_BANGUMI_DICT)
        if target_name is None:
            continue
        # Build the program summary by concatenating the target fields
        print('target_name ', target_name)
        text = ''
        for target_content in TARGET_BANGUMI_DICT[target_name]['target']:
            text += content[target_content] + KYOKU_SEP_DICT['nhk'][0]
        print('text ', text)
        # text is built up from '', so it can be empty but never None
        if text == '':
            continue
        text = remove_words(text.strip(),
                            TARGET_BANGUMI_DICT[target_name]['rm_word'],
                            KYOKU_SEP_DICT['nhk'])
        if text != '':
            f.write(text + '\n')
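# The shared helpers get_target_bangumi_name and remove_words are not shown in
# this section. The sketch below is inferred from the call sites above
# (substring matching on the title; separator-wise filtering of the summary)
# and is an assumption, not the project's actual implementation. Note that it
# is the scraper-side helper: the classification demo below uses a different
# two-argument remove_words from its own module.
import re

def get_target_bangumi_name(bangumi_name, target_dict):
    '''Return the key of target_dict matching the program title, else None.'''
    if bangumi_name is None:
        return None
    # Some callers pass a token list (a str.split() result), others a string
    title = ' '.join(bangumi_name) if isinstance(bangumi_name, list) else bangumi_name
    for name in target_dict:
        if name in title:  # assumed: simple substring match
            return name
    return None

def remove_words(text, rm_words, sep_words):
    '''Split text on the station-specific separators and drop segments that
    contain any of the words to remove (assumed behavior).'''
    pattern = '|'.join(re.escape(sep) for sep in sep_words)
    segments = re.split(pattern, text)
    kept = [seg for seg in segments
            if seg and not any(w in seg for w in rm_words)]
    return sep_words[0].join(kept)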
def main(args): print("Open data") df = pd.read_csv(args.datapath) # tokenize print("Tokenize text") tokenizer = NLTKToknizerWrapper(False) df["token"] = df["text"].apply(lambda t: tokenizer.tokenize(t.lower())) X = df["token"].values.tolist() y = df["label"].values.tolist() # optional preprocessing print("Preprocessing") X = [[remove_characters(token) for token in tokens] for tokens in X] X = [list(filter(lambda x: x != '', tokens)) for tokens in X] id_stopwords = stopwords.words('indonesian') X = [remove_words(tokens, id_stopwords) for tokens in X] # split print("Split Data") X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=4371) X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size=0.9, test_size=0.1, random_state=4371) data = X_train, y_train, X_test, y_test, X_val, y_val if args.architecture == "rnn": rnn_classify_demo.main(args, data) elif args.architecture == "cnn": cnn_classify_demo.main(args, data)