# Third-party imports used in this module: readability-lxml and BeautifulSoup4.
# db_session, db_model, segment_text, check_topic1..check_topic10, TOPIC1..TOPIC10,
# DB and csvfile_path are project-level helpers/constants defined elsewhere.
import csv
import json

from bs4 import BeautifulSoup as BS
from readability import Document


def extract_content():
    DB = 'mysql+pymysql://homestead:[email protected]/public_opinion?charset=utf8'
    session = db_session(DB)
    M = db_model(DB, 'corpus')
    query = session.query(M)

    # Process 'ready' rows in batches of 30 until none are left.
    while True:
        corpuses = query.filter(M.status == 'ready').order_by(M.id).limit(30).all()
        if not corpuses:
            break
        for corpus in corpuses:
            try:
                # Pull the main article body out of the raw HTML, then strip the tags.
                summary_html = Document(corpus.html).summary(html_partial=True)
                content = BS(summary_html, 'html.parser').text.strip()
                corpus.content = content
                session.commit()
            except Exception:
                corpus.content = '[extract_error]'
                session.commit()
                print('===> extract_content error, id: ', corpus.id)
            corpus.status = 'extracted'
            session.commit()

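# db_session and db_model are not shown in this module. A minimal sketch of what
# they might look like, assuming plain SQLAlchemy (1.4+) with table reflection;
# the function bodies below are assumptions, not the project's actual helpers.

from sqlalchemy import create_engine
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import sessionmaker


def db_session(db_url):
    # Build a session bound to the given connection string.
    engine = create_engine(db_url)
    return sessionmaker(bind=engine)()


def db_model(db_url, table_name):
    # Reflect an existing table (e.g. 'corpus') into a mapped class.
    engine = create_engine(db_url)
    Base = automap_base()
    Base.prepare(autoload_with=engine)
    return getattr(Base.classes, table_name)
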
def export_csv():
    DB = 'mysql+pymysql://homestead:[email protected]/public_opinion?charset=utf8'
    session = db_session(DB)
    M = db_model(DB, 'corpus')
    query = session.query(M)

    # Text mode with newline='' is what the csv module expects under Python 3.
    csv_file = open(csvfile_path, 'w', newline='', encoding='utf-8')
    writer = csv.writer(csv_file)
    table_head = ['url', 'website', 'published_at', 'word_freq', 'topic']
    writer.writerow(table_head)

    # Page through the 'marked' rows 3000 at a time and dump them to CSV.
    offset = 0
    limit = 3000
    while True:
        corpuses = (query.filter(M.status == 'marked')
                         .order_by(M.id)
                         .offset(offset)
                         .limit(limit)
                         .all())
        if not corpuses:
            break
        for corpus in corpuses:
            table_row = [corpus.url, corpus.website, corpus.published_at,
                         corpus.word_freq, corpus.topic]
            writer.writerow(table_row)
            print('===> write id: ', corpus.id)
        offset += limit
    csv_file.close()

def mark_topic():
    DB = 'mysql+pymysql://homestead:[email protected]/public_opinion?charset=utf8'
    session = db_session(DB)
    M = db_model(DB, 'corpus')
    query = session.query(M)

    # Each check_topicN tests the word-frequency dict for one topic; the matching
    # TOPICN flags are OR-ed together into a bitmask stored on the row.
    topic_checks = [
        (check_topic1, TOPIC1), (check_topic2, TOPIC2),
        (check_topic3, TOPIC3), (check_topic4, TOPIC4),
        (check_topic5, TOPIC5), (check_topic6, TOPIC6),
        (check_topic7, TOPIC7), (check_topic8, TOPIC8),
        (check_topic9, TOPIC9), (check_topic10, TOPIC10),
    ]

    while True:
        # .all() is needed here: a bare Query object is always truthy, so the
        # `if not corpuses` check could never end the loop without it.
        corpuses = query.filter(M.status == 'segmented').order_by(M.id).all()
        if not corpuses:
            break
        for corpus in corpuses:
            try:
                word_freq = json.loads(corpus.word_freq)
                topic = 0
                for check, flag in topic_checks:
                    if check(word_freq):
                        topic |= flag
                corpus.topic = topic
                session.commit()
                print('===> mark topic, id: ', corpus.id)
            except Exception:
                print('===> mark topic error, id: ', corpus.id)
            corpus.status = 'marked'
            session.commit()

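# The TOPICN flags and check_topicN predicates are defined elsewhere in the
# project. A minimal sketch of the shape they likely take, assuming bit-flag
# constants and keyword matching against the word-frequency dict; the keyword
# set below is a placeholder, not the real one.

TOPIC1 = 1 << 0
TOPIC2 = 1 << 1  # ... and so on up to TOPIC10 = 1 << 9

TOPIC1_KEYWORDS = {'keyword_a', 'keyword_b'}  # hypothetical keyword set


def check_topic1(word_freq):
    # word_freq maps token -> count; a topic matches if any of its keywords appear.
    return any(word in word_freq for word in TOPIC1_KEYWORDS)
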
def segment():
    DB = 'mysql+pymysql://homestead:[email protected]/public_opinion?charset=utf8'
    session = db_session(DB)
    M = db_model(DB, 'corpus')
    query = session.query(M)

    # Unlike extract_content(), the batching loop is disabled here: the query is
    # iterated once over every successfully extracted row.
    corpuses = (query.filter(M.status == 'extracted',
                             M.content != '[extract_error]')
                     .order_by(M.id))
    for corpus in corpuses:
        try:
            corpus.word_freq = segment_text(corpus.content)
            session.commit()
            print('===> segment id: ', corpus.id)
        except Exception:
            print('===> segment error, id: ', corpus.id)
        corpus.status = 'segmented'
        session.commit()

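# segment_text is defined elsewhere. Given the word-frequency JSON column it
# feeds, a minimal sketch assuming a Chinese corpus tokenised with jieba; the
# whitespace filtering is an assumption, not the project's actual logic.

from collections import Counter

import jieba


def segment_text(text):
    # Tokenise, drop whitespace-only tokens, and store counts as a JSON string.
    words = [w.strip() for w in jieba.cut(text) if w.strip()]
    return json.dumps(Counter(words), ensure_ascii=False)
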
def open_spider(self, spider):
    # Set up the DB session and reflected model once when the spider starts.
    self.session = db_session(DB)
    self.model = db_model(DB, 'corpus')
    self.query = self.session.query(self.model)
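
# A matching teardown hook is not shown; a minimal sketch of what it could look
# like in a Scrapy pipeline (an assumption, not the project's actual code):

def close_spider(self, spider):
    # Release the DB connection when the spider finishes.
    self.session.close()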