def main(argv):
    """Crawl article-list pages for one press and insert each article.

    argv layout: [script, press_name, start_page_index, end_page_index].
    Iterates pages start..end inclusive, fetches the article URLs on each
    page, parses every article (one retry on failure) and inserts it into
    the `article` table. Sleeps between requests to throttle the crawl.
    """
    press_name = argv[1]
    start_page_index = int(argv[2])
    end_page_index = int(argv[3])
    print(press_name)

    press = press_dict[press_name]
    con = db.connect_raw()
    try:
        for page in range(start_page_index, end_page_index + 1):
            # each page yields roughly 10-20 article URLs
            url_list = press.get_article_urls_with_pagenum(page)
            print("page: " + str(page))
            for url in url_list:
                print(url)
                try:
                    article = press.parse_article_with_url(url)
                except Exception:
                    # transient fetch/parse failure: retry once, then let
                    # the second failure propagate instead of swallowing it
                    print('retry parsing!')
                    article = press.parse_article_with_url(url)
                query = db.make_insert_query("article", article)
                result = db.do_insert(con, query)
                time.sleep(1.5)  # per-article throttle
            time.sleep(5)  # per-page throttle
    finally:
        # original leaked the connection whenever parsing/insert raised
        con.close()
def __extract_author(con, email):
    """Resolve the most likely author name for *email* from stored articles.

    Scans every `article` row whose author_info contains *email*, tallies
    candidate name tokens via __extract_name, and picks the most frequent
    one. Returns a dict with keys 'name', 'email', 'press_id', 'added_date',
    or None when no article matches or the best name occurs fewer than
    3 times (not confident enough).
    """
    author = {}
    con_r = db.connect_raw()
    # NOTE(review): SQL is still built by string concatenation because
    # db.do_select takes a raw query string; single quotes are escaped so a
    # quote in the address cannot break out of the literal. Prefer a
    # parameterized query if db.do_select ever supports one.
    safe_email = email.replace("'", "''")
    query = ("SELECT URL, author_info FROM article WHERE author_info like '%"
             + safe_email + "%'")
    result = db.do_select(con_r, query)
    con_r.close()  # original leaked this connection

    if not result:
        return None  # no matching articles -> result[0][0] would IndexError

    # hostname segment of the first article URL, e.g. "http://host/..." -> host
    url = result[0][0].split('/')[2]

    # Sum token frequencies across all matching articles.
    possible_words = {}
    for row in result:
        author_info = row[1]
        for word, count in __extract_name(author_info).items():
            possible_words[word] = possible_words.get(word, 0) + count

    # Pick the most frequent non-empty token.
    max_value = 0
    name = ""
    for key, value in possible_words.items():
        if value > max_value:
            if key == u'':
                continue
            name = key
            max_value = value

    if max_value < 3:
        return None  # best candidate too rare to trust

    author['name'] = name.encode('utf-8')
    author['email'] = email.encode('utf-8')
    author['press_id'] = __get_press_id_from(url)
    author['added_date'] = str(__get_today()).encode('utf-8')
    return author
def _get_raw_data():
    """Return every row of the `article` table as raw result tuples."""
    connection = db.connect_raw()
    rows = db.do_select(connection, 'SELECT * FROM article')
    connection.close()
    return rows