def main(argv):
    """Crawl a range of listing pages for one press and store every article.

    Args:
        argv: Command-line style argument list. ``argv[1]`` is the press name
            (a key of ``press_dict``); ``argv[2]`` and ``argv[3]`` are the
            first and last page numbers to crawl, both inclusive.

    Raises:
        Whatever the second parsing attempt raises: each article is retried
        exactly once, after which the error propagates.
    """
    press_name = argv[1]
    start_page_index = int(argv[2])
    end_page_index = int(argv[3])
    # Single-argument print(...) behaves the same as a Python 2 print
    # statement and as the Python 3 print function.
    print(press_name)
    press = press_dict[press_name]

    con = db.connect_raw()
    try:
        for page_index in range(start_page_index, end_page_index + 1):
            # Each listing page yields roughly 10-20 article URLs to insert.
            url_list = press.get_article_urls_with_pagenum(page_index)
            print("page: " + str(page_index))
            for url in url_list:
                print(url)
                try:
                    article = press.parse_article_with_url(url)
                except Exception:
                    # Retry once. Catching Exception (not a bare except)
                    # lets Ctrl-C and SystemExit abort the crawl instead of
                    # triggering a retry.
                    print('retry parsing!')
                    article = press.parse_article_with_url(url)
                query = db.make_insert_query("article", article)
                db.do_insert(con, query)
                # Pause between articles (presumably to throttle requests).
                time.sleep(1.5)
            # Longer pause between listing pages.
            time.sleep(5)
    finally:
        # Release the connection even when parsing or inserting fails.
        con.close()
def _is_author_exits(email):
    """Tell whether the dev database already has an author with this email.

    A dedicated dev connection is opened for the lookup and closed again
    once the query has run.

    Args:
        email: Email address to look up; it is spliced into the SQL text
            as-is.

    Returns:
        True when at least one ``author`` row matches, False otherwise.
    """
    connection = db.connect_dev()
    sql = 'SELECT id FROM author WHERE email=\'' + email + '\''
    rows = db.do_select(connection, sql)
    connection.close()
    return len(rows) != 0
def __get_hook_word_dict():
    """Load the ``hooking_keyword`` table into a lookup dictionary.

    Each row is unpacked as ``(id, word)``.

    Returns:
        dict mapping the UTF-8 encoded keyword to its id.
    """
    connection = db.connect_dev()
    rows = db.do_select(connection, "SELECT * FROM hooking_keyword")
    connection.close()
    return {word.encode('utf-8'): keyword_id for (keyword_id, word) in rows}
def __get_section_id(con, section_name):
    """Resolve a section label to the id of a matching ``section`` row.

    The Hangul words in ``section_name`` are tried from the last one to the
    first one; the first word that matches a section name (SQL ``LIKE``)
    decides the result.

    Args:
        con: Open database connection handed to ``db.do_select``.
        section_name: Unicode section label taken from an article.

    Returns:
        The id of the first matching section row, or 0 when the label holds
        no Hangul word or none of its words matches a section.
    """
    section_keywords = re.findall(u"[가-힣]+", section_name)

    # reversed() visits each keyword exactly once, last to first. The old
    # index loop skipped its bounds check after an unclassifiable keyword,
    # so the index went negative, wrapped around and ended in IndexError.
    for keyword in reversed(section_keywords):
        # "일반" (general) cannot be classified; move on to the parent keyword.
        if keyword == u'일반':
            continue

        # NOTE(review): the keyword is spliced into the SQL text; the regex
        # above restricts it to Hangul syllables, so it cannot carry quotes.
        query = "SELECT id FROM section WHERE name LIKE '%" + keyword + "%'"
        result = db.do_select(con, query)

        # First match wins.
        if len(result) > 0:
            return result[0][0]

    # No keyword matched at all.
    return 0
def __get_if_email_exits(con, email):
    """Fetch the author record stored under ``email``, if there is one.

    Args:
        con: Open database connection handed to ``db.do_select``.
        email: Email address to look up; it is spliced into the SQL text
            as-is.

    Returns:
        None when no row matches. Otherwise a dict with the keys ``id``,
        ``name``, ``email``, ``press_id`` and ``added_date`` filled from row
        positions 0-4; when several rows match, the last one wins.
    """
    sql = 'SELECT * from author WHERE email = \'' + email + '\''
    rows = db.do_select(con, sql)
    if len(rows) == 0:
        return None

    # Row position -> dictionary key.
    columns = ('id', 'name', 'email', 'press_id', 'added_date')
    author = {}
    for row in rows:
        for position, column in enumerate(columns):
            author[column] = row[position]
    return author
def __extract_author(con, email):
    """Build an author record for ``email`` from the raw article table.

    Every raw article whose ``author_info`` contains the email is scanned
    with ``__extract_name``; the per-article scores of each candidate word
    are summed and the highest-scoring non-empty word becomes the name.

    Args:
        con: Unused; kept so existing callers keep working.
        email: Unicode email address to search for.

    Returns:
        dict with the keys ``name``, ``email``, ``press_id`` and
        ``added_date`` (``id`` is not set here), or None when no article
        mentions the email or the best candidate scores below 3.
    """
    con_r = db.connect_raw()
    try:
        query = "SELECT URL, author_info FROM article WHERE author_info like '%" + email + "%'"
        result = db.do_select(con_r, query)
    finally:
        # The raw connection was previously never closed.
        con_r.close()

    # Without any article there is nothing to derive a name from; this used
    # to crash with IndexError on result[0].
    if len(result) == 0:
        return None

    # Third '/'-separated piece of the first URL, i.e. the host part of a
    # "scheme://host/path" address.
    url = result[0][0].split('/')[2]

    # Sum the scores of every candidate word over all matching articles.
    possible_words = {}
    for row in result:
        author_info = row[1]
        for word, score in __extract_name(author_info).items():
            possible_words[word] = possible_words.get(word, 0) + score

    # Pick the non-empty word with the highest total score.
    max_value = 0
    name = ""
    for key, value in possible_words.items():
        if value > max_value and key != u'':
            name = key
            max_value = value

    # Too little evidence to trust the candidate.
    if max_value < 3:
        return None

    return {
        'name': name.encode('utf-8'),
        'email': email.encode('utf-8'),
        'press_id': __get_press_id_from(url),
        'added_date': str(__get_today()).encode('utf-8'),
    }
from tkinter import *
from tkinter import ttk
import datetime
import time
import DB_connector
from tkinter import messagebox

# Module-level setup of the "Pro_Timer" window; runs as soon as this script
# is executed or imported.

# Project-local database handle.
# NOTE(review): presumably this opens a database connection right here —
# confirm in DB_connector.
db = DB_connector.db_connect()
# NOTE(review): -1 looks like a "nothing selected yet" sentinel — confirm
# against the code that assigns ID.
ID = -1

# Main window: titled "Pro_Timer", 930x240 pixels, not resizable in either
# direction.
root = Tk()
root.title("Pro_Timer")
root.geometry("930x240")
root.resizable(0, 0)

# Fonts: size 15 for Treeview headings, size 12 for the custom
# "mystyle.Treeview" style (font family left as None).
style = ttk.Style()
style.configure("Treeview.Heading", font=(None, 15))
style.configure("mystyle.Treeview", font=(None, 12))

# Frame 1: packed against the right edge of the window; parent of the
# IN / OUT buttons created below.
frm1 = ttk.Frame(root)
frm1.pack(padx=15, pady=15, side=RIGHT)
frm1.config(width=200, height=40, relief=RIDGE)

# Frame 2: packed against the bottom edge of the window.
frm2 = ttk.Frame(root)
frm2.pack(pady=15, side=BOTTOM)
frm2.config(width=200, height=50, relief=RIDGE)

# "IN" button, placed in frame 1's grid; sticky='snew' makes it fill its
# grid cell on all four sides.
But_in = ttk.Button(frm1, text="IN")
But_in.grid(row=1, column=0, pady=15, padx=15, sticky='snew')
# "OUT" button, also a child of frame 1.
But_out = ttk.Button(frm1, text="OUT")
def _get_raw_data():
    """Read every row of the raw database's ``article`` table.

    A raw connection is opened for the query and closed before returning.

    Returns:
        Whatever ``db.do_select`` yields for ``SELECT * FROM article``.
    """
    raw_connection = db.connect_raw()
    rows = db.do_select(raw_connection, 'SELECT * FROM article')
    raw_connection.close()
    return rows
def __get_press_id_from(url):
    """Look up the id of the press registered under a domain.

    Args:
        url: Domain string compared against ``press.domain``; it is spliced
            into the SQL text as-is.

    Returns:
        The ``id`` of the first matching press row.

    Raises:
        IndexError: No press row has this domain. The exception type matches
            what the previous bare ``result[0][0]`` raised, but the message
            now names the domain.
    """
    con_d = db.connect_dev()
    try:
        query = 'SELECT id FROM press WHERE domain=\'' + url + '\''
        result = db.do_select(con_d, query)
    finally:
        # Close the connection even when the query itself fails.
        con_d.close()

    if len(result) == 0:
        raise IndexError("no press registered for domain %r" % (url,))
    return result[0][0]
def main():
    """Copy every raw article into the dev database tables.

    For each raw row this inserts the article itself, one
    ``article_hooking_keyword`` row per hook word found in the content
    (``row[2]``), any author from the author string (``row[5]``) that is not
    stored yet, and one ``article_author`` link per author.
    """
    result = _get_raw_data()
    con = db.connect_dev()
    try:
        for row in result:
            # article table
            article = _make_article_info(con, row)
            query = db.make_insert_query('article', article)
            db.do_insert(con, query)

            # hooking keyword table
            raw_content = row[2]
            hook_words = _extract_hook_word(raw_content.encode('utf-8'))
            for keyword_id, count in hook_words.items():
                words_in_article = {
                    'article_URL': article['URL'],
                    'hooking_keyword_id': keyword_id,
                    'count': count,
                }
                query = db.make_insert_query('article_hooking_keyword', words_in_article)
                db.do_insert(con, query)

            # author table
            expected_author_string = row[5]
            author_list = _make_author_list(con, expected_author_string)
            for author in author_list:
                if not _is_author_exits(author['email']):
                    query = db.make_insert_query('author', author)
                    db.do_insert(con, query)

                # Fetch the id of the (possibly just inserted) author.
                # NOTE(review): the email is spliced into the SQL text;
                # switch to a parameterised query if the db helper has one.
                query = "SELECT id FROM author WHERE email='" + author['email'] + "'"
                author_id = db.do_select(con, query)[0][0]
                # One pre-formatted argument keeps this line valid as both a
                # Python 2 print statement and a Python 3 print call.
                print("%s %s %s %s" % (author_id, author['name'],
                                       author['email'], author['press_id']))

                # article_author table
                article_author = {'article_URL': article['URL'], 'author_id': author_id}
                query = db.make_insert_query('article_author', article_author)
                db.do_insert(con, query)
    finally:
        # The dev connection was previously never closed.
        con.close()