def run(self):
    global conn
    print "mlbpark_start"
    db = model.model(conn)
    for i in self.data:
        for j in i.findAll('div', {'class': 'page'}):
            max_page = 0
            if (j.text):
                max_page = len(j.findAll("a")) + 1
                try:
                    if (max_page > 3):
                        max_page = 3
                    now_page = 1
                    else_page = 1
                    while (now_page <= max_page):
                        if (now_page == 1):
                            mlbpark_paging_find(db, self.url, now_page).start()
                        else:
                            # MLBPARK paginates by row offset: page 2 starts
                            # at 31, page 3 at 61, and so on.
                            else_page += 30
                            mlbpark_paging_find(db, self.url, else_page).start()
                        now_page += 1
                except:
                    pass
            else:
                break
def bing_find():
    global query
    url = "https://www.bing.com/search?&q=" + urllib.quote("\"" + query + "\"")
    req_queue = Queue.Queue()
    header = {
        "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"
    }
    # Grab the result-count element (class "sb_count") to work out how many
    # pages of results exist.
    YPIMcrawler_nonlogin(req_queue, url, header, "span", "class", "sb_count").start()
    data = req_queue.get()
    db = model.model(conn)
    max_page = 0
    try:
        for i in data:
            # The counter reads like "1,234 결과" ("1,234 results"); strip it
            # down to the bare number, 10 results per page.
            max_page = math.ceil(
                int(i.text.encode("utf-8").replace(',', "").replace('결과', "")) / 10.0)
            if (max_page > 7):
                max_page = 7
            now_page = 1
            first = 1
            # Spawn one worker per result page (a hedged sketch of the worker
            # follows below).
            while (now_page < max_page):
                bing_paging_find(db, url, first).start()
                first += 10
                now_page += 1
    except Exception as e:
        print e
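# A hedged sketch of the bing_paging_find worker used above; the real class is
# defined elsewhere in this project. The "&first=" offset parameter mirrors how
# the caller advances `first` by 10 per page, and the "b_algo" result class is
# an assumption about Bing's markup, not taken from this repo.
class bing_paging_find_sketch(threading.Thread):
    def __init__(self, db, url, first):
        threading.Thread.__init__(self)
        self.db = db
        self.url = url + "&first=" + str(first)  # assumed Bing result offset

    def run(self):
        req_queue = Queue.Queue()
        YPIMcrawler_nonlogin(req_queue, self.url, None, "li", "class", "b_algo").start()
        for i in req_queue.get():
            pass  # parse each hit and store it via self.db.tb_detail_insert(...)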
def run(self):
    print "chosun_find"
    # req_class, url, header, search_tag, search_attr, search_key
    return_list = []
    for i in self.data:
        try:
            count_article = i.h3.em.text.strip('(').encode('utf-8')
            # '건' is the Korean counter for "items"; everything before it is
            # the article count. total page url - wikidocs.net/13
            spl = count_article.index('건')
            n_cnt_article = int(count_article[:spl])
            total_page_no = int(math.ceil(n_cnt_article / 10.0))  # casting
            now_page = 1
            end_page_no = 1
            db = model.model(conn)
            while (True):
                if (now_page > total_page_no):
                    break
                else:
                    #print "chosun_find_threading..."
                    chosun_paging_find(self.url, now_page, db).start()
                    now_page += 1
                    end_page_no += 1
                    # Crawl at most three pages per query.
                    if (end_page_no == 4):
                        end_page_no = total_page_no
                        break
        except:
            print "chosun Error"
            pass
def run(self):
    db = model.model(conn)
    total_page_no = -1
    for i in self.data:
        for j in i.findAll('td'):
            try:
                # The pagination cells are the center-aligned <td>s with
                # height 40.
                if (j['align'] == "center"):
                    if (j['height'] == str(40)):
                        try:
                            if (len(j.findAll('font')) > 0):
                                total_page_no += len(j.findAll('font'))
                        except Exception as e:
                            print e
                        if (total_page_no >= 3):
                            total_page_no = 3
                        now_page = 1
                        while (True):
                            if (now_page > total_page_no):
                                break
                            else:
                                todayhumor_paging_find(
                                    db, self.url[:-1], now_page).start()
                                now_page += 1
            except:
                pass
def run(self): global conn print "ddanzi_start" db = model.model(conn) for i in self.data: try: for j in i.findAll('h3', {'class': 'subTitle'}): count_article = j.text.encode('utf-8') if (count_article.find('문서') >= 0): n_cnt_article = int( j.span.text.encode('utf-8').strip('(|)')) total_page_no = int(math.ceil(n_cnt_article / 30.0)) if (total_page_no >= 3): total_page_no = 3 now_page = 1 while (True): if (now_page > total_page_no): break else: ddanzi_paging_find(db, self.url, now_page).start() now_page += 1 except Exception as e: print e pass
def run(self): global conn print "ppomppu_start" db = model.model(conn) for i in self.data: try: total_page_no = len(i.findAll('a')) if (total_page_no >= 11): total_page_no = 3 else: total_page_no = total_page_no + 1 if (total_page_no > 3): total_page_no = 3 now_page = 1 while (True): if (now_page > total_page_no): break else: ppomppu_paging_find(db, self.url, now_page).start() now_page += 1 except Exception as e: print e pass
def yahoo_find():
    global query
    url = ("https://search.yahoo.com/search?fr=yfp-t&fp=1&toggle=1&cop=mss"
           "&ei=UTF-8&p=" + "\"" + query + "\"")
    req_queue = Queue.Queue()
    header = {
        "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"
    }
    # Read the result count out of the pagination block (class
    # "compPagination").
    YPIMcrawler_nonlogin(req_queue, url, header, "div", "class",
                         "compPagination").start()
    data = req_queue.get()
    db = model.model(conn)
    max_page = 0
    try:
        for i in data:
            max_page = math.ceil(
                int(i.span.text.replace(" results", "").replace(",", "")) / 10.0)
            if (max_page > 7):
                max_page = 7
            now_page = 1
            first = 1
            while (now_page < max_page):
                yahoo_paging_find(db, url, first).start()
                first += 10  # result offset advances 10 per page
                now_page += 1
    except Exception as e:
        print e
def __init__(self, query_, page):
    self.page = page
    self.query = query_.encode("utf-8")
    self.conn = db_conn.db_conn()
    self.db = model.model(self.conn)
    self.cnt = self.db.tb_detail_count(self.query)
def __init__(self, board):
    threading.Thread.__init__(self)
    global query, conn  # added
    self.url = ("http://cafe984.daum.net/_c21_/cafesearch?grpid=mEr9"
                "&listnum=50&head=&viewtype=all&searchPeriod=all&item=" + board)
    #html = urllib2.urlopen(self.url + board[1][1] + unicode(query, 'utf-8').encode('euc-kr')).read()
    req_queue = Queue.Queue()
    self.db = model.model(conn)  # added
    # Daum cafe expects the query string encoded as EUC-KR.
    YPIMcrawler_nonlogin(
        req_queue, self.url + unicode(query, 'utf-8').encode('euc-kr'),
        None, "tr", "class", "list_row_info").start()
    self.data = req_queue.get()
def ypim_crawler_inven3():
    conn = db_conn.db_conn()
    model_ = model.model(conn)
    # Partition 3 of the inven board URLs seeded by inven_get_come_idx().
    return_list = model_.tb_url_detail_select("inven", 3)
    url_list = []
    for i in return_list:
        url_list.append(i['href'])
    # Search both post bodies ("subjcont") and nicknames ("nicname").
    name_list = ["subjcont", "nicname"]
    for i in name_list:
        for j in url_list:
            inven_find(model_, j, i).start()
def run(self): global conn print "dcinside_start" db = model.model(conn) for i in self.data: max_page = None for j in i.findAll('div', {'id': 'dgn_btn_paging'}): max_page = len(j.findAll('a')) if (max_page > 3): max_page = 3 now_page = 1 while (now_page <= max_page): dcinside_paging_find(db, self.url, now_page).start() now_page += 1
def ypim_crawler_inven2():
    conn = db_conn.db_conn()
    model_ = model.model(conn)
    # Partition 2 of the inven board URLs seeded by inven_get_come_idx().
    return_list = model_.tb_url_detail_select("inven", 2)
    #url_list = ["http://www.inven.co.kr/board/powerbbs.php?come_idx=2730"]
    url_list = []
    for i in return_list:
        url_list.append(i['href'])
    # Search both post bodies ("subjcont") and nicknames ("nicname").
    name_list = ["subjcont", "nicname"]
    for i in name_list:
        for j in url_list:
            inven_find(model_, j, i).start()
def run(self):
    global conn
    db = model.model(conn)
    max_page = 0
    try:
        for i in self.data:
            max_page = len(i.findAll('a'))
            print max_page
            if (max_page > 3):
                max_page = 3
            now_page = 1
            while (now_page <= max_page):
                jobkorea_paging_find(db, self.url + "&page=",
                                     now_page).start()
                now_page += 1
    except Exception as e:
        print e
def run(self): global conn print "clien_start" db = model.model(conn) for i in self.data: print try: total_page_no = len(i.findAll("a")) #print total_page_no print total_page_no if (total_page_no >= 3): total_page_no = 3 now_page = 1 while (now_page <= total_page_no): clien_paging_find(db, self.url, now_page).start() now_page += 1 except Exception as e: print e
def run(self):
    global query, conn, lock
    url = "https://www.facebook.com/search/str/" + query + "/keywords_users"
    # self.broswer is a browser that already carries the logged-in session
    # cookie (a sketch of preparing one follows below).
    resp = self.broswer.open(url)
    # Facebook wraps the result markup in HTML comments; strip the comment
    # markers so BeautifulSoup can parse it.
    html = str(resp.read()).replace("<!--", "").replace("-->", "")
    Soup = BeautifulSoup(html, "html.parser")
    db = model.model(conn)
    # print Soup.prettify()
    data_list = Soup.find_all('div', {"class": "_glj"})
    return_facebook_list = []
    for i in data_list:
        each_data = i.find('div', {"class": "_gll"})
        each_name = each_data.text
        each_url = each_data.a['href']
        each_data = i.find('div', {"class": "_glm"})
        job = each_data.text
        each_data = i.find('div', {"class": "_glo"})
        school = each_data.text
        title = each_name + "/ " + job + "/ " + school
        img = "facebook.png"
        lock.acquire()
        face_data = {
            "web_site": "facebook",
            "href": each_url,
            "title": title,
            "img": img
        }
        db.tb_detail_insert(query, face_data)
        lock.release()
    print "facebook_end"
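# self.broswer above is assumed to be a mechanize.Browser that already holds a
# valid Facebook session; a minimal sketch of preparing one
# (make_facebook_browser is a hypothetical helper, not part of this repo):
import cookielib
import mechanize

def make_facebook_browser(cookie_file):
    cj = cookielib.LWPCookieJar()
    cj.load(cookie_file)  # cookies saved from an earlier interactive login
    br = mechanize.Browser()
    br.set_cookiejar(cj)
    br.set_handle_robots(False)
    br.addheaders = [("User-agent",
                      "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36")]
    return br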
def inven_get_come_idx():
    menu_list = []  # renamed from `list` so the builtin is not shadowed
    non_overlap_list = []
    """
    url = ["http://lovelive.inven.co.kr", "http://durango.inven.co.kr/"]
    for i in url:
        queue = Queue.Queue()
        t = YPIMcrawler_nonlogin(queue, i, None, "li", "class", "firstMenuItem")
        t.start()
        menu_list.append(queue.get())
    """
    try:
        # Collect the first-level menu items from every inven game site.
        for i in inven():
            for j in i.findAll('a'):
                queue = Queue.Queue()
                print j['href']
                t = YPIMcrawler_nonlogin(queue, j['href'], None, "li",
                                         "class", "firstMenuItem")
                t.start()
                menu_list.append(queue.get())
        for i in menu_list:
            for j in i:
                for x in j.parent:
                    try:
                        if (x.a['href'].find("come_idx") >= 0):
                            if (x.a['href'].find("category") >= 0):
                                non_overlap_list.append(x.a['href'])
                            else:
                                # Normalize plain board links to a canonical
                                # powerbbs URL: the 4 digits after "come_idx="
                                # are the board id.
                                index = x.a['href'].find("come_idx")
                                try:
                                    href = int(x.a['href'][index + 9:index + 13])
                                    non_overlap_list.append(
                                        "http://www.inven.co.kr/board/powerbbs.php?come_idx="
                                        + str(href))
                                except Exception as e:
                                    print e
                                    pass
                    except:
                        pass
        db = db_conn.db_conn()
        conn = db.db_conn()
        model_ = model.model(conn)
        model_.tb_url_insert("inven")
        url_seqno = model_.tb_url_select("inven")[0]['url_seqno']
        part_num = 0
        lock = threading.Lock()
        # Deduplicate, then store the URLs in part_num batches of 100.
        for n, i in enumerate(set(non_overlap_list)):
            if (n % 100 == 0):
                part_num += 1
            lock.acquire()
            model_.tb_url_detail_insert(url_seqno, part_num, i)
            lock.release()
    except Exception as e:
        print e
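# How the inven pieces fit together (a sketch; ypim_crawler_inven_all is a
# hypothetical driver, not part of this repo): inven_get_come_idx() seeds
# tb_url_detail in part_num batches of 100, and ypim_crawler_inven2() /
# ypim_crawler_inven3() above then crawl partitions 2 and 3.
def ypim_crawler_inven_all():
    inven_get_come_idx()
    ypim_crawler_inven2()
    ypim_crawler_inven3()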
def groups(query):
    conn = db_conn.db_conn()
    model_ = model.model(conn)
    return jsonify(model_.tb_detail_groups_cnt(query))
def web_site(query, web_site):
    conn = db_conn.db_conn()
    model_ = model.model(conn)
    print query, web_site
    return jsonify(model_.tb_detail_group(query, web_site))
def isfirst(query):
    conn = db_conn.db_conn()
    model_ = model.model(conn)
    return jsonify(
        str(model_.tb_query_select(query.encode("utf-8"))[0]['que_seqno']))
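# groups(), web_site() and isfirst() above read like Flask view functions whose
# @app.route decorators were lost; a hedged sketch of how they might be
# registered. The URL rules here are assumptions, not taken from this repo.
from flask import Flask
app = Flask(__name__)
app.add_url_rule('/groups/<query>', 'groups', groups)
app.add_url_rule('/web_site/<query>/<web_site>', 'web_site', web_site)
app.add_url_rule('/isfirst/<query>', 'isfirst', isfirst)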