import re
import time


def _parse_one_list(self, article_list):
    soup = load_page(article_list)
    # Note: attrs takes a dict; the original set literal {'class', 'list_tbody'} was a bug.
    threads = soup.find('tbody', attrs={'class': 'list_tbody'}).find_all('tr')
    # Keep only real article rows from each tr in the list.
    valid_threads = []
    for t in threads:
        if t.find('td') is None:
            continue
        header = t.find('td', class_='t_notice').string
        if re.match(r'\d+', header):  # numeric id means a regular post; skips notices etc.
            valid_threads.append(t)
    if len(self.keyword_filter) == 0:  # no title filter configured
        ret = [
            'http://gall.dcinside.com' + v.find('a').get('href')
            for v in valid_threads
        ]
    else:  # apply the title filter
        ret = []
        for v in valid_threads:
            thread_title = ''.join(v.find('td', class_='t_subject').strings)
            for k in self.keyword_filter:
                if k in thread_title:
                    ret.append('http://gall.dcinside.com' + v.find('a').get('href'))
                    break
    return ret

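# Every function in this section assumes a shared load_page helper that is not
# shown here. Reconciling the call sites: it takes an optional CSS selector as
# the second positional argument (collector-side callers test the result
# against None when the page or the expected element is missing), a mobile
# flag, and an extra_headers dict, and returns a BeautifulSoup object. What
# follows is a minimal sketch under those assumptions, not the source
# implementation; the User-Agent strings are placeholders.
import requests
from bs4 import BeautifulSoup

_UA_DESKTOP = 'Mozilla/5.0'          # placeholder desktop User-Agent
_UA_MOBILE = 'Mozilla/5.0 (Mobile)'  # placeholder mobile User-Agent


def load_page(url, selector=None, mobile=False, extra_headers=None):
    headers = {'User-Agent': _UA_MOBILE if mobile else _UA_DESKTOP}
    if extra_headers:
        headers.update(extra_headers)
    resp = requests.get(url, headers=headers)
    if resp.status_code != 200:
        return None
    soup = BeautifulSoup(resp.text, 'html.parser')
    # Collector-side callers treat a page missing the expected element as absent.
    if selector is not None and soup.select_one(selector) is None:
        return None
    return soup
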
def collect(_rid):
    # Get html; the id must be a 12-digit netkeiba race id.
    base_url = "https://race.netkeiba.com/?pid=race_old&id=c{rid}"
    if re.match(r"^\d{12}$", _rid):
        url = base_url.replace("{rid}", _rid)
        page = load_page(url, ".race_table_old")
    else:
        return {"status": "ERROR", "message": "Invalid URL parameter: " + _rid}

    # Parse race info
    if page is not None:
        race = parse_nk_race(page)
    else:
        return {"status": "ERROR", "message": "There is no page: " + url}

    if "_id" in race:
        db = vault()
        # replace_one(..., upsert=True) is the current pymongo spelling of the
        # deprecated collection.update(..., upsert=True).
        db.races.replace_one({"_id": race["_id"]}, race, upsert=True)
    else:
        # str() is required here; concatenating the dict directly was a bug.
        return {
            "status": "ERROR",
            "message": "There is no _id in page: " + str(race)
        }
    return {
        "status": "SUCCESS",
        "message": "Start race collection process for " + _rid
    }

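# collect() above and job_manager() below persist documents through vault(),
# which is not shown in this section. A plausible minimal sketch, assuming a
# local MongoDB instance; the connection string and database name are
# assumptions, not taken from the source.
from pymongo import MongoClient


def vault():
    return MongoClient('mongodb://localhost:27017')['keibadb']
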
def format_code_dcinside(code, recommend=False):
    ret = 'http://gall.dcinside.com/board/lists/?id=' + code
    soup = load_page(ret)
    # Some galleries answer with an inline JS redirect; follow it manually.
    mat = re.match(r"window.location.replace\('(?P<target>\S+)'\);", soup.text)
    if mat:
        ret = 'http://gall.dcinside.com' + mat.group('target')
    ret += '&page=1'
    if recommend:
        ret += '&exception_mode=recommend'
    return ret

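# Usage sketch for format_code_dcinside (the gallery code 'programming' is
# hypothetical): it builds the first-page list URL, following the site's JS
# redirect when one is served.
url = format_code_dcinside('programming', recommend=True)
# -> 'http://gall.dcinside.com/board/lists/?id=programming&page=1&exception_mode=recommend'
#    (or the redirect target with the same query parameters appended)
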
def get_last_list(self):
    soup = load_page(self.url)
    self.title = soup.title.string.strip()
    if self.recommend:
        self.title += ' (개념글만)'  # title suffix meaning "(recommended posts only)"
    page_range_tag = soup.find('div', id='dgn_btn_paging')
    end_page_url = page_range_tag.find_all('a')[-1].get('href')
    last_page_mat = re.search(r'page=(?P<pagenum>\d+)', end_page_url)
    if last_page_mat:  # a link to the last page exists
        self.last_list = int(last_page_mat.group('pagenum'))
    else:  # no pagination: the gallery has a single page
        self.last_list = 1

def get_last_list(self):
    title_soup = load_page(self.url)
    # The site builds a Referer with Date.now() in inline JS (an
    # anti-hotlinking check); reproduce it server-side and re-request the
    # page with that Referer.
    mat = re.search(
        r'\S+.src="(?P<prefix>\S+)"\+Date.now\(\)\+"(?P<suffix>\S+)"',
        title_soup.text)
    if mat:
        new_referer = (mat.group('prefix') + str(round(time.time())) +
                       mat.group('suffix'))
        title_soup = load_page(self.url, extra_headers={'Referer': new_referer})
    self.title = title_soup.title.string.strip()

    # The '.../category' page links every entry as '/<number>'; the largest
    # number is the latest list.
    category_soup = load_page(self.url + 'category')
    url_list = []
    for a in category_soup.find_all('a'):
        link = a.get('href')
        if link is not None:
            mat = re.search(r'^/(?P<page_id>\d+)', link)
            if mat:
                url_list.append(int(mat.group('page_id')))
    self.last_list = max(url_list)

def find_gall(keyword):
    search_url = 'http://m.dcinside.com/search/index.php?search_gall={}&search_type=gall_name'.format(keyword)
    soup = load_page(search_url, mobile=True,
                     extra_headers={'Host': 'm.dcinside.com'})
    ret = []
    # 'searh-result-box' (sic) matches the class name as it appears in the
    # site's own markup.
    a_list = soup.find('div', class_='searh-result-box').find_all('a')
    for a in a_list:
        if 'http://m.dcinside.com/list.php' in a.get('href'):
            title = ''.join(a.strings).strip()
            code = re.split('=', a.get('href'))[1]
            ret.append([title, code])
    return ret

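# Usage sketch combining the two dcinside helpers above (the search keyword,
# Korean for "Go", is illustrative): look up galleries by name, then build
# each one's first-page list URL.
for title, code in find_gall('바둑'):
    print(title, format_code_dcinside(code))
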
def bulk_collect(_year, _month):
    url = "https://keiba.yahoo.co.jp/schedule/list/" + _year + "/?month=" + _month
    page = load_page(url, ".layoutCol2M")

    # Parse race info
    if page is not None:
        race_id = parse_spn_rid(page)
    else:
        return {"status": "ERROR", "message": "There is no page: " + url}

    if len(race_id) != 0:
        for rid in race_id:
            collect(rid)
            time.sleep(5)  # throttle requests to the remote server
    else:
        return {"status": "ERROR", "message": "No race ids found in page: " + url}
    return {"status": "SUCCESS", "message": "Start bulk collection process"}

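# Usage sketch (the year/month values are illustrative): collect every race
# id listed on Yahoo's schedule page for June 2017, throttled to one fetch
# every 5 seconds.
result = bulk_collect('2017', '6')
print(result['status'], '-', result['message'])
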
def job_manager(_urls):
    # Put log: record the task and its total size.
    task_id = start_log(len(_urls))
    for url in _urls:
        # Collect target page html
        page = load_page(url, ".layoutCol2M")
        if page is None:
            warning_log(task_id, "There is no page: " + url)
        else:
            # Parse & upsert page elements
            db = vault()
            for hold in parse_sportsnavi(page):
                if "_id" in hold:
                    # replace_one(..., upsert=True) replaces the deprecated
                    # collection.update(..., upsert=True).
                    db.holds.replace_one({"_id": hold["_id"]}, hold, upsert=True)
                else:
                    # str() is required; concatenating the dict directly was a bug.
                    warning_log(task_id, "There is no _id in page: " + str(hold))
        update_log(task_id)
        time.sleep(5)  # throttle requests
    end_log(task_id)

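# start_log / update_log / warning_log / end_log are not shown in this
# section. A minimal sketch of what job_manager() appears to expect, assuming
# the logs live in the same MongoDB as the race data; the 'logs' collection
# name and all field names are assumptions, not taken from the source.
import datetime


def start_log(total):
    res = vault().logs.insert_one({
        'total': total, 'done': 0, 'warnings': [],
        'started': datetime.datetime.utcnow(),
    })
    return res.inserted_id


def update_log(task_id):
    vault().logs.update_one({'_id': task_id}, {'$inc': {'done': 1}})


def warning_log(task_id, message):
    vault().logs.update_one({'_id': task_id}, {'$push': {'warnings': message}})


def end_log(task_id):
    vault().logs.update_one(
        {'_id': task_id},
        {'$set': {'ended': datetime.datetime.utcnow()}})
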
def get_last_list(self):
    soup = load_page(self.url)
    self.title = soup.find('h2', {'class': 'tit_series'}).string
    # Each post in the series is linked with class 'spot_post_area';
    # the number of such links is the number of lists.
    a_list = soup.find_all('a', {'class': 'spot_post_area'})
    self.last_list = len(a_list)

def _get_soup(self):
    self.soup = load_page(self.url)
    self._get_raw_title()