import calendar
import datetime
import re

from bs4 import BeautifulSoup

# Project-local helpers assumed by this excerpt (their import paths are not
# shown in the source): `request.get_data(url)` fetches a page body and
# returns the string 'error' on failure; `load_config()` reads the YAML
# settings; `reg`/`reg_volantis`/`find_time` parse issue bodies and dates.


def rss2_get(user_info, post_poor, config=None):
    """RSS2 rule: collect the newest posts from <link>/rss.xml or /rss2.xml."""
    link = user_info[1]
    error_rss2 = False
    try:
        html = request.get_data(link + "/rss.xml")
        soup = BeautifulSoup(html, 'html.parser')
        items = soup.find_all("item")
        if len(items) == 0:
            # Fall back to the alternate feed path.
            html = request.get_data(link + "/rss2.xml")
            soup = BeautifulSoup(html, 'html.parser')
            items = soup.find_all("item")
        limit = min(len(items), 5)
        if limit == 0:
            # The site probably exposes no RSS feed.
            error_rss2 = True
        else:
            for item in items[:limit]:
                title = item.find("title").text
                url = item.find("link").text
                # pubDate looks like 'Wed, 02 Oct 2002 13:00:00 GMT':
                # index 1 is the day, 2 the month abbreviation, 3 the year.
                timedata = item.find("pubDate").text.split(" ")
                y = int(timedata[3])
                m = list(calendar.month_abbr).index(timedata[2])
                d = int(timedata[1])
                time = "{:04d}-{:02d}-{:02d}".format(y, m, d)
                post_poor.append({
                    'title': title,
                    'time': time,
                    'updated': time,
                    'link': url,
                    'name': user_info[0],
                    'img': user_info[2],
                    'rule': "rss2",
                })
    except Exception:
        # Request or parse failure: flag it so the caller can try another rule.
        error_rss2 = True
    return error_rss2, post_poor
def atom_get(user_info, post_poor, config=None):
    """Atom rule: collect the newest posts from <link>/atom.xml or /feed/atom."""
    link = user_info[1]
    error_atom = False
    try:
        html = request.get_data(link + "/atom.xml")
        soup = BeautifulSoup(html, 'html.parser')
        items = soup.find_all("entry")
        if len(items) == 0:
            # Fall back to the alternate feed path.
            html = request.get_data(link + "/feed/atom")
            soup = BeautifulSoup(html, 'html.parser')
            items = soup.find_all("entry")
        limit = min(len(items), 5)
        if limit == 0:
            # The site probably exposes no Atom feed.
            error_atom = True
        else:
            for item in items[:limit]:
                title = item.find("title").text
                url = item.find("link")['href']
                # Atom timestamps are ISO 8601; the first 10 chars are the date.
                time = item.find("published").text[:10]
                updated = item.find("updated").text[:10]
                post_poor.append({
                    'title': title,
                    'time': time,
                    'updated': updated,
                    'link': url,
                    'name': user_info[0],
                    'img': user_info[2],
                    'rule': "atom",
                })
    except Exception:
        error_atom = True
    return error_atom, post_poor
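# Usage sketch (not from the project): both feed rules return an error flag
# plus the accumulating post list, which suggests a fallback chain. This
# hypothetical driver shows one way to wire that up; the name
# `fetch_posts_via_feeds` and the rule order are assumptions.
def fetch_posts_via_feeds(user_info, post_poor):
    error, post_poor = atom_get(user_info, post_poor)
    if error:
        error, post_poor = rss2_get(user_info, post_poor)
    return error, post_poor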
def get_last_post(user_info, post_poor):
    """Sakura-theme homepage rule: scrape the newest post from the index page."""
    error_sitmap = False
    link = user_info[1]
    result = request.get_data(link)
    soup = BeautifulSoup(result, 'html.parser')
    main_content = soup.find_all(id='main')
    time_excit = soup.find_all('div', {"class": "post-date"})
    if main_content and time_excit:
        error_sitmap = True
        link_list = main_content[0].find_all('div', {"class": "post-date"})
        # Find the most recent post date on the page.
        lasttime = datetime.datetime.strptime('1970-01-01', "%Y-%m-%d")
        for item in link_list:
            time = re.search(r"(\d{4}-\d{1,2}-\d{1,2})", item.text).group(0)
            if lasttime < datetime.datetime.strptime(time, "%Y-%m-%d"):
                lasttime = datetime.datetime.strptime(time, "%Y-%m-%d")
        lasttime = lasttime.strftime('%Y-%m-%d')
        last_post_list = main_content[0].find_all('article', {"class": "post"})
        for item in last_post_list:
            time_created = item.find('div', {"class": "post-date"}).text.strip()
            time_created = re.search(r"(\d{4}-\d{1,2}-\d{1,2})",
                                     time_created).group(0)
            time_created = datetime.datetime.strptime(
                time_created, "%Y-%m-%d").strftime("%Y-%m-%d")
            if time_created == lasttime:
                error_sitmap = False
                a = item.find('a')
                alink = a['href']
                # Sakura post links are absolute URLs: splitting once on '/'
                # separates the scheme ('https:') from the rest, and the two
                # halves are rejoined below.
                stralink = alink.split("/", 1)[1].strip()
                if link[-1] != '/':
                    link = link + '/'
                link = link.split('/')[0]  # keep only the scheme, e.g. 'https:'
                post_poor.append({
                    'title': item.find('h3').text.strip(),
                    'time': lasttime,
                    'updated': lasttime,
                    'link': link + '/' + stralink,
                    'name': user_info[0],
                    'img': user_info[2],
                    'rule': "sakura",
                })
    else:
        # The page does not look like a sakura-style theme.
        error_sitmap = True
    return error_sitmap
def get_last_post(user_info, post_poor):
    """Volantis-theme homepage rule: scrape the newest post from the index page."""
    error_sitmap = False
    link = user_info[1]
    result = request.get_data(link)
    soup = BeautifulSoup(result, 'html.parser')
    main_content = soup.find_all('section', {"class": "post-list"})
    time_excit = soup.find_all('time')
    if main_content and time_excit:
        error_sitmap = True
        link_list = main_content[0].find_all('time')
        # Find the most recent post date on the page.
        lasttime = datetime.datetime.strptime('1970-01-01', "%Y-%m-%d")
        for item in link_list:
            time = item.text.replace("|", "").replace(" ", "").replace("\n", "")
            if lasttime < datetime.datetime.strptime(time, "%Y-%m-%d"):
                lasttime = datetime.datetime.strptime(time, "%Y-%m-%d")
        lasttime = lasttime.strftime('%Y-%m-%d')
        last_post_list = main_content[0].find_all('div', {"class": "post-wrapper"})
        for item in last_post_list:
            if item.find('time'):
                time_created = item.find('time').text.strip()
            else:
                time_created = ''
            if time_created == lasttime:
                error_sitmap = False
                a = item.find('a')
                # Volantis links are site-relative; drop the leading segment
                # and rebuild against the homepage URL.
                stralink = a['href'].split("/", 1)[1].strip()
                if link[-1] != '/':
                    link = link + '/'
                post_poor.append({
                    'title': item.find('h2', {"class": "article-title"}).text.strip(),
                    'time': lasttime,
                    'updated': lasttime,
                    'link': link + stralink,
                    'name': user_info[0],
                    'img': user_info[2],
                    'rule': "volantis",
                })
    else:
        # The page does not look like a volantis-style theme.
        error_sitmap = True
    return error_sitmap
def github_issuse(friend_poor, config=None):
    """Collect friend links from GitHub issues on the configured repo."""
    if config is None:
        config = load_config()
    baselink = 'https://github.com/'
    errortimes = 0
    links_cfg = config['setting']['github_friends_links']
    try:
        for number in range(1, 100):
            github = request.get_data(
                'https://github.com/' + links_cfg['owner'] + '/' +
                links_cfg['repo'] + '/issues?q=is%3A' + links_cfg['state'] +
                '&page=' + str(number))
            soup = BeautifulSoup(github, 'html.parser')
            main_content = soup.find_all('div', {'aria-label': 'Issues'})
            linklist = main_content[0].find_all('a', {'class': 'Link--primary'})
            if len(linklist) == 0:
                # No more issues: pagination is exhausted.
                break
            for item in linklist:
                issueslink = baselink + item['href']
                issues_page = request.get_data(issueslink)
                issues_soup = BeautifulSoup(issues_page, 'html.parser')
                try:
                    # The issue body carries the link data in a <pre> block;
                    # `reg` extracts the named fields from it.
                    source = issues_soup.find_all('pre')[0].text
                    user_info = []
                    info_list = ['name', 'link', 'avatar']
                    reg(info_list, user_info, source)
                    if user_info[1] != '你的链接':  # skip the unfilled template
                        friend_poor.append(user_info)
                except Exception:
                    errortimes += 1
                    continue
    except Exception:
        # Network or markup changes are tolerated silently.
        pass
def gitee_issuse(friend_poor):
    """Collect volantis-style friend links from Gitee issues on the configured repo."""
    baselink = 'https://gitee.com'
    errortimes = 0
    config = load_config()
    links_cfg = config['setting']['gitee_friends_links']
    try:
        for number in range(1, 100):
            gitee = request.get_data(
                'https://gitee.com/' + links_cfg['owner'] + '/' +
                links_cfg['repo'] + '/issues?state=' + links_cfg['state'] +
                '&page=' + str(number))
            soup = BeautifulSoup(gitee, 'html.parser')
            main_content = soup.find_all(id='git-issues')
            linklist = main_content[0].find_all('a', {'class': 'title'})
            if len(linklist) == 0:
                # No more issues: pagination is exhausted.
                break
            for item in linklist:
                issueslink = baselink + item['href']
                issues_page = request.get_data(issueslink)
                issues_soup = BeautifulSoup(issues_page, 'html.parser')
                try:
                    # The issue body carries the link data in a <code> block;
                    # `reg_volantis` extracts the named fields from it.
                    source = issues_soup.find_all('code')[0].text
                    user_info = []
                    info_list = ['title', 'url', 'avatar']
                    reg_volantis(info_list, user_info, source)
                    if user_info[1] != '你的链接':  # skip the unfilled template
                        friend_poor.append(user_info)
                except Exception:
                    errortimes += 1
                    continue
    except Exception:
        # Network or markup changes are tolerated silently.
        pass
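# `reg` and `reg_volantis` are project helpers not shown in this excerpt.
# As an illustration only, a minimal stand-in might pull each 'key: value'
# field out of the issue template with a regex; the exact template format
# is an assumption.
def reg_sketch(info_list, user_info, source):
    for key in info_list:
        match = re.search(key + r'\s*:\s*(\S+)', source)
        user_info.append(match.group(1) if match else '')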
def sitmap_get(user_info, post_poor, config=None):
    """Sitemap rule (minimal variant): read <url> entries from sitemap.xml.

    The loop body was left unfinished in the original; it is completed here
    as a sketch, assuming each <url> entry carries <loc> and <lastmod>.
    """
    link = user_info[1]
    error_sitmap = False
    try:
        result = request.get_data(link + '/sitemap.xml')
        soup = BeautifulSoup(result, 'html.parser')
        items = soup.find_all('url')
        if len(items) == 0:
            # Fall back to the Baidu-style sitemap path.
            result = request.get_data(link + '/baidusitemap.xml')
            soup = BeautifulSoup(result, 'html.parser')
            items = soup.find_all('url')
        limit = min(len(items), 5)
        if limit == 0:
            # The site probably has no sitemap.
            error_sitmap = True
        else:
            for item in items[:limit]:
                loc = item.find('loc')
                lastmod = item.find('lastmod')
                if loc is None:
                    continue
                time = lastmod.text[:10] if lastmod else ''
                post_poor.append({
                    'title': loc.text,  # sitemaps carry no titles; use the URL
                    'time': time,
                    'updated': time,
                    'link': loc.text,
                    'name': user_info[0],
                    'img': user_info[2],
                    'rule': "sitemap",
                })
    except Exception:
        error_sitmap = True
    return error_sitmap, post_poor
def get_friendlink(friendpage_link, friend_poor):
    """Volantis friend-link page: supports the sites/simple/traditional cards."""
    main_content = []
    result = request.get_data(friendpage_link)
    soup = BeautifulSoup(result, 'html.parser')
    if len(soup.find_all('a', {"class": "site-card"})) > 0:
        # Volantis "sites" layout
        main_content = soup.find_all('a', {"class": "site-card"})
    elif len(soup.find_all('a', {"class": "simpleuser"})) > 0:
        # Volantis "simple" layout
        main_content = soup.find_all('a', {"class": "simpleuser"})
    elif len(soup.find_all('a', {"class": "friend-card"})) > 0:
        # Volantis "traditional" layout
        main_content = soup.find_all('a', {"class": "friend-card"})
    # Otherwise the page carries no standard volantis friend links.
    for item in main_content:
        if len(item.find_all('img')) > 1:
            img = item.find_all('img')[1].get('src')
        else:
            img = item.find('img').get('src')
        link = item.get('href')
        if item.find('span'):
            name = item.find('span').text
        elif item.find('p'):
            name = item.find('p').text
        else:
            continue  # no readable name on this card
        if "#" not in link:
            user_info = [name, link, img]
            print('----------------------')
            try:
                print('friend name %r' % name)
            except Exception:
                print('unprintable user name')
            print('avatar %r' % img)
            print('homepage %r' % link)
            friend_poor.append(user_info)
    config = load_config()
    if (config['setting']['gitee_friends_links']['enable']
            and config['setting']['gitee_friends_links']['type'] == 'volantis'):
        gitee_issuse(friend_poor)
    if (config['setting']['github_friends_links']['enable']
            and config['setting']['github_friends_links']['type'] == 'volantis'):
        github_issuse(friend_poor)
def get_friendlink(friendpage_link, friend_poor):
    """Friend-link page rule for themes that render links inside #article-container."""
    result = request.get_data(friendpage_link)
    soup = BeautifulSoup(result, 'html.parser')
    main_content = soup.find_all(id='article-container')
    link_list = main_content[0].find_all('a')
    for item in link_list:
        link = item.get('href')
        if link.count('/') > 3:
            # More than three slashes means a deep path, not a homepage link.
            continue
        if item.get('title'):
            name = item.get('title')
        else:
            try:
                name = item.find('span').text
            except Exception:
                continue
        try:
            imglist = item.find_all('img')
            target = imglist[1] if len(imglist) > 1 else imglist[0]
            # Prefer the lazy-load attribute when present.
            img = target.get('data-lazy-src') or target.get('src')
        except Exception:
            continue
        if "#" not in link:
            user_info = [name, link, img]
            print('----------------------')
            try:
                print('friend name %r' % name)
            except Exception:
                print('unprintable user name')
            print('avatar %r' % img)
            print('homepage %r' % link)
            friend_poor.append(user_info)
def get_friendlink(friendpage_link, friend_poor):
    """Friend-link page rule for themes that render links as li.link-item."""
    result = request.get_data(friendpage_link)
    soup = BeautifulSoup(result, 'html.parser')
    main_content = soup.find_all('li', {"class": "link-item"})
    for item in main_content:
        img = item.find('img').get('data-src')
        link = item.find('a').get('href')
        name = item.find('span').text
        if "#" not in link:
            user_info = [name, link, img]
            print('----------------------')
            try:
                print('friend name %r' % name)
            except Exception:
                print('unprintable user name')
            print('avatar %r' % img)
            print('homepage %r' % link)
            friend_poor.append(user_info)
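# Note: the get_friendlink variants above share one name because each comes
# from a separate per-theme module; only one is in play at a time. Each
# appends a positional triple that the post rules consume as user_info:
#
#   friend_poor.append([name, link, img])   # [0]=name, [1]=link, [2]=avatar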
def sitmap_get(user_info, post_poor, config=None):
    """Sitemap rule (full variant): sort <url> entries by <lastmod>, filter
    them with the configured block lists, then crawl the newest pages for
    titles and dates."""
    from handlers.coreSettings import configs
    link = user_info[1]
    error_sitmap = False
    try:
        result = request.get_data(link + '/sitemap.xml')
        soup = BeautifulSoup(result, 'html.parser')
        url = soup.find_all('url')
        if len(url) == 0:
            # Fall back to the Baidu-style sitemap path.
            result = request.get_data(link + '/baidusitemap.xml')
            soup = BeautifulSoup(result, 'html.parser')
            url = soup.find_all('url')
        new_link_list = []
        for item in url:
            new_link_list.append([item.find('loc'), item.find('lastmod')])

        def takeSecond(elem):
            # str(<lastmod> tag) is '<lastmod>YYYY-MM-DD...', so slicing
            # [9:19] yields the ten-character date used as the sort key.
            return str(elem[1])[9:19]

        new_link_list.sort(key=takeSecond, reverse=True)
        if len(url) == 0:
            # The site probably has no sitemap.
            error_sitmap = True
        block_word = configs.BLOCK_WORD
        new_loc = []
        new_loc_time = []
        # First pass: keep URLs deep enough to be posts and not blocked.
        for loc_item, time in new_link_list:
            limit_number = 5 if loc_item.text[-1] == '/' else 4
            blocked = any(word in loc_item.text for word in block_word)
            if not blocked and loc_item.text.count('/') >= limit_number:
                new_loc.append(loc_item)
                new_loc_time.append(time)
        if len(new_loc) < 1:
            # Second pass: relax the filter, excluding only top-level paths.
            for loc_item, time in new_link_list:
                limit_number = 3 if loc_item.text[-1] == '/' else 2
                blocked = any(word in loc_item.text for word in block_word)
                if not blocked and loc_item.text.count('/') != limit_number:
                    new_loc.append(loc_item)
                    new_loc_time.append(time)
        # Crawl the five newest pages for a title and a publish date.
        if len(new_loc) != 0:
            for i, new_loc_item in enumerate(new_loc[0:5]):
                post_link = new_loc_item.text
                result = request.get_data(post_link)
                if result == 'error':
                    continue
                try:
                    time = find_time(str(result))
                    if time == '':
                        # Fall back to the sitemap's <lastmod> date.
                        time = str(new_loc_time[i])[9:19]
                    soup = BeautifulSoup(result, 'html.parser')
                    strtitle = soup.find('title').text
                    # Trim site-name suffixes off the <title>.
                    for char in configs.BLOCK_CHARS:
                        strtitle = strtitle.split(char, 1)[0].strip()
                    post_poor.append({
                        'title': strtitle,
                        'time': time,
                        'link': post_link,
                        'name': user_info[0],
                        'img': user_info[2],
                        'rule': "sitemap",
                    })
                except Exception:
                    # The page lacks a recognizable time format.
                    error_sitmap = True
    except Exception:
        error_sitmap = True
    return error_sitmap, post_poor
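# `find_time` is another project helper not shown here. Illustration only:
# a stand-in could scan the page source for the first ISO-style date; the
# real helper likely recognizes more formats.
def find_time_sketch(page_source):
    match = re.search(r'\d{4}-\d{2}-\d{2}', page_source)
    return match.group(0) if match else ''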
def get_last_post(user_info, post_poor):
    """Butterfly-theme homepage rule: scrape the newest post from #recent-posts."""
    error_sitmap = False
    link = user_info[1]
    result = request.get_data(link)
    soup = BeautifulSoup(result, 'html.parser')
    main_content = soup.find_all(id='recent-posts')
    time_excit = soup.find_all('time')
    if main_content and time_excit:
        error_sitmap = True
        # Prefer the explicit creation date; fall back to any <time> tag.
        link_list = main_content[0].find_all('time',
                                             {"class": "post-meta-date-created"})
        if link_list == []:
            link_list = main_content[0].find_all('time')
        lasttime = datetime.datetime.strptime('1970-01-01', "%Y-%m-%d")
        for item in link_list:
            time = item.text.replace("|", "").replace(" ", "").replace("\n", "")
            if lasttime < datetime.datetime.strptime(time, "%Y-%m-%d"):
                lasttime = datetime.datetime.strptime(time, "%Y-%m-%d")
        lasttime = lasttime.strftime('%Y-%m-%d')
        last_post_list = main_content[0].find_all('div',
                                                  {"class": "recent-post-info"})
        for item in last_post_list:
            time_created = item.find('time', {"class": "post-meta-date-created"})
            if not time_created:
                time_created = item
            if time_created.find(text=lasttime):
                error_sitmap = False
                a = item.find('a')
                # Links are site-relative; drop the leading segment and
                # rebuild against the homepage URL.
                stralink = a['href'].split("/", 1)[1].strip()
                if link[-1] != '/':
                    link = link + '/'
                post_poor.append({
                    'title': a.text,
                    'time': lasttime,
                    'updated': lasttime,
                    'link': link + stralink,
                    'name': user_info[0],
                    'img': user_info[2],
                    'rule': "butterfly",
                })
    else:
        # The page does not look like a butterfly-style theme.
        error_sitmap = True
    return error_sitmap
def get_last_post(user_info, post_poor):
    """Fluid-theme homepage rule: scrape the newest post from the #board list."""
    error_sitmap = False
    link = user_info[1]
    result = request.get_data(link)
    soup = BeautifulSoup(result, 'html.parser')
    main_content = soup.find_all(id='board')
    time_excit = soup.find_all('div', {"class": "post-meta mr-3"})
    if main_content and time_excit:
        error_sitmap = True
        link_list = main_content[0].find_all('div', {"class": "post-meta mr-3"})
        lasttime = datetime.datetime.strptime('1970-01-01', "%Y-%m-%d")
        for item in link_list:
            time = item.text.replace("|", "").replace(" ", "").replace("\n", "")
            try:
                datetime.datetime.strptime(time, "%Y-%m-%d")
            except ValueError:
                continue  # not a date; skip this meta block
            if lasttime < datetime.datetime.strptime(time, "%Y-%m-%d"):
                lasttime = datetime.datetime.strptime(time, "%Y-%m-%d")
        lasttime = lasttime.strftime('%Y-%m-%d')
        last_post_list = main_content[0].find_all(
            'div', {"class": "row mx-auto index-card"})
        for item in last_post_list:
            time_created = item.find('div',
                                     {"class": "post-meta mr-3"}).text.strip()
            if time_created == lasttime:
                error_sitmap = False
                a = item.find('a')
                stralink = a['href']
                if link[-1] != '/':
                    link = link + '/'
                post_poor.append({
                    'title': item.find('h1', {"class": "index-header"}).text.strip(),
                    'time': lasttime,
                    'link': link + stralink,
                    'name': user_info[0],
                    'img': user_info[2],
                    'rule': "fluid",
                })
    else:
        # The page does not look like a fluid-style theme.
        error_sitmap = True
    return error_sitmap
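# End-to-end sketch (hypothetical driver, not from the project): gather the
# friend list, then run the rule cascade for each friend. The rule order and
# the reuse of `fetch_posts_via_feeds` from the sketch above are assumptions.
def crawl_all(friendpage_link):
    friend_poor, post_poor = [], []
    get_friendlink(friendpage_link, friend_poor)
    for user_info in friend_poor:
        error, post_poor = fetch_posts_via_feeds(user_info, post_poor)
        if error:
            error, post_poor = sitmap_get(user_info, post_poor)
        if error:
            # Last resort: a theme homepage rule (sakura/volantis/butterfly/
            # fluid); each returns True when its layout does not match.
            get_last_post(user_info, post_poor)
    return post_poor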