Example #1
import calendar

from bs4 import BeautifulSoup


def rss2_get(user_info, post_poor, config=None):
    """Fetch the latest posts from a site's RSS 2.0 feed (rss.xml / rss2.xml).

    `request.get_data` is a project-local fetch helper.
    """
    link = user_info[1]
    error_atom = False
    try:
        html = request.get_data(link + "/rss.xml")
        soup = BeautifulSoup(html, 'html.parser')
        items = soup.find_all("item")
        if len(items) == 0:
            html = request.get_data(link + "/rss2.xml")
            soup = BeautifulSoup(html, 'html.parser')
            items = soup.find_all("item")
        count = min(len(items), 5)  # take at most the five newest entries
        new_loc = []
        new_loc_time = []
        if count == 0:
            error_atom = True  # the site probably has no RSS feed
        else:
            for i in range(count):
                post_info = {}
                item = items[i]
                title = item.find("title").text
                url = item.find("link").text
                # pubDate follows RFC 822, e.g. "Wed, 01 Jan 2020 00:00:00 +0000".
                timedata = item.find("pubDate").text.split(" ")
                y, m, d = int(timedata[3]), list(calendar.month_abbr).index(
                    timedata[2]), int(timedata[1])
                time = "{:04d}-{:02d}-{:02d}".format(y, m, d)
                post_info['title'] = title
                post_info['time'] = time
                post_info['updated'] = time
                post_info['link'] = url
                post_info['name'] = user_info[0]
                post_info['img'] = user_info[2]
                post_info['rule'] = "rss2"
                new_loc.append(url)
                new_loc_time.append(time)
                post_poor.append(post_info)
    except Exception:
        error_atom = True  # the feed could not be fetched or parsed
    return error_atom, post_poor
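
A minimal usage sketch (the friend tuple layout (name, link, avatar) follows how `user_info` is indexed; the URLs are illustrative). The same calling convention applies to `atom_get` in the next example:

    friend = ("Alice", "https://alice.example.com", "https://alice.example.com/avatar.png")
    posts = []
    failed, posts = rss2_get(friend, posts)
    if not failed:
        for post in posts:
            print(post['title'], post['time'], post['link'])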
Example #2
def atom_get(user_info, post_poor, config=None):
    """Fetch the latest posts from a site's Atom feed (atom.xml / feed/atom)."""
    link = user_info[1]
    error_atom = False
    try:
        html = request.get_data(link + "/atom.xml")
        soup = BeautifulSoup(html, 'html.parser')
        items = soup.find_all("entry")
        if len(items) == 0:
            html = request.get_data(link + "/feed/atom")
            soup = BeautifulSoup(html, 'html.parser')
            items = soup.find_all("entry")
        count = min(len(items), 5)  # take at most the five newest entries
        new_loc = []
        new_loc_time = []
        if count == 0:
            error_atom = True  # the site probably has no Atom feed
        else:
            for i in range(count):
                post_info = {}
                item = items[i]
                title = item.find("title").text
                url = item.find("link")['href']
                time = item.find("published").text[:10]
                updated = item.find("updated").text[:10]
                post_info['title'] = title
                post_info['time'] = time
                post_info['updated'] = updated
                post_info['link'] = url
                post_info['name'] = user_info[0]
                post_info['img'] = user_info[2]
                post_info['rule'] = "atom"
                new_loc.append(url)
                new_loc_time.append(time)
                post_poor.append(post_info)
    except Exception:
        error_atom = True  # the feed could not be fetched or parsed
    return error_atom, post_poor
Example #3
import datetime
import re
from urllib.parse import urljoin

from bs4 import BeautifulSoup


def get_last_post(user_info, post_poor):
    """Scrape the newest post from a Sakura-theme homepage."""
    error_sitmap = False
    link = user_info[1]
    result = request.get_data(link)
    soup = BeautifulSoup(result, 'html.parser')
    main_content = soup.find_all(id='main')
    time_excit = soup.find_all('div', {"class": "post-date"})
    if main_content and time_excit:
        error_sitmap = True
        link_list = main_content[0].find_all('div', {"class": "post-date"})
        # Track the most recent post date seen on the page.
        lasttime = datetime.datetime.strptime('1970-01-01', "%Y-%m-%d")
        for item in link_list:
            time = item.text
            time = re.search(r"(\d{4}-\d{1,2}-\d{1,2})", time).group(0)
            if lasttime < datetime.datetime.strptime(time, "%Y-%m-%d"):
                lasttime = datetime.datetime.strptime(time, "%Y-%m-%d")
        lasttime = lasttime.strftime('%Y-%m-%d')
        last_post_list = main_content[0].find_all('article', {"class": "post"})
        for item in last_post_list:
            time_created = item.find('div', {
                "class": "post-date"
            }).text.strip()
            time_created = re.search(r"(\d{4}-\d{1,2}-\d{1,2})",
                                     time_created).group(0)
            time_created = datetime.datetime.strptime(
                time_created, "%Y-%m-%d").strftime("%Y-%m-%d")
            if time_created == lasttime:
                error_sitmap = False
                a = item.find('a')
                # Resolve relative or absolute hrefs against the homepage URL.
                post_link = urljoin(link, a['href'])
                post_info = {
                    'title': item.find('h3').text.strip(),
                    'time': lasttime,
                    'updated': lasttime,
                    'link': post_link,
                    'name': user_info[0],
                    'img': user_info[2],
                    'rule': "sakura"
                }
                post_poor.append(post_info)
    else:
        error_sitmap = True  # the page does not look like a Sakura theme
    return error_sitmap
Example #4
def get_last_post(user_info, post_poor):
    """Scrape the newest post from a Volantis-theme homepage."""
    error_sitmap = False
    link = user_info[1]
    result = request.get_data(link)
    soup = BeautifulSoup(result, 'html.parser')
    main_content = soup.find_all('section', {"class": "post-list"})
    time_excit = soup.find_all('time')
    if main_content and time_excit:
        error_sitmap = True
        link_list = main_content[0].find_all('time')
        lasttime = datetime.datetime.strptime('1970-01-01', "%Y-%m-%d")
        for item in link_list:
            # Normalise "| 2020-01-01" style text down to YYYY-MM-DD.
            time = item.text.replace("|", "").replace(" ", "").replace("\n", "")
            if lasttime < datetime.datetime.strptime(time, "%Y-%m-%d"):
                lasttime = datetime.datetime.strptime(time, "%Y-%m-%d")
        lasttime = lasttime.strftime('%Y-%m-%d')
        last_post_list = main_content[0].find_all('div', {"class": "post-wrapper"})
        for item in last_post_list:
            if item.find('time'):
                time_created = item.find('time').text.strip()
            else:
                time_created = ''
            if time_created == lasttime:
                error_sitmap = False
                a = item.find('a')
                # href is site-relative; drop its leading slash and join to the base.
                stralink = a['href'].split("/", 1)[1].strip()
                if link[-1] != '/':
                    link = link + '/'
                post_info = {
                    'title': item.find('h2', {"class": "article-title"}).text.strip(),
                    'time': lasttime,
                    'updated': lasttime,
                    'link': link + stralink,
                    'name': user_info[0],
                    'img': user_info[2],
                    'rule': "volantis"
                }
                post_poor.append(post_info)
    else:
        error_sitmap = True  # the page does not look like a Volantis theme
    return error_sitmap
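
The theme rules above share one signature, so a caller can try them in turn until one succeeds. A minimal dispatch sketch, assuming the identically named functions are imported under distinct, illustrative names such as `sakura_get_last_post` and `volantis_get_last_post`:

    THEME_RULES = [sakura_get_last_post, volantis_get_last_post]

    def try_theme_rules(user_info, post_poor):
        for rule in THEME_RULES:
            failed = rule(user_info, post_poor)
            if not failed:
                return True  # a rule matched and appended the newest post
        return False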
Example #5
def github_issuse(friend_poor, config=None):
    """Collect friend-link entries from the issues of a GitHub repository."""
    baselink = 'https://github.com/'
    errortimes = 0
    if config is None:
        config = load_config()  # project-local loader, as used by gitee_issuse
    try:
        for number in range(1, 100):
            github = request.get_data(
                'https://github.com/' +
                config['setting']['github_friends_links']['owner'] + '/' +
                config['setting']['github_friends_links']['repo'] +
                '/issues?q=is%3A' +
                config['setting']['github_friends_links']['state'] + '&page=' +
                str(number))
            soup = BeautifulSoup(github, 'html.parser')
            main_content = soup.find_all('div', {'aria-label': 'Issues'})
            linklist = main_content[0].find_all('a',
                                                {'class': 'Link--primary'})
            if len(linklist) == 0:
                # No more issues on this page; pagination is exhausted.
                break
            for item in linklist:
                issueslink = baselink + item['href']
                issues_page = request.get_data(issueslink)
                issues_soup = BeautifulSoup(issues_page, 'html.parser')
                try:
                    issues_linklist = issues_soup.find_all('pre')
                    source = issues_linklist[0].text
                    user_info = []
                    info_list = ['name', 'link', 'avatar']
                    reg(info_list, user_info, source)
                    if user_info[1] != '你的链接':  # skip the "your link" template placeholder
                        friend_poor.append(user_info)
                except Exception:
                    errortimes += 1
                    continue
    except Exception:
        pass  # give up silently on network errors or page-layout changes
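
The `config` mapping is expected to carry the repo coordinates used to build the URL above; a minimal sketch of its shape, with illustrative values (the key names are taken from the lookups in the code):

    config = {
        'setting': {
            'github_friends_links': {
                'owner': 'some-user',   # illustrative
                'repo': 'friends',      # illustrative
                'state': 'closed',      # issue state filter used in the query
            }
        }
    }
    friend_poor = []
    github_issuse(friend_poor, config=config)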
Example #6
def gitee_issuse(friend_poor):
    """Collect friend-link entries from the issues of a Gitee repository."""
    baselink = 'https://gitee.com'
    errortimes = 0
    config = load_config()
    try:
        for number in range(1, 100):
            gitee = request.get_data(
                'https://gitee.com/' +
                config['setting']['gitee_friends_links']['owner'] + '/' +
                config['setting']['gitee_friends_links']['repo'] +
                '/issues?state=' +
                config['setting']['gitee_friends_links']['state'] + '&page=' +
                str(number))
            soup = BeautifulSoup(gitee, 'html.parser')
            main_content = soup.find_all(id='git-issues')
            linklist = main_content[0].find_all('a', {'class': 'title'})
            if len(linklist) == 0:
                # No more issues on this page; pagination is exhausted.
                break
            for item in linklist:
                issueslink = baselink + item['href']
                issues_page = request.get_data(issueslink)
                issues_soup = BeautifulSoup(issues_page, 'html.parser')
                try:
                    issues_linklist = issues_soup.find_all('code')
                    source = issues_linklist[0].text
                    user_info = []
                    info_list = ['title', 'url', 'avatar']
                    reg_volantis(info_list, user_info, source)
                    if user_info[1] != '你的链接':  # skip the "your link" template placeholder
                        friend_poor.append(user_info)
                except Exception:
                    errortimes += 1
                    continue
    except Exception:
        pass  # give up silently on network errors or page-layout changes
Example #7
def sitmap_get(user_info, post_poor, config=None):
    """Fetch the latest posts from a site's sitemap (sitemap.xml / baidusitemap.xml)."""
    link = user_info[1]
    error_sitmap = False
    try:
        result = request.get_data(link + '/sitemap.xml')
        soup = BeautifulSoup(result, 'html.parser')
        items = soup.find_all('url')
        if len(items) == 0:
            result = request.get_data(link + '/baidusitemap.xml')
            soup = BeautifulSoup(result, 'html.parser')
            items = soup.find_all('url')
        count = min(len(items), 5)  # take at most the five newest entries
        new_loc = []
        new_loc_time = []
        if count == 0:
            error_sitmap = True  # the site probably has no sitemap
        else:
            for i in range(count):
                item = items[i]
                # Each <url> entry carries a <loc> link and usually a <lastmod> date.
                url = item.find("loc").text
                lastmod = item.find("lastmod")
                time = lastmod.text[:10] if lastmod else ''
                post_info = {
                    'title': '',  # sitemaps carry no titles
                    'time': time,
                    'link': url,
                    'name': user_info[0],
                    'img': user_info[2],
                    'rule': "sitemap"
                }
                new_loc.append(url)
                new_loc_time.append(time)
                post_poor.append(post_info)

    except Exception:
        error_sitmap = True  # the sitemap could not be fetched or parsed
    return error_sitmap, post_poor
Example #8
def get_friendlink(friendpage_link, friend_poor):
    """Scrape friend links from a Volantis-style links page."""
    main_content = []
    result = request.get_data(friendpage_link)
    soup = BeautifulSoup(result, 'html.parser')
    # Volantis "sites" layout
    if len(soup.find_all('a', {"class": "site-card"})) > 0:
        main_content = soup.find_all('a', {"class": "site-card"})
    # Volantis "simple" layout
    elif len(soup.find_all('a', {"class": "simpleuser"})) > 0:
        main_content = soup.find_all('a', {"class": "simpleuser"})
    # Volantis "traditional" layout
    elif len(soup.find_all('a', {"class": "friend-card"})) > 0:
        main_content = soup.find_all('a', {"class": "friend-card"})
    # Otherwise the page carries no standard Volantis friend links.
    for item in main_content:
        if len(item.find_all('img')) > 1:
            img = item.find_all('img')[1].get('src')
        else:
            img = item.find('img').get('src')
        link = item.get('href')
        if item.find('span'):
            name = item.find('span').text
        elif item.find('p'):
            name = item.find('p').text
        else:
            continue  # no recognisable name element; skip this card
        if "#" in link:
            pass
        else:
            user_info = []
            user_info.append(name)
            user_info.append(link)
            user_info.append(img)
            print('----------------------')
            try:
                print('friend name %r' % name)
            except Exception:
                print('invalid user name')
            print('avatar link %r' % img)
            print('homepage link %r' % link)
            friend_poor.append(user_info)
    config = load_config()
    gitee_cfg = config['setting']['gitee_friends_links']
    if gitee_cfg['enable'] and gitee_cfg['type'] == 'volantis':
        gitee_issuse(friend_poor)
    github_cfg = config['setting']['github_friends_links']
    if github_cfg['enable'] and github_cfg['type'] == 'volantis':
        github_issuse(friend_poor)
Example #9
def get_friendlink(friendpage_link, friend_poor):
    """Scrape friend links from a links page whose entries sit under #article-container."""
    result = request.get_data(friendpage_link)
    soup = BeautifulSoup(result, 'html.parser')
    main_content = soup.find_all(id='article-container')
    link_list = main_content[0].find_all('a')
    for item in link_list:
        link = item.get('href')
        if link.count('/') > 3:
            continue  # deeper than a site root; not a friend homepage
        if item.get('title'):
            name = item.get('title')
        else:
            try:
                name = item.find('span').text
            except Exception:
                continue
        try:
            imglist = item.find_all('img')
            # Use the second image when two are present, else the first;
            # lazy-loaded pages keep the real URL in data-lazy-src.
            imgtag = imglist[1] if len(imglist) > 1 else imglist[0]
            img = imgtag.get('data-lazy-src') or imgtag.get('src')
        except Exception:
            continue
        if "#" in link:
            pass
        else:
            user_info = []
            user_info.append(name)
            user_info.append(link)
            user_info.append(img)
            print('----------------------')
            try:
                print('friend name %r' % name)
            except Exception:
                print('invalid user name')
            print('avatar link %r' % img)
            print('homepage link %r' % link)
            friend_poor.append(user_info)
Example #10
def get_friendlink(friendpage_link, friend_poor):
    """Scrape friend links from a links page built of li.link-item entries."""
    result = request.get_data(friendpage_link)
    soup = BeautifulSoup(result, 'html.parser')
    main_content = soup.find_all('li', {"class": "link-item"})
    for item in main_content:
        img = item.find('img').get('data-src')
        link = item.find('a').get('href')
        name = item.find('span').text
        if "#" in link:
            pass
        else:
            user_info = []
            user_info.append(name)
            user_info.append(link)
            user_info.append(img)
            print('----------------------')
            try:
                print('friend name %r' % name)
            except Exception:
                print('invalid user name')
            print('avatar link %r' % img)
            print('homepage link %r' % link)
            friend_poor.append(user_info)
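
All three `get_friendlink` variants above share one calling convention; a minimal usage sketch (the links-page URL is illustrative):

    friend_poor = []
    get_friendlink('https://example.com/link/', friend_poor)
    for name, link, avatar in friend_poor:
        print(name, link, avatar)  # each entry is a [name, link, avatar] triple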
Example #11
def sitmap_get(user_info, post_poor, config=None):
    from handlers.coreSettings import configs
    """Fetch the latest posts from a site's sitemap, sorted by <lastmod>."""
    link = user_info[1]
    error_sitmap = False
    try:
        result = request.get_data(link + '/sitemap.xml')
        soup = BeautifulSoup(result, 'html.parser')
        url = soup.find_all('url')
        if len(url) == 0:
            result = request.get_data(link + '/baidusitemap.xml')
            soup = BeautifulSoup(result, 'html.parser')
            url = soup.find_all('url')
        new_link_list = []
        for item in url:
            # Pair each <loc> link with its <lastmod> timestamp.
            new_link_list.append([item.find('loc'), item.find('lastmod')])

        def takeSecond(elem):
            # str() of a <lastmod> tag looks like '<lastmod>2020-01-01</lastmod>',
            # so characters 9:19 are the YYYY-MM-DD date.
            return str(elem[1])[9:19]

        new_link_list.sort(key=takeSecond, reverse=True)
        if len(url) == 0:
            error_sitmap = True  # the site probably has no sitemap
        block_word = configs.BLOCK_WORD  # URLs containing any of these words are skipped
        new_loc = []
        new_loc_time = []
        for loc_item, time in new_link_list:
            # Post pages normally sit at least this deep in the URL path.
            limit_number = 5 if loc_item.text[-1] == '/' else 4
            blocked = any(word in loc_item.text for word in block_word)
            if not blocked and loc_item.text.count('/') >= limit_number:
                new_loc.append(loc_item)
                new_loc_time.append(time)
        if len(new_loc) < 1:
            # Nothing matched the strict depth rule; relax it and retry.
            for loc_item, time in new_link_list:
                limit_number = 3 if loc_item.text[-1] == '/' else 2
                blocked = any(word in loc_item.text for word in block_word)
                if not blocked and loc_item.text.count('/') != limit_number:
                    new_loc.append(loc_item)
                    new_loc_time.append(time)
        # Crawl each candidate post page for its title and exact date.
        if len(new_loc) != 0:
            for i, new_loc_item in enumerate(new_loc[0:5]):
                post_link = new_loc_item.text
                result = request.get_data(post_link)
                if result == 'error':
                    continue
                try:
                    time = find_time(str(result))
                    if time == '':
                        # Fall back to the <lastmod> date from the sitemap.
                        time = str(new_loc_time[i])[9:19]
                    soup = BeautifulSoup(result, 'html.parser')
                    title = soup.find('title')
                    strtitle = title.text
                    # Trim site-name suffixes (separator chars) off the <title>.
                    block_chars = configs.BLOCK_CHARS
                    for item in block_chars:
                        titlesplit = strtitle.split(item, 1)
                        strtitle = titlesplit[0].strip()
                    post_info = {
                        'title': strtitle,
                        'time': time,
                        'link': post_link,
                        'name': user_info[0],
                        'img': user_info[2],
                        'rule': "sitemap"
                    }
                    post_poor.append(post_info)
                except Exception:
                    # The page lacks a parseable time or title.
                    error_sitmap = True
    except Exception:
        error_sitmap = True  # the sitemap could not be fetched or parsed
    return error_sitmap, post_poor
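
A minimal usage sketch of the sitemap rule (`request.get_data`, `find_time`, and `handlers.coreSettings.configs` are project-local helpers assumed by the code; the friend tuple is illustrative):

    friend = ("Alice", "https://alice.example.com", "https://alice.example.com/avatar.png")
    posts = []
    failed, posts = sitmap_get(friend, posts)
    if not failed:
        for post in posts:
            print(post['title'], post['time'], post['link'])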
Example #12
def get_last_post(user_info, post_poor):
    """Scrape the newest post from a Butterfly-theme homepage."""
    error_sitmap = False
    link = user_info[1]
    result = request.get_data(link)
    soup = BeautifulSoup(result, 'html.parser')
    main_content = soup.find_all(id='recent-posts')
    time_excit = soup.find_all('time')
    if main_content and time_excit:
        error_sitmap = True
        link_list = main_content[0].find_all(
            'time', {"class": "post-meta-date-created"})
        if not link_list:
            # No creation dates on the page; fall back to any <time> tag.
            link_list = main_content[0].find_all('time')
        lasttime = datetime.datetime.strptime('1970-01-01', "%Y-%m-%d")
        for item in link_list:
            time = item.text.replace("|", "").replace(" ", "")
            if lasttime < datetime.datetime.strptime(time, "%Y-%m-%d"):
                lasttime = datetime.datetime.strptime(time, "%Y-%m-%d")
        lasttime = lasttime.strftime('%Y-%m-%d')
        last_post_list = main_content[0].find_all(
            'div', {"class": "recent-post-info"})
        for item in last_post_list:
            time_created = item.find('time',
                                     {"class": "post-meta-date-created"})
            if not time_created:
                time_created = item  # fall back to searching the whole card
            if time_created.find(text=lasttime):
                error_sitmap = False
                a = item.find('a')
                # href is site-relative; drop its leading slash and join to the base.
                stralink = a['href'].split("/", 1)[1].strip()
                if link[-1] != '/':
                    link = link + '/'
                post_info = {
                    'title': a.text,
                    'time': lasttime,
                    'updated': lasttime,
                    'link': link + stralink,
                    'name': user_info[0],
                    'img': user_info[2],
                    'rule': "butterfly"
                }
                post_poor.append(post_info)
    else:
        error_sitmap = True  # the page does not look like a Butterfly theme
    return error_sitmap
Example #13
def get_last_post(user_info, post_poor):
    """Scrape the newest post from a Fluid-theme homepage."""
    error_sitmap = False
    link = user_info[1]
    result = request.get_data(link)
    soup = BeautifulSoup(result, 'html.parser')
    main_content = soup.find_all(id='board')
    time_excit = soup.find_all('div', {"class": "post-meta mr-3"})
    if main_content and time_excit:
        error_sitmap = True
        link_list = main_content[0].find_all('div',
                                             {"class": "post-meta mr-3"})
        lasttime = datetime.datetime.strptime('1970-01-01', "%Y-%m-%d")
        for item in link_list:
            time = item.text.replace("|", "").replace(" ", "").replace("\n", "")
            try:
                datetime.datetime.strptime(time, "%Y-%m-%d")
            except ValueError:
                continue  # not a date; skip this meta block
            if lasttime < datetime.datetime.strptime(time, "%Y-%m-%d"):
                lasttime = datetime.datetime.strptime(time, "%Y-%m-%d")
        lasttime = lasttime.strftime('%Y-%m-%d')
        last_post_list = main_content[0].find_all(
            'div', {"class": "row mx-auto index-card"})

        for item in last_post_list:
            time_created = item.find('div', {
                "class": "post-meta mr-3"
            }).text.strip()

            if time_created == lasttime:
                error_sitmap = False
                a = item.find('a')
                stralink = a['href']
                if link[-1] != '/':
                    link = link + '/'
                post_info = {
                    'title': item.find('h1', {"class": "index-header"}).text.strip(),
                    'time': lasttime,
                    'link': link + stralink,
                    'name': user_info[0],
                    'img': user_info[2],
                    'rule': "fluid"
                }
                post_poor.append(post_info)
    else:
        error_sitmap = True  # the page does not look like a Fluid theme
    return error_sitmap