Code example #1
0
File: posts.py  Project: zhusijia/junior_spider
import re
from lxml import etree
import requests
import time
from threading import Thread

from crawler import PostsCrawler
from mysql_manager import MysqlManager

max_threads = 10
interval = 20
mysql_mgr = MysqlManager(max_threads)


def post_crawl_task(topic):
    # Get 1st page of this topic
    post_crawler = PostsCrawler()
    post_crawler.get_content(topic['url'], 1)
    posts = post_crawler.get_posts()

    # Get number of pages of this topic
    page_count = post_crawler.get_max_page()

    print(topic['url'])
    print('page count', page_count)

    # Get the rest posts of this topic
    if page_count > 1:
        for i in range(2, page_count + 1):
            post_crawler.get_content(topic['url'], i)
            posts += post_crawler.get_posts()
Code example #2
0
import json
from mysql_manager import MysqlManager

mysql = MysqlManager(4)

with open('videos.json', 'r') as f:
    i = 1
    while True:
        print("Parse json: ", i)
        i+= 1
        line = f.readline()

        if not line:
            break

        if len(line) < 10:
            continue

        # urls = re.findall('http://v3-dy.ixigua.com[^\"]+', json_str)
        obj = json.loads(line)

        # aweme_list->[n]->video->play_addr->url_list
        i_url = 0
        for v in obj['aweme_list']:
            # print("-----", i_url)
            try:
                url = v['video']['play_addr']['url_list'][0]
            except Exception as err:
                print("parse error ", i, " index: ", i_url)
            i_url += 1
            # print(url)
Code example #3
0
import re
from lxml import etree
import requests
import time
import global_var

from mysql_manager import MysqlManager

mysql_mgr = MysqlManager(4)


class BoardsCrawler:
    domain = 'http://www.newsmth.net/'

    base_url = domain + '/nForum/section/{}?ajax'

    def __init__(self, interval=1):
        self.interval = interval

    def get_board_of_section(self, section_idx):
        url = self.base_url.format(section_idx)
        response = requests.get(url, headers=global_var.newsmth_headers)
        time.sleep(self.interval)
        self.content = response.text
        self.tree = etree.HTML(self.content)

    def get_board_list(self, etr_obj=None):
        if etr_obj is None:
            etr_obj = self.tree
        elements = etr_obj.xpath(
            '//table[@class="board-list corner"]/tbody/tr')
Code example #4
0
File: weibo.py  Project: renji01/learning_python
 def __init__(self, limit=500):
     self.reply_limit = limit
     self.mm = MysqlManager(4)
Code example #5
0
import re
from lxml import etree
import requests
from threading import Thread
import time
import html
from mysql_manager import MysqlManager
from crawler import PostsCrawler

max_threads = 10
wait_duration = 20

mysql_mgr = MysqlManager(10)

def post_crawl_task(topic):
        # Get 1st page of this topic
        post_crawler = PostsCrawler()
        post_crawler.get_content(topic['url'], 1)
        posts = post_crawler.get_posts()

        # Get number of pages of this topic
        page_count = post_crawler.get_max_page()

        # Get the rest posts of this topic
        if page_count > 1:
            for i in range(2, page_count + 1):
                post_crawler.get_content(topic['url'], i)
                posts += post_crawler.get_posts()
                break
        
        # Insert post of a topic
Code example #6
0
File: weibo.py  Project: zky959/spider-course-5
 def __init__(self, limit=200):
     self.reply_limit = limit
     self.mm = MysqlManager(4)
     self.post = {}