import os
import uuid

from bs4 import BeautifulSoup as bs

# BaseProcessor, Rule, LinkExtractor and Request are assumed to come from the
# host crawler framework these processors plug into (import path omitted here).


class MezituProcessor(BaseProcessor):
    spider_id = 'mzitu'
    spider_name = 'mzitu'
    allowed_domains = ['mzitu.com', 'meizitu.net']
    start_requests = [Request(url='http://www.mzitu.com/xinggan/')]

    # Image URLs are routed to save(); gallery, detail and pagination pages
    # are followed only to discover further links.
    rules = (
        Rule(LinkExtractor(
            regex_str=r"http://i.meizitu.net/\d{4}/\d{2}/[0-9a-z]+.jpg"),
             callback="save",
             priority=3),
        Rule(LinkExtractor(regex_str=r"http://www.mzitu.com/\d+"), priority=1),
        Rule(LinkExtractor(regex_str=r"http://www.mzitu.com/\d+/\d+"),
             priority=2),
        Rule(LinkExtractor(regex_str=r"http://www.mzitu.com/xinggan/page/\d+"),
             priority=0),
    )

    def save(self, response):
        if response.m_response:
            if not os.path.exists("img"):
                os.mkdir("img")
            with open("img/" + str(uuid.uuid1()) + ".jpg", 'wb') as fs:
                fs.write(response.m_response.content)
                print("download success!")
class FeProcessor(BaseProcessor):
    spider_id = 'fe'
    spider_name = 'fe'
    allowed_domains = ['58.com']
    start_requests = [Request(url='http://www.58.com/daikuan/changecity/')]

    rules = (
        Rule(LinkExtractor(regex_str=r"http://[a-z]*?.58.com/daikuan/"), priority=0),
        Rule(LinkExtractor(regex_str=r"/daikuan/pn\d+/"), priority=1),
        Rule(LinkExtractor(css_str="table.small-tbimg a.t"), priority=3, callback='save'),
    )

    def save(self, response):
        if response.m_response:
            # Print the page title as a quick sanity check on the fetched page.
            print(bs(response.m_response.content, 'lxml').title.string)
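
# --- Illustration (not part of the processor above) ---
# A minimal standalone sketch of what LinkExtractor(css_str="table.small-tbimg a.t")
# boils down to: select anchors with a CSS selector and collect their hrefs.
# Uses requests and BeautifulSoup directly; the framework may additionally
# normalise relative URLs before queueing them.
def extract_links_by_css(page_url, css_selector="table.small-tbimg a.t"):
    import requests

    html = requests.get(page_url, timeout=10).text
    soup = bs(html, 'lxml')
    return [a.get('href') for a in soup.select(css_selector) if a.get('href')]

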
class CityLocationProcessor(BaseProcessor):
    spider_id = 'city'
    spider_name = 'city'
    allowed_domains = ['supfree.net']
    start_requests = [Request(url='http://jingwei.supfree.net/')]

    rules = (
        Rule(LinkExtractor(regex_str=r"kongzi\.asp\?id=\d+"), priority=0),
        Rule(LinkExtractor(regex_str=r"mengzi\.asp\?id=\d+"),
             priority=1,
             only_first=True,
             callback='save'),
    )

    def save(self, response):
        if response.m_response:
            soup = bs(response.m_response.content, 'lxml')
            # First paragraph: "province city area" separated by spaces; reuse
            # the available parts when fewer than three are present.
            name = soup.select("div.cdiv p")[0].string.strip().split(' ')
            if len(name) > 2:
                province = name[0]
                city = name[1]
                area = name[2]
            elif len(name) > 1:
                province = name[0]
                city = name[0]
                area = name[1]
            else:
                province = name[0]
                city = name[0]
                area = name[0]
            # Second paragraph: longitude and latitude in two <span> tags.
            lo = soup.select("div.cdiv p")[1].select("span")[0].string.strip()
            la = soup.select("div.cdiv p")[1].select("span")[1].string.strip()
            data = province + ',' + city + ',' + area + ',' + lo + ',' + la
            print(data)
            with open('city.txt', 'a+') as fs:
                fs.write(data + '\n')
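
# --- Illustration (not part of the processor above) ---
# Each line appended to city.txt has the form
# "province,city,area,longitude,latitude". A small sketch of reading the file
# back into a dictionary keyed by (province, city, area):
def load_city_coordinates(path='city.txt'):
    import csv

    coordinates = {}
    with open(path) as fs:
        for row in csv.reader(fs):
            if len(row) != 5:
                continue  # skip malformed lines
            province, city, area, lon, lat = row
            coordinates[(province, city, area)] = (lon, lat)
    return coordinates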