Python Logの例、ScrapyStudy.public.Log.Log Pythonの例

コード例 #1

0

ファイルを表示

 def __init__(self):
     """Prepare the JSON exporter and the MongoDB collection handle.

     Opens ``sina7x24.json`` under ``Config.get_results_path()`` in binary
     write mode, starts a ``JsonItemExporter`` on it, and binds
     ``self.collection`` to collection ``7x24`` of database ``sinadb``.
     """
     self.log = Log().get_logger()

     # JSON output through the built-in JsonItemExporter (a hand-written
     # json-based writer would be the alternative).
     json_path = Config.get_results_path() + 'sina7x24.json'
     self.file = open(json_path, 'wb')
     exporter_options = dict(indent=0, encoding='utf-8')
     self.exporter = JsonItemExporter(self.file, **exporter_options)
     # Alternative exporter, currently disabled:
     # self.exporter = MyJsonLinesItemExporter(self.file, indent=0, encoding='utf-8')
     self.exporter.start_exporting()

     # Database storage: pick the database first, then the collection.
     mongo = pymongo.MongoClient('mongodb://*****:*****@localhost:27017/')
     self.collection = mongo['sinadb']['7x24']

コード例 #2

0

ファイルを表示

 def __init__(self):
     """Prepare the JSON and CSV exporters for the JD results.

     Both output files (``jd.json`` and ``jd.csv``) are created under
     ``Config.get_results_path()`` and opened in binary write mode.
     """
     self.log = Log().get_logger()

     # JSON output through the built-in JsonItemExporter.
     json_path = Config.get_results_path() + 'jd.json'
     self.file = open(json_path, 'wb')
     self.exporter = JsonItemExporter(self.file, indent=0, encoding='utf-8')
     self.exporter.start_exporting()

     # CSV output through the custom MyCsvItemExporter, which lets the
     # caller choose the column order (and delimiter).
     csv_path = Config.get_results_path() + 'jd.csv'
     self.csv_file = open(csv_path, 'wb')
     csv_options = dict(
         fields=['title', 'price', 'comment', 'product_id'],
         file=self.csv_file,
         encoding='gbk',
     )
     self.csv_exporter = MyCsvItemExporter(**csv_options)

コード例 #3

0

ファイルを表示

class ItcastPipeline(object):
    """Item pipeline that writes ``ItcastItem`` objects to JSON and CSV files.

    The output files (``teachers.json`` / ``teachers.csv``) live under
    ``Config.get_results_path()``. A MongoDB collection handle
    (``scrapydb.teachers``) is also prepared, although inserting into it is
    currently disabled in ``process_item``.
    """

    def __init__(self):
        """Open the output files, start the exporters and connect to MongoDB."""
        self.log = Log().get_logger()

        # JSON output through the built-in JsonItemExporter (a hand-written
        # json-based writer would be the alternative).
        filename = Config.get_results_path() + 'teachers.json'
        # self.file = open(filename, 'w' , encoding='utf-8')
        self.file = open(filename, 'wb')
        self.exporter = JsonItemExporter(self.file, indent=0, encoding='utf-8')
        self.exporter.start_exporting()

        # CSV output through the custom MyCsvItemExporter, which lets the
        # caller choose the column order (and delimiter).
        csv_filename = Config.get_results_path() + 'teachers.csv'
        self.csv_file = open(csv_filename, 'wb')
        fields = ['name', 'grade', 'info']
        self.csv_exporter = MyCsvItemExporter(fields=fields,
                                              file=self.csv_file,
                                              encoding='utf-8')

        # Database storage. The client is kept on the instance so that
        # close_spider() can release the connection.
        self.client = pymongo.MongoClient(
            'mongodb://*****:*****@localhost:27017/')
        db = self.client['scrapydb']  # select the database
        self.collection = db['teachers']  # select the collection

    def process_item(self, item, spider):
        """Export ``ItcastItem`` instances; pass every item on unchanged.

        Called for each item the spider returns. Items of other types are
        returned untouched.
        """
        if isinstance(item, ItcastItem):
            # Hand-written JSON alternative, currently disabled:
            # line = json.dumps(dict(item), ensure_ascii=False) + ",\n"
            # self.file.writelines(line)
            self.exporter.export_item(item)
            self.csv_exporter.export_item(item)
            # MongoDB insertion, currently disabled:
            # result = self.collection.insert_one(dict(item))
            # self.log.info(result.inserted_id)
        return item

    def open_spider(self, spider):
        """Optional hook invoked when the spider is opened; only logs."""
        self.log.info('open_itcast_spider')

    def close_spider(self, spider):
        """Finish the JSON export and release files and the MongoDB client.

        Optional hook invoked when the spider is closed.
        NOTE(original author): this hook was observed as not being called in
        their runs; the cause was not determined.
        """
        try:
            self.exporter.finish_exporting()
        finally:
            # Release every resource even if finishing the export fails.
            self.file.close()
            self.csv_file.close()
            client = getattr(self, 'client', None)
            if client is not None:
                client.close()
        self.log.info('close_itcast_spider')

コード例 #4

0

ファイルを表示

 def __init__(self):
     """Open the teacher output files and prepare the MongoDB collection.

     Creates ``teachers.json`` and ``teachers.csv`` under
     ``Config.get_results_path()`` (binary write mode), starts the JSON
     exporter, and binds ``self.collection`` to ``scrapydb.teachers``.
     """
     self.log = Log().get_logger()

     # JSON output through the built-in JsonItemExporter.
     json_path = Config.get_results_path() + 'teachers.json'
     # Text-mode alternative, currently disabled:
     # self.file = open(filename, 'w' , encoding='utf-8')
     self.file = open(json_path, 'wb')
     self.exporter = JsonItemExporter(self.file, indent=0, encoding='utf-8')
     self.exporter.start_exporting()

     # CSV output through the custom MyCsvItemExporter, which lets the
     # caller choose the column order (and delimiter).
     csv_path = Config.get_results_path() + 'teachers.csv'
     self.csv_file = open(csv_path, 'wb')
     csv_options = dict(
         fields=['name', 'grade', 'info'],
         file=self.csv_file,
         encoding='utf-8',
     )
     self.csv_exporter = MyCsvItemExporter(**csv_options)

     # Database storage: pick the database first, then the collection.
     mongo = pymongo.MongoClient('mongodb://*****:*****@localhost:27017/')
     self.collection = mongo['scrapydb']['teachers']

コード例 #5

0

ファイルを表示

ファイル: MongoDBStudy.py プロジェクト: pgsheng/ScrapyStudy

    def __init__(self):
        """Connect to MongoDB, select ``sinadb`` / ``7x24`` and log whether they exist."""
        self.log = Log().get_logger()

        # 1. Connect. MongoClient normally takes the host as first argument
        #    and the port as second (27017 when omitted); a connection URI
        #    such as the one below achieves the same.
        self.client = pymongo.MongoClient(
            'mongodb://*****:*****@localhost:27017/')

        # 2. Select the database. A MongoDB database (and likewise a
        #    collection) only really comes into existence once a document
        #    has been inserted into it. Attribute access and item access
        #    (self.client['testdb']) are equivalent.
        self.db = self.client.sinadb
        # self.db = self.client.scrapydb

        # 2.2 List all database names and report whether ours is present.
        db_exists = "sinadb" in self.client.list_database_names()
        self.log.info("数据库已存在！" if db_exists else "数据库不存在！")

        # 3. Select the collection (roughly a table in a relational
        #    database). As with databases, both access styles work.
        # self.collection = self.db.teachers
        # self.collection = self.db['students']
        self.collection = self.db['7x24']

        # Report whether the "7x24" collection is present.
        collection_exists = "7x24" in self.db.list_collection_names()
        self.log.info("集合已存在！" if collection_exists else "集合不存在！")

コード例 #6

0

ファイルを表示

class JDPipeline(object):
    """Item pipeline that writes every JD item to ``jd.json`` and ``jd.csv``.

    Both files are created under ``Config.get_results_path()``.
    """

    def __init__(self):
        """Open the output files and set up the JSON and CSV exporters."""
        self.log = Log().get_logger()

        # JSON output through the built-in JsonItemExporter (a hand-written
        # json-based writer would be the alternative).
        filename = Config.get_results_path() + 'jd.json'
        self.file = open(filename, 'wb')
        self.exporter = JsonItemExporter(self.file, indent=0, encoding='utf-8')
        self.exporter.start_exporting()

        # CSV output through the custom MyCsvItemExporter, which lets the
        # caller choose the column order (and delimiter).
        csv_filename = Config.get_results_path() + 'jd.csv'
        self.csv_file = open(csv_filename, 'wb')
        fields = ['title', 'price', 'comment', 'product_id']
        self.csv_exporter = MyCsvItemExporter(fields=fields,
                                              file=self.csv_file,
                                              encoding='gbk')

    def process_item(self, item, spider):
        """Export the item to both outputs and pass it on unchanged.

        Called for each item the spider returns.
        """
        self.log.info('这是京东网站数据')

        self.exporter.export_item(item)
        self.csv_exporter.export_item(item)
        return item

    def open_spider(self, spider):
        """Optional hook invoked when the spider is opened; only logs."""
        self.log.info('open_jd_spider')

    def close_spider(self, spider):
        """Finish the JSON export and close both output files.

        Optional hook invoked when the spider is closed.
        NOTE(original author): this hook was observed as not being called in
        their runs; the cause was not determined.
        """
        try:
            self.exporter.finish_exporting()
        finally:
            # Close both files even if finishing the export fails; the CSV
            # file was previously left open.
            self.file.close()
            self.csv_file.close()
        self.log.info('close_jd_spider')

コード例 #7

0

ファイルを表示

class BeautyImagePipeline(ImagesPipeline):
    """Image pipeline that stores each image as ``<item title>/<url basename>``."""

    log = Log().get_logger()

    def get_media_requests(self, item, info):
        """Yield one download request per URL in ``item['image_url']``.

        The item title travels with each request in ``meta`` so that
        ``file_path`` can use it as the folder name.
        """
        for url in item['image_url']:
            self.log.info(url)
            download = Request(url, meta={'title': item['title']})
            yield download

    def file_path(self, request, response=None, info=None):
        """Return the storage path for the image of ``request``.

        Replaces the default hash-based file name with the title carried in
        ``request.meta`` (folder) plus the last path segment of the URL
        (file name).
        """
        folder = request.meta['title']  # title handed over through meta
        basename = request.url.rsplit('/', 1)[-1]  # last segment of the URL
        return '%s/%s' % (folder, basename)

コード例 #8

0

ファイルを表示

ファイル: BeautyImageSpider.py プロジェクト: pgsheng/ScrapyStudy

class BeautyImageSpider(scrapy.Spider):
    """Spider that collects the image URLs and title of one article page.

    The scraped ``BeautyImageItem`` is handed to ``BeautyImagePipeline``
    (see ``custom_settings``), which is configured to store files under
    ``IMAGES_STORE``.
    """

    name = "BeautyImageSpider"
    allowed_domains = ["lab.scrapyd.cn"]
    start_urls = ['http://lab.scrapyd.cn/archives/55.html']
    custom_settings = {
        'ITEM_PIPELINES': {
            'ScrapyStudy.pipelines.BeautyImagePipeline': 300,
        },
        # 'DOWNLOADER_MIDDLEWARES': {"ScrapyStudy.middlewares.SeleniumMiddleware": 401, },
        # Image storage location. Raw string: "\I" is not a valid escape
        # sequence, so a plain literal triggers an invalid-escape warning.
        # The resulting value is unchanged.
        'IMAGES_STORE': r'D:\ImageSpider',
    }

    def __init__(self, *args, **kwargs):
        """Attach the project logger and initialise the Scrapy spider.

        Positional and keyword arguments are forwarded to
        ``scrapy.Spider.__init__`` so that spider arguments (for example
        those passed with ``-a NAME=VALUE``) do not raise ``TypeError``.
        """
        self.log = Log().get_logger()
        super(BeautyImageSpider, self).__init__(*args, **kwargs)

    def start_requests(self):
        """Yield one request per entry in ``start_urls``, parsed by ``parse``."""
        for url in self.start_urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Build a single ``BeautyImageItem`` from the article page.

        ``image_url`` receives every ``img/@src`` found below
        ``div.post-content``; ``title`` receives the first text of
        ``.post-title a`` (``None`` when nothing matches).
        """
        # Optional debugging aid, currently disabled: dump the raw page.
        # filename = Config.get_results_path() + "beauty_image.html"
        # with open(filename, 'wb+') as file:  # must be opened in binary mode
        #     file.write(response.body)

        item = BeautyImageItem()
        # CSS alternative for the same image URL list:
        # image_urls = response.css(".post-content img::attr(src)").extract()
        item['image_url'] = response.xpath(
            '//div[@class="post-content"]//img/@src').extract()
        item['title'] = response.css(".post-title a::text").extract_first()

        yield item  # hand the finished item straight to the pipelines

    def closed(self, spider):
        """Log that the spider has finished.

        NOTE(review): Scrapy's ``closed`` hook is documented as receiving the
        close reason; the parameter name is kept for compatibility.
        """
        self.log.info("BeautyImageSpider_closed")

コード例 #9

0

ファイルを表示

class Sina7x24Pipeline(object):
    """Item pipeline that exports ``Sina7x24Item`` objects to JSON and MongoDB.

    Items are written to ``sina7x24.json`` under
    ``Config.get_results_path()`` and upserted into collection ``7x24`` of
    database ``sinadb``, keyed on their ``date`` field.
    """

    def __init__(self):
        """Open the JSON output, start the exporter and connect to MongoDB."""
        self.log = Log().get_logger()

        # JSON output through the built-in JsonItemExporter (a hand-written
        # json-based writer would be the alternative).
        filename = Config.get_results_path() + 'sina7x24.json'
        self.file = open(filename, 'wb')
        self.exporter = JsonItemExporter(self.file, indent=0, encoding='utf-8')
        # self.exporter = MyJsonLinesItemExporter(self.file, indent=0, encoding='utf-8')
        self.exporter.start_exporting()

        # Database storage. The client is kept on the instance so that
        # close_spider() can release the connection.
        self.client = pymongo.MongoClient(
            'mongodb://*****:*****@localhost:27017/')
        db = self.client['sinadb']  # select the database
        self.collection = db['7x24']  # select the collection

    def process_item(self, item, spider):
        """Export and upsert ``Sina7x24Item`` instances; pass every item on.

        Called for each item the spider returns. Items of other types are
        returned untouched.
        """
        if isinstance(item, Sina7x24Item):
            self.exporter.export_item(item)

            # Upsert keyed on "date": update documents with the same date,
            # or insert a new one when none matches.
            myquery = {"date": item['date']}
            newvalues = {"$set": dict(item)}
            self.collection.update_many(myquery, newvalues, upsert=True)
        return item

    def open_spider(self, spider):
        """Optional hook invoked when the spider is opened; only logs."""
        self.log.info('open_sina7x24_spider')

    def close_spider(self, spider):
        """Finish the JSON export and release the file and the MongoDB client.

        Optional hook invoked when the spider is closed.
        NOTE(original author): this hook was observed as not being called in
        their runs; the cause was not determined.
        """
        try:
            self.exporter.finish_exporting()
        finally:
            # Release every resource even if finishing the export fails.
            self.file.close()
            client = getattr(self, 'client', None)
            if client is not None:
                client.close()
        self.log.info('close_sina7x24_spider')

コード例 #10

0

ファイルを表示

class Command(ScrapyCommand):
    """Custom Scrapy command that schedules several spiders in one process.

    Every spider named on the command line is crawled; when no name is
    given, every spider known to the spider loader is crawled.
    """

    requires_project = True
    log = Log().get_logger()

    def syntax(self):
        """Return the usage syntax string of the command."""
        return '[options]'

    def short_desc(self):
        """Return the one-line description of the command."""
        return 'Runs all of the spiders'

    def add_options(self, parser):
        """Register ``-a``, ``-o/--output`` and ``-t/--output-format``."""
        ScrapyCommand.add_options(self, parser)
        option_specs = (
            (("-a",),
             dict(dest="spargs",
                  action="append",
                  default=[],
                  metavar="NAME=VALUE",
                  help="set spider argument (may be repeated)")),
            (("-o", "--output"),
             dict(metavar="FILE",
                  help="dump scraped items into FILE (use - for stdout)")),
            (("-t", "--output-format"),
             dict(metavar="FORMAT",
                  help="format to use for dumping items with -o")),
        )
        for flags, settings in option_specs:
            parser.add_option(*flags, **settings)

    def process_options(self, args, opts):
        """Convert the repeated ``-a NAME=VALUE`` values into a dict.

        Raises:
            UsageError: when a ``-a`` value cannot be converted.
        """
        ScrapyCommand.process_options(self, args, opts)
        try:
            spider_arguments = arglist_to_dict(opts.spargs)
        except ValueError:
            raise UsageError("Invalid -a value, use -a NAME=VALUE",
                             print_help=False)
        opts.spargs = spider_arguments

    def run(self, args, opts):
        """Schedule the selected spiders, then start the crawler process."""
        process = self.crawler_process
        loader = process.spider_loader
        targets = args if args else loader.list()
        for spider_name in targets:
            self.log.info('准备爬取 %s' % spider_name)
            # The same spider arguments are handed to every spider.
            process.crawl(spider_name, **opts.spargs)

        process.start()

コード例 #11

0

ファイルを表示

ファイル: FileUtils.py プロジェクト: pgsheng/ScrapyStudy

class FileUtils(object):
    """File helpers: YAML/text read and write, bulk delete, random copy, Word copy."""

    def __init__(self):
        self.log = Log().get_logger()

    def read_yaml(self, path):
        """Load a YAML file and return its content as a dict.

        Returns an empty dict when the file cannot be read or parsed (the
        error is logged) or when the top-level YAML value is not a mapping.
        """
        try:
            with open(path, 'r', encoding='utf-8') as file:
                # An explicit Loader is required: PyYAML 6 rejects
                # yaml.load() without one. FullLoader is what PyYAML 5.x
                # used when the argument was omitted.
                # NOTE(security): prefer yaml.safe_load if the file can come
                # from an untrusted source.
                data_dict = yaml.load(file, Loader=yaml.FullLoader)
        except Exception as e:
            self.log.error('读取yaml文件出现异常：%s' % e)
            return dict()
        self.log.info('读取yaml文件成功')
        return data_dict if isinstance(data_dict, dict) else dict()

    def write_yaml(self, path, data):
        """Dump ``data`` as YAML to ``path``, overwriting any existing file.

        Returns ``True`` on success and ``None`` when an error occurred (the
        error is logged).
        """
        try:
            # 'w' overwrites the file; use 'a' to append instead.
            with open(path, 'w', encoding='utf-8') as file:
                yaml.dump(data, file)
        except Exception as e:
            self.log.error('数据写入yaml文件出现异常：%s' % e)
            return None
        self.log.info('数据写入yaml文件成功')
        return True

    def read_txt(self, path):
        """Read a text file and return its non-empty lines.

        Each line is stripped and has its spaces removed; lines that end up
        empty are dropped. Returns ``None`` when nothing is left or when the
        file cannot be read (the error is logged).
        """
        try:
            # utf-8-sig transparently drops a leading BOM.
            with open(path, 'r', encoding='utf-8-sig') as file:
                stripped = (line.strip().replace(" ", "") for line in file)
                msg_list = [line for line in stripped if line]
        except Exception as e:
            self.log.error('读取txt文件出现异常：%s' % e)
            return None
        return msg_list or None

    def write_txt(self, path, text):
        """Write ``str(text)`` to ``path``, overwriting any existing file.

        Returns ``True`` on success and ``None`` when an error occurred (the
        error is logged).
        """
        try:
            # 'w' overwrites the file; use 'a' to append instead.
            with open(path, 'w', encoding='utf-8') as file:
                file.write(str(text))
        except Exception as e:
            self.log.error('数据写入txt文件出现异常：%s' % e)
            return None
        self.log.info('数据写入文件成功')
        return True

    def del_file(self, path):
        """Recursively delete every file below ``path``.

        The directory tree itself is kept; only files are removed. Symbolic
        links are removed as links and never followed, so files outside
        ``path`` are not touched.
        """
        for entry in os.listdir(path):
            c_path = os.path.join(path, entry)
            if os.path.isdir(c_path) and not os.path.islink(c_path):
                self.del_file(c_path)
            else:
                os.remove(c_path)

    def copy_file_random(self, file_dir, tar_dir, num):
        """Copy ``num`` randomly chosen entries of ``file_dir`` into ``tar_dir``.

        Raises ``ValueError`` (from ``random.sample``) when ``file_dir`` holds
        fewer than ``num`` entries.
        """
        # os.path.join works with or without a trailing separator on the
        # directory arguments.
        for name in random.sample(os.listdir(file_dir), num):
            shutil.copyfile(os.path.join(file_dir, name),
                            os.path.join(tar_dir, name))

    def clip_file_random(self, file_dir, tar_dir, num):
        """Move ``num`` random entries to ``tar_dir`` and replicate each 32 times.

        Each chosen file is copied to ``tar_dir``, removed from ``file_dir``
        and then duplicated inside ``tar_dir`` through ``copy_redo``.
        """
        for name in random.sample(os.listdir(file_dir), num):
            source = os.path.join(file_dir, name)
            shutil.copyfile(source, os.path.join(tar_dir, name))
            os.remove(source)
            self.copy_redo(tar_dir, name, 32)

    def copy_redo(self, file_dir, name, num):
        """Create ``num`` copies of ``name`` inside ``file_dir``.

        The copies are named ``0<name>``, ``1<name>``, ... ``<num-1><name>``.
        """
        source = os.path.join(file_dir, name)
        for index in range(num):
            shutil.copyfile(source, os.path.join(file_dir, str(index) + name))

    def copy_word(self, path):
        """Copy the whole content of a Word document to the clipboard.

        Text, images and formatting are all copied. According to the original
        author's testing, ``path`` has to be an absolute path. Errors are
        logged and never propagated.
        """
        try:
            pythoncom.CoInitialize()
            w = win32com.client.DispatchEx('Word.Application')
            try:
                # Run in the background: no window, no alerts.
                w.Visible = 0
                w.DisplayAlerts = 0
                doc = w.Documents.Open(path)
                doc.Content.Copy()  # copy the entire document content
                doc.Close()
            except Exception as e:
                self.log.error('复制word文档出现异常：%s' % e)
            finally:
                if w:  # always make sure the Word application exits
                    self.log.info('退出word应用')
                    w.Quit()
                    del (w)
                pythoncom.CoUninitialize()
        except Exception as e:
            # Starting or shutting down the COM application failed; report
            # it instead of swallowing it silently.
            self.log.error('操作word应用出现异常：%s' % e)

コード例 #12

0

ファイルを表示

ファイル: FileUtils.py プロジェクト: pgsheng/ScrapyStudy

 def __init__(self):
     """Attach the project logger to the instance as ``self.log``."""
     logger = Log().get_logger()
     self.log = logger

コード例 #13

0

ファイルを表示

ファイル: Sina7x24SeleniumSpider.py プロジェクト: pgsheng/ScrapyStudy

class Sina7x24SeleniumSpider(scrapy.Spider):
    """Spider that polls the Sina finance 7x24 page through headless Firefox.

    The page is requested again and again (see ``start_requests``); items
    whose timestamp was already seen in an earlier pass are skipped.
    """

    name = "Sina7x24SeleniumSpider"
    allowed_domains = ["finance.sina.com.cn"]
    # start_urls = [
    #     'http://finance.sina.com.cn/7x24/'
    # ]
    custom_settings = {
        'ITEM_PIPELINES': {
            'ScrapyStudy.pipelines.Sina7x24Pipeline': 300,
        },
        'DOWNLOADER_MIDDLEWARES': {
            "ScrapyStudy.middlewares.SeleniumMiddleware": 401,
        },
        'CONCURRENT_REQUESTS': 1,
    }

    def __init__(self, *args, **kwargs):
        """Start the browser, reset the de-duplication state, init the spider.

        Positional and keyword arguments are forwarded to
        ``scrapy.Spider.__init__`` so that spider arguments (for example
        those passed with ``-a NAME=VALUE``) do not raise ``TypeError``.
        """
        self.log = Log().get_logger()
        self.driver_firefox()
        self.date_list = []  # timestamps of the items already emitted
        self.is_first = True  # True until the first page has been parsed
        super(Sina7x24SeleniumSpider, self).__init__(*args, **kwargs)

    def driver_firefox(self):
        """Create a headless Firefox driver and store it in ``self.driver``."""
        options = Options()  # the Options import path differs per browser
        options.add_argument('-headless')  # run without a visible window
        # Starting the browser is slow.
        # NOTE(review): ``firefox_options=`` is the legacy keyword; newer
        # Selenium releases expect ``options=`` - confirm against the
        # installed version before changing it.
        self.driver = webdriver.Firefox(firefox_options=options)
        # self.driver = webdriver.Firefox()
        # self.driver.maximize_window()
        self.driver.set_page_load_timeout(25)

    def start_requests(self):
        """Endlessly yield a request for the 7x24 page, about every 25 seconds.

        Before each request the driver is probed; when the probe fails with
        a lost-connection error the browser is started again.
        """
        urls = 'http://finance.sina.com.cn/7x24/'
        while True:
            # for i in range(50):
            try:
                # Liveness probe: reading the title fails when the browser
                # process is gone.
                self.driver.title
            except Exception as e:
                self.log.info('浏览器进程被干掉，重新驱动浏览器')
                if 'without establishing a connection' in str(e):
                    self.driver_firefox()
            time.sleep(25)
            # dont_filter=True: the same URL must not be de-duplicated.
            yield scrapy.Request(url=urls,
                                 callback=self.parse,
                                 dont_filter=True)

    def parse(self, response):
        """Extract the news entries of the page, oldest first.

        On the first pass every entry is returned. On later passes entries
        whose timestamp is already in ``self.date_list`` are skipped and each
        new entry is printed after a separator line.

        Returns:
            list: the ``Sina7x24Item`` objects collected in this pass.
        """
        # Optional debugging aid, currently disabled: dump the raw page.
        # filename = Config.get_results_path() + "sina7x24.html"
        # with open(filename, 'wb+') as file:  # must be opened in binary mode
        #     file.write(response.body)

        items = []
        news = response.xpath("//div[@class='bd_i bd_i_og  clearfix']")
        day = time.strftime("%Y-%m-%d ", time.localtime(time.time()))  # today
        for each in news[::-1]:
            # normalize-space() trims whitespace and line breaks from the
            # extracted text.
            times = each.xpath("normalize-space(div/p/text())").extract()
            info = each.xpath("normalize-space(div[2]/div/p/text())").extract()
            # CSS alternatives:
            # time = each.css(".bd_i_time_c::text").extract()
            # info = each.css(".bd_i_txt_c::text").extract()

            date = day + times[0]
            if not self.is_first and date in self.date_list:
                continue  # already emitted in an earlier pass

            item = Sina7x24Item()
            item['date'] = date
            item['info'] = info[0]
            self.date_list.append(date)
            if not self.is_first:
                print('-' * 60)
            print(date + info[0])
            items.append(item)
        self.is_first = False
        # print('长度：%s' % len(items))
        return items

    def closed(self, spider):
        """Log the shutdown and terminate the browser session.

        ``quit()`` ends the whole WebDriver session; ``close()`` would only
        close the current window and can leave the driver process running.
        """
        self.log.info("Sina7x24SeleniumSpider_closed")
        self.driver.quit()

コード例 #14

0

ファイルを表示

"""中间件，可以添加下载中过程一些配置，需要在设置文件setting中配置还起作用"""
import random
import time

from scrapy.http import HtmlResponse
from selenium.common.exceptions import TimeoutException

from ScrapyStudy.public.Log import Log
from ScrapyStudy.user_agents import agents

# Module-level logger; used by UserAgentMiddleware.process_request below.
log = Log().get_logger()


class UserAgentMiddleware(object):
    """Downloader middleware intended to swap the User-Agent header.

    The header rotation itself is commented out; at the moment the
    middleware only logs the name of the spider issuing the request.
    """

    def process_request(self, request, spider):
        """Log the spider name and let the request continue unchanged."""
        spider_name = spider.name
        log.info(spider_name)
        # Rotation, currently disabled: pick a random User-Agent.
        # agent = random.choice(agents)
        # request.headers["User-Agent"] = agent
        return None


# class CookiesMiddleware(object):
#     """ 换Cookie """
#
#     def process_request(self, request, spider):
#         cookie = random.choice(cookies)
#         request.cookies = cookie


class SeleniumMiddleware(object):
    """ 抓取js动态生成代码中间件 Selenium"""

コード例 #15

0

ファイルを表示

ファイル: Sina7x24SeleniumSpider.py プロジェクト: pgsheng/ScrapyStudy

 def __init__(self, *args, **kwargs):
     """Start the browser, reset the de-duplication state, init the spider.

     Positional and keyword arguments are forwarded to the parent
     initialiser so that spider arguments (for example those passed with
     ``-a NAME=VALUE``) do not raise ``TypeError``.
     """
     self.log = Log().get_logger()
     self.driver_firefox()
     self.date_list = []  # timestamps of the items already emitted
     self.is_first = True  # True until the first page has been parsed
     super(Sina7x24SeleniumSpider, self).__init__(*args, **kwargs)

コード例 #16

0

ファイルを表示

ファイル: StartUp.py プロジェクト: pgsheng/ScrapyStudy

from scrapy import cmdline

from ScrapyStudy.public.Log import Log

# Project logger; used here only to mark the start of the run.
log = Log().get_logger()

log.info('{0}{1}{0}'.format(20 * '-', '开始'))

# --- Run a single spider ---
# cmdline.execute() is given an argv-style list (the original built it with
# str.split(), which must not be dropped when a command string is used).
# cmdline.execute("scrapy crawl BeautyImageSpider".split())
cmdline.execute(['scrapy', 'crawl', 'Sina7x24SeleniumSpider'])
# cmdline.execute("scrapy crawl Sina7x24SplashSpider".split())
# cmdline.execute("scrapy crawl itcast".split())
# cmdline.execute("scrapy crawl jd".split())
# "-o" writes the items to a file in the format given by its extension:
# cmdline.execute("scrapy crawl itcast -o results//teachers0.json".split())
# cmdline.execute("scrapy crawl itcast -o results//teachers0.csv".split())
# cmdline.execute("scrapy crawl itcast -o results//teachers0.xml".split())

# --- Run several spiders at once ---
# cmdline.execute("scrapy crawlall".split())

コード例 #17

0

ファイルを表示

class Sina7x24SplashSpider(scrapy.Spider):
    """Spider that fetches the Sina finance 7x24 page through Splash.

    Items whose timestamp was already seen in an earlier pass are skipped.
    """

    name = "Sina7x24SplashSpider"
    allowed_domains = ["finance.sina.com.cn"]
    # start_urls = [
    #     'http://finance.sina.com.cn/7x24/'
    # ]
    custom_settings = {
        'ITEM_PIPELINES': {
            'ScrapyStudy.pipelines.Sina7x24Pipeline': 300,
        },
        'CONCURRENT_REQUESTS': 1,
        # Address of the Splash service (local or remote).
        'SPLASH_URL': 'http://192.168.99.100:8050/',
        "DOWNLOADER_MIDDLEWARES": {
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
        },
        'SPIDER_MIDDLEWARES': {
            'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
        },
        # Duplicate-filter class.
        'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
        # HTTP cache storage.
        'HTTPCACHE_STORAGE': 'scrapy_splash.SplashAwareFSCacheStorage',
    }

    def __init__(self, *args, **kwargs):
        """Reset the de-duplication state and initialise the spider.

        Positional and keyword arguments are forwarded to
        ``scrapy.Spider.__init__`` so that spider arguments (for example
        those passed with ``-a NAME=VALUE``) do not raise ``TypeError``.
        """
        self.log = Log().get_logger()
        self.date_list = []  # timestamps of the items already emitted
        self.is_first = True  # True until the first page has been parsed
        super(Sina7x24SplashSpider, self).__init__(*args, **kwargs)

    def start_requests(self):
        """Yield a single ``SplashRequest`` for the 7x24 page."""
        urls = 'http://finance.sina.com.cn/7x24/'
        yield SplashRequest(url=urls, callback=self.parse)

    def parse(self, response):
        """Save the rendered page and extract its news entries, oldest first.

        The response body is written to ``sina7x24.html`` under
        ``Config.get_results_path()``. On the first pass every entry is
        returned. On later passes entries whose timestamp is already in
        ``self.date_list`` are skipped and each new entry is printed after a
        separator line.

        Returns:
            list: the ``Sina7x24Item`` objects collected in this pass.
        """
        # Keep a copy of the page; the body is bytes, hence binary mode.
        filename = Config.get_results_path() + "sina7x24.html"
        with open(filename, 'wb+') as file:
            file.write(response.body)

        items = []
        news = response.xpath("//div[@class='bd_i bd_i_og  clearfix']")
        day = time.strftime("%Y-%m-%d ", time.localtime(time.time()))  # today
        for each in news[::-1]:
            # normalize-space() trims whitespace and line breaks from the
            # extracted text.
            times = each.xpath("normalize-space(div/p/text())").extract()
            info = each.xpath("normalize-space(div[2]/div/p/text())").extract()
            # CSS alternatives:
            # time = each.css(".bd_i_time_c::text").extract()
            # info = each.css(".bd_i_txt_c::text").extract()

            date = day + times[0]
            if not self.is_first and date in self.date_list:
                continue  # already emitted in an earlier pass

            item = Sina7x24Item()
            item['date'] = date
            item['info'] = info[0]
            self.date_list.append(date)
            if not self.is_first:
                print('-' * 60)
            print(date + info[0])
            items.append(item)
        self.is_first = False
        # print('长度：%s' % len(items))
        return items

    def closed(self, spider):
        """Log that the spider has finished."""
        self.log.info("Sina7x24SplashSpider_closed")

コード例 #18

0

ファイルを表示

 def __init__(self, *args, **kwargs):
     """Reset the de-duplication state and initialise the spider.

     Positional and keyword arguments are forwarded to the parent
     initialiser so that spider arguments (for example those passed with
     ``-a NAME=VALUE``) do not raise ``TypeError``.
     """
     self.log = Log().get_logger()
     self.date_list = []  # timestamps of the items already emitted
     self.is_first = True  # True until the first page has been parsed
     super(Sina7x24SplashSpider, self).__init__(*args, **kwargs)

コード例 #19

0

ファイルを表示

ファイル: itcastSpider.py プロジェクト: pgsheng/ScrapyStudy

 def __init__(self, *args, **kwargs):
     """Attach the project logger and initialise the spider.

     Positional and keyword arguments are forwarded to the parent
     initialiser so that spider arguments (for example those passed with
     ``-a NAME=VALUE``) do not raise ``TypeError``.
     """
     self.log = Log().get_logger()
     super(ItcastSpider, self).__init__(*args, **kwargs)

コード例 #20

0

ファイルを表示

ファイル: BeautyImageSpider.py プロジェクト: pgsheng/ScrapyStudy

 def __init__(self, *args, **kwargs):
     """Attach the project logger and initialise the spider.

     Positional and keyword arguments are forwarded to the parent
     initialiser so that spider arguments (for example those passed with
     ``-a NAME=VALUE``) do not raise ``TypeError``.
     """
     self.log = Log().get_logger()
     super(BeautyImageSpider, self).__init__(*args, **kwargs)

コード例 #21

0

ファイルを表示

ファイル: MongoDBStudy.py プロジェクト: pgsheng/ScrapyStudy

class MongoDBStudy(object):
    """Walk-through of basic PyMongo operations on ``sinadb`` / ``7x24``.

    Each ``db_*`` method demonstrates one kind of operation (insert, find,
    update, sort, delete) and logs what it did.
    """

    def __init__(self):
        """Connect to MongoDB, select ``sinadb`` / ``7x24`` and log whether they exist."""
        self.log = Log().get_logger()

        # 1. Connect. MongoClient normally takes the host as first argument
        #    and the port as second (27017 when omitted); a connection URI
        #    such as the one below achieves the same.
        self.client = pymongo.MongoClient(
            'mongodb://*****:*****@localhost:27017/')

        # 2. Select the database. A MongoDB database (and likewise a
        #    collection) only really comes into existence once a document
        #    has been inserted into it. Attribute access and item access
        #    (self.client['testdb']) are equivalent.
        self.db = self.client.sinadb
        # self.db = self.client.scrapydb

        # 2.2 List all database names and report whether ours is present.
        db_exists = "sinadb" in self.client.list_database_names()
        self.log.info("数据库已存在！" if db_exists else "数据库不存在！")

        # 3. Select the collection (roughly a table in a relational
        #    database). As with databases, both access styles work.
        # self.collection = self.db.teachers
        # self.collection = self.db['students']
        self.collection = self.db['7x24']

        # Report whether the "7x24" collection is present.
        collection_exists = "7x24" in self.db.list_collection_names()
        self.log.info("集合已存在！" if collection_exists else "集合不存在！")

    def _log_all_documents(self):
        """Log every document currently in the collection."""
        for document in self.collection.find():
            self.log.info(document)

    def db_insert(self):
        """Insert one document, then two more in a batch, logging the new ids.

        Documents are plain dicts. When no ``_id`` is supplied, MongoDB
        generates an ObjectId for it.
        """
        single = {
            "name": "Google",
            "alexa": "1",
            "url": "https://www.google.com"
        }
        single_result = self.collection.insert_one(single)
        self.log.info(single_result.inserted_id)

        batch = [
            {
                "name": "QQ",
                "alexa": "101",
                "url": "https://www.qq.com"
            },
            {
                "name": "Facebook",
                "alexa": "10",
                "url": "https://www.facebook.com"
            },
        ]
        batch_result = self.collection.insert_many(batch)
        # The ids of all inserted documents, in order.
        self.log.info(batch_result.inserted_ids)

        # Inserting with explicit ids works the same way:
        # students = [
        #     {"_id": 1, "name": "知乎", "alexa": "103", "url": "https://www.zhihu.com"},
        #     {"_id": 2, "name": "Github", "alexa": "109", "url": "https://www.github.com"}
        # ]
        # results = self.collection.insert_many(students)
        # self.log.info(results.inserted_ids)

    def db_find(self):
        """Log the document count, then each document's ``date`` and ``info``.

        A projection lists the fields to return (1) or to hide (0). Apart
        from ``_id``, 0 and 1 cannot be mixed in one projection; for example
        ``find({}, {"name": 0, "alexa": 1})`` raises.
        """
        projection = {"_id": 0, "date": 1, "info": 1}
        documents = list(self.collection.find({}, projection))
        self.log.info("数据总量：%s" % len(documents))
        for document in documents:
            self.log.info(document)

        # Further query styles, kept for reference:
        # self.collection.find_one()                          # first document
        # self.collection.find({}, {"_id": 0, "name": 1, "alexa": 1})
        # self.collection.find({"name": "Facebook"})          # exact match
        # self.collection.find({"name": {"$gt": "H"}})        # name sorts after "H"
        # self.collection.find({"name": {"$regex": "^F"}})    # name starts with "F"
        # self.collection.find().limit(2)                     # at most two documents

    def db_update(self):
        """Change ``alexa`` from "111" to "112" everywhere, logging before and after."""
        self._log_all_documents()

        # update_one would only change the first matching document:
        # myquery = {"alexa": "103"}
        # newvalues = {"$set": {"alexa": "111"}}
        # result = self.collection.update_one(myquery, newvalues)
        # self.log.info('修改数据结果：%s' % result.modified_count)

        # update_many changes every matching document.
        selector = {"alexa": "111"}
        change = {"$set": {"alexa": "112"}}
        outcome = self.collection.update_many(selector, change)
        self.log.info('修改数据结果：%s' % outcome.modified_count)

        self.log.info('-' * 20)

        self._log_all_documents()

    def db_sort(self):
        """Log every document ordered by ``alexa``, descending."""
        # Ascending order would be: self.collection.find().sort("alexa")
        descending = self.collection.find().sort("alexa", -1)
        for document in descending:
            self.log.info(document)

    def db_delete(self):
        """Delete documents whose name starts with "G", then drop ``7x24``.

        Afterwards whatever ``find()`` still returns is logged.
        """
        # delete_one would only remove the first match:
        # myquery = {"name": "Taobao"}
        # self.collection.delete_one(myquery)

        selector = {"name": {"$regex": "^G"}}
        outcome = self.collection.delete_many(selector)
        self.log.info('删除结果：%s' % outcome.deleted_count)

        self.db.drop_collection("7x24")  # remove the whole collection

        self._log_all_documents()