def init_rule():
    """Insert the built-in crawl rules into the database."""
    engine = db_connect()
    create_news_table(engine)
    Session = sessionmaker(bind=engine)
    with session_scope(Session) as session:
        article_rule1 = ArticleRule(
            name='huxiu',
            allow_domains='huxiu.com',
            start_urls='http://www.huxiu.com/',
            next_page='',
            allow_url=r'/article/\d+/\d+\.html',
            extract_from='//div[@class="mod-info-flow"]',
            title_xpath='//div[@class="article-wrap"]/h1/text()',
            body_xpath='//div[@id="article_content"]/p//text()',
            publish_time_xpath='//span[@class="article-time"]/text()',
            source_site='虎嗅网',
            enable=1)
        article_rule2 = ArticleRule(
            name='osc',
            allow_domains='oschina.net',
            start_urls='http://www.oschina.net/',
            next_page='',
            allow_url=r'/news/\d+/',
            extract_from='//div[@id="IndustryNews"]',
            title_xpath='//h1[@class="OSCTitle"]/text()',
            publish_time_xpath='//div[@class="PubDate"]/text()',
            body_xpath='//div[starts-with(@class, "Body")]/p[position()>1]//text()',
            source_site='开源中国',
            enable=1)
        session.add(article_rule1)
        session.add(article_rule2)
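# init_rule() relies on a session_scope() helper that is not shown in this
# snippet. A minimal sketch of such a helper, assuming the standard SQLAlchemy
# commit/rollback context-manager pattern (the project's actual implementation
# may differ):
from contextlib import contextmanager

@contextmanager
def session_scope(Session):
    """Provide a transactional scope around a series of operations."""
    session = Session()
    try:
        yield session
        session.commit()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()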
def __init__(self):
    """Initializes the database connection and sessionmaker. Creates the news table."""
    engine = db_connect()
    create_news_table(engine)
    # Session is a callable session factory bound to the engine
    self.Session = sessionmaker(bind=engine)
    self.recent_links = None
    self.nowtime = datetime.datetime.now()
def __init__(self):
    engine = db_connect()
    self.Session = sessionmaker(bind=engine)
def __init__(self):
    engine = db_connect()
    create_news_table(engine)
    self.Session = sessionmaker(bind=engine)
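# The db_connect(), create_news_table() and ArticleRule names used above come
# from coolscrapy.models, which is not shown in this snippet. A minimal sketch
# of what that module might look like, assuming SQLAlchemy declarative mapping
# and a hard-coded connection string (both assumptions, not the project's
# actual code):
from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.orm import declarative_base

DB_URL = 'sqlite:///coolscrapy.db'  # assumption: the real project likely reads this from settings

Base = declarative_base()

def db_connect():
    """Create and return an engine for the configured database."""
    return create_engine(DB_URL)

def create_news_table(engine):
    """Create all mapped tables if they do not exist yet."""
    Base.metadata.create_all(engine)

class ArticleRule(Base):
    """One row per target site, describing how to crawl and extract it."""
    __tablename__ = 'article_rules'

    id = Column(Integer, primary_key=True)
    name = Column(String(30))
    allow_domains = Column(String(100))
    start_urls = Column(String(100))
    next_page = Column(String(100))
    allow_url = Column(String(100))
    extract_from = Column(String(200))
    title_xpath = Column(String(100))
    body_xpath = Column(String(200))
    publish_time_xpath = Column(String(100))
    source_site = Column(String(30))
    enable = Column(Integer)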
""" import logging from twisted.internet import reactor from scrapy.crawler import CrawlerRunner from scrapy.utils.project import get_project_settings from scrapy.utils.log import configure_logging from coolscrapy.models import db_connect, create_news_table from coolscrapy.models import ArticleRule from sqlalchemy.orm import sessionmaker from coolscrapy.spiders.article_spider import ArticleSpider if __name__ == '__main__': settings = get_project_settings() configure_logging(settings) db = db_connect() Session = sessionmaker(bind=db) session = Session() rules = session.query(ArticleRule).filter(ArticleRule.enable == 1).all() session.close() runner = CrawlerRunner(settings) for rule in rules: # spider = ArticleSpider(rule) # instantiate every spider using rule # stop reactor when spider closes # runner.signals.connect(spider_closing, signal=signals.spider_closed) runner.crawl(ArticleSpider, rule=rule) d = runner.join() d.addBoth(lambda _: reactor.stop())
""" import logging from spiders.article_spider import ArticleSpider from twisted.internet import reactor from scrapy.crawler import CrawlerRunner from scrapy.utils.project import get_project_settings from scrapy.utils.log import configure_logging from coolscrapy.models import db_connect from coolscrapy.models import ArticleRule from sqlalchemy.orm import sessionmaker if __name__ == '__main__': settings = get_project_settings() configure_logging(settings) db = db_connect() Session = sessionmaker(bind=db) session = Session() rules = session.query(ArticleRule).filter(ArticleRule.enable == 1).all() session.close() runner = CrawlerRunner(settings) for rule in rules: # spider = ArticleSpider(rule) # instantiate every spider using rule # stop reactor when spider closes # runner.signals.connect(spider_closing, signal=signals.spider_closed) runner.crawl(ArticleSpider, rule=rule) d = runner.join() d.addBoth(lambda _: reactor.stop())
def __init__(self):
    logg.info("Init ProxyDatabasePipeline")
    engine = db_connect()
    self.Session = sessionmaker(bind=engine)
    logg.info(self.Session)
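# The pipeline's process_item() is not included above. A minimal sketch of how
# such a pipeline might persist items with the Session created in __init__,
# assuming a Proxy model exists in coolscrapy.models (a hypothetical name used
# only for illustration):
def process_item(self, item, spider):
    session = self.Session()
    try:
        session.add(Proxy(**item))  # Proxy is an assumed SQLAlchemy model
        session.commit()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()
    return item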