def __init__(self):
    model = DataBaseModel()
    engine = model.db_connect()
    model.create_drawing_table(engine)
    # session factory shared by the handlers below
    self._session = sessionmaker(bind=engine)
    self._handlers = [DrawingItemDataBaseHanlder(self._session),
                      DrawingTagItemDataBaseHanlder(self._session),
                      DrawingIntroItemDataBaseHanlder(self._session)]
def mark_crawling_task_to_done(crawl_task):
    """Mark the crawl-list row matching crawl_task as done and persist it."""
    model = DataBaseModel()
    engine = model.db_connect()
    session = sessionmaker(bind=engine)()
    for row in session.query(CrawlListTable).filter(CrawlListTable.id == crawl_task.tag_id):
        logger.info("crawl done %d %s %s %s" % (row.id, row.tag_type, row.tag_value, row.start_url))
        row.done = True
    session.commit()
    session.close()
def get_jd_drawings_to_crawl(by_search=False):
    """Return all not-yet-done crawl-list rows wrapped as DrawingsToCrawl objects."""
    model = DataBaseModel()
    engine = model.db_connect()
    session = sessionmaker(bind=engine)()
    to_crawl_list = []
    for row in session.query(CrawlListTable).filter(CrawlListTable.done == False).filter(
            CrawlListTable.by_search == by_search).all():
        to_crawl_list.append(DrawingsToCrawl(row.id, row.tag_type, row.tag_value, row.start_url))
    session.close()
    return to_crawl_list
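
# Hedged usage sketch: how the two helpers above pair up in a driver loop.
# crawl_one is a hypothetical stand-in for launching a spider run, and the
# attribute names on DrawingsToCrawl (tag_type, tag_value, start_url) are
# assumed from the constructor-argument order used above.
def _example_crawl_driver(crawl_one):
    for task in get_jd_drawings_to_crawl(by_search=False):
        crawl_one(task.start_url, task.tag_type, task.tag_value)  # assumed attrs
        mark_crawling_task_to_done(task)
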
class JdBaseSpider(scrapy.Spider):
    # allowed_domains must hold bare domains, not URLs; a scheme-prefixed
    # entry like "http://www.jd.com/" never matches in OffsiteMiddleware
    allowed_domains = ["www.jd.com", "item.jd.com", "list.jd.com", "d.3.cn", "search.jd.com"]

    def __init__(self, *args, **kwargs):
        super(JdBaseSpider, self).__init__(*args, **kwargs)
        self._model = DataBaseModel()
        self._engine = self._model.db_connect()
        self._session = sessionmaker(bind=self._engine)()

    def _get_drawing_by_unique_key(self, key):
        # first() already returns None when nothing matches, so the extra
        # count() round trip is unnecessary
        return self._session.query(DrawingTable).filter(DrawingTable.isbn == key).first()

    def _is_empty(self, intro):
        # treat None, empty string, and the literal 'n/a' as missing
        return not intro or intro == 'n/a'

    def _is_content_or_author_intro_empty(self, drawing):
        if not drawing:
            return True
        return self._is_empty(drawing.content_intro) or self._is_empty(drawing.author_intro)

    def _handle_drawing_page(self, response):
        item = DrawingItemBuilder(response).build()

        # DRAWING_UNIQUE_KEY absent
        if not item.get(DRAWING_UNIQUE_KEY):
            logger.warning("drop the item found in %s since no %s found" % (response.url, DRAWING_UNIQUE_KEY))
            return

        drawing = self._get_drawing_by_unique_key(item[DRAWING_UNIQUE_KEY])
        # drawing has not been in DataBase
        if self._create_new_drawing_if_not_exist and not drawing:
            yield item
            # unique tags for a single DRAWING_UNIQUE_KEY; item.get() is used
            # because unset scrapy Item fields raise KeyError on [] access
            if item.get('press'):
                yield DrawingTagItemBuilder(item[DRAWING_UNIQUE_KEY], u"出版社", item['press']).build()
            if item.get('author'):
                yield DrawingTagItemBuilder(item[DRAWING_UNIQUE_KEY], u"作者/绘者/译者", item['author']).build()
            if item.get('drawer'):
                yield DrawingTagItemBuilder(item[DRAWING_UNIQUE_KEY], u"作者/绘者/译者", item['drawer']).build()
            if item.get('translator'):
                yield DrawingTagItemBuilder(item[DRAWING_UNIQUE_KEY], u"作者/绘者/译者", item['translator']).build()

        # add one more tag
        if self._tag_type and self._tag_value:
            yield DrawingTagItemBuilder(item[DRAWING_UNIQUE_KEY], self._tag_type, self._tag_value).build()

        # crawl for content introduction and/or author introduction
        if self._is_content_or_author_intro_empty(drawing) and item.get('product_code'):
            drawing_detail_page = u"http://d.3.cn/desc/" + item['product_code'] + u"?cdn=1&callback=showdesc"
            yield scrapy.Request(drawing_detail_page,
                                 callback=lambda rsp, key=item[DRAWING_UNIQUE_KEY]: self._handle_drawing_detail_page(
                                     rsp, key))

    def _handle_drawing_detail_page(self, response, key):
        yield DrawingIntroItemBuilder(response, key).build()
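
# Note on the callback above: passing key as a default argument
# (key=item[DRAWING_UNIQUE_KEY]) binds the value when the lambda is created,
# not when Scrapy later invokes it. A minimal self-contained illustration of
# early vs. late binding (demo only, not part of the spider):
def _binding_demo():
    late = [lambda: i for i in range(3)]       # all three share the final i
    early = [lambda i=i: i for i in range(3)]  # each binds its own i now
    assert [f() for f in late] == [2, 2, 2]
    assert [f() for f in early] == [0, 1, 2]
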
class DuplicateInDataBasePipeline(object):
    def __init__(self):
        self._model = DataBaseModel()
        self._engine = self._model.db_connect()
        self._session = sessionmaker(bind=self._engine)

    def _exist_in_db(self, item):
        session = self._session()
        try:
            return session.query(DrawingTable).filter(
                DrawingTable.isbn == item[DRAWING_UNIQUE_KEY]).count() > 0
        finally:
            # close the per-call session so connections are not leaked
            session.close()

    def process_item(self, item, spider):
        if isinstance(item, DrawingItem):
            if self._exist_in_db(item):
                raise DropItem("Duplicate item found in DB: %s" % item)
        return item
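
# To activate the de-duplication pipeline, Scrapy expects an ITEM_PIPELINES
# entry in the project's settings.py; the module path below is an assumed
# location for this project, not confirmed by the source:
#
#     ITEM_PIPELINES = {
#         'drawing.pipelines.DuplicateInDataBasePipeline': 300,
#     }
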
# -*- coding: utf-8 -*-

import logging
import time
import requests
import string
from drawing.util.log import setup_logger
from drawing.models.tables.drawing import DrawingTable
from drawing.models.model import DataBaseModel
from sqlalchemy.orm import sessionmaker

logger = setup_logger(loggername=__name__, console=logging.DEBUG)

DATABASE_MODEL = DataBaseModel()
DATABASE_ENGINE = DATABASE_MODEL.db_connect()


class CrawlListGenerator(object):
    def _valid_isbn(self, isbn):
        # accept ISBN-10 and ISBN-13 only
        return len(isbn) in (10, 13)

    def _book_to_crawl(self, row):
        return self._valid_isbn(row.isbn) and (not row.tried_in_douban) and row.single_book

    def generate(self):
        """Return the ISBNs of single books not yet looked up on Douban."""
        session = sessionmaker(bind=DATABASE_ENGINE)()
        try:
            return [row.isbn for row
                    in session.query(DrawingTable.isbn, DrawingTable.tried_in_douban,
                                     DrawingTable.single_book).all()
                    if self._book_to_crawl(row)]
        finally:
            session.close()
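
# Hedged usage sketch: feed the generated ISBN list to a lookup routine.
# fetch_douban_info is a hypothetical callee (the requests import above hints
# at HTTP lookups, but the real call site is not shown in this module).
def _example_generate_usage(fetch_douban_info, delay_seconds=1.0):
    for isbn in CrawlListGenerator().generate():
        fetch_douban_info(isbn)    # hypothetical callee
        time.sleep(delay_seconds)  # simple throttle between lookups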