class SQLPipeline(object):
    """Scrapy item pipeline that stores scraped people and their tools.

    A single SQLAlchemy session is opened when the spider starts, reused for
    every item, and closed when the spider finishes.
    """

    def open_spider(self, spider):
        """Create a SQLAlchemy session.
        Note: this gets called implicitly by scrapy.
        """
        self.session = Session()

    def close_spider(self, spider):
        """Close the SQLAlchemy session.
        Note: this gets called implicitly by scrapy.
        """
        self.session.close()

    def process_item(self, item, spider):
        """Add the PersonItem component and the list of ToolItem components to
        the database. Return the input item.
        If a database constraint is violated, the transaction is rolled back,
        a warning is logged and the item is still returned.

        Arguments:
            - item: dictionary {'person': PersonItem component,
                                'tools': list of ToolItem components}
            - spider: a spider instance (see scrapy docs)

        Returns:
            - item: dictionary {'person': PersonItem component,
                                'tools': list of ToolItem components}

        Raises:
            - any non-IntegrityError exception raised by the commit; the
              transaction is rolled back before the exception propagates.

        Note: the "rollback" behavior is currently untestable by SQLAlchemy.
        """
        person = Person(**item["person"])

        for tool_item in item["tools"]:
            person.tools.append(Tool(**tool_item))

        self.session.add(person)
        try:
            self.session.commit()
        except IntegrityError:
            # A violated constraint is treated as "this person was already
            # stored": report it and discard the pending insert.
            logger.warning('"%s" is already in database.', person.name)
            self.session.rollback()
        except Exception:
            # Without a rollback the shared session stays in a failed
            # transaction and every later item would fail as well.
            self.session.rollback()
            raise
        else:
            # Progress indicator: one dot per successfully stored item.
            sys.stderr.write(".")

        return item
 def open_spider(self, spider):
     """Open a new SQLAlchemy session and keep it on the pipeline.

     Note: scrapy invokes this hook implicitly.
     """
     new_session = Session()
     self.session = new_session
 def setup_example(self):
     """Prepare an in-memory SQLite database for one example.

     Stores the open connection, the transaction begun on it and a session
     bound to that connection on ``self``, then creates every table
     registered on ``Base.metadata``.
     """
     memory_engine = create_engine('sqlite:///:memory:')
     conn = memory_engine.connect()
     self.connection = conn
     self.transaction = conn.begin()
     self.session = Session(bind=conn)
     Base.metadata.create_all(memory_engine)
Exemple #4
0
def init_models(db_path, enable_test_mode=False):
    """Set up the SQLite database at `db_path` and bind `Session` to it.

    Arguments:
        - db_path: path appended to the 'sqlite:///' URL prefix
        - enable_test_mode: forwarded to `create_engine()` as its `echo`
          argument

    Creates every table registered on `Base.metadata` and configures
    `Session` to use the new engine. Returns nothing.
    """
    database_url = 'sqlite:///' + db_path
    db_engine = create_engine(database_url, echo=enable_test_mode)
    Base.metadata.create_all(db_engine)
    Session.configure(bind=db_engine)
class SQLPipelineTestCase(unittest.TestCase):
    """Property-based tests for `SQLPipeline.process_item()`.

    Every generated example gets its own in-memory SQLite database (see
    `setup_example()`), and the surrounding transaction is rolled back
    afterwards (see `teardown_example()`).
    """

    # Person attributes compared between the stored and the expected model,
    # in the order they are checked.
    _PERSON_FIELDS = (
        'id',
        'name',
        'pub_date',
        'title',
        'img_src',
        'article_url',
        'bio',
        'hardware',
        'software',
        'dream',
    )

    # Strategy generating a validated pipeline item:
    # {'person': PersonItem, 'tools': list of ToolItem}.
    # Defined once so both tests are guaranteed to use identical input.
    _item_strategy = st.fixed_dictionaries(
        dict(
            person=st.builds(
                PersonItem,
                st.fixed_dictionaries(
                    dict(
                        name=st.text(min_size=1),
                        article_url=st.just('https://usesthis.com/interviews/joe.schmoe/'),
                        pub_date=st.just('2014-04-08'),
                        title=st.text(),
                        img_src=st.just('https://usesthis.com/images/portraits/joe.schmoe.jpg'),
                        bio=st.text(),
                        hardware=st.text(),
                        software=st.text(),
                        dream=st.text(),
                    )
                )
            ),
            tools=st.lists(
                st.builds(
                    ToolItem,
                    st.fixed_dictionaries(
                        dict(
                            tool_name=st.text(min_size=1),
                            tool_url=st.just('http://plumbertools.org/pipebuster5000'),
                        )
                    )
                )
            )
        )
    )

    def setup_example(self):
        """Create a fresh in-memory database, transaction and session.

        Stores the connection, the transaction begun on it and a session
        bound to that connection on `self`, then creates all tables.
        """
        engine = create_engine('sqlite:///:memory:')
        self.connection = engine.connect()
        self.transaction = self.connection.begin()
        self.session = Session(bind=self.connection)
        Base.metadata.create_all(engine)

    def teardown_example(self, _):
        """Close the session, roll back the transaction, close the connection."""
        self.session.close()
        self.transaction.rollback()
        self.connection.close()

    def _assert_same_person(self, actual, expected):
        """Assert that `actual` and `expected` agree on every Person field."""
        for field in self._PERSON_FIELDS:
            self.assertEqual(
                getattr(actual, field), getattr(expected, field), msg=field
            )

    @given(_item_strategy)
    def test_item_added_to_database(self, item):
        """Verify that when the SQLPipeline receives a validated item - and
        the database doesn't already contain the item - it gets added to the
        database.
        """
        spider = UsesthisSpider('usesthis')
        pipeline = SQLPipeline()
        pipeline.session = self.session

        # Assert that nothing is in the relevant database tables beforehand
        self.assertFalse(self.session.query(Person).all())
        self.assertFalse(self.session.query(Tool).all())
        self.assertFalse(self.session.query(people_to_tools_tbl).all())

        # The `process_item()` method should return the same item
        same_item = pipeline.process_item(item, spider)
        self.assertEqual(same_item, item)

        # Create some Person and Tool model objects for comparison
        person = Person(**item['person'])
        person.id = 1
        tools = []
        for tool_id, tool_item in enumerate(item['tools'], start=1):
            tool = Tool(**tool_item)
            tool.id = tool_id
            tools.append(tool)

        # Assert that the correct Person model was inserted
        same_person = self.session.query(Person).one()
        self._assert_same_person(same_person, person)

        # Assert that the correct Tool models were inserted. The explicit
        # length check keeps the loop from passing vacuously when rows are
        # missing; ordering by id makes the pairing deterministic.
        same_tools = self.session.query(Tool).order_by(Tool.id).all()
        self.assertEqual(len(same_tools), len(tools))
        for same_tool, tool in zip(same_tools, tools):
            self.assertEqual(same_tool.id, tool.id)
            self.assertEqual(same_tool.tool_name, tool.tool_name)
            self.assertEqual(same_tool.tool_url, tool.tool_url)

        # Assert that the correct Person-Tool relations were inserted
        relations = sorted(self.session.query(people_to_tools_tbl).all())
        self.assertEqual(len(relations), len(tools))
        for relation, tool in zip(relations, tools):
            self.assertEqual(relation.person_id, person.id)
            self.assertEqual(relation.tool_id, tool.id)

    @unittest.skip("SQLAlchemy SAVEPOINTs don't work with SQLite, so the rollback invalidates the transaction.")
    @given(_item_strategy)
    def test_dont_add_dup_person_to_database(self, item):
        """Verify that when the SQLPipeline receives a validated item - and
        the database already contains the item - it doesn't get added to the
        database.
        """
        spider = UsesthisSpider('usesthis')
        pipeline = SQLPipeline()
        pipeline.session = self.session

        # Store the person up front so the pipeline sees a duplicate
        person = Person(**item['person'])
        person.id = 1
        self.session.add(person)

        # Assert that only the pre-existing person is in the database
        # beforehand: no tools and no person-tool relations
        self.assertEqual(self.session.query(Person).all(), [person])
        self.assertFalse(self.session.query(Tool).all())
        self.assertFalse(self.session.query(people_to_tools_tbl).all())

        # The `process_item()` method should return the same item
        same_item = pipeline.process_item(item, spider)
        self.assertEqual(same_item, item)

        # Assert that the pre-existing Person is still the only one stored
        same_person = self.session.query(Person).one()
        self._assert_same_person(same_person, person)

        # Assert that no Tool models were inserted
        tools_in_db = self.session.query(Tool).all()
        self.assertFalse(tools_in_db)

        # Assert that no Person-Tool relations were inserted
        relations_in_db = self.session.query(people_to_tools_tbl).all()
        self.assertFalse(relations_in_db)