class SQLPipeline(object):
    """Scrapy item pipeline that stores scraped items through SQLAlchemy.

    Each item is turned into one ``Person`` row plus its related ``Tool``
    rows and committed in its own transaction.
    """

    def open_spider(self, spider):
        """Create a SQLAlchemy session.

        Note: this gets called implicitly by scrapy.
        """
        self.session = Session()

    def close_spider(self, spider):
        """Close the SQLAlchemy session.

        Note: this gets called implicitly by scrapy.
        """
        self.session.close()

    def process_item(self, item, spider):
        """Add the person and its tools to the database and return the item.

        If a database constraint is violated, the transaction is rolled
        back and a warning is logged; the item is still returned.

        Arguments:
        - item: dictionary
          {'person': PersonItem component,
           'tools': list of ToolItem components}
        - spider: a spider instance (see scrapy docs); not used here

        Returns:
        - item: the very same dictionary that was passed in

        Note: the "rollback" behavior is currently not covered by the
        test-suite (the corresponding test is skipped).
        """
        person = Person(**item["person"])
        for tool_item in item["tools"]:
            person.tools.append(Tool(**tool_item))
        self.session.add(person)

        # Keep the ``try`` body down to the one call expected to raise.
        try:
            self.session.commit()
        except IntegrityError:
            # ``Logger.warn`` is a deprecated alias; ``warning`` is the
            # supported spelling.
            logger.warning('"%s" is already in database.', person.name)
            self.session.rollback()
        else:
            # One dot per stored item as a lightweight progress indicator.
            sys.stderr.write(".")
        return item
def open_spider(self, spider):
    """Open a fresh SQLAlchemy session and keep it on the pipeline.

    Scrapy invokes this hook on its own; it is not meant to be called
    directly. The ``spider`` argument is accepted but not used.
    """
    self.session = Session()
def setup_example(self):
    """Build an in-memory SQLite database and a session bound to it.

    A connection is opened and a transaction started on it before the
    session is created, and both are stored on ``self`` alongside the
    session. The tables declared on ``Base`` are created last.
    """
    # Debugging aid: pass ``echo=True`` to ``create_engine`` to log the SQL.
    memory_engine = create_engine('sqlite:///:memory:')
    conn = memory_engine.connect()
    self.connection = conn
    self.transaction = conn.begin()
    self.session = Session(bind=conn)
    Base.metadata.create_all(memory_engine)
def init_models(db_path, enable_test_mode=False):
    """Create the tables in a SQLite file and bind ``Session`` to it.

    Arguments:
    - db_path: string appended to ``sqlite:///`` to form the database URL
    - enable_test_mode: forwarded to the engine as its ``echo`` flag

    Returns nothing; the module-level ``Session`` is configured in place.
    """
    db_url = 'sqlite:///' + db_path
    db_engine = create_engine(db_url, echo=enable_test_mode)
    Base.metadata.create_all(db_engine)
    Session.configure(bind=db_engine)
class SQLPipelineTestCase(unittest.TestCase):
    """Property-based tests for ``SQLPipeline.process_item``.

    Every generated example runs against a fresh in-memory SQLite database
    wrapped in a transaction that is rolled back afterwards.
    """

    # Strategy shared by all tests: a dictionary holding one PersonItem and
    # a (possibly empty) list of ToolItems. Defined once so the tests cannot
    # drift apart.
    _ITEM_STRATEGY = st.fixed_dictionaries(
        dict(
            person=st.builds(
                PersonItem,
                st.fixed_dictionaries(
                    dict(
                        name=st.text(min_size=1),
                        article_url=st.just('https://usesthis.com/interviews/joe.schmoe/'),
                        pub_date=st.just('2014-04-08'),
                        title=st.text(),
                        img_src=st.just('https://usesthis.com/images/portraits/joe.schmoe.jpg'),
                        bio=st.text(),
                        hardware=st.text(),
                        software=st.text(),
                        dream=st.text(),
                    )
                )
            ),
            tools=st.lists(
                st.builds(
                    ToolItem,
                    st.fixed_dictionaries(
                        dict(
                            tool_name=st.text(min_size=1),
                            tool_url=st.just('http://plumbertools.org/pipebuster5000'),
                        )
                    )
                )
            )
        )
    )

    def setup_example(self):
        """Hypothesis hook: create a clean database before each example."""
        # Debugging aid: pass ``echo=True`` to ``create_engine`` to log SQL.
        engine = create_engine('sqlite:///:memory:')
        self.connection = engine.connect()
        self.transaction = self.connection.begin()
        self.session = Session(bind=self.connection)
        Base.metadata.create_all(engine)

    def teardown_example(self, _):
        """Hypothesis hook: discard everything the example wrote."""
        self.session.close()
        self.transaction.rollback()
        self.connection.close()

    @given(_ITEM_STRATEGY)
    def test_item_added_to_database(self, item):
        """Verify that when the SQLPipeline receives a validated item - and
        the database doesn't already contain the item - it gets added to
        the database.
        """
        spider = UsesthisSpider('usesthis')
        pipeline = SQLPipeline()
        pipeline.session = self.session

        # Assert that nothing is in the relevant database tables beforehand
        self.assertFalse(self.session.query(Person).all())
        self.assertFalse(self.session.query(Tool).all())
        self.assertFalse(self.session.query(people_to_tools_tbl).all())

        # The `process_item()` method should return the same item
        same_item = pipeline.process_item(item, spider)
        self.assertEqual(same_item, item)

        # Create some Person and Tool model objects for comparison.
        # The ids assume rows are numbered from 1 in insertion order.
        person = Person(**item['person'])
        person.id = 1
        tools = []
        for tool_id, tool_item in enumerate(item['tools'], start=1):
            tool = Tool(**tool_item)
            tool.id = tool_id
            tools.append(tool)

        # Assert that the correct Person model was inserted
        same_person = self.session.query(Person).one()
        self.assertEqual(same_person.id, person.id)
        self.assertEqual(same_person.name, person.name)
        self.assertEqual(same_person.pub_date, person.pub_date)
        self.assertEqual(same_person.title, person.title)
        self.assertEqual(same_person.img_src, person.img_src)
        self.assertEqual(same_person.article_url, person.article_url)
        self.assertEqual(same_person.bio, person.bio)
        self.assertEqual(same_person.hardware, person.hardware)
        self.assertEqual(same_person.software, person.software)
        self.assertEqual(same_person.dream, person.dream)

        # Assert that the correct Tool models were inserted
        same_tools = self.session.query(Tool).all()
        for tool_idx, same_tool in enumerate(same_tools):
            tool = tools[tool_idx]
            self.assertEqual(same_tool.id, tool.id)
            self.assertEqual(same_tool.tool_name, tool.tool_name)
            self.assertEqual(same_tool.tool_url, tool.tool_url)

        # Assert that the correct Person-Tool relations were inserted
        relations = self.session.query(people_to_tools_tbl).all()
        for idx, relation in enumerate(sorted(relations)):
            tool = tools[idx]
            self.assertEqual(relation.person_id, person.id)
            self.assertEqual(relation.tool_id, tool.id)

    @unittest.skip(
        "SQLAlchemy SAVEPOINTs don't work with SQLite, "
        "so the rollback invalidates the transaction."
    )
    @given(_ITEM_STRATEGY)
    def test_dont_add_dup_person_to_database(self, item):
        """Verify that when the SQLPipeline receives a validated item - and
        the database already contains the item - it doesn't get added to
        the database.
        """
        spider = UsesthisSpider('usesthis')
        pipeline = SQLPipeline()
        pipeline.session = self.session

        # Seed the database with the person the pipeline is about to receive
        person = Person(**item['person'])
        person.id = 1
        self.session.add(person)

        # Assert that only the seeded Person is in the database beforehand
        self.assertEqual(self.session.query(Person).all(), [person])
        self.assertFalse(self.session.query(Tool).all())
        self.assertFalse(self.session.query(people_to_tools_tbl).all())

        # The `process_item()` method should return the same item
        same_item = pipeline.process_item(item, spider)
        self.assertEqual(same_item, item)

        # Assert that the seeded Person is still the only one, unchanged
        same_person = self.session.query(Person).one()
        self.assertEqual(same_person.id, person.id)
        self.assertEqual(same_person.name, person.name)
        self.assertEqual(same_person.pub_date, person.pub_date)
        self.assertEqual(same_person.title, person.title)
        self.assertEqual(same_person.img_src, person.img_src)
        self.assertEqual(same_person.article_url, person.article_url)
        self.assertEqual(same_person.bio, person.bio)
        self.assertEqual(same_person.hardware, person.hardware)
        self.assertEqual(same_person.software, person.software)
        self.assertEqual(same_person.dream, person.dream)

        # Assert that no Tool models were inserted
        tools_in_db = self.session.query(Tool).all()
        self.assertFalse(tools_in_db)

        # Assert that no Person-Tool relations were inserted
        relations_in_db = self.session.query(people_to_tools_tbl).all()
        self.assertFalse(relations_in_db)