class SQLPipeline(object): def open_spider(self, spider): """Create a SQLAlchemy session. Note: this gets called implicitly by scrapy. """ self.session = Session() def close_spider(self, spider): """Close the SQLAlchemy session. Note: this gets called implicitly by scrapy. """ self.session.close() def process_item(self, item, spider): """Add the PersonItem component and the list of ToolItem components to the database. Return the input item. If a database constraint is violated, rollback the transaction. Arguments: - item: dictionary {'person': PersonItem component, 'tools': list of ToolItem components} - spider: a spider instance (see scrapy docs) Returns: - item: dictionary {'person': PersonItem component, 'tools': list of ToolItem components} Note: the "rollback" behavior is currently untestable by SQLAlchemy. """ person = Person(**item["person"]) for tool_item in item["tools"]: tool = Tool(**tool_item) person.tools.append(tool) self.session.add(person) try: self.session.commit() sys.stderr.write(".") except IntegrityError: logger.warn('"%s" is already in database.', person.name) self.session.rollback() return item