def process_item(self, item, spider):
    """Add the PersonItem component and its ToolItem components to the database.

    A ``Person`` is built from ``item["person"]``; one ``Tool`` is built from
    each entry of ``item["tools"]`` and appended to ``person.tools``. The
    person is then added to ``self.session`` and the session is committed.

    If a database constraint is violated (``IntegrityError``), a warning is
    logged, the transaction is rolled back and the item is still returned.
    Any other exception raised by the commit also rolls the transaction back
    and is then re-raised.

    Arguments:
    - item: dictionary {'person': PersonItem component,
                        'tools': list of ToolItem components}
    - spider: a spider instance (see scrapy docs); not used by this method

    Returns:
    - item: the same dictionary that was passed in

    Note: the "rollback" behavior is currently untestable by SQLAlchemy.
    """
    person = Person(**item["person"])
    for tool_item in item["tools"]:
        person.tools.append(Tool(**tool_item))

    self.session.add(person)
    try:
        self.session.commit()
    except IntegrityError:
        # Log before rolling back, while the person's attributes are still
        # the ones that were just submitted.
        logger.warning('"%s" is already in database.', person.name)
        self.session.rollback()
    except Exception:
        # Without a rollback the session stays in its failed transaction and
        # later items would fail too; roll back, then let the error surface.
        self.session.rollback()
        raise
    else:
        # Progress indicator: one dot per successfully stored person.
        sys.stderr.write(".")

    return item
def validate_tool_items(items, person_item, verbose=False):
    """Filter ``items`` in place so that only valid tool items remain.

    Each item is kept only if ``is_valid_tool(item, name, article_url,
    verbose)`` returns a truthy value. The list object passed in is modified
    directly; nothing is returned.

    A warning is logged when ``items`` is empty on entry, and a different
    warning is logged when no item survives the filtering.

    Arguments:
    - items: list of tool items to validate (mutated in place)
    - person_item: mapping providing the "name" and "article_url" keys used
      in log messages and passed on to ``is_valid_tool``
    - verbose: forwarded unchanged to ``is_valid_tool``
    """
    name = person_item["name"]
    article_url = person_item["article_url"]

    if not items:
        logger.warning("%s (%s) doesn't use any tools.", name, article_url)
        return

    # Slice assignment replaces the contents of the caller's list rather than
    # rebinding the local name, so the caller sees the filtered result.
    items[:] = [
        item for item in items
        if is_valid_tool(item, name, article_url, verbose)
    ]

    if not items:
        logger.warning("%s doesn't use any tools that have valid URLs.", name)
def validate_person_item(item, verbose=False):
    """Validate a person item, raising on fatal problems and warning on gaps.

    Checks are performed in this order, and the first failure raises
    ``ItemValidationError``:

    1. ``missing_item_fields(item)`` reports no missing fields
    2. ``item["name"]`` is non-empty
    3. ``item["article_url"]`` passes ``is_valid_url``
    4. ``item["pub_date"]`` passes ``is_valid_date``
    5. ``item["img_src"]`` passes ``is_valid_src``

    After that, an empty "bio", "hardware", "software" or "dream" value only
    produces a logged warning.

    Arguments:
    - item: mapping holding the person fields listed above
    - verbose: accepted for signature parity with the other validators;
      not used by this function

    Raises:
    - ItemValidationError: if any of the five checks above fails
    """
    missing_fields = missing_item_fields(item)
    if missing_fields:
        raise ItemValidationError(
            "PersonItem missing fields: {missing_fields}".format(
                missing_fields=missing_fields))

    name = item["name"]
    article_url = item["article_url"]

    if not name:
        err_msg = "Interview at {article_url} doesn't have a person's name".format(
            article_url=article_url)
        raise ItemValidationError(err_msg)

    if not is_valid_url(article_url):
        err_msg = "{name} ({article_url}) doesn't have a valid interview URL".format(
            name=name, article_url=article_url)
        raise ItemValidationError(err_msg)

    pub_date = item["pub_date"]
    if not is_valid_date(pub_date):
        err_msg = "{name} ({article_url}) doesn't have a publication date.".format(
            name=name, article_url=article_url)
        raise ItemValidationError(err_msg)

    img_src = item["img_src"]
    if not is_valid_src(img_src):
        err_msg = (
            "{name} ({article_url}) doesn't have a valid image source URL "
            "({img_src})."
        ).format(name=name, article_url=article_url, img_src=img_src)
        raise ItemValidationError(err_msg)

    # Optional sections: an empty value is worth a warning but is not fatal.
    # Full message templates are kept per field so the logged text is unchanged.
    optional_sections = (
        ("bio", "%s (%s) doesn't have a bio."),
        ("hardware", "%s (%s) doesn't have a hardware section."),
        ("software", "%s (%s) doesn't have a software section."),
        ("dream", "%s (%s) doesn't have a dream-setup section."),
    )
    for field, message in optional_sections:
        if not item[field]:
            logger.warning(message, name, article_url)