def _scrape_unit(self, document):
    """Parse a single article page into an ``Article`` instance.

    ``document`` is presumably an lxml HTML element (it supports
    ``cssselect``/``text_content`` -- TODO confirm against caller).
    Returns a one-tuple ``(article,)`` because the scraper framework
    expects an iterable of results.
    """
    article = Article()
    metadata = list(META)

    # We select all 'div' elements directly under '.article'
    divs = document.cssselect("* > div")

    # Check for author field. If present: remove from metadata
    # fields list
    try:
        author_field = document.cssselect(".author")[0]
    except IndexError:
        pass
    else:
        author = author_field.text_content().strip()
        # BUG FIX: the original used lstrip("Von"), which strips any
        # run of the characters 'V', 'o', 'n' from the left -- not the
        # literal prefix "Von". Remove the exact prefix instead.
        if author.startswith("Von"):
            author = author[len("Von"):].strip()
        article.author = author
        divs.remove(author_field)

    # Strip everything before headline
    headline_field = document.cssselect("b.deHeadline")[0].getparent()
    divs = divs[divs.index(headline_field):]

    # Parse metadata. Loop through each 'div' within an article, along with
    # its field name according to META (thus based on its position)
    for field_name, element in zip(metadata, divs):
        if field_name is None:
            continue
        # Fields without a dedicated processor pass through unchanged.
        processor = PROCESSORS.get(field_name, lambda x: x)
        text_content = element.text_content().strip()
        setattr(article, field_name, processor(text_content))

    # Fetch the article body: all paragraphs joined by blank lines.
    paragraphs = [p.text_content() for p in document.cssselect("p")]
    article.text = ("\n\n".join(paragraphs)).strip()

    # We must return an iterable, so we return a one-tuple
    return (article, )
def _scrape_unit(self, document):
    """Scrape one article document and yield it as an ``Article``.

    ``document`` appears to be an lxml HTML tree (``cssselect`` /
    ``text_content`` are lxml APIs -- verify at the call site).
    Returns a one-element tuple, since callers expect an iterable.
    """
    article = Article()
    metadata = list(META)

    # We select all 'div' elements directly under '.article'
    divs = document.cssselect("* > div")

    # Check for author field. If present: remove from metadata
    # fields list
    try:
        author_field = document.cssselect(".author")[0]
    except IndexError:
        pass
    else:
        author = author_field.text_content().strip()
        # BUG FIX: lstrip("Von") strips the character set {'V','o','n'}
        # from the left, not the prefix string "Von". Strip the literal
        # prefix explicitly instead.
        if author.startswith("Von"):
            author = author[len("Von"):].strip()
        article.author = author
        divs.remove(author_field)

    # Strip everything before headline
    headline_field = document.cssselect("b.deHeadline")[0].getparent()
    divs = divs[divs.index(headline_field):]

    # Parse metadata. Loop through each 'div' within an article, along with
    # its field name according to META (thus based on its position)
    for field_name, element in zip(metadata, divs):
        if field_name is None:
            continue
        # Default processor is the identity function.
        processor = PROCESSORS.get(field_name, lambda x: x)
        text_content = element.text_content().strip()
        setattr(article, field_name, processor(text_content))

    # Fetch the article text: every paragraph, blank-line separated.
    paragraphs = [p.text_content() for p in document.cssselect("p")]
    article.text = ("\n\n".join(paragraphs)).strip()

    # We must return an iterable, so we return a one-tuple
    return (article,)