def getarticle(self, headline, lines):
    """Build a Document for one article from its headline and raw text lines.

    ``lines[1]`` is searched for the publication date and an optional page
    reference; ``lines[2:]`` supply the body text.

    Args:
        headline: Headline passed to ``Document(headline=...)``.
        lines: Sequence of strings. Index 1 must contain a date written as
            ``DD-MM-YYYY``; lines from index 2 onward form the body.

    Returns:
        The populated Document, with ``props.text`` and ``props.date`` set,
        ``props.pagenr`` set when a page reference is found, and
        ``props.medium`` set when the headline matches an entry of
        ``self.index``.

    Raises:
        IndexError: If ``lines`` has fewer than two elements.
        AttributeError: If ``lines[1]`` contains no ``DD-MM-YYYY`` date
            (the failed regex search returns ``None``).
        ValueError: If the matched digits do not form a valid calendar date.
    """
    article = Document(headline=headline)

    # Keep only body lines longer than two characters, each preceded by a
    # newline so the replacements below can see the original line breaks.
    body = "".join("\n" + line for line in lines[2:] if len(line) > 2)
    # Re-join words that were hyphenated across a line break.
    body = body.replace("-\n", "")
    # NOTE(review): as written this swaps a single space for a single space,
    # which is a no-op; it presumably was meant to collapse double spaces --
    # confirm against the original formatting before changing it.
    body = body.replace(" ", " ")
    body = body.replace("\n", " ")
    article.props.text = body

    # Raw strings: "\-", "\(" and "\)" are not valid escapes in a plain
    # string literal and trigger warnings on modern Python versions.
    date_pattern = re.compile(r"([0-9]{2,2})\-([0-9]{2,2})\-([0-9]{4,4})")
    date_match = date_pattern.search(lines[1])
    # Groups are day, month, year; date() wants year, month, day.
    article.props.date = date(
        int(date_match.group(3)),
        int(date_match.group(2)),
        int(date_match.group(1)),
    )

    # Page reference such as "(p.12)" or "(p.12-13)"; only the first run of
    # digits is used. The character after "p" is matched by an unescaped ".",
    # so any single character is accepted there.
    pagenum_pattern = re.compile(r"\(p.([0-9]+)([0-9\-]+)?\)")
    page_match = pagenum_pattern.search(lines[1])
    if page_match:
        article.props.pagenr = int(page_match.group(1))

    # No break: when several index headlines contain this headline, the
    # medium of the last matching entry wins.
    for indexed_headline, medium in self.index:
        if article.props.headline.lower().strip() in indexed_headline.lower().strip():
            article.props.medium = self.create_medium(medium)

    return article
def _scrape_unit(self, status):
    """Yield a single Document built from ``status``.

    Copies ``status.author``, ``status.text`` and ``status.created_at`` into
    the document's ``author``, ``text`` and ``date`` properties, and stores
    the whole ``status`` object under ``meta``.
    """
    document = Document()
    # (document property, status attribute) pairs, copied in this order.
    field_map = (
        ("author", "author"),
        ("text", "text"),
        ("date", "created_at"),
    )
    for prop_name, status_attr in field_map:
        setattr(document.props, prop_name, getattr(status, status_attr))
    document.props.meta = status
    yield document
def _scrape_unit(self, result):
    """Yield the documents scraped for one ``result``.

    Wraps ``result.data`` in a Document, records ``result.type`` and the
    data's ``'id'`` (as ``fb_id``) on it, then delegates to ``scrape_post``
    or ``scrape_page`` depending on the type. Any other type yields nothing.

    Property overview carried over from the original notes:
      (i)page properties:
        bytes, page, category, any of the article props
      article properties:
        date, section, pagenr, headline, byline, length (autogenerated),
        url (already present), text, parent, medium (auto), author
    """
    wrapper = Document()
    wrapper.doc = result.data
    wrapper.props.type = result.type
    wrapper.props.fb_id = wrapper.doc['id']

    kind = result.type
    if kind == "post":
        units = self.scrape_post(wrapper)
    elif kind == "page":
        units = self.scrape_page(wrapper)
    else:
        # Unknown result types produce no documents.
        return

    for unit in units:
        yield unit
def _get_paper(self, paper_id):
    """Yield one index Document per page of the requested paper.

    Sends a ``getPaper`` remoting message to the ``onlineFacade``
    destination and walks the ``'spreads'`` of the response. For every
    spread, the ``'leftPage'`` and ``'rightPage'`` entries that are not
    ``None`` each produce a Document carrying the configured date, the
    page's section and number, and the raw page data in ``doc``.
    """
    paper_date = self.options['date']

    message = self.create_message(
        messaging.RemotingMessage,
        operation="getPaper",
        body=[self.paper_id, paper_id, self.context_id],
        destination="onlineFacade",
    )
    request = self.create_request(message)
    envelope = self.create_envelope(request)
    response = self.apiget(envelope).bodies[0][1]

    for spread in response.body.body['spreads']:
        # Look up both sides before yielding anything for this spread.
        sides = (spread.get('leftPage'), spread.get('rightPage'))
        for page in sides:
            if page is not None:
                page_doc = Document()
                page_doc.props.date = paper_date
                page_doc.props.section = page.get('section')
                # The page number is stored both as a property and directly
                # on the document.
                number = page.get('nr')
                page_doc.props.pagenr = number
                page_doc.page = number
                page_doc.doc = page
                yield page_doc
def _get_paper(self, paper_id):
    """Generate an index Document for every page in a paper.

    A ``getPaper`` operation is requested from the ``onlineFacade``
    destination; the ``'spreads'`` list in the reply is then traversed.
    Each spread may hold a ``'leftPage'`` and a ``'rightPage'``; missing
    (``None``) sides are skipped. Every remaining page yields a Document
    with ``props.date``, ``props.section``, ``props.pagenr``, ``page`` and
    ``doc`` filled in.
    """
    issue_date = self.options['date']

    remoting_msg = self.create_message(
        messaging.RemotingMessage,
        operation="getPaper",
        body=[self.paper_id, paper_id, self.context_id],
        destination="onlineFacade",
    )
    envelope = self.create_envelope(self.create_request(remoting_msg))
    reply = self.apiget(envelope).bodies[0][1]
    spreads = reply.body.body['spreads']

    for spread in spreads:
        # Both sides are fetched up front, then filtered.
        left_and_right = [spread.get('leftPage'), spread.get('rightPage')]
        for page in left_and_right:
            if page is None:
                continue

            entry = Document()
            entry.props.date = issue_date
            entry.props.section = page.get('section')
            page_number = page.get('nr')
            # Same value goes to the property and to the attribute.
            entry.props.pagenr = page_number
            entry.page = page_number
            entry.doc = page
            yield entry