Exemple #1
0
    def getarticle(self, headline, lines):
        """Build a Document for one article from its headline and text lines.

        ``lines[1]`` is searched for a ``NN-NN-NNNN`` date and, optionally,
        a page marker of the form ``(p<char><digits>...)``.  Every entry of
        ``lines[2:]`` longer than two characters becomes part of the body.

        Args:
            headline: Headline passed to ``Document(headline=...)``.
            lines: Indexable sequence of strings; index 1 holds the
                date/page line and index 2 onwards the body lines.

        Returns:
            The populated Document, with ``props.text`` and ``props.date``
            set, plus ``props.pagenr`` and ``props.medium`` when found.

        Raises:
            AttributeError: If no date is found in ``lines[1]`` (the search
                result is ``None`` and ``.group`` is called on it).
        """
        article = Document(headline = headline)

        # Each kept line is prefixed with "\n" so the replacements below see
        # the same text the line-by-line concatenation used to produce.
        text = "".join("\n" + line for line in lines[2:] if len(line) > 2)

        # Order matters: drop "-\n" first (presumably words hyphenated across
        # a line break), then collapse double spaces in a single pass, then
        # turn the remaining newlines into spaces.
        text = text.replace("-\n", "")
        text = text.replace("  ", " ")
        text = text.replace("\n", " ")

        article.props.text = text

        # Raw strings keep the backslashes as regex escapes instead of
        # relying on Python passing unknown string escapes through.
        date_pattern = re.compile(r"([0-9]{2,2})\-([0-9]{2,2})\-([0-9]{4,4})")
        result = date_pattern.search(lines[1])
        # Groups are reversed when passed on: (group 3, group 2, group 1).
        # Presumably `date` is datetime.date, i.e. (year, month, day).
        article.props.date = date(
            int(result.group(3)),
            int(result.group(2)),
            int(result.group(1)))

        # NOTE(review): the "." after "p" is unescaped, so it matches any
        # single character, not only a literal dot -- confirm this is wanted.
        pagenum_pattern = re.compile(r"\(p.([0-9]+)([0-9\-]+)?\)")
        result = pagenum_pattern.search(lines[1])
        if result:
            # Only the first run of digits is used as the page number.
            article.props.pagenr = int(result.group(1))

        # Case-insensitive substring match of the headline against each index
        # entry; the loop does not break, so the last matching entry wins.
        wanted = article.props.headline.lower().strip()
        for h, medium in self.index:
            if wanted in h.lower().strip():
                article.props.medium = self.create_medium(medium)

        return article
Exemple #2
0
 def _scrape_unit(self, status):
     """Yield a single Document built from one status object.

     The ``author``, ``text`` and ``created_at`` attributes of ``status``
     are copied to the document's ``props`` as ``author``, ``text`` and
     ``date``; the status object itself is stored as ``props.meta``.
     """
     document = Document()

     # (props field, status attribute) pairs, applied in this order.
     field_sources = (
         ("author", "author"),
         ("text", "text"),
         ("date", "created_at"),
     )
     for field, source in field_sources:
         value = getattr(status, source)
         setattr(document.props, field, value)

     document.props.meta = status
     yield document
    def _scrape_unit(self, result):
        """Wrap ``result`` in a Document and yield the units scraped from it.

        The document receives ``result.data`` as ``doc``, ``result.type`` as
        ``props.type`` and ``result.data['id']`` as ``props.fb_id``.  A
        "post" is handed to ``self.scrape_post`` and a "page" to
        ``self.scrape_page``; any other type yields nothing.

        Original author's notes on the expected properties:
          (i)page properties: bytes, page, category, any of the article props
          article properties: date, section, pagenr, headline, byline,
          length (autogenerated), url (already present), text, parent,
          medium (auto), author
        """
        kind = result.type

        document = Document()
        document.doc = result.data
        document.props.type = kind
        document.props.fb_id = document.doc['id']

        # Pick the scraper for this result type; unknown types end the
        # generator without yielding anything.
        if kind == "post":
            units = self.scrape_post(document)
        elif kind == "page":
            units = self.scrape_page(document)
        else:
            return

        for unit in units:
            yield unit
Exemple #4
0
    def _get_paper(self, paper_id):
        """Yield one index Document per page of the requested paper.

        Sends a "getPaper" remoting message to the "onlineFacade"
        destination and walks the ``spreads`` of the response.  Each spread
        may carry a ``leftPage`` and a ``rightPage``; missing ones are
        skipped.  Every yielded Document has ``props.date`` (taken from
        ``self.options['date']``), ``props.section``, ``props.pagenr``,
        ``page`` and ``doc`` (the raw page) set.
        """
        paper_date = self.options['date']

        message = self.create_message(
            messaging.RemotingMessage,
            operation="getPaper",
            body=[self.paper_id, paper_id, self.context_id],
            destination="onlineFacade")

        request = self.create_request(message)
        envelope = self.create_envelope(request)
        response = self.apiget(envelope).bodies[0][1]

        for spread in response.body.body['spreads']:
            # Both sides are fetched up front, left before right.
            sides = (spread.get('leftPage'), spread.get('rightPage'))
            for page in sides:
                if page is None:
                    continue

                number = page.get('nr')

                index = Document()
                index.props.date = paper_date
                index.props.section = page.get('section')
                index.props.pagenr = number
                index.page = number
                index.doc = page

                yield index
Exemple #5
0
    def _get_paper(self, paper_id):
        """Generate an index Document for every page found in a paper.

        A "getPaper" remoting message addressed to "onlineFacade" is wrapped
        in a request and envelope and sent through ``self.apiget``.  For each
        spread in the reply, the ``leftPage`` and ``rightPage`` entries that
        are present are turned into Documents carrying the configured date,
        the page's section and number, and the raw page as ``doc``.
        """
        date = self.options['date']

        remoting_kwargs = {
            "operation": "getPaper",
            "body": [self.paper_id, paper_id, self.context_id],
            "destination": "onlineFacade",
        }
        rmsg = self.create_message(messaging.RemotingMessage, **remoting_kwargs)

        env = self.create_envelope(self.create_request(rmsg))
        first_body = self.apiget(env).bodies[0]
        resp = first_body[1]

        spreads = resp.body.body['spreads']
        for spread in spreads:
            for side in ('leftPage', 'rightPage'):
                page = spread.get(side)
                if page is not None:
                    index = Document()
                    index.props.date = date
                    index.props.section = page.get('section')
                    # Same value goes to both attributes, props first.
                    pagenr = page.get('nr')
                    index.props.pagenr = pagenr
                    index.page = pagenr
                    index.doc = page

                    yield index