Example #1
import re
from urllib.error import URLError
from urllib.request import urlopen

import feedparser

def parse_feed(self, entry):
    'Extract list of articles from the feed.'
    articles = []
    (url, publisher, publisher_location) = entry
    try:
        c = urlopen(url)
    except URLError:
        print('Failed to fetch ' + url)
        return articles  # bail out early: c is undefined if the fetch failed
    feed = feedparser.parse(c)
    # for e in feed.entries[:1]: # read just the first entry while debugging
    for e in feed.entries:
        image_link = None
        image_type = None
        # An 'enclosure' link carries the entry's attached image, if any.
        for link in e.links:
            if link['rel'] == 'enclosure':
                image_link = link['href']
                image_type = link['type']
        article = Article(
            publisher=publisher,
            publisher_location=publisher_location,
            published_date=e.updated_parsed,
            title=e.title,
            link=e.link,
            image_link=image_link,
            image_type=image_type)
        content = self.htmlparser.parse(e.link)
        # Capture a trailing byline ("- Jane Doe, Staff Writer", up to seven
        # words) as the source, then strip it from the stored content.
        m = re.search(r'-\s*([a-zA-Z]+(,?\s+[a-zA-Z]+){0,6})$', content)
        if m:
            article.source = m.group(1)
        article.content = re.sub(r'(\\n)?\s*-\s*([a-zA-Z]+(,?\s+[a-zA-Z]+){0,6})$', '', content)
        article.store(self.db)  # put article and word frequencies into CouchDB
        articles.append(article)
    return articles
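
Example #1 assumes an Article class whose store method persists the document (the closing comment mentions CouchDB). A minimal sketch of such a container, assuming the python-couchdb package; the field list is taken from the keyword arguments above, and the word-frequency bookkeeping mentioned in the comment is omitted:

import couchdb

class Article(object):
    def __init__(self, **fields):
        # Accept the keyword arguments used in parse_feed (publisher, title, ...).
        self.__dict__.update(fields)
        self.source = None
        self.content = None

    def store(self, db):
        'Save this article as a CouchDB document.'
        doc = {k: v for k, v in self.__dict__.items() if v is not None}
        db.save(doc)

# Hypothetical setup: connect to a local CouchDB and pick the target database.
couch = couchdb.Server('http://localhost:5984/')
db = couch['articles']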
Example #2
import re

def parse_feed(self, feed):
    'Extract list of articles from the feed.'
    articles = []
    htmlparser = HtmlParser()  # project-local HTML-to-text parser
    for e in feed.entries[:1]: # read just the first entry while debugging
        article = Article(source=e.author, title=e.title, link=e.link)
        content = htmlparser.parse(e.link)
        article.content = re.sub(r' -.*$', '', content)  # strip the trailing byline
        article.save() # and associated word frequencies
        articles.append(article)
    return articles
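
Unlike Example #1, this variant expects an already-parsed feed object rather than a URL, so fetching is the caller's job. A short usage sketch, where FeedReader stands in for the unnamed enclosing class and the feed URL is illustrative:

import feedparser

feed = feedparser.parse('https://example.com/rss')  # feedparser fetches and parses in one step
reader = FeedReader()  # hypothetical enclosing class of parse_feed
articles = reader.parse_feed(feed)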
Example #3
    def process_item(self, item, spider):
        # Copy the scraped item's fields onto an elasticsearch-dsl document.
        article = Article()
        article.title = item["title"]
        article.create_date = item["create_date"]
        # Strip HTML tags (remove_tags: typically from w3lib.html) and normalize whitespace.
        article.content = remove_tags(item["content"]).strip().replace("\r\n", "").replace("\t", "")
        article.front_image_url = item["front_image_url"]
        # article.front_image_path = item["front_image_path"]
        article.praise_nums = item["praise_nums"]
        article.comment_nums = item["comment_nums"]
        article.fav_nums = item["fav_nums"]
        article.url = item["url"]
        article.tags = item["tags"]
        article.meta.id = item["url_object_id"]  # set the Elasticsearch _id (cf. Example #4)

        # Build completion-suggester input from the title and tags.
        title_suggest = self.gen_suggests(article.title, article.tags)
        article.title_suggest = title_suggest

        article.save()

        return item
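
Example #3 is a Scrapy item pipeline, so it only runs once it is registered in the project settings. A minimal sketch; the module path and class name are guesses:

# settings.py
ITEM_PIPELINES = {
    'myproject.pipelines.ElasticsearchPipeline': 300,  # lower numbers run earlier (0-1000)
}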
Example #4
    def save_to_es(self):
        # Copy this item's fields onto an elasticsearch-dsl document.
        article = Article()
        article.title = self["title"]
        article.create_date = self["create_date"]
        article.content = remove_tags(self["content"])
        article.front_image_url = self["front_image_url"]
        if "front_image_path" in self:
            article.front_image_path = self["front_image_path"]
        article.praise_nums = self["praise_nums"]
        article.fav_nums = self["fav_nums"]
        article.comment_nums = self["comment_nums"]
        article.url = self["url"]
        article.tags = self["tags"]
        article.meta.id = self["url_object_id"]  # use the URL hash as the Elasticsearch _id

        # Build completion-suggester input, weighting tags (8) above the title (7).
        article.title_suggest = gen_suggests(Article._doc_type.index, ((article.title, 7), (article.tags, 8)))

        article.save()

        redis_cli.incr("jobbole_count")  # count saved articles in Redis

        return
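
Examples #3 and #4 both presuppose an elasticsearch-dsl mapping for Article; the _doc_type attribute read above belongs to the 5.x generation of that library. A minimal sketch of a matching document class, with field names mirrored from the assignments and the index name guessed from the Redis counter key:

from elasticsearch_dsl import Completion, Date, DocType, Integer, Keyword, Text
from elasticsearch_dsl.connections import connections

connections.create_connection(hosts=['localhost'])  # assumption: local Elasticsearch node

class Article(DocType):
    # Field names mirror the attributes assigned in Examples #3 and #4.
    title = Text()
    create_date = Date()
    content = Text()
    front_image_url = Keyword()
    front_image_path = Keyword()
    praise_nums = Integer()
    comment_nums = Integer()
    fav_nums = Integer()
    url = Keyword()
    tags = Text()
    title_suggest = Completion()  # backs Elasticsearch's completion suggester

    class Meta:
        index = 'jobbole'  # assumption: inferred from the "jobbole_count" Redis key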