Example #1
    def fetch_all_sources(self, opts):
        for source in self.get_sources(opts):
            print "CRAWL: Crawling \"%s\"..." % source['title']
            try:
                f = feedparser.parse(source['link'])
            except Exception, e:
                print "Exception while parsing feed: %s" % (source['link'],)
                print e
                continue

            for entry in f.entries:
                d = None
                try:
                    if hasattr(entry, 'published_parsed'):
                        d = date(entry.published_parsed[0],
                                 entry.published_parsed[1],
                                 entry.published_parsed[2])
                    else:
                        d = date(entry.updated_parsed[0],
                                 entry.updated_parsed[1],
                                 entry.updated_parsed[2])
                except Exception, e:
                    print e
                    print entry
                    print "Setting date as today; could not parse date for feed", \
                        source['link']
                    d = self.today
                if d > self.today or d < self.earliest_date: continue
                if entry.title[-6:] == '(blog)' \
                        or entry.title[-15:] == '(press release)':
                    print "Blog or press release in title. (%s) (%s)" % \
                        (entry.link, entry.title)
                    continue
                try:
                    url = urllib2.urlopen(entry.link).geturl()
                except KeyboardInterrupt:
                    print "Quitting early due to keyboard interrupt."
                    sys.exit()
                except: continue

                # attempt to skip blogs
                if re.match('^.*blog.*$', url):
                    print "'blog' in url (%s) (%s)" % \
                        (entry.link, entry.title)
                    continue
                # attempt to skip job postings
                if re.match('^.*job.*$', url):
                    print "'job' in url (%s) (%s)" % \
                        (entry.link, entry.title)
                    continue
                # skip urls we have already crawled
                if self.db.crawled(url):
                    print "Seen this url before (%s) (%s)" % \
                        (entry.link, entry.title)
                    continue
                
                title = cgi.escape(convert_to_printable(entry.title)).strip()

                # if source is Google News, extract true source from title
                if re.match(r'^.*Google News.*$', source['title']):
                    true_source = re.match(r'^.* - (.+)$', title).group(1)
                    true_source = "%s via Google News" % true_source
                    title = re.match(r'^(.*) - .+$', title).group(1)
                else: true_source = source['title']
                
                self.articles.append({'url': url, 'title': title, 'pubdate': d,
                                      'source': true_source, 'source_id': source['source_id'],
                                      'source_relevance': source['relevance']})
Example #2
    def fetch_all_sources(self, opts):
        for source in self.get_sources(opts):
            print "CRAWL: Crawling \"%s\"..." % source['title']
            try:
                f = feedparser.parse(source['link'])
            except Exception, e:
                print "Exception while parsing feed: %s" % (source['link'], )
                print e
                continue

            for entry in f.entries:
                d = None
                try:
                    if hasattr(entry, 'published_parsed'):
                        d = date(entry.published_parsed[0],
                                 entry.published_parsed[1],
                                 entry.published_parsed[2])
                    else:
                        d = date(entry.updated_parsed[0],
                                 entry.updated_parsed[1],
                                 entry.updated_parsed[2])
                except Exception, e:
                    print e
                    print entry
                    print "Setting date as today; could not parse date for feed", \
                        source['link']
                    d = self.today
                if d > self.today or d < self.earliest_date: continue
                if entry.title[-6:] == '(blog)' \
                        or entry.title[-15:] == '(press release)':
                    print "Blog or press release in title. (%s) (%s)" % \
                        (entry.link, entry.title)
                    continue
                try:
                    url = urllib2.urlopen(entry.link).geturl()
                except KeyboardInterrupt:
                    print "Quitting early due to keyboard interrupt."
                    sys.exit()
                except:
                    continue

                # attempt to skip blogs
                if re.match('^.*blog.*$', url):
                    print "'blog' in url (%s) (%s)" % \
                        (entry.link, entry.title)
                    continue
                # attempt to skip job postings
                if re.match('^.*job.*$', url):
                    print "'job' in url (%s) (%s)" % \
                        (entry.link, entry.title)
                    continue
                # skip urls we have already crawled
                if self.db.crawled(url):
                    print "Seen this url before (%s) (%s)" % \
                        (entry.link, entry.title)
                    continue

                title = cgi.escape(convert_to_printable(entry.title)).strip()

                # if source is Google News, extract true source from title
                if re.match(r'^.*Google News.*$', source['title']):
                    true_source = re.match(r'^.* - (.+)$', title).group(1)
                    true_source = "%s via Google News" % true_source
                    title = re.match(r'^(.*) - .+$', title).group(1)
                else:
                    true_source = source['title']

                self.articles.append({
                    'url': url,
                    'title': title,
                    'pubdate': d,
                    'source': true_source,
                    'source_id': source['source_id'],
                    'source_relevance': source['relevance']
                })
Example #3
    def fetch_all_articles(self):
        try:
            os.makedirs(paths['ainews.content_tmp'])
        except: pass
        f = open("%surllist.txt" % paths['ainews.content_tmp'], 'w')
        for article in self.articles:
            f.write("%s\n" % article['url'])
        f.close()

        goose_cmd = "cd %s/goose; MAVEN_OPTS=\"-Xms256m -Xmx800m\" %s exec:java -Dexec.mainClass=com.gravity.goose.FetchMany -Dexec.args=\"%s\" -q" % \
            (paths['libraries.tools'], paths['libraries.maven'], paths['ainews.content_tmp'])
        Popen(goose_cmd, shell = True).communicate()

        i = 0
        for article in self.articles:
            print "READ:  Opening", ("%s%d" % (paths['ainews.content_tmp'], i))
            f = codecs.open("%s%d" % (paths['ainews.content_tmp'], i), encoding='utf-8')
            rows = f.read().split("\n")
            f.close()
            #os.remove("%s%d" % (paths['ainews.content_tmp'], i))
            # don't move this; have to ensure the increment occurs!
            i += 1

            if self.db.crawled(article['url']):
                print "READ:  .. Ignoring; already in crawled database."
                continue

            if len(rows) < 3:
                print "FETCH: .. Ignoring; not enough lines in Goose output: URL=%s, ROWS=%s" % \
                    (article['url'], rows)
                continue

            self.db.set_crawled(article['url'])
            content = ' '.join(rows[:-2])
            content = convert_to_printable(cgi.escape(re.sub(r'\s+', ' ', content))).strip()
            content = re.sub("%s\\s*-?\\s*" % re.escape(article['title']), '', content)
            content = re.sub(r'\s*Share this\s*', '', content)
            content = re.sub(r'\s+,\s+', ', ', content)
            content = re.sub(r'\s+\.', '.', content)
            # shorten content to (presumably) ignore article comments
            content = trunc(content, max_pos=5000)
            article['content'] = content

            article['summary'] = self.summarizer.summarize_first_two_sentences(article['content'])
            print "SUMRY: ..", article['summary']
            article['image_url'] = convert_to_printable(rows[-2]).strip()

            if len(article['title']) < 5 or len(article['content']) < 1000:
                print "CRAWL: -- Ignoring. Content or title too short. Title = {%s}; Content = {%s}" % \
                    (article['title'], article['content'])
                continue

            # skip articles whose content contains a blacklisted word
            found_blacklist_word = False
            for word in blacklist_words:
                if re.search("\W%s\W" % word, article['content'], re.IGNORECASE) != None:
                    print "CRAWL: -- Ignoring. Found blacklisted word \"%s\", ignoring article." % word
                    found_blacklist_word = True
                    break
            if found_blacklist_word: 
                continue

            urlid = self.put_in_db(article)
            if urlid == None: continue
            print "CRAWL: ++ {ID:%d/%d} %s (%s, %s)" % \
                (urlid, i, article['title'], str(article['pubdate']), article['source'])            
Example #4
    def fetch_all_articles(self):
        try:
            os.makedirs(paths['ainews.content_tmp'])
        except:
            pass
        f = open("%surllist.txt" % paths['ainews.content_tmp'], 'w')
        for article in self.articles:
            f.write("%s\n" % article['url'])
        f.close()

        goose_cmd = "cd %s/goose; MAVEN_OPTS=\"-Xms256m -Xmx800m\" %s exec:java -Dexec.mainClass=com.gravity.goose.FetchMany -Dexec.args=\"%s\" -q" % \
            (paths['libraries.tools'], paths['libraries.maven'], paths['ainews.content_tmp'])
        Popen(goose_cmd, shell=True).communicate()

        i = 0
        for article in self.articles:
            print "READ:  Opening", ("%s%d" % (paths['ainews.content_tmp'], i))
            f = codecs.open("%s%d" % (paths['ainews.content_tmp'], i),
                            encoding='utf-8')
            rows = f.read().split("\n")
            f.close()
            #os.remove("%s%d" % (paths['ainews.content_tmp'], i))
            # don't move this; have to ensure the increment occurs!
            i += 1

            if self.db.crawled(article['url']):
                print "READ:  .. Ignoring; already in crawled database."
                continue

            if len(rows) < 3:
                print "FETCH: .. Ignoring; not enough lines in Goose output: URL=%s, ROWS=%s" % \
                    (article['url'], rows)
                continue

            self.db.set_crawled(article['url'])
            content = ' '.join(rows[:-2])
            content = convert_to_printable(
                cgi.escape(re.sub(r'\s+', ' ', content))).strip()
            content = re.sub("%s\\s*-?\\s*" % re.escape(article['title']), '',
                             content)
            content = re.sub(r'\s*Share this\s*', '', content)
            content = re.sub(r'\s+,\s+', ', ', content)
            content = re.sub(r'\s+\.', '.', content)
            # shorten content to (presumably) ignore article comments
            content = trunc(content, max_pos=5000)
            article['content'] = content

            article['summary'] = self.summarizer.summarize_first_two_sentences(
                article['content'])
            print "SUMRY: ..", article['summary']
            article['image_url'] = convert_to_printable(rows[-2]).strip()

            if len(article['title']) < 5 or len(article['content']) < 1000:
                print "CRAWL: -- Ignoring. Content or title too short. Title = {%s}; Content = {%s}" % \
                    (article['title'], article['content'])
                continue

            # skip articles whose content contains a blacklisted word
            found_blacklist_word = False
            for word in blacklist_words:
                if re.search("\W%s\W" % word, article['content'],
                             re.IGNORECASE) != None:
                    print "CRAWL: -- Ignoring. Found blacklisted word \"%s\", ignoring article." % word
                    found_blacklist_word = True
                    break
            if found_blacklist_word:
                continue

            urlid = self.put_in_db(article)
            if urlid == None: continue
            print "CRAWL: ++ {ID:%d/%d} %s (%s, %s)" % \
                (urlid, i, article['title'], str(article['pubdate']), article['source'])