Example No. 1
def evaluate():
    corpus = AINewsCorpus()
    print "urlid,length truewords,length justext,length goose,ld justtext,ld goose"
    for filename in sorted(glob.glob("../../experiments/justext/*.true")):
        truetext = ents.convert(open(filename).read())
        truetext = re.sub(r'[^\w\s]', ' ', trunc(truetext, max_pos=3000, ellipsis=False))
        truewords = re.split(r'\s+', truetext)
        urlid = filename[26:30]
        article = corpus.get_article(urlid)
        if article is None: continue
        articletext = re.sub(r'[^\w\s]', ' ', trunc((article['content_all']).encode('ascii'), max_pos=3000, ellipsis=False))
        articlewords = re.split(r'\s+', articletext)
        goosecmd = "cd /home/josh/aitopics/AINews/tools/goose; /opt/maven/bin/mvn exec:java -Dexec.mainClass=com.jimplush.goose.TalkToMeGoose -Dexec.args='%s' -q 2>>/home/josh/log.txt" % article['url']
        (stdout, _) = Popen(goosecmd, shell = True, stdout = PIPE).communicate()
        goosetext = ents.convert(stdout.encode('ascii'))
        goosetext = re.sub(r'[^\w\s]', ' ', trunc(goosetext, max_pos=3000, ellipsis=False))
        goosewords = re.split(r'\s+', goosetext)
        # word-level Levenshtein distance, normalized by the length of the hand-checked text
        ld_1 = levenshtein_distance(truewords, articlewords) / float(len(truewords))
        ld_2 = levenshtein_distance(truewords, goosewords) / float(len(truewords))
        print "%s,%d,%d,%d,%.4f,%.4f" % \
            (urlid, len(truewords), len(articlewords), len(goosewords), ld_1, ld_2)
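The two "ld" columns are word-level Levenshtein distances between each extractor's output and the hand-checked text, normalized by the number of true words. The levenshtein_distance helper itself does not appear in these excerpts; a minimal sketch of what the call above assumes (a standard dynamic-programming edit distance over token lists, not necessarily the project's actual implementation) is:

def levenshtein_distance(source, target):
    # Wagner-Fischer edit distance over two lists of word tokens.
    prev = list(range(len(target) + 1))
    for i, s in enumerate(source, 1):
        curr = [i]
        for j, t in enumerate(target, 1):
            cost = 0 if s == t else 1
            curr.append(min(prev[j] + 1,          # deletion
                            curr[j - 1] + 1,      # insertion
                            prev[j - 1] + cost))  # substitution
        prev = curr
    return prev[-1]

Dividing the result by len(truewords), as evaluate() does, turns it into a rough per-word error rate for each extractor.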
Example No. 2
    def get_article(self, urlid, corpus = False):
        """Return a dict describing the article with the given urlid, or None
        if the row is missing or has no content. When corpus is True, read
        from the cat_corpus tables instead of the live urllist table."""
        row = None
        if corpus:
            table = 'cat_corpus'
            cat_table = 'cat_corpus_cats'
            row = self.db.selectone("""select u.url, u.title, u.content
                from %s as u where u.urlid = %s""" % (table, urlid))

        else:
            table = 'urllist'
            cat_table = 'categories'
            row = self.db.selectone("""select u.url, u.title, u.content, u.summary, 
                u.pubdate, u.crawldate, u.processed, u.published, u.publisher
                from %s as u where u.urlid = %s""" % \
                                        (table, urlid))
        if row is not None and row[2] is not None:
            wordfreq = self.txtpro.simpletextprocess(urlid, row[2])
            summary = ""
            if not corpus: summary = row[3]
            processed = False
            if not corpus and row[6] == 1: processed = True
            published = False
            if not corpus and row[7] == 1: published = True
            pubdate = ""
            if not corpus: pubdate = row[4]
            crawldate = ""
            if not corpus: crawldate = row[5]
            publisher = ""
            if not corpus: publisher = row[8]
            categories = []
            cat_rows = self.db.selectall("""select category from %s
                where urlid = %s""" % (cat_table, urlid))
            for cat_row in cat_rows:
                categories.append(cat_row[0])
            return {'urlid': urlid, 'url': row[0], 'title': row[1],
                    'content': trunc(row[2], max_pos=3000),
                    'content_all': row[2],
                    'summary': summary,
                    'pubdate': pubdate, 'crawldate': crawldate,
                    'processed': processed, 'published': published,
                    'publisher': publisher,
                    'categories': categories, 'duplicates': [],
                    'wordfreq': wordfreq, 'tfidf': self.get_tfidf(urlid, wordfreq)}
        else:
            return None
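For context, the dict returned here is what Example No. 1's evaluate() consumes ('url' and 'content_all' in particular). A minimal usage sketch, assuming an AINewsCorpus instance already wired to its database and using a made-up urlid of 1234:

corpus = AINewsCorpus()
article = corpus.get_article(1234)        # default corpus=False reads the urllist table
if article is not None:
    print article['title'], article['publisher']
    print article['content']              # already truncated to 3000 characters by trunc()
    print len(article['categories']), "categories"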
Example No. 3
    def fetch_all_articles(self):
        try:
            os.makedirs(paths['ainews.content_tmp'])
        except OSError:
            # the temp directory probably already exists
            pass
        f = open("%surllist.txt" % paths['ainews.content_tmp'], 'w')
        for article in self.articles:
            f.write("%s\n" % article['url'])
        f.close()

        goose_cmd = "cd %s/goose; MAVEN_OPTS=\"-Xms256m -Xmx800m\" %s exec:java -Dexec.mainClass=com.gravity.goose.FetchMany -Dexec.args=\"%s\" -q" % \
            (paths['libraries.tools'], paths['libraries.maven'], paths['ainews.content_tmp'])
        Popen(goose_cmd, shell = True).communicate()

        i = 0
        for article in self.articles:
            print "READ:  Opening", ("%s%d" % (paths['ainews.content_tmp'], i))
            f = codecs.open("%s%d" % (paths['ainews.content_tmp'], i), encoding='utf-8')
            rows = f.read().split("\n")
            f.close()
            #os.remove("%s%d" % (paths['ainews.content_tmp'], i))
            # don't move this; have to ensure the increment occurs!
            i += 1

            if self.db.crawled(article['url']):
                print "READ:  .. Ignoring; already in crawled database."
                continue

            if len(rows) < 3:
                print "FETCH: .. Ignoring; not enough lines in Goose output: URL=%s, ROWS=%s" % \
                    (article['url'], rows)
                continue

            self.db.set_crawled(article['url'])
            content = ' '.join(rows[:-2])
            content = convert_to_printable(cgi.escape(re.sub(r'\s+', ' ', content))).strip()
            content = re.sub("%s\\s*-?\\s*" % re.escape(article['title']), '', content)
            content = re.sub(r'\s*Share this\s*', '', content)
            content = re.sub(r'\s+,\s+', ', ', content)
            content = re.sub(r'\s+\.', '.', content)
            # shorten content to (presumably) ignore article comments
            content = trunc(content, max_pos=5000)
            article['content'] = content

            article['summary'] = self.summarizer.summarize_first_two_sentences(article['content'])
            print "SUMRY: ..", article['summary']
            article['image_url'] = convert_to_printable(rows[-2]).strip()

            if len(article['title']) < 5 or len(article['content']) < 1000:
                print "CRAWL: -- Ignoring. Content or title too short. Title = {%s}; Content = {%s}" % \
                    (article['title'], article['content'])
                continue

            # remove content with blacklisted words
            found_blacklist_word = False
            for word in blacklist_words:
                if re.search("\W%s\W" % word, article['content'], re.IGNORECASE) != None:
                    print "CRAWL: -- Ignoring. Found blacklisted word \"%s\", ignoring article." % word
                    found_blacklist_word = True
                    break
            if found_blacklist_word: 
                continue

            urlid = self.put_in_db(article)
            if urlid is None: continue
            print "CRAWL: ++ {ID:%d/%d} %s (%s, %s)" % \
                (urlid, i, article['title'], str(article['pubdate']), article['source'])            
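Each per-article output file written by Goose's FetchMany is read back with the assumption that the image URL sits on the second-to-last line and everything above it is body text (hence rows[:-2] and rows[-2]). The convert_to_printable helper used on both is not defined in these excerpts; a minimal stand-in sufficient to run them (an assumption, not the project's actual helper, which may normalize differently) could be:

import string

def convert_to_printable(text):
    # Keep only characters Python considers printable ASCII; control
    # characters and unmapped unicode are simply dropped.
    return ''.join(ch for ch in text if ch in string.printable)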
Example No. 4
    def crawl(self, opts):
        """
        Crawl the news by source lists (Search page or RSS).
        """
        rows = self.get_newssources(opts)
        for row in rows:
            sourcepage_url = row[0]
            publisher = row[1]
            sourcetype = row[2]
            tag = row[3]
            parser = ParserFactory(publisher, sourcetype)
            if parser is None: continue
            if self.debug: print "Crawling %s (%s):" % (publisher, tag)
            try:
                parser.parse_sourcepage(sourcepage_url)
                parser.parse_storypage()
                for candidate in parser.candidates:
                    if len(candidate) != 4: continue
                    url = candidate[0].encode('utf-8')
                    print "Fetching", url
                    title = convert_to_printable(ents.convert((re.sub(r'\s+', ' ', candidate[1])))).strip()
                    # if publisher is GoogleNews, extract true publisher from title
                    if publisher == "GoogleNews":
                        print title
                        true_publisher = re.match(r'^.* - (.+)$', title).group(1)
                        true_publisher = "%s via Google News" % true_publisher
                    elif publisher == "UserSubmitted":
                        true_publisher = re.match(r'^[^\/]+:\/\/([^\/]+)(?::\d+)?\/?.*$', url).group(1)
                        true_publisher = "%s (User submitted)" % true_publisher
                    else: true_publisher = publisher

                    # remove a trailing site name like " - NPR" from the title
                    title = re.sub(r'\s+[:-]\s+.*$', '', title)
                    pubdate = candidate[2]
                    content = convert_to_printable(ents.convert((re.sub(r'\s+', ' ', candidate[3])))).strip()
                    if isinstance(title, types.StringType):
                        title = unicode(title, errors = 'ignore')
                    if isinstance(content, types.StringType):
                        content = unicode(content, errors = 'ignore')
                    content = re.sub("\\s*%s\\s*" % re.escape(title), '', content)
                    content = re.sub(r'\s*Share this\s*', '', content)
                    content = re.sub(r'\s+,\s+', ', ', content)
                    content = re.sub(r'\s+\.', '.', content)

                    if len(title) < 5 or len(content) < 2000:
                        print "Content or title too short"
                        continue

                    # shorten content to (presumably) ignore article comments
                    content = trunc(content, max_pos=3000)

                    # remove content with blacklisted words
                    found_blacklist_word = False
                    for word in blacklist_words:
                        if re.search("\W%s\W" % word, content, re.IGNORECASE) != None:
                            print "Found blacklisted word \"%s\", ignoring article." % word
                            found_blacklist_word = True
                            break
                    if found_blacklist_word: 
                        continue

                    urlid = self.put_in_db(url, pubdate, self.today, true_publisher, \
                            tag, title, content)
                    if urlid is None: continue
                    try:
                        print "{ID:%d} %s (%s, %s)" % (urlid, title, str(pubdate), true_publisher)
                    except:
                        pass

            except KeyboardInterrupt:
                if self.debug: print "Quitting early due to keyboard interrupt."
                sys.exit()
            except:
                if self.debug:
                    print "Parser for %s failed." % (publisher)
                    traceback.print_exc()
                continue
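The two title regexes above are easiest to see on a concrete, invented example: Google News appends the original publisher after " - ", which the first pattern captures, and the second pattern then strips that trailing site name from the stored title.

import re

title = "Robots learn to fold laundry - MIT Technology Review"   # made-up sample

m = re.match(r'^.* - (.+)$', title)
print m.group(1)                            # -> MIT Technology Review

print re.sub(r'\s+[:-]\s+.*$', '', title)   # -> Robots learn to fold laundry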