Example #1
 def __init__(self):
     self.today = date.today()
     self.earliest_date = self.today - timedelta(
         days=int(config['ainews.period']))
     self.db = AINewsDB()
     self.summarizer = AINewsSummarizer()
     self.articles = []
Example #2
    def __init__(self):
        self.debug = config['ainews.debug']
        self.today = date.today()
        self.earliest_date = self.today - timedelta(
            days=int(config['ainews.period']))
        self.db = AINewsDB()
        self.corpus = AINewsCorpus()
        self.duplicates = AINewsDuplicates()
        self.txtpro = AINewsTextProcessor()
        self.weka = AINewsWekaClassifier()

        self.articles = {}
        self.semiauto_email_output = ""
Example #3
    def __init__(self):
        self.txtpro = AINewsTextProcessor()
        self.cache_urls = {}

        #: A dictionary of word=>word freq in corpus
        self.dftext = {}

        #: A dictionary of word=>wordid
        self.idwords = {}

        #: A dictionary of wordid=>word
        self.wordids = {}

        self.db = AINewsDB()

        self.categories = ["AIOverview","Agents", "Applications", \
                 "CognitiveScience", "Education", "Ethics", "Games", "History", \
                 "Interfaces", "MachineLearning", "NaturalLanguage", "Philosophy", \
                 "Reasoning", "Representation", "Robots", "ScienceFiction", \
                 "Speech", "Systems", "Vision"]

        self.sources = {}
        rows = self.db.selectall("select parser, relevance from sources")
        for row in rows:
            self.sources[row[0].split('::')[0]] = int(row[1])

        self.retained_db_docs = None
        
        self.restore_corpus()
Example #4
    def __init__(self):
        self.debug = config["ainews.debug"]
        self.today = date.today()
        self.earliest_date = self.today - timedelta(days=int(config["ainews.period"]))
        self.db = AINewsDB()
        self.corpus = AINewsCorpus()
        self.duplicates = AINewsDuplicates()
        self.txtpro = AINewsTextProcessor()
        self.weka = AINewsWekaClassifier()

        self.articles = {}
        self.semiauto_email_output = ""
Example #5
    def __init__(self):
        self.txtpro = AINewsTextProcessor()
        self.cache_urls = {}

        #: A dictionary of word=>word freq in corpus
        self.dftext = {}

        #: A dictionary of word=>wordid
        self.idwords = {}

        #: A dictionary of wordid=>word
        self.wordids = {}

        self.db = AINewsDB()

        self.categories = ["AIOverview","Agents", "Applications", \
                 "CognitiveScience", "Education", "Ethics", "Games", "History", \
                 "Interfaces", "MachineLearning", "NaturalLanguage", "Philosophy", \
                 "Reasoning", "Representation", "Robots", "ScienceFiction", \
                 "Speech", "Systems", "Vision"]

        self.retained_db_docs = None

        self.restore_corpus()
Example #6
    def __init__(self):
        self.debug = config['ainews.debug']
        self.today = date.today()
        self.earliest_date = self.today - timedelta(days = int(config['ainews.period']))
        self.db = AINewsDB()
        self.corpus = AINewsCorpus()
        self.duplicates = AINewsDuplicates()
        self.svm_classifier = AINewsSVMClassifier()
        self.txtpro = AINewsTextProcessor()
        self.summarizer = AINewsSummarizer()

        self.articles = {}
        self.publishable_articles = []
        self.semiauto_email_output = ""

        self.topicids = {"AIOverview":0, "Agents":1, "Applications":2,
           "CognitiveScience":3, "Education":4,"Ethics":5, 
           "Games":6, "History":7, "Interfaces":8, "MachineLearning":9,
           "NaturalLanguage":10, "Philosophy":11, "Reasoning":12,
           "Representation":13, "Robots":14, "ScienceFiction":15,"Speech":16,
           "Systems":17,  "Vision":18}
Example #7
    def __init__(self):
        self.txtpro = AINewsTextProcessor()
        self.cache_urls = {}

        #: A dictionary of word=>word freq in corpus
        self.dftext = {}

        #: A dictionary of word=>wordid
        self.idwords = {}

        #: A dictionary of wordid=>word
        self.wordids = {}

        self.db = AINewsDB()

        self.categories = [
            "AIOverview",
            "Agents",
            "Applications",
            "CognitiveScience",
            "Education",
            "Ethics",
            "Games",
            "History",
            "Interfaces",
            "MachineLearning",
            "NaturalLanguage",
            "Philosophy",
            "Reasoning",
            "Representation",
            "Robots",
            "ScienceFiction",
            "Speech",
            "Systems",
            "Vision",
        ]

        self.retained_db_docs = None

        self.restore_corpus()
Example #8
class AINewsCorpus:
    """
    A corpus is a set of news articles (each with a title, content,
    and categories) that are used for training and comparison
    purposes. For training, the corpus provides the training
    examples. For comparison, the corpus provides the data for various
    measures like word frequency. This is important in the prediction
    process: we only want to predict a new article's categories based
    on word frequencies, and other measures, from the corpus; we don't
    want articles that have not been "vetted" (articles not part of
    the corpus) to contribute to these measures.

    A corpus can be "loaded" via C{load_corpus()} or "restored" via
    C{restore_corpus()}. The difference is the following: when loading a
    corpus, word frequencies are measured and stored in the database
    table C{wordlist_eval}; when restoring a corpus, word frequencies
    are simply retrieved from the database table C{wordlist}. In other
    words, we load a corpus when we are training or evaluating our
    training procedures, and we restore a corpus when we are
    predicting.
    """
    def __init__(self):
        self.txtpro = AINewsTextProcessor()
        self.cache_urls = {}

        #: A dictionary of word=>word freq in corpus
        self.dftext = {}

        #: A dictionary of word=>wordid
        self.idwords = {}

        #: A dictionary of wordid=>word
        self.wordids = {}

        self.db = AINewsDB()

        self.categories = ["AIOverview","Agents", "Applications", \
                 "CognitiveScience", "Education", "Ethics", "Games", "History", \
                 "Interfaces", "MachineLearning", "NaturalLanguage", "Philosophy", \
                 "Reasoning", "Representation", "Robots", "ScienceFiction", \
                 "Speech", "Systems", "Vision"]

        self.sources = {}
        rows = self.db.selectall("select parser, relevance from sources")
        for row in rows:
            self.sources[row[0].split('::')[0]] = int(row[1])

        self.retained_db_docs = None
        
        self.restore_corpus()

    def get_relevance(self, publisher):
        if re.search(r'via Google News', publisher):
            publisher = 'GoogleNews'
        return self.sources[publisher]

    def compare_articles(self, article1, article2):
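        # cmp-style comparator (Python 2): order articles by duplicate count,
        # then by publisher relevance (user-submitted stories get a fixed
        # relevance of 200), then by number of categories, all ascending.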
        dupcount1 = len(article1['duplicates'])
        dupcount2 = len(article2['duplicates'])
        if article1['publisher'].find('User submitted') != -1:
            relevance1 = 200
        else:
            relevance1 = self.get_relevance(article1['publisher'])
        if article2['publisher'].find('User submitted') != -1:
            relevance2 = 200
        else:
            relevance2 = self.get_relevance(article2['publisher'])
        cat_count1 = len(article1['categories'])
        cat_count2 = len(article2['categories'])
        if cmp(dupcount1, dupcount2) == 0:
            if cmp(relevance1, relevance2) == 0:
                return cmp(cat_count1, cat_count2)
            else:
                return cmp(relevance1, relevance2)
        else:
            return cmp(dupcount1, dupcount2)

    def get_tfidf(self, urlid, wordfreq):
        """
        Helper function to retrieve the tfidf of each word based on the urlid.
        @param  urlid: target news story's urlid.
        @type  urlid: C{int}
        """
        if urlid in self.cache_urls:
            return self.cache_urls[urlid]
        wordid_freq_pairs = {}
        for word in wordfreq:
            if word in self.dftext:
                wordid_freq_pairs[self.idwords[word]] = (wordfreq[word], self.dftext[word])

        data = {}
        distsq = 0.0
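        # Log-scaled tf-idf: log2(tf + 1) * (log2(corpus_count + 1) - log2(df + 1));
        # the vector is L2-normalized below so cos_sim() reduces to a dot product.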
        for wordid in wordid_freq_pairs:
            tfidf = math.log(wordid_freq_pairs[wordid][0] + 1, 2) * \
                    (math.log(self.corpus_count + 1, 2) - \
                    math.log(wordid_freq_pairs[wordid][1] + 1, 2))
            data[wordid] = tfidf
            distsq += tfidf * tfidf
        dist = math.sqrt(distsq)
        if dist > 1.0e-9:
            for key in data:
                data[key] /= dist
        self.cache_urls[urlid] = data
        return data

    def cos_sim(self, tfidf1, tfidf2):
        """
        A helper function to compute the cosine similarity between a
        news story and a centroid.
        @param  tfidf1: target news story tfidf vector.
        @type  tfidf1: C{dict}
        @param tfidf2: centroid tfidf vector.
        @type  tfidf2: C{dict}
        """
        sim = 0.0
        for key in tfidf1:
            if key in tfidf2:
                word = self.wordids[key]
                a = tfidf1[key]
                b = tfidf2[key]
                sim += a*b
        return sim

    def get_article(self, urlid, corpus = False):
        row = None
        if corpus:
            table = 'cat_corpus'
            cat_table = 'cat_corpus_cats'
            row = self.db.selectone("""select u.url, u.title, u.content
                from %s as u where u.urlid = %s""" % (table, urlid))

        else:
            table = 'urllist'
            cat_table = 'categories'
            row = self.db.selectone("""select u.url, u.title, u.content, u.summary, 
                u.pubdate, u.crawldate, u.processed, u.published, u.publisher
                from %s as u where u.urlid = %s""" % \
                                        (table, urlid))
        if row != None and row[2] is not None:
            wordfreq = self.txtpro.simpletextprocess(urlid, row[2])
            summary = ""
            if not corpus: summary = row[3]
            processed = False
            if not corpus and row[6] == 1: processed = True
            published = False
            if not corpus and row[7] == 1: published = True
            pubdate = ""
            if not corpus: pubdate = row[4]
            crawldate = ""
            if not corpus: crawldate = row[5]
            publisher = ""
            if not corpus: publisher = row[8]
            categories = []
            cat_rows = self.db.selectall("""select category from %s
                where urlid = %s""" % (cat_table, urlid))
            for cat_row in cat_rows:
                categories.append(cat_row[0])
            return {'urlid': urlid, 'url': row[0], 'title': row[1],
                    'content': trunc(row[2], max_pos=3000),
                    'content_all': row[2],
                    'summary': summary,
                    'pubdate': pubdate, 'crawldate': crawldate,
                    'processed': processed, 'published': published,
                    'publisher': publisher,
                    'categories': categories, 'duplicates': [],
                    'wordfreq': wordfreq, 'tfidf': self.get_tfidf(urlid, wordfreq)}
        else:
            return None

    def get_articles_daterange(self, date_start, date_end):
        articles = {}
        rows = self.db.selectall("""select urlid from urllist
            where pubdate >= %s and pubdate <= %s""", (date_start, date_end))
        for row in rows:
            articles[row[0]] = self.get_article(row[0])
        return articles

    def get_articles_idrange(self, urlid_start, urlid_end, corpus = False):
        articles = {}
        rows = self.db.selectall("""select urlid from urllist
            where urlid >= %s and urlid <= %s""", (urlid_start, urlid_end))
        for row in rows:
            art = self.get_article(row[0], corpus)
            if art is not None:
                articles[row[0]] = art
        return articles

    def get_unprocessed(self):
        articles = {}
        rows = self.db.selectall("select urlid from urllist where processed = 0")
        for row in rows:
            articles[row[0]] = self.get_article(row[0])
        return articles

    def get_publishable(self):
        articles = []
        rows = self.db.selectall("select urlid from urllist where "
                        "publishable = 1 and published = 0 and pubdate != '0000-00-00'")
        for row in rows:
            articles.append(self.get_article(row[0]))
        return articles

    def get_published(self):
        articles = []
        rows = self.db.selectall("select urlid from urllist where published = 1")
        for row in rows:
            articles.append(self.get_article(row[0]))
        return articles

    def mark_processed(self, articles):
        for article in articles:
            self.db.execute("update urllist set processed = 1 where urlid = %s",
                    article['urlid'])

    def mark_publishable(self, articles):
        for article in articles:
            self.db.execute("update urllist set publishable = 1 where urlid = %s",
                            article['urlid'])

    def mark_published(self, articles):
        for article in articles:
            self.db.execute("update urllist set published = 1 where urlid = %s",
                            article['urlid'])

    def restore_corpus(self):
        self.wordids = {}
        self.dftext = {}
        rows = self.db.selectall("select rowid, word, dftext from wordlist")
        for row in rows:
            self.wordids[row[0]] = row[1]
            self.idwords[row[1]] = row[0]
            self.dftext[row[1]] = row[2]
        self.corpus_count = self.db.selectone("select count(*) from cat_corpus")[0]

    def add_freq_index(self, urlid, wordfreq, categories = []):
        for word in wordfreq:
            self.wordcounts.setdefault(word, 0)
            self.wordcounts[word] += 1

    def commit_freq_index(self, table):
        self.dftext = {}
        self.wordids = {}
        for word in self.wordcounts:
            rowid = self.db.execute("insert into "+table+" (word, dftext) " + \
                "values(%s, %s)", (word, self.wordcounts[word]))
            self.wordids[rowid] = word
            self.idwords[word] = rowid
            self.dftext[word] = self.wordcounts[word]
        self.wordcounts = {}

    def load_corpus(self, ident, pct, debug = False, retain = False):
        if debug:
            print "Loading corpus..."
        source = ident.split(':')[0]
        name = ident.split(':')[1:]
        if source == "file":
            docs = self.load_file_corpus(name, debug)
        elif source == "db":
            docs = self.load_db_corpus(name, debug, retain)
        if debug: print

        random.shuffle(docs)
        offset = int(len(docs)*pct)
        if debug:
            print "Selecting random %d%% of corpus (%d docs)." % \
                    (pct * 100, offset)

        # sort train_corpus by urlid
        train_corpus = sorted(docs[0:offset], key=operator.itemgetter(0))
        self.corpus_count = len(train_corpus)

        # sort predict_corpus by urlid
        predict_corpus = sorted(docs[offset:offset+int(len(docs)*0.1)], \
                key=operator.itemgetter(0))

        self.db.execute("delete from wordlist_eval")
        self.db.execute("alter table wordlist_eval auto_increment = 0")
        self.wordids = {}
        self.wordcounts = {}
        self.cache_urls = {}
        for c in train_corpus:
            self.add_freq_index(c[0], c[1], c[2].split())
            if debug:
                sys.stdout.write('.')
                sys.stdout.flush()
        self.commit_freq_index('wordlist_eval')

        return (train_corpus, predict_corpus)

    def load_file_corpus(self, name, debug = False):
        wordsfile = paths['corpus.corpus_other'] + name[0] + ".mat.clabel"
        f = open(wordsfile, 'r')
        self.wordids = {}
        wordid = 1
        for line in f:
            self.wordids[int(wordid)] = line.strip()
            wordid += 1

        catsfile = paths['corpus.corpus_other'] + name[0] + ".mat.rlabel"
        f = open(catsfile, 'r')
        cats = {}
        uniqcats = set()
        docid = 0
        for line in f:
            cats[docid] = line.strip()
            uniqcats.add(line.strip())
            docid += 1
        self.categories = list(uniqcats)

        matfile = paths['corpus.corpus_other'] + name[0] + ".mat"
        f = open(matfile, 'r')
        f.readline() # ignore first line
        docs = []
        docid = 0
        for line in f:
            wordfreq = {}
            for (wordid, freq) in izip(*[iter(line.split())]*2):
                wordfreq[self.wordids[int(wordid)]] = int(float(freq))
            docs.append((docid, wordfreq, cats[docid]))
            docid += 1
            if debug:
                sys.stdout.write('.')
                sys.stdout.flush()
        return docs

    def load_db_corpus(self, name, debug = False, retain = False):
        rows = self.db.selectall("""select c.urlid, c.content,
            group_concat(cc.category separator ' ')
            from %s as c, %s as cc
            where c.urlid = cc.urlid
            group by c.urlid order by c.urlid desc""" % (name[0], name[1]))
        if debug: print "Processing %d articles..." % len(rows)
        if retain and self.retained_db_docs != None:
            return self.retained_db_docs
        docs = []
        for row in rows:
            wordfreq = self.txtpro.simpletextprocess(row[0], row[1])
            if wordfreq.N() > 0 and 'NotRelated' not in row[2].split(' '):
                docs.append((row[0], wordfreq, row[2]))
            if debug:
                sys.stdout.write('.')
                sys.stdout.flush()
        if retain:
            self.retained_db_docs = docs
        return docs
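
A minimal usage sketch of this class (not part of the original sources), illustrating the load/restore distinction described in the class docstring. The urlid and the "db:cat_corpus:cat_corpus_cats" ident are illustrative values inferred from the table names used in the other examples.

corpus = AINewsCorpus()   # restore_corpus() runs here; word stats come from 'wordlist'

# Prediction-time lookup of a single (hypothetical) article id
article = corpus.get_article(12345)
if article is not None:
    print article['title'], article['categories']

# Training/evaluation: rebuilds 'wordlist_eval' and returns a 90%/10% split,
# both halves sorted by urlid
train_docs, predict_docs = corpus.load_corpus("db:cat_corpus:cat_corpus_cats", 0.9)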
Example #9
# Artificial Intelligence. This program and parts of it may be used and
# distributed without charge for non-commercial purposes as long as this
# notice is included.

import sys
from AINewsDB import AINewsDB

if __name__ == "__main__":

    categories =["AIOverview","Agents", "Applications", \
                     "CognitiveScience","Education","Ethics", "Games", "History",\
                     "Interfaces","MachineLearning","NaturalLanguage","Philosophy",\
                     "Reasoning","Representation", "Robots","ScienceFiction",\
                     "Speech", "Systems","Vision"]
    
    db = AINewsDB()
    
    url_counts = {}
    
    cat_counts = {}
    for cat in categories:
        cat_counts[cat] = 0
        
    rows = db.selectall(
        "select c.urlid, c.content, group_concat(cc.category separator ' ') " +
        "from cat_corpus as c, cat_corpus_cats as cc where c.urlid = cc.urlid " +
        "group by c.urlid")
    for row in rows:
        url_counts[row[0]] = len(row[2].split(' '))
        for cat in row[2].split(' '):
            cat_counts[cat] += 1
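
    # Hedged continuation sketch (not part of the original snippet): report the
    # per-category totals and how many corpus articles carry multiple categories.
    for cat in categories:
        print cat, cat_counts[cat]
    multi = len([u for u in url_counts if url_counts[u] > 1])
    print "%d articles, %d with more than one category" % (len(url_counts), multi)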
Example #10
 def __init__(self):
     self.today = date.today()
     self.earliest_date = self.today - timedelta(days = int(config['ainews.period']))
     self.db = AINewsDB()
     self.summarizer = AINewsSummarizer()
     self.articles = []
Example #11
class AINewsCrawler:
    def __init__(self):
        self.today = date.today()
        self.earliest_date = self.today - timedelta(days = int(config['ainews.period']))
        self.db = AINewsDB()
        self.summarizer = AINewsSummarizer()
        self.articles = []

    def get_sources(self, opts):
        """
        Get the news source list.
        """
        sources = []
        csv_file = csv.reader(urllib2.urlopen(paths['ainews.sources_csv']))
        header = True
        for row in csv_file:
            if header:
                header = False
                continue
            if len(opts) == 0 or (opts[0][0] == '--source' and opts[0][1] == row[1]):
                sources.append({'source_id': row[0],
                                'title': row[1],
                                'link': row[2],
                                'parser': row[3],
                                'relevance': int(row[4])})
        return sources

    def fetch_all_sources(self, opts):
        for source in self.get_sources(opts):
            print "CRAWL: Crawling \"%s\"..." % source['title']
            try:
                f = feedparser.parse(source['link'])
            except Exception, e:
                print "Exception while parsing feed: %s" % (source['link'],)
                print e
                continue

            for entry in f.entries:
                d = None
                try:
                    if hasattr(entry, 'published_parsed'):
                        d = date(entry.published_parsed[0],
                                 entry.published_parsed[1],
                                 entry.published_parsed[2])
                    else:
                        d = date(entry.updated_parsed[0],
                                 entry.updated_parsed[1],
                                 entry.updated_parsed[2])
                except Exception, e:
                    print e
                    print entry
                    print "Setting date as today; could not parse date for feed", \
                        source['link']
                    d = self.today
                if d > self.today or d < self.earliest_date: continue
                if entry.title[-6:] == '(blog)' \
                        or entry.title[-15:] == '(press release)':
                    print "Blog or press release in title. (%s) (%s)" % \
                        (entry.link, entry.title)
                    continue
                try:
                    url = urllib2.urlopen(entry.link).geturl()
                except KeyboardInterrupt:
                    print "Quitting early due to keyboard interrupt."
                    sys.exit()
                except: continue

                # attempt to skip blogs
                if re.match('^.*blog.*$', url):
                    print "'blog' in url (%s) (%s)" % \
                        (entry.link, entry.title)
                    continue
                # attempt to skip job postings
                if re.match('^.*job.*$', url):
                    print "'job' in url (%s) (%s)" % \
                        (entry.link, entry.title)
                    continue
                # skip urls we have already crawled
                if self.db.crawled(url):
                    print "Seen this url before (%s) (%s)" % \
                        (entry.link, entry.title)
                    continue
                
                title = cgi.escape(convert_to_printable(entry.title)).strip()

                # if source is Google News, extract true source from title
                if re.match(r'^.*Google News.*$', source['title']):
                    true_source = re.match(r'^.* - (.+)$', title).group(1)
                    true_source = "%s via Google News" % true_source
                    title = re.match(r'^(.*) - .+$', title).group(1)
                else: true_source = source['title']
                
                self.articles.append({'url': url, 'title': title, 'pubdate': d,
                                      'source': true_source, 'source_id': source['source_id'],
                                      'source_relevance': source['relevance']})
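
A minimal driver sketch (assumed, not part of the example above) showing how this crawler might be invoked. The option list mirrors the getopt-style (flag, value) pairs that get_sources() checks, and 'MIT Technology Review' is a hypothetical source title from the sources CSV.

crawler = AINewsCrawler()
crawler.fetch_all_sources([])                                        # crawl every source in the CSV
crawler.fetch_all_sources([('--source', 'MIT Technology Review')])   # only the matching title (hypothetical)
for article in crawler.articles:
    print article['pubdate'], article['source'], article['title']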
Example #12
 def __init__(self):
     self.today = date.today()
     self.debug = config['ainews.debug']
     self.db = AINewsDB()
     self.parser = AINewsParser()
Example #13
class AINewsCrawler:
    def __init__(self):
        self.today = date.today()
        self.debug = config['ainews.debug']
        self.db = AINewsDB()
        self.parser = AINewsParser()

    def get_newssources(self, opts):
        """
        Get the news source list.
        """
        sources = []
        where = "1=1"
        for opt in opts:
            if opt[0] == "-s" or opt[0] == "--source":
                where = "id = %s" % opt[1]
        
        sql = "select url,parser,description from sources where status = 1 and %s order by id asc" % where
        rows = self.db.selectall(sql)
        for row in rows:
            items = row[1].split('::')
            sources.append((row[0], items[0], items[1], row[2]))
        return sources

    def crawl(self, opts):
        """
        Crawl the news by source lists (Search page or RSS).
        """
        rows = self.get_newssources(opts)
        for row in rows:
            sourcepage_url = row[0]
            publisher = row[1]
            sourcetype = row[2]
            tag = row[3]
            parser = ParserFactory(publisher, sourcetype)
            if parser == None: continue
            if self.debug: print "Crawling %s (%s):" % (publisher, tag)
            try:
                parser.parse_sourcepage(sourcepage_url)
                parser.parse_storypage()
                for candidate in parser.candidates:
                    if len(candidate) != 4: continue
                    url = candidate[0].encode('utf-8')
                    print "Fetching", url
                    title = convert_to_printable(ents.convert((re.sub(r'\s+', ' ', candidate[1])))).strip()
                    # if publisher is GoogleNews, extract true publisher from title
                    if publisher == "GoogleNews":
                        print title
                        true_publisher = re.match(r'^.* - (.+)$', title).group(1)
                        true_publisher = "%s via Google News" % true_publisher
                    elif publisher == "UserSubmitted":
                        true_publisher = re.match(r'^[^\/]+:\/\/([^\/]+)(?::\d+)?\/?.*$', url).group(1)
                        true_publisher = "%s (User submitted)" % true_publisher
                    else: true_publisher = publisher

                    # removing site title like " - NPR"
                    title = re.sub(r'\s+[:-]\s+.*$', '', title)
                    pubdate = candidate[2]
                    content = convert_to_printable(ents.convert((re.sub(r'\s+', ' ', candidate[3])))).strip()
                    if isinstance(title, types.StringType):
                        title = unicode(title, errors = 'ignore')
                    if isinstance(content, types.StringType):
                        content = unicode(content, errors = 'ignore')
                    content = re.sub("\\s*%s\\s*" % re.escape(title), '', content)
                    content = re.sub(r'\s*Share this\s*', '', content)
                    content = re.sub(r'\s+,\s+', ', ', content)
                    content = re.sub(r'\s+\.', '.', content)

                    if len(title) < 5 or len(content) < 2000:
                        print "Content or title too short"
                        continue

                    # shorten content to (presumably) ignore article comments
                    content = trunc(content, max_pos=3000)

                    # remove content with blacklisted words
                    found_blacklist_word = False
                    for word in blacklist_words:
                        if re.search("\W%s\W" % word, content, re.IGNORECASE) != None:
                            print "Found blacklisted word \"%s\", ignoring article." % word
                            found_blacklist_word = True
                            break
                    if found_blacklist_word: 
                        continue

                    urlid = self.put_in_db(url, pubdate, self.today, true_publisher, \
                            tag, title, content)
                    if urlid == None: continue
                    try:
                        print "{ID:%d} %s (%s, %s)" % (urlid, title, str(pubdate), true_publisher)
                    except:
                        pass

            except (KeyboardInterrupt):
                if self.debug: print "Quitting early due to keyboard interrupt."
                sys.exit()
            except:
                if self.debug:
                    print "Parser for %s failed." % (publisher)
                    print traceback.print_exc()
                continue

    def put_in_db(self, url, pubdate, crawldate, publisher, tag, title, content):
        """
        Save the news story into database.
        """
        try:
            urlid = self.db.execute("""insert into urllist (url, pubdate, crawldate,
                publisher, tag, title, content)
                values (%s, %s, %s, %s, %s, %s, %s)""",
                (url, str(pubdate), str(crawldate), publisher, tag, title, content))
            return urlid
        except Exception, e :
            #if self.debug:
            #   print >> sys.stderr, "ERROR: can't add url metadata.", e
            return None
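
To make the title handling in crawl() concrete, here is a small illustrative sketch (the title value is made up). It shows how the "<headline> - <publisher>" pattern from Google News feeds is split and how the trailing site name is later stripped from the title.

import re

title = "Robots learn to walk - NPR"
true_publisher = re.match(r'^.* - (.+)$', title).group(1)   # 'NPR'
true_publisher = "%s via Google News" % true_publisher      # 'NPR via Google News'
title = re.sub(r'\s+[:-]\s+.*$', '', title)                 # 'Robots learn to walk'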
Example #14
class AINewsPublisher():
    def __init__(self):
        self.debug = config['ainews.debug']
        self.today = date.today()
        self.earliest_date = self.today - timedelta(
            days=int(config['ainews.period']))
        self.db = AINewsDB()
        self.corpus = AINewsCorpus()
        self.duplicates = AINewsDuplicates()
        self.txtpro = AINewsTextProcessor()
        self.weka = AINewsWekaClassifier()

        self.articles = {}
        self.semiauto_email_output = ""

    def filter_and_process(self):
        self.articles = self.corpus.get_unprocessed()

        if len(self.articles) == 0: return

        # assume every article will be published; may be set to False from one
        # of the filtering processes below
        for urlid in self.articles:
            self.articles[urlid]['publish'] = True
            self.articles[urlid]['transcript'] = []

        # filter by date
        print "Filtering by date..."
        for urlid in self.articles:
            if self.articles[urlid]['pubdate'] == None:
                # give a meaningful pubdate so that other code doesn't crash
                self.articles[urlid]['pubdate'] = self.today
                self.articles[urlid]['publish'] = False
                self.articles[urlid]['transcript'].append(
                    "Rejected due to bogus publication date.")
            elif self.articles[urlid]['pubdate'] < self.earliest_date:
                self.articles[urlid]['publish'] = False
                self.articles[urlid]['transcript'].append(
                    ("Rejected because article is too old " +
                     "(earliest valid date is %s while article was " +
                     "published on %s") %
                    (self.earliest_date.strftime('%F'),
                     self.articles[urlid]['pubdate'].strftime('%F')))

        # filter by blacklist (for urls)
        print "Filtering by blacklist..."
        for urlid in self.articles:
            for black in blacklist_urls:
                if re.search(black, self.articles[urlid]['url']):
                    self.articles[urlid]['publish'] = False
                    self.articles[urlid]['transcript'].append(
                        ("Rejected because url matched blacklisted url %s" %
                         black))
                    break

        # filter by whitelist
        print "Filtering by whitelist..."
        for urlid in self.articles:
            white_wordfreq = self.txtpro.whiteprocess(
                urlid, self.articles[urlid]['content'])
            self.articles[urlid]['white_wordfreq'] = white_wordfreq

            # require at least two different whitelisted terms
            # unless the article is user-submitted
            if len(white_wordfreq) < 2 \
                    and self.articles[urlid]['source'] != 'User Submitted':
                self.articles[urlid]['publish'] = False
                self.articles[urlid]['transcript'].append(
                    'Rejected due to only one or no whitelisted terms')

        # update categories based on classifier predictions
        print "Classifying..."
        self.weka.predict(self.articles)

        # drop articles with no categories
        print "Dropping articles with no categories..."
        for urlid in self.articles:
            if len(self.articles[urlid]['categories']) == 0:
                self.articles[urlid]['publish'] = False
                self.articles[urlid]['transcript'].append(
                    'Rejected due to no selected categories')

        # filter out duplicates; some articles may have 'publish' set to False
        # by this function
        print "Filtering duplicates..."
        self.duplicates.filter_duplicates(self.articles)

        for urlid in self.articles:
            print urlid, self.articles[urlid]['publish'], \
                self.articles[urlid]['title'], \
                self.articles[urlid]['categories'], \
                self.articles[urlid]['summary']
            print

        print "Grabbing images..."
        for urlid in self.articles:
            # grab and convert article image (if it exists)
            self.grab_convert_image(self.articles[urlid])

            # update article in database
            self.update_db(self.articles[urlid])

        # mark each as processed
        print "Marking as processed."
        self.corpus.mark_processed(self.articles.itervalues())

    def grab_convert_image(self, article):
        if len(article['image_url']) == 0:
            article['image_path'] = ''
            return
        try:
            f = urllib2.urlopen(article['image_url'])
            img = open(
                "%s%s" % (paths['ainews.image_dir'], str(article['urlid'])),
                'w')
            img.write(f.read())
            img.close()
            # produces [urlid].jpg
            Popen("%s -format jpg -gravity Center -thumbnail 200x200 %s%s" % \
                      (paths['imagemagick.mogrify'], paths['ainews.image_dir'],
                       str(article['urlid'])),
                  shell = True).communicate()
            # remove [urlid] file (with no extension)
            remove("%s%s" % (paths['ainews.image_dir'], str(article['urlid'])))
            article[
                'image_path'] = "public://newsfinder_images/%s.jpg" % article[
                    'urlid']
        except Exception as e:
            print "Failed converting image for %d: %s" % (article['urlid'], e)
            article['image_path'] = ''

    def update_db(self, article):
        self.db.execute("delete from categories where urlid = %s",
                        article['urlid'])
        for cat in article['categories']:
            self.db.execute("insert into categories values (%s,%s)",
                            (article['urlid'], cat))

    def generate_feed_import(self):
        """
        Generate XML file for feed import on the Drupal site.
        """
        xml = FeedImport()
        for article in self.articles.values():
            article['source'] = re.sub(r'&', '&amp;', article['source'])
        xml.news = self.articles.values()
        savefile(paths['ainews.output_xml'] + "news.xml", str(xml))

    def generate_email_output(self):
        articles = []
        try:
            f = urllib2.urlopen(paths['ainews.top_weekly_news_xml'])
            xml = etree.parse(f)
            for node in xml.iter("node"):
                print "Found", node.findtext("Title")
                published = node.findtext("Publication_date")
                articles.append({
                    'title':
                    node.findtext("Title"),
                    'source':
                    node.findtext("Source"),
                    'topics':
                    re.sub(r'/topic/', 'http://aitopics.org/topic/',
                           node.findtext("Topics")),
                    'pubdate':
                    date(int(published[0:4]), int(published[5:7]),
                         int(published[8:10])),
                    'summary':
                    re.sub(
                        r'</p>(</blockquote>)?$', '',
                        re.sub(r'^(<blockquote>)?<p>', '',
                               node.findtext("Body"))),
                    'url':
                    node.findtext("Original_link"),
                    'link':
                    re.sub(r'/news/', 'http://aitopics.org/news/',
                           node.findtext("Link")),
                    'image':
                    re.sub(
                        r'<img',
                        '<img align="left" style="margin: 8px 8px 8px 0; border: 1px solid #ccc; padding: 5px; background: white;" ',
                        node.findtext("Representative_image"))
                })
        except Exception, e:
            print e

        email = LatestNewsEmail()
        email.date = self.today.strftime("%B %d, %Y")
        email.year = self.today.strftime("%Y")
        email.articles = articles
        email_output = str(email)

        return email_output
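
A minimal end-to-end sketch (an assumed driver, not taken from the original file): it runs the filtering pipeline described above, writes the Drupal feed-import XML, and renders the weekly e-mail.

publisher = AINewsPublisher()
publisher.filter_and_process()           # date, blacklist, whitelist, classifier and duplicate filters
publisher.generate_feed_import()         # writes news.xml for the Drupal feed import
print publisher.generate_email_output()  # e-mail body built from the top-weekly-news XML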
Example #15
class AINewsPublisher:
    def __init__(self):
        self.debug = config["ainews.debug"]
        self.today = date.today()
        self.earliest_date = self.today - timedelta(days=int(config["ainews.period"]))
        self.db = AINewsDB()
        self.corpus = AINewsCorpus()
        self.duplicates = AINewsDuplicates()
        self.txtpro = AINewsTextProcessor()
        self.weka = AINewsWekaClassifier()

        self.articles = {}
        self.semiauto_email_output = ""

    def filter_and_process(self):
        self.articles = self.corpus.get_unprocessed()

        if len(self.articles) == 0:
            return

        # assume every article will be published; may be set to False from one
        # of the filtering processes below
        for urlid in self.articles:
            self.articles[urlid]["publish"] = True
            self.articles[urlid]["transcript"] = []

        # filter by date
        print "Filtering by date..."
        for urlid in self.articles:
            if self.articles[urlid]["pubdate"] == None:
                # give a meaningful pubdate so that other code doesn't crash
                self.articles[urlid]["pubdate"] = self.today
                self.articles[urlid]["publish"] = False
                self.articles[urlid]["transcript"].append("Rejected due to bogus publication date.")
            elif self.articles[urlid]["pubdate"] < self.earliest_date:
                self.articles[urlid]["publish"] = False
                self.articles[urlid]["transcript"].append(
                    (
                        "Rejected because article is too old "
                        + "(earliest valid date is %s while article was "
                        + "published on %s"
                    )
                    % (self.earliest_date.strftime("%F"), self.articles[urlid]["pubdate"].strftime("%F"))
                )

        # filter by blacklist (for urls)
        print "Filtering by blacklist..."
        for urlid in self.articles:
            for black in blacklist_urls:
                if re.search(black, self.articles[urlid]["url"]):
                    self.articles[urlid]["publish"] = False
                    self.articles[urlid]["transcript"].append(
                        ("Rejected because url matched blacklisted url %s" % black)
                    )
                    break

        # filter by whitelist
        print "Filtering by whitelist..."
        for urlid in self.articles:
            white_wordfreq = self.txtpro.whiteprocess(urlid, self.articles[urlid]["content"])
            self.articles[urlid]["white_wordfreq"] = white_wordfreq

            # require at least two different whitelisted terms
            # unless the article is user-submitted
            if len(white_wordfreq) < 2 and self.articles[urlid]["source"] != "User Submitted":
                self.articles[urlid]["publish"] = False
                self.articles[urlid]["transcript"].append("Rejected due to only one or no whitelisted terms")

        # update categories based on classifier predictions
        print "Classifying..."
        self.weka.predict(self.articles)

        # drop articles with no categories
        print "Dropping articles with no categories..."
        for urlid in self.articles:
            if len(self.articles[urlid]["categories"]) == 0:
                self.articles[urlid]["publish"] = False
                self.articles[urlid]["transcript"].append("Rejected due to no selected categories")

        # filter out duplicates; some articles may have 'publish' set to False
        # by this function
        print "Filtering duplicates..."
        self.duplicates.filter_duplicates(self.articles)

        for urlid in self.articles:
            print urlid, self.articles[urlid]["publish"], self.articles[urlid]["title"], self.articles[urlid][
                "categories"
            ], self.articles[urlid]["summary"]
            print

        print "Grabbing images..."
        for urlid in self.articles:
            # grab and convert article image (if it exists)
            self.grab_convert_image(self.articles[urlid])

            # update article in database
            self.update_db(self.articles[urlid])

        # mark each as processed
        print "Marking as processed."
        self.corpus.mark_processed(self.articles.itervalues())

    def grab_convert_image(self, article):
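        # Downloads article["image_url"], converts it to a 200x200 JPEG thumbnail
        # with ImageMagick mogrify, removes the extension-less original, and
        # records the Drupal-style public:// path in article["image_path"].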
        if len(article["image_url"]) == 0:
            article["image_path"] = ""
            return
        try:
            f = urllib2.urlopen(article["image_url"])
            img = open("%s%s" % (paths["ainews.image_dir"], str(article["urlid"])), "w")
            img.write(f.read())
            img.close()
            # produces [urlid].jpg
            Popen(
                "%s -format jpg -gravity Center -thumbnail 200x200 %s%s"
                % (paths["imagemagick.mogrify"], paths["ainews.image_dir"], str(article["urlid"])),
                shell=True,
            ).communicate()
            # remove [urlid] file (with no extension)
            remove("%s%s" % (paths["ainews.image_dir"], str(article["urlid"])))
            article["image_path"] = "public://newsfinder_images/%s.jpg" % article["urlid"]
        except Exception as e:
            print "Failed converting image for %d: %s" % (article["urlid"], e)
            article["image_path"] = ""

    def update_db(self, article):
        self.db.execute("delete from categories where urlid = %s", article["urlid"])
        for cat in article["categories"]:
            self.db.execute("insert into categories values (%s,%s)", (article["urlid"], cat))

    def generate_feed_import(self):
        """
        Generate XML file for feed import on the Drupal site.
        """
        xml = FeedImport()
        for article in self.articles.values():
            article["source"] = re.sub(r"&", "&amp;", article["source"])
        xml.news = self.articles.values()
        savefile(paths["ainews.output_xml"] + "news.xml", str(xml))

    def generate_email_output(self):
        articles = []
        try:
            f = urllib2.urlopen(paths["ainews.top_weekly_news_xml"])
            xml = etree.parse(f)
            for node in xml.iter("node"):
                print "Found", node.findtext("Title")
                published = node.findtext("Publication_date")
                articles.append(
                    {
                        "title": node.findtext("Title"),
                        "source": node.findtext("Source"),
                        "topics": re.sub(r"/topic/", "http://aitopics.org/topic/", node.findtext("Topics")),
                        "pubdate": date(int(published[0:4]), int(published[5:7]), int(published[8:10])),
                        "summary": re.sub(
                            r"</p>(</blockquote>)?$", "", re.sub(r"^(<blockquote>)?<p>", "", node.findtext("Body"))
                        ),
                        "url": node.findtext("Original_link"),
                        "link": re.sub(r"/news/", "http://aitopics.org/news/", node.findtext("Link")),
                        "image": re.sub(
                            r"<img",
                            '<img align="left" style="margin: 8px 8px 8px 0; border: 1px solid #ccc; padding: 5px; background: white;" ',
                            node.findtext("Representative_image"),
                        ),
                    }
                )
        except Exception, e:
            print e

        email = LatestNewsEmail()
        email.date = self.today.strftime("%B %d, %Y")
        email.year = self.today.strftime("%Y")
        email.articles = articles
        email_output = str(email)

        return email_output
Example #16
class AINewsCorpus:
    """
    A corpus is a set of news articles (each with a title, content,
    and categories) that are used for training and comparison
    purposes. For training, the corpus provides the training
    examples. For comparison, the corpus provides the data for various
    measures like word frequency. This is important in the prediction
    process: we only want to predict a new article's categories based
    on word frequencies, and other measures, from the corpus; we don't
    want articles that have not been "vetted" (articles not part of
    the corpus) to contribute to these measures.

    A corpus can be "loaded" via C{load_corpus()} or "restored" via
    C{restore_corpus()}. The difference is the following: when loading a
    corpus, word frequencies are measured and stored in the database
    table C{wordlist_eval}; when restoring a corpus, word frequencies
    are simply retrieved from the database table C{wordlist}. In other
    words, we load a corpus when we are training or evaluating our
    training procedures, and we restore a corpus when we are
    predicting.
    """
    def __init__(self):
        self.txtpro = AINewsTextProcessor()
        self.cache_urls = {}

        #: A dictionary of word=>word freq in corpus
        self.dftext = {}

        #: A dictionary of word=>wordid
        self.idwords = {}

        #: A dictionary of wordid=>word
        self.wordids = {}

        self.db = AINewsDB()

        self.categories = ["AIOverview","Agents", "Applications", \
                 "CognitiveScience", "Education", "Ethics", "Games", "History", \
                 "Interfaces", "MachineLearning", "NaturalLanguage", "Philosophy", \
                 "Reasoning", "Representation", "Robots", "ScienceFiction", \
                 "Speech", "Systems", "Vision"]

        self.retained_db_docs = None

        self.restore_corpus()

    def compare_articles(self, article1, article2):
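        # cmp-style comparator: order by duplicate count, then by the stored
        # source_relevance, then by number of categories (all ascending).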
        dupcount1 = len(article1['duplicates'])
        dupcount2 = len(article2['duplicates'])
        relevance1 = article1['source_relevance']
        relevance2 = article2['source_relevance']
        cat_count1 = len(article1['categories'])
        cat_count2 = len(article2['categories'])
        if cmp(dupcount1, dupcount2) == 0:
            if cmp(relevance1, relevance2) == 0:
                return cmp(cat_count1, cat_count2)
            else:
                return cmp(relevance1, relevance2)
        else:
            return cmp(dupcount1, dupcount2)

    def get_tfidf(self, urlid, wordfreq):
        """
        Helper function to retrieve the tfidf of each word based on the urlid.
        @param  urlid: target news story's urlid.
        @type  urlid: C{int}
        """
        if urlid in self.cache_urls:
            return self.cache_urls[urlid]
        wordid_freq_pairs = {}
        for word in wordfreq:
            if word in self.dftext:
                wordid_freq_pairs[self.idwords[word]] = (wordfreq[word],
                                                         self.dftext[word])

        data = {}
        distsq = 0.0
        for wordid in wordid_freq_pairs:
            tfidf = math.log(wordid_freq_pairs[wordid][0] + 1, 2) * \
                    (math.log(self.corpus_count + 1, 2) - \
                    math.log(wordid_freq_pairs[wordid][1] + 1, 2))
            data[wordid] = tfidf
            distsq += tfidf * tfidf
        dist = math.sqrt(distsq)
        if dist > 1.0e-9:
            for key in data:
                data[key] /= dist
        self.cache_urls[urlid] = data
        return data

    def cos_sim(self, tfidf1, tfidf2):
        """
        A helper function to compute the cosine similarity between a
        news story and a centroid.
        @param  tfidf1: target news story tfidf vector.
        @type  tfidf1: C{dict}
        @param tfidf2: centroid tfidf vector.
        @type  tfidf2: C{dict}
        """
        sim = 0.0
        for key in tfidf1:
            if key in tfidf2:
                word = self.wordids[key]
                a = tfidf1[key]
                b = tfidf2[key]
                sim += a * b
        return sim

    def get_article(self, urlid, corpus=False):
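        # corpus=True reads from the hand-labeled cat_corpus/cat_corpus_cats
        # tables (url, title, content only); corpus=False reads the full
        # crawled row from urllist/categories, including image_url and tfpn.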
        row = None
        if corpus:
            table = 'cat_corpus'
            cat_table = 'cat_corpus_cats'
            row = self.db.selectone("""select u.url, u.title, u.content
                from %s as u where u.urlid = %s""" % (table, urlid))

        else:
            table = 'urllist'
            cat_table = 'categories'
            row = self.db.selectone("""select u.url, u.title, u.content, u.summary, 
                u.pubdate, u.crawldate, u.processed, u.published, u.source,
                u.source_relevance, u.source_id, u.tfpn, u.image_url
                from %s as u where u.urlid = %s""" % \
                                        (table, urlid))
        if row != None and row[2] is not None:
            content = row[2]
            wordfreq = self.txtpro.simpletextprocess(urlid, content)
            summary = ""
            if not corpus: summary = row[3]
            processed = False
            if not corpus and row[6] == 1: processed = True
            published = False
            if not corpus and row[7] == 1: published = True
            pubdate = ""
            if not corpus: pubdate = row[4]
            crawldate = ""
            if not corpus: crawldate = row[5]
            source = ""
            if not corpus: source = row[8]
            tfpn = "xx"
            if not corpus: tfpn = row[11]
            source_relevance = 0
            if row[9]: source_relevance = int(row[9])
            categories = []
            cat_rows = self.db.selectall("""select category from %s
                where urlid = %s""" % (cat_table, urlid))
            for cat_row in cat_rows:
                categories.append(cat_row[0])
            return {
                'urlid': urlid,
                'url': row[0],
                'title': row[1],
                'content': content,
                'summary': summary,
                'pubdate': pubdate,
                'crawldate': crawldate,
                'processed': processed,
                'published': published,
                'source': source,
                'source_relevance': source_relevance,
                'source_id': row[10],
                'categories': categories,
                'duplicates': [],
                'tfpn': tfpn,
                'wordfreq': wordfreq,
                'image_url': row[12],
                'tfidf': self.get_tfidf(urlid, wordfreq)
            }
        else:
            return None

    def get_articles_daterange(self, date_start, date_end):
        articles = {}
        rows = self.db.selectall(
            """select urlid from urllist
            where pubdate >= %s and pubdate <= %s""", (date_start, date_end))
        for row in rows:
            articles[row[0]] = self.get_article(row[0])
        return articles

    def get_articles_idrange(self, urlid_start, urlid_end, corpus=False):
        articles = {}
        rows = self.db.selectall(
            """select urlid from urllist
            where urlid >= %s and urlid <= %s""", (urlid_start, urlid_end))
        for row in rows:
            art = self.get_article(row[0], corpus)
            if art is not None:
                articles[row[0]] = art
        return articles

    def get_unprocessed(self):
        articles = {}
        rows = self.db.selectall(
            "select urlid from urllist where processed = 0")
        for row in rows:
            articles[row[0]] = self.get_article(row[0])
        return articles

    def get_publishable(self):
        articles = []
        rows = self.db.selectall(
            "select urlid from urllist where "
            "publishable = 1 and published = 0 and pubdate != '0000-00-00'")
        for row in rows:
            articles.append(self.get_article(row[0]))
        return articles

    def get_published(self):
        articles = []
        rows = self.db.selectall(
            "select urlid from urllist where published = 1")
        for row in rows:
            articles.append(self.get_article(row[0]))
        return articles

    def mark_processed(self, articles):
        for article in articles:
            self.db.execute(
                "update urllist set processed = 1 where urlid = %s",
                article['urlid'])

    def mark_publishable(self, articles):
        for article in articles:
            self.db.execute(
                "update urllist set publishable = 1 where urlid = %s",
                article['urlid'])

    def mark_published(self, articles):
        for article in articles:
            self.db.execute(
                "update urllist set published = 1 where urlid = %s",
                article['urlid'])

    def restore_corpus(self):
        self.wordids = {}
        self.dftext = {}
        rows = self.db.selectall("select rowid, word, dftext from wordlist")
        for row in rows:
            self.wordids[row[0]] = row[1]
            self.idwords[row[1]] = row[0]
            self.dftext[row[1]] = row[2]
        self.corpus_count = self.db.selectone(
            "select count(*) from cat_corpus")[0]

    def add_freq_index(self, urlid, wordfreq, categories=[]):
        for word in wordfreq:
            self.wordcounts.setdefault(word, 0)
            self.wordcounts[word] += 1

    def commit_freq_index(self, table):
        self.dftext = {}
        self.wordids = {}
        for word in self.wordcounts:
            rowid = self.db.execute("insert into "+table+" (word, dftext) " + \
                "values(%s, %s)", (word, self.wordcounts[word]))
            self.wordids[rowid] = word
            self.idwords[word] = rowid
            self.dftext[word] = self.wordcounts[word]
        self.wordcounts = {}

    def load_corpus(self, ident, pct, debug=False, retain=False):
        if debug:
            print "Loading corpus..."
        source = ident.split(':')[0]
        name = ident.split(':')[1:]
        if source == "file":
            docs = self.load_file_corpus(name, debug)
        elif source == "db":
            docs = self.load_db_corpus(name, debug, retain)
        if debug: print

        random.shuffle(docs)
        offset = int(len(docs) * pct)
        if debug:
            print "Selecting random %d%% of corpus (%d docs)." % \
                    (pct * 100, offset)

        # sort train_corpus by urlid
        train_corpus = sorted(docs[0:offset], key=operator.itemgetter(0))
        self.corpus_count = len(train_corpus)

        # sort predict_corpus by urlid
        predict_corpus = sorted(docs[offset:offset+int(len(docs)*0.1)], \
                key=operator.itemgetter(0))

        self.db.execute("delete from wordlist_eval")
        self.db.execute("alter table wordlist_eval auto_increment = 0")
        self.wordids = {}
        self.wordcounts = {}
        self.cache_urls = {}
        for c in train_corpus:
            self.add_freq_index(c[0], c[1], c[2].split())
            if debug:
                sys.stdout.write('.')
                sys.stdout.flush()
        self.commit_freq_index('wordlist_eval')

        return (train_corpus, predict_corpus)

    def load_file_corpus(self, name, debug=False):
        wordsfile = paths['corpus.corpus_other'] + name[0] + ".mat.clabel"
        f = open(wordsfile, 'r')
        self.wordids = {}
        wordid = 1
        for line in f:
            self.wordids[int(wordid)] = line.strip()
            wordid += 1

        catsfile = paths['corpus.corpus_other'] + name[0] + ".mat.rlabel"
        f = open(catsfile, 'r')
        cats = {}
        uniqcats = set()
        docid = 0
        for line in f:
            cats[docid] = line.strip()
            uniqcats.add(line.strip())
            docid += 1
        self.categories = list(uniqcats)

        matfile = paths['corpus.corpus_other'] + name[0] + ".mat"
        f = open(matfile, 'r')
        f.readline()  # ignore first line
        docs = []
        docid = 0
        for line in f:
            wordfreq = {}
            for (wordid, freq) in izip(*[iter(line.split())] * 2):
                wordfreq[self.wordids[int(wordid)]] = int(float(freq))
            docs.append((docid, wordfreq, cats[docid]))
            docid += 1
            if debug:
                sys.stdout.write('.')
                sys.stdout.flush()
        return docs
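
    # Assumed on-disk layout, inferred from the parsing above (treat as a
    # sketch): <name>.mat.clabel holds one word per line (word ids start at 1),
    # <name>.mat.rlabel holds one category label per document, and <name>.mat
    # starts with a header line followed by one "wordid freq wordid freq ..."
    # line per document, where freq may be written as a float, e.g.:
    #
    #   example.mat:
    #       <header line, ignored>
    #       1 3.0 2 1.0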

    def load_db_corpus(self, name, debug=False, retain=False):
        # return the previously retained documents, if any, before issuing
        # the (potentially expensive) corpus query again
        if retain and self.retained_db_docs is not None:
            return self.retained_db_docs
        rows = self.db.selectall("""select c.urlid, c.content,
            group_concat(cc.category separator ' ')
            from %s as c, %s as cc
            where c.urlid = cc.urlid
            group by c.urlid order by c.urlid desc""" % (name[0], name[1]))
        if debug: print "Processing %d articles..." % len(rows)
        docs = []
        for row in rows:
            wordfreq = self.txtpro.simpletextprocess(row[0], row[1])
            if wordfreq.N() > 0 and 'NotRelated' not in row[2].split(' '):
                docs.append((row[0], wordfreq, row[2]))
            if debug:
                sys.stdout.write('.')
                sys.stdout.flush()
        if retain:
            self.retained_db_docs = docs
        return docs
Example #17
0
class AINewsPublisher():
    def __init__(self):
        self.debug = config['ainews.debug']
        self.today = date.today()
        self.earliest_date = self.today - timedelta(days = int(config['ainews.period']))
        self.db = AINewsDB()
        self.corpus = AINewsCorpus()
        self.duplicates = AINewsDuplicates()
        self.svm_classifier = AINewsSVMClassifier()
        self.txtpro = AINewsTextProcessor()
        self.summarizer = AINewsSummarizer()

        self.articles = {}
        self.publishable_articles = []
        self.semiauto_email_output = ""

        self.topicids = {"AIOverview":0, "Agents":1, "Applications":2,
           "CognitiveScience":3, "Education":4,"Ethics":5, 
           "Games":6, "History":7, "Interfaces":8, "MachineLearning":9,
           "NaturalLanguage":10, "Philosophy":11, "Reasoning":12,
           "Representation":13, "Robots":14, "ScienceFiction":15,"Speech":16,
           "Systems":17,  "Vision":18}

    def filter_and_process(self):
        self.articles = self.corpus.get_unprocessed()

        if len(self.articles) == 0: return

        # assume every article will be published; may be set to False from one
        # of the filtering processes below
        for urlid in self.articles:
            self.articles[urlid]['publish'] = True
            self.articles[urlid]['transcript'] = []

        # filter by date
        for urlid in self.articles:
            if self.articles[urlid]['pubdate'] is None:
                # give a meaningful pubdate so that other code doesn't crash
                self.articles[urlid]['pubdate'] = self.today
                self.articles[urlid]['publish'] = False
                self.articles[urlid]['transcript'].append("Rejected due to bogus publication date.")
            elif self.articles[urlid]['pubdate'] < self.earliest_date:
                self.articles[urlid]['publish'] = False
                self.articles[urlid]['transcript'].append(
                        ("Rejected because article is too old " +
                        "(earliest valid date is %s while article was " +
                        "published on %s)") % (self.earliest_date.strftime('%F'),
                            self.articles[urlid]['pubdate'].strftime('%F')))

        # filter by blacklist (for urls)
        for urlid in self.articles:
            for black in blacklist_urls:
                if re.search(black, self.articles[urlid]['url']):
                    self.articles[urlid]['publish'] = False
                    self.articles[urlid]['transcript'].append(
                        ("Rejected because url matched blacklisted url %s" % black))
                    break

        # filter by whitelist
        for urlid in self.articles:
            white_wordfreq = self.txtpro.whiteprocess(urlid,
                    self.articles[urlid]['content'])
            self.articles[urlid]['white_wordfreq'] = white_wordfreq

            # require at least two different whitelisted terms
            # unless the article is user-submitted
            if len(white_wordfreq) < 2 \
                    and self.articles[urlid]['publisher'] != 'UserSubmitted':
                self.articles[urlid]['publish'] = False
                self.articles[urlid]['transcript'].append(
                        'Rejected due to only one or no whitelisted terms')

        # update categories based on SVM classifier predictions
        self.svm_classifier.predict(self.articles)

        # drop articles classified as 'NotRelated' unless the article
        # is user-submitted
        for urlid in self.articles:
            if 'NotRelated' in self.articles[urlid]['categories'] \
                    and self.articles[urlid]['publisher'] != 'UserSubmitted':
                self.articles[urlid]['publish'] = False
                self.articles[urlid]['transcript'].append(
                        'Rejected due to NotRelated classification')

        # drop articles with no categories (even if user-submitted)
        for urlid in self.articles:
            if len(self.articles[urlid]['categories']) == 0:
                self.articles[urlid]['publish'] = False
                self.articles[urlid]['transcript'].append(
                        'Rejected due to no selected categories')

        # filter out duplicates; some articles may have 'publish' set to False
        # by this function
        self.duplicates.filter_duplicates(self.articles)

        # add article summaries
        self.summarizer.summarize(self.corpus, self.articles)

        for urlid in self.articles:
            try:
                print urlid, self.articles[urlid]['publish'], \
                    self.articles[urlid]['title'], \
                    self.articles[urlid]['categories'], \
                    self.articles[urlid]['summary']
                print
            except:
                pass

        for urlid in self.articles:
            # update article in database
            self.update_db(self.articles[urlid])

        # mark each as processed
        self.corpus.mark_processed(self.articles.itervalues())

        # save sorted list of articles to be read by AINewsPublisher; sort by
        # duplicate count (more = better), then relevance of source,
        # then by number of categories (more = better)
        unpublished_articles = sorted(
                filter(lambda x: x['publish'], self.articles.values()),
                cmp=lambda x,y: self.corpus.compare_articles(x, y),
                reverse = True)

        max_cat_count = int(config['publisher.max_cat_count'])
        max_count = int(config['publisher.max_count'])
        cat_counts = {}
        for cat in self.corpus.categories:
            cat_counts[cat] = 0
        # choose stories such that no category has more than max_cat_count
        # members and no more than max_count stories have been selected
        # (independent of category); only one of the article's categories needs
        # to have "free space"
        self.publishable_articles = []
        for article in unpublished_articles:
            if len(self.publishable_articles) == max_count:
                break
            free_cat = False
            for cat in article['categories']:
                if cat_counts[cat] < max_cat_count:
                    free_cat = True
                    break
            # if there is a free category or this article has only the
            # Applications category, then it can be published
            if free_cat or (article['categories'] == ['Applications']):
                self.publishable_articles.append(article)
                self.articles[article['urlid']]['transcript'].append('Published')
                self.articles[article['urlid']]['published'] = True
                for cat in article['categories']:
                    cat_counts[cat] += 1

        # record that these articles are publishable
        self.corpus.mark_publishable(self.publishable_articles)
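
    # A hypothetical end-to-end driver (not in the original source) would call
    # the steps of this class in roughly this order:
    #
    #   publisher = AINewsPublisher()
    #   publisher.filter_and_process()
    #   publisher.get_publishable_articles()
    #   publisher.generate_pmwiki_all_output()
    #   publisher.generate_pmwiki_published_output()
    #   publisher.generate_email_output()
    #   publisher.update_rss()
    #   publisher.mark_published()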

    def update_db(self, article):
        self.db.execute("delete from categories where urlid = %s", article['urlid'])
        for cat in article['categories']:
            self.db.execute("insert into categories values (%s,%s)",
                (article['urlid'], cat))
        self.db.execute("update urllist set summary = %s where urlid = %s",
                        (article['summary'], article['urlid']))

    def get_publishable_articles(self):
        publishable = self.corpus.get_publishable()

        self.publishable_articles = []

        # drop "Applications" category if article has more categories
        for article in publishable:
            if len(article['categories']) > 1:
                article['categories'] = filter(lambda c: c != "Applications",
                                               article['categories'])
            self.publishable_articles.append(article)


    def mark_published(self):
        self.corpus.mark_published(self.publishable_articles)

    def generate_standard_output(self): 
        """
        Generate the standard output for debugging on screen.
        """
        txt = LatestNewsTxt()
        txt.news = self.publishable_articles
        savefile(paths['ainews.output'] + "std_output.txt", str(txt))

    def generate_email_output(self):
        """
        Generate the output for email format.
        """
        email = LatestNewsEmail()
        email.date = self.today.strftime("%B %d, %Y")
        email.year = self.today.strftime("%Y")
        email.news = self.publishable_articles
        email.aitopic_urls = aitopic_urls
        email.topicids = self.topicids
        email_output = str(email)

        savefile(paths['ainews.output'] + "email_output.txt", email_output)
        self.semiauto_email_output = email_output

    def generate_pmwiki_all_output(self):
        pmwiki_all = AllNewsPmWiki()
        pmwiki_all.date = self.today.strftime("%B %d, %Y")
        pmwiki_all.year = self.today.strftime("%Y")
        pmwiki_all.news = self.articles.values()
        savefile(paths['ainews.output'] + "pmwiki_all.txt", str(pmwiki_all))

        # Generate wiki metadata page for each article
        urlids_output = ""
        for urlid in self.articles:
            urlids_output += str(urlid) + '\n'
            article_wiki = ArticlePmWiki()
            article_wiki.year = self.today.strftime("%Y")
            article_wiki.dupthreshold = float(config['duplicates.threshold'])
            article_wiki.n = self.articles[urlid]
            savefile(paths['ainews.output'] + "aiarticles/%d" % urlid,
                    str(article_wiki))
        savefile(paths['ainews.output'] + "urlids_output.txt", urlids_output)
        
    def generate_pmwiki_published_output(self):
        """
        Generate the output in PmWiki page format. It needs to be further
        processed by AINewsPmwiki.php.
        """
        pmwiki = LatestNewsPmWiki()
        pmwiki.date = self.today.strftime("%B %d, %Y")
        pmwiki.year = self.today.strftime("%Y")
        pmwiki.news = self.publishable_articles
        pmwiki.rater = True
        savefile(paths['ainews.output'] + "pmwiki_output.txt", str(pmwiki))
        pmwiki.rater = False
        savefile(paths['ainews.output'] + "pmwiki_output_norater.txt", str(pmwiki))

    def publish_email(self):
        """
        Call AINewsEmail.php to send email through PHP Mail Server
        """
        #cmd = 'php AINewsEmail.php'
        #Popen(cmd, shell = True, stdout = PIPE, stderr = STDOUT).communicate()
        self.publish_email_semiauto()
        
    def publish_email_semiauto(self):
        """
        Create an AINewsSemiAutoEmail.html file for admin to click and semi-auto
        send it to the subscriber list.
        """
        semiauto = """
        <html>
        <body>
        <h1>AI Alert - SemiAuto Sender</h1>
        <form action="http://aaai.org/cgi-dada/mail.cgi?flavor=send_email" method='post'>
        <!-- <form action="welcome.php" method="post"> -->
        <input type='hidden' name='f' value='send_email' />
        <input type='hidden' name='process' value='true' />
        <input type='hidden' name='admin_list' value='alert' />
        <input type='hidden' name='message_subject' value="%s" />
        <input type='hidden' name='email_format' value='HTML' />
        <textarea type='hidden' name="text_message_body">%s</textarea>
        <input type='submit' value='Submit Mailing List Message' />
        </form>
        <h2>Please review the email below. If there are concerns, contact Bruce or Reid:</h2>
        <p>
        %s
        </p>
        </body>
        </html>
        """ % ("AI Alert - "+str(self.today.strftime("%B %d, %Y")),
               self.semiauto_email_output, self.semiauto_email_output)
        savefile(paths['ainews.html'] + "semiauto_email.html", semiauto)

    def publish_pmwiki(self):
        """
        Call AINewsPmwiki.php to publish latest news to AAAI Pmwiki website.
        """
        cmd = 'php AINewsPmwiki.php'
        Popen(cmd, shell = True).wait()
        
    def update_rss(self):
        rssitems = []
        # insert latest news into rssitems
        for article in self.publishable_articles:
            rssitems.append(PyRSS2Gen.RSSItem(
                title = article['title'],
                link = article['url'],
                description = article['summary'],
                guid = PyRSS2Gen.Guid(article['url']),
                pubDate = datetime(article['pubdate'].year, \
                    article['pubdate'].month, article['pubdate'].day)))
            
        rssfile = paths['ainews.rss'] + "news.xml"
        publish_rss(rssfile, rssitems)
        
        
        topicrsses = ['overview', 'agent', 'apps', 'cogsci', 'edu', 'ethsoc', 
            'game', 'hist', 'interf', 'ml', 'nlp', 'phil', 'reason',
             'rep', 'robot', 'scifi', 'speech', 'systems',  'vision']
        topicitems = []
        for i in range(len(topicrsses)):
            topicitems.append([])
        urlset = set()
        for article in self.publishable_articles:
            if article['url'] in urlset: continue
            urlset.add(article['url'])
            for cat in article['categories']:
                topicid = self.topicids[cat]
                topicitems[topicid].append(PyRSS2Gen.RSSItem(
                        title = article['title'],
                        link = article['url'],
                        description = article['summary'],
                        guid = PyRSS2Gen.Guid(article['url']),
                        pubDate = datetime(article['pubdate'].year, \
                            article['pubdate'].month, article['pubdate'].day)))
            
        for i in range(len(topicrsses)):
            rssfile = paths['ainews.rss'] + topicrsses[i]+'.xml'
            if len(topicitems[i]) != 0:
                publish_rss(rssfile, topicitems[i])
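
publish_rss is imported from elsewhere and is not shown in this listing. The sketch below is one plausible implementation built on PyRSS2Gen, which update_rss already uses to build its items; the channel title, link, and description strings are placeholders rather than values taken from the original project.

import datetime
import PyRSS2Gen

def publish_rss(rssfile, rssitems):
    # wrap the prepared RSSItem objects in a channel and write it to disk
    rss = PyRSS2Gen.RSS2(
        title="AI Alert",                    # placeholder channel title
        link="http://example.org/ainews",    # placeholder channel link
        description="Latest AI news items",  # placeholder description
        lastBuildDate=datetime.datetime.now(),
        items=rssitems)
    out = open(rssfile, 'w')
    try:
        rss.write_xml(out)
    finally:
        out.close()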
Example #18
0
class AINewsCrawler:
    def __init__(self):
        self.today = date.today()
        self.earliest_date = self.today - timedelta(
            days=int(config['ainews.period']))
        self.db = AINewsDB()
        self.summarizer = AINewsSummarizer()
        self.articles = []

    def get_sources(self, opts):
        """
        Get the news source list.
        """
        sources = []
        csv_file = csv.reader(urllib2.urlopen(paths['ainews.sources_csv']))
        header = True
        for row in csv_file:
            if header:
                header = False
                continue
            if len(opts) == 0 or (opts[0][0] == '--source'
                                  and opts[0][1] == row[1]):
                sources.append({
                    'source_id': row[0],
                    'title': row[1],
                    'link': row[2],
                    'parser': row[3],
                    'relevance': int(row[4])
                })
        return sources
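
    # The sources CSV is assumed (based on the indexing above) to have a header
    # row followed by rows of the form "source_id,title,link,parser,relevance".
    # Passing an option pair restricts crawling to the feed whose title matches
    # the second column; the title below is made up for illustration:
    #
    #   crawler = AINewsCrawler()
    #   sources = crawler.get_sources([('--source', 'Example AI Feed')])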

    def fetch_all_sources(self, opts):
        for source in self.get_sources(opts):
            print "CRAWL: Crawling \"%s\"..." % source['title']
            try:
                f = feedparser.parse(source['link'])
            except Exception, e:
                print "Exception while parsing feed: %s" % (source['link'], )
                print e
                continue

            for entry in f.entries:
                d = None
                try:
                    if hasattr(entry, 'published_parsed'):
                        d = date(entry.published_parsed[0],
                                 entry.published_parsed[1],
                                 entry.published_parsed[2])
                    else:
                        d = date(entry.updated_parsed[0],
                                 entry.updated_parsed[1],
                                 entry.updated_parsed[2])
                except Exception, e:
                    print e
                    print entry
                    print "Setting date as today; could not parse date for feed", \
                        source['link']
                    d = self.today
                if d > self.today or d < self.earliest_date: continue
                if entry.title[-6:] == '(blog)' \
                        or entry.title[-15:] == '(press release)':
                    print "Blog or press release in title. (%s) (%s)" % \
                        (entry.link, entry.title)
                    continue
                try:
                    url = urllib2.urlopen(entry.link).geturl()
                except KeyboardInterrupt:
                    print "Quitting early due to keyboard interrupt."
                    sys.exit()
                except:
                    continue

                # attempt to skip blogs
                if re.match('^.*blog.*$', url):
                    print "'blog' in url (%s) (%s)" % \
                        (entry.link, entry.title)
                    continue
                # attempt to skip job postings
                if re.match('^.*job.*$', url):
                    print "'job' in url (%s) (%s)" % \
                        (entry.link, entry.title)
                    continue
                # skip urls we have already crawled
                if self.db.crawled(url):
                    print "Seen this url before (%s) (%s)" % \
                        (entry.link, entry.title)
                    continue

                title = cgi.escape(convert_to_printable(entry.title)).strip()

                # if source is Google News, extract the true source from the
                # title, which has the form "<headline> - <source>"
                if re.match(r'^.*Google News.*$', source['title']):
                    m = re.match(r'^(.*) - (.+)$', title)
                    if m:
                        title = m.group(1)
                        true_source = "%s via Google News" % m.group(2)
                    else:
                        true_source = source['title']
                else:
                    true_source = source['title']

                self.articles.append({
                    'url': url,
                    'title': title,
                    'pubdate': d,
                    'source': true_source,
                    'source_id': source['source_id'],
                    'source_relevance': source['relevance']
                })