Example #1
def evaluate():
    corpus = AINewsCorpus()
    print "urlid,length truewords,length justext,length goose,ld justtext,ld goose"
    for filename in sorted(glob.glob("../../experiments/justext/*.true")):
        truetext = ents.convert(open(filename).read())
        truetext = re.sub(r'[^\w\s]', ' ', trunc(truetext, max_pos=3000, ellipsis=False))
        truewords = re.split(r'\s+', truetext)
        urlid = filename[26:30]
        article = corpus.get_article(urlid)
        if article is None: continue
        articletext = re.sub(r'[^\w\s]', ' ', trunc((article['content_all']).encode('ascii'), max_pos=3000, ellipsis=False))
        articlewords = re.split(r'\s+', articletext)
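        # invoke the Goose extractor through Maven as an external process and capture its stdout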
        goosecmd = "cd /home/josh/aitopics/AINews/tools/goose; /opt/maven/bin/mvn exec:java -Dexec.mainClass=com.jimplush.goose.TalkToMeGoose -Dexec.args='%s' -q 2>>/home/josh/log.txt" % article['url']
        (stdout, _) = Popen(goosecmd, shell = True, stdout = PIPE).communicate()
        goosetext = ents.convert(stdout.encode('ascii'))
        goosetext = re.sub(r'[^\w\s]', ' ', trunc(goosetext, max_pos=3000, ellipsis=False))
        goosewords = re.split(r'\s+', goosetext)
        ld_1 = (levenshtein_distance(truewords, articlewords))/float(len(truewords))
        ld_2 = (levenshtein_distance(truewords, goosewords))/float(len(truewords))
        print "%s,%d,%d,%d,%.4f,%.4f" % \
            (urlid, len(truewords), len(articlewords), len(goosewords), ld_1, ld_2)
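The evaluation above treats each *.true file as hand-checked ground truth and scores the two extractions (the corpus's content_all text and the Goose output) by word-level Levenshtein distance divided by the number of true words, so lower values mean the extracted text is closer to the ground truth. The levenshtein_distance helper itself is not shown; a minimal sketch, assuming it is the usual dynamic-programming edit distance applied to lists of words rather than characters:

def levenshtein_distance(a, b):
    # Edit distance between two sequences (here: lists of words).
    # This is an assumed implementation; the project's own helper is not shown above.
    previous = list(range(len(b) + 1))
    for i, word_a in enumerate(a, 1):
        current = [i]
        for j, word_b in enumerate(b, 1):
            cost = 0 if word_a == word_b else 1
            current.append(min(previous[j] + 1,          # deletion
                               current[j - 1] + 1,       # insertion
                               previous[j - 1] + cost))  # substitution
        previous = current
    return previous[-1]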
Example #2
File: DAF.py Project: ARTFL/dodgr
                                 access=mmap.ACCESS_READ)
            file_encoding = chardet.detect(file_map[10000:50000])['encoding']

            log.debug('file_path: %s file_size: %d file_encoding: %s',
                       file_path, file_size, file_encoding)

        entry = {}
        headword = row['headword']

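        # decode the headword with the detected file encoding; fall back to try_decoding on failure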
        try:
            headword = headword.decode(file_encoding)
        except UnicodeDecodeError, e:
            log.warning('Bad encoding in headword, dico %s, file_path %s,'
                        'error text: %s', dico_id, file_path, str(e))
            headword = try_decoding(headword, tried_encoding=file_encoding)
        headword = ents.convert(headword)
        if not isinstance(headword, unicode):
            raise Exception("somehow headword isn't unicode: %s" % headword)
        entry['headwords'] = [(headword, None)]

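        # slice the raw entry out of the memory-mapped file using the row's byte offsets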
        start_byte = int(row['startbyte'])
        end_byte = start_byte + int(row['byteoff'])
        content = file_map[start_byte:end_byte]
        try:
            content = content.decode(file_encoding)
        except UnicodeDecodeError, e:
            log.warning('Bad encoding in content, dico %s, file_path %s,'
                        'error text: %s', dico_id, file_path, str(e))
            content = try_decoding(content, tried_encoding=file_encoding)

        content = ents.convert(content)
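Both decode attempts above fall back to try_decoding when the encoding detected by chardet fails; that helper is not part of this excerpt. A plausible sketch, assuming it simply retries a short list of common encodings, skipping the one that already failed:

def try_decoding(raw, tried_encoding=None):
    # Hypothetical fallback decoder; the encoding list and the permissive
    # last resort below are assumptions, not the project's actual code.
    for encoding in ('utf-8', 'cp1252', 'latin-1'):
        if encoding == tried_encoding:
            continue  # the caller already failed with this encoding
        try:
            return raw.decode(encoding)
        except UnicodeDecodeError:
            continue
    # last resort: decode permissively, replacing undecodable bytes
    return raw.decode('utf-8', 'replace')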
Example #3
    def crawl(self, opts):
        """
        Crawl the news by source lists (Search page or RSS).
        """
        rows = self.get_newssources(opts)
        for row in rows:
            sourcepage_url = row[0]
            publisher = row[1]
            sourcetype = row[2]
            tag = row[3]
            parser = ParserFactory(publisher, sourcetype)
            if parser is None: continue
            if self.debug: print "Crawling %s (%s):" % (publisher, tag)
            try:
                parser.parse_sourcepage(sourcepage_url)
                parser.parse_storypage()
                for candidate in parser.candidates:
                    if len(candidate) != 4: continue
                    url = candidate[0].encode('utf-8')
                    print "Fetching", url
                    title = convert_to_printable(ents.convert((re.sub(r'\s+', ' ', candidate[1])))).strip()
                    # if publisher is GoogleNews, extract true publisher from title
                    if publisher == "GoogleNews":
                        print title
                        true_publisher = re.match(r'^.* - (.+)$', title).group(1)
                        true_publisher = "%s via Google News" % true_publisher
                    elif publisher == "UserSubmitted":
                        true_publisher = re.match(r'^[^\/]+:\/\/([^\/]+)(?::\d+)?\/?.*$', url).group(1)
                        true_publisher = "%s (User submitted)" % true_publisher
                    else: true_publisher = publisher

                    # removing site title like " - NPR"
                    title = re.sub(r'\s+[:-]\s+.*$', '', title)
                    pubdate = candidate[2]
                    content = convert_to_printable(ents.convert((re.sub(r'\s+', ' ', candidate[3])))).strip()
                    if isinstance(title, types.StringType):
                        title = unicode(title, errors = 'ignore')
                    if isinstance(content, types.StringType):
                        content = unicode(content, errors = 'ignore')
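                    # strip the duplicated title and common boilerplate phrases from the body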
                    content = re.sub("\\s*%s\\s*" % re.escape(title), '', content)
                    content = re.sub(r'\s*Share this\s*', '', content)
                    content = re.sub(r'\s+,\s+', ', ', content)
                    content = re.sub(r'\s+\.', '.', content)

                    if len(title) < 5 or len(content) < 2000:
                        print "Content or title too short"
                        continue

                    # shorten content to (presumably) ignore article comments
                    content = trunc(content, max_pos=3000)

                    # remove content with blacklisted words
                    found_blacklist_word = False
                    for word in blacklist_words:
                        if re.search("\W%s\W" % word, content, re.IGNORECASE) != None:
                            print "Found blacklisted word \"%s\", ignoring article." % word
                            found_blacklist_word = True
                            break
                    if found_blacklist_word: 
                        continue

                    urlid = self.put_in_db(url, pubdate, self.today, true_publisher, \
                            tag, title, content)
                    if urlid is None: continue
                    try:
                        print "{ID:%d} %s (%s, %s)" % (urlid, title, str(pubdate), true_publisher)
                    except:
                        pass

            except (KeyboardInterrupt):
                if self.debug: print "Quitting early due to keyboard interrupt."
                sys.exit()
            except:
                if self.debug:
                    print "Parser for %s failed." % publisher
                    traceback.print_exc()
                continue
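Two details of crawl() are easy to miss: for GoogleNews items the true publisher is recovered from the trailing " - Publisher" suffix of the title before that suffix is stripped, and for user-submitted items it is taken from the URL's host. A small stand-alone illustration of those regexes, with made-up sample strings:

import re

title = "Robots learn to walk - NPR"                        # hypothetical sample
url = "http://www.example.com/2011/robots-learn-to-walk"    # hypothetical sample

# GoogleNews: the real publisher trails the title after " - "
publisher = re.match(r'^.* - (.+)$', title).group(1)                   # 'NPR'

# UserSubmitted: the publisher is the URL's host
host = re.match(r'^[^\/]+:\/\/([^\/]+)(?::\d+)?\/?.*$', url).group(1)  # 'www.example.com'

# the site name is then stripped from the title itself
clean_title = re.sub(r'\s+[:-]\s+.*$', '', title)                      # 'Robots learn to walk'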