import glob
import re
from subprocess import Popen, PIPE

# AINewsCorpus, ents, trunc, and levenshtein_distance are project-local
# helpers assumed to be imported from the AINews modules.

def evaluate():
    corpus = AINewsCorpus()
    print "urlid,length truewords,length justext,length goose,ld justext,ld goose"
    for filename in sorted(glob.glob("../../experiments/justext/*.true")):
        # Normalize the hand-corrected "true" text: convert entities, strip
        # punctuation, truncate, and split into words.
        truetext = ents.convert(open(filename).read())
        truetext = re.sub(r'[^\w\s]', ' ',
                          trunc(truetext, max_pos=3000, ellipsis=False))
        truewords = re.split(r'\s+', truetext)

        # The urlid is encoded in the filename.
        urlid = filename[26:30]
        article = corpus.get_article(urlid)
        if article is None:
            continue

        # The stored article content was extracted with justext.
        articletext = re.sub(r'[^\w\s]', ' ',
                             trunc(article['content_all'].encode('ascii'),
                                   max_pos=3000, ellipsis=False))
        articlewords = re.split(r'\s+', articletext)

        # Extract the same page with Goose, invoked through Maven.
        goosecmd = ("cd /home/josh/aitopics/AINews/tools/goose; "
                    "/opt/maven/bin/mvn exec:java "
                    "-Dexec.mainClass=com.jimplush.goose.TalkToMeGoose "
                    "-Dexec.args='%s' -q 2>>/home/josh/log.txt") % article['url']
        (stdout, _) = Popen(goosecmd, shell=True, stdout=PIPE).communicate()
        goosetext = ents.convert(stdout.encode('ascii'))
        goosetext = re.sub(r'[^\w\s]', ' ',
                           trunc(goosetext, max_pos=3000, ellipsis=False))
        goosewords = re.split(r'\s+', goosetext)

        # Word-level Levenshtein distances, normalized by the length of the
        # true text.
        ld_1 = levenshtein_distance(truewords, articlewords) / float(len(truewords))
        ld_2 = levenshtein_distance(truewords, goosewords) / float(len(truewords))
        print "%s,%d,%d,%d,%.4f,%.4f" % \
            (urlid, len(truewords), len(articlewords), len(goosewords), ld_1, ld_2)
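# NOTE: evaluate() above relies on a word-level levenshtein_distance()
# helper defined elsewhere in the project. The function below is only a
# minimal sketch of such an edit distance (standard dynamic programming
# over lists of words), not necessarily the implementation the project
# actually uses.
def levenshtein_distance(seq1, seq2):
    """Edit distance between two sequences, e.g. lists of words."""
    previous = range(len(seq2) + 1)
    for i, a in enumerate(seq1):
        current = [i + 1]
        for j, b in enumerate(seq2):
            # deletion, insertion, substitution (or match)
            current.append(min(previous[j + 1] + 1,
                               current[j] + 1,
                               previous[j] + (0 if a == b else 1)))
        previous = current
    return previous[-1]

# e.g. levenshtein_distance(['a', 'b', 'c'], ['a', 'c']) == 1 (one deletion)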
                access=mmap.ACCESS_READ)  # tail of the mmap.mmap(...) call that maps the dictionary file read-only

# Guess the file encoding from a sample of the mapped bytes.
file_encoding = chardet.detect(file_map[10000:50000])['encoding']
log.debug('file_path: %s file_size: %d file_encoding: %s',
          file_path, file_size, file_encoding)

entry = {}

# Decode the headword, falling back to try_decoding() on bad bytes.
headword = row['headword']
try:
    headword = headword.decode(file_encoding)
except UnicodeDecodeError, e:
    log.warning('Bad encoding in headword, dico %s, file_path %s, '
                'error text: %s', dico_id, file_path, str(e))
    headword = try_decoding(headword, tried_encoding=file_encoding)
headword = ents.convert(headword)
if type(headword) != unicode:
    raise Exception("somehow headword isn't unicode: %s" % headword)
entry['headwords'] = [(headword, None)]

# Slice the entry's content out of the memory-mapped file by byte offsets,
# then decode it the same way as the headword.
start_byte = int(row['startbyte'])
end_byte = start_byte + int(row['byteoff'])
content = file_map[start_byte:end_byte]
try:
    content = content.decode(file_encoding)
except UnicodeDecodeError, e:
    log.warning('Bad encoding in content, dico %s, file_path %s, '
                'error text: %s', dico_id, file_path, str(e))
    content = try_decoding(content, tried_encoding=file_encoding)
content = ents.convert(content)
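# NOTE: try_decoding() is called above but defined elsewhere; its name and
# keyword argument are taken from the calls above, while the body below is
# only a guessed sketch of one plausible fallback strategy: retry a few
# common encodings other than the one chardet already suggested, then decode
# with replacement characters as a last resort.
def try_decoding(raw, tried_encoding=None):
    for encoding in ('utf-8', 'cp1252', 'latin-1'):
        if encoding == tried_encoding:
            continue
        try:
            return raw.decode(encoding)
        except UnicodeDecodeError:
            continue
    # Last resort: never fail, but mark undecodable bytes with U+FFFD.
    return raw.decode('utf-8', 'replace')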
def crawl(self, opts):
    """Crawl the news by source lists (search page or RSS)."""
    rows = self.get_newssources(opts)
    for row in rows:
        sourcepage_url = row[0]
        publisher = row[1]
        sourcetype = row[2]
        tag = row[3]
        parser = ParserFactory(publisher, sourcetype)
        if parser is None:
            continue
        if self.debug:
            print "Crawling %s (%s):" % (publisher, tag)
        try:
            parser.parse_sourcepage(sourcepage_url)
            parser.parse_storypage()
            for candidate in parser.candidates:
                if len(candidate) != 4:
                    continue
                url = candidate[0].encode('utf-8')
                print "Fetching", url
                title = convert_to_printable(
                    ents.convert(re.sub(r'\s+', ' ', candidate[1]))).strip()

                # If the publisher is Google News or a user submission,
                # extract the true publisher from the title or the URL.
                if publisher == "GoogleNews":
                    print title
                    true_publisher = re.match(r'^.* - (.+)$', title).group(1)
                    true_publisher = "%s via Google News" % true_publisher
                elif publisher == "UserSubmitted":
                    true_publisher = re.match(
                        r'^[^\/]+:\/\/([^\/]+)(?::\d+)?\/?.*$', url).group(1)
                    true_publisher = "%s (User submitted)" % true_publisher
                else:
                    true_publisher = publisher

                # Remove a trailing site name such as " - NPR" from the title.
                title = re.sub(r'\s+[:-]\s+.*$', '', title)
                pubdate = candidate[2]
                content = convert_to_printable(
                    ents.convert(re.sub(r'\s+', ' ', candidate[3]))).strip()
                if isinstance(title, types.StringType):
                    title = unicode(title, errors='ignore')
                if isinstance(content, types.StringType):
                    content = unicode(content, errors='ignore')

                # Clean up the content: drop a repeated title, "Share this"
                # links, and stray whitespace around punctuation.
                content = re.sub("\\s*%s\\s*" % re.escape(title), '', content)
                content = re.sub(r'\s*Share this\s*', '', content)
                content = re.sub(r'\s+,\s+', ', ', content)
                content = re.sub(r'\s+\.', '.', content)
                if len(title) < 5 or len(content) < 2000:
                    print "Content or title too short"
                    continue

                # Shorten content to (presumably) ignore article comments.
                content = trunc(content, max_pos=3000)

                # Skip articles containing blacklisted words.
                found_blacklist_word = False
                for word in blacklist_words:
                    if re.search(r"\W%s\W" % word, content, re.IGNORECASE) is not None:
                        print "Found blacklisted word \"%s\", ignoring article." % word
                        found_blacklist_word = True
                        break
                if found_blacklist_word:
                    continue

                urlid = self.put_in_db(url, pubdate, self.today, true_publisher,
                                       tag, title, content)
                if urlid is None:
                    continue
                try:
                    print "{ID:%d} %s (%s, %s)" % \
                        (urlid, title, str(pubdate), true_publisher)
                except:
                    pass
        except KeyboardInterrupt:
            if self.debug:
                print "Quitting early due to keyboard interrupt."
            sys.exit()
        except:
            if self.debug:
                print "Parser for %s failed." % publisher
                traceback.print_exc()
            continue
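# NOTE: trunc() is used by both crawl() and evaluate() with max_pos and
# ellipsis keyword arguments but is defined in the project's utility code;
# the sketch below is only one plausible implementation consistent with
# those call sites (cut at a word boundary near max_pos and optionally
# append an ellipsis), and the default values here are illustrative.
def trunc(text, max_pos=3000, ellipsis=True):
    if len(text) <= max_pos:
        return text
    # Cut at the last space before max_pos so no word is split in half.
    cut = text.rfind(' ', 0, max_pos)
    if cut == -1:
        cut = max_pos
    truncated = text[:cut].rstrip()
    if ellipsis:
        return truncated + '...'
    return truncated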