def main(options, args):
    """Store talk-page comments written by known users in the 'comments' table.

    Loads the users collection from the JSON file at ``options.filename``,
    iterates over the pages returned by ``db.talk_pages('talk_pages_1')``,
    parses each page's ``old_text`` into comments and inserts one row per
    comment whose upper-cased username is found in the users collection.
    Progress is logged every 10000 pages and a summary is logged at the end.

    Args:
        options: Object exposing a ``filename`` attribute (path to the users
            JSON file).
        args: Unused positional arguments.

    Raises:
        SystemExit: With status 1 after printing the error when the database
            layer raises ``mdb.Error``.
    """
    total_comments = 0
    total_pages = 0
    total_users_known_comments = 0
    last_page_id = 0

    # Release the file handle as soon as the JSON document has been parsed.
    with open(options.filename, 'r') as fh:
        users = json.load(fh)
    logging.info("Users database is loaded")

    try:
        db = WikipediaDB()
        try:
            logging.info("Connected to database")
            for page in db.talk_pages('talk_pages_1'):
                comments = parse_page(page.old_text)
                total_pages += 1
                total_comments += len(comments)
                last_page_id = page.page_id

                found = 0
                for comment in comments:
                    # comment[0] is stored as the comment body; comment[1] is
                    # a mapping holding "username" and, optionally, "time".
                    # The lookup key is upper-cased, so only upper-case
                    # entries in `users` can match.
                    username = comment[1]["username"].upper()
                    if username in users:
                        row = OrderedDict()
                        row["user_name"] = username
                        row["page_id"] = page.page_id
                        row["page_title"] = page.page_title
                        row["comment"] = comment[0]
                        row["time_stamp"] = comment[1].get("time", "0")
                        db.insert(row, 'comments')
                        found += 1
                total_users_known_comments += found

                logging.debug(
                    'Found %d known users comments out of %d comments in page %s',
                    found, len(comments), page.page_id)

                if total_pages % 10000 == 0:
                    # Guard the ratio: every page seen so far may have been
                    # comment-free, which used to raise ZeroDivisionError.
                    if total_comments:
                        known_ratio = (total_users_known_comments /
                                       float(total_comments))
                    else:
                        known_ratio = 0.0
                    logging.info(
                        '\nParsed %d pages. Latest page is %d. Known comments = %d '
                        'found = %d. Comments/page = %.2f. Known comments = %.2f.',
                        total_pages, last_page_id, total_users_known_comments,
                        total_comments, total_comments / float(total_pages),
                        known_ratio)

            logging.info(
                'Parsed %d pages, latest page is %d. Known users comments are %d out of %d',
                total_pages, last_page_id, total_users_known_comments,
                total_comments)
        finally:
            # Close the connection on every exit path, not only on success.
            # Kept inside the outer try so an mdb.Error raised by close()
            # is still reported by the handler below.
            db.close()
    except mdb.Error as e:
        print("Error %d: %s" % (e.args[0], e.args[1]))
        sys.exit(1)
def main(options, args):
    """Store talk-page comments written by known users in the 'comments' table.

    Loads the users collection from the JSON file at ``options.filename``,
    iterates over the pages returned by ``db.talk_pages('talk_pages_1')``,
    parses each page's ``old_text`` into comments and inserts one row per
    comment whose upper-cased username is found in the users collection.
    Progress is logged every 10000 pages and a summary is logged at the end.

    Args:
        options: Object exposing a ``filename`` attribute (path to the users
            JSON file).
        args: Unused positional arguments.

    Raises:
        SystemExit: With status 1 after printing the error when the database
            layer raises ``mdb.Error``.
    """
    total_comments = 0
    total_pages = 0
    total_users_known_comments = 0
    last_page_id = 0

    # Release the file handle as soon as the JSON document has been parsed.
    with open(options.filename, 'r') as fh:
        users = json.load(fh)
    logging.info("Users database is loaded")

    try:
        db = WikipediaDB()
        try:
            logging.info("Connected to database")
            for page in db.talk_pages('talk_pages_1'):
                comments = parse_page(page.old_text)
                total_pages += 1
                total_comments += len(comments)
                last_page_id = page.page_id

                found = 0
                for comment in comments:
                    # comment[0] is stored as the comment body; comment[1] is
                    # a mapping holding "username" and, optionally, "time".
                    # The lookup key is upper-cased, so only upper-case
                    # entries in `users` can match.
                    username = comment[1]["username"].upper()
                    if username in users:
                        row = OrderedDict()
                        row["user_name"] = username
                        row["page_id"] = page.page_id
                        row["page_title"] = page.page_title
                        row["comment"] = comment[0]
                        row["time_stamp"] = comment[1].get("time", "0")
                        db.insert(row, 'comments')
                        found += 1
                total_users_known_comments += found

                logging.debug('Found %d known users comments out of %d comments in page %s',
                              found, len(comments), page.page_id)

                if total_pages % 10000 == 0:
                    # Guard the ratio: every page seen so far may have been
                    # comment-free, which used to raise ZeroDivisionError.
                    if total_comments:
                        known_ratio = (total_users_known_comments /
                                       float(total_comments))
                    else:
                        known_ratio = 0.0
                    logging.info('\nParsed %d pages. Latest page is %d. Known comments = %d '
                                 'found = %d. Comments/page = %.2f. Known comments = %.2f.',
                                 total_pages, last_page_id,
                                 total_users_known_comments, total_comments,
                                 total_comments / float(total_pages),
                                 known_ratio)

            logging.info('Parsed %d pages, latest page is %d. Known users comments are %d out of %d',
                         total_pages, last_page_id,
                         total_users_known_comments, total_comments)
        finally:
            # Close the connection on every exit path, not only on success.
            # Kept inside the outer try so an mdb.Error raised by close()
            # is still reported by the handler below.
            db.close()
    except mdb.Error as e:
        print("Error %d: %s" % (e.args[0], e.args[1]))
        sys.exit(1)
def testBolshoi(self):
    """Check that parsing 'bolshoi.txt' yields the data in 'bolshoi.ans.json'.

    Both fixture files are looked up relative to the current working
    directory. The text fixture is passed to ``parse_talk.parse_page`` as
    decoded UTF-8 text and the result is compared with the expected answers
    loaded from the JSON fixture.
    """
    # Read bytes and decode explicitly: this works on both Python 2 and 3
    # (a Python 3 str has no .decode) and the `with` blocks make sure the
    # fixture files are closed.
    with open('bolshoi.ans.json', 'rb') as fh:
        answers = json.loads(fh.read().decode('utf-8'))
    with open('bolshoi.txt', 'rb') as fh:
        text = fh.read().decode('utf-8')
    self.assertEqual(answers, parse_talk.parse_page(text))