コード例 #1
0
def main(options, args):
    """Scan talk pages and store the comments written by known users.

    The users collection is loaded from the JSON file named by
    ``options.filename``. Every page yielded by
    ``db.talk_pages('talk_pages_1')`` is parsed with ``parse_page``; each
    comment whose upper-cased username is found in the users collection is
    inserted as a row into the ``comments`` table. Progress is logged every
    10000 pages and a summary is logged at the end.

    Args:
        options: Object exposing a ``filename`` attribute (path of the
            users JSON file).
        args: Unused.

    On ``mdb.Error`` the error is printed and the process exits with
    status 1.
    """
    total_comments = 0
    total_pages = 0
    total_users_known_comments = 0
    last_page_id = 0
    # Context manager so the users file is closed as soon as it is parsed.
    with open(options.filename, 'r') as fh:
        users = json.load(fh)
    logging.info("Users database is loaded")
    try:
        db = WikipediaDB()
        try:
            logging.info("Connected to database")
            for page in db.talk_pages('talk_pages_1'):
                comments = parse_page(page.old_text)
                total_pages += 1
                total_comments += len(comments)
                last_page_id = page.page_id
                found = 0
                for comment in comments:
                    # comment[0] is stored as the comment body and
                    # comment[1] is read as a dict of its metadata.
                    username = comment[1]["username"].upper()
                    if username in users:
                        row = OrderedDict()
                        row["user_name"] = username
                        row["page_id"] = page.page_id
                        row["page_title"] = page.page_title
                        row["comment"] = comment[0]
                        row["time_stamp"] = comment[1].get("time", "0")
                        db.insert(row, 'comments')
                        found += 1
                total_users_known_comments += found
                logging.debug(
                    'Found %d known users comments out of %d comments in page %s',
                    found, len(comments), page.page_id)
                if total_pages % 10000 == 0:
                    # Guard the ratio: the first 10000 pages may contain no
                    # comments at all, which would divide by zero.
                    if total_comments:
                        known_ratio = (total_users_known_comments /
                                       float(total_comments))
                    else:
                        known_ratio = 0.0
                    logging.info(
                        '\nParsed %d pages. Latest page is %d. Known comments = %d '
                        'found = %d. Comments/page = %.2f. Known comments = %.2f.',
                        total_pages, last_page_id, total_users_known_comments,
                        total_comments, total_comments / float(total_pages),
                        known_ratio)

            logging.info(
                'Parsed %d pages, latest page is %d. Known users comments are %d out of %d',
                total_pages, last_page_id, total_users_known_comments,
                total_comments)
        finally:
            # Release the connection on success and on failure alike.
            db.close()
    except mdb.Error as e:
        # A single pre-formatted argument prints identically on Python 2 and 3.
        print("Error %d: %s" % (e.args[0], e.args[1]))
        sys.exit(1)
コード例 #2
0
ファイル: wikipedia.py プロジェクト: aboSamoor/NLP
def main(options, args):
  """Scan talk pages and store the comments written by known users.

  The users collection is loaded from the JSON file named by
  ``options.filename``. Every page yielded by
  ``db.talk_pages('talk_pages_1')`` is parsed with ``parse_page``; each
  comment whose upper-cased username is found in the users collection is
  inserted as a row into the ``comments`` table. Progress is logged every
  10000 pages and a summary is logged at the end.

  Args:
    options: Object exposing a ``filename`` attribute (path of the users
      JSON file).
    args: Unused.

  On ``mdb.Error`` the error is printed and the process exits with status 1.
  """
  total_comments = 0
  total_pages = 0
  total_users_known_comments = 0
  last_page_id = 0
  # Context manager so the users file is closed as soon as it is parsed.
  with open(options.filename, 'r') as fh:
    users = json.load(fh)
  logging.info("Users database is loaded")
  try:
    db = WikipediaDB()
    try:
      logging.info("Connected to database")
      for page in db.talk_pages('talk_pages_1'):
        comments = parse_page(page.old_text)
        total_pages += 1
        total_comments += len(comments)
        last_page_id = page.page_id
        found = 0
        for comment in comments:
          # comment[0] is stored as the comment body and comment[1] is read
          # as a dict of its metadata.
          username = comment[1]["username"].upper()
          if username in users:
            row = OrderedDict()
            row["user_name"] = username
            row["page_id"] = page.page_id
            row["page_title"] = page.page_title
            row["comment"] = comment[0]
            row["time_stamp"] = comment[1].get("time", "0")
            db.insert(row, 'comments')
            found += 1
        total_users_known_comments += found
        logging.debug('Found %d known users comments out of %d comments in page %s',
                      found, len(comments), page.page_id)
        if total_pages % 10000 == 0:
          # Guard the ratio: the first 10000 pages may contain no comments
          # at all, which would divide by zero.
          if total_comments:
            known_ratio = total_users_known_comments/float(total_comments)
          else:
            known_ratio = 0.0
          logging.info('\nParsed %d pages. Latest page is %d. Known comments = %d '
                       'found = %d. Comments/page = %.2f. Known comments = %.2f.',
                       total_pages,
                       last_page_id,
                       total_users_known_comments, total_comments,
                       total_comments/float(total_pages),
                       known_ratio)

      logging.info('Parsed %d pages, latest page is %d. Known users comments are %d out of %d', total_pages, last_page_id, total_users_known_comments, total_comments)
    finally:
      # Release the connection on success and on failure alike.
      db.close()
  except mdb.Error as e:
    # A single pre-formatted argument prints identically on Python 2 and 3.
    print("Error %d: %s" % (e.args[0], e.args[1]))
    sys.exit(1)
コード例 #3
0
ファイル: parse_talk_test.py プロジェクト: aboSamoor/NLP
 def testBolshoi(self):
   """Check parse_page output for bolshoi.txt against the stored answers.

   The expected value is loaded from ``bolshoi.ans.json`` and the input text
   from ``bolshoi.txt`` (decoded as UTF-8); both paths are relative to the
   current working directory.
   """
   # Context managers close both fixture files even if reading fails.
   with open('bolshoi.ans.json', 'r') as answers_file:
     answers = json.load(answers_file)
   # Read raw bytes and decode explicitly so the parser always receives
   # Unicode text, on Python 2 and Python 3 alike.
   with open('bolshoi.txt', 'rb') as text_file:
     text = text_file.read().decode('utf-8')
   self.assertEqual(answers, parse_talk.parse_page(text))
コード例 #4
0
ファイル: parse_talk_test.py プロジェクト: javipus/NLP
 def testBolshoi(self):
     """Check parse_page output for bolshoi.txt against the stored answers.

     The expected value is loaded from ``bolshoi.ans.json`` and the input
     text from ``bolshoi.txt`` (decoded as UTF-8); both paths are relative
     to the current working directory.
     """
     # Context managers close both fixture files even if reading fails.
     with open('bolshoi.ans.json', 'r') as answers_file:
         answers = json.load(answers_file)
     # Read raw bytes and decode explicitly so the parser always receives
     # Unicode text, on Python 2 and Python 3 alike.
     with open('bolshoi.txt', 'rb') as text_file:
         text = text_file.read().decode('utf-8')
     self.assertEqual(answers, parse_talk.parse_page(text))