def process_article_text(id, count, title, text, newf):
    """Tidy one article and, if an output file is given, write it as a record.

    The text is always passed through TidyUp.article().  When ``newf`` is
    truthy the record is written in this order:

        1. ``id`` formatted as a decimal integer, followed by a colon
        2. the title with its first character removed, UTF-8 encoded
        3. a newline, the ``__NOTOC__`` marker and another newline
        4. the tidied text, UTF-8 encoded, followed by a newline
        5. the ``***EOF***`` end-of-record marker line

    Arguments:
    id    -- integer article identifier
    count -- integer shown only in the verbose log message
    title -- article title (first character is dropped on output;
             presumably a padding character added by the caller - confirm)
    text  -- raw article text
    newf  -- object with a write() method, or a false value to skip output
    """
    global verbose

    if verbose:
        PrintLog.message(u'[PA {0:d}] {1:s}'.format(count, title))

    text = TidyUp.article(text)

    # Nothing to emit without an output file.
    if not newf:
        return

    emit = newf.write
    emit('{0:d}:'.format(id))
    # We pad the title to force the database to import strings
    emit(title[1:].encode('utf-8'))
    emit('\n__NOTOC__\n')
    emit(text.encode('utf-8') + '\n')
    emit('***EOF***\n')
def body(self, category, key, title, text, seek):
    """Handle one parsed page: store a template or index an article.

    The title is first passed through self.convert() and stripped of the
    U+200E / U+200F direction marks.

    Template pages (key equal to self.KEY_TEMPLATE) are written to the
    templates table unless their title is in self.ignored_templates; they
    are never counted or indexed as articles.

    Every other page increments self.article_count, is checked against
    FilterWords, has bigrams generated for each translated form of its
    title, and is recorded in self.offsets and through self.set_index().
    A true result from set_index() is reported as a duplicate title and
    sets the module-level error_flag.

    Arguments:
    category -- byte string decoded as UTF-8 to build the template prefix
    key      -- page kind, compared against self.KEY_TEMPLATE
    title    -- page title
    text     -- page text
    seek     -- value stored unchanged in the offsets record
    """
    global verbose
    global error_flag

    title = self.convert(title).strip(u'\u200e\u200f')

    if self.KEY_TEMPLATE == key:
        if title in self.ignored_templates:
            return
        namespace = unicode(category, 'utf-8').capitalize()
        title = namespace + ':' + upper_case_first_char(title)
        template_body = TidyUp.template(text)
        self.template_cursor.execute(
            u'insert or replace into templates (title, body) values(?, ?)',
            [u'~{0:d}~{1:s}'.format(self.file_id(), title), u'~' + template_body])
        self.template_count += 1
        return

    restricted = (FilterWords.is_restricted(title)
                  or FilterWords.is_restricted(text))

    self.article_count += 1

    if restricted:
        # do closer inspection to see if really restricted
        restricted, bad_words = FilterWords.find_restricted(text)
        if restricted:
            self.restricted_count += 1

    # Progress line every 10000 articles, with the time since the last one.
    if self.article_count % 10000 == 0:
        now = time.time()
        PrintLog.message(u'Index: {0:7.2f}s {1:10d}'.format(now - self.time, self.article_count))
        self.time = now

    for variant in self.language_processor.translate(title):
        generate_bigram(variant)

    if verbose:
        if not restricted:
            PrintLog.message(u'Title: {0:s}'.format(title))
        else:
            # bad_words is always bound here: restricted can only be true
            # after find_restricted() has run.
            PrintLog.message(u'Restricted Title: {0:s}'.format(title))
            PrintLog.message(u' --> {0:s}'.format(bad_words))

    text_length = len(text)
    self.total_character_count += text_length
    offset_record = (self.file_id(), title, seek, text_length, self.total_character_count)
    self.offsets[self.article_count] = offset_record

    index_entry = (self.article_count, -1, restricted, False)  # -1 == place holder
    if self.set_index(title, index_entry):
        PrintLog.message(u'ERROR: Duplicate Title: {0:s}'.format(title))
        error_flag = True
def body(self, category, key, title, text, seek):
    """Handle one parsed page: store a template or index an article.

    The title is first passed through self.convert() and stripped of the
    U+200E / U+200F direction marks.

    Template pages (key equal to self.KEY_TEMPLATE) are written to the
    templates table unless their title is in self.ignored_templates; they
    are never counted or indexed as articles.

    Every other page increments self.article_count, is checked against
    FilterWords, has bigrams generated for each translated form of its
    title, and is recorded in self.offsets and through self.set_index().
    A true result from set_index() is reported as a duplicate title and
    sets the module-level error_flag.

    The periodic progress line is printed only when verbose is off; in
    verbose mode each title is logged individually instead.

    Arguments:
    category -- byte string decoded as UTF-8 to build the template prefix
    key      -- page kind, compared against self.KEY_TEMPLATE
    title    -- page title
    text     -- page text
    seek     -- value stored unchanged in the offsets record
    """
    global verbose
    global error_flag

    title = self.convert(title).strip(u'\u200e\u200f')

    if self.KEY_TEMPLATE == key:
        if title in self.ignored_templates:
            return
        namespace = unicode(category, 'utf-8').capitalize()
        title = namespace + ':' + upper_case_first_char(title)
        template_body = TidyUp.template(text)
        self.template_cursor.execute(
            u'insert or replace into templates (title, body) values(?, ?)',
            [u'~{0:d}~{1:s}'.format(self.file_id(), title), u'~' + template_body])
        self.template_count += 1
        return

    restricted = (FilterWords.is_restricted(title)
                  or FilterWords.is_restricted(text))

    self.article_count += 1

    if restricted:
        # do closer inspection to see if really restricted
        restricted, bad_words = FilterWords.find_restricted(text)
        if restricted:
            self.restricted_count += 1

    # Progress line every 10000 articles, suppressed in verbose mode.
    if not verbose and self.article_count % 10000 == 0:
        now = time.time()
        PrintLog.message(u'Index: {0:7.2f}s {1:10d}'.format(now - self.time, self.article_count))
        self.time = now

    for variant in self.language_processor.translate(title):
        generate_bigram(variant)

    if verbose:
        if not restricted:
            PrintLog.message(u'Title: {0:s}'.format(title))
        else:
            # bad_words is always bound here: restricted can only be true
            # after find_restricted() has run.
            PrintLog.message(u'Restricted Title: {0:s}'.format(title))
            PrintLog.message(u' --> {0:s}'.format(bad_words))

    text_length = len(text)
    self.total_character_count += text_length
    offset_record = (self.file_id(), title, seek, text_length, self.total_character_count)
    self.offsets[self.article_count] = offset_record

    index_entry = (self.article_count, -1, restricted, False)  # -1 == place holder
    if self.set_index(title, index_entry):
        PrintLog.message(u'ERROR: Duplicate Title: {0:s}'.format(title))
        error_flag = True