예제 #1
0
def process_article_text(id, count, title, text, newf):
    """Tidy one article and, when an output file is given, append it as a record.

    id    -- integer article id, written as the decimal prefix of the record
    count -- integer shown in the verbose log line
    title -- unicode title; its first character is dropped before writing
    text  -- unicode article text, passed through TidyUp.article() first
    newf  -- writable file-like object, or a false value to skip writing

    Record layout, in write order:
        <id>:<title minus first char, UTF-8>
        __NOTOC__
        <tidied text, UTF-8>
        ***EOF***
    Returns None.
    """
    global verbose

    if verbose:
        PrintLog.message(u'[PA {0:d}] {1:s}'.format(count, title))

    # The text is tidied even when nothing is going to be written
    text = TidyUp.article(text)

    if not newf:
        return

    emit = newf.write

    emit('{0:d}:'.format(id))

    # The title arrives padded to force the database to import it as a
    # string; presumably the pad is this leading character - confirm
    # against the caller.
    unpadded_title = title[1:]
    emit(unpadded_title.encode('utf-8'))

    emit('\n__NOTOC__\n')

    encoded_text = text.encode('utf-8')
    emit(encoded_text + '\n')

    emit('***EOF***\n')
예제 #2
0
def process_article_text(id, count, title, text, newf):
    """Tidy one article and, when an output file is given, append it as a record.

    id    -- integer article id, written as the decimal prefix of the record
    count -- integer shown in the verbose log line
    title -- unicode title; its first character is dropped before writing
    text  -- unicode article text, passed through TidyUp.article() first
    newf  -- writable file-like object, or a false value to skip writing

    Record layout, in write order:
        <id>:<title minus first char, UTF-8>
        __NOTOC__
        <tidied text, UTF-8>
        ***EOF***
    Returns None.
    """
    global verbose

    if verbose:
        PrintLog.message(u'[PA {0:d}] {1:s}'.format(count, title))

    # The text is tidied even when nothing is going to be written
    text = TidyUp.article(text)

    if not newf:
        return

    emit = newf.write

    emit('{0:d}:'.format(id))

    # The title arrives padded to force the database to import it as a
    # string; presumably the pad is this leading character - confirm
    # against the caller.
    unpadded_title = title[1:]
    emit(unpadded_title.encode('utf-8'))

    emit('\n__NOTOC__\n')

    encoded_text = text.encode('utf-8')
    emit(encoded_text + '\n')

    emit('***EOF***\n')
예제 #3
0
    def body(self, category, key, title, text, seek):
        """Record one parsed page, either as a template row or as an indexed article.

        category -- UTF-8 byte string; decoded and capitalised to prefix
                    template titles
        key      -- page kind; equal to self.KEY_TEMPLATE for templates
        title    -- page title; run through self.convert() and stripped of
                    U+200E / U+200F at both ends
        text     -- page text
        seek     -- stored unchanged in the offsets entry (presumably the
                    position of the page in the input - confirm with caller)

        Templates not listed in self.ignored_templates are written to the
        templates table and counted; nothing else happens for a template.
        Every other page bumps the article counters, feeds its translated
        titles to generate_bigram(), gets an entry in self.offsets and is
        registered through self.set_index().  A true result from set_index()
        is reported as a duplicate title and sets the global error_flag.
        Returns None.
        """
        global verbose
        global error_flag

        title = self.convert(title).strip(u'\u200e\u200f')

        if self.KEY_TEMPLATE == key:
            if title in self.ignored_templates:
                return
            full_title = unicode(category, 'utf-8').capitalize() + ':' + upper_case_first_char(title)
            tidy_body = TidyUp.template(text)
            # Both columns carry a leading '~'; the title is also tagged with the file id
            row = [u'~{0:d}~{1:s}'.format(self.file_id(), full_title), u'~' + tidy_body]
            self.template_cursor.execute(u'insert or replace into templates (title, body) values(?, ?)', row)
            self.template_count += 1
            return

        flagged = FilterWords.is_restricted(title) or FilterWords.is_restricted(text)

        self.article_count += 1

        if flagged:
            # closer inspection decides whether the page is really restricted
            flagged, bad_words = FilterWords.find_restricted(text)
            if flagged:
                self.restricted_count += 1

        # Progress line once every 10000 articles, with the time since the last one
        if not self.article_count % 10000:
            now = time.time()
            PrintLog.message(u'Index: {0:7.2f}s {1:10d}'.format(now - self.time, self.article_count))
            self.time = now

        for variant in self.language_processor.translate(title):
            generate_bigram(variant)

        if verbose and flagged:
            PrintLog.message(u'Restricted Title: {0:s}'.format(title))
            PrintLog.message(u'  --> {0:s}'.format(bad_words))
        elif verbose:
            PrintLog.message(u'Title: {0:s}'.format(title))

        length = len(text)
        self.total_character_count += length
        entry = (self.file_id(), title, seek, length, self.total_character_count)
        self.offsets[self.article_count] = entry

        # -1 == place holder
        duplicate = self.set_index(title, (self.article_count, -1, flagged, False))
        if duplicate:
            PrintLog.message(u'ERROR: Duplicate Title: {0:s}'.format(title))
            error_flag = True
예제 #4
0
    def body(self, category, key, title, text, seek):
        """Record one parsed page, either as a template row or as an indexed article.

        category -- UTF-8 byte string; decoded and capitalised to prefix
                    template titles
        key      -- page kind; equal to self.KEY_TEMPLATE for templates
        title    -- page title; run through self.convert() and stripped of
                    U+200E / U+200F at both ends
        text     -- page text
        seek     -- stored unchanged in the offsets entry (presumably the
                    position of the page in the input - confirm with caller)

        Templates not listed in self.ignored_templates are written to the
        templates table and counted; nothing else happens for a template.
        Every other page bumps the article counters, feeds its translated
        titles to generate_bigram(), gets an entry in self.offsets and is
        registered through self.set_index().  A true result from set_index()
        is reported as a duplicate title and sets the global error_flag.
        The periodic progress line is only printed when verbose is off;
        with verbose on, each title is logged instead.
        Returns None.
        """
        global verbose
        global error_flag

        title = self.convert(title).strip(u'\u200e\u200f')

        if self.KEY_TEMPLATE == key:
            if title in self.ignored_templates:
                return
            full_title = unicode(category, 'utf-8').capitalize() + ':' + upper_case_first_char(title)
            tidy_body = TidyUp.template(text)
            # Both columns carry a leading '~'; the title is also tagged with the file id
            row = [u'~{0:d}~{1:s}'.format(self.file_id(), full_title), u'~' + tidy_body]
            self.template_cursor.execute(u'insert or replace into templates (title, body) values(?, ?)', row)
            self.template_count += 1
            return

        flagged = FilterWords.is_restricted(title) or FilterWords.is_restricted(text)

        self.article_count += 1

        if flagged:
            # closer inspection decides whether the page is really restricted
            flagged, bad_words = FilterWords.find_restricted(text)
            if flagged:
                self.restricted_count += 1

        # Progress line once every 10000 articles, skipped entirely in verbose mode
        if not (verbose or self.article_count % 10000):
            now = time.time()
            PrintLog.message(u'Index: {0:7.2f}s {1:10d}'.format(now - self.time, self.article_count))
            self.time = now

        for variant in self.language_processor.translate(title):
            generate_bigram(variant)

        if verbose and flagged:
            PrintLog.message(u'Restricted Title: {0:s}'.format(title))
            PrintLog.message(u'  --> {0:s}'.format(bad_words))
        elif verbose:
            PrintLog.message(u'Title: {0:s}'.format(title))

        length = len(text)
        self.total_character_count += length
        entry = (self.file_id(), title, seek, length, self.total_character_count)
        self.offsets[self.article_count] = entry

        # -1 == place holder
        duplicate = self.set_index(title, (self.article_count, -1, flagged, False))
        if duplicate:
            PrintLog.message(u'ERROR: Duplicate Title: {0:s}'.format(title))
            error_flag = True