Example #1
#!/usr/bin/env python3
from xml.dom import minidom
from html import unescape
import re
from utils import sanitize, line_rules, download

# TODO:     ['−', '-']  # replace the Unicode minus sign with the ASCII hyphen-minus (the minus commonly typed on a keyboard)
# TODO: handle sentences written in different alphabets
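# A possible sketch for the first TODO (an assumption, not part of the original script):
# the substitution could simply be appended to the normalize_rules list defined below, e.g.
#
#     normalize_rules.append(['−', '-'])   # U+2212 MINUS SIGN -> ASCII hyphen-minus
#
# or applied per line with str.replace():
#
#     line = line.replace('\u2212', '-')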

DOWNLOAD_PATH = 'https://dumps.wikimedia.org/itwikiquote/latest/itwikiquote-latest-pages-articles.xml.bz2'
OUTFILE = "output/wikiquote.txt"
DISCARD_FILE = 'output/discarded/wikiquote.json'

download_me = download.Download()
validate_line = line_rules.LineRules(discard_file=DISCARD_FILE)
clean_me = sanitize.Sanitization()

sub_regex = re.compile(
    r"""\[\[(File|Category):[\s\S]+\]\]|   # [[File:...]] and [[Category:...]] links
        \[\[[^|^\]]+\||                    # the target part of piped [[target|label]] links
        \[\[|\]\]|                         # leftover link brackets
        \'{2,5}|                           # '' / ''' italic and bold markup
        (<s>|<!--)[\s\S]+(</s>|-->)|       # struck-out text and HTML comments
        (<s>|<!)[\s\S]+(</s>|>)|           # other stray tags and broken comments
        {{[\s\S\n]+?}}|                    # {{templates}}
        <.*?>|                             # any remaining HTML tag
        ={1,6}""", re.VERBOSE)             # section heading markers
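# Usage sketch (an assumption: the compiled pattern is presumably applied to each
# page's raw wikitext to strip markup before line-level cleaning), e.g.
#
#     raw_text = sub_regex.sub('', raw_text)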

normalize_rules = [['*', "\n"], ['<br />', "\n"], ['<br>', "\n"],
                   [r"\(\d\d\d\d\)", ""], [r"[\(\[].*?[\)\]]", ""],
                   ['AvvertenzaContattiDonazioni', ''],
#!/usr/bin/env python3
from xml.dom import minidom
import re
from utils import sanitize, line_rules, download

download_me = download.Download()
validate_line = line_rules.LineRules()
clean_me = sanitize.Sanitization()

xml_path = download_me.if_not_exist('https://dumps.wikimedia.org/itwikiquote/latest/itwikiquote-latest-pages-articles.xml.bz2').bz2_decompress()

print('  Reading XML file')
mydoc = minidom.parse(xml_path)
items = mydoc.getElementsByTagName('page')

result = open('./output/wikiquote.txt', 'w')

print('  Parsing in progress')
text = ''
for elem in items:
    title = elem.getElementsByTagName("title")[0].firstChild.data
    if 'wiki' not in title and title != 'Pagina principale':
        textdom = elem.getElementsByTagName("revision")[0].getElementsByTagName("text")[0]
        if textdom.firstChild is not None:
            text = ''
            raw_text = clean_me.escapehtml(textdom.firstChild.data)
            raw_text = re.compile(r"""\[\[(File|Category):[\s\S]+\]\]|
                        \[\[[^|^\]]+\||
                        \[\[|\]\]|
                        \'{2,5}|
                        (<s>|<!--)[\s\S]+(</s>|-->)|
#!/usr/bin/env python3
from utils import sanitize, line_rules, download
from urllib import parse
import time
import os
import re

OUTFILE = "output/wikisource.txt"
PARSING = './parsing/wikisource/'
if not os.path.isdir(PARSING):
    os.mkdir(PARSING)
DISCARD_FILE = 'output/discarded/wikisource.json'
DOWNLOAD_LINK = 'https://wsexport.wmflabs.org/tool/book.php?lang=it&format=txt&page='

validate_line = line_rules.LineRules(DISCARD_FILE)
clean_me = sanitize.Sanitization()
download_me = download.Download()


def process_line(line, out_file):
    """if line is invalid returns early, if is correct writes the line to the file"""
    line = re.sub("[eE]'", "è", line)  # restore the accented "è" that is often typed as e'
    line = clean_me.clean_single_line(line)
    if (validate_line.is_not_valid(line) or len(line) <= 12
            or line == 'creativecommons'
            or validate_line.contain(line, [
                '§', '=', '--', '~', 'wiki', 'licenses', '//', ' pp', ' Ibid',
                '■', '^'
            ]) or
            # line.find('/') >= 1 or  (commented out: with the current regex, digits and brackets are always discarded)
            validate_line.isbrokenparenthesis(line)