#!/usr/bin/env python3
from xml.dom import minidom
import re
from utils import sanitize, line_rules, download

# Project helpers used by this importer (all come from the local `utils`
# package): downloader, per-line validation rules and text sanitizer.
download_me = download.Download()
validate_line = line_rules.LineRules()
clean_me = sanitize.Sanitization()

# Path of the Italian Wikiquote dump, as returned by the download helper's
# if_not_exist(...).bz2_decompress() chain.
# NOTE(review): presumably the download is skipped when the archive is
# already on disk -- confirm against utils.download.
xml_path = download_me.if_not_exist(
    'https://dumps.wikimedia.org/itwikiquote/latest/itwikiquote-latest-pages-articles.xml.bz2'
).bz2_decompress()

print('  Reading XML file')
# minidom builds the complete DOM in memory before any page is visited.
mydoc = minidom.parse(xml_path)
items = mydoc.getElementsByTagName('page')

# Open the output explicitly as UTF-8: without `encoding`, open() falls back
# to the locale's preferred encoding, which can mangle or reject accented
# Italian characters on non-UTF-8 platforms.
result = open('./output/wikiquote.txt', 'w', encoding='utf-8')

print('  Parsing in progress')
text = ''
for elem in items:
    title = elem.getElementsByTagName("title")[0].firstChild.data
    if 'wiki' not in title and title != 'Pagina principale':
        textdom = elem.getElementsByTagName("revision")[0].getElementsByTagName("text")[0]
        if textdom.firstChild is not None:
            text = ''
            raw_text = clean_me.escapehtml(textdom.firstChild.data)
            raw_text = re.compile(r"""\[\[(File|Category):[\s\S]+\]\]|
                        \[\[[^|^\]]+\||
                        \[\[|\]\]|
                        \'{2,5}|
                        (<s>|<!--)[\s\S]+(</s>|-->)|
#!/usr/bin/env python3
import re
import os
import zipfile
from utils import sanitize, download

def _extract_member(archive_path, member, target_path):
    """Write one member of the zip archive at *archive_path* to *target_path*.

    The member is read completely before the target file is opened, so a
    missing member raises ``KeyError`` without leaving an empty file behind.
    """
    with zipfile.ZipFile(archive_path) as archive:
        payload = archive.read(member)
    with open(target_path, 'wb') as target:
        target.write(payload)


# start downloading ITALIANO.ZIP
downloader = download.Download()
downloader = downloader.if_not_exist(
    'http://www.parlaritaliano.it/attachments/article/716/ITALIANO.zip')
# ITALIANO.zip contains a nested zip (ITALIANO_TRASCRIZIONI.zip); open it
# straight from the outer archive and extract it into the download folder.
with zipfile.ZipFile(downloader.file) as italiano:
    with italiano.open('ITALIANO/ITALIANO_TRASCRIZIONI.zip') as trascrizioni:
        with zipfile.ZipFile(trascrizioni) as trascrizioni_ita:
            trascrizioni_ita.extractall(path=downloader.folder)

# Each extra archive contributes a single transcription file, stored under a
# short name inside the ITALIANO_TRASCRIZIONI directory.
# Entries are (archive URL, member inside the archive, target file name).
for archive_url, member, target_name in (
    ('http://www.parlaritaliano.it/attachments/article/644/PALERMO.zip',
     'PALERMO/corpusPa/DGmtB03P.txt',
     "palermo.txt"),
    ('http://www.parlaritaliano.it/attachments/article/644/ROMA.zip',
     'ROMA/corpusRm/DGtdB04R.txt',
     "roma.txt"),
):
    downloader = downloader.if_not_exist(archive_url)
    # `downloader.folder` is re-read after every if_not_exist() call, exactly
    # as the per-archive code did before.
    _extract_member(
        downloader.file,
        member,
        os.path.join(downloader.folder, "ITALIANO_TRASCRIZIONI", target_name))
    [re.compile('che\`'), u'ché'],
    [re.compile('e\`'), u'è'],
]

# Directory handed to zip_decompress() and used as the prefix of the XML path.
parsedir = "parsing/qall/"

# Path of the text file that receives the imported sentences.
output = "output/qallme.txt"

output_file = open(output, mode="w", encoding='utf-8')

print("Qallme Importer")

# Run the download helper on the benchmark archive and decompress it into
# `parsedir`.
archive_url = 'http://qallme.fbk.eu/archive/QB_IT_V1.0_TranscriptionsReferences.zip'
downloader = download.Download().if_not_exist(archive_url).zip_decompress(parsedir)

###  XML  ###
# Parse the translation file and collect every <text> child of a <question>.
benchmark_xml = (
    parsedir +
    "QB_IT_V1.0_Translations/QallmebenchmarkIT_v1.0_final-translation.xml")
qallmef = ET.parse(benchmark_xml)
sentences = qallmef.findall("question/text")

# We are looking for sentences, not XML elements:
# take the text of every collected element and write it to the output file.
for s in sentences:
    line = s.text
    # Skip elements whose text is None.
    if line is not None:
        # NOTE(review): `sanitizer` and `mapping_normalization` are defined
        # outside this view; presumably this applies the regex replacement
        # pairs to the sentence -- confirm against utils.sanitize.
        line = sanitizer.maybe_normalize(line, mapping_normalization)
        # This call writes only the (normalized) sentence itself.
        # NOTE(review): confirm a line separator is added elsewhere.
        output_file.write(line)