Code example #1
File: prepare.py  Project: cidles/poio-corpus
def main(argv):
    config_file = os.path.join('..', 'config.ini')
    config = configparser.ConfigParser()
    config.read(config_file)

    for iso_639_3 in config.options("LanguagesISOMap"):
        iso_639_1 = config.get("LanguagesISOMap", iso_639_3)
        wiki_prefix = "{0}wiki".format(iso_639_1)
        new_wiki_prefix = "{0}wiki".format(iso_639_3)

        # Skip languages that are not in the list of languages to prepare.
        # This implies that the ISO 639-3 code must appear both in the
        # config file and as the name of the language folder.
        if new_wiki_prefix not in languages:
            print("Skipping {0}, since it does not belong to the languages to prepare".format(new_wiki_prefix))
            continue
        
        print("Processing wikipedia {0} -> {1}...".format(iso_639_1, iso_639_3))


        url = "http://dumps.wikimedia.org/backup-index.html"
        html_page = urllib2.urlopen(url)
        soup = BeautifulSoup(html_page)

        page = None
        for link in soup('a'):
            if link.string == wiki_prefix:
                page = urlparse.urljoin(url, link['href'])

        # get the link for the dump file
        wiki_date, dump_link = helpers.dump_link(wiki_prefix, page)

        if not dump_link:
            sys.stderr.write("Could not find dump link. Abort.")
            sys.exit(1)

        # check if there is already a build for this Wikipedia dump
        #output_file = os.path.join(
        #    '..', 'build', 'corpus', "{0}.zip".format(new_wiki_prefix,
        #        wiki_date))
        #if os.path.exists(output_file):
        #    print("Output file already exists. Skipping.")
        #    continue

        # download dump
        print("Downloading {0}...".format(dump_link))
        file_path = helpers.download_dump(dump_link, wiki_prefix,
            new_wiki_prefix)
       
        print("Running WikiExtractor...")
        helpers.wikipedia_extractor(file_path, new_wiki_prefix)

        # Concatenate output files
        helpers.concatenate(new_wiki_prefix)

        # Calling clean scripts
        print("Cleaning...")
        helpers.clean_1(new_wiki_prefix)          
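
The helpers module itself is not included in any of these excerpts, so helpers.dump_link stays opaque here. Purely as an illustration, below is a minimal sketch of what such a helper could look like, written in the Python 3 / requests style of the newer update.py (example #6 below); the regex, the pages-articles filename pattern, and the (date, link) return convention are assumptions, not the project's actual code.

import re
import urllib.parse

import requests
from bs4 import BeautifulSoup


def dump_link(wiki_prefix, page_url):
    """Hypothetical stand-in for helpers.dump_link.

    Return (wiki_date, absolute_link) for the pages-articles dump found
    on the per-wiki dump page, or (None, None) if nothing matches.
    """
    if page_url is None:
        return None, None

    soup = BeautifulSoup(requests.get(page_url).content, "html.parser")
    # e.g. "enwiki-20240101-pages-articles.xml.bz2"
    pattern = re.compile(
        r"{0}-(\d{{8}})-pages-articles\.xml\.bz2$".format(
            re.escape(wiki_prefix)))

    for link in soup("a"):
        href = link.get("href", "")
        match = pattern.search(href)
        if match:
            return match.group(1), urllib.parse.urljoin(page_url, href)

    return None, None

On the Wikimedia dump pages such an href typically looks like /enwiki/20240101/enwiki-20240101-pages-articles.xml.bz2, so the returned date is the dump's YYYYMMDD timestamp.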
Code example #2
File: update.py  Project: ricafett/poio-corpus
def main(argv):
    config_file = os.path.join('..', 'config.ini')
    config = configparser.ConfigParser()
    config.read(config_file)

    for iso_639_3 in config.options("LanguagesISOMap"):
        iso_639_1 = config.get("LanguagesISOMap", iso_639_3)
        wiki_prefix = "{0}wiki".format(iso_639_1)
        new_wiki_prefix = "{0}wiki".format(iso_639_3)

        print("Processing wikipedia {0} -> {1}...".format(
            iso_639_1, iso_639_3))

        # check if we already have a clean script for this language
        if not os.path.exists(os.path.join(new_wiki_prefix, "clean2.py")):
            print("No clean script found. Skipping.")
            continue

        url = "http://dumps.wikimedia.org/backup-index.html"
        html_page = urllib2.urlopen(url)
        soup = BeautifulSoup(html_page)

        page = None
        for link in soup('a'):
            if link.string == wiki_prefix:
                page = urlparse.urljoin(url, link['href'])

        # get the link for the dump file
        wiki_date, dump_link = helpers.dump_link(wiki_prefix, page)

        if not dump_link:
            sys.stderr.write("Could not find dump link. Abort.")
            sys.exit(1)

        # check if there is already a build for this Wikipedia dump
        output_file = os.path.join(
            '..', 'build', 'corpus', "{0}.zip".format(new_wiki_prefix,
                                                      wiki_date))
        #if os.path.exists(output_file):
        #    print("Output file already exists. Skipping.")
        #    continue

        # download dump
        print("Downloading {0}...".format(dump_link))
        file_path = helpers.download_dump(dump_link, wiki_prefix,
                                          new_wiki_prefix)

        print("Running WikiExtractor...")
        helpers.wikipedia_extractor(file_path, new_wiki_prefix)

        # Concatenate output files
        helpers.concatenate(new_wiki_prefix)

        # Calling clean scripts
        print("Cleaning...")
        helpers.clean_1(new_wiki_prefix)

        os.system("{0} {1}/clean2.py {2} {3}".format(
            sys.executable, new_wiki_prefix,
            os.path.join(new_wiki_prefix,
                         "{0}_cleaned1.xml".format(new_wiki_prefix)),
            os.path.join(new_wiki_prefix,
                         "{0}_cleaned2.xml".format(new_wiki_prefix))))

        os.system("{0} clean3.py {1} {2}".format(
            sys.executable,
            os.path.join(new_wiki_prefix,
                         "{0}_cleaned2.xml".format(new_wiki_prefix)),
            os.path.join(new_wiki_prefix,
                         "{0}_cleaned3.xml".format(new_wiki_prefix))))

        print("Converting to GrAF...")
        os.system("{0} to_graf.py {1} {2}".format(
            sys.executable,
            os.path.join(new_wiki_prefix,
                         "{0}_cleaned3.xml".format(new_wiki_prefix)),
            os.path.join(new_wiki_prefix,
                         "{0}-{1}.hdr".format(new_wiki_prefix, wiki_date))))

        # Zipping
        print("Zipping...")
        files = [
            os.path.join(new_wiki_prefix,
                         "{0}-{1}.hdr".format(new_wiki_prefix, wiki_date)),
            os.path.join(new_wiki_prefix,
                         "{0}-{1}.txt".format(new_wiki_prefix, wiki_date)),
            os.path.join(new_wiki_prefix,
                         "{0}-{1}-doc.xml".format(new_wiki_prefix, wiki_date))
        ]
        myzip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
        for f in files:
            myzip.write(f, os.path.basename(f))
        myzip.write("LICENSE.wikipedia", "LICENSE")
        myzip.close()

        print("")
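
The clean2.py, clean3.py and to_graf.py steps above are launched through os.system with hand-formatted command strings. As an alternative sketch only (not what update.py actually does), the same clean2.py invocation could go through subprocess.run, which passes the arguments as a list without involving the shell; this needs Python 3.5+ and keeps paths containing spaces intact.

import os
import subprocess
import sys


def run_clean2(new_wiki_prefix):
    """Run the language-specific clean2.py step via subprocess.

    Equivalent in spirit to the os.system call above, but with explicit
    argument handling instead of a shell command string.
    """
    script = os.path.join(new_wiki_prefix, "clean2.py")
    cleaned1 = os.path.join(
        new_wiki_prefix, "{0}_cleaned1.xml".format(new_wiki_prefix))
    cleaned2 = os.path.join(
        new_wiki_prefix, "{0}_cleaned2.xml".format(new_wiki_prefix))
    # check=True turns a non-zero exit status into a CalledProcessError
    # instead of being silently ignored.
    subprocess.run([sys.executable, script, cleaned1, cleaned2], check=True)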
Code example #3
File: prepare.py  Project: ricafett/poio-corpus
url = "http://dumps.wikimedia.org/backup-index.html"
html_page = urllib2.urlopen(url)
soup = BeautifulSoup(html_page)

lang_pages = [(link.string, urlparse.urljoin(url, link['href']))
              for l in languages
              for link in soup('a')
              if link.string == l]

for wiki_name, page in lang_pages:
    wiki_date, dump_link = helpers.dump_link(wiki_name, page)

    if not dump_link:
        print("Could not find dump link for {0}.".format(wiki_name))
        sys.exit(1)

    print("Downloading {0}...".format(dump_link))
    file_path = helpers.download_dump(dump_link, wiki_name)

    helpers.wikipedia_extractor(file_path, wiki_name)

    # Concatenate output files
    helpers.concatenate(wiki_name)

    # Calling first clean script
    print("Cleaning...")
    helpers.clean_1(wiki_name)
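
helpers.download_dump is likewise not shown in these excerpts. The sketch below is a hypothetical stand-in that streams the dump to disk in chunks with requests; the function name, the per-wiki target directory, and the chunk size are illustrative assumptions and may not match the real helper.

import os

import requests


def download_dump(dump_link, wiki_name):
    # Hypothetical stand-in for helpers.download_dump: stream the dump
    # into a folder named after the wiki and return the local file path.
    os.makedirs(wiki_name, exist_ok=True)
    file_path = os.path.join(wiki_name, dump_link.rsplit("/", 1)[-1])

    with requests.get(dump_link, stream=True) as response:
        response.raise_for_status()
        with open(file_path, "wb") as out:
            # 1 MiB chunks keep memory use flat even for very large dumps.
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                out.write(chunk)

    return file_path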
Code example #4
    print("There are no languages to prepare that match your request...")
    sys.exit(1)
else:
    print("Preparing the following languages: {0} ...".format(languages))

url = "http://dumps.wikimedia.org/backup-index.html"
html_page = urllib2.urlopen(url)
soup = BeautifulSoup(html_page)

lang_pages = [(link.string, urlparse.urljoin(url, link['href']))
              for l in languages for link in soup('a') if link.string == l]

for wiki_name, page in lang_pages:
    wiki_date, dump_link = helpers.dump_link(wiki_name, page)

    if not dump_link:
        print("Could not find dump link for {0}.".format(wiki_name))
        sys.exit(1)

    print("Downloading {0}...".format(dump_link))
    file_path = helpers.download_dump(dump_link, wiki_name)

    helpers.wikipedia_extractor(file_path, wiki_name)

    # Concatenate output files
    helpers.concatenate(wiki_name)

    # Calling first clean script
    print("Cleaning...")
    helpers.clean_1(wiki_name)
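
helpers.concatenate is another helper that never appears in these excerpts. Judging from the commented-out cleanup block in the cidles update.py (example #6 below), which deletes an "extracted" directory and a "<prefix>.xml" file, it plausibly merges WikiExtractor's output files into a single XML file. A hedged sketch under that assumption; the directory layout and filenames are guesses based on WikiExtractor's default output, not the project's code.

import glob
import os


def concatenate(wiki_prefix):
    # Hypothetical stand-in for helpers.concatenate: merge the files that
    # WikiExtractor wrote under <wiki_prefix>/extracted/ into one file
    # named <wiki_prefix>/<wiki_prefix>.xml, and return that path.
    parts = sorted(glob.glob(
        os.path.join(wiki_prefix, "extracted", "*", "wiki_*")))
    target = os.path.join(wiki_prefix, "{0}.xml".format(wiki_prefix))

    with open(target, "w", encoding="utf-8") as out:
        for part in parts:
            with open(part, "r", encoding="utf-8") as src:
                out.write(src.read())

    return target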
Code example #5
File: update.py  Project: ricafett/poio-corpus
def main(argv):
    config_file = os.path.join('..', 'config.ini')
    config = configparser.ConfigParser()
    config.read(config_file)

    for iso_639_3 in config.options("LanguagesISOMap"):
        iso_639_1 = config.get("LanguagesISOMap", iso_639_3)
        wiki_prefix = "{0}wiki".format(iso_639_1)
        new_wiki_prefix = "{0}wiki".format(iso_639_3)

        print("Processing wikipedia {0} -> {1}...".format(iso_639_1, iso_639_3))

        # check if we already have a clean script for this language
        if not os.path.exists(os.path.join(new_wiki_prefix, "clean2.py")):
            print("No clean script found. Skipping.")
            continue

        url = "http://dumps.wikimedia.org/backup-index.html"
        html_page = urllib2.urlopen(url)
        soup = BeautifulSoup(html_page)

        page = None
        for link in soup('a'):
            if link.string == wiki_prefix:
                page = urlparse.urljoin(url, link['href'])

        # get the link for the dump file
        wiki_date, dump_link = helpers.dump_link(wiki_prefix, page)

        if not dump_link:
            sys.stderr.write("Could not find dump link. Abort.")
            sys.exit(1)

        # check if there is already a build for this Wikipedia dump
        output_file = os.path.join(
            '..', 'build', 'corpus', "{0}.zip".format(new_wiki_prefix,
                wiki_date))
        #if os.path.exists(output_file):
        #    print("Output file already exists. Skipping.")
        #    continue

        # download dump
        print("Downloading {0}...".format(dump_link))
        file_path = helpers.download_dump(dump_link, wiki_prefix,
            new_wiki_prefix)
       
        print("Running WikiExtractor...")
        helpers.wikipedia_extractor(file_path, new_wiki_prefix)

        # Concatenate output files
        helpers.concatenate(new_wiki_prefix)

        # Calling clean scripts
        print("Cleaning...")
        helpers.clean_1(new_wiki_prefix)   
       
        os.system("{0} {1}/clean2.py {2} {3}".format(
            sys.executable,
            new_wiki_prefix,
            os.path.join(
                new_wiki_prefix, "{0}_cleaned1.xml".format(new_wiki_prefix)),
            os.path.join(
                new_wiki_prefix, "{0}_cleaned2.xml".format(new_wiki_prefix))))

        os.system("{0} clean3.py {1} {2}".format(
            sys.executable,
            os.path.join(
                new_wiki_prefix, "{0}_cleaned2.xml".format(new_wiki_prefix)),
            os.path.join(
                new_wiki_prefix, "{0}_cleaned3.xml".format(new_wiki_prefix))))

        print("Converting to GrAF...")
        os.system("{0} to_graf.py {1} {2}".format(
            sys.executable,
            os.path.join(
                new_wiki_prefix, "{0}_cleaned3.xml".format(new_wiki_prefix)),
            os.path.join(
                new_wiki_prefix, "{0}-{1}.hdr".format(
                    new_wiki_prefix, wiki_date))))

        # Zipping
        print("Zipping...")
        files = [
            os.path.join(
                new_wiki_prefix, "{0}-{1}.hdr".format(
                    new_wiki_prefix, wiki_date)),
            os.path.join(
                new_wiki_prefix, "{0}-{1}.txt".format(
                    new_wiki_prefix, wiki_date)),
            os.path.join(
                new_wiki_prefix, "{0}-{1}-doc.xml".format(
                    new_wiki_prefix, wiki_date))
        ]
        myzip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
        for f in files:
            myzip.write(f, os.path.basename(f))
        myzip.write("LICENSE.wikipedia", "LICENSE")
        myzip.close()

        print("")
Code example #6
File: update.py  Project: cidles/poio-corpus
def main(argv):
    arg_iso = None
    if len(argv) > 1:
        arg_iso = argv[1]

    script_path = os.path.dirname(os.path.realpath(__file__))
    os.chdir(script_path)

    config_file = os.path.join('..', 'config.ini')
    config = configparser.ConfigParser()
    config.read(config_file)

    processed = dict()
    processed_file = os.path.join('..', 'build', 'processed.pickle')
    if os.path.exists(processed_file):
        with open(processed_file, 'rb') as f:
            processed = pickle.load(f)

    if 'wikipedia' not in processed:
        processed['wikipedia'] = dict()


    for iso_639_3 in config.options("LanguagesISOMap"):
        if arg_iso and iso_639_3 != arg_iso:
            continue

        iso_639_1 = config.get("LanguagesISOMap", iso_639_3)
        wiki_prefix = "{0}wiki".format(iso_639_1)
        new_wiki_prefix = "{0}wiki".format(iso_639_3)

        print("Processing wikipedia {0} -> {1}...".format(iso_639_1, iso_639_3))

        # check if we already have a clean script for this language
        if not os.path.exists(os.path.join(new_wiki_prefix, "clean2.py")):
            print("No clean script found. Skipping.")
            continue

        url = "http://dumps.wikimedia.org/backup-index.html"
        html_page = requests.get(url)
        soup = BeautifulSoup(html_page.content)

        page = None
        for link in soup('a'):
            if link.string == wiki_prefix:
                page = urllib.parse.urljoin(url, link['href'])

        # get the link for the dump file
        wiki_date, dump_link = helpers.dump_link(wiki_prefix, page)

        if not dump_link:
            sys.stderr.write("Could not find dump link. Abort.")
            sys.exit(1)

        # check if there is already a build for this Wikipedia dump
        output_file = os.path.join(
            '..', 'build', 'corpus', "{0}.zip".format(new_wiki_prefix))

        if iso_639_3 in processed['wikipedia'] and \
                int(processed['wikipedia'][iso_639_3]) >= int(wiki_date) and \
                os.path.exists(output_file):
            print("  Wikipedia already processed, skipping.")
            continue

        # download dump
        print("Downloading {0}...".format(dump_link))
        file_path = helpers.download_dump(dump_link, wiki_prefix,
            new_wiki_prefix)
       
        print("Running WikiExtractor...")
        helpers.wikipedia_extractor(file_path, new_wiki_prefix)

        # Concatenate output files
        helpers.concatenate(new_wiki_prefix)

        # Calling clean scripts
        print("Cleaning...")
        helpers.clean_1(new_wiki_prefix)   
       
        os.system("{0} {1}/clean2.py {2} {3}".format(
            sys.executable,
            new_wiki_prefix,
            os.path.join(
                new_wiki_prefix, "{0}_cleaned1.xml".format(new_wiki_prefix)),
            os.path.join(
                new_wiki_prefix, "{0}_cleaned2.xml".format(new_wiki_prefix))))

        os.system("{0} clean3.py {1} {2}".format(
            sys.executable,
            os.path.join(
                new_wiki_prefix, "{0}_cleaned2.xml".format(new_wiki_prefix)),
            os.path.join(
                new_wiki_prefix, "{0}_cleaned3.xml".format(new_wiki_prefix))))

        print("Converting to GrAF...")
        os.system("{0} to_graf.py {1} {2}".format(
            sys.executable,
            os.path.join(
                new_wiki_prefix, "{0}_cleaned3.xml".format(new_wiki_prefix)),
            os.path.join(
                new_wiki_prefix, "{0}-{1}.hdr".format(
                    new_wiki_prefix, wiki_date))))

        # Zipping
        print("Zipping...")
        files = [
            os.path.join(
                new_wiki_prefix, "{0}-{1}.hdr".format(
                    new_wiki_prefix, wiki_date)),
            os.path.join(
                new_wiki_prefix, "{0}-{1}.txt".format(
                    new_wiki_prefix, wiki_date)),
            os.path.join(
                new_wiki_prefix, "{0}-{1}-doc.xml".format(
                    new_wiki_prefix, wiki_date))
        ]
        myzip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
        for f in files:
            myzip.write(f, os.path.basename(f))
        myzip.write("LICENSE.wikipedia", "LICENSE")
        myzip.close()

        # Delete all files
        # print("Cleaning up...")
        # files.append(file_path)
        # files.append(os.path.splitext(file_path)[0])
        # for i in range(3):
        #     files.append(os.path.join(
        #         new_wiki_prefix,
        #         "{0}_cleaned{1}.xml".format(new_wiki_prefix, i+1)))
        # files.append(os.path.join(
        #         new_wiki_prefix,"{0}.xml".format(new_wiki_prefix)))
        # for f in files:
        #     os.remove(f)
        # shutil.rmtree(os.path.join(new_wiki_prefix, "extracted"))

        processed['wikipedia'][iso_639_3] = wiki_date
        with open(processed_file, 'wb') as f:
            pickle.dump(processed, f)
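
Unlike the other excerpts, this cidles update.py records finished builds in ../build/processed.pickle and skips a language when a zip built from the same or a newer dump date already exists. For illustration, the same skip test is factored below into a small standalone function; the paths and dictionary keys are taken from the code above, while the function name needs_rebuild is made up.

import os
import pickle


def needs_rebuild(iso_639_3, wiki_date,
                  processed_file=os.path.join("..", "build",
                                              "processed.pickle"),
                  corpus_dir=os.path.join("..", "build", "corpus")):
    """Return True if the zipped corpus for this language is missing or
    older than the given dump date (a YYYYMMDD string).

    Mirrors the skip logic in the update.py excerpt above.
    """
    processed = {}
    if os.path.exists(processed_file):
        with open(processed_file, "rb") as f:
            processed = pickle.load(f)

    done = processed.get("wikipedia", {})
    output_file = os.path.join(corpus_dir, "{0}wiki.zip".format(iso_639_3))

    if (iso_639_3 in done
            and int(done[iso_639_3]) >= int(wiki_date)
            and os.path.exists(output_file)):
        return False
    return True

For example, needs_rebuild("deu", "20240101") stays True until a deuwiki.zip built from that dump date (or a later one) has been recorded in the pickle file.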