Example 1
def run(path_to_dump, wiki_files_dir='wiki', path_to_res='res_wiki.csv', workers_num=3):
    # step #1 - process files via wikiextractor
    argv = shlex.split(f'-o {wiki_files_dir} --json --processes {workers_num} {path_to_dump}')
    sys.argv = [sys.argv[0]] + argv
    print(argv)
    WikiExtractor.main()

    # step #2 - postprocessing
    postprocessing.run(wiki_files_dir, path_to_res, workers_num)
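
WikiExtractor.main() reads its options from sys.argv, which is why the snippet above rebuilds sys.argv before calling it. A minimal sketch of how run() might be invoked; the dump filename is hypothetical and postprocessing is the example's own helper module:

if __name__ == '__main__':
    # Hypothetical path to a Wikipedia XML dump downloaded from dumps.wikimedia.org
    run('itwiki-latest-pages-articles.xml.bz2',
        wiki_files_dir='wiki',
        path_to_res='res_wiki.csv',
        workers_num=4)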
Example 2
def format_wikicorpus(input, output, bytes, num_process, num_out_files):
    if input is None:
        raise ValueError('Input file path is not specified.')
    if not input.endswith('xml.bz2'):
        raise ValueError('Input file must be a *.xml.bz2 Wikipedia dump.')
    if not os.path.exists(output):
        os.makedirs(output)

    # Use WikiExtractor to extract the content
    WikiExtractor = try_import_wikiextractor()
    wiki_path = os.path.join(output, 'extracted')
    sys.argv = ['prog', '-b', bytes, '-o', wiki_path, input]
    WikiExtractor.main()

    # Merge extracted content into txt files
    prepared_path = os.path.join(output, 'prepared_wikipedia')
    if not os.path.exists(prepared_path):
        os.makedirs(prepared_path)
    filenames = get_formatting_list(wiki_path, recursive=True)
    num_files = len(filenames)
    num_out_files = min(num_out_files, num_files)
    file_volume = math.ceil(num_files / num_out_files)
    splited_files = [
        filenames[i:i + file_volume] for i in range(0, num_files, file_volume)
    ]
    num_out_files = len(splited_files)
    output_files = [
        os.path.join(prepared_path,
                     "wikipedia-prepared-{}.txt".format(str(i).zfill(4)))
        for i in range(num_out_files)
    ]
    print("All prepared raw text will be saved in {} txt files".format(
        num_out_files))
    num_process = min(num_process, num_out_files)
    print('Start preprocessing {} text files with {} cores'.format(
        num_files, num_process))
    process_args = [(splited_files[i], output_files[i])
                    for i in range(num_out_files)]

    start_time = time.time()
    with multiprocessing.Pool(num_process) as pool:
        f_read = 0
        for i, _ in enumerate(pool.imap(merge, process_args)):
            elapsed = time.time() - start_time
            f_read += len(splited_files[i])
            print(
                "prepared {:} files, Elapsed: {:.2f}s, ETA: {:.2f}s".format(
                    f_read, elapsed,
                    (num_files - f_read) / (f_read / elapsed)))
    print("Done preparation within {:.2f} seconds".format(elapsed))
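
A minimal sketch of calling format_wikicorpus() directly, assuming bytes is the chunk-size string forwarded to WikiExtractor's -b option; the dump path and output directory are hypothetical:

if __name__ == '__main__':
    format_wikicorpus(
        input='enwiki-latest-pages-articles.xml.bz2',  # hypothetical dump path
        output='wikicorpus_out',                       # created if it does not exist
        bytes='100M',                                  # chunk size passed to WikiExtractor -b
        num_process=8,
        num_out_files=500)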
Example 3
def get_keywords(title, primary):

    url_non_en = "https://" + primary + ".wikipedia.org/wiki/Special:Export/" + title
    resp = requests.get(url_non_en)
    with open('non_en.xml', 'wb') as f:
        f.write(resp.content)

    page = wptools.page(title, lang=primary)
    # page = wptools.page(title,lang = "hi")
    page.get_parse()
    wikidata_id = page.data['wikibase']
    # print("Wikidata Id obtained :" + wikidata_id)
    page_en = wptools.page(wikibase=wikidata_id)
    page_en.get_wikidata()
    title_en = page_en.data['title']
    url_en = "https://en.wikipedia.org/wiki/Special:Export/" + title_en
    resp = requests.get(url_en)
    with open('eng.xml', 'wb') as f:
        f.write(resp.content)

    # Execute the WikiExtractor.py code to process the non-en and en XMLs
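    # Note: unlike the previous examples, sys.argv is not rebuilt here, so the
    # WikiExtractor invocation is assumed to be configured elsewhere to write the
    # non_en/ and en/ output directories that are read back in below.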
    WikiExtractor.main()
    print("Code Executed")
    non_en_text = ""
    en_text = ""
    with open("non_en/AA/wiki_00", 'r') as f:
        non_en_text = f.read()
    with open("en/AA/wiki_00", 'r') as f:
        en_text = f.read()
    # page_en = wptools.page(title_en)
    # page_en.get_parse()

    url = "https://en.wikipedia.org/w/api.php?action=query&prop=pageprops&ppprop=wikibase_item&redirects=1&titles=" + title_en
    url += "&format=json"
    resp = requests.get(url)
    data = resp.content
    data = data.decode('utf8')
    data = json.loads(data)
    pages = data["query"]["pages"]
    for items in pages:
        # print(items)
        try:
            ID = pages[items]["pageprops"]["wikibase_item"]
            title_en = pages[items]["title"]
        except KeyError:
            continue
    print("English title: " + title_en)
    # --------------------------------- Obtain section headings from the English XML page ------------------------

    section_headings = [i.start() for i in re.finditer("<sec>", en_text)]
    section_headings_end = [i.start() for i in re.finditer("</sec>", en_text)]
    print(len(section_headings))
    headings = []
    headings_pos = []
    for j in range(len(section_headings)):
        occurence = section_headings[j]
        title = en_text[occurence + 5:section_headings_end[j] - 1]
        headings.append(title)
        headings_pos.append(section_headings_end[j] + 6)

    # --------------------------------- Part Considering anchor text as keywords----------------------------------
    start = time.process_time()
    keywords_en = []
    keywords = []
    occur_en = [i.start() for i in re.finditer("href", en_text)]
    end_occur_en = [i.start() for i in re.finditer("</a>", en_text)]
    occur = [i.start() for i in re.finditer("href", non_en_text)]
    end_occur = [i.start() for i in re.finditer("</a>", non_en_text)]

    pos_keywords_en = {}
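    # For each <a href="..."> ... </a> pair in the English text, extract the link target
    # and the anchor text; anchors that already occur in the page title, or whose URL is
    # much longer than the visible text, are skipped as unlikely keywords.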
    for j in range(len(occur_en)):
        occurence = occur_en[j]
        title = ""
        pos = occurence + 6
        while (en_text[pos] != "\""):
            title += en_text[pos]
            pos += 1
        pos += 2
        # print(title)
        url = title
        # print(en_text[pos:end_occur_en[j]])
        title = en_text[pos:end_occur_en[j]]
        # title = title.lower()
        if (title_en.find(title) != -1):
            continue
        elif ((len(url) / len(title)) > 3):
            continue
        if title in pos_keywords_en:
            repitition = True
        else:
            pos_keywords_en[title] = occurence + 6
        keywords_en.append(title)

    for j in range(len(occur)):
        occurence = occur[j]
        title = ""
        pos = occurence + 6
        while (non_en_text[pos] != "\""):
            title += non_en_text[pos]
            pos += 1
        pos += 2
        url = title
        title = non_en_text[pos:end_occur[j]]
        if (title == 'के'):
            continue
        keywords.append(title)

    dict_keys = {}
    dict_keys_en = {}
    mappings_eng = {}
    mappings_non_en = {}
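    # Query the English Wikipedia API in batches of titles (the API accepts at most 50
    # titles per request for standard clients) to map each anchor title to its Wikidata
    # item ID, keeping track of title normalizations and redirects along the way.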
    for i in range(0, len(keywords_en), 50):
        # print(i)
        url = "https://en.wikipedia.org/w/api.php?action=query&prop=pageprops&ppprop=wikibase_item&redirects=1&titles="
        url += "|".join(keywords_en[i:i + 50])
        url += "&format=json"
        resp = requests.get(url)
        data = resp.content
        # print(data.json())
        data = data.decode('utf8')
        data = json.loads(data)
        pages = data["query"]["pages"]
        for items in pages:
            # print(items)
            try:
                ID = pages[items]["pageprops"]["wikibase_item"]
                title = pages[items]["title"]
                dict_keys_en[ID] = title
                mappings_eng[title] = title
            except KeyError:
                # print("error")
                continue
        normalizations = {}
        try:

            normalized = data["query"]["normalized"]
            for items in normalized:
                try:
                    normalizations[items["to"]] = items["from"]
                except KeyError:
                    continue
        except KeyError:
            continue
        try:

            redirects = data["query"]["redirects"]
            for items in redirects:
                try:
                    if items["from"] in normalizations:
                        mappings_eng[items["to"]] = normalizations[
                            items["from"]]
                    else:
                        mappings_eng[items["to"]] = items["from"]
                except KeyError:
                    continue
        except KeyError:
            continue
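    # Repeat the same batched lookup against the primary-language wikipedia.org API for
    # the non-English anchor titles.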

    for i in range(0, len(keywords), 50):
        # print(i)
        url = "https://" + primary + ".wikipedia.org/w/api.php?action=query&prop=pageprops&ppprop=wikibase_item&redirects=1&titles="
        url += "|".join(keywords[i:i + 50])
        url += "&format=json"
        resp = requests.get(url)
        data = resp.content
        # print(data.json())
        data = data.decode('utf8')
        data = json.loads(data)
        pages = data["query"]["pages"]
        for items in pages:
            # print(items)
            try:
                ID = pages[items]["pageprops"]["wikibase_item"]
                title = pages[items]["title"]
                dict_keys[ID] = title
                mappings_non_en[title] = title
            except KeyError:
                # print("error")
                continue
        normalizations = {}
        try:

            normalized = data["query"]["normalized"]
            for items in normalized:
                try:
                    normalizations[items["to"]] = items["from"]
                except KeyError:
                    continue
        except KeyError:
            continue
        try:

            redirects = data["query"]["redirects"]
            for items in redirects:
                try:
                    if items["from"] in normalizations:
                        mappings_non_en[items["to"]] = normalizations[
                            items["from"]]
                    else:
                        mappings_non_en[items["to"]] = items["from"]
                except KeyError:
                    continue
        except KeyError:
            continue
    print(len(dict_keys_en))
    print(len(dict_keys))
    # translator = Translator()

    # English keywords whose Wikidata ID has no counterpart among the non-English keywords
    relevant_english = []
    for key in dict_keys_en:
        if key not in dict_keys:
            relevant_english.append(mappings_eng[dict_keys_en[key]])
    # Non-English keywords whose Wikidata ID has no counterpart among the English keywords
    unique_non_en = []
    for key in dict_keys:
        if key not in dict_keys_en:
            unique_non_en.append(mappings_non_en[dict_keys[key]])
    relevant_english_links = []
    base_link = "https://en.wikipedia.org/wiki/" + title_en.replace(" ", "_")
    # print(relevant_english)
    # print(pos_keywords_en)
    for key_title in relevant_english:
        # key_title = key_title.lower()
        section_level = len(headings_pos) - 1
        if (key_title not in pos_keywords_en):
            relevant_english_links.append(base_link)
            continue
        while (section_level >= 0):
            if (pos_keywords_en[key_title] > headings_pos[section_level]):
                break
            section_level = section_level - 1

        if (pos_keywords_en[key_title] < headings_pos[0]):
            relevant_english_links.append(base_link)
        else:
            link_to_section = base_link + "#" + headings[
                section_level].replace(" ", "_")
            relevant_english_links.append(link_to_section)
        # chl position here
    end = time.process_time()
    print("Time taken to obtain mapping between keywords and page ids")
    print(end - start)

    print(
        "--------------------------------------------------------------------------------"
    )
    print("Wikipedia2vec execution begins")
    start = time.process_time()
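    # Rank the English keywords by cosine similarity between the Wikipedia2vec entity
    # vector of the English title and the vector of each keyword; keywords without an
    # entity vector get a score of 0.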

    similarity_score = []
    entity_found = False
    try:
        title_vec = wiki2vec.get_entity_vector(title_en)
        entity_found = True
    except:
        entity_found = False
    count = 0
    if (entity_found):
        out = open("scores.txt", "w")
        for i in range(len(relevant_english)):
            score = 0
            try:
                key_vec = wiki2vec.get_entity_vector(relevant_english[i])
                dot = np.dot(title_vec, key_vec)
                norma = np.linalg.norm(title_vec)
                normb = np.linalg.norm(key_vec)
                cos = dot / (norma * normb)
                score = cos
            except:
                key_found = False
                count += 1
                score = 0
            similarity_score.append(score)
            out.write(
                str(relevant_english[i]) + "--> " + str(similarity_score[i]) +
                "\n")
        print("Lengths of relevant_english and similarity_score")
        print(len(relevant_english))
        print(len(similarity_score))
        order = np.argsort(similarity_score)
        print(len(order))
        other_index = []
        other_index_link = []
        for i in range(len(order)):
            other_index.append(relevant_english[order[(len(order) - 1) - i]])
            other_index_link.append(
                relevant_english_links[order[(len(order) - 1) - i]])
        relevant_english = other_index
        relevant_english_links = other_index_link
        out.close()
    else:
        count = -1
        # Wikipedia2vec returned no entity vector for the title, so similarity between
        # the title and the extracted keywords cannot be computed; the relevant English
        # list is left in its original order.
    print(len(relevant_english))
    # print(final)
    end = time.process_time()
    print("Time taken to get similarity scores")
    print(end - start)
    print("API calls completed")
    print("Keys not found = ")
    print(count)

    # --------------------------------- Part Considering anchor text as keywords Ends-----------------------------

    ans = {}
    URL_en = "https://en.wikipedia.org/wiki/" + title_en
    ans['url_en'] = URL_en
    ans["keywords"] = unique_non_en
    ans["English_keywords"] = relevant_english
    ans['links'] = relevant_english_links
    temp = jsonify(ans)
    temp.status_code = 200

    return temp
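
Since get_keywords() builds its return value with Flask's jsonify, it is presumably called inside a request context. A minimal sketch of exposing it as a route, assuming a Flask app object and hypothetical query parameters title and lang:

from flask import Flask, request

app = Flask(__name__)

@app.route('/keywords')
def keywords_endpoint():
    # Parameter names are illustrative assumptions, not part of the original code.
    title = request.args.get('title')
    primary = request.args.get('lang', 'hi')
    return get_keywords(title, primary)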