Example #1
import time

def create_pagerank(C, L, I, k=1):
    """
    :param C: weight of each link (1 / out-degree of its source page)
    :param L: prefix sums of out-degrees; L[i]..L[i+1] delimits page i's links in C and I
    :param I: target page id of each link
    :param k: number of iterations
    :return: List of page ranks (indices are page ids)
    """
    start_time = time.time()

    n = len(L) - 1
    Pi = [1 / n for _ in range(n)]  # start from the uniform distribution
    for _ in range(k):
        P = [0] * n  # fresh rank vector for each iteration
        for i in range(n):
            if L[i] == L[i + 1]:  # dangling page: spread its rank uniformly
                for j in range(n):
                    P[j] += 1 / n * Pi[i]
            else:
                for j in range(L[i], L[i + 1]):
                    P[I[j]] += C[j] * Pi[i]
            print_percentage(i, n)
        Pi = P  # feed this iteration's result into the next

    print("     ** Finish create_pagerank()")
    elapsed_time = time.time() - start_time
    print("     Elapsed time create_pagerank() : {}".format(
        hms_string(elapsed_time)))
    return Pi
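
A minimal usage sketch of the (C, L, I) format consumed above, on a hypothetical 3-page graph (page 0 links to pages 1 and 2, page 1 links to page 2, page 2 has no out-links); the project helpers print_percentage and hms_string are stubbed so the snippet runs standalone:

def print_percentage(i, n): pass  # stub for the project helper
def hms_string(s): return "{:.3f}s".format(s)  # stub for the project helper

C = [0.5, 0.5, 1.0]  # 1 / out-degree of each edge's source page
L = [0, 2, 3, 3]     # page 0 owns edges 0..1, page 1 owns edge 2, page 2 none
I = [1, 2, 2]        # target of each edge

ranks = create_pagerank(C, L, I, k=20)
print(ranks)  # page 2 collects the most rank: every page points to it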
Example #2
import time
import xml.etree.ElementTree as ET

def parse_corpus(file_name, pages_count=252374):
    """
    :param file_name:
        XML file containing the pages data
    :param pages_count:
        expected number of pages (only used for the progress display)
    :return:
        List of tuples containing (id, title, content) for each page
    """
    start_time = time.time()

    pagelist_noclean = []
    total_pages_count = 0

    id = None
    title = None
    content = None

    for event, elem in ET.iterparse(file_name, events=('start', 'end')):
        tname = elem.tag

        if event == 'start':

            if tname == 'page':
                title = ''
                id = -1
                content = ''
        else:
            if tname == 'title':
                title = elem.text

            elif tname == 'id':
                # a <page> holds several <id> tags (the page id first, then
                # revision and contributor ids); keep only the first one
                if id == -1:
                    id = int(elem.text)

            elif tname == 'text':
                content = elem.text

            elif tname == 'page':
                total_pages_count += 1
                pagelist_noclean.append((id, title, content))
                print_percentage(total_pages_count, pages_count)

            elem.clear()  # free processed elements to keep memory flat on large dumps

    elapsed_time = time.time() - start_time
    print("  ** Finish parse corpus")
    print("  - Elapsed time parse corpus : {}".format(
        hms_string(elapsed_time)))

    return pagelist_noclean
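
A small self-contained run of parse_corpus on a hypothetical two-page mini dump (ET.iterparse also accepts a file object, so io.BytesIO stands in for a file on disk; the project helpers are stubbed as before):

import io

def print_percentage(i, n): pass  # stub for the project helper
def hms_string(s): return "{:.3f}s".format(s)  # stub for the project helper

xml = b"""<mediawiki>
  <page>
    <title>A</title>
    <id>1</id>
    <revision><id>100</id><text>Hello [[B]]</text></revision>
  </page>
  <page>
    <title>B</title>
    <id>2</id>
    <revision><id>101</id><text>World</text></revision>
  </page>
</mediawiki>"""

pages = parse_corpus(io.BytesIO(xml), pages_count=2)
print(pages)  # [(1, 'A', 'Hello [[B]]'), (2, 'B', 'World')]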
Example #3
import time

def create_clean_tokens_pagelist(pagelist_plaintext):
    """Tokenize and clean the plain-text content of every page."""
    start_time = time.time()
    pagelist_clean_tokens = []
    listsize = len(pagelist_plaintext)

    for i, (id, title, content) in enumerate(pagelist_plaintext):
        content_clean_tokens = get_clean_tokens(content, remove_section=True)
        pagelist_clean_tokens.append((id, title, content_clean_tokens))
        print_percentage(i, listsize)

    elapsed_time = time.time() - start_time
    print("  ** Finish create clean tokens pagelist")
    print("  - Elapsed time create clean tokens pagelist : {}".format(
        hms_string(elapsed_time)))
    return pagelist_clean_tokens
Example #4
import time

def create_plaintext_pagelist(pagelist_noclean):
    """Convert every page's raw wiki markup to plain text."""
    start_time = time.time()
    pagelist_plaintext = []
    listsize = len(pagelist_noclean)

    for i, (id, title, content) in enumerate(pagelist_noclean):
        text = wiki_to_paintext(content)
        pagelist_plaintext.append((id, title, text))
        print_percentage(i, listsize)

    elapsed_time = time.time() - start_time
    print("  ** Finish create plaintext pagelist")
    print("  - Elapsed time create plaintext pagelist : {}".format(
        hms_string(elapsed_time)))
    return pagelist_plaintext
Example #5
import time

def create_links_pagelist(pagelist_noclean):
    """Extract the [[...]] links from every page's raw content."""
    start_time = time.time()
    pagelist_links = []
    listsize = len(pagelist_noclean)

    for i, (id, title, content) in enumerate(pagelist_noclean):
        links = get_links(content)
        pagelist_links.append((id, title, links))
        print_percentage(i, listsize)

    elapsed_time = time.time() - start_time
    print("  ** Finish create links pagelist")
    print("  - Elapsed time create links pagelist : {}".format(
        hms_string(elapsed_time)))
    return pagelist_links
Example #6
import time

def create_resume_pagelist(pagelist_plaintext):
    """Build a short summary of every page's plain text."""
    start_time = time.time()
    pagelist_plaintext_resume = []
    listsize = len(pagelist_plaintext)

    for i, (id, title, content) in enumerate(pagelist_plaintext):
        resume = get_resume(content)
        pagelist_plaintext_resume.append((id, title, resume))
        print_percentage(i, listsize)

    elapsed_time = time.time() - start_time
    print("  ** Finish create resume pagelist")
    print("  - Elapsed time create resume pagelist : {}".format(
        hms_string(elapsed_time)))
    return pagelist_plaintext_resume
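
Examples #3 through #6 share one shape and differ only in the transform applied to each page's content; a hedged generic sketch (map_pagelist is hypothetical, not part of the project):

import time

def map_pagelist(pagelist, transform, label="map pagelist"):
    """Apply transform to each page's content, keeping the (id, title, ...) shape."""
    start_time = time.time()
    out = [(id, title, transform(content)) for id, title, content in pagelist]
    print("  ** Finish {} : {:.3f}s".format(label, time.time() - start_time))
    return out

# e.g. create_links_pagelist(pl) behaves like map_pagelist(pl, get_links, "create links pagelist")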
Example #7
import time

def create_cli(pagelist_links):
    """
    edge : [[Title]] in page content
    node : page id
    :param pagelist_links: list of tuples containing (id, title, list of link titles)
    :return:
        Adjacency matrix of the web graph in CLI form: the (C, L, I) lists
        consumed by create_pagerank() in Example #1
    """
    start_time = time.time()
    listsize = len(pagelist_links)
    dic = {}
    dic_edges = {}

    # map each page title to its position in the list
    for id_list, (_, title, _) in enumerate(pagelist_links):
        dic[title] = id_list

    # keep only the links whose target title exists in the corpus
    for id_list in dic.values():
        dic_edges[id_list] = [
            link for link in pagelist_links[id_list][2] if link in dic
        ]

    C = []   # weight of each edge (1 / out-degree of its source page)
    L = [0]  # L[i]..L[i+1] delimits page i's edges in C and I
    I = []   # target page id of each edge

    for i, _ in enumerate(pagelist_links):
        links = dic_edges[i]
        edge_nb = len(links)
        val = 1 / edge_nb if edge_nb > 0 else 0

        for link in links:  # already filtered against dic above
            C.append(val)
            I.append(dic[link])

        L.append(L[-1] + edge_nb)
        print_percentage(i, listsize)

    elapsed_time = time.time() - start_time
    print("  ** Finish create cli")
    print("  - Elapsed time create cli : {}".format(hms_string(elapsed_time)))
    return C, L, I
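
The same hypothetical 3-page graph as in the sketch after Example #1, this time produced by create_cli from a toy pagelist_links (stubs again so the snippet runs standalone):

def print_percentage(i, n): pass  # stub for the project helper
def hms_string(s): return "{:.3f}s".format(s)  # stub for the project helper

pagelist_links = [
    (10, "A", ["B", "C"]),  # three pages; A links to B and C
    (11, "B", ["C"]),       # B links to C
    (12, "C", []),          # C has no out-links
]

C, L, I = create_cli(pagelist_links)
print(C, L, I)  # [0.5, 0.5, 1.0] [0, 2, 3, 3] [1, 2, 2] -- ready for create_pagerank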
Example #8
import math
import time

def create_dico(pagelist_clean_tokens):
    """
    :param pagelist_clean_tokens: list of pages to parse
    :return:
        Dictionary of the ~200k most used words plus all the words from
        titles, in the form {word: ({page_id: TF_normalized}, IDF)}
    """
    start_time = time.time()

    dico_title = dict()
    dico_text = dict()
    listsize = len(pagelist_clean_tokens)

    for id, (_, title, content) in enumerate(pagelist_clean_tokens):
        # tokenize the title
        title_clean = get_clean_tokens(title)

        # title words are weighted 10x compared to body words
        for word in title_clean:
            if word not in dico_title:  # word not in dict yet
                dico_title[word] = ({id: 10}, 0)
            elif id not in dico_title[word][0]:  # page not in posting list yet
                dico_title[word][0][id] = 10
            else:  # page already in posting list
                dico_title[word][0][id] += 10

        for word in content:
            if word not in dico_text:
                dico_text[word] = ({id: 1}, 0)
            elif id not in dico_text[word][0]:  # page not in posting list yet
                dico_text[word][0][id] = 1
            else:  # page already in posting list
                dico_text[word][0][id] += 1

        print_percentage(id, listsize)

    # merge in the 200 000 text words with the largest posting lists; a word
    # that also appears in titles gets its entry overwritten by its text counts
    dico_title.update({
        key: value
        for key, value in sorted(dico_text.items(),
                                 key=lambda item: len(item[1][0]))[-200000:]
    })


    tf_norm = dict()  # squared norm of each page's log-weighted TF vector

    for word, (occ_dic, _) in dico_title.items():
        for pageid, freq in occ_dic.items():
            if pageid not in tf_norm:
                tf_norm[pageid] = (1 + math.log10(freq))**2
            else:
                tf_norm[pageid] += (1 + math.log10(freq))**2

    # write IDF and the cosine-normalized log TF
    for word in dico_title.keys():
        idf = math.log10(listsize / len(dico_title[word][0]))
        dico_title[word] = (dico_title[word][0], idf)

        for page, tf in dico_title[word][0].items():
            # same log weighting as in tf_norm above, so each page's TF vector
            # has unit norm
            dico_title[word][0][page] = (1 + math.log10(tf)) / math.sqrt(tf_norm[page])

    elapsed_time = time.time() - start_time
    print("  ** Finish create dico")
    print("  - Elapsed time create dico : {}".format(hms_string(elapsed_time)))
    return dico_title
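
A hedged sketch of how the returned structure can be queried: score_query is hypothetical and simply sums TF_normalized * IDF over the query words a page contains, one common way to rank documents with such an index:

def score_query(query_tokens, dico):
    # accumulate TF_normalized * IDF for every query word a page contains
    scores = {}
    for word in query_tokens:
        if word not in dico:
            continue
        postings, idf = dico[word]
        for page_id, tf_normalized in postings.items():
            scores[page_id] = scores.get(page_id, 0.0) + tf_normalized * idf
    # highest-scoring pages first
    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)

toy_dico = {"python": ({1: 0.8, 2: 0.3}, 1.5), "wiki": ({2: 0.9}, 0.7)}
print(score_query(["python", "wiki"], toy_dico))  # [(1, 1.2...), (2, 1.08...)]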