import json

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


def create_dictionaries_from_wiki_tables(input_file, output_folder):
    dict_headers = {}
    dict_page_titles = {}
    dict_captions = {}
    dict_section_titles = {}
    dict_data = {}

    with open(input_file) as json_file:
        wiki_tables = json.load(json_file)
        for table_id, wiki_table in wiki_tables.items():
            # Index page-title tokens against this table id.
            for token in preprocess_string(wiki_table['pgTitle']):
                add_to_dict(dict_page_titles, token, table_id)

            # Index section-title tokens.
            for token in preprocess_string(wiki_table['secondTitle']):
                add_to_dict(dict_section_titles, token, table_id)

            # Index caption tokens.
            for token in preprocess_string(wiki_table['caption']):
                add_to_dict(dict_captions, str(token), table_id)

            # Index header (column-title) tokens.
            for title in wiki_table['title']:
                for token in preprocess_string(title):
                    add_to_dict(dict_headers, str(token), table_id)

            # Index every token of every cell in the table body.
            for row in wiki_table['data']:
                for cell in row:
                    for token in preprocess_string(cell):
                        add_to_dict(dict_data, token, table_id)

    write_dictionary_to_file(dict_headers, output_folder + '/words_headers.json')
    write_dictionary_to_file(dict_page_titles, output_folder + '/words_page_titles.json')
    write_dictionary_to_file(dict_section_titles, output_folder + '/words_section_titles.json')
    write_dictionary_to_file(dict_captions, output_folder + '/words_captions.json')
    write_dictionary_to_file(dict_data, output_folder + '/words_data.json')
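

# NOTE: preprocess_string, add_to_dict and write_dictionary_to_file are not
# defined in this listing. What follows is a minimal sketch of what they
# plausibly do -- an assumption, not the original implementation: tokens are
# lowercased, tokenized and stopword-filtered, and each dictionary maps a
# token to the list of table ids it occurs in.
_STOP_WORDS = set(stopwords.words('english'))


def preprocess_string(text):
    # Assumption: lowercase, tokenize with NLTK, keep alphanumeric
    # non-stopword tokens.
    tokens = word_tokenize(str(text).lower())
    return [t for t in tokens if t.isalnum() and t not in _STOP_WORDS]


def add_to_dict(dictionary, token, table_id):
    # Assumption: each token maps to the list of table ids that contain it.
    dictionary.setdefault(token, [])
    if table_id not in dictionary[token]:
        dictionary[token].append(table_id)


def write_dictionary_to_file(dictionary, output_path):
    # Dump a token -> table-id-list dictionary as JSON.
    with open(output_path, 'w') as out_file:
        json.dump(dictionary, out_file)
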
def ratio_query_terms_in_page_title(query, table):
    """
    Ratio of the number of query tokens found in the page title to the total number of query tokens
    :param query:
    :param table:
    :return:
    """
    tokenized_query = preprocess_string(query)
    tokenized_page_title = preprocess_string(table['pgTitle'])
    number_found = 0
    for query_token in tokenized_query:
        if query_token in tokenized_page_title:
            number_found += 1
    # Guard against an empty query after preprocessing.
    if not tokenized_query:
        return 0.0
    return number_found / len(tokenized_query)
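
# Usage sketch (hypothetical table; assumes the preprocess_string sketch
# above, which lowercases and drops stopwords):
#     table = {'pgTitle': 'List of Olympic medalists in judo'}
#     ratio_query_terms_in_page_title('olympic medalists 2016', table)
#     # 'olympic' and 'medalists' match, '2016' does not -> 2/3
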
def pmi(table):
    """
    Takes the table and returns the ACSDb-based schema coherency score
    :param table:
    :return:
    """
    average_pmi = 0
    counter = 0
    # One token list per column header.
    preprocessed_headers = [preprocess_string(title) for title in table['title']]
    for i in range(len(preprocessed_headers) - 1):
        for j in range(i + 1, len(preprocessed_headers)):
            counter += 1
            pair_pmi = 0
            # Accumulate PMI over every pair of tokens from the two headers.
            for h1 in preprocessed_headers[i]:
                for h2 in preprocessed_headers[j]:
                    pair_pmi += compute_pmi(h1, h2, n_documents, dict_headers)
            if pair_pmi != 0:
                # Average over the number of token pairs for this header pair.
                average_pmi += pair_pmi / (len(preprocessed_headers[i]) *
                                           len(preprocessed_headers[j]))
    if counter == 0:
        return 0.0
    return average_pmi / counter
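

# compute_pmi is not defined in this listing. Below is a minimal sketch --
# an assumption, not the original implementation -- that treats dict_headers
# as a mapping from header token to the list of table ids whose schema
# contains it, so co-occurrence counts come from intersecting the id lists:
#     PMI(h1, h2) = log( (N * df(h1, h2)) / (df(h1) * df(h2)) )
# returning 0 when either token is unseen or the pair never co-occurs.
import math


def compute_pmi(h1, h2, n_documents, dict_headers):
    if h1 not in dict_headers or h2 not in dict_headers:
        return 0
    tables_h1 = set(dict_headers[h1])
    tables_h2 = set(dict_headers[h2])
    co_occurrences = len(tables_h1 & tables_h2)
    if co_occurrences == 0:
        return 0
    return math.log((n_documents * co_occurrences) /
                    (len(tables_h1) * len(tables_h2)))
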
def tokenize_table(table, incl_headers=True):
    ''' All word tokens from the page title, caption and headers of the table. '''
    pgTable_tokens = word_tokenize(table['pgTitle'])
    caption_tokens = word_tokenize(table['caption'])
    if incl_headers:
        headers_tokens = [x for title in table['title'] for x in preprocess_string(title)]
    else:
        headers_tokens = []

    # Lowercase before deduplicating so tokens differing only in case collapse.
    result = list({x.lower() for x in pgTable_tokens + caption_tokens + headers_tokens})
    return result
def preprocess_field(field):
    # Preprocess the initial string, then drop English stopwords.
    field_result = preprocess_string(field)
    stop_words = set(stopwords.words('english'))
    return [w for w in field_result if w not in stop_words]
def idf_table_body(query):
    """
    Takes the query and returns the sum of the IDF scores of the words in the table bodies
    :param query:
    :return:
    """
    preprocessed_query = preprocess_string(query)
    final_idf = 0
    for term in preprocessed_query:
        if term in dict_data:
            final_idf += compute_idf_t(n_documents, len(dict_data[term]))
    return final_idf
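

# compute_idf_t is not defined in this listing. A minimal sketch, assuming a
# standard smoothed inverse document frequency (the exact smoothing is an
# assumption); n_documents is the corpus size and document_frequency is the
# number of tables whose field contains the term. Relies on the math import
# from the compute_pmi sketch above.
def compute_idf_t(n_documents, document_frequency):
    return math.log(n_documents / (1 + document_frequency))
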
def idf_section_title(query):
    """
    Takes the query and returns the sum of the IDF scores of the words in the section titles
    :param query:
    :return:
    """
    preprocessed_query = preprocess_string(query)
    final_idf = 0
    for term in preprocessed_query:
        if term in dict_section_titles:
            final_idf += compute_idf_t(n_documents,
                                       len(dict_section_titles[term]))
    return final_idf
def term_frequency_query_in_table_body(query, table):
    """
    Total query term frequency in the table body
    :param query:
    :param table:
    :return:
    """
    if len(table['data']) > 0:
        tokenized_query = word_tokenize(query.lower())
        number_found = 0
        # Count, for every cell, how many query tokens appear among its tokens.
        for row in table['data']:
            for cell in row:
                cell_tokens = preprocess_string(cell)
                for query_token in tokenized_query:
                    if query_token in cell_tokens:
                        number_found += 1
        return number_found
    return -1
def term_frequency_query_in_left_column(query, table):
    """
    Total query term frequency in the leftmost column cells
    :param query:
    :param table:
    :return:
    """
    if len(table['data']) > 0:
        tokenized_query = word_tokenize(query.lower())
        # Tokens of the leftmost cell of every row.
        tokenized_first_column = [preprocess_string(row[0]) for row in table['data']]
        number_found = 0
        for query_token in tokenized_query:
            for cell in tokenized_first_column:
                if query_token in cell:
                    number_found += 1

        return number_found
    return -1
def idf_catch_all(query):
    """
    Takes the query and returns the sum of the IDF scores of the query words across all text fields of the tables
    (page title, section title, caption, headers and body)
    :param query:
    :return:
    """
    preprocessed_query = preprocess_string(query)
    final_idf = 0
    for term in preprocessed_query:
        if term in dict_page_titles:
            final_idf += compute_idf_t(n_documents,
                                       len(dict_page_titles[term]))
        if term in dict_section_titles:
            final_idf += compute_idf_t(n_documents,
                                       len(dict_section_titles[term]))
        if term in dict_captions:
            final_idf += compute_idf_t(n_documents, len(dict_captions[term]))
        if term in dict_headers:
            final_idf += compute_idf_t(n_documents, len(dict_headers[term]))
        if term in dict_data:
            final_idf += compute_idf_t(n_documents, len(dict_data[term]))
    return final_idf
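

# The feature functions above read module-level globals (n_documents,
# dict_headers, dict_page_titles, dict_section_titles, dict_captions,
# dict_data) that are never initialised in this listing. A minimal loading
# sketch, assuming they are simply read back from the JSON files written by
# create_dictionaries_from_wiki_tables; the folder and file layout below
# mirrors that function, but the original initialisation may differ.
def load_dictionaries(input_file, output_folder):
    global dict_headers, dict_page_titles, dict_section_titles
    global dict_captions, dict_data, n_documents

    def _load(name):
        with open(output_folder + '/' + name) as json_file:
            return json.load(json_file)

    dict_headers = _load('words_headers.json')
    dict_page_titles = _load('words_page_titles.json')
    dict_section_titles = _load('words_section_titles.json')
    dict_captions = _load('words_captions.json')
    dict_data = _load('words_data.json')

    # n_documents: total number of tables in the source corpus.
    with open(input_file) as json_file:
        n_documents = len(json.load(json_file))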