# Standard-library imports below are inferred from usage in this listing;
# project helpers (clean, post_clean, pre_clean, dropNested,
# detect_language, is_wiki_link, has_wiki_link, first_col_is_counter,
# check_columnar_predicate, headers_checker, heuristic_check,
# data_validation, DataUtils, Config, LogUtils) are assumed to come from
# the surrounding project.
import gc
import re
from os.path import join

import wikitextparser as wtp


def get_subject_predicate(subjects, predicates, table_type, row, col):
    # Columnar tables (type 2) take the predicate from the row header and
    # the subject from the column; other types use the reverse orientation.
    if table_type == 2:
        predicate = clean(predicates[row], specify_wikilinks=False)
        subject = subjects[col]
    else:
        predicate = clean(predicates[col], specify_wikilinks=False)
        subject = subjects[row]
    return subject, predicate
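
# Usage sketch (hypothetical values; the real `clean` comes from the
# project's cleaning module). For a columnar table (type 2) the predicate
# is taken from the row index and the subject from the column index:
#
#   subjects = ['Tehran', 'Isfahan']
#   predicates = ['population', 'area']
#   get_subject_predicate(subjects, predicates, 2, row=1, col=0)
#   # -> ('Tehran', clean('area', specify_wikilinks=False))
#   get_subject_predicate(subjects, predicates, 0, row=1, col=0)
#   # -> ('Isfahan', clean('population', specify_wikilinks=False))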
def prepare_subject_predicate(table_data, table_cells, top_header_layer,
                              col_header_layer, bottom_header_layer):
    """
    :return
        subject: list of subjects
        predicate: list of predicates
        table_type : 1=first_col_is_counter, 2=normal, 3=is_columnar_predicate ( for special tables )
        useful_table: table without subjects and predicates
    """

    table_type = 0
    # Index of the first bottom-header (footer) row; data rows end here.
    bottom_header_layer_index = len(table_data) - bottom_header_layer
    useful_table = [
        row[col_header_layer:]
        for row in table_data[top_header_layer:bottom_header_layer_index]
    ]

    # Only tables with exactly one top header row and one left header
    # column are handled; other layouts fall through (returning None).
    if (top_header_layer, col_header_layer) == (1, 1):

        if first_col_is_counter([
                row[0] for row in
                table_data[top_header_layer:bottom_header_layer_index]
        ]):
            subject = [
                clean_subject(row[1]) for row in
                table_cells[top_header_layer:bottom_header_layer_index]
            ]
            predicate = [
                clean(data, specify_wikilinks=False) if data else None
                for data in table_data[0][col_header_layer + 1:]
            ]
            useful_table = [
                row[col_header_layer + 1:] for row in
                table_data[top_header_layer:bottom_header_layer_index]
            ]
            table_type = 1

        elif check_columnar_predicate(table_cells[0][col_header_layer:]):
            subject = [
                clean_subject(subject)
                for subject in table_cells[0][col_header_layer:]
            ]
            predicate = [
                clean(row[0]) if row[0] else None for row in
                table_data[top_header_layer:bottom_header_layer_index]
            ]
            table_type = 2

        else:
            # top_header_layer == col_header_layer == 1 in this branch, so
            # slice by the top header layer for clarity.
            subject = [
                clean_subject(row[0]) for row in
                table_cells[top_header_layer:bottom_header_layer_index]
            ]
            predicate = [
                clean(data, specify_wikilinks=False) if data else None
                for data in table_data[0][col_header_layer:]
            ]
        return subject, predicate, table_type, useful_table
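
# Layout sketch for the three table types distinguished above
# (hypothetical cell values):
#
#   type 0 (normal)        type 1 (counter column)   type 2 (columnar)
#   |    | p1 | p2 |       | # | subj | p1 |          |    | s1 | s2 |
#   | s1 | v  | v  |       | 1 | s1   | v  |          | p1 | v  | v  |
#   | s2 | v  | v  |       | 2 | s2   | v  |          | p2 | v  | v  |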
def clean_object_predicate(cell):
    # todo: make decision
    if not cell:
        return None
    # Resolve the wikilink only when the cell holds at most one link and
    # its string is a bare wikilink; otherwise clean it as plain text (the
    # two original multi-link branches were identical).
    if len(cell.wikilinks) <= 1 and is_wiki_link(cell.string):
        return clean(cell.string)
    return clean(cell.string, specify_wikilinks=False)
def split_infobox_values(values):
    split_values = list()
    param_values = post_clean(values)
    # Drop nested templates, then treat each line as a candidate value.
    param_values = dropNested(param_values, r'{{', r'}}')
    param_values = param_values.split('\n')
    for param_value in param_values:
        param_value = clean(param_value)
        only_wiki_links = re.findall(r"http://fa.wikipedia.org/wiki/\S+",
                                     param_value)
        without_wiki_links = re.sub(r"http://fa.wikipedia.org/wiki/\S+", '',
                                    param_value)
        splitters = set(' ()\\,،./-و•؟?%')
        if set(without_wiki_links) <= splitters:
            # The line is nothing but wikilinks and separators: emit each
            # link as a separate value.
            for value in only_wiki_links:
                if value:
                    split_values.append(
                        post_clean(value, remove_newline=True))
        else:
            # Mixed content: keep the line as one value, reducing link URLs
            # to their page titles and normalizing whitespace.
            param_value = re.sub(r"http://fa.wikipedia.org/wiki/(\S+) ?",
                                 r'\1 ', param_value).replace('_',
                                                              ' ').strip()
            param_value = re.sub(r'\s+', ' ', param_value)
            split_values.append(post_clean(param_value,
                                           remove_newline=True))

    return split_values
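
# Usage sketch (hypothetical input; `post_clean`, `dropNested`, and `clean`
# are project helpers). A line made only of wikilinks and separators splits
# into one value per link; anything else stays a single value:
#
#   split_infobox_values(
#       'http://fa.wikipedia.org/wiki/Tehran، '
#       'http://fa.wikipedia.org/wiki/Isfahan\nborn in 1980')
#   # -> ['Tehran', 'Isfahan', 'born in 1980']  (modulo cleaning)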
def get_template_name_type(template_name):
    # Strip the English and Persian ("الگو:") "Template:" prefixes and
    # normalize underscores to spaces.
    template_name = clean(str(template_name)).replace('template:', '').replace(
        'Template:', '')
    template_name = template_name.replace('الگو:', '').replace('_', ' ')
    template_type = 'template'
    lang = detect_language(template_name)

    lower_template_name = template_name.lower()
    if any(t in lower_template_name
           for t in Config.infobox_flags_fa + Config.infobox_flags_en):
        template_type = next(t for t in Config.infobox_flags_fa +
                             Config.infobox_flags_en
                             if t in lower_template_name)

    if any(s in lower_template_name
           for s in Config.stub_flag_fa + Config.stub_flag_en):
        template_type = next(s
                             for s in Config.stub_flag_fa + Config.stub_flag_en
                             if s in lower_template_name)

    return template_name, template_type, lang
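
# Usage sketch (hypothetical flags; Config.infobox_flags_* and
# Config.stub_flag_* are project-defined lists of substrings):
#
#   get_template_name_type('الگو:Infobox settlement')
#   # -> ('Infobox settlement', 'infobox', <detected language>)
#   # assuming 'infobox' appears in Config.infobox_flags_en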
def build_tuples(table, page_name, section_name, revision_id):
    image_names_types_in_fawiki = DataUtils.load_json(
        Config.extracted_image_names_types_dir,
        Config.extracted_image_names_types_filename)

    extracted_tables = 0
    try:
        table_data = table.data()
        table_cells = table.cells()
    except (IndexError, ValueError) as e:
        # library exceptions
        print(e)
        return [], 0

    if table_data and table_cells:
        col_header_layer, top_header_layer, bottom_header_layer = headers_checker(
            table_cells)
        tuples = list()

        if (top_header_layer, col_header_layer) == (1, 1):
            extracted_tables += 1
            subjects, predicates, table_type, useful_table = prepare_subject_predicate(
                table_data, table_cells, top_header_layer, col_header_layer,
                bottom_header_layer)
            population_predicate_counter = dict()
            for row_index, row in enumerate(useful_table):
                for cell_index, cell in enumerate(row):
                    if cell:
                        for value in DataUtils.split_infobox_values(cell):
                            tuple_per_row = dict()
                            tuple_per_row['object'] = clean(
                                value) if value else None
                            if DataUtils.is_image(tuple_per_row['object']):
                                tuple_per_row[
                                    'object'] = DataUtils.clean_image_value(
                                        value, image_names_types_in_fawiki)

                            tuple_per_row['subject'], tuple_per_row[
                                'predicate'] = get_subject_predicate(
                                    subjects, predicates, table_type,
                                    row_index, cell_index)
                            tuple_per_row['module'] = "wiki_table_extractor"
                            tuple_per_row[
                                'source'] = 'http://fa.wikipedia.org/wiki/' + page_name.replace(
                                    ' ', '_')
                            tuple_per_row['version'] = revision_id

                            # tuple_per_row['subject2'], tuple_per_row['predicate2'] = get_subject_predicate(subjects,
                            #                                                                                predicates,
                            #                                                                                table_type,
                            #                                                                                row_index,
                            #                                                                                cell_index)

                            # tuple_per_row['subject1'] = 'http://fa.wikipedia.org/wiki/' + page_name.replace(' ', '_')
                            # tuple_per_row['predicate1'] = section_name

                            tuple_per_row, hidden_tuple_per_row, population_predicate_counter = heuristic_check(
                                tuple_per_row, population_predicate_counter)
                            if all(
                                    data_validation(data)
                                    for data in tuple_per_row.values()):
                                tuples.append(tuple_per_row)
                                if hidden_tuple_per_row:
                                    tuples.append(hidden_tuple_per_row)
        return tuples, extracted_tables

    else:
        return [], 0
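
# Shape of a single extracted tuple, for reference (field values are
# illustrative, not from a real run):
#
#   {'subject': 'Tehran', 'predicate': 'population', 'object': '8693706',
#    'module': 'wiki_table_extractor',
#    'source': 'http://fa.wikipedia.org/wiki/Tehran',
#    'version': '12345678'}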
def clean_subject(subject_cell):
    # todo: make decision
    # A subject is usable only when the cell contains a wikilink; take the
    # first wikilink's text.
    if subject_cell and has_wiki_link(subject_cell):
        return clean(subject_cell.wikilinks[0].string)
    else:
        return None
def extract_bz2_dump_information(directory,
                                 filename,
                                 extract_abstracts=False,
                                 extract_page_ids=False,
                                 extract_revision_ids=False,
                                 extract_wiki_texts=False,
                                 extract_texts=False,
                                 extract_pages=False,
                                 extract_infoboxes=False,
                                 extract_disambiguations=False,
                                 extract_template_names=False,
                                 lang=None):
    abstracts = dict()
    page_ids = dict()
    revision_ids = dict()
    wiki_texts = dict()
    texts = dict()
    infoboxes = dict()
    pages_with_infobox = dict()
    pages_without_infobox = list()
    disambiguations = list()
    template_names_list = list()

    # Several extraction modes require each page to be parsed.
    extract_pages = (extract_pages or extract_abstracts or extract_texts
                     or extract_infoboxes or extract_disambiguations)

    pages_counter = 0
    input_filename = join(directory, filename)
    for page in DataUtils.get_wikipedia_pages(filename=input_filename):
        parsed_page = DataUtils.parse_page(page)
        pages_counter += 1

        if pages_counter % Config.logging_interval[lang] == 0:
            LogUtils.logging_information_extraction(pages_counter,
                                                    input_filename)
            gc.collect()

        # ns 10 is the Template namespace.
        if extract_template_names and parsed_page.ns.text == '10':
            template_dict = dict()
            template_name, template_type, language = DataUtils.get_template_name_type(
                parsed_page.title.text)

            template_dict['template_name'] = template_name
            template_dict['type'] = template_type
            template_dict['language_name'] = language
            template_names_list.append(template_dict)

        # Only main-namespace (article) pages beyond this point.
        if parsed_page.ns.text != '0':
            continue

        page_name = parsed_page.title.text
        page_id = parsed_page.id.text
        revision_id = parsed_page.revision.id.text
        extracted_wiki_text = parsed_page.revision.find('text').text
        # Remove the first <ref> element, if any, to obtain a ref-free
        # variant of the wikitext.
        ref_tag = parsed_page.revision.find('text').find('ref')
        if ref_tag:
            ref_tag.extract()
            extracted_wiki_text_without_ref_tag = parsed_page.revision.find(
                'text').text
        else:
            extracted_wiki_text_without_ref_tag = extracted_wiki_text

        if extract_page_ids:
            page_ids[page_id] = page_name
        if extract_revision_ids:
            revision_ids[page_name] = revision_id
        if extract_wiki_texts:
            wiki_texts[page_name] = extracted_wiki_text

        if extract_pages:
            wiki_text = wtp.parse(extracted_wiki_text)
            wiki_text_without_ref_tag = wtp.parse(
                extracted_wiki_text_without_ref_tag)
            template_names = wiki_text.templates
            template_names_without_ref_tag = wiki_text_without_ref_tag.templates

            if extract_abstracts:
                first_section = wiki_text.sections[0]
                abstract = first_section.string

                if not any(name in abstract for name in Config.redirect_flags)\
                        and not any(name in page_name for name in Config.disambigution_flags):
                    first_section_templates = first_section.templates
                    for template in first_section_templates:
                        abstract = DataUtils.post_clean(
                            abstract.replace(template.string, ''))

                    abstract = clean(abstract, specify_wikilinks=False)
                    abstracts[page_name] = abstract

            if extract_texts:
                if not any(name in wiki_text.string
                           for name in Config.redirect_flags):
                    texts[page_name] = \
                        DataUtils.post_clean(clean(DataUtils.pre_clean(
                            extracted_wiki_text), specify_wikilinks=False), remove_newline=True)

            page_has_infobox = False
            for template in template_names_without_ref_tag:
                template_name, infobox_type = DataUtils.get_infobox_name_type(
                    template.name)
                if infobox_type:
                    page_has_infobox = True

                    if page_name not in pages_with_infobox:
                        pages_with_infobox[page_name] = list()

                    if template_name not in pages_with_infobox[page_name]:
                        pages_with_infobox[page_name].append(template_name)

                    if extract_infoboxes:
                        if template_name not in infoboxes:
                            infoboxes[template_name] = dict()

                        if page_name not in infoboxes[template_name]:
                            infoboxes[template_name][page_name] = list()

                        infobox = dict()
                        for param in template.arguments:
                            param_name = clean(str(param.name))
                            # The fully cleaned value is used only to test
                            # emptiness; the stored value is pre-cleaned only.
                            param_value = DataUtils.post_clean(
                                clean(DataUtils.pre_clean(str(param.value))))
                            if param_value:
                                infobox[param_name] = DataUtils.pre_clean(
                                    str(param.value))

                        infoboxes[template_name][page_name].append(infobox)

            if not page_has_infobox:
                pages_without_infobox.append(page_name)

            if extract_disambiguations:
                disambiguation_dict = dict()
                for flag in Config.disambigution_flags:
                    if any(flag in DataUtils.get_template_name_type(
                            template.name)[0] for template in template_names):
                        disambiguation_dict['title'] = page_name
                        disambiguation_dict['field'] = \
                            DataUtils.get_disambiguation_links_regular(str(extracted_wiki_text))
                        disambiguations.append(disambiguation_dict)
                        break

            del template_names
            del wiki_text

    if extract_abstracts:
        DataUtils.save_json(Config.extracted_abstracts_dir, filename,
                            abstracts)
        if extract_pages:
            DataUtils.save_json(Config.extracted_with_infobox_dir,
                                DataUtils.get_abstracts_filename(filename),
                                abstracts,
                                filter_dict=pages_with_infobox)
            DataUtils.save_json(Config.extracted_without_infobox_dir,
                                DataUtils.get_abstracts_filename(filename),
                                abstracts,
                                filter_dict=pages_without_infobox)

    if extract_page_ids:
        DataUtils.save_json(Config.extracted_page_ids_dir, filename, page_ids)

    if extract_revision_ids:
        DataUtils.save_json(Config.extracted_revision_ids_dir, filename,
                            revision_ids)
        if extract_pages:
            DataUtils.save_json(Config.extracted_with_infobox_dir,
                                DataUtils.get_revision_ids_filename(filename),
                                revision_ids,
                                filter_dict=pages_with_infobox)
            DataUtils.save_json(Config.extracted_without_infobox_dir,
                                DataUtils.get_revision_ids_filename(filename),
                                revision_ids,
                                filter_dict=pages_without_infobox)

    if extract_wiki_texts:
        DataUtils.save_json(Config.extracted_wiki_texts_dir, filename,
                            wiki_texts)
        if extract_pages:
            DataUtils.save_json(Config.extracted_with_infobox_dir,
                                DataUtils.get_wiki_texts_filename(filename),
                                wiki_texts,
                                filter_dict=pages_with_infobox)
            DataUtils.save_json(Config.extracted_without_infobox_dir,
                                DataUtils.get_wiki_texts_filename(filename),
                                wiki_texts,
                                filter_dict=pages_without_infobox)

    if extract_texts:
        DataUtils.save_json(Config.extracted_texts_dir, filename, texts)
        if extract_pages:
            DataUtils.save_json(Config.extracted_with_infobox_dir,
                                DataUtils.get_texts_filename(filename),
                                texts,
                                filter_dict=pages_with_infobox)
            DataUtils.save_json(Config.extracted_without_infobox_dir,
                                DataUtils.get_texts_filename(filename),
                                texts,
                                filter_dict=pages_without_infobox)

    if extract_pages:
        DataUtils.save_json(Config.extracted_pages_with_infobox_dir[lang],
                            filename, pages_with_infobox)
        DataUtils.save_json(Config.extracted_pages_without_infobox_dir[lang],
                            filename, pages_without_infobox)

    if extract_infoboxes:
        DataUtils.save_json(Config.extracted_with_infobox_dir,
                            DataUtils.get_infoboxes_filename(filename),
                            infoboxes)

    if extract_disambiguations:
        DataUtils.save_json(Config.extracted_disambiguations_dir, filename,
                            disambiguations)

    if extract_template_names:
        DataUtils.save_json(Config.extracted_template_names_dir[lang],
                            filename, template_names_list)

    LogUtils.logging_information_extraction(pages_counter, input_filename)
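
# Minimal driver sketch (hypothetical paths; Config must provide the
# output directories and logging_interval used above):
#
#   if __name__ == '__main__':
#       extract_bz2_dump_information(
#           '/data/dumps', 'fawiki-latest-pages-articles.xml.bz2',
#           extract_abstracts=True, extract_infoboxes=True, lang='fa')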