Example #1
def _align_section_entity_types(df: pd.DataFrame) -> pd.DataFrame:
    """Align the types of section entities to the most common entity type aggregated by top-section."""
    section_types = {}
    for ts, s_df in df.groupby('TS_text'):
        section_ents = set(s_df['S_ent'].unique())
        type_counter = defaultdict(int)
        for s_ent in section_ents:
            for t in dbp_store.get_transitive_types(
                    dbp_util.name2resource(str(s_ent))):
                type_counter[t] += 1
        top_types = dbp_store.get_independent_types({
            t
            for t, cnt in type_counter.items()
            if cnt == max(type_counter.values())
        })
        if top_types:
            top_type = list(top_types)[0]
            section_types.update({
                (ts, se): dbp_util.type2name(top_type)
                for se in section_ents
                if top_type in dbp_store.get_transitive_types(
                    dbp_util.name2resource(str(se)))
            })
    section_types = pd.Series(section_types, name='S_enttype_new')
    df = pd.merge(how='left',
                  left=df,
                  right=section_types,
                  left_on=['TS_text', 'S_ent'],
                  right_index=True)
    df['S_enttype_new'].fillna(df['S_enttype'], inplace=True)
    return df.drop(columns='S_enttype').rename(
        columns={'S_enttype_new': 'S_enttype'})
Example #2
def _get_resource_surface_scores(text):
    """Return resource lexicalisation scores for the given text."""
    resource_surface_scores = {}
    if not text:
        return resource_surface_scores
    resource_surface_scores[text] = 1
    direct_match = dbp_store.resolve_redirect(dbp_util.name2resource(text))
    if direct_match in dbp_store.get_resources():
        resource_surface_scores[direct_match] = 1
    for surface_match, frequency in sorted(dbp_store.get_inverse_lexicalisations(text.lower()).items(), key=operator.itemgetter(1)):
        resource_surface_scores[surface_match] = frequency
    return resource_surface_scores
Example #3
def get_object_for_label(label: str) -> str:
    """Return the object that fits the given label."""
    global __RESOURCE_INVERSE_LABELS__
    global __ONTOLOGY_INVERSE_LABELS__
    if '__RESOURCE_INVERSE_LABELS__' not in globals():
        __RESOURCE_INVERSE_LABELS__ = {v: k for k, v in _get_label_mapping().items()}
        ontology_labels = rdf_util.create_single_val_dict_from_rdf([utils.get_data_file('files.dbpedia.taxonomy')], rdf_util.PREDICATE_LABEL)
        __ONTOLOGY_INVERSE_LABELS__ = {v: k for k, v in ontology_labels.items()}
    if label in __ONTOLOGY_INVERSE_LABELS__:
        return __ONTOLOGY_INVERSE_LABELS__[label]
    if label in __RESOURCE_INVERSE_LABELS__:
        return __RESOURCE_INVERSE_LABELS__[label]
    return dbp_util.name2resource(label)
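The inverse-label lookups above are built lazily on the first call. A minimal sketch of that lazy global-cache pattern, using a made-up stand-in mapping instead of the data that _get_label_mapping() and rdf_util would actually load:

# Minimal sketch of the lazy global-cache pattern used above; the label mapping is a
# made-up stand-in, not the project's actual data loading.
__INVERSE_LABELS__ = None

def get_key_for_label(label: str) -> str:  # hypothetical helper for illustration
    global __INVERSE_LABELS__
    if __INVERSE_LABELS__ is None:  # build the inverse index only once, on first use
        labels = {'dbo:Person': 'person', 'dbo:Place': 'place'}
        __INVERSE_LABELS__ = {v: k for k, v in labels.items()}
    return __INVERSE_LABELS__.get(label, label)  # fall back to the label itself

# get_key_for_label('person')  ->  'dbo:Person'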
Example #4
def get_entity_for_wikilink(wikilink: wtp.WikiLink) -> Optional[str]:
    if not wikilink.target:
        return None
    link_target = _remove_language_tag(wikilink.target.strip())
    resource_uri = dbp_util.name2resource(str_util.capitalize(link_target))
    redirected_uri = dbp_store.resolve_spelling_redirect(resource_uri)
    if dbp_store.is_possible_resource(
            redirected_uri) and '#' not in redirected_uri:
        # return the redirected URI only if it is a Wikipedia article of its own and does not point to an article section
        final_uri = redirected_uri
    else:
        final_uri = resource_uri
    return dbp_util.resource2name(final_uri)
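Every example on this page goes through dbp_util.name2resource (and sometimes its counterpart dbp_util.resource2name). A minimal sketch of what such helpers presumably do, assuming the usual DBpedia convention of the http://dbpedia.org/resource/ prefix with spaces turned into underscores; this is an illustration, not the project's actual implementation:

# Assumed behaviour of the name <-> resource helpers, following the DBpedia URI convention.
DBP_RESOURCE_PREFIX = 'http://dbpedia.org/resource/'

def name2resource(name: str) -> str:
    """Turn a human-readable name into a DBpedia resource URI (sketch)."""
    return DBP_RESOURCE_PREFIX + name.replace(' ', '_')

def resource2name(resource: str) -> str:
    """Turn a DBpedia resource URI back into a human-readable name (sketch)."""
    return resource[len(DBP_RESOURCE_PREFIX):].replace('_', ' ')

# name2resource('Neil Young')  ->  'http://dbpedia.org/resource/Neil_Young'
# resource2name('http://dbpedia.org/resource/Neil_Young')  ->  'Neil Young'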
Example #5
def _assign_entity_types_for_section(df: pd.DataFrame,
                                     section_entity: str) -> pd.DataFrame:
    """Retrieve the types of section entities."""
    section_types = {}
    for ent in df[section_entity].unique():
        types = dbp_store.get_independent_types(
            dbp_store.get_types(dbp_util.name2resource(str(ent))))
        if types:
            section_types[ent] = dbp_util.type2name(list(types)[0])
    section_types = pd.Series(section_types, name=f'{section_entity}type')
    return pd.merge(how='left',
                    left=df,
                    right=section_types,
                    left_on=section_entity,
                    right_index=True)
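Examples 1, 5 and 11 share the same pandas idiom: build a pd.Series keyed by the entity name and left-join it onto the DataFrame with right_index=True. A runnable toy version with made-up values:

# Toy version of the merge-on-index pattern; column contents are made up.
import pandas as pd

df = pd.DataFrame({'S_ent': ['Neil Young', 'Harvest', 'Neil Young'], 'S_text': ['a', 'b', 'c']})
ent_types = pd.Series({'Neil Young': 'MusicalArtist', 'Harvest': 'Album'}, name='S_enttype')

# left join: every row keeps its data and the type is looked up via the Series index
df = pd.merge(how='left', left=df, right=ent_types, left_on='S_ent', right_index=True)
# df['S_enttype'] is now ['MusicalArtist', 'Album', 'MusicalArtist']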
Example #6
def get_resource_provenance(self, resource: str) -> set:
    """Return provenance information of a resource (i.e. which categories and lists have been used to extract it)."""
    if not self._resource_provenance:
        for node in self.nodes:
            for cat in self.get_category_parts(node):
                for res in cat_store.get_resources(cat):
                    self._resource_provenance[clg_util.dbp_resource2clg_resource(res)].add(cat)
        if self.use_listing_resources:
            for res, res_data in listing.get_page_entities(self).items():
                origin_uris = {dbp_util.name2resource(o) for o in res_data['origins']}
                self._resource_provenance[clg_util.name2clg_resource(res)].update(origin_uris)
    return self._resource_provenance[resource]
Example #7
def get_resources_from_listings(self, node: str) -> set:
    if not self._node_listing_resources:
        for res, res_data in listing.get_page_entities(self).items():
            res_nodes = {clg_util.name2clg_type(t) for t in res_data['types']}
            res_nodes.update({n for o in res_data['origins']
                              for n in self.get_nodes_for_part(dbp_util.name2resource(o))})
            res_uri = clg_util.name2clg_resource(res)
            for n in res_nodes:
                self._node_listing_resources[n].add(res_uri)
    return self._node_listing_resources[node]
Example #8
def _retrieve_training_data_wle(nlp: Language):
    listpages = list_store.get_parsed_listpages(wikipedia.ARTICLE_TYPE_ENUM)
    lp_to_cat_mapping = {
        lp: list_mapping.get_equivalent_categories(lp)
        | list_mapping.get_parent_categories(lp)
        for lp in listpages
    }
    lp_to_cat_mapping = {
        lp: cats
        for lp, cats in lp_to_cat_mapping.items() if cats
    }

    training_data = []
    # extract entities
    for lp, cats in lp_to_cat_mapping.items():
        lp_data = listpages[lp]
        for section_data in lp_data['sections']:
            for enum_data in section_data['enums']:
                for entry_data in enum_data:
                    text = entry_data['text']
                    if not text:
                        continue
                    entities = entry_data['entities']
                    if not entities:
                        continue
                    valid_entities = []
                    for entity_data in entities:
                        entity_uri = dbp_util.name2resource(
                            entity_data['name'])
                        entity_tag = _get_tag_for_types(
                            dbp_store.get_types(entity_uri))
                        if not entity_tag:
                            continue
                        entity_text = entity_data['text']
                        start = int(entity_data['idx'])
                        end = start + len(entity_text)
                        if end > len(text) or text[start:end] != entity_text:
                            continue
                        valid_entities.append((start, end, entity_tag))
                    if len(entities) == len(valid_entities):
                        training_data.append(
                            Example.from_dict(nlp.make_doc(text),
                                              {'entities': valid_entities}))
    return training_data
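The inner loop validates each entity's character offsets against the entry text before building a spaCy v3 training Example. A standalone sketch of that validation step; text, offsets and NER labels are made up, and spaCy is assumed to be installed:

# Sketch of the offset validation + Example construction (spaCy v3 API); data is made up.
import spacy
from spacy.training import Example

nlp = spacy.blank('en')
text = 'Harvest is an album by Neil Young.'
annotated = [('Harvest', 0, 'WORK_OF_ART'), ('Neil Young', 23, 'PERSON')]

entities = []
for ent_text, start, label in annotated:
    end = start + len(ent_text)
    if end <= len(text) and text[start:end] == ent_text:  # keep only offsets that really match
        entities.append((start, end, label))

example = Example.from_dict(nlp.make_doc(text), {'entities': entities})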
Example #9
def _extract_axioms(patterns: dict) -> set:
    """Return the axioms extracted by applying the patterns to Wikipedia categories."""
    axioms = {}

    for cat, (sub, pred, subcats) in patterns.items():
        if pred:  # simple mapping of label to predicate (case 1)
            if pred.lower() in predicate_names:
                axioms[cat] = (sub, predicate_names[pred.lower()], subcats)
        else:  # Voting required to discover Z (case 2)
            predicate_counts = defaultdict(int)
            for subcat, value in subcats.items():
                value = normalize_val(value)
                for res in cat_store.get_resources(subcat):
                    for pred, values in dbp_store.get_properties(res).items():
                        normalized_values = {
                            normalize_val(val)
                            for val in values
                        }
                        if value in normalized_values:
                            predicate_counts[pred] += 1
            if predicate_counts:
                pred = max(predicate_counts.items(),
                           key=operator.itemgetter(1))[0]
                axioms[cat] = (sub, pred, subcats)

    # map values to dbpedia resources if necessary (only possible if we have an object property)
    valid_axioms = {}

    for cat in axioms:
        _, pred, subcats = axioms[cat]
        if dbp_store.is_object_property(pred):
            for subcat, obj in subcats.items():
                obj_uri = dbp_util.name2resource(obj)
                if obj_uri in dbp_store.get_resources():
                    if cat in valid_axioms:
                        valid_axioms[cat][1][subcat] = obj_uri
                    else:
                        valid_axioms[cat] = (pred, {subcat: obj_uri})
        else:
            valid_axioms[cat] = (pred, subcats)

    return {(cat, pred, val)
            for pred, cat_vals in valid_axioms.values()
            for cat, val in cat_vals.items()}
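In case 2, the predicate is discovered by voting: every DBpedia predicate whose (normalized) values contain the category's value gets a vote, and the most frequent predicate wins. A tiny sketch of that counting step with made-up predicate names:

# Sketch of the predicate voting step; the votes are made up for illustration.
import operator
from collections import defaultdict

predicate_counts = defaultdict(int)
for pred in ['dbo:genre', 'dbo:genre', 'dbo:artist', 'dbo:genre']:
    predicate_counts[pred] += 1

best_pred = max(predicate_counts.items(), key=operator.itemgetter(1))[0]
# best_pred == 'dbo:genre'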
Example #10
File: cdf.py Project: nheist/Cat2Ax
def _apply_rules(pattern_dict: dict, cat: str) -> set:
    """Apply rules form `pattern_dict` and return the implied axioms."""
    cat_words = cat_store.get_label(cat).split(' ')

    axiom_patterns, pattern_lengths = _detect_pattern(pattern_dict, cat_words)
    if not axiom_patterns:
        return set()

    (pred, pred_type), additional_axioms = axiom_patterns
    front_pattern_idx = pattern_lengths[0] or None
    back_pattern_idx = -1 * pattern_lengths[1] or None
    resource = ' '.join(cat_words[front_pattern_idx:back_pattern_idx])

    if pred_type:
        resource = dbp_util.name2resource(resource)
        if resource not in dbp_store.get_resources() or pred_type not in dbp_store.get_transitive_types(resource):
            return set()
    return {(cat, pred, resource)} | {(cat, pred, val)
                                      for pred, val in additional_axioms}
Example #11
def _assign_pagetypes(df: pd.DataFrame) -> pd.DataFrame:
    """Assign (most basic and most specific) page types to the existing dataframe."""
    data = []
    for page_name in df['P'].unique():
        if page_name.startswith('List of'):
            data.append((page_name, 'List', 'List'))
            continue
        page_uri = dbp_util.name2resource(page_name)
        P_types = dbp_store.get_independent_types(
            dbp_store.get_types(page_uri))
        if not P_types:
            data.append((page_name, 'Other', 'Other'))
            continue
        P_type = sorted(P_types)[0]
        P_basetype = _get_basetype(P_type)
        data.append((page_name, dbp_util.type2name(P_type),
                     dbp_util.type2name(P_basetype)))
    return pd.merge(left=df,
                    right=pd.DataFrame(data,
                                       columns=['P', 'P_type', 'P_basetype']),
                    on='P')
Example #12
def _compute_labeled_entities_for_listpage(page_uri: str, page_data: dict,
                                           graph) -> tuple:
    positive_SEs, negative_SEs = dict(), set()
    # compute potential subject entities for list page
    page_potential_SEs = {
        dbp_util.resource2name(res)
        for cat in _get_category_descendants_for_list(page_uri)
        for res in cat_store.get_resources(cat)
    }
    # compute types of list page
    page_types = {
        t
        for n in graph.get_nodes_for_part(page_uri)
        for t in dbp_store.get_independent_types(
            graph.get_transitive_dbpedia_types(n))
    }
    page_disjoint_types = {
        dt
        for t in page_types for dt in dbp_heur.get_disjoint_types(t)
    }
    # collect all linked entities on the page
    page_entities = {
        ent['name']
        for s in page_data['sections'] for enum in s['enums'] for entry in enum
        for ent in entry['entities']
    }
    page_entities.update({
        ent['name']
        for s in page_data['sections'] for table in s['tables']
        for row in table['data'] for cell in row for ent in cell['entities']
    })
    for ent in page_entities:
        ent_uri = dbp_util.name2resource(ent)
        if not dbp_store.is_possible_resource(ent_uri):
            negative_SEs.add(ent)
        elif ent in page_potential_SEs:
            positive_SEs[ent] = _compute_entity_label(ent_uri)
        elif page_disjoint_types.intersection(dbp_store.get_types(ent_uri)):
            negative_SEs.add(ent)
    return positive_SEs, negative_SEs
Example #13
def extract_page_entities(graph) -> dict:
    utils.get_logger().info(
        f'LISTING/EXTRACT: Extracting types and relations for page entities..')

    page_entities = defaultdict(
        lambda: {
            'labels': set(),
            'origins': set(),
            'types': set(),
            'in': set(),
            'out': set()
        })

    df = context.retrieve_page_entity_context(graph)

    # extract list page entities
    utils.get_logger().info(
        f'LISTING/EXTRACT: Extracting types of list page entities..')
    df_lps = df[df['P_type'] == 'List']
    for lp, df_lp in df_lps.groupby(by='P'):
        clg_types = {
            clg_util.clg_type2name(t)
            for t in graph.get_nodes_for_part(dbp_util.name2resource(lp))
        }
        if clg_types:
            for _, row in df_lp.iterrows():
                name = row['E_ent']
                page_entities[name]['labels'].add(row['E_text'])
                page_entities[name]['origins'].add(lp)
                page_entities[name]['types'].update(clg_types)

    df = df.loc[df['P_type'] != 'List']  # ignore list pages in subsequent steps

    # compute valid combinations of types and NE tags
    df_types = context.get_entity_types(df, graph)
    dft = pd.merge(left=df, right=df_types, on='E_ent')
    valid_tags = context.get_valid_tags_for_entity_types(
        dft, graph, utils.get_config('listing.valid_tag_threshold'))

    # extract types
    utils.get_logger().info(
        f'LISTING/EXTRACT: Extracting types of page entities..')
    df_new_types = _compute_new_types(df, dft, df_types, valid_tags)
    for ent, df_ent in df_new_types.groupby(by='E_ent'):
        page_entities[ent]['labels'].update(set(df_ent['E_text'].unique()))
        page_entities[ent]['origins'].update(_get_origins_for_entity(df_ent))
        new_types = set(df_ent['E_enttype'].unique())
        transitive_types = {
            clg_util.clg_type2name(tt)
            for t in new_types
            for tt in graph.ancestors(clg_util.name2clg_type(t))
        }
        new_types = new_types.difference(
            transitive_types)  # remove transitive types
        page_entities[ent]['types'].update(new_types)

    # extract relations
    utils.get_logger().info(
        f'LISTING/EXTRACT: Extracting relations of page entities..')
    df_rels = context.get_entity_relations()
    df_new_relations = _compute_new_relations(df, df_rels, 'P', valid_tags)
    df_new_relations = pd.concat([
        df_new_relations,
        _compute_new_relations(df, df_rels, 'TS_ent', valid_tags)
    ])
    df_new_relations = pd.concat([
        df_new_relations,
        _compute_new_relations(df, df_rels, 'S_ent', valid_tags)
    ])
    for ent, df_ent in df_new_relations.groupby(by='E_ent'):
        page_entities[ent]['labels'].update(set(df_ent['E_text'].unique()))
        page_entities[ent]['origins'].update(_get_origins_for_entity(df_ent))
        rels_in = set(
            map(tuple, df_ent[~df_ent['inv']][['pred', 'target']].values))
        page_entities[ent]['in'].update(rels_in)
        rels_out = set(
            map(tuple, df_ent[df_ent['inv']][['pred', 'target']].values))
        page_entities[ent]['out'].update(rels_out)

    return dict(page_entities)
Example #14
def _parse_raw_markup_from_xml() -> dict:
    utils.get_logger().info('WIKIPEDIA/XML: Parsing raw markup from XML dump..')
    parser = etree.XMLParser(target=WikiPageParser())
    with bz2.open(utils.get_data_file('files.wikipedia.pages')) as dbp_pages_file:
        page_markup = etree.parse(dbp_pages_file, parser)
        return {dbp_util.name2resource(p): markup for p, markup in page_markup.items()}
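This works because etree.parse, when given a parser that has a target object, returns whatever the target's close() method returns (here presumably a dict mapping page names to their markup, built by WikiPageParser). A self-contained sketch of that mechanism with lxml and a made-up XML structure:

# Sketch of the target-parser mechanism: etree.parse() returns target.close()'s value.
# The XML structure and the TitleCollector class are made up for illustration.
from io import BytesIO
from lxml import etree

class TitleCollector:
    def __init__(self):
        self.titles, self._buf, self._in_title = [], [], False
    def start(self, tag, attrib):
        self._in_title = (tag == 'title')
    def data(self, text):
        if self._in_title:
            self._buf.append(text)
    def end(self, tag):
        if tag == 'title':
            self.titles.append(''.join(self._buf))
            self._buf, self._in_title = [], False
    def close(self):
        return self.titles  # this value is what etree.parse() hands back

xml = b'<pages><page><title>Neil Young</title></page><page><title>Harvest</title></page></pages>'
titles = etree.parse(BytesIO(xml), etree.XMLParser(target=TitleCollector()))
# titles == ['Neil Young', 'Harvest']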