Example #1
def _create_tok_exurn(cite, line_list):
    c = CTS_URN(cite)
    _offsets = c.passage_component.split("-")
    start, end = _offsets[0], _offsets[-1]
    counter = line_list.count(start) + 1
    base = c.get_urn_without_passage()
    s = "{}.tbtokens.{}.{}".format(base, start, counter)
    return s, end
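
A minimal usage sketch for the helper above (assuming pyCTS is installed and `_create_tok_exurn` is in scope; the sample URN and the values in the comments are illustrative):

line_numbers = []  # line numbers of all previously processed tokens

tok_urn, line_end = _create_tok_exurn(
    "urn:cts:greekLit:tlg0012.tlg001.allen:1.1", line_numbers)
line_numbers.append(line_end)

# tok_urn  -> roughly "urn:cts:greekLit:tlg0012.tlg001.allen.tbtokens.1.1.1"
# line_end -> "1.1"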
Example #2
def main():
    """Define the CLI inteface/commands."""
    arguments = docopt(__doc__)
    cfg_filename = pkg_resources.resource_filename('knowledge_base',
                                                   'config/virtuoso.ini')
    kb = KnowledgeBase(cfg_filename)

    # the user has issued a `find` command
    if arguments["find"]:
        search_string = arguments["<search_string>"]
        try:
            urn = CTS_URN(search_string)
            match = kb.get_resource_by_urn(str(urn))
            show_result(match, verbose=True)
            return
        except BadCtsUrnSyntax:
            pass
        except IndexError:
            print("\nNo records with this CTS URN!\n")
            return
        try:
            matches = kb.search(search_string)
            print("\nSearching for \"%s\" yielded %s results" %
                  (search_string, len(matches)))
            print_results(matches)
            return
        except SparqlReaderException as e:
            print("\nWildcard word needs at least 4 leading characters")
    # the user has issued an `add` command
    elif arguments["add"]:
        input_urn = arguments["--to"]

        # first let's check if it's a valid URN
        try:
            urn = CTS_URN(input_urn)
        except Exception as e:
            print("The provided URN ({}) is invalid!".format(input_urn))
            return

        try:
            resource = kb.get_resource_by_urn(urn)
            assert resource is not None
        except ResourceNotFound:
            print("The KB does not contain a resource identified by {}".format(
                urn))
            return

        print(arguments)
        #if arguments[""]
        pass
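
For context, `docopt(__doc__)` parses the module docstring. A minimal usage string consistent with the keys read above (`find`, `<search_string>`, `add`, `--to`) might look like the following; this is an illustrative sketch, not the actual docstring of the knowledge_base CLI:

"""Command-line interface sketch (illustrative only).

Usage:
    kb find <search_string>
    kb add --to=<cts_urn>

Options:
    --to=<cts_urn>    CTS URN of the resource to which data is added.
"""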
Example #3
    def _consolidate_result(self, urn_string, citation_string, entity_type,
                            scope):
        urn = CTS_URN(urn_string)

        # check: does the URN have a scope but is missing the work element
        if (urn.work is None):
            # if so, try to get the opus maximum from the KB
            opmax = self._kb.get_opus_maximum_of(urn)

            if (opmax is not None):
                logger.debug("%s is opus maximum of %s" % (opmax, urn))
                urn = CTS_URN("{}".format(opmax.get_urn()))

        return Result(citation_string, entity_type, scope, urn)
Example #4
def _create_tokenized_cts_urn(document_urn, cite_string, line_list):
    """Create the CTS-URN pointing to the tokenized edition. The URN contains a version ("tokenized") and a
    supplementary citation level (token nr.). E.g.:
        `urn:cts:greekLit:tlg0085.tlg003.perseus-grc2.tokenized:1.1`

    Which means: token 1 of line 1 of the tokenized version of `tlg0085.tlg003.perseus-grc2`.
    The function returns a CTS URN and a line number, ready to be added to the line list; in case of span-tokens, the
    line nr. returned is that of the *end* of the span (so that every other token in that line will be counted
    starting from 2).

    Parameters
    ----------
    document_urn : str
        urn of the digital edition
    cite_string : str
        cite attribute in the TB file
    line_list : list
        complete list of the line numbers attached to all the tokens preceding the current one.

    Returns
    -------
    str : the CTS URN of the token
    str : the last line nr. in the token cite attribute (the end of the span for span-tokens)

    """
    c = CTS_URN(cite_string)
    _offsets = c.passage_component.split("-")
    start, end = _offsets[0], _offsets[-1]
    counter = line_list.count(start) + 1

    # urn:cts:greekLit:tlg0012.tlg001.allen.tokenized:1.1.1
    s = "{}.tokenized:{}.{}".format(document_urn, start, counter)
    return s, end
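
A usage sketch for the function above (pyCTS is assumed to be installed; the edition URN matches the illustrative one in the comment):

lines = []  # line numbers of all previously processed tokens

urn, last_line = _create_tokenized_cts_urn(
    "urn:cts:greekLit:tlg0012.tlg001.allen",      # document_urn
    "urn:cts:greekLit:tlg0012.tlg001.allen:1.1",  # cite attribute of the token
    lines)
lines.append(last_line)

# urn       -> "urn:cts:greekLit:tlg0012.tlg001.allen.tokenized:1.1.1"
# last_line -> "1.1"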
Example #5
 def get_urn(self):
     """
     TODO
     """
     urn = self.ecrm_P1_is_identified_by.one
     try:
         return CTS_URN(urn)
     except Exception as e:
         raise e
Example #6
def annotations_to_ctsurns(doc_metadata, annotations):
    """
    Add a `citations` entry to `doc_metadata`, mapping each annotation to
    its CTS URN and the corresponding Perseus citation URI.
    """
    from pyCTS import CTS_URN
    doc_metadata["citations"] = []
    for ann in annotations:
        label = ann[1]
        cts_urn = CTS_URN(ann[2])
        temp = {}
        if cts_urn.is_range():
            resolv_urn = "%s%s:%s" % ("http://data.perseus.org/citations/",
                                      cts_urn.get_urn_without_passage(),
                                      cts_urn._range_begin)
        else:
            resolv_urn = "%s%s" % ("http://data.perseus.org/citations/", ann[2])
        temp["perseus_uri"] = resolv_urn
        temp["label"] = label
        temp["ctsurn"] = str(cts_urn)
        doc_metadata["citations"].append(temp)
    return doc_metadata
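
A small usage sketch for the function above (the annotation tuples and the CTS URNs are illustrative; each annotation carries a label at index 1 and a CTS URN string at index 2):

doc_metadata = {"title": "Sample document"}
annotations = [
    ("T1", "Hom. Il. 1.1", "urn:cts:greekLit:tlg0012.tlg001:1.1"),
    ("T2", "Hom. Il. 1.1-1.5", "urn:cts:greekLit:tlg0012.tlg001:1.1-1.5"),
]
doc_metadata = annotations_to_ctsurns(doc_metadata, annotations)
# doc_metadata["citations"] now holds one dict per annotation with the
# keys "perseus_uri", "label" and "ctsurn".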
Example #7
def align(φ):
    cts_urn = CTS_URN(φ)
    work_component = cts_urn.work_component
    passage_component = cts_urn.passage_component
    nodesList = []
    with open(r'../citation.csv', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=",", quotechar='|')
        rows = []
        for row in reader:
            rows.append(row)
        for x in range(len(rows)):
            a = rows[x][0]
            b = a.split(":")[3].split(".")[:-1]
            c = ".".join(b)
            if c == work_component:
                root = ET.parse("../" + rows[x][2]).getroot()
                new_cts_urn = CTS_URN(a + passage_component)
                r = passage_component.split("-")
                w = root[1][0]
                text = []
                if len(r) == 1:
                    text.append(w[int(passage_component) - 1].text)
                if len(r) > 1:
                    for i in range(int(r[0]), int(r[1])+1):
                        text.append(w[i - 1].text)
                node = citableNode(new_cts_urn, text)
                nodesList.append(node)
    for node in nodesList:
        if node.urn.version == 'normalized':
            normalized = node
        if node.urn.version == 'text':
            lyric = node
        if node.urn.version == 'notation':
            notation = node
        if node.urn.version == 'accent':
            accent = node
        if node.urn.version == 'meter':
            meter = node
    module = alignmentModule(cts_urn, normalized, lyric, notation, accent, meter)
    return module
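
`citableNode` and `alignmentModule` are not defined in this excerpt; a minimal sketch consistent with how `align()` uses them (attribute names other than `urn` are assumptions):

class citableNode:
    """Pair a CTS URN with the text of the passage it identifies (sketch)."""

    def __init__(self, urn, text):
        self.urn = urn    # pyCTS.CTS_URN; align() reads urn.version
        self.text = text  # list of strings, one per cited element


class alignmentModule:
    """Bundle the aligned versions of one passage (sketch; field names assumed)."""

    def __init__(self, urn, normalized, lyric, notation, accent, meter):
        self.urn = urn
        self.normalized = normalized
        self.lyric = lyric
        self.notation = notation
        self.accent = accent
        self.meter = meter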
Example #8
    def get_resource_by_urn(self, urn):
        """Fetch the resource corresponding to the input CTS URN.

        Currently supports
        only HucitAuthor and HucitWork.

        :param urn: the CTS URN of the resource to fetch
        :return: either an instance of `HucitAuthor` or of `HucitWork`

        """
        search_query = """
            PREFIX frbroo: <http://erlangen-crm.org/efrbroo/>
            PREFIX crm: <http://erlangen-crm.org/current/>
            PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

            SELECT ?resource_URI

            WHERE {
                ?resource_URI crm:P1_is_identified_by ?urn .
                ?urn a crm:E42_Identifier .
                ?urn rdfs:label "%s"
            }
        """ % urn
        # check type of the input URN
        if not isinstance(urn, CTS_URN):
            # convert to pyCTS.CTS_URN if it's a string
            urn = CTS_URN(urn)
            logger.debug('Converted the input urn from string to %s' %
                         type(urn))

        if (urn.work is not None):
            Work = self._session.get_class(surf.ns.EFRBROO['F1_Work'])
            result = self._store.execute_sparql(search_query)
            if len(result['results']['bindings']) == 0:
                raise ResourceNotFound
            else:
                tmp = result['results']['bindings'][0]
                resource_uri = tmp['resource_URI']['value']
                return self._session.get_resource(resource_uri, Work)

        elif (urn.work is None and urn.textgroup is not None):
            Person = self._session.get_class(surf.ns.EFRBROO['F10_Person'])
            result = self._store.execute_sparql(search_query)
            if len(result['results']['bindings']) == 0:
                raise ResourceNotFound
            else:
                tmp = result['results']['bindings'][0]
                resource_uri = tmp['resource_URI']['value']
                return self._session.get_resource(resource_uri, Person)
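
A usage sketch (the URNs are illustrative; `kb` is assumed to be an initialized `KnowledgeBase`, as in Example #2):

# A work-level URN takes the F1_Work branch ...
work = kb.get_resource_by_urn("urn:cts:greekLit:tlg0012.tlg001")

# ... a textgroup-level URN takes the F10_Person branch; ResourceNotFound
# is raised when no matching crm:E42_Identifier exists in the triple store.
author = kb.get_resource_by_urn("urn:cts:greekLit:tlg0012")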
Example #9
def guess_speaker(sent, tei_file):
    u = CTS_URN(list(sent[0].misc["cite"])[0])
    book, line = u.passage_component.split(".")

    # retrieve the line in TEI
    xp = "//tei:div[@subtype='Book' and @n='{}']/descendant::tei:l[@n='{}']".format(
        book, line)
    line_el = tei_file.xpath(xp, namespaces=ns)[0]
    p = line_el.getparent()
    try:
        speaker = p.attrib["who"]
    except KeyError:
        speaker = "Narrator"

    sent.set_meta("speaker", speaker)
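
The `ns` mapping passed to `xpath()` is assumed to bind the `tei` prefix to the TEI namespace; a minimal sketch, with the XPath that the format string above would produce for a sample citation shown as a comment:

# Namespace mapping assumed by the xpath() call above.
ns = {"tei": "http://www.tei-c.org/ns/1.0"}

# For a token cited as e.g. urn:cts:greekLit:tlg0012.tlg001.perseus-grc2:1.10
# the rendered XPath is:
#   //tei:div[@subtype='Book' and @n='1']/descendant::tei:l[@n='10']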
Example #10
 def get_urn(self):
     """
     Assumes that each HucitAuthor has only one CTS URN.
     """
     # TODO: check type
     try:
         type_ctsurn = self.session.get_resource(
             BASE_URI_TYPES % "CTS_URN",
             self.session.get_class(surf.ns.ECRM['E55_Type']))
         urn = [
             CTS_URN(urnstring.rdfs_label.one)
             for urnstring in self.ecrm_P1_is_identified_by
             if urnstring.uri == surf.ns.ECRM['E42_Identifier']
             and urnstring.ecrm_P2_has_type.first == type_ctsurn
         ][0]
         return urn
     except Exception as e:
         return None
Example #11
    def get_urn(self):
        """
        Get the CTS URN that identifies the work.

        :return: an instance of `pyCTS.CTS_URN` or None
        """
        try:
            type_ctsurn = self.session.get_resource(
                BASE_URI_TYPES % "CTS_URN",
                self.session.get_class(surf.ns.ECRM['E55_Type']))
            urn = [
                CTS_URN(urnstring.rdfs_label.one)
                for urnstring in self.ecrm_P1_is_identified_by
                if urnstring.uri == surf.ns.ECRM['E42_Identifier']
                and urnstring.ecrm_P2_has_type.first == type_ctsurn
            ][0]
            return urn
        except Exception as e:
            return None
Example #12
def load_brat_data(extractor, knowledge_base, postaggers, aph_ann_files,
                   aph_titles):
    """
    Utility function to load a set of brat documents and prepare them
    in a format suitable for processing (typically for evaluation or training).

    :param citation_extractor: instance of `core.citation_extractor`
    :param knowledge_base: instance of `knowledge_base.KnowledgeBase`
    :param aph_ann_files: a tuple: [0] the base directory; [1] a list of file names
    :param aph_titles: `pandas.DataFrame` with column 'title'
    :return: a `pandas.DataFrame` (columns: 'surface', 'surface_norm', 'scope', 'type',
        'other_mentions', 'prev_mentions', 'urn', 'urn_clean','doc_id', 'doc_title',
        'doc_title_mentions', 'doc_title_norm', 'doc_text', 'sentence_start', 'sentence_end')

    """
    from citation_extractor.pipeline import extract_entity_mentions

    cols = [
        'surface', 'surface_norm', 'scope', 'type', 'other_mentions',
        'prev_mentions', 'urn', 'urn_clean', 'doc_id', 'doc_title',
        'doc_title_mentions', 'doc_title_norm', 'doc_text', 'sentence_start',
        'sentence_end'
    ]
    df_data = pd.DataFrame(dtype='object', columns=cols)

    # Read all annotated files
    ann_dir, files = aph_ann_files
    for filename in files:
        if filename.endswith('.ann'):
            logger.debug('Reading file: {}'.format(filename))

            # Read doc annotations
            file_suffix = filename.replace('-doc-1.ann', '')
            entities, relations, disambiguations = read_ann_file_new(
                file_suffix, ann_dir + '/')
            # Read doc text
            doc_text = None
            filename_text = filename.replace('.ann', '.txt')
            with open(os.path.join(ann_dir, filename_text)) as f:
                doc_text = f.read()
                doc_text = unicode(doc_text, 'utf-8')
            logger.debug(u'Document text: {}'.format(doc_text))
            doc_newlines = _find_newlines(doc_text)

            # Get title
            doc_title = None
            file_id = file_suffix.replace('.txt', '')
            if file_id in aph_titles.index:
                doc_title = aph_titles.loc[file_id, 'title']
                doc_title = unicode(doc_title, 'utf-8')
            logger.debug(u'Document title: {}'.format(doc_title))

            try:
                # Extract mentions from the title, list of (type, surface) tuples
                doc_title_extracted_mentions = extract_entity_mentions(
                    doc_title, extractor, postaggers, norm=True)
            except Exception as e:
                doc_title_extracted_mentions = []
                print(e)
                print(doc_title)
                print(file_id)

            # Normalize title
            doc_title_norm = StringUtils.normalize(doc_title)

            # Order the appearance of the mentions in the doc
            ordered_mentions = sort_mentions_by_appearance(entities, relations)
            logger.debug('Mentions appearance: {}'.format(ordered_mentions))

            # Rearrange disambiguations
            disambiguations_new = dict(
                map(lambda e: (e['anchor'], e['text']), disambiguations))

            prev_entities = []
            for mention_id in ordered_mentions:
                # Note: added utf-8 encoding after new error
                mention_data_id = file_id + '-' + mention_id.encode('utf-8')
                mention_urn = NIL_ENTITY
                clean_urn = mention_urn
                mention_surface = None
                mention_scope = None
                mention_type = None
                sentence_start = None
                sentence_end = None

                # It's a relation
                if mention_id.startswith('R'):
                    relation = relations[mention_id]

                    # Unpack the relation
                    entity_0 = entities[relation['arguments'][0]]
                    entity_1 = entities[relation['arguments'][1]]

                    # Sanity check for types of relation members
                    no_refscope = ['AAUTHOR', 'AWORK', 'REFAUWORK']
                    if entity_0['entity_type'] in no_refscope and entity_1[
                            'entity_type'] == 'REFSCOPE':
                        pass

                    elif entity_1['entity_type'] in no_refscope and entity_0[
                            'entity_type'] == 'REFSCOPE':
                        logger.warning(
                            'Swapped entities in relation {} in doc {}'.format(
                                mention_id, filename))
                        entity_0 = entities[relation['arguments'][1]]
                        entity_1 = entities[relation['arguments'][0]]

                    else:
                        logger.error(
                            'Unknown types in relation {} in doc {}'.format(
                                mention_id, filename))
                        continue

                    # Update fields
                    if mention_id in disambiguations_new:
                        mention_urn = disambiguations_new[mention_id]
                    mention_surface = entity_0['surface']
                    mention_scope = entity_1['surface']
                    mention_type = entity_0['entity_type']

                    if entity_0["offset_start"] > entity_1["offset_start"]:
                        sentence_start = _find_linenumber_by_offset(
                            int(entity_1["offset_start"]),
                            int(entity_1["offset_end"]), doc_newlines)[0]
                        sentence_end = _find_linenumber_by_offset(
                            int(entity_0["offset_start"]),
                            int(entity_0["offset_end"]), doc_newlines)[0]
                    else:
                        sentence_start = _find_linenumber_by_offset(
                            int(entity_0["offset_start"]),
                            int(entity_0["offset_end"]), doc_newlines)[0]
                        sentence_end = _find_linenumber_by_offset(
                            int(entity_1["offset_start"]),
                            int(entity_1["offset_end"]), doc_newlines)[0]

                # It's a non-relation
                elif mention_id.startswith('T'):
                    entity = entities[mention_id]

                    # Avoid disambiguating the mention if it's a standalone REFSCOPE
                    if entity['entity_type'] == 'REFSCOPE':
                        logger.warning(
                            'Lonely REFSCOPE with id: {} in doc: {}'.format(
                                mention_id, filename))
                        continue

                    # Update fields
                    if mention_id in disambiguations_new:
                        mention_urn = disambiguations_new[mention_id]
                    mention_surface = entity['surface']
                    mention_type = entity['entity_type']
                    mention_offset_start = int(entity['offset_start'])
                    mention_offset_end = int(entity['offset_end'])
                    sentence_start = _find_linenumber_by_offset(
                        mention_offset_start, mention_offset_end,
                        doc_newlines)[0]
                    sentence_end = sentence_start

                else:
                    logger.error('Unknown mention id: {} in doc {}'.format(
                        mention_id, filename))
                    continue

                # Get clean URN (without passage), skip if non-valid
                if mention_urn != NIL_ENTITY:
                    try:
                        cts_urn = CTS_URN(mention_urn)
                        clean_urn = cts_urn.get_urn_without_passage()
                        knowledge_base.get_resource_by_urn(clean_urn)
                    except Exception as e:
                        logger.error(e)
                        logger.warning(
                            'Failed parsing the URN: |{}| at: {}'.format(
                                mention_urn, file_id))
                        continue

                # Keep track of previous mentions
                mention_prev_entities = list(prev_entities)  # copy
                prev_entities.append(mention_data_id)

                df_data.loc[mention_data_id, 'surface'] = mention_surface
                df_data.loc[mention_data_id, 'sentence_start'] = sentence_start
                df_data.loc[mention_data_id, 'sentence_end'] = sentence_end
                df_data.loc[mention_data_id,
                            'surface_norm'] = StringUtils.normalize(
                                mention_surface)
                df_data.loc[mention_data_id, 'scope'] = mention_scope
                df_data.loc[mention_data_id, 'type'] = mention_type
                df_data.loc[mention_data_id,
                            'prev_mentions'] = mention_prev_entities
                df_data.loc[mention_data_id, 'doc_id'] = file_id
                df_data.loc[mention_data_id, 'doc_title'] = doc_title
                df_data.loc[
                    mention_data_id,
                    'doc_title_mentions'] = doc_title_extracted_mentions
                df_data.loc[mention_data_id, 'doc_title_norm'] = doc_title_norm
                df_data.loc[mention_data_id, 'doc_text'] = doc_text
                df_data.loc[mention_data_id, 'urn'] = mention_urn
                df_data.loc[mention_data_id, 'urn_clean'] = clean_urn

            # Add successfully parsed mentions of the doc to other_mentions field of each mention of the doc
            for m_id in prev_entities:
                other_mentions = list(prev_entities)
                other_mentions.remove(m_id)
                df_data.loc[m_id, 'other_mentions'] = other_mentions
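
`_find_newlines` and `_find_linenumber_by_offset` are not shown in this excerpt; a minimal sketch, modelled on the nested helpers `find_newlines` and `find_linenumber_newlineoffset_for_string` in Example #13 below:

def _find_newlines(text, newline=u'\n'):
    """Return (start, end) offsets of every newline in `text` (sketch)."""
    positions = []
    last_position = 0
    while text.find(newline, last_position + 1) > -1:
        last_position = text.find(newline, last_position + 1)
        positions.append((last_position, last_position + len(newline)))
    return positions


def _find_linenumber_by_offset(offset_start, offset_end, newline_offsets):
    """Return (line_nr, line_start, line_end) for a character span (sketch)."""
    for n, nl_offset in enumerate(newline_offsets):
        if offset_start <= nl_offset[0] and offset_end <= nl_offset[0]:
            return (n, newline_offsets[n - 1][1], newline_offsets[n][0])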
Example #13
def annotations2references(doc_id, directory, kb):
    """
    Read annotations from a brat stand-off file (.ann).
    For each entity and relation keep also the context, i.e. the containing sentences.

    TODO:
    - add author and work labels
    - if annotation is a scope relation, add work- and author-urn
    - if annotation is an AWORK, add work- and author-urn
    """
    def find_newlines(text, newline=u'\n'):
        positions = []
        last_position = 0
        if (text.find(newline) == -1):
            return positions
        else:
            while (text.find(newline, last_position + 1) > -1):
                last_position = text.find(newline, last_position + 1)
                positions.append((last_position, last_position + len(newline)))
            return positions

    def find_linenumber_newlineoffset_for_string(offset_start, offset_end,
                                                 newline_offsets):
        """
        TODO
        """
        for n, nl_offset in enumerate(newline_offsets):
            #print offset_start,offset_end,nl_offset
            if (offset_start <= nl_offset[0] and offset_end <= nl_offset[0]):
                return (n, newline_offsets[n - 1][1], newline_offsets[n][0])

    import knowledge_base

    entities, relations, disambiguations = read_ann_file_new(doc_id, directory)
    fulltext = codecs.open("%s%s%s" % (directory, doc_id, "-doc-1.txt"), "r",
                           "utf-8").read()
    newlines = find_newlines(fulltext)
    annotations = []
    for disambiguation in disambiguations:
        annotation = {}
        anchor = disambiguation["anchor"]
        urn = disambiguation["text"]
        ann_id = disambiguation["ann_id"]
        # the annotation refers to a scope relation
        if (anchor.startswith("R")):
            entity_ids = relations[anchor]["arguments"]
            annotation["annotation_type"] = relations[anchor][
                "relation_type"].lower()
            arg_entities = [entities[id] for id in entity_ids]
            ann_type = relations[anchor]["relation_type"].lower()
            spanning_lines = [
                find_linenumber_newlineoffset_for_string(
                    int(entity["offset_start"]), int(entity["offset_end"]),
                    newlines) for entity in arg_entities
            ]
            line_numbers = list(set([line[0] for line in spanning_lines]))
            line_numbers = sorted(line_numbers)
            start = spanning_lines[0][1]
            end = spanning_lines[-1][2]
            if (len(line_numbers) == 1):
                sentence = "\n".join(fulltext.split("\n")[line_numbers[0]])
            else:
                sentence = "\n".join(
                    fulltext.split("\n")[line_numbers[0]:line_numbers[1]])
            context = "%s<em>%s</em>%s<em>%s</em>%s" % (
                fulltext[start:int(arg_entities[0]["offset_start"])],
                fulltext[int(arg_entities[0]["offset_start"]
                             ):int(arg_entities[0]["offset_end"])],
                fulltext[int(arg_entities[0]["offset_end"]
                             ):int(arg_entities[1]["offset_start"])],
                fulltext[int(arg_entities[1]["offset_start"]
                             ):int(arg_entities[1]["offset_end"])],
                fulltext[int(arg_entities[1]["offset_end"]):end])
            annotation["surface"] = " ".join(
                [entity["surface"] for entity in arg_entities])
            annotation["context"] = context
            annotation["line_number"] = line_numbers[0]
        # the annotation refers to an entity
        elif (anchor.startswith("T")):
            entity = entities[anchor]
            annotation["annotation_type"] = entity["entity_type"].lower()
            line_number, start, end = find_linenumber_newlineoffset_for_string(
                int(entity["offset_start"]), int(entity["offset_end"]),
                newlines)
            sentence = fulltext.split("\n")[line_number]
            before_mention = sentence[start -
                                      start:int(entity["offset_start"]) -
                                      start]
            mention = sentence[int(entity["offset_start"]) -
                               start:int(entity["offset_end"]) - start]
            after_mention = sentence[int(entity["offset_end"]) - start:]
            context = "%s<em>%s</em>%s" % (before_mention, mention,
                                           after_mention)
            annotation["surface"] = entity["surface"]
            annotation["context"] = context
            annotation["line_number"] = line_number
        annotation["filename"] = doc_id
        annotation["annotation_id"] = ann_id
        annotation["urn"] = urn
        annotation["anchor"] = anchor
        try:
            if (annotation["annotation_type"] == "aauthor"):
                author = kb.get_resource_by_urn(urn)
                annotation["author_label"] = "%s" % author
                annotation["work_label"] = None
                annotation["author_urn"] = str(author.get_urn())
                annotation["work_urn"] = None
            elif (annotation["annotation_type"] == "awork"):
                work = kb.get_resource_by_urn(urn)
                annotation["author_label"] = unicode(work.author)
                annotation["work_label"] = unicode(work)
                annotation["author_urn"] = str(work.author.get_urn())
                annotation["work_urn"] = str(work.get_urn())
            elif (annotation["annotation_type"] == "scope"):
                try:
                    temp = CTS_URN(annotation["urn"]).get_urn_without_passage()
                    resource = kb.get_resource_by_urn(temp)
                    if (isinstance(resource,
                                   knowledge_base.surfext.HucitWork)):
                        annotation["author_label"] = unicode(resource.author)
                        annotation["work_label"] = unicode(resource)
                        annotation["author_urn"] = str(
                            resource.author.get_urn())
                        annotation["work_urn"] = str(resource.get_urn())
                    elif (isinstance(resource,
                                     knowledge_base.surfext.HucitAuthor)):
                        annotation["author_label"] = unicode(resource)
                        annotation["work_label"] = None
                        annotation["author_urn"] = str(resource.get_urn())
                        annotation["work_urn"] = None
                except Exception as e:
                    annotation["author_label"] = None
                    annotation["work_label"] = None
                    annotation["author_urn"] = None
                    annotation["work_urn"] = None
                    logger.error(
                        "Annotation %s raised the following error: %s" %
                        (annotation, e))
            annotations.append(annotation)
        except Exception as e:
            logger.error("The annotations %s raised an error: %s" %
                         (annotation, e))
    logger.info("Read %i annotations from file %s%s" %
                (len(annotations), directory, doc_id))
    return annotations
Example #14
def load_brat_data(extractor, knowledge_base, postaggers, aph_ann_files, aph_titles):
    """
    Utility function to load a set of brat documents and prepare them
    in a format suitable for processing (typically for evaluation or training).

    :param citation_extractor: instance of `core.citation_extractor`
    :param knowledge_base: instance of `knowledge_base.KnowledgeBase`
    :param aph_ann_files: a tuple: [0] the base directory; [1] a list of file names
    :param aph_titles: `pandas.DataFrame` with column 'title'
    :return: a `pandas.DataFrame` (columns: 'surface', 'surface_norm', 'scope', 'type',
        'other_mentions', 'prev_mentions', 'urn', 'urn_clean','doc_id', 'doc_title',
        'doc_title_mentions', 'doc_title_norm', 'doc_text', 'sentence_start', 'sentence_end')

    """
    from citation_extractor.pipeline import extract_entity_mentions

    cols = ['surface', 'surface_norm', 'scope', 'type', 'other_mentions', 'prev_mentions', 'urn', 'urn_clean',
            'doc_id', 'doc_title', 'doc_title_mentions', 'doc_title_norm', 'doc_text', 'sentence_start', 'sentence_end']
    df_data = pd.DataFrame(dtype='object', columns=cols)

    # Read all annotated files
    ann_dir, files = aph_ann_files
    for filename in files:
        if filename.endswith('.ann'):
            logger.debug('Reading file: {}'.format(filename))

            # Read doc annotations
            file_suffix = filename.replace('-doc-1.ann', '')
            entities, relations, disambiguations = read_ann_file_new(file_suffix, ann_dir + '/')
            # Read doc text
            doc_text = None
            filename_text = filename.replace('.ann', '.txt')
            with open(os.path.join(ann_dir, filename_text)) as f:
                doc_text = f.read()
                doc_text = unicode(doc_text, 'utf-8')
            logger.debug(u'Document text: {}'.format(doc_text))
            doc_newlines = _find_newlines(doc_text)

            # Get title
            doc_title = None
            file_id = file_suffix.replace('.txt', '')
            if file_id in aph_titles.index:
                doc_title = aph_titles.loc[file_id, 'title']
                doc_title = unicode(doc_title, 'utf-8')
            logger.debug(u'Document title: {}'.format(doc_title))

            try:
                # Extract mentions from the title, list of (type, surface) tuples
                doc_title_extracted_mentions = extract_entity_mentions(doc_title, extractor, postaggers, norm=True)
            except Exception as e:
                doc_title_extracted_mentions = []
                print(e)
                print(doc_title)
                print(file_id)

            # Normalize title
            doc_title_norm = StringUtils.normalize(doc_title)

            # Order the appearance of the mentions in the doc
            ordered_mentions = sort_mentions_by_appearance(entities, relations)
            logger.debug('Mentions appearance: {}'.format(ordered_mentions))

            # Rearrange disambiguations
            disambiguations_new = dict(map(lambda e: (e['anchor'], e['text']), disambiguations))

            prev_entities = []
            for mention_id in ordered_mentions:
                # Note: added utf-8 encoding after new error 
                mention_data_id = file_id + '-' + mention_id.encode('utf-8')
                mention_urn = NIL_ENTITY
                clean_urn = mention_urn
                mention_surface = None
                mention_scope = None
                mention_type = None
                sentence_start = None
                sentence_end = None

                # It's a relation
                if mention_id.startswith('R'):
                    relation = relations[mention_id]

                    # Unpack the relation
                    entity_0 = entities[relation['arguments'][0]]
                    entity_1 = entities[relation['arguments'][1]]

                    # Sanity check for types of relation members
                    no_refscope = ['AAUTHOR', 'AWORK', 'REFAUWORK']
                    if entity_0['entity_type'] in no_refscope and entity_1['entity_type'] == 'REFSCOPE':
                        pass

                    elif entity_1['entity_type'] in no_refscope and entity_0['entity_type'] == 'REFSCOPE':
                        logger.warning('Swapped entities in relation {} in doc {}'.format(mention_id, filename))
                        entity_0 = entities[relation['arguments'][1]]
                        entity_1 = entities[relation['arguments'][0]]

                    else:
                        logger.error('Unknown types in relation {} in doc {}'.format(mention_id, filename))
                        continue

                    # Update fields
                    if mention_id in disambiguations_new:
                        mention_urn = disambiguations_new[mention_id]
                    mention_surface = entity_0['surface']
                    mention_scope = entity_1['surface']
                    mention_type = entity_0['entity_type']

                    if entity_0["offset_start"] > entity_1["offset_start"]:
                        sentence_start = _find_linenumber_by_offset(int(entity_1["offset_start"])
                                                                , int(entity_1["offset_end"])
                                                                , doc_newlines)[0]
                        sentence_end = _find_linenumber_by_offset(int(entity_0["offset_start"])
                                                                , int(entity_0["offset_end"])
                                                                , doc_newlines)[0]
                    else:
                        sentence_start = _find_linenumber_by_offset(int(entity_0["offset_start"])
                                                                , int(entity_0["offset_end"])
                                                                , doc_newlines)[0]
                        sentence_end = _find_linenumber_by_offset(int(entity_1["offset_start"])
                                                                , int(entity_1["offset_end"])
                                                                , doc_newlines)[0]


                # It's a non-relation
                elif mention_id.startswith('T'):
                    entity = entities[mention_id]

                    # Avoid disambiguating the mention if it's a standalone REFSCOPE
                    if entity['entity_type'] == 'REFSCOPE':
                        logger.warning('Lonely REFSCOPE with id: {} in doc: {}'.format(mention_id, filename))
                        continue

                    # Update fields
                    if mention_id in disambiguations_new:
                        mention_urn = disambiguations_new[mention_id]
                    mention_surface = entity['surface']
                    mention_type = entity['entity_type']
                    mention_offset_start = int(entity['offset_start'])
                    mention_offset_end = int(entity['offset_end'])
                    sentence_start = _find_linenumber_by_offset(mention_offset_start
                                                            , mention_offset_end
                                                            , doc_newlines)[0]
                    sentence_end = sentence_start

                else:
                    logger.error('Unknown mention id: {} in doc {}'.format(mention_id, filename))
                    continue

                # Get clean URN (without passage), skip if non-valid
                if mention_urn != NIL_ENTITY:
                    try:
                        cts_urn = CTS_URN(mention_urn)
                        clean_urn = cts_urn.get_urn_without_passage()
                        knowledge_base.get_resource_by_urn(clean_urn)
                    except Exception as e:
                        logger.error(e)
                        logger.warning('Failed parsing the URN: |{}| at: {}'.format(mention_urn, file_id))
                        continue

                # Keep track of previous mentions
                mention_prev_entities = list(prev_entities) # copy
                prev_entities.append(mention_data_id)

                df_data.loc[mention_data_id, 'surface'] = mention_surface
                df_data.loc[mention_data_id, 'sentence_start'] = sentence_start
                df_data.loc[mention_data_id, 'sentence_end'] = sentence_end
                df_data.loc[mention_data_id, 'surface_norm'] = StringUtils.normalize(mention_surface)
                df_data.loc[mention_data_id, 'scope'] = mention_scope
                df_data.loc[mention_data_id, 'type'] = mention_type
                df_data.loc[mention_data_id, 'prev_mentions'] = mention_prev_entities
                df_data.loc[mention_data_id, 'doc_id'] = file_id
                df_data.loc[mention_data_id, 'doc_title'] = doc_title
                df_data.loc[mention_data_id, 'doc_title_mentions'] = doc_title_extracted_mentions
                df_data.loc[mention_data_id, 'doc_title_norm'] = doc_title_norm
                df_data.loc[mention_data_id, 'doc_text'] = doc_text
                df_data.loc[mention_data_id, 'urn'] = mention_urn
                df_data.loc[mention_data_id, 'urn_clean'] = clean_urn

            # Add successfully parsed mentions of the doc to other_mentions field of each mention of the doc
            for m_id in prev_entities:
                other_mentions = list(prev_entities)
                other_mentions.remove(m_id)
                df_data.loc[m_id, 'other_mentions'] = other_mentions
Example #15
    _lines = []
    for i, (annsent, meta) in enumerate(zip(tqdm(ann_sents), sents_meta)):
        sdic = {}
        for e, tok in enumerate(annsent):
            t_line = process_token(tok)
            if isinstance(tok, Word):
                tid = _create_cite2urn("tokens", version, wprefix, tok_counter)
                t_line.insert(0, tid)
                tokurn, lend = _create_tokenized_cts_urn(
                    docurn, tok.cite, _lines)
                _lines.append(lend)
                t_line.append(tokurn)
                tok_counter += 1
                words.append(t_line)
                if tok.id == "1":
                    start_token_counter = CTS_URN(
                        tokurn).passage_component.split("-")[-1]
            elif isinstance(tok, Artificial):
                tid = _create_cite2urn("artificial", version, wprefix,
                                       art_counter)
                t_line.insert(0, tid)
                arts.append(t_line)
                art_counter += 1
            sdic[tok.id] = tid
        sid = _create_cite2urn("sentences", version, wprefix, i + 1)
        sdic["0"] = sid

        # s: id, ctsurn, speaker, author, title, subdoc
        c = CTS_URN(tokurn).passage_component.split("-")[-1]
        sent_urn = "{}.tokenized:{}-{}".format(docurn, start_token_counter, c)
        s = [sid, config["author"], config["work"], meta.subdoc, sent_urn]
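
`_create_cite2urn` is not shown in this fragment; it presumably mints CITE2 URNs for tokens, artificial tokens and sentences. A purely hypothetical sketch (the namespace and collection naming scheme are assumptions, not the project's actual convention):

def _create_cite2urn(collection, version, work_prefix, counter):
    # CITE2 URNs follow urn:cite2:<namespace>:<collection>.<version>:<object-id>
    # (hypothetical namespace "exampleNs" used here).
    return "urn:cite2:exampleNs:{}_{}.{}:{}".format(
        work_prefix, collection, version, counter)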
Example #16
    def _disambiguate_relation(self,
                               citation_string,
                               entity_type,
                               scope,
                               n_guess=1):
        """Disambiguate a relation.

        :citation_string: e.g. "Hom. Il."
        :scope: e.g. "1,100"
        :return: a named tuple (see `Result`)
        """
        match = None

        # citation string has one single token
        if len(citation_string.split(" ")) == 1:

            match = self.matches_work(citation_string,
                                      self.fuzzy_match_relations,
                                      self.distance_relations)

            # TODO this is problematic
            # should be: match is None or match does not contain at least one entry with distance=0
            zero_distance_match = False
            if match is not None:
                for m in match:
                    if m[2] == 0:
                        zero_distance_match = True

            logger.debug("[%s %s] zero distance match is %s, match = %s" %
                         (citation_string, scope, zero_distance_match, match))

            if match is None or not zero_distance_match:
                match = self.matches_author(citation_string,
                                            self.fuzzy_match_relations,
                                            self.distance_relations)
            if match is not None:
                if (len(match) <= n_guess):
                    match = match[:n_guess]
                else:
                    match = select_lcs_match(citation_string, match, n_guess)

                for urn_string, label, score in match:
                    result = self._consolidate_result(urn_string,
                                                      citation_string,
                                                      entity_type, scope)
                    return result

        # citation string has two tokens
        elif (len(citation_string.split(" ")) == 2):
            tok1, tok2 = citation_string.split(" ")

            # case 2: tok1 and tok2 are author
            match = self.matches_author(citation_string,
                                        self.fuzzy_match_relations,
                                        self.distance_relations)

            if match is not None:
                if (len(match) <= n_guess):
                    match = match[:n_guess]
                else:
                    match = select_lcs_match(citation_string, match, n_guess)

                for urn_string, label, score in match:
                    result = self._consolidate_result(urn_string,
                                                      citation_string,
                                                      entity_type, scope)
                    return result
            else:
                # case 3: tok1 and tok2 are work
                match = self.matches_work(citation_string,
                                          self.fuzzy_match_relations,
                                          self.distance_relations)
                if match is not None:
                    if (len(match) <= n_guess):
                        match = match[:n_guess]
                    else:
                        match = select_lcs_match(citation_string, match,
                                                 n_guess)

                    for urn_string, label, score in match:
                        result = self._consolidate_result(
                            urn_string, citation_string, entity_type, scope)
                        return result

            # case 1: tok1 is author and tok2 is work
            match_tok1 = self.matches_author(tok1, self.fuzzy_match_relations,
                                             self.distance_relations)
            match_tok2 = self.matches_work(tok2, self.fuzzy_match_relations,
                                           self.distance_relations)

            if (match_tok1 is not None and match_tok2 is not None):

                for id1, label1, score1 in match_tok1:
                    for id2, label2, score2 in match_tok2:
                        work = self._kb.get_resource_by_urn(id2)

                        if id1 == str(work.author.get_urn()):
                            match = [(id2, label2, score2)]
                            return Result(citation_string, entity_type, scope,
                                          CTS_URN(id2))
                        else:
                            logger.debug(
                                "The combination: {} and {} was ruled out".
                                format(id1, id2))

        # citation string has more than two tokens
        elif (len(citation_string.split(" ")) > 2):
            match = self.matches_author(citation_string,
                                        self.fuzzy_match_relations,
                                        self.distance_relations)
        else:
            logger.error("This case is not handled properly: {}".format(
                citation_string))
            raise ValueError(
                "Unhandled citation string: {}".format(citation_string))

        # return only n_guess results
        if match is None or len(match) == 0:
            logger.debug(
                "\'%s %s\': no disambiguation candidates were found." %
                (citation_string, scope))
            return Result(citation_string, entity_type, scope, NIL_URN)

        elif len(match) <= n_guess:
            logger.debug(
                "There are %i matches and `n_guess`==%i. Nothing to cut." %
                (len(match), n_guess))

        elif len(match) > n_guess:
            logger.debug("There are %i matches: selecting based on LCS" %
                         len(match))
            match = select_lcs_match(citation_string, match, n_guess)

        for urn_string, label, score in match:
            result = self._consolidate_result(urn_string, citation_string,
                                              entity_type, scope)
            return result
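
The `Result` named tuple referenced in the docstring is not part of this excerpt; a minimal sketch consistent with the positional calls `Result(citation_string, entity_type, scope, urn)` above (field names are assumptions):

from collections import namedtuple

# Field order mirrors the positional arguments used in the method above.
Result = namedtuple("Result", ["mention", "entity_type", "scope", "urn"])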
Example #17
    def _disambiguate_relation(self,
                               citation_string,
                               entity_type,
                               scope,
                               n_guess=1):  #TODO: finish debugging
        """
        :citation_string: e.g. "Hom. Il."
        :scope: e.g. "1,100"
        :return: a named tuple (see `Result`)
        """

        # citation string has one single token
        if len(citation_string.split(" ")) == 1:

            match = self.matches_work(citation_string,
                                      self.fuzzy_match_relations,
                                      self.distance_relations)

            # TODO this is problematic
            # should be: match is None or match does not contain at least one entry with distance=0
            zero_distance_match = False
            if match is not None:
                for m in match:
                    if m[2] == 0:
                        zero_distance_match = True

            logger.debug("[%s %s] zero distance match is %s, match = %s" %
                         (citation_string, scope, zero_distance_match, match))

            if match is None or not zero_distance_match:
                match = self.matches_author(citation_string,
                                            self.fuzzy_match_relations,
                                            self.distance_relations)
            """
            if match is not None:
                #match = [(id,name,diff) for id, name, diff in match if diff == 0][:n_guess] # this has to be removed
                pass
            else:
                # fuzzy matching as author
                # then fuzzy matching as work
                # ad the end take the matching with lowest score
                pass
            """

        # citation string has two tokens
        elif (len(citation_string.split(" ")) == 2):
            tok1, tok2 = citation_string.split(" ")

            # case 1: tok1 is author and tok2 is work
            match_tok1 = self.matches_author(tok1, self.fuzzy_match_relations,
                                             self.distance_relations)
            match_tok2 = self.matches_work(tok2, self.fuzzy_match_relations,
                                           self.distance_relations)

            if (match_tok1 is not None and match_tok2 is not None):

                for id1, label1, score1 in match_tok1:
                    for id2, label2, score2 in match_tok2:
                        if id1 in id2:
                            match = [(id2, label2, score2)]
                            return Result(citation_string, entity_type, scope,
                                          CTS_URN(id2))
            else:
                # case 2: tok1 and tok2 are author
                match = self.matches_author(citation_string,
                                            self.fuzzy_match_relations,
                                            self.distance_relations)

                if match is None:
                    # case 3: tok1 and tok2 are work
                    match = self.matches_work(citation_string,
                                              self.fuzzy_match_relations,
                                              self.distance_relations)

        # citation string has more than two tokens
        elif (len(citation_string.split(" ")) > 2):

            match = self.matches_author(citation_string,
                                        self.fuzzy_match_relations,
                                        self.distance_relations)

        else:
            logger.error("This case is not handled properly: %s" %
                         citation_string)
            raise ValueError(
                "Unhandled citation string: %s" % citation_string)

        # return only n_guess results
        if match is None or len(match) == 0:
            logger.debug(
                "\'%s %s\': no disambiguation candidates were found." %
                (citation_string, scope))
            return Result(citation_string, entity_type, scope, NIL_URN)

        elif len(match) <= n_guess:
            logger.debug(
                "There are %i matches and `n_guess`==%i. Nothing to cut." %
                (len(match), n_guess))

        elif len(match) > n_guess:
            # iterate and get what's the lowest ed_score
            # then keep only the matches with lowest (best) score
            # then keep the one with longest common string
            lowest_score = 1000

            for m in match:
                score = m[2]
                if score < lowest_score:
                    lowest_score = score

            filtered_matches = [m for m in match if m[2] == lowest_score]

            best_match = ("", None)

            if (lowest_score > 0):
                for match in filtered_matches:
                    lcs = longest_common_substring(match[1], citation_string)
                    if (len(lcs) > len(best_match[0])):
                        best_match = (lcs, match)
                match = [best_match[1]]  # TODO: check this; don't think it's correct
                logger.debug("Longest_common_substring selected %s out of %s" %
                             (match, filtered_matches))
            else:
                # TODO: use context here to disambiguate
                match = match[:n_guess]

        for urn_string, label, score in match:

            urn = CTS_URN(urn_string)

            # check: does the URN have a scope but is missing the work element (not possible)?
            if (urn.work is None):
                # if so, try to get the opus maximum from the KB
                opmax = self._kb.get_opus_maximum_of(urn)

                if (opmax is not None):
                    logger.debug("%s is opus maximum of %s" % (opmax, urn))
                    urn = CTS_URN("%s:%s" % (opmax, formatted_scope))

            return Result(citation_string, entity_type, scope, urn)