def _create_tok_exurn(cite, line_list):
    c = CTS_URN(cite)
    _offsets = c.passage_component.split("-")
    start, end = _offsets[0], _offsets[-1]
    counter = line_list.count(start) + 1
    base = c.get_urn_without_passage()
    s = "{}.tbtokens.{}.{}".format(base, start, counter)
    return s, end
def main(): """Define the CLI inteface/commands.""" arguments = docopt(__doc__) cfg_filename = pkg_resources.resource_filename('knowledge_base', 'config/virtuoso.ini') kb = KnowledgeBase(cfg_filename) # the user has issued a `find` command if arguments["find"]: search_string = arguments["<search_string>"] try: urn = CTS_URN(search_string) match = kb.get_resource_by_urn(str(urn)) show_result(match, verbose=True) return except BadCtsUrnSyntax as e: pass except IndexError as e: raise e print("\nNo records with this CTS URN!\n") return try: matches = kb.search(search_string) print("\nSearching for \"%s\" yielded %s results" % (search_string, len(matches))) print_results(matches) return except SparqlReaderException as e: print("\nWildcard word needs at least 4 leading characters") # the user has issued an `add` command elif arguments["add"]: input_urn = arguments["--to"] # first let's check if it's a valid URN try: urn = CTS_URN(input_urn) except Exception as e: print("The provided URN ({}) is invalid!".format(input_urn)) return try: resource = kb.get_resource_by_urn(urn) assert resource is not None except ResourceNotFound: print("The KB does not contain a resource identified by {}".format( urn)) return print(arguments) #if arguments[""] pass
def _consolidate_result(self, urn_string, citation_string, entity_type, scope):
    urn = CTS_URN(urn_string)
    # check: does the URN have a scope but is missing the work element?
    if (urn.work is None):
        # if so, try to get the opus maximum from the KB
        opmax = self._kb.get_opus_maximum_of(urn)
        if (opmax is not None):
            logger.debug("%s is opus maximum of %s" % (opmax, urn))
            urn = CTS_URN("{}".format(opmax.get_urn()))
    return Result(citation_string, entity_type, scope, urn)
def _create_tokenized_cts_urn(document_urn, cite_string, line_list):
    """Create the CTS-URN pointing to the tokenized edition.

    The URN contains a version ("tokenized") and a supplementary citation
    level (token nr.). E.g.:

        `urn:cts:greekLit:tlg0085.tlg003.perseus-grc2.tokenized:1.1`

    Which means: token 1 of line 1 of the tokenized version of
    `tlg0085.tlg003.perseus-grc2`.

    The function returns a CTS URN and a line number, ready to be added to
    the line list; in case of span-tokens, the line nr. returned is that of
    the *end* of the span (so that every other token in that line will be
    counted starting from 2).

    Parameters
    ----------
    document_urn : str
        urn of the digital edition
    cite_string
        cite attribute in the TB file
    line_list
        complete list of the line numbers attached to all the tokens
        preceding the actual one.

    Returns
    -------
    str : the CTS_URN
    int : the last line nr. in the token cite attribute
    """
    c = CTS_URN(cite_string)
    _offsets = c.passage_component.split("-")
    start, end = _offsets[0], _offsets[-1]
    counter = line_list.count(start) + 1
    # urn:cts:greekLit:tlg0012.tlg001.allen.tokenized:1.1.1
    s = "{}.tokenized:{}.{}".format(document_urn, start, counter)
    return s, end
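# --- Usage sketch (not part of the original module) ---
# Minimal, illustrative call to `_create_tokenized_cts_urn`, assuming pyCTS is
# installed; the URNs and the line list are made-up values. Two tokens of line
# 1.1 have already been seen, so the new token becomes token nr. 3 of 1.1.
example_urn, example_last_line = _create_tokenized_cts_urn(
    "urn:cts:greekLit:tlg0012.tlg001.allen",        # document_urn
    "urn:cts:greekLit:tlg0012.tlg001.allen:1.1",    # cite attribute of the token
    ["1.1", "1.1"])                                 # lines of the preceding tokens
# example_urn       -> "urn:cts:greekLit:tlg0012.tlg001.allen.tokenized:1.1.3"
# example_last_line -> "1.1"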
def get_urn(self): """ TODO """ urn = self.ecrm_P1_is_identified_by.one try: return CTS_URN(urn) except Exception, e: raise e
def annotations_to_ctsurns(doc_metadata, annotations):
    """Attach citation metadata (CTS URN, label, Perseus URI) to `doc_metadata`."""
    from pyCTS import CTS_URN
    doc_metadata["citations"] = []
    for ann in annotations:
        label = ann[1]
        cts_urn = CTS_URN(ann[2])
        temp = {}
        if cts_urn.is_range():
            resolv_urn = "%s%s:%s" % ("http://data.perseus.org/citations/",
                                      cts_urn.get_urn_without_passage(),
                                      cts_urn._range_begin)
        else:
            resolv_urn = "%s%s" % ("http://data.perseus.org/citations/", ann[2])
        temp["perseus_uri"] = resolv_urn
        temp["label"] = label
        temp["ctsurn"] = str(cts_urn)
        doc_metadata["citations"].append(temp)
    return doc_metadata
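# --- Usage sketch (not part of the original module) ---
# Illustrative call to `annotations_to_ctsurns`; the (id, label, urn) triple and
# the metadata dict are made-up values. For a range URN the Perseus URI points
# to the beginning of the range.
example_metadata = annotations_to_ctsurns(
    {"title": "An example article"},
    [("T1", "Hom. Il. 1, 1-10", "urn:cts:greekLit:tlg0012.tlg001:1.1-1.10")])
# example_metadata["citations"][0]["perseus_uri"]
#   -> "http://data.perseus.org/citations/urn:cts:greekLit:tlg0012.tlg001:1.1"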
def align(φ):
    cts_urn = CTS_URN(φ)
    work_component = cts_urn.work_component
    passage_component = cts_urn.passage_component
    nodesList = []
    with open(r'../citation.csv', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=",", quotechar='|')
        rows = []
        for row in reader:
            rows.append(row)
    for x in range(len(rows)):
        a = rows[x][0]
        b = a.split(":")[3].split(".")[:-1]
        c = ".".join(b)
        if c == work_component:
            root = ET.parse("../" + rows[x][2]).getroot()
            new_cts_urn = CTS_URN(a + passage_component)
            r = passage_component.split("-")
            w = root[1][0]
            text = []
            if len(r) == 1:
                text.append(w[int(passage_component) - 1].text)
            if len(r) > 1:
                for i in range(int(r[0]), int(r[1]) + 1):
                    text.append(w[i - 1].text)
            node = citableNode(new_cts_urn, text)
            nodesList.append(node)
    for node in nodesList:
        if node.urn.version == 'normalized':
            normalized = node
        if node.urn.version == 'text':
            lyric = node
        if node.urn.version == 'notation':
            notation = node
        if node.urn.version == 'accent':
            accent = node
        if node.urn.version == 'meter':
            meter = node
    module = alignmentModule(cts_urn, normalized, lyric, notation, accent, meter)
    return module
def get_resource_by_urn(self, urn):
    """Fetch the resource corresponding to the input CTS URN.

    Currently supports only HucitAuthor and HucitWork.

    :param urn: the CTS URN of the resource to fetch
    :return: either an instance of `HucitAuthor` or of `HucitWork`
    """
    search_query = """
        PREFIX frbroo: <http://erlangen-crm.org/efrbroo/>
        PREFIX crm: <http://erlangen-crm.org/current/>
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

        SELECT ?resource_URI

        WHERE {
            ?resource_URI crm:P1_is_identified_by ?urn .
            ?urn a crm:E42_Identifier .
            ?urn rdfs:label "%s"
        }
    """ % urn
    # check type of the input URN
    try:
        assert isinstance(urn, CTS_URN)
    except Exception as e:
        # convert to pyCTS.CTS_URN if it's a string
        urn = CTS_URN(urn)
        logger.debug('Converted the input urn from string to %s' % type(urn))
    if (urn.work is not None):
        Work = self._session.get_class(surf.ns.EFRBROO['F1_Work'])
        result = self._store.execute_sparql(search_query)
        if len(result['results']['bindings']) == 0:
            raise ResourceNotFound
        else:
            tmp = result['results']['bindings'][0]
            resource_uri = tmp['resource_URI']['value']
            return self._session.get_resource(resource_uri, Work)
    elif (urn.work is None and urn.textgroup is not None):
        Person = self._session.get_class(surf.ns.EFRBROO['F10_Person'])
        result = self._store.execute_sparql(search_query)
        if len(result['results']['bindings']) == 0:
            raise ResourceNotFound
        else:
            tmp = result['results']['bindings'][0]
            resource_uri = tmp['resource_URI']['value']
            return self._session.get_resource(resource_uri, Person)
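# --- Usage sketch (not part of the original module) ---
# How `get_resource_by_urn` is typically called, assuming `kb` is a
# `KnowledgeBase` instance backed by a configured triple store (as in `main()`
# above); the URN is illustrative.
try:
    work = kb.get_resource_by_urn("urn:cts:greekLit:tlg0012.tlg001")
    print(work.get_urn())  # the CTS URN of the matched HucitWork
except ResourceNotFound:
    print("No resource identified by that URN in the knowledge base")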
def guess_speaker(sent, tei_file):
    u = CTS_URN(list(sent[0].misc["cite"])[0])
    book, line = u.passage_component.split(".")
    # retrieve the line in TEI
    xp = "//tei:div[@subtype='Book' and @n='{}']/descendant::tei:l[@n='{}']".format(
        book, line)
    l = tei_file.xpath(xp, namespaces=ns)[0]
    p = l.getparent()
    try:
        speaker = p.attrib["who"]
    except KeyError:
        speaker = "Narrator"
    sent.set_meta("speaker", speaker)
def get_urn(self): """ Assumes that each HucitAuthor has only one CTS URN. """ # TODO: check type try: type_ctsurn = self.session.get_resource( BASE_URI_TYPES % "CTS_URN", self.session.get_class(surf.ns.ECRM['E55_Type'])) urn = [ CTS_URN(urnstring.rdfs_label.one) for urnstring in self.ecrm_P1_is_identified_by if urnstring.uri == surf.ns.ECRM['E42_Identifier'] and urnstring.ecrm_P2_has_type.first == type_ctsurn ][0] return urn except Exception as e: return None
def get_urn(self): """ Get the CTS URN that identifies the work. :return: an instance of `pyCTS.CTS_URN` or None """ try: type_ctsurn = self.session.get_resource( BASE_URI_TYPES % "CTS_URN", self.session.get_class(surf.ns.ECRM['E55_Type'])) urn = [ CTS_URN(urnstring.rdfs_label.one) for urnstring in self.ecrm_P1_is_identified_by if urnstring.uri == surf.ns.ECRM['E42_Identifier'] and urnstring.ecrm_P2_has_type.first == type_ctsurn ][0] return urn except Exception, e: return None
def load_brat_data(extractor, knowledge_base, postaggers, aph_ann_files,
                   aph_titles):
    """Load a set of brat documents and prepare them in a format suitable
    for further processing (typically for evaluation or training).

    :param extractor: instance of `core.citation_extractor`
    :param knowledge_base: instance of `knowledge_base.KnowledgeBase`
    :param aph_ann_files: a tuple: [0] the base directory; [1] a list of file names
    :param aph_titles: `pandas.DataFrame` with column 'title'
    :return: a `pandas.DataFrame` (columns: 'surface', 'surface_norm', 'scope',
        'type', 'other_mentions', 'prev_mentions', 'urn', 'urn_clean', 'doc_id',
        'doc_title', 'doc_title_mentions', 'doc_title_norm', 'doc_text',
        'sentence_start', 'sentence_end')
    """
    from citation_extractor.pipeline import extract_entity_mentions

    cols = [
        'surface', 'surface_norm', 'scope', 'type', 'other_mentions',
        'prev_mentions', 'urn', 'urn_clean', 'doc_id', 'doc_title',
        'doc_title_mentions', 'doc_title_norm', 'doc_text',
        'sentence_start', 'sentence_end'
    ]
    df_data = pd.DataFrame(dtype='object', columns=cols)

    # Read all annotated files
    ann_dir, files = aph_ann_files
    for filename in files:
        if filename.endswith('.ann'):
            logger.debug('Reading file: {}'.format(filename))

            # Read doc annotations
            file_suffix = filename.replace('-doc-1.ann', '')
            entities, relations, disambiguations = read_ann_file_new(
                file_suffix, ann_dir + '/')

            # Read doc text
            doc_text = None
            filename_text = filename.replace('.ann', '.txt')
            with open(os.path.join(ann_dir, filename_text)) as f:
                doc_text = f.read()
                doc_text = unicode(doc_text, 'utf-8')
            logger.debug(u'Document text: {}'.format(doc_text))
            doc_newlines = _find_newlines(doc_text)

            # Get title
            doc_title = None
            file_id = file_suffix.replace('.txt', '')
            if file_id in aph_titles.index:
                doc_title = aph_titles.loc[file_id, 'title']
                doc_title = unicode(doc_title, 'utf-8')
            logger.debug(u'Document title: {}'.format(doc_title))

            try:
                # Extract mentions from the title, list of (type, surface) tuples
                doc_title_extracted_mentions = extract_entity_mentions(
                    doc_title, extractor, postaggers, norm=True)
            except Exception as e:
                doc_title_extracted_mentions = []
                print(e)
                print(doc_title)
                print(file_id)

            # Normalize title
            doc_title_norm = StringUtils.normalize(doc_title)

            # Order the appearance of the mentions in the doc
            ordered_mentions = sort_mentions_by_appearance(entities, relations)
            logger.debug('Mentions appearance: {}'.format(ordered_mentions))

            # Rearrange disambiguations
            disambiguations_new = dict(
                map(lambda e: (e['anchor'], e['text']), disambiguations))

            prev_entities = []
            for mention_id in ordered_mentions:
                # Note: added utf-8 encoding after new error
                mention_data_id = file_id + '-' + mention_id.encode('utf-8')
                mention_urn = NIL_ENTITY
                clean_urn = mention_urn
                mention_surface = None
                mention_scope = None
                mention_type = None
                sentence_start = None
                sentence_end = None

                # It's a relation
                if mention_id.startswith('R'):
                    relation = relations[mention_id]

                    # Unpack the relation
                    entity_0 = entities[relation['arguments'][0]]
                    entity_1 = entities[relation['arguments'][1]]

                    # Sanity check for types of relation members
                    no_refscope = ['AAUTHOR', 'AWORK', 'REFAUWORK']
                    if entity_0['entity_type'] in no_refscope and \
                            entity_1['entity_type'] == 'REFSCOPE':
                        pass
                    elif entity_1['entity_type'] in no_refscope and \
                            entity_0['entity_type'] == 'REFSCOPE':
                        logger.warning(
                            'Swapped entities in relation {} in doc {}'.format(
                                mention_id, filename))
                        entity_0 = entities[relation['arguments'][1]]
                        entity_1 = entities[relation['arguments'][0]]
                    else:
                        logger.error(
                            'Unknown types in relation {} in doc {}'.format(
                                mention_id, filename))
                        continue

                    # Update fields
                    if mention_id in disambiguations_new:
                        mention_urn = disambiguations_new[mention_id]
                    mention_surface = entity_0['surface']
                    mention_scope = entity_1['surface']
                    mention_type = entity_0['entity_type']
                    if entity_0["offset_start"] > entity_1["offset_start"]:
                        sentence_start = _find_linenumber_by_offset(
                            int(entity_1["offset_start"]),
                            int(entity_1["offset_end"]), doc_newlines)[0]
                        sentence_end = _find_linenumber_by_offset(
                            int(entity_0["offset_start"]),
                            int(entity_0["offset_end"]), doc_newlines)[0]
                    else:
                        sentence_start = _find_linenumber_by_offset(
                            int(entity_0["offset_start"]),
                            int(entity_0["offset_end"]), doc_newlines)[0]
                        sentence_end = _find_linenumber_by_offset(
                            int(entity_1["offset_start"]),
                            int(entity_1["offset_end"]), doc_newlines)[0]

                # It's a non-relation
                elif mention_id.startswith('T'):
                    entity = entities[mention_id]

                    # Avoid disambiguating the mention if it's a REFSCOPE (alone)
                    if entity['entity_type'] == 'REFSCOPE':
                        logger.warning(
                            'Lonely REFSCOPE with id: {} in doc: {}'.format(
                                mention_id, filename))
                        continue

                    # Update fields
                    if mention_id in disambiguations_new:
                        mention_urn = disambiguations_new[mention_id]
                    mention_surface = entity['surface']
                    mention_type = entity['entity_type']
                    mention_offset_start = int(entity['offset_start'])
                    mention_offset_end = int(entity['offset_end'])
                    sentence_start = _find_linenumber_by_offset(
                        mention_offset_start, mention_offset_end,
                        doc_newlines)[0]
                    sentence_end = sentence_start

                else:
                    logger.error('Unknown mention id: {} in doc {}'.format(
                        mention_id, filename))
                    continue

                # Get clean URN (without passage), skip if non-valid
                if mention_urn != NIL_ENTITY:
                    try:
                        cts_urn = CTS_URN(mention_urn)
                        clean_urn = cts_urn.get_urn_without_passage()
                        knowledge_base.get_resource_by_urn(clean_urn)
                    except Exception as e:
                        logger.error(e)
                        logger.warning(
                            'Failed parsing the URN: |{}| at: {}'.format(
                                mention_urn, file_id))
                        continue

                # Keep track of previous mentions
                mention_prev_entities = list(prev_entities)  # copy
                prev_entities.append(mention_data_id)

                df_data.loc[mention_data_id, 'surface'] = mention_surface
                df_data.loc[mention_data_id, 'sentence_start'] = sentence_start
                df_data.loc[mention_data_id, 'sentence_end'] = sentence_end
                df_data.loc[mention_data_id,
                            'surface_norm'] = StringUtils.normalize(mention_surface)
                df_data.loc[mention_data_id, 'scope'] = mention_scope
                df_data.loc[mention_data_id, 'type'] = mention_type
                df_data.loc[mention_data_id,
                            'prev_mentions'] = mention_prev_entities
                df_data.loc[mention_data_id, 'doc_id'] = file_id
                df_data.loc[mention_data_id, 'doc_title'] = doc_title
                df_data.loc[mention_data_id,
                            'doc_title_mentions'] = doc_title_extracted_mentions
                df_data.loc[mention_data_id, 'doc_title_norm'] = doc_title_norm
                df_data.loc[mention_data_id, 'doc_text'] = doc_text
                df_data.loc[mention_data_id, 'urn'] = mention_urn
                df_data.loc[mention_data_id, 'urn_clean'] = clean_urn

            # Add successfully parsed mentions of the doc to the
            # other_mentions field of each mention of the doc
            for m_id in prev_entities:
                other_mentions = list(prev_entities)
                other_mentions.remove(m_id)
                df_data.loc[m_id, 'other_mentions'] = other_mentions

    return df_data
def annotations2references(doc_id, directory, kb):
    """Read annotations from a brat stand-off file (.ann).

    For each entity and relation keep also the context, i.e. the
    containing sentences.

    TODO:
    - add author and work labels
    - if annotation is a scope relation, add work- and author-urn
    - if annotation is an AWORK, add work- and author-urn
    """
    def find_newlines(text, newline=u'\n'):
        positions = []
        last_position = 0
        if (text.find(newline) == -1):
            return positions
        else:
            while (text.find(newline, last_position + 1) > -1):
                last_position = text.find(newline, last_position + 1)
                positions.append((last_position, last_position + len(newline)))
            return positions

    def find_linenumber_newlineoffset_for_string(offset_start, offset_end,
                                                 newline_offsets):
        """ TODO """
        for n, nl_offset in enumerate(newline_offsets):
            # print offset_start, offset_end, nl_offset
            if (offset_start <= nl_offset[0] and offset_end <= nl_offset[0]):
                return (n, newline_offsets[n - 1][1], newline_offsets[n][0])

    import knowledge_base

    entities, relations, disambiguations = read_ann_file_new(doc_id, directory)
    fulltext = codecs.open("%s%s%s" % (directory, doc_id, "-doc-1.txt"), "r",
                           "utf-8").read()
    newlines = find_newlines(fulltext)
    annotations = []

    for disambiguation in disambiguations:
        annotation = {}
        anchor = disambiguation["anchor"]
        urn = disambiguation["text"]
        ann_id = disambiguation["ann_id"]

        # the annotation refers to a scope relation
        if (anchor.startswith("R")):
            entity_ids = relations[anchor]["arguments"]
            annotation["annotation_type"] = relations[anchor][
                "relation_type"].lower()
            arg_entities = [entities[id] for id in entity_ids]
            ann_type = relations[anchor]["relation_type"].lower()
            spanning_lines = [
                find_linenumber_newlineoffset_for_string(
                    int(entity["offset_start"]), int(entity["offset_end"]),
                    newlines) for entity in arg_entities
            ]
            line_numbers = list(set([line[0] for line in spanning_lines]))
            line_numbers = sorted(line_numbers)
            start = spanning_lines[0][1]
            end = spanning_lines[-1][2]
            if (len(line_numbers) == 1):
                sentence = "\n".join(fulltext.split("\n")[line_numbers[0]])
            else:
                sentence = "\n".join(
                    fulltext.split("\n")[line_numbers[0]:line_numbers[1]])
            context = "%s<em>%s</em>%s<em>%s</em>%s" % (
                fulltext[start:int(arg_entities[0]["offset_start"])],
                fulltext[int(arg_entities[0]["offset_start"]):
                         int(arg_entities[0]["offset_end"])],
                fulltext[int(arg_entities[0]["offset_end"]):
                         int(arg_entities[1]["offset_start"])],
                fulltext[int(arg_entities[1]["offset_start"]):
                         int(arg_entities[1]["offset_end"])],
                fulltext[int(arg_entities[1]["offset_end"]):end])
            annotation["surface"] = " ".join(
                [entity["surface"] for entity in arg_entities])
            annotation["context"] = context
            annotation["line_number"] = line_numbers[0]

        # the annotation refers to an entity
        elif (anchor.startswith("T")):
            entity = entities[anchor]
            annotation["annotation_type"] = entity["entity_type"].lower()
            line_number, start, end = find_linenumber_newlineoffset_for_string(
                int(entity["offset_start"]), int(entity["offset_end"]),
                newlines)
            sentence = fulltext.split("\n")[line_number]
            before_mention = sentence[start - start:
                                      int(entity["offset_start"]) - start]
            mention = sentence[int(entity["offset_start"]) - start:
                               int(entity["offset_end"]) - start]
            after_mention = sentence[int(entity["offset_end"]) - start:]
            context = "%s<em>%s</em>%s" % (before_mention, mention,
                                           after_mention)
            annotation["surface"] = entity["surface"]
            annotation["context"] = context
            annotation["line_number"] = line_number

        annotation["filename"] = doc_id
        annotation["annotation_id"] = ann_id
        annotation["urn"] = urn
        annotation["anchor"] = anchor

        try:
            if (annotation["annotation_type"] == "aauthor"):
                author = kb.get_resource_by_urn(urn)
                annotation["author_label"] = "%s" % author
                annotation["work_label"] = None
                annotation["author_urn"] = str(author.get_urn())
                annotation["work_urn"] = None
            elif (annotation["annotation_type"] == "awork"):
                work = kb.get_resource_by_urn(urn)
                annotation["author_label"] = unicode(work.author)
                annotation["work_label"] = unicode(work)
                annotation["author_urn"] = str(work.author.get_urn())
                annotation["work_urn"] = str(work.get_urn())
            elif (annotation["annotation_type"] == "scope"):
                try:
                    temp = CTS_URN(annotation["urn"]).get_urn_without_passage()
                    resource = kb.get_resource_by_urn(temp)
                    if (isinstance(resource, knowledge_base.surfext.HucitWork)):
                        annotation["author_label"] = unicode(resource.author)
                        annotation["work_label"] = unicode(resource)
                        annotation["author_urn"] = str(
                            resource.author.get_urn())
                        annotation["work_urn"] = str(resource.get_urn())
                    elif (isinstance(resource,
                                     knowledge_base.surfext.HucitAuthor)):
                        annotation["author_label"] = unicode(resource)
                        annotation["work_label"] = None
                        annotation["author_urn"] = str(resource.get_urn())
                        annotation["work_urn"] = None
                except Exception as e:
                    annotation["author_label"] = None
                    annotation["work_label"] = None
                    annotation["author_urn"] = None
                    annotation["work_urn"] = None
                    logger.error(
                        "Annotation %s raised the following error: %s" %
                        (annotation, e))
            annotations.append(annotation)
        except Exception as e:
            logger.error("The annotations %s raised an error: %s" %
                         (annotation, e))
    logger.info("Read %i annotations from file %s%s" %
                (len(annotations), directory, doc_id))
    return annotations
_lines = []
for i, (annsent, meta) in enumerate(zip(tqdm(ann_sents), sents_meta)):
    sdic = {}
    for e, tok in enumerate(annsent):
        t_line = process_token(tok)
        if isinstance(tok, Word):
            tid = _create_cite2urn("tokens", version, wprefix, tok_counter)
            t_line.insert(0, tid)
            tokurn, lend = _create_tokenized_cts_urn(docurn, tok.cite, _lines)
            _lines.append(lend)
            t_line.append(tokurn)
            tok_counter += 1
            words.append(t_line)
            if tok.id == "1":
                start_token_counter = CTS_URN(
                    tokurn).passage_component.split("-")[-1]
        elif isinstance(tok, Artificial):
            tid = _create_cite2urn("artificial", version, wprefix, art_counter)
            t_line.insert(0, tid)
            arts.append(t_line)
            art_counter += 1
        sdic[tok.id] = tid
    sid = _create_cite2urn("sentences", version, wprefix, i + 1)
    sdic["0"] = sid
    # s: id, ctsurn, speaker, author, title, subdoc
    c = CTS_URN(tokurn).passage_component.split("-")[-1]
    sent_urn = "{}.tokenized:{}-{}".format(docurn, start_token_counter, c)
    s = [sid, config["author"], config["work"], meta.subdoc, sent_urn]
def _disambiguate_relation(self, citation_string, entity_type, scope,
                           n_guess=1):
    """Disambiguate a relation.

    :param citation_string: e.g. "Hom. Il."
    :param scope: e.g. "1,100"
    :return: a named tuple (see `Result`)
    """
    match = None

    # citation string has one single token
    if len(citation_string.split(" ")) == 1:
        match = self.matches_work(citation_string,
                                  self.fuzzy_match_relations,
                                  self.distance_relations)

        # TODO this is problematic
        # should be: match is None or match does not contain at least
        # one entry with distance=0
        zero_distance_match = False
        if match is not None:
            for m in match:
                if m[2] == 0:
                    zero_distance_match = True
        logger.debug("[%s %s] zero distance match is %s, match = %s" %
                     (citation_string, scope, zero_distance_match, match))

        if match is None or not zero_distance_match:
            match = self.matches_author(citation_string,
                                        self.fuzzy_match_relations,
                                        self.distance_relations)

        if match is not None:
            if (len(match) <= n_guess):
                match = match[:n_guess]
            else:
                match = select_lcs_match(citation_string, match, n_guess)
            for urn_string, label, score in match:
                result = self._consolidate_result(urn_string, citation_string,
                                                  entity_type, scope)
                return result

    # citation string has two tokens
    elif (len(citation_string.split(" ")) == 2):
        tok1, tok2 = citation_string.split(" ")

        # case 2: tok1 and tok2 are author
        match = self.matches_author(citation_string,
                                    self.fuzzy_match_relations,
                                    self.distance_relations)
        if match is not None:
            if (len(match) <= n_guess):
                match = match[:n_guess]
            else:
                match = select_lcs_match(citation_string, match, n_guess)
            for urn_string, label, score in match:
                result = self._consolidate_result(urn_string, citation_string,
                                                  entity_type, scope)
                return result
        else:
            # case 3: tok1 and tok2 are work
            match = self.matches_work(citation_string,
                                      self.fuzzy_match_relations,
                                      self.distance_relations)
            if match is not None:
                if (len(match) <= n_guess):
                    match = match[:n_guess]
                else:
                    match = select_lcs_match(citation_string, match, n_guess)
                for urn_string, label, score in match:
                    result = self._consolidate_result(
                        urn_string, citation_string, entity_type, scope)
                    return result

        # case 1: tok1 is author and tok2 is work
        match_tok1 = self.matches_author(tok1, self.fuzzy_match_relations,
                                         self.distance_relations)
        match_tok2 = self.matches_work(tok2, self.fuzzy_match_relations,
                                       self.distance_relations)
        if (match_tok1 is not None and match_tok2 is not None):
            for id1, label1, score1 in match_tok1:
                for id2, label2, score2 in match_tok2:
                    work = self._kb.get_resource_by_urn(id2)
                    if id1 == str(work.author.get_urn()):
                        match = [(id2, label2, score2)]
                        return Result(citation_string, entity_type, scope,
                                      CTS_URN(id2))
                    else:
                        logger.debug(
                            "The combination: {} and {} was ruled out".format(
                                id1, id2))

    # citation string has more than two tokens
    elif (len(citation_string.split(" ")) > 2):
        match = self.matches_author(citation_string,
                                    self.fuzzy_match_relations,
                                    self.distance_relations)
    else:
        logger.error("This case is not handled properly: {}".format(
            citation_string))
        raise

    # return only n_guess results
    if match is None or len(match) == 0:
        logger.debug("\'%s %s\': no disambiguation candidates were found." %
                     (citation_string, scope))
        return Result(citation_string, entity_type, scope, NIL_URN)
    elif len(match) <= n_guess:
        logger.debug("There are %i matches and `n_guess`==%i. Nothing to cut."
                     % (len(match), n_guess))
    elif len(match) > n_guess:
        logger.debug("There are %i matches: selecting based on LCS" %
                     len(match))
        match = select_lcs_match(citation_string, match, n_guess)

    for urn_string, label, score in match:
        result = self._consolidate_result(urn_string, citation_string,
                                          entity_type, scope)
        return result
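# --- Usage sketch (not part of the original module) ---
# How the relation-disambiguation step above would typically be driven,
# assuming `matcher` is an instance of the class defining
# `_disambiguate_relation`, wired to a populated knowledge base; the citation
# string, entity type and scope are illustrative values. The call returns a
# `Result` named tuple carrying the disambiguated CTS URN, or NIL_URN when no
# candidate is found.
result = matcher._disambiguate_relation("Hom. Il.", "scope", "1,100")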
def _disambiguate_relation(self, citation_string, entity_type, scope,
                           n_guess=1):
    # TODO: finish debugging
    """Disambiguate a relation.

    :param citation_string: e.g. "Hom. Il."
    :param scope: e.g. "1,100"
    :return: a named tuple (see `Result`)
    """
    # citation string has one single token
    if len(citation_string.split(" ")) == 1:
        match = self.matches_work(citation_string,
                                  self.fuzzy_match_relations,
                                  self.distance_relations)

        # TODO this is problematic
        # should be: match is None or match does not contain at least
        # one entry with distance=0
        zero_distance_match = False
        if match is not None:
            for m in match:
                if m[2] == 0:
                    zero_distance_match = True
        logger.debug("[%s %s] zero distance match is %s, match = %s" %
                     (citation_string, scope, zero_distance_match, match))
        if match is None or not zero_distance_match:
            match = self.matches_author(citation_string,
                                        self.fuzzy_match_relations,
                                        self.distance_relations)
        """
        if match is not None:
            # match = [(id,name,diff) for id, name, diff in match if diff == 0][:n_guess]
            # this has to be removed
            pass
        else:
            # fuzzy matching as author
            # then fuzzy matching as work
            # at the end take the matching with lowest score
            pass
        """

    # citation string has two tokens
    elif (len(citation_string.split(" ")) == 2):
        tok1, tok2 = citation_string.split(" ")

        # case 1: tok1 is author and tok2 is work
        match_tok1 = self.matches_author(tok1, self.fuzzy_match_relations,
                                         self.distance_relations)
        match_tok2 = self.matches_work(tok2, self.fuzzy_match_relations,
                                       self.distance_relations)
        if (match_tok1 is not None and match_tok2 is not None):
            for id1, label1, score1 in match_tok1:
                for id2, label2, score2 in match_tok2:
                    if id1 in id2:
                        match = [(id2, label2, score2)]
                        return Result(citation_string, entity_type, scope,
                                      CTS_URN(id2))
        else:
            # case 2: tok1 and tok2 are author
            match = self.matches_author(citation_string,
                                        self.fuzzy_match_relations,
                                        self.distance_relations)
            if match is None:
                # case 3: tok1 and tok2 are work
                match = self.matches_work(citation_string,
                                          self.fuzzy_match_relations,
                                          self.distance_relations)

    # citation string has more than two tokens
    elif (len(citation_string.split(" ")) > 2):
        match = self.matches_author(citation_string,
                                    self.fuzzy_match_relations,
                                    self.distance_relations)
    else:
        logger.error("This case is not handled properly: %s" %
                     citation_string)
        raise

    # return only n_guess results
    if match is None or len(match) == 0:
        logger.debug("\'%s %s\': no disambiguation candidates were found." %
                     (citation_string, scope))
        return Result(citation_string, entity_type, scope, NIL_URN)
    elif len(match) <= n_guess:
        logger.debug("There are %i matches and `n_guess`==%i. Nothing to cut."
                     % (len(match), n_guess))
    elif len(match) > n_guess:
        # iterate and get what's the lowest ed_score
        # then keep only the matches with lowest (best) score
        # then keep the one with longest common string
        lowest_score = 1000
        for m in match:
            score = m[2]
            if score < lowest_score:
                lowest_score = score
        filtered_matches = [m for m in match if m[2] == lowest_score]
        best_match = ("", None)
        if (lowest_score > 0):
            for match in filtered_matches:
                lcs = longest_common_substring(match[1], citation_string)
                if (len(lcs) > len(best_match[0])):
                    best_match = (lcs, match)
            match = [best_match[1]]  # TODO: check this; don't think it's correct
            logger.debug("Longest_common_substring selected %s out of %s" %
                         (match, filtered_matches))
        else:
            # TODO: use context here to disambiguate
            match = match[:n_guess]

    for urn_string, label, score in match:
        urn = CTS_URN(urn_string)
        # check: does the URN have a scope but is missing the work element
        # (not possible)?
        if (urn.work is None):
            # if so, try to get the opus maximum from the KB
            opmax = self._kb.get_opus_maximum_of(urn)
            if (opmax is not None):
                logger.debug("%s is opus maximum of %s" % (opmax, urn))
                urn = CTS_URN("%s:%s" % (opmax, formatted_scope))
        return Result(citation_string, entity_type, scope, urn)