def sentence_bounds(self, content, token_spans):
    """
    Returns the bounds of sentences: a zero-length bound is added after every
    sentence-final token.
    """
    sent_bounds = SpannedBounds()
    tokens = [content[t.lb:t.ub] for t in token_spans]

    # depth of opening quotes/brackets at each token: a "." inside «...», (...)
    # or [...] does not end the sentence
    opening_counts = [0 for i in token_spans]
    count = 0
    for i in range(len(opening_counts)):
        if tokens[i] in u"«([":
            count += 1
        elif tokens[i] in u"»)]":
            count -= 1
        opening_counts[i] = count

    sent_bounds.append(Span(0, 0))
    for index, span in enumerate(token_spans):
        token = tokens[index]
        if re.match(u"^[?!]+$", token) or token == u"…" or re.match(u"\.\.+", token):
            sent_bounds.append(Span(index + 1, index + 1))
        elif token == u".":
            if opening_counts[index] == 0:
                sent_bounds.append(Span(index + 1, index + 1))
        elif (index < len(token_spans) - 1
              and content[span.ub:token_spans[index + 1].lb].count("\n") > 1):
            # a blank line between two tokens also ends the sentence
            sent_bounds.append(Span(index + 1, index + 1))
    sent_bounds.append(Span(len(tokens), len(tokens)))

    return sent_bounds
def sentence_bounds(self, content, token_spans):
    sent_bounds = SpannedBounds()
    tokens = [content[t.lb:t.ub] for t in token_spans]

    openings = set([u"«", u"(", u"[", u"``"])
    closings = set([u"»", u")", u"]", u"''"])
    opening_counts = [0 for i in tokens]
    count = 0
    for i in range(len(opening_counts)):
        if tokens[i] in openings:
            count += 1
        elif tokens[i] in closings:
            count -= 1
        opening_counts[i] = count

    sent_bounds.append(Span(0, 0))
    for index, token in enumerate(tokens):
        if re.match(u"^[?!]+$", token) or token == u"…" or re.match(u"\.\.+", token):
            sent_bounds.append(Span(index + 1, index + 1))
        elif token == u".":
            if opening_counts[index] == 0:
                sent_bounds.append(Span(index + 1, index + 1))
    sent_bounds.append(Span(len(tokens), len(tokens)))

    return sent_bounds
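# Illustrative sketch (added for clarity, not part of the original module): how
# the opening/closing counter above decides whether a "." is sentence-final.
# Plain strings stand in for the library's tokens; only the counting logic is
# reproduced.
def _demo_opening_counts():
    tokens = [u"Il", u"part", u"(", u"cf", u".", u"chap", u".", u"2", u")", u"."]
    openings, closings = {u"«", u"(", u"["}, {u"»", u")", u"]"}
    count, counts = 0, []
    for token in tokens:
        if token in openings:
            count += 1
        elif token in closings:
            count -= 1
        counts.append(count)
    # the dots at indices 4 and 6 sit inside the parentheses (depth 1), so they
    # do not end the sentence; the final dot at index 9 does (depth 0)
    assert counts[4] == 1 and counts[6] == 1 and counts[9] == 0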
def conll_file(filename, fields, word_field, encoding="utf-8"):
    document = Document(os.path.basename(filename), encoding=encoding)
    document._corpus = Corpus.from_conll(filename, fields, encoding=encoding)
    character_index = 0
    sentence_index = 0
    contents = []
    word_spans = []
    sentence_spans = []
    for sentence in document._corpus.sentences:
        contents.append([])
        for token in sentence:
            word = token[word_field]
            contents[-1].append(word)
            word_spans.append(Span(character_index, character_index + len(word)))
            # +1 for the space (or newline) that joins tokens in the content
            character_index += len(word) + 1
        sentence_spans.append(Span(sentence_index, sentence_index + len(sentence)))
        sentence_index += len(sentence)
    document._content = u"\n".join([u" ".join(content) for content in contents])
    document.add_segmentation(Segmentation("tokens", spans=word_spans))
    document.add_segmentation(
        Segmentation("sentences",
                     reference=document.segmentation("tokens"),
                     spans=sentence_spans))
    return document
def force_regex(self, regex, s):
    """
    Applies a regex for elements that should be segmented in a certain way
    and splits elements accordingly.
    """
    for match in regex.finditer(s):
        self.add(Span(match.start(), match.start()))
        self.add(Span(match.end(), match.end()))
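# Illustrative sketch (added, not from the original source): where force_regex()
# puts its boundaries. The pattern and sentence are examples only; plain integers
# stand in for the zero-length Span bounds added before and after each match.
def _demo_force_regex():
    import re
    url = re.compile(r"https?://\S+", re.U)
    s = u"voir https://example.org pour plus"
    forced = []
    for match in url.finditer(s):
        forced.extend([match.start(), match.end()])
    # one boundary right before the URL, one right after it
    assert forced == [5, 24]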
def json_data(data):
    document = Document(data.get(u"name", u"_DOCUMENT_"),
                        content=data.get(u"content", u""))
    for key, value in data.get(u"metadatas", {}).items():
        document.add_metadata(key, value)

    for segmentation_name in data.get(u"segmentations", {}):
        d = data[u"segmentations"][segmentation_name]
        spans = [
            Span(lb=span[u"s"], ub=0, length=span[u"l"]) for span in d[u"spans"]
        ]
        segmentation = Segmentation(segmentation_name,
                                    spans=spans,
                                    reference=d.get(u"reference", None))
        document.add_segmentation(segmentation)
    # resolve reference names into the actual segmentation objects
    for segmentation in document.segmentations:
        if segmentation.reference is not None:
            segmentation.reference = document.segmentation(segmentation.reference)

    for annotation_name in data.get(u"annotations", {}):
        d = data[u"annotations"][annotation_name]
        annotations = [
            Tag(lb=annotation[u"s"], ub=0, length=annotation[u"l"], value=annotation[u"v"])
            for annotation in d[u"annotations"]
        ]
        annotation = Annotation(annotation_name,
                                reference=document.segmentation(d[u"reference"]),
                                annotations=annotations)
        document.add_annotation(annotation)

    return document
def sentence_bounds(self, content, token_spans):
    """
    Returns a list of bounds matching sentences.

    Parameters
    ----------
    token_spans : list of Span
        the list of token spans
    """
    sent_bounds = SpannedBounds()

    sent_bounds.add(Span(0, 0))
    for index, span in enumerate(token_spans):
        token = content[span.lb:span.ub]
        if token in u"\r\n":
            sent_bounds.add_last(Span(index, index + 1))
    sent_bounds.add_last(Span(len(token_spans), len(token_spans)))

    return sent_bounds
def bounds2spans(self, bounds):
    """
    Creates spans from bounds.
    """
    spans = [
        Span(bounds[i].ub, bounds[i + 1].lb) for i in range(0, len(bounds) - 1)
    ]
    spans = [span for span in spans if span.lb != span.ub]
    return spans
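# Illustrative sketch (added for clarity): consecutive bounds delimit the spans
# lying between them. A namedtuple stands in for the library's Span class (the
# assumption is that only .lb and .ub matter here).
def _demo_bounds2spans():
    from collections import namedtuple
    B = namedtuple("B", "lb ub")
    bounds = [B(0, 0), B(5, 6), B(11, 11)]          # bounds over u"Hello world"
    spans = [(bounds[i].ub, bounds[i + 1].lb) for i in range(len(bounds) - 1)]
    spans = [s for s in spans if s[0] != s[1]]      # drop zero-length spans
    assert spans == [(0, 5), (6, 11)]               # "Hello" and "world"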
def word_spans(self, content):
    spaces = re.compile(u"\s+", re.U + re.M)

    l = [match.span() for match in spaces.finditer(content)]
    if not l:
        # no whitespace at all: the whole content is a single token
        return [Span(0, len(content))] if content else []
    l1 = [(l[i][1], l[i + 1][0]) for i in range(len(l) - 1)]
    if l[0][0] != 0:
        l1.insert(0, (0, l[0][0]))
    if l[-1][1] != len(content):
        l1.append((l[-1][1], len(content)))

    return [Span(span[0], span[1]) for span in l1]
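# Illustrative sketch (added): the spans lying between whitespace matches are
# the token spans. Plain tuples stand in for Span objects.
def _demo_whitespace_complement():
    import re
    content = u"le chat dort"
    gaps = [m.span() for m in re.finditer(u"\s+", content)]
    spans = [(gaps[i][1], gaps[i + 1][0]) for i in range(len(gaps) - 1)]
    if gaps[0][0] != 0:
        spans.insert(0, (0, gaps[0][0]))
    if gaps[-1][1] != len(content):
        spans.append((gaps[-1][1], len(content)))
    assert [content[lb:ub] for lb, ub in spans] == [u"le", u"chat", u"dort"]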
def get_reference_spans(self):
    """
    Returns spans according to the reference chain.
    """
    if self.reference is None:
        return self.spans
    else:
        reference_spans = self.reference.get_reference_spans()
        return [
            Span(reference_spans[element.lb].lb, reference_spans[element.ub - 1].ub)
            for element in self.spans
        ]
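# Illustrative sketch (added): resolving a span expressed in token indices down
# to character offsets through its reference segmentation. Tuples (lb, ub) stand
# in for Span objects; the sentence below covers tokens 0..2 of the token layer.
def _demo_reference_spans():
    content = u"Il dort. Bien."
    token_spans = [(0, 2), (3, 7), (7, 8), (9, 13), (13, 14)]  # character offsets
    sentence = (0, 3)                                          # token indices
    resolved = (token_spans[sentence[0]][0], token_spans[sentence[1] - 1][1])
    assert content[resolved[0]:resolved[1]] == u"Il dort."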
def paragraph_bounds(self, content, sentence_spans, token_spans):
    """
    Returns a list of bounds matching paragraphs.

    Parameters
    ----------
    sentence_spans : list of Span
        the list of sentence spans
    """
    s_spans = [
        Span(token_spans[e.lb].lb, token_spans[e.ub - 1].ub) for e in sentence_spans
    ]

    paragraph_bounds = SpannedBounds()

    paragraph_bounds.add(Span(0, 0))
    for index, sentence in enumerate(sentence_spans[1:], 1):
        substring = content[s_spans[index - 1].ub:s_spans[index].lb]
        if substring.count(u"\n") > 1:
            paragraph_bounds.append(Span(index, index))
    paragraph_bounds.append(Span(len(sentence_spans), len(sentence_spans)))

    return paragraph_bounds
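# Illustrative sketch (added): two or more newlines between consecutive
# sentences mark a paragraph break; a single newline does not.
def _demo_paragraph_break():
    content = u"Premier.\n\nSecond."
    gap = content[8:10]           # the text between the two sentence spans
    assert gap.count(u"\n") > 1   # hence a paragraph bound before "Second."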
def word_bounds(self, s):
    bounds = SpannedBounds()
    bounds.append(Span(0, 0))

    atomic = set(u";:«»()[]{}=+*$£€/\\\"?!%€$£")
    apostrophe = set(u"'ʼ’")

    for index, c in enumerate(s):
        is_first = index == 0
        is_last = index == len(s) - 1
        if c.isspace():
            bounds.add_last(Span(index, index + 1))
        elif c in atomic:
            bounds.add_last(Span(index, index))
            bounds.append(Span(index + 1, index + 1))
        elif c in apostrophe:
            if is_first or is_last:
                bounds.add_last(Span(index, index))
                bounds.append(Span(index + 1, index + 1))
            elif s[index + 1] == s[index]:
                bounds.append(Span(index, index + 1))
            else:
                # English contractions: "don't" -> "do" / "n't", "John's" -> "John" / "'s"
                if s[index - 1] == u"n" and s[index + 1] == u"t":
                    bounds.append(Span(index - 1, index - 1))
                    bounds.append(Span(index + 2, index + 2))
                elif s[index + 1] == u"s":
                    bounds.append(Span(index, index))
                    bounds.append(Span(index + 2, index + 2))
                else:
                    bounds.add_last(Span(index, index))
        elif c in u".,":
            if is_first or is_last:
                bounds.add_last(Span(index, index))
                bounds.append(Span(index + 1, index + 1))
            elif (is_first or not s[index - 1].isdigit()) and (
                    is_last or not s[index + 1].isdigit()):
                # split "." and "," unless they sit between two digits (3.14, 1,000)
                bounds.add_last(Span(index, index))
                bounds.append(Span(index + 1, index + 1))
    bounds.append(Span(len(s), len(s)))

    return bounds
def document_to_data(self, document, couples, **kwargs):
    TEI = ET.Element("TEI")
    TEI.set("xmlns", "http://www.tei-c.org/ns/1.0")
    lang = document.metadata("lang")
    if lang is not None:
        TEI.set("xml:lang", lang)

    # TEI header
    teiHeader = ET.SubElement(TEI, "teiHeader")
    fileDesc = ET.SubElement(teiHeader, "fileDesc")
    titleStmt = ET.SubElement(fileDesc, "titleStmt")
    title = ET.SubElement(titleStmt, "title")
    title.text = ""
    respStmt = ET.SubElement(titleStmt, "respStmt")
    resp = ET.SubElement(respStmt, "resp")
    resp.text = ""
    name = ET.SubElement(respStmt, "name")
    name.text = ""
    publicationStmt = ET.SubElement(fileDesc, "publicationStmt")
    publisher = ET.SubElement(publicationStmt, "publisher")
    publisher.text = ""
    sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
    sourceDesc.text = ""

    root = ET.SubElement(TEI, "text")
    body = ET.SubElement(root, "body")
    div = ET.SubElement(body, "div")

    # pick the annotation field to export: "ner" first, then "chunking"
    lower = {}
    for field in couples:
        lower[field.lower()] = couples[field]
    annotations = set(document.annotations.keys())
    field = None
    if len(couples) == 1:
        field = lower[lower.keys()[0]]
    else:
        field = (lower.get("ner", None)
                 if lower.get("ner", None) in annotations else None)
        if field is None:
            field = (lower.get("chunking", None)
                     if lower.get("chunking", None) in annotations else None)
    if field is None:
        raise ValueError("Could not determine the field to use for TEI export.")

    content = document.content
    paragraphs = (document.segmentation(u"paragraphs").get_reference_spans()
                  if document.segmentation(u"paragraphs") is not None
                  else [Span(0, len(content))])
    NEs = document.annotation(field).get_reference_annotations()
    values = set([entity.value for entity in NEs])
    nth = dict([(value, 0) for value in values])

    for paragraph in paragraphs:
        entities = [
            entity for entity in NEs
            if entity.lb >= paragraph.lb and entity.ub <= paragraph.ub
        ]
        p = ET.SubElement(div, "p")
        start = paragraph.lb
        if len(entities) == 0:
            p.text = content[paragraph.lb:paragraph.ub]
        else:
            # text before the first entity, then one subelement per entity with
            # the in-between text carried by its tail
            p.text = content[paragraph.lb:entities[0].lb]
            for i, entity in enumerate(entities):
                entity_xml = ET.SubElement(p, entity.value)
                entity_xml.text = content[entity.lb:entity.ub]
                if i < len(entities) - 1:
                    entity_xml.tail = content[entity.ub:entities[i + 1].lb]
                else:
                    entity_xml.tail = content[entity.ub:paragraph.ub]

    return TEI
def document_to_data(self, document, couples, **kwargs):
    teiCorpus = ET.Element("teiCorpus")
    teiCorpus.set("xmlns", "http://www.tei-c.org/ns/1.0")

    # corpus-level header
    teiHeader = ET.SubElement(teiCorpus, "teiHeader")
    fileDesc = ET.SubElement(teiHeader, "fileDesc")
    titleStmt = ET.SubElement(fileDesc, "titleStmt")
    title = ET.SubElement(titleStmt, "title")
    title.text = ""
    respStmt = ET.SubElement(titleStmt, "respStmt")
    resp = ET.SubElement(respStmt, "resp")
    resp.text = ""
    name = ET.SubElement(respStmt, "name")
    name.text = ""
    publicationStmt = ET.SubElement(fileDesc, "publicationStmt")
    publisher = ET.SubElement(publicationStmt, "publisher")
    publisher.text = ""
    sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
    sourceDesc.text = ""

    TEI = ET.SubElement(teiCorpus, "TEI")
    teiHeader = ET.SubElement(TEI, "teiHeader")
    teiHeader.text = ""
    titleStmt = ET.SubElement(fileDesc, "titleStmt")
    title = ET.SubElement(titleStmt, "title")
    title.text = ""
    respStmt = ET.SubElement(titleStmt, "respStmt")
    resp = ET.SubElement(respStmt, "resp")
    resp.text = ""
    name = ET.SubElement(respStmt, "name")
    name.text = ""
    publicationStmt = ET.SubElement(fileDesc, "publicationStmt")
    publisher = ET.SubElement(publicationStmt, "publisher")
    publisher.text = ""
    sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
    sourceDesc.text = ""

    root = ET.SubElement(TEI, "text")
    body = ET.SubElement(root, "body")

    lower = {}
    for field in couples:
        lower[field.lower()] = couples[field]
    chunking_field = None
    try:
        chunking_field = lower["chunking"]
    except KeyError:
        message = 'No "chunking" field was found, please check you have chunking information in your pipeline.'
        tei_np_logger.exception(message)
        raise KeyError(message)

    content = document.content
    possessives = set([u"son", u"sa", u"ses"])
    # mapping from pronoun POS tags to Analec categories
    pronoun2analec = {
        u"CL": u"PR_CL",
        u"CLO": u"PR_CL_O",
        u"CLR": u"PR_CL_R",
        u"CLS": u"PR_CL_S",
        u"PRO": u"PR_PRO",
        u"PROREL": u"PR_REL",
        u"PROWH": u"PR_WH",
        u"P+PRO": u"PR_PP",
    }

    words = document.segmentation(u"tokens").get_reference_spans()
    paragraphs = (document.segmentation(u"paragraphs").get_reference_spans()
                  or [Span(0, len(content))])
    np_chunks = [
        annotation for annotation in document.annotation(chunking_field)
        if annotation.value == u"NP"
    ]
    pos_tags = document.annotation(lower["pos"])[:]
    pos = []
    for i in range(len(np_chunks)):
        chunk = np_chunks[i]
        pos.append([
            annot for annot in pos_tags
            if annot.lb >= chunk.lb and annot.ub <= chunk.ub
        ])
    # convert chunk boundaries from token indices to character offsets
    for i in range(len(np_chunks)):
        np_chunks[i].ub = words[np_chunks[i].ub - 1].ub
        np_chunks[i].lb = words[np_chunks[i].lb].lb

    nth = 0
    for paragraph in paragraphs:
        nps = [
            chunk for chunk in np_chunks
            if chunk.lb >= paragraph.lb and chunk.ub <= paragraph.ub
        ]
        p = ET.SubElement(body, "p")
        start = paragraph.lb
        if len(nps) == 0:
            p.text = content[paragraph.lb:paragraph.ub]
        else:
            p.text = content[paragraph.lb:nps[0].lb]
        for i, np in enumerate(nps):
            nth += 1
            # anchors delimiting each NP mention for Analec
            np_start = ET.SubElement(p, "anchor", {
                "xml:id": "u-MENTION-%i-start" % nth,
                "type": "AnalecDelimiter",
                "subtype": "UnitStart",
            })
            np_start.tail = content[np.lb:np.ub]
            np_end = ET.SubElement(p, "anchor", {
                "xml:id": "u-MENTION-%i-end" % nth,
                "type": "AnalecDelimiter",
                "subtype": "UnitEnd",
            })
            if i < len(nps) - 1:
                np_end.tail = content[np.ub:nps[i + 1].lb]
            else:
                np_end.tail = content[np.ub:paragraph.ub]

    back = ET.SubElement(root, "back")
ET.SubElement(back, "spanGrp") spanGrp.set("type", "AnalecUnit") spanGrp.set("n", "MENTION") for i, np in enumerate(np_chunks): ET.SubElement( spanGrp, "span", { "xml:id": "u-MENTION-%i" % (i + 1), "from": "#u-MENTION-%i-start" % (i + 1), "to": "#u-MENTION-%i-end" % (i + 1), "ana": "#u-MENTION-%i-fs" % (i + 1) }) fvLib = ET.SubElement(back, "fvLib") fvLib.set("n", "AnalecElementProperties") for i, np in enumerate(np_chunks): value = pronoun2analec.get(pos[i][0].value, u"GN") fs = ET.SubElement(fvLib, "fs", {"xml:id": "u-MENTION-%i-fs" % (i + 1)}) f = ET.SubElement(fs, "f") f.set("name", "REF") ET.SubElement(f, "string") f = ET.SubElement(fs, "f") f.set("name", "CODE_SEM") fstring = ET.SubElement(f, "string") fstring.text = value f = ET.SubElement(fs, "f") f.set("name", "CATEGORIE") fstring = ET.SubElement(f, "string") fstring.text = value return teiCorpus
def word_bounds(self, s):
    bounds = SpannedBounds()
    bounds.append(Span(0, 0))

    atomic = set(u";:«»()[]{}=+*$£€/\\\"?!…%€$£")
    apostrophe = set(u"'ʼ’")

    for forbidden in self._forbidden:
        bounds.add_forbiddens_regex(forbidden, s)

    previous = ""
    for index, c in enumerate(s):
        is_first = index == 0
        is_last = index == len(s) - 1
        if c.isspace():
            if (index == bounds[-1].ub and previous.isspace()) or (
                    index == bounds[-1].lb and index == bounds[-1].ub):
                # extend the current whitespace bound
                bounds[-1].expand_ub(1)
            else:
                bounds.append(Span(index, index + 1))
        elif c in atomic:
            bounds.add_last(Span(index, index))
            bounds.append(Span(index + 1, index + 1))
        elif c in apostrophe:
            bounds.append(Span(index + 1, index + 1))
        elif c.isdigit():
            if is_first or not (previous.isupper() or previous in self._digit_valid):
                bounds.append(Span(index, index))
            if is_last or not (s[index + 1].isupper() or s[index + 1] in self._digit_valid):
                bounds.append(Span(index + 1, index + 1))
        elif c == u",":
            # split "," unless it sits between two digits (e.g. 1,5)
            if is_first or is_last or not (previous.isdigit() and s[index + 1].isdigit()):
                bounds.add_last(Span(index, index))
                bounds.append(Span(index + 1, index + 1))
        elif c == u".":
            no_dot_before = previous != u"."
            no_dot_after = is_last or s[index + 1] != u"."
            if is_first or is_last or s[index + 1] in u"\r\n" or not (
                    previous.isdigit() and s[index + 1].isdigit()):
                if no_dot_before:
                    bounds.add_last(Span(index, index))
                if no_dot_after:
                    bounds.append(Span(index + 1, index + 1))
        elif c == u"-":
            if not previous or previous.isspace():
                bounds.add_last(Span(index, index))
                bounds.append(Span(index + 1, index + 1))
            elif not is_last and s[index + 1].isspace():
                bounds.add_last(Span(index, index))
                bounds.append(Span(index + 1, index + 1))
        previous = c

    for force in self._force:
        bounds.force_regex(force, s)

    bounds.append(Span(len(s), len(s)))

    return bounds
def word_spans(self, content):
    spaces = re.compile(u"\s+", re.U + re.M)

    l = [match.span() for match in spaces.finditer(content)]
    if l:
        l1 = [(l[i][1], l[i + 1][0]) for i in range(len(l) - 1)]
        if l[0][0] != 0:
            l1.insert(0, (0, l[0][0]))
        if l[-1][1] != len(content):
            l1.append((l[-1][1], len(content)))
    else:
        # no whitespace at all: treat the whole content as a single chunk
        l1 = [(0, len(content))] if content else []

    word = re.compile(u"^[^\W\d]+$", re.U + re.M)
    number_with_unit = re.compile(u"([0-9][^0-9,.])|([^0-9,.][0-9])")
    atomic = re.compile(u"[;:«»()\\[\\]{}=+*$£€/\\\"?!…%€$£]")
    comma_not_number = re.compile(u"(?<=[^0-9]),(?![0-9])", re.U + re.M)
    apostrophe = re.compile(u"(?=['ʼ’])", re.U + re.M)
    # enclitic subject pronouns attached with a hyphen (e.g. "pense-t-il")
    clitics = re.compile(r"(-je|-tu|-nous|-vous|(?:-t)?-(?:on|ils?|elles?))$", re.U + re.I)

    i = 0
    while i < len(l1):
        span = l1[i]
        text = content[span[0]:span[1]]
        if len(text) == 1:
            i += 1
            continue
        if word.match(text):
            i += 1
            continue
        found = False
        for forbidden in self._forbidden:
            found = forbidden.match(text)
            if found:
                i += 1
                break
        if found:
            continue

        tmp = []
        # atomic characters, they are always split
        prev = span[0]
        for find in atomic.finditer(text):
            if prev != span[0] + find.start():
                tmp.append((prev, span[0] + find.start()))
            tmp.append((span[0] + find.start(), span[0] + find.end()))
            prev = span[0] + find.end()
        if tmp != []:
            if prev != span[1]:
                tmp.append((prev, span[1]))
            del l1[i]
            for t in reversed(tmp):
                l1.insert(i, t)
            continue
        del tmp[:]

        # commas
        prev = span[0]
        for find in comma_not_number.finditer(text):
            tmp.extend([(prev, span[0] + find.start()),
                        (span[0] + find.start(), span[0] + find.end()),
                        (span[0] + find.end(), span[1])])
            prev = span[0] + find.end() + 1
        if tmp != []:
            del l1[i]
            for t in reversed(tmp):
                l1.insert(i, t)
            continue
        del tmp[:]

        # apostrophes
        prev = span[0]
        for find in apostrophe.finditer(text):
            tmp.append((prev, span[0] + find.start() + 1))
            prev = span[0] + find.start() + 1
        if prev < span[1]:
            tmp.append((prev, span[1]))
        if len(tmp) > 1:
            del l1[i]
            for t in reversed(tmp):
                l1.insert(i, t)
            continue
        del tmp[:]

        # clitics
        prev = span[0]
        for find in clitics.finditer(text):
            tmp.append((prev, span[0] + find.start()))
            prev = span[0] + find.start()
        if tmp:
            if tmp[0][0] == tmp[0][1]:
                del tmp[:]
            else:
                tmp.append((prev, span[1]))
        if len(tmp) > 1:
            del l1[i]
            for t in reversed(tmp):
                l1.insert(i, t)
            continue
        del tmp[:]

        # number with unit
        prev = span[0]
        for find in number_with_unit.finditer(text):
            tmp.append((prev, span[0] + find.start() + 1))  #, (span[0]+find.start(), span[1])])
            prev = span[0] + find.start() + 1
        if tmp:
            tmp.append((prev, span[1]))
            del l1[i]
            for t in reversed(tmp):
                l1.insert(i, t)
            continue
        del tmp[:]

        # dots and ending commas
        if text and (text[-1] in u".," and not (len(text) == 2 and text[0].isupper())):
            tmp = [(span[0], span[1] - 1), (span[1] - 1, span[1])]
            if tmp:
                del l1[i]
                for t in reversed(tmp):
                    l1.insert(i, t)
                continue
        i += 1

    spans = [Span(s[0], s[1]) for s in l1]
    spans = [span for span in spans if len(span) > 0]
    return spans
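# Illustrative sketch (added, not from the original source): the clitics pattern
# above locates a hyphenated subject pronoun at the end of a token so that it
# can be split off; only the regex is exercised here.
def _demo_clitics():
    import re
    clitics = re.compile(u"(-je|-tu|-nous|-vous|(?:-t)?-(?:on|ils?|elles?))$", re.U + re.I)
    match = clitics.search(u"pense-t-il")
    # the split point falls right after "pense": "pense" / "-t-il"
    assert match is not None and match.start() == 5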