Example #1
    def sentence_bounds(self, content, token_spans):
        sent_bounds = SpannedBounds()
        tokens = [content[t.lb:t.ub] for t in token_spans]
        opening_counts = [0 for i in token_spans]
        count = 0
        # nesting depth of quotes/brackets at each token: a "." inside
        # «», () or [] must not end the sentence
        for i in range(len(opening_counts)):
            if tokens[i] in u"«([":
                count += 1
            elif tokens[i] in u"»)]":
                count -= 1
            opening_counts[i] = count

        sent_bounds.append(Span(0, 0))
        for index, span in enumerate(token_spans):
            token = tokens[index]
            if re.match(u"^[?!]+$", token) or token == u"…" or re.match(
                    u"\.\.+", token):
                sent_bounds.append(Span(index + 1, index + 1))
            elif token == u".":
                if opening_counts[index] == 0:
                    sent_bounds.append(Span(index + 1, index + 1))
            elif index < len(token_spans) - 1 and content[
                    span.ub:token_spans[index + 1].lb].count("\n") > 1:
                sent_bounds.append(Span(index + 1, index + 1))
        sent_bounds.append(Span(len(tokens), len(tokens)))

        return sent_bounds
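The opening/closing counter is what keeps a period inside «», () or [] from ending a sentence. A minimal, dependency-free sketch of that depth-tracking idea (plain strings stand in for SEM's Span objects, and the sample tokens are made up):

    tokens = [u"Il", u"dit", u"(", u"voir", u"p.", u"3", u".", u")", u"."]
    depth = 0
    boundaries = []
    for i, token in enumerate(tokens):
        if token in u"«([":
            depth += 1
        elif token in u"»)]":
            depth -= 1
        elif token == u"." and depth == 0:
            boundaries.append(i + 1)
    print(boundaries)  # [9]: only the final ".", not the one inside "(...)"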
Example #2
    def sentence_bounds(self, content, token_spans):
        sent_bounds = SpannedBounds()
        tokens = [content[t.lb:t.ub] for t in token_spans]
        openings = set([u"«", u"(", u"[", u"``"])
        closings = set([u"»", u")", u"]", u"''"])
        opening_counts = [0 for i in tokens]
        count = 0
        for i in range(len(opening_counts)):
            if tokens[i] in openings:
                count += 1
            elif tokens[i] in closings:
                count -= 1
            opening_counts[i] = count

        sent_bounds.append(Span(0, 0))
        for index, token in enumerate(tokens):
            if re.match(u"^[?!]+$", token) or token == u"…" or re.match(
                    u"\.\.+", token):
                sent_bounds.append(Span(index + 1, index + 1))
            elif token == u".":
                if opening_counts[index] == 0:
                    sent_bounds.append(Span(index + 1, index + 1))
        sent_bounds.append(Span(len(tokens), len(tokens)))

        return sent_bounds
Example #3
def conll_file(filename, fields, word_field, encoding="utf-8"):
    document = Document(os.path.basename(filename), encoding=encoding)
    document._corpus = Corpus.from_conll(filename, fields, encoding=encoding)
    character_index = 0
    sentence_index = 0
    contents = []
    word_spans = []
    sentence_spans = []
    for sentence in document._corpus.sentences:
        contents.append([])
        for token in sentence:
            word = token[word_field]
            contents[-1].append(word)
            word_spans.append(
                Span(character_index, character_index + len(word)))
            character_index += len(word) + 1
        sentence_spans.append(
            Span(sentence_index, sentence_index + len(sentence)))
        sentence_index += len(sentence)
    document._content = u"\n".join(
        [u" ".join(content) for content in contents])
    document.add_segmentation(Segmentation("tokens", spans=word_spans))
    document.add_segmentation(
        Segmentation("sentences",
                     reference=document.segmentation("tokens"),
                     spans=sentence_spans))
    return document
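The offset arithmetic above is easy to get wrong: every token advances character_index by len(word) + 1, which accounts both for the space joined between words and for the "\n" joined between sentences. A self-contained check of that invariant (the sample sentences are made up):

    sentences = [[u"Le", u"chat", u"dort"], [u"Bien", u"."]]
    content = u"\n".join(u" ".join(sentence) for sentence in sentences)
    index = 0
    for sentence in sentences:
        for word in sentence:
            assert content[index:index + len(word)] == word
            index += len(word) + 1
    print(repr(content))  # 'Le chat dort\nBien .'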
Example #4
    def force_regex(self, regex, s):
        """
        Applies a regex for elements that should be segmented in a certain
        way and splits elements accordingly.
        """

        for match in regex.finditer(s):
            self.add(Span(match.start(), match.start()))
            self.add(Span(match.end(), match.end()))
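Since Span(i, i) is zero-length, force_regex marks cut points rather than regions: one at each match's start and one at its end. A quick look at where those offsets come from, using only the standard re module (the pattern and string are arbitrary examples):

    import re

    regex = re.compile(u"aujourd'hui", re.I)
    s = u"Aujourd'hui il pleut."
    for match in regex.finditer(s):
        print(match.start(), match.end())  # 0 11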
Example #5
def json_data(data):
    document = Document(data.get(u"name", u"_DOCUMENT_"),
                        content=data.get(u"content", u""))
    for key, value in data.get(u"metadatas", {}).items():
        document.add_metadata(key, value)

    for segmentation_name in data.get(u"segmentations", {}):
        d = data[u"segmentations"][segmentation_name]
        spans = [
            Span(lb=span[u"s"], ub=0, length=span[u"l"])
            for span in d[u"spans"]
        ]
        segmentation = Segmentation(segmentation_name,
                                    spans=spans,
                                    reference=d.get(u"reference", None))
        document.add_segmentation(segmentation)
    for segmentation in document.segmentations:
        if segmentation.reference is not None:
            segmentation.reference = document.segmentation(
                segmentation.reference)

    for annotation_name in data.get(u"annotations", {}):
        d = data[u"annotations"][annotation_name]
        annotations = [
            Tag(lb=annotation[u"s"],
                ub=0,
                length=annotation[u"l"],
                value=annotation[u"v"]) for annotation in d[u"annotations"]
        ]
        annotation = Annotation(annotation_name,
                                reference=document.segmentation(
                                    d[u"reference"]),
                                annotations=annotations)
        document.add_annotation(annotation)

    return document
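On the JSON side each span is stored as {"s": start, "l": length}, which is why Span is called with ub=0 and an explicit length. A minimal decoder for that encoding, with (lb, ub) tuples standing in for SEM's Span (the field names "s" and "l" are taken from the code above):

    data = {u"spans": [{u"s": 0, u"l": 2}, {u"s": 3, u"l": 4}]}
    spans = [(span[u"s"], span[u"s"] + span[u"l"]) for span in data[u"spans"]]
    print(spans)  # [(0, 2), (3, 7)]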
Example #6
    def sentence_bounds(self, content, token_spans):
        """
        Returns a list of bounds matching sentences.
        
        Parameters
        ----------
        token_spans : list of Span
            the list of tokens spans
        """
        sent_bounds = SpannedBounds()

        sent_bounds.add(Span(0, 0))
        for index, span in enumerate(token_spans):
            token = content[span.lb:span.ub]
            if token in u"\r\n":
                sent_bounds.add_last(Span(index, index + 1))
        sent_bounds.add_last(Span(len(token_spans), len(token_spans)))

        return sent_bounds
Example #7
    def bounds2spans(self, bounds):
        """
        Creates spans from bounds.
        """
        spans = [
            Span(bounds[i].ub, bounds[i + 1].lb)
            for i in range(len(bounds) - 1)
        ]
        spans = [span for span in spans if span.lb != span.ub]
        return spans
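A token, in other words, is whatever lies strictly between one bound's ub and the next bound's lb, and touching bounds produce empty spans that get filtered out. The same pairing with (lb, ub) tuples in place of Span objects:

    bounds = [(0, 0), (2, 3), (7, 7), (7, 8), (12, 12)]
    spans = [(bounds[i][1], bounds[i + 1][0]) for i in range(len(bounds) - 1)]
    spans = [span for span in spans if span[0] != span[1]]
    print(spans)  # [(0, 2), (3, 7), (8, 12)]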
Example #8
    def word_spans(self, content):
        spaces = re.compile(r"\s+", re.U + re.M)

        # spans of the whitespace runs; the words are what lies between them
        l = [match.span() for match in spaces.finditer(content)]
        if not l:  # no whitespace at all: the whole content is one token
            return [Span(0, len(content))] if content else []
        l1 = [(l[i][1], l[i + 1][0]) for i in range(len(l) - 1)]

        if l[0][0] != 0:
            l1.insert(0, (0, l[0][0]))
        if l[-1][1] != len(content):
            l1.append((l[-1][1], len(content)))

        return [Span(span[0], span[1]) for span in l1]
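This tokenizes by complement: find every whitespace run, then keep the stretches between the runs, plus a possible leading and trailing word. The same steps with plain tuples, so the intermediate lists are visible:

    import re

    content = u"Le chat  dort."
    l = [m.span() for m in re.finditer(r"\s+", content)]      # [(2, 3), (7, 9)]
    l1 = [(l[i][1], l[i + 1][0]) for i in range(len(l) - 1)]  # between runs
    l1.insert(0, (0, l[0][0]))           # leading word ("Le")
    l1.append((l[-1][1], len(content)))  # trailing word ("dort.")
    print([content[a:b] for a, b in l1])  # ['Le', 'chat', 'dort.']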
Example #9
    def get_reference_spans(self):
        """
        Returns spans according to the reference chain.
        """

        if self.reference is None:
            return self.spans
        else:
            reference_spans = self.reference.get_reference_spans()
            return [
                Span(reference_spans[element.lb].lb,
                     reference_spans[element.ub - 1].ub)
                for element in self.spans
            ]
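Each level of a chain is indexed in terms of the level below it (sentences in token indices, tokens in character offsets, and so on), and the recursion projects everything down to the bottom level. One projection step with (lb, ub) tuples standing in for Span:

    token_spans = [(0, 2), (3, 7), (8, 13)]  # tokens, in character offsets
    sentence_spans = [(0, 3)]                # one sentence, in token indices
    projected = [(token_spans[lb][0], token_spans[ub - 1][1])
                 for lb, ub in sentence_spans]
    print(projected)  # [(0, 13)]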
Example #10
    def paragraph_bounds(self, content, sentence_spans, token_spans):
        """
        Returns a list of bounds matching paragraphs.
        
        Parameters
        ----------
        sentence_spans : list of Span
            the list of sentence spans
        """
        s_spans = [
            Span(token_spans[e.lb].lb, token_spans[e.ub - 1].ub)
            for e in sentence_spans
        ]

        paragraph_bounds = SpannedBounds()

        paragraph_bounds.add(Span(0, 0))
        for index, sentence in enumerate(sentence_spans[1:], 1):
            substring = content[s_spans[index - 1].ub:s_spans[index].lb]
            if substring.count(u"\n") > 1:
                paragraph_bounds.append(Span(index, index))
        paragraph_bounds.append(Span(len(sentence_spans), len(sentence_spans)))

        return paragraph_bounds
Example #11
    def word_bounds(self, s):
        bounds = SpannedBounds()
        bounds.append(Span(0, 0))

        atomic = set(u";:«»()[]{}=+*$£€/\\\"?!%")
        apostrophe = set(u"'ʼ’")

        for index, c in enumerate(s):
            is_first = index == 0
            is_last = index == len(s) - 1
            if c.isspace():
                bounds.add_last(Span(index, index + 1))
            elif c in atomic:
                bounds.add_last(Span(index, index))
                bounds.append(Span(index + 1, index + 1))
            elif c in apostrophe:
                if is_first or is_last:
                    bounds.add_last(Span(index, index))
                    bounds.append(Span(index + 1, index + 1))
                elif s[index + 1] == s[index]:
                    bounds.append(Span(index, index + 1))
                else:
                    if s[index - 1] == u"n" and s[index + 1] == u"t":
                        bounds.append(Span(index - 1, index - 1))
                        bounds.append(Span(index + 2, index + 2))
                    elif s[index + 1] == u"s":
                        bounds.append(Span(index, index))
                        bounds.append(Span(index + 2, index + 2))
                    else:
                        bounds.add_last(Span(index, index))
            elif c in u'.,':
                if is_first or is_last:
                    bounds.add_last(Span(index, index))
                    bounds.append(Span(index + 1, index + 1))
                # split unless the dot/comma sits between two digits ("1,5")
                elif not (s[index - 1].isdigit() and s[index + 1].isdigit()):
                    bounds.add_last(Span(index, index))
                    bounds.append(Span(index + 1, index + 1))

        bounds.append(Span(len(s), len(s)))

        return bounds
Example #12
    def document_to_data(self, document, couples, **kwargs):
        TEI = ET.Element("TEI")
        TEI.set("xmlns", "http://www.tei-c.org/ns/1.0")
        lang = document.metadata("lang")
        if lang is not None:
            TEI.set("xml:lang", lang)
        teiHeader = ET.SubElement(TEI, "teiHeader")
        fileDesc = ET.SubElement(teiHeader, "fileDesc")
        titleStmt = ET.SubElement(fileDesc, "titleStmt")
        title = ET.SubElement(titleStmt, "title")
        title.text = ""
        respStmt = ET.SubElement(titleStmt, "respStmt")
        resp = ET.SubElement(respStmt, "resp")
        resp.text = ""
        name = ET.SubElement(respStmt, "name")
        name.text = ""
        publicationStmt = ET.SubElement(fileDesc, "publicationStmt")
        publisher = ET.SubElement(publicationStmt, "publisher")
        publisher.text = ""
        sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
        sourceDesc.text = ""

        root = ET.SubElement(TEI, "text")
        body = ET.SubElement(root, "body")
        div = ET.SubElement(body, "div")

        lower = {}
        for field in couples:
            lower[field.lower()] = couples[field]
        annotations = set(document.annotations.keys())
        field = None
        if len(couples) == 1:
            field = next(iter(lower.values()))
        else:
            field = (lower.get("ner", None)
                     if lower.get("ner", None) in annotations else None)
            if field is None:
                field = (lower.get("chunking", None) if lower.get(
                    "chunking", None) in annotations else None)
        if field is None:
            raise ValueError(
                "Could not determine the field to use for TEI export.")

        content = document.content
        paragraphs = (
            document.segmentation(u"paragraphs").get_reference_spans()
            if document.segmentation(u"paragraphs") is not None else
            [Span(0, len(content))])
        NEs = document.annotation(field).get_reference_annotations()

        for paragraph in paragraphs:
            entities = [
                entity for entity in NEs
                if entity.lb >= paragraph.lb and entity.ub <= paragraph.ub
            ]
            p = ET.SubElement(div, "p")
            if len(entities) == 0:
                p.text = content[paragraph.lb:paragraph.ub]
            else:
                p.text = content[paragraph.lb:entities[0].lb]
                for i, entity in enumerate(entities):
                    entity_xml = ET.SubElement(p, entity.value)
                    entity_xml.text = content[entity.lb:entity.ub]
                    if i < len(entities) - 1:
                        entity_xml.tail = content[entity.ub:entities[i + 1].lb]
                    else:
                        entity_xml.tail = content[entity.ub:paragraph.ub]

        return TEI
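The paragraph-filling logic leans on ElementTree's text/tail model: text before the first child element goes into p.text, and the text following each child goes into that child's tail. A self-contained demonstration of the interleaving (the tag names and offsets are invented for the example):

    import xml.etree.ElementTree as ET

    content = u"Victor Hugo est né à Besançon."
    p = ET.Element("p")
    p.text = content[:0]          # text before the first entity (empty here)
    person = ET.SubElement(p, "persName")
    person.text = content[0:11]   # "Victor Hugo"
    person.tail = content[11:21]  # " est né à "
    place = ET.SubElement(p, "placeName")
    place.text = content[21:29]   # "Besançon"
    place.tail = content[29:]     # "."
    print(ET.tostring(p, encoding="unicode"))
    # <p><persName>Victor Hugo</persName> est né à <placeName>Besançon</placeName>.</p>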
Example #13
    def document_to_data(self, document, couples, **kwargs):
        teiCorpus = ET.Element("teiCorpus")
        teiCorpus.set("xmlns", "http://www.tei-c.org/ns/1.0")
        teiHeader = ET.SubElement(teiCorpus, "teiHeader")
        fileDesc = ET.SubElement(teiHeader, "fileDesc")
        titleStmt = ET.SubElement(fileDesc, "titleStmt")
        title = ET.SubElement(titleStmt, "title")
        title.text = ""
        respStmt = ET.SubElement(titleStmt, "respStmt")
        resp = ET.SubElement(respStmt, "resp")
        resp.text = ""
        name = ET.SubElement(respStmt, "name")
        name.text = ""
        publicationStmt = ET.SubElement(fileDesc, "publicationStmt")
        publisher = ET.SubElement(publicationStmt, "publisher")
        publisher.text = ""
        sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
        sourceDesc.text = ""
        TEI = ET.SubElement(teiCorpus, "TEI")
        # the embedded TEI document gets its own header
        teiHeader = ET.SubElement(TEI, "teiHeader")
        fileDesc = ET.SubElement(teiHeader, "fileDesc")
        titleStmt = ET.SubElement(fileDesc, "titleStmt")
        title = ET.SubElement(titleStmt, "title")
        title.text = ""
        respStmt = ET.SubElement(titleStmt, "respStmt")
        resp = ET.SubElement(respStmt, "resp")
        resp.text = ""
        name = ET.SubElement(respStmt, "name")
        name.text = ""
        publicationStmt = ET.SubElement(fileDesc, "publicationStmt")
        publisher = ET.SubElement(publicationStmt, "publisher")
        publisher.text = ""
        sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
        sourceDesc.text = ""

        root = ET.SubElement(TEI, "text")
        body = ET.SubElement(root, "body")

        lower = {}
        for field in couples:
            lower[field.lower()] = couples[field]
        chunking_field = None
        try:
            chunking_field = lower["chunking"]
        except KeyError:
            message = 'No "chunking" field was found, please check you have chunking information in your pipeline.'
            tei_np_logger.exception(message)
            raise KeyError(message)

        content = document.content
        pronoun2analec = {
            u"CL": u"PR_CL",
            u"CLO": u"PR_CL_O",
            u"CLR": u"PR_CL_R",
            u"CLS": u"PR_CL_S",
            u"PRO": u"PR_PRO",
            u"PROREL": u"PR_REL",
            u"PROWH": u"PR_WH",
            u"P+PRO": u"PR_PP"
        }
        words = document.segmentation(u"tokens").get_reference_spans()
        paragraphs = (
            document.segmentation(u"paragraphs").get_reference_spans()
            if document.segmentation(u"paragraphs") is not None else
            [Span(0, len(content))])
        np_chunks = [
            annotation for annotation in document.annotation(chunking_field)
            if annotation.value == u"NP"
        ]
        pos_tags = document.annotation(lower["pos"])[:]
        pos = []
        for i in range(len(np_chunks)):
            chunk = np_chunks[i]
            pos.append([
                annot for annot in pos_tags
                if annot.lb >= chunk.lb and annot.ub <= chunk.ub
            ])

        for i in range(len(np_chunks)):
            np_chunks[i].ub = words[np_chunks[i].ub - 1].ub
            np_chunks[i].lb = words[np_chunks[i].lb].lb

        nth = 0
        for paragraph in paragraphs:
            nps = [
                chunk for chunk in np_chunks
                if chunk.lb >= paragraph.lb and chunk.ub <= paragraph.ub
            ]
            p = ET.SubElement(body, "p")
            if len(nps) == 0:
                p.text = content[paragraph.lb:paragraph.ub]
            else:
                p.text = content[paragraph.lb:nps[0].lb]
                for i, np in enumerate(nps):
                    nth += 1
                    np_start = ET.SubElement(
                        p, "anchor", {
                            "xml:id": "u-MENTION-%i-start" % nth,
                            "type": "AnalecDelimiter",
                            "subtype": "UnitStart"
                        })
                    np_start.tail = content[np.lb:np.ub]
                    np_end = ET.SubElement(
                        p, "anchor", {
                            "xml:id": "u-MENTION-%i-end" % nth,
                            "type": "AnalecDelimiter",
                            "subtype": "UnitEnd"
                        })
                    if i < len(nps) - 1:
                        np_end.tail = content[np.ub:nps[i + 1].lb]
                    else:
                        np_end.tail = content[np.ub:paragraph.ub]

        back = ET.SubElement(root, "back")
        spanGrp = ET.SubElement(back, "spanGrp")
        spanGrp.set("type", "AnalecUnit")
        spanGrp.set("n", "MENTION")
        for i, np in enumerate(np_chunks):
            ET.SubElement(
                spanGrp, "span", {
                    "xml:id": "u-MENTION-%i" % (i + 1),
                    "from": "#u-MENTION-%i-start" % (i + 1),
                    "to": "#u-MENTION-%i-end" % (i + 1),
                    "ana": "#u-MENTION-%i-fs" % (i + 1)
                })

        fvLib = ET.SubElement(back, "fvLib")
        fvLib.set("n", "AnalecElementProperties")
        for i, np in enumerate(np_chunks):
            # fall back to u"GN" when the chunk carries no POS annotation
            value = (pronoun2analec.get(pos[i][0].value, u"GN")
                     if pos[i] else u"GN")

            fs = ET.SubElement(fvLib, "fs",
                               {"xml:id": "u-MENTION-%i-fs" % (i + 1)})
            f = ET.SubElement(fs, "f")
            f.set("name", "REF")
            ET.SubElement(f, "string")

            f = ET.SubElement(fs, "f")
            f.set("name", "CODE_SEM")
            fstring = ET.SubElement(f, "string")
            fstring.text = value

            f = ET.SubElement(fs, "f")
            f.set("name", "CATEGORIE")
            fstring = ET.SubElement(f, "string")
            fstring.text = value

        return teiCorpus
Example #14
    def word_bounds(self, s):
        bounds = SpannedBounds()
        bounds.append(Span(0, 0))

        atomic = set(u";:«»()[]{}=+*$£€/\\\"?!…%")
        apostrophe = set(u"'ʼ’")

        for forbidden in self._forbidden:
            bounds.add_forbiddens_regex(forbidden, s)

        previous = ""
        for index, c in enumerate(s):
            is_first = index == 0
            is_last = index == len(s) - 1

            if c.isspace():
                # merge consecutive whitespace into the previous bound
                if ((index == bounds[-1].ub and previous.isspace())
                        or (index == bounds[-1].lb == bounds[-1].ub)):
                    bounds[-1].expand_ub(1)
                else:
                    bounds.append(Span(index, index + 1))
            elif c in atomic:
                bounds.add_last(Span(index, index))
                bounds.append(Span(index + 1, index + 1))
            elif c in apostrophe:
                bounds.append(Span(index + 1, index + 1))
            elif c.isdigit():
                if is_first or not (previous.isupper()
                                    or previous in self._digit_valid):
                    bounds.append(Span(index, index))
                if is_last or not (s[index + 1].isupper()
                                   or s[index + 1] in self._digit_valid):
                    bounds.append(Span(index + 1, index + 1))
            elif c == u',':
                if is_first or is_last or not (previous.isdigit()
                                               and s[index + 1].isdigit()):
                    bounds.add_last(Span(index, index))
                    bounds.append(Span(index + 1, index + 1))
            elif c == u".":
                no_dot_before = previous != u"."
                no_dot_after = is_last or s[index + 1] != u"."
                if is_first or is_last or s[index + 1] in u"\r\n" or not (
                        previous.isdigit() and s[index + 1].isdigit()):
                    if no_dot_before:
                        bounds.add_last(Span(index, index))
                    if no_dot_after:
                        bounds.append(Span(index + 1, index + 1))
            elif c == u'-':
                if not (previous) or previous.isspace():
                    bounds.add_last(Span(index, index))
                    bounds.append(Span(index + 1, index + 1))
                elif not is_last and s[index + 1].isspace():
                    bounds.add_last(Span(index, index))
                    bounds.append(Span(index + 1, index + 1))
            previous = c

        for force in self._force:
            bounds.force_regex(force, s)

        bounds.append(Span(len(s), len(s)))

        return bounds
Example #15
    def word_spans(self, content):
        spaces = re.compile(r"\s+", re.U + re.M)

        # spans of the whitespace runs; the words are what lies between them
        l = [match.span() for match in spaces.finditer(content)]
        if not l:  # no whitespace at all: the whole content is one token
            return [Span(0, len(content))] if content else []
        l1 = [(l[i][1], l[i + 1][0]) for i in range(len(l) - 1)]

        if l[0][0] != 0:
            l1.insert(0, (0, l[0][0]))
        if l[-1][1] != len(content):
            l1.append((l[-1][1], len(content)))

        word = re.compile(r"^[^\W\d]+$", re.U + re.M)
        number_with_unit = re.compile(u"([0-9][^0-9,.])|([^0-9,.][0-9])")
        atomic = re.compile(u"[;:«»()\\[\\]{}=+*$£€/\\\"?!…%]")
        comma_not_number = re.compile(u"(?<=[^0-9]),(?![0-9])", re.U + re.M)
        apostrophe = re.compile(u"(?=['ʼ’])", re.U + re.M)
        clitics = re.compile(
            r"(-je|-tu|-nous|-vous|(?:-t)?-(?:on|ils?|elles?))$", re.U + re.I)
        i = 0
        while i < len(l1):
            span = l1[i]
            text = content[span[0]:span[1]]
            if len(text) == 1:
                i += 1
                continue
            if word.match(text):
                i += 1
                continue
            found = False
            for forbidden in self._forbidden:
                found = forbidden.match(text)
                if found:
                    i += 1
                    break
            if found:
                continue
            tmp = []
            # atomic characters, they are always split
            prev = span[0]
            for find in atomic.finditer(text):
                if prev != span[0] + find.start():
                    tmp.append((prev, span[0] + find.start()))
                tmp.append((span[0] + find.start(), span[0] + find.end()))
                prev = span[0] + find.end()
            if tmp != []:
                if prev != span[1]:
                    tmp.append((prev, span[1]))
                del l1[i]
                for t in reversed(tmp):
                    l1.insert(i, t)
                continue
            del tmp[:]
            # commas
            prev = span[0]
            for find in comma_not_number.finditer(text):
                tmp.append((prev, span[0] + find.start()))
                tmp.append((span[0] + find.start(), span[0] + find.end()))
                prev = span[0] + find.end()
            if tmp != []:
                if prev != span[1]:
                    tmp.append((prev, span[1]))
                del l1[i]
                for t in reversed(tmp):
                    l1.insert(i, t)
                continue
            del tmp[:]
            # apostrophes
            prev = span[0]
            for find in apostrophe.finditer(text):
                tmp.append((prev, span[0] + find.start() + 1))
                prev = span[0] + find.start() + 1
            if prev < span[1]:
                tmp.append((prev, span[1]))
            if len(tmp) > 1:
                del l1[i]
                for t in reversed(tmp):
                    l1.insert(i, t)
                continue
            del tmp[:]
            # clitics
            prev = span[0]
            for find in clitics.finditer(text):
                tmp.append((prev, span[0] + find.start()))
                prev = span[0] + find.start()
            if tmp:
                if tmp[0][0] == tmp[0][1]:
                    del tmp[:]
                else:
                    tmp.append((prev, span[1]))
            if len(tmp) > 1:
                del l1[i]
                for t in reversed(tmp):
                    l1.insert(i, t)
                continue
            del tmp[:]
            # number with unit
            prev = span[0]
            for find in number_with_unit.finditer(text):
                tmp.append((prev, span[0] + find.start() + 1))
                prev = span[0] + find.start() + 1
            if tmp:
                tmp.append((prev, span[1]))
                del l1[i]
                for t in reversed(tmp):
                    l1.insert(i, t)
                continue
            del tmp[:]
            # dots and ending commas
            if text and (text[-1] in u".,"
                         and not (len(text) == 2 and text[0].isupper())):
                tmp = [(span[0], span[1] - 1), (span[1] - 1, span[1])]
            if tmp:
                del l1[i]
                for t in reversed(tmp):
                    l1.insert(i, t)
                continue
            i += 1

        spans = [Span(s[0], s[1]) for s in l1]
        spans = [span for span in spans if len(span) > 0]
        return spans
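The clitics pattern is the subtle one: it anchors at the end of the token and optionally consumes the euphonic -t-. A quick check of the pattern above on a few French verb+pronoun forms (the sample tokens are made up):

    import re

    clitics = re.compile(r"(-je|-tu|-nous|-vous|(?:-t)?-(?:on|ils?|elles?))$",
                         re.U + re.I)
    for token in [u"dit-il", u"va-t-on", u"allons-nous", u"mange"]:
        match = clitics.search(token)
        print(token, u"->", token[:match.start()] if match else token)
    # dit-il -> dit, va-t-on -> va, allons-nous -> allons, mange -> mange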