Example #1
def json_data(data):
    document = Document(data.get(u"name", u"_DOCUMENT_"),
                        content=data.get(u"content", u""))
    for key, value in data.get(u"metadatas", {}).items():
        document.add_metadata(key, value)

    for segmentation_name in data.get(u"segmentations", {}):
        d = data[u"segmentations"][segmentation_name]
        spans = [
            Span(lb=span[u"s"], ub=0, length=span[u"l"])
            for span in d[u"spans"]
        ]
        segmentation = Segmentation(segmentation_name,
                                    spans=spans,
                                    reference=d.get(u"reference", None))
        document.add_segmentation(segmentation)
    for segmentation in document.segmentations:
        if segmentation.reference is not None:
            segmentation.reference = document.segmentation(
                segmentation.reference)

    for annotation_name in data.get(u"annotations", {}):
        d = data[u"annotations"][annotation_name]
        annotations = [
            Tag(lb=annotation[u"s"],
                ub=0,
                length=annotation[u"l"],
                value=annotation[u"v"]) for annotation in d[u"annotations"]
        ]
        annotation = Annotation(annotation_name,
                                reference=document.segmentation(
                                    d[u"reference"]),
                                annotations=annotations)
        document.add_annotation(annotation)

    return document
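
For reference, a minimal sketch of the dictionary layout json_data expects, reconstructed from the keys it reads above (the offsets, lengths and labels below are made up for illustration):

data = {
    u"name": u"example.json",
    u"content": u"Ceci est un test.",
    u"metadatas": {u"lang": u"fr"},
    u"segmentations": {
        # each span is a start offset "s" and a length "l"
        u"tokens": {u"spans": [{u"s": 0, u"l": 4}, {u"s": 5, u"l": 3},
                               {u"s": 9, u"l": 2}, {u"s": 12, u"l": 4},
                               {u"s": 16, u"l": 1}]},
        # a segmentation may reference another one by name
        u"sentences": {u"reference": u"tokens",
                       u"spans": [{u"s": 0, u"l": 5}]},
    },
    u"annotations": {
        # each annotation has a start "s", a length "l" and a value "v"
        u"NER": {u"reference": u"tokens",
                 u"annotations": [{u"s": 3, u"l": 1, u"v": u"NP"}]},
    },
}
document = json_data(data)
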
Example #2
def brat_file(filename, encoding="utf-8"):
    no_ext, ext = os.path.splitext(filename)
    txt_file = no_ext + ".txt"
    ann_file = no_ext + ".ann"
    if not (os.path.exists(txt_file) and os.path.exists(ann_file)):
        raise ValueError("missing either .ann or .txt file")

    document = Document(os.path.basename(txt_file),
                        encoding=encoding,
                        mime_type="text/plain")
    document.content = codecs.open(txt_file, "rU",
                                   encoding).read().replace(u"\r", u"")
    annotations = Annotation("NER")
    for line in codecs.open(ann_file, "rU", encoding):
        line = line.strip()
        if line != u"" and line.startswith(u'T'):
            parts = line.split(u"\t")
            value, bounds = parts[1].split(" ", 1)
            for bound in bounds.split(";"):
                lb, ub = bound.split()
                lb = int(lb)
                ub = int(ub)
                annotations.append(Tag(lb=lb, ub=ub, value=value))
    annotations.sort()
    document.add_annotation(annotations)

    return document
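
A usage sketch, assuming a BRAT-annotated pair doc.txt / doc.ann sits in a hypothetical corpus/ directory; either file name can be passed since only the extension is stripped:

document = brat_file("corpus/doc.txt")     # "corpus/doc.ann" works as well
annotations = document.annotation("NER")   # the Annotation object built above
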
Example #3
def main(indirnames, outfilename, default_shift=0, top_level=False):
    dirs = []
    for indirname in indirnames:
        dirs.extend([
            os.path.join(indirname, name)
            for name in sorted(os.listdir(indirname))
            if os.path.isdir(os.path.join(indirname, name))
        ])

    contents = []
    annotations = []
    shift = 0
    for dirname in dirs:
        cur_contents, cur_annotations, cur_shift = make_data(
            dirname, default_shift=shift, top_level=top_level)
        contents.extend(cur_contents)
        annotations.extend(cur_annotations)
        shift = cur_shift

    document = Document("_doc_", content=(u"\n" * NUM_NEWLINES).join(contents))
    document.add_annotation(Annotation("NER", annotations=annotations))
    exporter = BratExporter()
    with codecs.open(outfilename + ".ann", "w", "utf-8") as O:
        O.write(exporter.document_to_unicode(document, {"ner": "NER"}))
    with codecs.open(outfilename + ".txt", "w", "utf-8") as O:
        O.write(document.content)
Example #4
    def test_enrich(self):
        document = Document("document", "Ceci est un test.")
        corpus = Corpus([u"word"],
                        sentences=[[{
                            u"word": u"Ceci"
                        }, {
                            u"word": u"est"
                        }, {
                            u"word": u"un"
                        }, {
                            u"word": u"test"
                        }, {
                            u"word": u"."
                        }]])
        document._corpus = corpus

        features = []
        cwg = DictGetterFeature(entry="word", x=0)
        features.append(BOSFeature(name="BOS", entry="word", getter=cwg))
        features.append(EOSFeature(name="EOS", entry="word", getter=cwg))

        informations = Informations(bentries=[Entry(u"word")],
                                    features=features)

        enrich = EnrichModule(informations)

        self.assertEquals(document._corpus.fields, [u"word"])

        enrich.process_document(document)

        self.assertEquals(document._corpus.fields, [u"word", u"BOS", u"EOS"])
Example #5
    def test_clean(self):
        document = Document("document", "Ceci est un test.")
        corpus = Corpus([u"word", u"remove"],
                        sentences=[[{
                            u"word": u"Ceci",
                            u"remove": u"Ceci"
                        }, {
                            u"word": u"est",
                            u"remove": u"est"
                        }, {
                            u"word": u"un",
                            u"remove": u"un"
                        }, {
                            u"word": u"test",
                            u"remove": u"test"
                        }, {
                            u"word": u".",
                            u"remove": u"."
                        }]])
        document._corpus = corpus

        self.assertEquals(document._corpus.fields, [u"word", u"remove"])

        clean = CleanModule(to_keep=[u"word"])
        clean.process_document(document)

        self.assertEquals(document._corpus.fields, [u"word"])
Example #6
    def test_wapiti_label(self):
        document = Document("document", "Ceci est un test.")
        corpus = Corpus([u"word"],
                        sentences=[[{
                            u"word": u"Ceci"
                        }, {
                            u"word": u"est"
                        }, {
                            u"word": u"un"
                        }, {
                            u"word": u"test"
                        }, {
                            u"word": u"."
                        }]])
        document._corpus = corpus

        self.assertEquals(document._corpus.fields, [u"word"])

        wapiti_label = WapitiLabelModule(
            os.path.join(SEM_DATA_DIR, "non-regression", "models", "model"),
            u"the_new_field")
        wapiti_label.process_document(document)

        self.assertEquals(document._corpus.fields, [u"word", u"the_new_field"])

        sentence = document._corpus.sentences[0]
        self.assertEquals(sentence[0]["the_new_field"], u"A")
        self.assertEquals(sentence[1]["the_new_field"], u"B")
        self.assertEquals(sentence[2]["the_new_field"], u"B")
        self.assertEquals(sentence[3]["the_new_field"], u"A")
        self.assertEquals(sentence[4]["the_new_field"], u"O")
Example #7
def conll_file(filename, fields, word_field, encoding="utf-8"):
    document = Document(os.path.basename(filename), encoding=encoding)
    document._corpus = Corpus.from_conll(filename, fields, encoding=encoding)
    character_index = 0
    sentence_index = 0
    contents = []
    word_spans = []
    sentence_spans = []
    for sentence in document._corpus.sentences:
        contents.append([])
        for token in sentence:
            word = token[word_field]
            contents[-1].append(word)
            word_spans.append(
                Span(character_index, character_index + len(word)))
            character_index += len(word) + 1
        sentence_spans.append(
            Span(sentence_index, sentence_index + len(sentence)))
        sentence_index += len(sentence)
    document._content = u"\n".join(
        [u" ".join(content) for content in contents])
    document.add_segmentation(Segmentation("tokens", spans=word_spans))
    document.add_segmentation(
        Segmentation("sentences",
                     reference=document.segmentation("tokens"),
                     spans=sentence_spans))
    return document
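
A usage sketch, assuming a small two-column CoNLL-style file (the file name and column names are hypothetical):

# tiny.conll (one token per line, one column per field,
# sentences separated by a blank line):
#
#     Ceci  DET
#     est   V
#     un    DET
#     test  NC
#     .     PONCT
#
document = conll_file("tiny.conll", fields=["word", "pos"], word_field="word")
tokens = document.segmentation("tokens")        # character spans, one per token
sentences = document.segmentation("sentences")  # spans over token indices
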
Example #8
def main(indirname, outfilename, default_shift=0, top_level=False):
    contents, annotations, shift = make_data(indirname,
                                             default_shift=default_shift,
                                             top_level=top_level)

    document = Document("_doc_", content=(u"\n" * NUM_NEWLINES).join(contents))
    document.add_annotation(Annotation("NER", annotations=annotations))
    exporter = BratExporter()
    with codecs.open(outfilename + ".ann", "w", "utf-8") as O:
        O.write(exporter.document_to_unicode(document, {"ner": "NER"}))
    with codecs.open(outfilename + ".txt", "w", "utf-8") as O:
        O.write(document.content)
Example #9
def load(filename,
         encoding="utf-8",
         fields=None,
         word_field=None,
         wikinews_format=False,
         logger=None,
         strip_html=False,
         tagset_name=None,
         *args,
         **kwargs):
    if type(filename) in (Document, SEMCorpus):
        if logger is not None:
            logger.info(u"detected format: SEM XML")
        return filename

    try:
        filename = filename.decode(sys.getfilesystemencoding())
    except UnicodeDecodeError:
        pass
    except AttributeError:  # AttributeError raised in python3 as it will be str
        pass

    if filename.startswith("http"):
        if logger is not None:
            logger.info(u"detected format: HTML")
        return from_url(filename,
                        strip_html=strip_html,
                        wikinews_format=wikinews_format)

    if filename.endswith(".xml"):
        xml = ET.parse(filename)
        root_tag = xml.getroot().tag
        if root_tag == "sem":
            if logger is not None:
                logger.info(u"detected format: SEM XML")
            return SEMCorpus.from_xml(xml)
        elif root_tag == "document":
            if logger is not None:
                logger.info(u"detected format: SEM XML")
            return Document.from_xml(xml)
        elif root_tag == "GateDocument":
            if logger is not None:
                logger.info(u"detected format: GATE XML")
            return gate_data(xml, os.path.basename(filename))

    no_ext, ext = os.path.splitext(filename)
    if (ext == ".ann") or (ext == ".txt" and os.path.exists(no_ext + ".ann")):
        if logger is not None:
            logger.info(u"detected format: BRAT")
        return brat_file(filename, encoding=encoding, tagset_name=tagset_name)

    if fields is not None and word_field is not None:
        if logger is not None:
            logger.info(u"No specific format found, defaulting to text format")
        return conll_file(filename, fields, word_field, encoding=encoding)

    # this should come last: if everything else fails, just load as a plain text document
    return text_file(filename, encoding=encoding)
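
The dispatch above can be summed up with a few hypothetical calls (all file names and the URL are made up):

doc = load("document.xml")                  # SEM or GATE XML, depending on the root tag
doc = load("http://example.org/page.html",
           strip_html=True)                 # URL -> from_url()
doc = load("corpus/doc.ann")                # BRAT .ann/.txt pair -> brat_file()
doc = load("corpus/doc.conll",
           fields=["word", "pos"],
           word_field="word")               # CoNLL -> conll_file()
doc = load("notes.txt")                     # anything else -> text_file()
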
Example #10
def gate_data(data, name=None):
    document = Document(name or "__DOCUMENT__", mime_type="text/plain")

    textwithnodes = data.findall("TextWithNodes")[0]
    annotation_sets = data.findall("AnnotationSet")

    text_parts = [textwithnodes.text or u""]
    nodes = {}
    for node in list(textwithnodes):
        nodes[int(node.attrib["id"])] = sum([len(part) for part in text_parts])
        text_parts.append(node.tail or u"")
    document.content = u"".join(text_parts)

    annotations = []
    for annotation_set in annotation_sets:
        annotation_name = annotation_set.attrib["Name"]
        sem_annotation = Annotation(annotation_name)
        for annotation in annotation_set:
            lb = nodes[int(annotation.attrib["StartNode"])]
            ub = nodes[int(annotation.attrib["EndNode"])]
            sem_annotation.append(Tag(lb, ub, annotation.attrib["Type"]))
        document.add_annotation(sem_annotation)

    return document
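
A minimal in-memory sketch of the GATE XML structure gate_data reads (the text and annotation type are made up; Node ids are arbitrary since character offsets are recomputed from the text):

import xml.etree.ElementTree as ET

gate_xml = ET.fromstring(
    '<GateDocument>'
    '<TextWithNodes>Ceci est un <Node id="1"/>test<Node id="2"/>.</TextWithNodes>'
    '<AnnotationSet Name="NER">'
    '<Annotation Type="Thing" StartNode="1" EndNode="2"/>'
    '</AnnotationSet>'
    '</GateDocument>')
document = gate_data(gate_xml, name="example")
# document.content == u"Ceci est un test." and the NER set holds Tag(12, 16, "Thing")
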
Example #11
def from_url(url, strip_html=False, wikinews_format=False):
    url = url.strip()

    if url == u"": return None

    try:
        url = url.decode(sys.getfilesystemencoding())
    except (UnicodeDecodeError, AttributeError):
        # already decoded, or already str under python3
        pass

    strip_html |= wikinews_format  # wikinews format is always stripped

    charset = re.compile('charset="(.+?)"')
    escaped_url = u"".join([(urllib.quote(c) if ord(c) > 127 else c)
                            for c in url.encode("utf-8")])
    escaped_url = escaped_url.replace(u"%2525", u"%25")
    #url = url.decode("iso-8859-1")
    #url = url.replace(":","")

    content = u""
    f = urllib.urlopen(escaped_url)
    content = f.read()
    f.close()
    encoding = charset.search(content)
    if encoding is not None:
        encoding = encoding.group(1) or "utf-8"
    else:
        encoding = "utf-8"
    content = content.decode(encoding)

    regex = re.compile('^.+?[^/]/(?=[^/])', re.M)
    parts = regex.findall(escaped_url)
    base_url = (escaped_url + u"/" if parts == []
                else parts[0]).decode("iso-8859-1")

    content = content.replace(u'="//', u'="http://')
    content = content.replace(u'="/', u'="%s' % base_url)
    content = content.replace(u'=\\"//', u'=\\"http://')
    content = content.replace(u'=\\"/', u'=\\"%s' % base_url)
    content = content.replace(u'\r', u'')
    content = content.replace(u'</p>', u'</p>\n\n')

    if strip_html:
        new_content = sem.misc.strip_html(content, keep_offsets=True)
    else:
        new_content = content

    if wikinews_format:
        cleaned_content = new_content[:content.index("<h2>")].strip()
    else:
        cleaned_content = new_content

    if strip_html:
        h = HTMLParser()
        empty_line = re.compile("\n[ \t]+")
        spaces = re.compile("[ \t]+")
        newlines = re.compile("\n{2,}")
        cleaned_content = h.unescape(cleaned_content)
        cleaned_content = empty_line.sub(u"\n", cleaned_content)
        cleaned_content = spaces.sub(u" ", cleaned_content)
        cleaned_content = newlines.sub("\n\n", cleaned_content)

    spaces_begin = re.compile("^[ \t]+", re.M)
    spaces_end = re.compile("[ \t]+$", re.M)
    cleaned_content = spaces_begin.sub("", cleaned_content)
    cleaned_content = spaces_end.sub("", cleaned_content)

    mime_type = ("text/plain" if strip_html else "text/html")
    return Document(name=url,
                    content=cleaned_content,
                    original_content=content,
                    mime_type=mime_type)
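
Note that from_url relies on Python 2 APIs (urllib.quote, urllib.urlopen and HTMLParser().unescape); under Python 3 the equivalents would be urllib.parse.quote, urllib.request.urlopen and html.unescape. A hypothetical call:

document = from_url("http://example.org/article.html", strip_html=True)
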
Example #12
def text_file(filename, encoding="utf-8"):
    return Document(os.path.basename(filename),
                    content=codecs.open(filename, "rU",
                                        encoding).read().replace("\r", ""),
                    encoding=encoding)
Example #13
    def test_wapiti_label(self):
        corpus = Corpus([u"word", u"tag"],
                        sentences=[
                            [{
                                u"word": u"Ceci",
                                u"tag": u"B-tag"
                            }, {
                                u"word": u"est",
                                u"tag": u"O"
                            }, {
                                u"word": u"un",
                                u"tag": u"O"
                            }, {
                                u"word": u"test",
                                u"tag": u"O"
                            }, {
                                u"word": u".",
                                u"tag": u"O"
                            }],
                            [{
                                u"word": u"Ceci",
                                u"tag": u"O"
                            }, {
                                u"word": u"est",
                                u"tag": u"O"
                            }, {
                                u"word": u"un",
                                u"tag": u"O"
                            }, {
                                u"word": u"test",
                                u"tag": u"O"
                            }, {
                                u"word": u".",
                                u"tag": u"O"
                            }],
                            [{
                                u"word": u"ceci",
                                u"tag": u"O"
                            }, {
                                u"word": u"est",
                                u"tag": u"O"
                            }, {
                                u"word": u"un",
                                u"tag": u"O"
                            }, {
                                u"word": u"test",
                                u"tag": u"O"
                            }, {
                                u"word": u".",
                                u"tag": u"O"
                            }],
                        ])
        document = Document.from_corpus("document", corpus, u"word")
        tags = []
        for sentence in document._corpus.sentences:
            for token in sentence:
                tags.append(token[u"tag"])
        self.assertEquals(tags.count(u"O"), 14)
        self.assertEquals(tags.count(u"B-tag"), 1)

        label_consistency = LabelConsistencyModule(u"tag", token_field=u"word")
        label_consistency.process_document(document)

        self.assertEquals(document._corpus.sentences[0][0][u"tag"], u"B-tag")
        self.assertEquals(document._corpus.sentences[1][0][u"tag"], u"B-tag")
        self.assertEquals(document._corpus.sentences[2][0][u"tag"], u"O")

        tags = []
        for sentence in document._corpus.sentences:
            for token in sentence:
                tags.append(token[u"tag"])
        self.assertEquals(tags.count(u"O"), 13)
        self.assertEquals(tags.count(u"B-tag"), 2)
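
What the test shows: LabelConsistencyModule propagates the "B-tag" label attached to "Ceci" in the first sentence to the identical token "Ceci" in the second sentence, but not to the lowercased "ceci" in the third one, which is why the "O" count drops from 14 to 13 while the "B-tag" count rises from 1 to 2.
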
Example #14
def main(args):
    infile = args.infile
    reference_column = args.reference_column
    tagging_column = args.tagging_column
    ienc = args.ienc or args.enc
    oenc = args.oenc or args.enc
    verbose = args.verbose
    input_format = args.input_format
    reference_file = args.reference_file
    annotation_name = args.annotation_name
    dump = args.dump
    context_size = args.context_size

    counts = {}
    prf = {}
    if input_format == "conll":
        if reference_file:
            print(u"reference_file not handled for CoNLL files")
        L = []
        R = []
        keys = None
        nth = -1
        for n_line, p in Reader(infile, ienc).line_iter():
            nth += 1
            keys = keys or range(len(p[0]))
            L.extend(
                annotation_from_sentence(p,
                                         column=reference_column,
                                         shift=n_line - nth))
            R.extend(
                annotation_from_sentence(p,
                                         column=tagging_column,
                                         shift=n_line - nth))
        document = sem.importers.conll_file(infile,
                                            keys,
                                            keys[0],
                                            encoding=ienc)
        L = Annotation("",
                       annotations=L,
                       reference=document.segmentation(
                           "tokens")).get_reference_annotations()
        R = Annotation("",
                       annotations=R,
                       reference=document.segmentation(
                           "tokens")).get_reference_annotations()
    elif input_format == "brat":
        document = sem.importers.brat_file(reference_file)
        L = document.annotation("NER").get_reference_annotations()
        R = sem.importers.brat_file(infile).annotation(
            "NER").get_reference_annotations()
    elif input_format in ("sem", "SEM"):
        document = Document.from_xml(reference_file)
        system = Document.from_xml(infile)
        common_annotations = set(document.annotations.keys()) & set(
            system.annotations.keys())
        if len(common_annotations) == 1 and annotation_name is None:
            annotation_name = list(common_annotations)[0]
        if annotation_name is None:
            raise RuntimeError(
                "Could not find an annotation set to evaluate: please provide one"
            )
        L = document.annotation(annotation_name).get_reference_annotations()
        R = system.annotation(annotation_name).get_reference_annotations()
    else:
        raise RuntimeError("format not handled: {0}".format(input_format))

    len_ref = len(L)
    len_tag = len(R)
    d = {
        CORRECT: [],
        TYPE_ERROR: [],
        BOUNDARY_ERROR: [],
        TYPE_AND_BOUNDARY_ERROR: [],
        SILENCE_ERROR: [],
        NOISE_ERROR: []
    }
    # first pass, removing correct
    i = 0
    while i < len(L):
        LR = L[i]
        j = 0
        while j < len(R):
            RR = R[j]
            if LR == RR:
                del L[i]
                del R[j]
                i -= 1
                d[CORRECT].append([LR, RR])
                break
            j += 1
        i += 1

    # second pass, typing errors
    i = 0
    while i < len(L):
        LR = L[i]
        j = 0
        while j < len(R):
            RR = R[j]
            if LR.value != RR.value and LR.lb == RR.lb and LR.ub == RR.ub:
                del L[i]
                del R[j]
                i -= 1
                d[TYPE_ERROR].append([LR, RR])
                break
            j += 1
        i += 1

    # third pass, boundary errors
    i = 0
    while i < len(L):
        LR = L[i]
        j = 0
        while j < len(R):
            RR = R[j]
            if LR.value == RR.value and ((LR.lb != RR.lb and LR.ub == RR.ub) or
                                         (LR.lb == RR.lb and LR.ub != RR.ub)):
                del L[i]
                del R[j]
                i -= 1
                d[BOUNDARY_ERROR].append([LR, RR])
                break
            j += 1
        i += 1

    # fourth pass, both type and boundary errors
    i = 0
    while i < len(L):
        LR = L[i]
        j = 0
        while j < len(R):
            RR = R[j]
            if LR.value != RR.value and ((LR.lb != RR.lb and LR.ub == RR.ub) or
                                         (LR.lb == RR.lb and LR.ub != RR.ub)):
                del L[i]
                del R[j]
                i -= 1
                d[TYPE_AND_BOUNDARY_ERROR].append([LR, RR])
                break
            j += 1
        i += 1

    d[SILENCE_ERROR] = L[:]
    d[NOISE_ERROR] = R[:]

    entities = set()
    for error_list in d.values():
        for e in error_list:
            try:  # correct entries and typing/boundary errors are [gold, guess] pairs
                gold, guess = e
                entities.add(gold.value)
                entities.add(guess.value)
            except Exception:  # noise and silence entries are single annotations
                entities.add(e.value)

    with codecs.open(dump, "w", "utf-8") as O:
        O.write(u"error kind\treference entity\toutput entity\tdiff\n")
        for error_kind in (TYPE_ERROR, BOUNDARY_ERROR, TYPE_AND_BOUNDARY_ERROR,
                           NOISE_ERROR, SILENCE_ERROR):
            for ex in d[error_kind]:
                if error_kind == NOISE_ERROR:
                    gold = None
                    guess = ex
                elif error_kind == SILENCE_ERROR:
                    gold = ex
                    guess = None
                else:
                    gold = ex[0]
                    guess = ex[1]
                gold_str = (u"{0}:{1}".format(
                    gold.value, document.content[gold.lb:gold.ub]) if gold else
                            "").replace("\r", "").replace("\n", " ")
                guess_str = (u"{0}:{1}".format(
                    guess.value, document.content[guess.lb:guess.ub]) if guess
                             else "").replace("\r", "").replace("\n", " ")
                diff = get_diff(document.content,
                                gold,
                                guess,
                                error_kind,
                                context_size=context_size)
                O.write(u"{0}\t{1}\t{2}\t{3}\n".format(error_kind, gold_str,
                                                       guess_str, diff))

    counts = {}
    for entity in entities:
        sub_d = {}
        sub_d[CORRECT] = [m for m in d[CORRECT] if m[0].value == entity]
        sub_d[TYPE_ERROR] = [
            m for m in d[TYPE_ERROR]
            if m[0].value == entity or m[1].value == entity
        ]
        sub_d[BOUNDARY_ERROR] = [
            m for m in d[BOUNDARY_ERROR]
            if m[0].value == entity or m[1].value == entity
        ]
        sub_d[TYPE_AND_BOUNDARY_ERROR] = [
            m for m in d[TYPE_AND_BOUNDARY_ERROR]
            if m[0].value == entity or m[1].value == entity
        ]
        sub_d[NOISE_ERROR] = [m for m in d[NOISE_ERROR] if m.value == entity]
        sub_d[SILENCE_ERROR] = [
            m for m in d[SILENCE_ERROR] if m.value == entity
        ]
        counts[entity] = sub_d

    # basic counts
    print(u"entity\tmeasure\tvalue")
    for entity in sorted(entities):
        for kind in OUTPUT_KINDS:
            print(u"{0}\t{1}\t{2}".format(entity, kind,
                                          len(counts[entity][kind])))
    print(u"global\treference\t{0}".format(len_ref))
    print(u"global\ttagging\t{0}".format(len_tag))
    for kind in OUTPUT_KINDS:
        print(u"global\t{0}\t{1}".format(kind, len(d[kind])))

    # P R F
    precisions = []
    recalls = []
    print()
    print(u"entity\tmeasure\tvalue")
    for entity in sorted(entities):
        precisions.append(precision(counts[entity]))
        recalls.append(recall(counts[entity]))
        print(u"{0}\tprecision\t{1:.4f}".format(entity, precisions[-1]))
        print(u"{0}\trecall\t{1:.4f}".format(entity, recalls[-1]))
        print(u"{0}\tfscore\t{1:.4f}".format(
            entity, fscore(precision(counts[entity]), recall(counts[entity]))))
    print(u"global\tprecision\t{0:.4f}".format(precision(d)))
    print(u"global\trecall\t{0:.4f}".format(recall(d)))
    print(u"global\tfscore\t{0:.4f}".format(fscore(precision(d), recall(d))))
    print(u"global\tmacro-precision\t{0:.4f}".format(mean(precisions)))
    print(u"global\tmacro-recall\t{0:.4f}".format(mean(recalls)))
    print(u"global\tmacro-fscore\t{0:.4f}".format(
        fscore(mean(precisions), mean(recalls))))

    # over/under generation, substitution
    print()
    print(u"entity\tmeasure\tvalue")
    for entity in sorted(entities):
        print(u"{0}\tundergeneration\t{1:.4f}".format(
            entity, undergeneration(counts[entity])))
        print(u"{0}\tovergeneration\t{1:.4f}".format(
            entity, overgeneration(counts[entity])))
        print(u"{0}\tsubstitution\t{1:.4f}".format(
            entity, substitution(counts[entity])))
    print(u"global\tundergeneration\t{0:.4f}".format(undergeneration(d)))
    print(u"global\tovergeneration\t{0:.4f}".format(overgeneration(d)))
    print(u"global\tsubstitution\t{0:.4f}".format(substitution(d)))
Example #15
def main(args):
    """
    Return a document after it passed through a pipeline.
    
    Parameters
    ----------
    masterfile : str
        the file containing the pipeline and global options
    infile : str
        the input for the upcoming pipe. Its base value is the file to
        treat, it can be either "plain text" or CoNNL-formatted file.
    directory : str
        the directory where every file will be outputted.
    """

    start = time.time()

    infile = args.infile

    try:
        output_directory = args.output_directory
    except AttributeError:
        output_directory = u"."
    try:
        force_format = args.force_format
    except AttributeError:
        force_format = "default"

    try:
        pipeline = args.pipeline
        options = args.options
        exporter = args.exporter
        couples = args.couples
    except AttributeError:
        pipeline, options, exporter, couples = load_master(
            args.master, force_format)

    if get_option(options, "log", "log_file") is not None:
        sem_tagger_logger.addHandler(
            file_handler(get_option(options, "log", "log_file")))
    sem_tagger_logger.setLevel(
        get_option(options, "log", "log_level", "WARNING"))

    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    exports = {}  # keeping track of already done exports

    nth = 1
    ienc = get_option(options, "encoding", "input_encoding", "utf-8")
    oenc = get_option(options, "encoding", "output_encoding", "utf-8")

    current_fields = None
    # the fields at the current state (depends on enrichments and
    # info cleaning). They will be used for wapiti

    if isinstance(infile, Document):
        sem_tagger_logger.info("Reading %s" % (infile.name))
        document = infile
    else:
        sem_tagger_logger.info("Reading %s" % (infile))
        file_shortname, _ = os.path.splitext(os.path.basename(infile))
        export_name = os.path.join(output_directory, file_shortname)
        file_format = get_option(options, "file", "format", "guess")
        opts = get_section(options, "file")
        opts.update(get_section(options, "encoding"))
        if file_format == "text":
            document = Document(os.path.basename(infile),
                                content=codecs.open(infile, "rU",
                                                    ienc).read().replace(
                                                        u"\r", u""),
                                **opts)
        elif file_format == "conll":
            opts["fields"] = opts["fields"].split(u",")
            opts["taggings"] = [
                tagging for tagging in opts.get("taggings", u"").split(u",")
                if tagging
            ]
            opts["chunkings"] = [
                chunking for chunking in opts.get("chunkings", u"").split(u",")
                if chunking
            ]
            document = Document.from_conll(infile, **opts)
        elif file_format == "guess":
            document = sem.importers.load(infile,
                                          logger=sem_tagger_logger,
                                          **opts)
        else:
            raise ValueError(u"unknown format: %s" % file_format)

    pipeline.process_document(document)

    if exporter is not None:
        name = document.escaped_name()
        if "html" in exporter.extension():
            shutil.copy(os.path.join(sem.SEM_RESOURCE_DIR, "css", "tabs.css"),
                        output_directory)
            shutil.copy(
                os.path.join(
                    sem.SEM_RESOURCE_DIR, "css", exporter._lang,
                    get_option(options, "export", "lang_style",
                               "default.css")), output_directory)

        if exporter.extension() == "ann":
            out_path = os.path.join(
                output_directory,
                "%s.%s" % (os.path.splitext(name)[0], exporter.extension()))
            filename = name
            if not filename.endswith(".txt"):
                filename += ".txt"
            with codecs.open(os.path.join(output_directory, filename), "w",
                             oenc) as O:
                O.write(document.content)
        else:
            out_path = os.path.join(output_directory,
                                    "%s.%s" % (name, exporter.extension()))
        exporter.document_to_file(document, couples, out_path, encoding=oenc)

    laps = time.time() - start
    sem_tagger_logger.info('done in %s' % (timedelta(seconds=laps)))

    return document
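
A hypothetical invocation; the real command line builds the namespace with argparse, and the master file path below is made up:

import argparse

args = argparse.Namespace(
    infile="corpus/doc.txt",            # plain text, CoNLL, SEM XML, GATE XML or BRAT
    master="resources/master/NER.xml",  # pipeline + global options, read by load_master()
)
document = main(args)                   # output files are written to "." by default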