def main(args):
    """Segments a raw text file and writes the result: one sentence per line,
    or one token per line with a blank line between sentences in "vector" mode."""
    if args.log_file is not None:
        segmentation_logger.addHandler(file_handler(args.log_file))
    segmentation_logger.setLevel(args.log_level)

    ienc = args.ienc or args.enc
    oenc = args.oenc or args.enc

    segmenter = SEMModule(args.tokeniser_name, log_level=args.log_level)
    document = Document(
        os.path.basename(args.infile),
        content=codecs.open(args.infile, "rU", ienc).read().replace(u"\r", u""))
    segmenter.process_document(document, log_level=args.log_level)

    tokens_spans = document.segmentation("tokens")
    sentence_spans = document.segmentation("sentences")
    joiner = (u"\n" if args.output_format == "vector" else u" ")
    content = document.content
    with codecs.open(args.outfile, "w", oenc) as O:
        for sentence in sentence_spans:
            sentence_token_spans = tokens_spans[sentence.lb:sentence.ub]
            sentence_tokens = [content[s.lb:s.ub] for s in sentence_token_spans]
            O.write(joiner.join(sentence_tokens))
            if args.output_format == "vector":
                O.write(u"\n")
            O.write(u"\n")
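# --- Usage sketch (illustration only, never called) --------------------------
# Shows how the segmentation `main` above could be driven programmatically with
# an argparse-style namespace. The attribute names mirror the accesses made in
# `main`; the concrete values ("input.txt", "fr", ...) are hypothetical.
def _example_segmentation_main_call():
    from argparse import Namespace
    example_args = Namespace(
        infile="input.txt",        # hypothetical raw-text input path
        outfile="output.txt",      # one sentence per line; "vector" mode writes one token per line
        tokeniser_name="fr",       # assumed tokeniser identifier; depends on available tokenisers
        output_format="default",   # any value other than "vector" joins tokens with spaces
        ienc=None,
        oenc=None,
        enc="utf-8",
        log_level="INFO",
        log_file=None,
    )
    main(example_args)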
def process_document(self, document, encoding="utf-8", **kwargs):
    """
    Annotate document with Wapiti.

    Parameters
    ----------
    document : sem.storage.Document
        the input data. It is a document with only a content
    log_level : str or int
        the logging level
    log_file : str
        if not None, the file to log to (does not remove command-line logging).
    """
    start = time.time()

    if self._log_file is not None:
        wapiti_label_logger.addHandler(file_handler(self._log_file))
    wapiti_label_logger.setLevel(self._log_level)

    if self._field in document.corpus.fields:
        wapiti_label_logger.warn("field %s already exists in document, not annotating", self._field)
        tags = [[s[self._field] for s in sentence] for sentence in document.corpus]
        document.add_annotation_from_tags(tags, self._field, self._field)
    else:
        wapiti_label_logger.info("annotating document with %s field", self._field)
        self._label_document(document, encoding)

    laps = time.time() - start
    wapiti_label_logger.info('in %s', timedelta(seconds=laps))
def compile_dictionary(infile, outfile, kind="token", ienc="UTF-8",
                       log_level=logging.CRITICAL, log_file=None):
    if log_file is not None:
        compile_dictionary_logger.addHandler(file_handler(log_file))
    compile_dictionary_logger.setLevel(log_level)

    if kind not in _choices:
        raise RuntimeError("Invalid kind: {0}".format(kind))

    compile_dictionary_logger.info(
        u'compiling {0} dictionary from "{1}" to "{2}"'.format(kind, infile, outfile))

    try:
        dictionary_compile = _compile[kind]
    except KeyError:  # invalid kind asked
        compile_dictionary_logger.exception(
            "Invalid kind: {0}. Should be in: {1}".format(kind, u", ".join(_compile.keys())))
        raise

    # the pickled dictionary is binary data: open the output in binary mode
    # and make sure the file is properly closed
    with open(outfile, "wb") as output_stream:
        pickle.dump(dictionary_compile(infile, ienc), output_stream)

    compile_dictionary_logger.info(u"done")
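# --- Usage sketch (illustration only, never called) --------------------------
# Direct call of `compile_dictionary` above. The file names are hypothetical;
# `kind` must be one of the keys of `_compile` (e.g. the default "token").
def _example_compile_dictionary_call():
    compile_dictionary(
        "lexicon.txt",       # hypothetical source lexicon
        "lexicon.pickle",    # pickled dictionary, written in binary mode
        kind="token",
        ienc="UTF-8",
        log_level=logging.INFO,
    )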
def main(args):
    """
    Cleans a CoNLL-formatted file, keeping only the fields (columns) at the
    given indices and removing every other field.

    Parameters
    ----------
    args.infile : str
        the name of the file to clean.
    args.outfile : str
        the name of the output file, where some columns have been removed.
    args.ranges : str
        the fields to keep. It is a comma-separated list of indices or ranges
        of indices using a python slice format (i.e. "lo:hi").
    """
    if args.log_file is not None:
        clean_info_logger.addHandler(file_handler(args.log_file))
    clean_info_logger.setLevel(args.log_level)

    ienc = args.ienc or args.enc
    oenc = args.oenc or args.enc

    # the number of fields is read from the first line of the input file
    nelts = len(codecs.open(args.infile, "rU", ienc).readline().strip().split())
    allowed = ranges_to_set(args.ranges, nelts, include_zero=True)
    max_abs = 0
    for element in allowed:
        element = abs(element) + (1 if element > 0 else 0)
        max_abs = max(max_abs, element)
    if nelts < max_abs:
        message = u'asked to keep up to {0} field(s), yet only {1} are present in "{2}"'.format(max_abs, nelts, args.infile)
        clean_info_logger.error(message)
        raise RuntimeError(message)

    clean_info_logger.info(u'cleaning "{0}"'.format(args.infile))
    clean_info_logger.info(u'keeping columns: {0}'.format(u", ".join([str(s) for s in sorted(allowed)])))
    clean_info_logger.info(u'writing "{0}"'.format(args.outfile))

    with codecs.open(args.outfile, "w", oenc) as O:
        for line in codecs.open(args.infile, "rU", ienc):
            line = line.strip().split()
            if line != []:
                tokens = [line[i] for i in range(len(line)) if i in allowed]
                O.write(u"\t".join(tokens))
            O.write(u"\n")

    clean_info_logger.info(u'done')
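# --- Usage sketch (illustration only, never called) --------------------------
# Drives the cleaning `main` above with an argparse-style namespace. With
# ranges="0,2:4" the columns kept would be 0, 2 and 3 (slice-like semantics for
# "lo:hi" are assumed, as described in the docstring); every other column is
# dropped. File names are hypothetical.
def _example_clean_main_call():
    from argparse import Namespace
    example_args = Namespace(
        infile="corpus.conll",
        outfile="corpus.cleaned.conll",
        ranges="0,2:4",        # keep columns 0, 2 and 3 (assuming slice-like ranges)
        ienc=None,
        oenc=None,
        enc="utf-8",
        log_level="INFO",
        log_file=None,
    )
    main(example_args)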
def process_document(self, document, **kwargs):
    """
    Updates the CoNLL-formatted corpus inside a document with various features.

    Parameters
    ----------
    document : sem.storage.Document
        the input data, contains an object representing CoNLL-formatted data.
        Each token is a dict which works like TSV.
    log_level : str or int
        the logging level
    log_file : str
        if not None, the file to log to (does not remove command-line logging).
    """
    start = time.time()

    if self._log_file is not None:
        enrich_logger.addHandler(file_handler(self._log_file))
    enrich_logger.setLevel(self._log_level)

    missing_fields = set([I.name for I in self.bentries + self.aentries]) - set(document.corpus.fields)
    if len(missing_fields) > 0:
        raise ValueError("Missing fields in input corpus: {0}".format(u",".join(sorted(missing_fields))))

    enrich_logger.info(u'enriching file "%s"', document.name)

    new_fields = [feature.name for feature in self.features if feature.display]
    document.corpus.fields += new_fields
    nth = 0
    for p in document.corpus:
        for feature in self.features:
            if feature.is_sequence:
                for i, value in enumerate(feature(p)):
                    p[i][feature.name] = value
            else:
                for i in range(len(p)):
                    p[i][feature.name] = feature(p, i)
                    if feature.is_boolean:
                        p[i][feature.name] = int(p[i][feature.name])
                    elif p[i][feature.name] is None:
                        p[i][feature.name] = feature.default()
        nth += 1
        if (0 == nth % 1000):
            enrich_logger.debug(u'%i sentences enriched', nth)
    enrich_logger.debug(u'%i sentences enriched', nth)

    laps = time.time() - start
    enrich_logger.info(u"done in %s", timedelta(seconds=laps))
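# --- Interface sketch (illustration only) -------------------------------------
# Minimal object satisfying the duck-typed feature interface that
# `process_document` above relies on: `name`, `display`, `is_sequence`,
# `is_boolean`, `default()` and being callable on a sentence (plus a token
# index when it is not a sequence feature). This class is hypothetical and not
# part of the project's feature hierarchy; real features are defined elsewhere.
class _ExampleUppercaseFeature(object):
    name = "is_upper"       # column added to corpus.fields when display is True
    display = True
    is_sequence = False     # called as feature(sentence, index)
    is_boolean = True       # values are cast to int (0/1) by process_document

    def default(self):
        return 0

    def __call__(self, sentence, index):
        # sentence[index] is a dict of existing fields; the "word" field is assumed here
        token = sentence[index].get("word", u"")
        return token[:1].isupper()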
def process_document(self, document, **kwargs):
    """
    Updates a document with various segmentations and creates an sem.corpus
    (CoNLL-formatted data) using field argument as index.

    Parameters
    ----------
    document : sem.storage.Document
        the input data. It is a document with only a content
    log_level : str or int
        the logging level
    log_file : str
        if not None, the file to log to (does not remove command-line logging).
    """
    start = time.time()

    if self._log_file is not None:
        segmentation_logger.addHandler(file_handler(self._log_file))
    segmentation_logger.setLevel(self._log_level)

    current_tokeniser = self._tokeniser

    segmentation_logger.debug(u'segmenting "%s" content', document.name)

    content = document.content
    if document.metadata("MIME") == "text/html":
        content = strip_html(content, keep_offsets=True)

    do_segmentation = (document.segmentation("tokens") is None
                       or document.segmentation("sentences") is None
                       or document.segmentation("paragraphs") is None)
    if do_segmentation:
        try:
            token_spans = current_tokeniser.word_spans(content)
        except NotImplementedError:
            token_spans = current_tokeniser.bounds2spans(current_tokeniser.word_bounds(content))
        sentence_spans = current_tokeniser.bounds2spans(current_tokeniser.sentence_bounds(content, token_spans))
        paragraph_spans = current_tokeniser.bounds2spans(current_tokeniser.paragraph_bounds(content, sentence_spans, token_spans))
    else:
        segmentation_logger.info(u'{0} already has segmentation, not computing'.format(document.name))
        token_spans = document.segmentation("tokens").spans
        sentence_spans = document.segmentation("sentences").spans
        paragraph_spans = document.segmentation("paragraphs").spans
    segmentation_logger.info(u'"{0}" segmented in {1} sentences, {2} tokens'.format(document.name, len(sentence_spans), len(token_spans)))

    if document.segmentation("tokens") is None:
        document.add_segmentation(Segmentation("tokens", spans=token_spans))
    if document.segmentation("sentences") is None:
        document.add_segmentation(Segmentation("sentences", reference=document.segmentation("tokens"), spans=sentence_spans))
    if document.segmentation("paragraphs") is None:
        document.add_segmentation(Segmentation("paragraphs", reference=document.segmentation("sentences"), spans=paragraph_spans))
    if len(document.corpus) == 0:
        document.corpus.from_segmentation(document.content, document.segmentation("tokens"), document.segmentation("sentences"))

    laps = time.time() - start
    segmentation_logger.info(u'in {0}'.format(timedelta(seconds=laps)))
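# --- Usage sketch (illustration only, never called) --------------------------
# Typical call of `process_document` above on a freshly created document, then
# reading back the computed segmentations. This mirrors what the command-line
# entry point of this module does; `Document` is assumed to be the
# sem.storage.Document class, and the tokeniser name "fr" and the text are
# hypothetical.
def _example_segmentation_process_document():
    segmenter = SEMModule("fr", log_level="INFO")   # assumed tokeniser identifier
    document = Document("example", content=u"Une phrase. Une autre phrase.")
    segmenter.process_document(document)

    token_spans = document.segmentation("tokens")       # spans with .lb/.ub offsets into the content
    sentence_spans = document.segmentation("sentences") # spans indexing into the token segmentation
    sentences = []
    for sentence in sentence_spans:
        spans = token_spans[sentence.lb:sentence.ub]
        sentences.append([document.content[s.lb:s.ub] for s in spans])
    return sentences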
def main(args):
    """
    Takes a CoNLL-formatted file and writes another CoNLL-formatted file
    with additional features in it.

    Parameters
    ----------
    infile : str
        the CoNLL-formatted input file.
    outfile : str
        the CoNLL-formatted output file.
    mdl : str
        the wapiti model file.
    log_level : str or int
        the logging level.
    log_file : str
        if not None, the file to log to (does not remove command-line logging).
    """
    start = time.time()

    if args.log_file is not None:
        tagging_logger.addHandler(file_handler(args.log_file))
    tagging_logger.setLevel(args.log_level)

    infile = args.infile
    outfile = args.outfile
    ienc = args.ienc or args.enc
    oenc = args.oenc or args.enc

    annotator = SEMModule(**vars(args))

    # check that every sentence of the input has the same number of columns
    length = -1
    fields = None
    for sentence in Reader(infile, ienc):
        fields = fields or [unicode(i) for i in range(len(sentence[0]))]
        if length == -1:
            length = len(fields)
        if length != len(sentence[0]):
            raise ValueError(
                u"%s has inconsistent number of columns, found %i and %i"
                % (infile, length, len(sentence[0])))

    document = conll_file(infile, fields, fields[0], encoding=ienc)
    annotator.process_document(document)

    exporter = CoNLLExporter()
    exporter.document_to_file(document, None, outfile, encoding=oenc)

    laps = time.time() - start
    tagging_logger.info("done in %s", timedelta(seconds=laps))
def main(args):
    """
    Takes a CoNLL-formatted file and writes another CoNLL-formatted file
    with additional features in it.

    Parameters
    ----------
    infile : str
        the CoNLL-formatted input file.
    infofile : str
        the XML file containing the different features.
    mode : str
        the mode to use for infofile. Some inputs may only be present in a
        particular mode. For example, the output tag is only available in
        "train" mode.
    log_level : str or int
        the logging level.
    log_file : str
        if not None, the file to log to (does not remove command-line logging).
    """
    start = time.time()

    if args.log_file is not None:
        enrich_logger.addHandler(file_handler(args.log_file))
    enrich_logger.setLevel(args.log_level)

    enrich_logger.info('parsing enrichment file "%s"' % args.infofile)

    informations = Informations(path=args.infofile, mode=args.mode)

    enrich_logger.debug('enriching file "%s"' % args.infile)

    bentries = [entry.name for entry in informations.bentries]
    aentries = [entry.name for entry in informations.aentries]
    features = [feature.name for feature in informations.features if feature.display]
    with KeyWriter(args.outfile, args.oenc or args.enc, bentries + features + aentries) as O:
        nth = 0
        for p in informations.enrich(KeyReader(args.infile, args.ienc or args.enc, bentries + aentries)):
            O.write_p(p)
            nth += 1
            if (0 == nth % 1000):
                enrich_logger.debug('%i sentences enriched' % nth)
        enrich_logger.debug('%i sentences enriched' % nth)

    laps = time.time() - start
    enrich_logger.info("done in %s", timedelta(seconds=laps))
def main(args):
    """
    Takes a CoNLL-formatted file and writes another CoNLL-formatted file
    with additional features in it.

    Parameters
    ----------
    infile : str
        the CoNLL-formatted input file.
    infofile : str
        the XML file containing the different features.
    mode : str
        the mode to use for infofile. Some inputs may only be present in a
        particular mode. For example, the output tag is only available in
        "train" mode.
    log_level : str or int
        the logging level.
    log_file : str
        if not None, the file to log to (does not remove command-line logging).
    """
    start = time.time()

    if args.log_file is not None:
        enrich_logger.addHandler(file_handler(args.log_file))
    enrich_logger.setLevel(args.log_level)

    enrich_logger.info(u'parsing enrichment file "%s"', args.infofile)

    processor = SEMModule(path=args.infofile, mode=args.mode)

    enrich_logger.debug(u'enriching file "%s"', args.infile)

    bentries = [entry.name for entry in processor.bentries]
    aentries = [entry.name for entry in processor.aentries]
    features = [feature.name for feature in processor.features if feature.display]

    document = from_conll(args.infile, bentries + aentries, (bentries + aentries)[0], encoding=args.ienc or args.enc)
    processor.process_document(document)

    with KeyWriter(args.outfile, args.oenc or args.enc, bentries + features + aentries) as O:
        for p in document.corpus:
            O.write_p(p)

    laps = time.time() - start
    enrich_logger.info(u"done in %s", timedelta(seconds=laps))
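# --- Usage sketch (illustration only, never called) --------------------------
# Drives the enrichment `main` above with an argparse-style namespace. The
# attribute names mirror the accesses made in `main`; file names and the mode
# value are hypothetical (the docstring mentions a "train" mode).
def _example_enrich_main_call():
    from argparse import Namespace
    example_args = Namespace(
        infile="corpus.conll",
        outfile="corpus.enriched.conll",
        infofile="features.xml",   # XML description of the features to add
        mode="train",              # mode mentioned in the docstring above
        ienc=None,
        oenc=None,
        enc="utf-8",
        log_level="INFO",
        log_file=None,
    )
    main(example_args)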
def process_document(self, document, **kwargs):
    """
    Updates the CoNLL-formatted corpus inside a document with various features.

    Parameters
    ----------
    document : sem.storage.Document
        the input data, contains an object representing CoNLL-formatted data.
        Each token is a dict which works like TSV.
    log_level : str or int
        the logging level
    log_file : str
        if not None, the file to log to (does not remove command-line logging).
    """
    start = time.time()

    if self._log_file is not None:
        enrich_logger.addHandler(file_handler(self._log_file))
    enrich_logger.setLevel(self._log_level)

    informations = self._informations

    missing_fields = set([I.name for I in informations.bentries + informations.aentries]) - set(document.corpus.fields)
    if len(missing_fields) > 0:
        raise ValueError("Missing fields in input corpus: %s" % u",".join(sorted(missing_fields)))

    enrich_logger.debug('enriching file "%s"' % document.name)

    new_fields = [feature.name for feature in informations.features if feature.display]
    document.corpus.fields += new_fields
    nth = 0
    # the enrich generator mutates the corpus sentences in place; the loop only
    # drives it and reports progress
    for sentence in informations.enrich(document.corpus):
        nth += 1
        if (0 == nth % 1000):
            enrich_logger.debug('%i sentences enriched' % nth)
    enrich_logger.debug('%i sentences enriched' % nth)

    laps = time.time() - start
    enrich_logger.info("done in %s" % timedelta(seconds=laps))
def __init__(self, annotator, field, log_level="WARNING", log_file=None, *args, **kwargs):
    super(SEMModule, self).__init__(log_level=log_level, log_file=log_file, **kwargs)

    if self._log_file is not None:
        tagging_logger.addHandler(file_handler(self._log_file))
    tagging_logger.setLevel(self._log_level)

    self._annotator = sem.annotators.get_annotator(annotator)(field, *args, **kwargs)
def process_document(self, document, outfile=sys.stdout, output_encoding="utf-8", **kwargs):
    """Exports a document to outfile in the format handled by this module's exporter."""
    start = time.time()

    if self._log_file is not None:
        export_logger.addHandler(file_handler(self._log_file))
    export_logger.setLevel(self._log_level)

    export_logger.debug('setting name/column couples for exportation')

    oenc = kwargs.get("output-encoding", "utf-8")

    pos_column = self._pos_column
    chunk_column = self._chunk_column
    ner_column = self._ner_column
    couples = {}
    if "word" in document.corpus.fields:
        couples["token"] = "word"
    elif "token" in document.corpus.fields:
        couples["token"] = "token"
    if pos_column:
        couples["pos"] = pos_column
        export_logger.debug('POS column is %s' % pos_column)
    if chunk_column:
        couples["chunking"] = chunk_column
        export_logger.debug('chunking column is %s' % chunk_column)
    if ner_column:
        couples["ner"] = ner_column
        export_logger.debug('NER column is %s' % ner_column)

    export_logger.debug('exporting document to %s format' % self._exporter.extension)

    self._exporter.document_to_file(document, couples, outfile, encoding=output_encoding)

    laps = time.time() - start
    export_logger.info('done in %s' % (timedelta(seconds=laps)))
def __init__(self, exporter, log_level="WARNING", log_file=None, lang="fr", lang_style="default.css",
             pos_column=None, chunk_column=None, ner_column=None, **kwargs):
    super(SEMModule, self).__init__(log_level=log_level, log_file=log_file, **kwargs)

    if log_file is not None:
        export_logger.addHandler(file_handler(log_file))
    export_logger.setLevel(log_level)

    self._lang = lang
    self._lang_style = lang_style
    self._pos_column = pos_column
    self._chunk_column = chunk_column
    self._ner_column = ner_column

    if is_string(exporter):
        export_logger.info(u'getting exporter {0}'.format(exporter))
        Exporter = get_exporter(exporter)
        self._exporter = Exporter(lang=self._lang, lang_style=self._lang_style)
    else:
        export_logger.info(u'using loaded exporter')
        self._exporter = exporter
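# --- Usage sketch (illustration only, never called) --------------------------
# Builds the exporter module above and exports an already-processed document
# through the `process_document` method. The exporter name "conll" and the
# column names are hypothetical: they depend on the exporters registered with
# `get_exporter` and on the fields actually present in the document's corpus.
def _example_export_process_document(document):
    exporter_module = SEMModule(
        "conll",             # assumed exporter name, resolved through get_exporter
        pos_column="POS",    # hypothetical field names; use None to leave a couple out
        chunk_column=None,
        ner_column="NER",
        log_level="INFO",
    )
    exporter_module.process_document(document, outfile="document.export", output_encoding="utf-8")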
def process_document(self, document, **kwargs):
    """
    Cleans the sem.storage.corpus of a document, removing unwanted fields.

    Parameters
    ----------
    document : sem.storage.Document
        the document containing the corpus to clean.
    ranges : str or list of int or list of str
        if str: fields to remove will be induced
        if list of int: each element in the list is the index of a field
        to remove in corpus.fields
        if list of string: the list of fields to remove
    """
    start = time.time()

    if self._log_file is not None:
        clean_info_logger.addHandler(file_handler(self._log_file))
    clean_info_logger.setLevel(self._log_level)

    clean_info_logger.info(u'cleaning document')

    allowed = set(self._allowed)
    fields = set(field for field in document.corpus.fields)

    document.corpus.fields = self._allowed[:]

    if len(allowed - fields) > 0:
        clean_info_logger.warn(
            u"the following fields are not present in the document, this might cause an error sometime later: %s",
            u", ".join(allowed - fields))

    for i in range(len(document.corpus.sentences)):
        for j in range(len(document.corpus.sentences[i])):
            document.corpus.sentences[i][j] = dict(
                (a, document.corpus.sentences[i][j][a]) for a in allowed)

    laps = time.time() - start
    clean_info_logger.info(u'done in {0}'.format(timedelta(seconds=laps)))
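# --- Behaviour sketch (illustration only) -------------------------------------
# The per-token filtering done by `process_document` above, restated on plain
# dicts: only the allowed fields survive in each token. The field names below
# are hypothetical.
def _example_clean_behaviour():
    allowed = set([u"word", u"POS"])
    token = {u"word": u"Paris", u"POS": u"NPP", u"chunk": u"B-NP"}
    return dict((a, token[a]) for a in allowed)  # {u"word": u"Paris", u"POS": u"NPP"}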
def process_document(self, document, **kwargs):
    """
    Maps the values of one of the document's annotation sets: each value found
    in the module's mapping is renamed, annotations whose value maps to the
    empty string are removed, and values absent from the mapping are kept as-is.

    Parameters
    ----------
    document : sem.storage.Document
        the input data. It is a document with an annotation set named after
        self._annotation_name.
    log_level : str or int
        the logging level
    log_file : str
        if not None, the file to log to (does not remove command-line logging).
    """
    start = time.time()

    if self._log_file is not None:
        map_annotations_logger.addHandler(file_handler(self._log_file))
    map_annotations_logger.setLevel(self._log_level)

    ref_annotation = document.annotation(self._annotation_name)
    ref_annotations = ref_annotation.annotations
    new_annotations = [
        Tag(annotation.lb, annotation.ub, self._mapping.get(annotation.value, annotation.value))
        for annotation in ref_annotations
        if self._mapping.get(annotation.value, None) != u""
    ]

    document.add_annotation(
        Annotation(self._annotation_name, reference=ref_annotation.reference, annotations=new_annotations))

    laps = time.time() - start
    map_annotations_logger.info('in %s' % (timedelta(seconds=laps)))
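# --- Behaviour sketch (illustration only) -------------------------------------
# The core of `process_document` above, restated on plain values: mapped values
# are renamed, values mapped to the empty string are dropped, unmapped values
# are kept unchanged. The mapping below is a hypothetical example.
def _example_mapping_behaviour():
    mapping = {u"Person": u"PER", u"Organization": u"ORG", u"Misc": u""}
    values = [u"Person", u"Location", u"Misc", u"Organization"]
    kept = [mapping.get(v, v) for v in values if mapping.get(v, None) != u""]
    return kept  # [u"PER", u"Location", u"ORG"] -- "Misc" was mapped to u"" and removed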
def main(args):
    """Loads a document from args.infile (or uses it directly if it already is
    a document) and exports it to args.outfile with the requested exporter."""
    start = time.time()

    infile = args.infile
    outfile = args.outfile
    exporter_name = args.exporter_name
    lang = args.lang
    lang_style = args.lang_style
    import_options = args.import_options or {}
    ienc = args.ienc or args.enc
    oenc = args.oenc or args.enc
    log_file = args.log_file
    log_level = args.log_level
    pos_column = args.pos_column
    chunk_column = args.chunk_column
    ner_column = args.ner_column

    couples = {}
    if pos_column:
        couples["pos"] = pos_column
        export_logger.debug('POS column is %s' % pos_column)
    if chunk_column:
        couples["chunking"] = chunk_column
        export_logger.debug('chunking column is %s' % chunk_column)
    if ner_column:
        couples["ner"] = ner_column
        export_logger.debug('NER column is %s' % ner_column)

    if log_file is not None:
        export_logger.addHandler(file_handler(log_file))
    export_logger.setLevel(log_level)

    if type(exporter_name) in (str, unicode):
        export_logger.info('getting exporter %s' % (exporter_name))
        Exporter = get_exporter(exporter_name)
        exporter = Exporter(lang=lang, lang_style=lang_style)
    else:
        export_logger.info('using loaded exporter')
        exporter = exporter_name

    if type(import_options) in (list, ):  # list of "key=value" strings from argparse
        options = {}
        for option in import_options:
            key, value = option.split(u"=", 1)
            try:
                value = sem.misc.str2bool(value)
            except ValueError:
                pass
            options[key] = value
        options["encoding"] = ienc
    else:
        options = import_options

    infile_is_str = type(infile) in (str, unicode)
    if infile_is_str:
        export_logger.info('loading input file')
        document = sem.importers.load(infile, logger=export_logger, **options)
    else:
        export_logger.info('using input document')
        document = infile

    export_logger.debug('exporting document %s' % document.name)

    exporter.document_to_file(document, couples, outfile, encoding=oenc, logger=export_logger)

    laps = time.time() - start
    export_logger.info('done in %s' % (timedelta(seconds=laps)))
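# --- Usage sketch (illustration only, never called) --------------------------
# Drives the export `main` above with an argparse-style namespace. Attribute
# names mirror the accesses made in `main`; the file names, exporter name and
# column names are hypothetical. `import_options` is the list of "key=value"
# strings that `main` parses into importer options.
def _example_export_main_call():
    from argparse import Namespace
    example_args = Namespace(
        infile="document.conll",     # may also be an already-loaded document
        outfile="document.html",
        exporter_name="html",        # assumed exporter name, resolved through get_exporter
        lang="fr",
        lang_style="default.css",
        import_options=[],           # optional "key=value" strings forwarded to sem.importers.load
        ienc=None,
        oenc=None,
        enc="utf-8",
        pos_column="POS",            # hypothetical field names
        chunk_column=None,
        ner_column="NER",
        log_level="INFO",
        log_file=None,
    )
    main(example_args)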