Exemple #1
0
def tei2txt(doc, method=u'tei2txt'):
    """
    Convert a TEI Facsimile to a plain text file.

    Args:
        doc (unicode, unicode): Storage tuple of the input document

    Returns:
        (unicode, unicode): Storage tuple of the output document
    """
    with storage.StorageFile(*doc) as fp:
        tei = TEIFacsimile()
        logger.debug('Reading TEI ({}/{})'.format(*doc))
        tei.read(fp)
    output_path = storage.insert_suffix(doc[1], method)
    with storage.StorageFile(doc[0], output_path, 'wb') as fp:
        logger.debug('Writing text to {}'.format(fp.abs_path))
        tei.write_text(fp)
    return (doc[0], output_path)
Exemple #2
0
def tei2txt(doc, method=u'tei2txt'):
    """
    Convert a TEI Facsimile to a plain text file.

    Args:
        doc (unicode, unicode): Storage tuple of the input document

    Returns:
        (unicode, unicode): Storage tuple of the output document
    """
    with storage.StorageFile(*doc) as fp:
        tei = TEIFacsimile()
        logger.debug('Reading TEI ({}/{})'.format(*doc))
        tei.read(fp)
    output_path = storage.insert_suffix(doc[1], method)
    with storage.StorageFile(doc[0], output_path, 'wb') as fp:
        logger.debug('Writing text to {}'.format(fp.abs_path))
        tei.write_text(fp)
    return (doc[0], output_path)
Exemple #3
0
def text_diff_ratio(doc, method=u'text_diff_ratio', ground_truth=None,
                    xml_in=True, gt_format=u'tei', clean_in=True, clean_gt=True,
                    divert=True):
    """
    Calculates the similarity of the input documents and a given ground truth
    using the algorithm of python's difflib SequenceMatcher. The result is a
    value between 0.0 (no commonality) and 1.0 (identical strings).

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to the output file.
        ground_truth (unicode): Ground truth location tuple or a list of ground
                                truths to choose from. When more than one is
                                given, the file sharing the longest prefix with
                                the input document is chosen.
        xml_in (bool): Switch to treat input as an TEI-XML document.
        gt_format (unicode): Switch to select ground truth format. Valid values
                             are 'tei', 'hocr', and 'text'.
        clean_in (bool): Normalize to NFD and strip input data. (DO NOT DISABLE!)
        clean_gt (bool): Normalize to NFD and strip ground truth. (DO NOT DISABLE!)
        divert (bool): Switch selecting output diversion. If enabled the output
                       will be added to the tracking arguments and the input
                       document will be returned as the result of the task. Use
                       this to insert a statistical measure into a chain
                       without affecting the results.

    Returns:
        (unicode, unicode): Storage tuple of the output document
    """
    input_path = storage.get_abs_path(*doc[0])
    output_path = storage.insert_suffix(input_path, method,
                                        os.path.basename(input_path))
    if not isinstance(ground_truth[0], basestring):
        ground_truth = find_matching(doc, ground_truth)
    with storage.StorageFile(*ground_truth) as fp:
        if gt_format == 'tei':
            tei = TEIFacsimile()
            tei.read(fp)
            t = StringIO.StringIO()
            tei.write_text(t)
            gt = t.getvalue()
        elif gt_format == 'hocr':
            gt = html.parse(fp).text_content()
        elif gt_format == 'text':
            gt = fp.read()
        else:
            raise NidabaInvalidParameterException('Input format ' + gt_format + ' unknown.')
    with storage.StorageFile(*doc) as fp:
        if xml_in:
            tei = TEIFacsimile()
            tei.read(fp)
            t = StringIO.StringIO()
            tei.write_text(t)
            text = t.getvalue()
        else:
            text = fp.read()
    if clean_in:
        text = cleanup(text)
    if clean_gt:
        gt = cleanup(gt)
    logger.debug('Recognition result: \n{}'.format(text))
    logger.debug('Ground truth: \n{}'.format(gt))
    sm = difflib.SequenceMatcher()
    sm.set_seqs(text, gt)
    logger.debug('Accuracy: {}'.format(sm.ratio()))
    if not divert:
        storage.write_text(*storage.get_storage_path(output_path),
                           text=unicode(sm.ratio()))
        return output_path
    else:
        return {'diff_ratio': sm.ratio(), 'ground_truth': ground_truth, 'doc': doc}
Exemple #4
0
def text_diff_ratio(doc,
                    method=u'text_diff_ratio',
                    ground_truth=None,
                    xml_in=True,
                    gt_format=u'tei',
                    clean_in=True,
                    clean_gt=True,
                    divert=True):
    """
    Calculates the similarity of the input documents and a given ground truth
    using the algorithm of python's difflib SequenceMatcher. The result is a
    value between 0.0 (no commonality) and 1.0 (identical strings).

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to the output file.
        ground_truth (unicode): Ground truth location tuple or a list of ground
                                truths to choose from. When more than one is
                                given, the file sharing the longest prefix with
                                the input document is chosen.
        xml_in (bool): Switch to treat input as an TEI-XML document.
        gt_format (unicode): Switch to select ground truth format. Valid values
                             are 'tei', 'hocr', and 'text'.
        clean_in (bool): Normalize to NFD and strip input data. (DO NOT DISABLE!)
        clean_gt (bool): Normalize to NFD and strip ground truth. (DO NOT DISABLE!)
        divert (bool): Switch selecting output diversion. If enabled the output
                       will be added to the tracking arguments and the input
                       document will be returned as the result of the task. Use
                       this to insert a statistical measure into a chain
                       without affecting the results.

    Returns:
        (unicode, unicode): Storage tuple of the output document
    """
    input_path = storage.get_abs_path(*doc[0])
    output_path = storage.insert_suffix(input_path, method,
                                        os.path.basename(input_path))
    if not isinstance(ground_truth[0], basestring):
        ground_truth = find_matching(doc, ground_truth)
    with storage.StorageFile(*ground_truth) as fp:
        if gt_format == 'tei':
            tei = TEIFacsimile()
            tei.read(fp)
            t = StringIO.StringIO()
            tei.write_text(t)
            gt = t.getvalue()
        elif gt_format == 'hocr':
            gt = html.parse(fp).text_content()
        elif gt_format == 'text':
            gt = fp.read()
        else:
            raise NidabaInvalidParameterException('Input format ' + gt_format +
                                                  ' unknown.')
    with storage.StorageFile(*doc) as fp:
        if xml_in:
            tei = TEIFacsimile()
            tei.read(fp)
            t = StringIO.StringIO()
            tei.write_text(t)
            text = t.getvalue()
        else:
            text = fp.read()
    if clean_in:
        text = cleanup(text)
    if clean_gt:
        gt = cleanup(gt)
    logger.debug('Recognition result: \n{}'.format(text))
    logger.debug('Ground truth: \n{}'.format(gt))
    sm = difflib.SequenceMatcher()
    sm.set_seqs(text, gt)
    logger.debug('Accuracy: {}'.format(sm.ratio()))
    if not divert:
        storage.write_text(*storage.get_storage_path(output_path),
                           text=unicode(sm.ratio()))
        return output_path
    else:
        return {
            'diff_ratio': sm.ratio(),
            'ground_truth': ground_truth,
            'doc': doc
        }