Example #1
0
def create_word_mappings(saf, alignments, lowercase, out_dir):
    create_dirs(out_dir)

    alignment_data = json.load(alignments)
    aligned1 = alignment_data['gs']
    aligned2 = alignment_data['ocr']

    saf = json.load(saf)
    words = [w['word'] for w in saf['tokens']]
    if lowercase:
        words = [w.lower() for w in words]
        aligned1 = [c.lower() for c in aligned1]
        aligned2 = [c.lower() for c in aligned2]

    wb = find_word_boundaries(words, aligned1)

    doc_id = remove_ext(alignments.name)

    res = {'gs': [], 'ocr': [], 'doc_id': []}
    for s, e in wb:
        w1 = u''.join(aligned1[s:e])
        w2 = u''.join(aligned2[s:e])

        res['gs'].append(w1.strip())
        res['ocr'].append(w2.strip())
        res['doc_id'].append(doc_id)

    # Use a pandas DataFrame to create the csv, so commas and quotes are
    # properly escaped.
    df = pd.DataFrame(res)

    out_file = out_file_name(out_dir, doc_id, ext='csv')
    df.to_csv(out_file, encoding='utf-8')
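A usage sketch for create_word_mappings (paths hypothetical): both saf and alignments are open file handles, since the function calls json.load() on each and reads alignments.name for the document id; create_dirs, find_word_boundaries, out_file_name and remove_ext are project helpers assumed to be importable.

import codecs

# The alignment file is assumed to hold 'gs' and 'ocr' character lists and
# the SAF file a 'tokens' list with 'word' entries, matching what the
# function reads above.
with codecs.open('aligned/doc001.json', encoding='utf-8') as alignments, \
        codecs.open('saf/doc001.json', encoding='utf-8') as saf:
    create_word_mappings(saf, alignments, lowercase=True, out_dir='mappings/')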
Example #2
0
def ocrevaluation_extract(in_file, out_dir):
    create_dirs(out_dir)

    write = False

    (fd, tmpfile) = tempfile.mkstemp()
    # Close the OS-level handle from mkstemp; the file is (re)opened below.
    os.close(fd)
    with codecs.open(tmpfile, 'w', encoding='utf-8') as tmp:
        for line in in_file:
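            # Keep only the sections that start at '<h2>General' and
            # '<h2>Error'; skip everything from '<h2>Difference' onward.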
            if line.startswith('<h2>General'):
                write = True
            if line.startswith('<h2>Difference'):
                write = False
            if line.startswith('<h2>Error'):
                write = True

            if write:
                tmp.write(line)

    with codecs.open(tmpfile, encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'lxml')

    tables = soup.find_all('table')
    assert len(tables) == 2
    os.remove(tmpfile)

    doc = remove_ext(in_file.name)

    t = tables[0]
    table_data = [[cell.text for cell in row('td')] for row in t('tr')]

    # 'transpose' table_data
    lines = {}
    for data in table_data:
        for i, entry in enumerate(data):
            if i not in lines:
                # add doc id to data line (but not to header)
                if i != 0:
                    lines[i] = [doc]
                else:
                    lines[i] = ['']
            lines[i].append(entry)

    out_file = os.path.join(out_dir, '{}-global.csv'.format(doc))
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        for i in range(len(lines)):
            f.write(u','.join(lines[i]))
            f.write(u'\n')

    t = tables[1]
    table_data = [[cell.text for cell in row('td')] for row in t('tr')]
    out_file = os.path.join(out_dir, '{}-character.csv'.format(doc))
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        for data in table_data:
            f.write(u'"{}",'.format(data[0]))
            f.write(u','.join(data[1:]))
            f.write(u'\n')
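A usage sketch for the variant above (paths hypothetical): in_file is an open handle to an HTML report produced by the ocr evaluation tool; the function writes <doc>-global.csv and <doc>-character.csv into out_dir.

import codecs

with codecs.open('reports/doc001.html', encoding='utf-8') as in_file:
    ocrevaluation_extract(in_file, 'csv/')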
Example #3
0
def clin2018st_extract_text(json_file, out_dir):
    create_dirs(out_dir)

    corrections = {}
    gs_text = []
    text_with_errors = []

    text = json.load(json_file)
    for w in text['corrections']:
        span = w['span']
        # TODO: fix 'after'
        if 'after' in w:
            print('Found "after" in {}.'.format(
                os.path.basename(json_file.name)))
        for i, w_id in enumerate(span):
            corrections[w_id] = {}
            if i == 0:
                corrections[w_id]['text'] = w['text']
            else:
                corrections[w_id]['text'] = u''
            corrections[w_id]['last'] = False
            if i == (len(span) - 1):
                corrections[w_id]['last'] = True

    for w in text['words']:
        w_id = w['id']
        gs_text.append(w['text'])
        if w_id in corrections:
            text_with_errors.append(corrections[w_id]['text'])
        else:
            text_with_errors.append(w['text'])
        if w['space']:
            gs_text.append(u' ')
            text_with_errors.append(u' ')

    gs_file = remove_ext(json_file.name)
    gs_file = os.path.join(out_dir, '{}-gs.txt'.format(gs_file))
    with codecs.open(gs_file, 'wb', encoding='utf-8') as f:
        f.write(u''.join(gs_text))

    err_file = remove_ext(json_file.name)
    err_file = os.path.join(out_dir, '{}-errors.txt'.format(err_file))
    with codecs.open(err_file, 'wb', encoding='utf-8') as f:
        f.write(u''.join(text_with_errors))
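The input format this function expects can be read off the code: a 'words' list of {id, text, space} entries holding the gold standard, and a 'corrections' list whose entries carry a span of word ids, the erroneous text, and occasionally an 'after' key. A minimal input sketch (values illustrative):

example = {
    'words': [
        {'id': 'w1', 'text': 'Dit', 'space': True},
        {'id': 'w2', 'text': 'is', 'space': True},
        {'id': 'w3', 'text': 'een', 'space': True},
        {'id': 'w4', 'text': 'zin.', 'space': False},
    ],
    'corrections': [
        # A single-word span; for multi-word spans the erroneous text is
        # attached to the first word id and the remaining ids get ''.
        {'span': ['w3'], 'text': 'e3n'},
    ],
}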
Example #4
0
def ocrevaluation_extract(in_file, out_dir):
    create_dirs(out_dir)

    soup = BeautifulSoup(in_file, 'lxml')
    tables = []
    for header in soup.find_all('h2'):
        if (header.text == 'General results'
                or header.text.startswith('Error rate')):
            tables.append(header.find_next('table'))

    assert len(tables) == 2

    doc = remove_ext(in_file.name)

    t = tables[0]
    table_data = [[cell.text for cell in row('td')] for row in t('tr')]

    # 'transpose' table_data
    lines = {}
    for data in table_data:
        for i, entry in enumerate(data):
            if i not in lines:
                # add doc id to data line (but not to header)
                if i != 0:
                    lines[i] = [doc]
                else:
                    lines[i] = ['doc_id']
            lines[i].append(entry.replace(',', '.'))

    out_file = os.path.join(out_dir, '{}-global.csv'.format(doc))
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        for i in range(len(lines)):
            f.write(u';'.join(lines[i]))
            f.write(u'\n')

    t = tables[1]
    table_data = [[cell.text.replace(',', '.') for cell in row('td')]
                  for row in t('tr')]
    out_file = os.path.join(out_dir, '{}-character.csv'.format(doc))
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        for data in table_data:
            f.write(u'"{}";'.format(data[0]))
            f.write(u';'.join(data[1:]))
            f.write(u'\n')
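A note on this variant's separators: ocr evaluation reports may contain decimal commas (e.g. '1,5'), so cell text is normalized to dots and ';' is used as the field separator, which keeps numeric cells intact without extra quoting. Reading the output back might look like this (path hypothetical):

import pandas as pd

# ';'-separated; decimal commas were already normalized to dots when the
# file was written, so numeric columns parse without locale handling.
df = pd.read_csv('csv/doc001-global.csv', sep=';')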
Example #5
0
def ocrevaluation_extract(in_file, out_dir):
    create_dirs(out_dir)

    soup = BeautifulSoup(in_file, 'lxml')
    tables = soup.find_all('table')
    assert len(tables) == 3

    doc = remove_ext(in_file.name)

    # global measures: table[0]
    t = tables[0]
    table_data = [[cell.text for cell in row('td')] for row in t('tr')]

    # 'transpose' table_data
    lines = {}
    for data in table_data:
        for i, entry in enumerate(data):
            if i not in lines:
                # add doc id to data line (but not to header)
                if i != 0:
                    lines[i] = [doc]
                else:
                    lines[i] = ['']
            lines[i].append(entry)

    out_file = os.path.join(out_dir, '{}-global.csv'.format(doc))
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        for i in range(len(lines)):
            f.write(u','.join(lines[i]))
            f.write(u'\n')

    # character measures: table[2]
    t = tables[2]
    table_data = [[cell.text for cell in row('td')] for row in t('tr')]
    out_file = os.path.join(out_dir, '{}-character.csv'.format(doc))
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        for data in table_data:
            f.write(u'"{}",'.format(data[0]))
            f.write(u','.join(data[1:]))
            f.write(u'\n')
Example #6
0
def test_remove_ext_full_path():
    fname = '/home/jvdzwaan/data/test.txt'
    assert remove_ext(fname) == 'test'
Example #7
0
def test_remove_ext_no_ext():
    fname = 'test'
    assert remove_ext(fname) == 'test'
Example #8
0
def test_remove_ext_filename():
    fname = 'test.txt'
    assert remove_ext(fname) == 'test'
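Taken together, the three tests above pin down remove_ext's contract: strip any directory part and the final extension, and leave extension-less names untouched. A minimal implementation that satisfies them (a sketch, not necessarily the project's own):

import os


def remove_ext(fname):
    """Return the base name of fname without its final extension."""
    return os.path.splitext(os.path.basename(fname))[0]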
Example #9
0
def safar_add_metadata(in_file, in_file_meta, max_len, out_dir):
    """Add metadata from a csv file to a SAFAR XML file.
    """
    create_dirs(out_dir)

    analysis_tag = None
    total_words = None

    markers = b'<markers></markers>'

    # Determine the analysis tag (morphology_analysis or stemmer_analysis)
    # and the total number of words.
    with codecs.open(in_file, 'r', encoding='utf-8') as xml_file:
        for line in xml_file:
            if re.search('morphology_analysis', line):
                analysis_tag = 'morphology_analysis'
            elif re.search('stemmer_analysis', line):
                analysis_tag = 'stemmer_analysis'

            m = re.search(r'total_words="(\d+)"', line)
            if m:
                total_words = m.group(1)

            if analysis_tag is not None and total_words is not None:
                break

    # Extract the words and markers
    click.echo('Extracting tokens')
    (fd, tmpfile) = tempfile.mkstemp()
    # Close the OS-level handle from mkstemp; the file is re-opened below.
    os.close(fd)
    with codecs.open(tmpfile, 'wb') as words:
        context = etree.iterparse(in_file,
                                  events=('end', ),
                                  tag=('word', 'markers'),
                                  huge_tree=True)
        context = tqdm(context, total=int(total_words))
        for event, elem in context:
            if elem.tag == 'word':
                # Setting method to html (instead of xml) fixes problems
                # with writing Arabic characters in the value attribute of
                # the word element.
                words.write(
                    etree.tostring(elem, encoding='utf-8', method='html'))
            elif elem.tag == 'markers':
                markers = etree.tostring(elem, encoding='utf-8')

            # make iteration over context fast and consume less memory
            # https://www.ibm.com/developerworks/xml/library/x-hiperfparse
            elem.clear()
            while elem.getprevious() is not None:
                del elem.getparent()[0]
        del context

    # Get the metadata
    md = pd.read_csv(in_file_meta,
                     sep=',|;',
                     index_col='BookURI',
                     engine='python',  # a regex separator requires the python engine
                     encoding='utf-8')
    # make sure the index type is string
    if six.PY2:
        md.index = md.index.map(unicode)
    else:
        md.index = md.index.map(str)

    if '-' in os.path.basename(in_file):
        uri = os.path.basename(in_file).split('-', 1)[0]
    else:
        uri = remove_ext(in_file)

    try:
        md = md.loc[uri]
        metadata = [u'<metadata>']
        for key in md.keys()[1:]:  # skip over order (the old index)
            val = md[key]
            if isinstance(val, six.string_types):
                val = smart_strip(val)
                val = escape(val)
                # Make sure the values aren't too long, because BlackLab
                # doesn't allow values that are too long in dropdowns.
                # The default value of 94 was set empirically. It seems the
                # lengths of strings are calculated differently in Java (the
                # max length in Java is 256).
                if len(val) >= max_len:
                    val = 'X ' + val[:max_len - 2]
            metadata.append(u'<meta name="{}">{}</meta>'.format(key, val))
        metadata.append(u'<meta name="{}">{}</meta>'.format('BookURI', uri))
        metadata.append(u'</metadata>')

        metadata = u'\n'.join(metadata)
    except KeyError:
        metadata = u'<metadata></metadata>'

    # Write output
    click.echo('Writing output')
    xml_out = out_file_name(out_dir, in_file)
    with codecs.open(xml_out, 'wb') as f:
        f.write(b'<?xml version="1.0" encoding="utf-8"?>\n')
        f.write(b'<document>\n')

        f.write(metadata.encode('utf-8'))

        tag = '  <{} total_words="{}">\n'.format(analysis_tag, total_words)
        f.write(tag.encode('utf-8'))

        with codecs.open(tmpfile, 'rb') as words_file:
            for line in tqdm(words_file):
                f.write(line)

        f.write('  </{}>\n'.format(analysis_tag).encode('utf-8'))

        f.write(markers)

        f.write(b'</document>\n')

    os.remove(tmpfile)
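out_file_name is used in two ways in these examples: out_file_name(out_dir, doc_id, ext='csv') and out_file_name(out_dir, in_file). A sketch consistent with both call sites (an assumption, not necessarily the project's actual helper):

import os


def out_file_name(out_dir, fname, ext=None):
    """Put fname's base name in out_dir, optionally swapping its extension."""
    base = os.path.basename(fname)
    if ext is not None:
        base = '{}.{}'.format(os.path.splitext(base)[0], ext)
    return os.path.join(out_dir, base)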