def test_out_file_name_change_ext():
    out_dir = '/home/jvdzwaan/data/'
    fname = 'foo.txt'

    out_fname = out_file_name(out_dir, fname, ext='csv')

    assert out_fname == '/home/jvdzwaan/data/foo.csv'
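These tests pin down what out_file_name is expected to do: join the output directory with the basename of the input path and optionally replace the extension. A minimal sketch that satisfies the assertions (an assumption, not necessarily the real helper):

import os


def out_file_name(out_dir, fname, ext=None):
    """Return the path of fname inside out_dir, optionally with a new extension.

    Hypothetical reimplementation, inferred from the tests in this listing.
    """
    base = os.path.basename(fname)
    if ext is not None:
        base = '{}.{}'.format(os.path.splitext(base)[0], ext)
    return os.path.join(out_dir, base)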
def saf_to_text(in_dir, out_dir, mode):
    create_dirs(out_dir)

    if mode not in ('word', 'lemma'):
        raise ValueError("Unknown mode: {mode}, "
                         "please choose either word or lemma"
                         .format(**locals()))

    in_files = get_files(in_dir)

    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            saf = json.load(f)

        s_id = None
        lines = []

        for t in saf['tokens']:
            if s_id is None:
                s_id = t['sentence']
                sentence = []
            elif t['sentence'] != s_id:
                lines.append(u' '.join(sentence))
                sentence = []
                s_id = t['sentence']

            sentence.append(t[mode])

        # flush the last sentence; the loop above only closes a sentence
        # when the next one starts
        if s_id is not None:
            lines.append(u' '.join(sentence))

        out_file = out_file_name(out_dir, os.path.basename(fi), ext='txt')
        with codecs.open(out_file, 'wb', encoding='utf-8') as f:
            f.write(u'\n'.join(lines))
            f.write(u'\n')
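saf_to_text only relies on each SAF token having a 'sentence' id plus 'word' and 'lemma' fields; a hypothetical minimal input illustrating how tokens are grouped into lines:

# Hypothetical SAF fragment containing only the fields saf_to_text reads.
saf = {
    'tokens': [
        {'sentence': 1, 'word': 'Dit', 'lemma': 'dit'},
        {'sentence': 1, 'word': 'werkt', 'lemma': 'werken'},
        {'sentence': 2, 'word': 'Echt', 'lemma': 'echt'},
    ]
}
# With mode='word' the output file contains two lines: 'Dit werkt' and 'Echt'.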
def normalize_whitespace_punctuation(txt, out_dir):
    create_dirs(out_dir)

    text = txt.read()
    text = normalize_whitespace(text)
    text = normalize_punctuation(text)

    out_file = out_file_name(out_dir, os.path.basename(txt.name))
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        f.write(text)
def command(in_file, rename, out_dir):
    create_dirs(out_dir)

    ext = os.path.splitext(in_file)[1].replace('.', '')
    fname = os.path.basename(in_file)

    if rename == 'spaces':
        fname = fname.replace(' ', '-')
    elif rename == 'random':
        fname = '{}.{}'.format(uuid.uuid4(), ext)

    fo = out_file_name(out_dir, fname)
    shutil.copy2(in_file, fo)
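For illustration, the two rename modes transform a hypothetical input name as follows:

# Hypothetical examples of the rename modes:
#   rename='spaces':  'my scan 001.txt' -> 'my-scan-001.txt'
#   rename='random':  'my scan 001.txt' -> '<uuid4>.txt'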
def command(xml_file, element, out_dir):
    create_dirs(out_dir)

    bs = BeautifulSoup(xml_file.read(), 'xml')

    for elem in element:
        to_empty = bs.find_all(elem)
        for t in to_empty:
            t.clear()

    out_file = out_file_name(out_dir, os.path.basename(xml_file.name))
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        f.write(bs.prettify())
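A quick sketch of what clearing does, using a hypothetical input document (BeautifulSoup's Tag.clear() removes an element's contents but keeps the tag itself):

from bs4 import BeautifulSoup

# Hypothetical example: emptying all <note> elements of a small XML document.
bs = BeautifulSoup('<doc><note>drop me</note><p>keep me</p></doc>', 'xml')
for t in bs.find_all('note'):
    t.clear()
print(bs.prettify())  # the <note> element is now empty, <p>keep me</p> is intact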
def command(in_dir, out_dir, tika_server):
    create_dirs(out_dir)

    in_files = get_files(in_dir)

    for fi in in_files:
        if tika_server:
            parsed = parser.from_file(fi, tika_server)
        else:
            parsed = parser.from_file(fi)

        out_file = out_file_name(out_dir, fi, 'txt')
        with codecs.open(out_file, 'wb', encoding='utf-8') as f:
            f.write(parsed['content'])
def check_file(in_file, convert, out_dir):
    fo = out_file_name(out_dir, in_file)
    try:
        with codecs.open(in_file, encoding='utf-8') as f:
            text = f.read()
        if convert:
            # don't copy if it's the same file
            if os.path.abspath(in_file) != fo:
                shutil.copy2(in_file, fo)
    except UnicodeDecodeError:
        with codecs.open(in_file, 'rb') as f:
            text = f.read()
        dammit = UnicodeDammit(text)
        print('{}: {}'.format(in_file, dammit.original_encoding))
        if convert:
            with codecs.open(fo, 'w', encoding='utf-8') as f:
                f.write(dammit.unicode_markup)
def frog2saf(in_dir, out_dir):
    create_dirs(out_dir)

    in_files = get_files(in_dir)

    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            lines = f.readlines()
            lines = [line.strip() for line in lines]
        saf_data = frog_to_saf(parse_frog(lines))

        head, tail = os.path.split(fi)
        fname = tail.replace(os.path.splitext(tail)[1], '')

        out_file = out_file_name(out_dir, fname, 'json')
        with codecs.open(out_file, 'wb', encoding='utf-8') as f:
            json.dump(saf_data, f, indent=4)
def delete_empty_files(in_dir, out_dir):
    create_dirs(out_dir)

    in_files = get_files(in_dir)
    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            text = f.read()

        if len(text.strip()) > 0:
            fname = out_file_name(out_dir, fi)
            try:
                shutil.copy2(fi, fname)
            except shutil.Error:
                pass
        else:
            print('deleting {}'.format(os.path.basename(fi)))
            if os.path.abspath(in_dir) == os.path.abspath(out_dir):
                os.remove(fi)
def command(ocr_text, gs_text, metadata, out_dir):
    create_dirs(out_dir)

    ocr = ocr_text.read()
    gs = gs_text.read()
    md = json.load(metadata)

    check = True
    # Too many strange characters, so disable sanity check
    if len(set(ocr+gs)) > 127:
        check = False

    ocr_a, gs_a = align_characters(ocr, gs, md['cigar'], sanity_check=check)

    out_file = out_file_name(out_dir, md['doc_id'], 'json')
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        try:
            # Python 2: json.dump still accepts an encoding argument
            json.dump({'ocr': ocr_a, 'gs': gs_a}, f, encoding='utf-8')
        except TypeError:
            # Python 3: json.dump no longer takes an encoding argument
            json.dump({'ocr': ocr_a, 'gs': gs_a}, f)
def merge2openiti(in_file1, in_file2, out_dir):
    create_dirs(out_dir)

    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')

    lines1 = in_file1.readlines()
    lines2 = in_file2.readlines()

    merged = []
    for l1, l2 in zip(lines1[:10], lines2[:10]):
        merged_sentence = merge_sentences(l1, l2)
        merged.append(merged_sentence)

    out_file = out_file_name(out_dir, in_file1.name)
    print(out_file)
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        f.write(''.join(merged))
def concat_files(in_dir, out_dir):
    in_files = get_files(in_dir)

    counts = Counter()

    for in_file in in_files:
        parts = os.path.basename(in_file).split(u'_')
        prefix = u'_'.join(parts[:2])
        counts[prefix] += 1

        out_file = out_file_name(out_dir, prefix, ext='txt')

        with codecs.open(in_file, 'r', encoding='utf-8') as fi:
            text = fi.read()
            text = text.replace(u'\n', u'')
            text = text.strip()

        with codecs.open(out_file, 'a', encoding='utf-8') as fo:
            fo.write(text)
            fo.write(u'\n')
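concat_files groups inputs on the first two underscore-separated parts of the file name and appends their (de-newlined) text to one output file per prefix; for example, with hypothetical file names:

# Hypothetical input names and the output file they are appended to:
#   courant_1870_page001.txt  ->  courant_1870.txt
#   courant_1870_page002.txt  ->  courant_1870.txt
#   courant_1871_page001.txt  ->  courant_1871.txt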
def xml_to_text(in_dir, out_dir, tag):
    create_dirs(out_dir)

    in_files = get_files(in_dir)

    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            root = etree.ElementTree().parse(f)
        if tag is not None:
            elements = list(root.iter('{*}' + tag))
        else:
            elements = [root]
        texts = []
        for el in elements:
            texts.append(' '.join(
                [e.text for e in el.iterdescendants() if e.text is not None]))

        out_file = out_file_name(out_dir, fi, 'txt')
        with codecs.open(out_file, 'wb', encoding='utf-8') as f:
            f.write('\n'.join(texts))
            f.write('\n')
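The '{*}' prefix is lxml's wildcard for "any namespace", so passing tag='p' matches <p> elements regardless of the document's namespace. A small hypothetical illustration of the extraction step:

from lxml import etree

# Hypothetical namespaced snippet; '{*}p' matches <p> in any namespace.
root = etree.fromstring(
    '<TEI xmlns="http://www.tei-c.org/ns/1.0"><text><body>'
    '<p><w>Hello</w><w>world</w></p>'
    '</body></text></TEI>')
for el in root.iter('{*}p'):
    print(' '.join(e.text for e in el.iterdescendants() if e.text is not None))
# prints: Hello world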
def safar_add_metadata(in_dir, in_dir_meta, in_file_meta, out_dir):
    in_files = get_files(in_dir)
    metadata_files = {os.path.basename(f): f for f in get_files(in_dir_meta)}

    doc_id = os.path.splitext(os.path.basename(in_file_meta))[0]

    out_dir_sub = os.path.join(out_dir, doc_id)
    if not os.path.exists(out_dir_sub):
        os.mkdir(out_dir_sub)

    with open(in_file_meta) as fn:
        metadata_all = BeautifulSoup(fn, 'xml')

    for in_file in in_files:
        metadata_file = metadata_files[os.path.basename(in_file)]
        with open(metadata_file) as f:
            metadata = BeautifulSoup(f, 'xml')
        with codecs.open(in_file, encoding='utf-8') as f:
            soup = BeautifulSoup(f, 'xml')

        # Make document with a single root element
        document = BeautifulSoup('<document></document>', 'xml')
        md_all = copy.copy(metadata_all.metadata)
        md = copy.copy(metadata.metadata)
        # add common meta data
        for m in md_all.find_all('meta'):
            md.append(m)
        document.document.append(md)
        try:
            document.document.append(soup.morphology_analysis)
        except Exception:
            # no morphology analysis present; fall back to the stemmer analysis
            document.document.append(soup.stemmer_analysis)
        xml_out = out_file_name(out_dir_sub, in_file)
        with codecs.open(xml_out, 'wb', encoding='utf-8') as f:
            if six.PY2:
                # six.u doesn't work in Python 2 with non-ascii text
                # See https://pythonhosted.org/six/#six.u
                f.write(unicode(document))
            else:
                f.write(str(document))
def freqs(in_dir, out_dir, name, mode):
    if mode not in ('word', 'lemma'):
        raise ValueError(
            "Unknown mode: {mode}, "
            "please choose either word or lemma".format(**locals()))
    output_file = out_file_name(out_dir, name)
    create_dirs(output_file)

    in_files = get_files(in_dir)

    cnt = Counter()
    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            saf = json.load(f)
        for token in saf['tokens']:
            word = token[mode]
            pos = token['pos1']
            cnt.update({(word, pos): 1})
    data = [(word, pos, count) for ((word, pos), count) in cnt.most_common()]
    vocab_df = pd.DataFrame(data, columns=[mode, 'pos', 'cnt'])
    vocab_df['rank'] = vocab_df.index + 1
    vocab_df.to_csv(output_file, encoding='utf-8', index=False)
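The resulting CSV has one row per (word-or-lemma, pos) pair, sorted by frequency, with a 1-based rank column; hypothetical contents for mode='word':

# word,pos,cnt,rank    (hypothetical frequencies)
# de,LID,1024,1
# van,VZ,873,2
# boek,N,12,3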
def merge_csv(in_dir, out_dir, name):
    create_dirs(out_dir)

    in_files = get_files(in_dir)

    wrote_header = False

    out_file = out_file_name(out_dir, name)
    with codecs.open(out_file, 'wb', encoding='utf-8') as fo:
        for fi in in_files:
            with codecs.open(fi, encoding='utf-8') as f:
                lines = f.readlines()
            if len(lines) > 1:
                header = lines[0]
                data = lines[1:]

                # TODO: check if headers are the same
                if not wrote_header:
                    fo.write(header)
                    wrote_header = True
                for line in data:
                    fo.write(line)
def extract_quotes(in_file, out_dir):
    """Extract Quran quotes from an OpenITI markdown file.
    
    And write them to a text file. The text file contains a single quote per
    line.
    """
    data = in_file.read()

    qurquotes = re.findall(r'@QB@(.*)@QE@', data)

    fn_out = out_file_name(out_dir,
                           'quotes_' + os.path.basename(in_file.name),
                           ext='txt')
    print(in_file.name, fn_out)
    if os.path.exists(fn_out):
        os.remove(fn_out)
    with open(fn_out, 'w', encoding='utf-8') as f:
        for q in qurquotes:
            q = q.strip()
            if len(q) > 0:
                # Remove annotations (keep only Arabic letters and spaces)
                f.write(re.sub('[^\u0621-\u064A ]', '', q))
                f.write('\n')
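extract_quotes expects each quote to be delimited by @QB@ and @QE@ markers on a single line; a hypothetical OpenITI-style line and what the regex captures:

import re

# Hypothetical line with OpenITI quote markers around the basmala.
line = 'text before @QB@ بسم الله الرحمن الرحيم @QE@ text after'
print(re.findall(r'@QB@(.*)@QE@', line))
# [' بسم الله الرحمن الرحيم ']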
def basic_text_statistics(in_dir, out_dir, name):
    create_dirs(out_dir)

    d = {'num_words': [], 'num_sentences': []}

    text_names = []

    in_files = get_files(in_dir)

    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            text = json.load(f)

        text_id = os.path.splitext(os.path.basename(fi))[0]
        text_names.append(text_id)
        d['num_words'].append(len(text['tokens']))
        sentences = [t['sentence'] for t in text['tokens']]
        num_sentences = len(set(sentences))
        d['num_sentences'].append(num_sentences)

    df = pd.DataFrame(d, index=text_names)
    meta_out = out_file_name(out_dir, name)
    df.to_csv(meta_out, encoding='utf-8')
def archive2dir(archive, remove_dir_structure, out_dir):
    if remove_dir_structure:
        result_dir = os.path.join(out_dir, str(uuid.uuid4()))
        create_dirs(result_dir)

        # make temporary directory
        tempdir = tempfile.mkdtemp()

        # extract archive to temporary directory
        patoolib.extract_archive(archive, outdir=tempdir)

        # copy extracted files to output dir
        files = get_files(tempdir, recursive=True)
        for f in files:
            fo = out_file_name(result_dir, f)
            # don't copy if it's the same file
            if os.path.abspath(f) != fo:
                shutil.copy2(f, fo)

        # remove temporary directory and its contents
        shutil.rmtree(tempdir)
    else:
        # extract archive to temporary directory
        patoolib.extract_archive(archive, outdir=out_dir)
def safar_add_metadata(in_file, in_file_meta, max_len, out_dir):
    """Add metadata from a csv file to a SAFAR XML file.
    """
    create_dirs(out_dir)

    analysis_tag = None
    total_words = None

    markers = b'<markers></markers>'

    # check whether the analysis_tag should be stemmer_analysis
    with codecs.open(in_file, 'r', encoding='utf-8') as xml_file:
        for line in xml_file:
            if re.search('morphology_analysis', line):
                analysis_tag = 'morphology_analysis'
            elif re.search('stemmer_analysis', line):
                analysis_tag = 'stemmer_analysis'

            m = re.search(r'total_words="(\d+)"', line)
            if m:
                total_words = m.group(1)

            if analysis_tag is not None and total_words is not None:
                break

    # Extract the words and markers
    click.echo('Extracting tokens')
    (fd, tmpfile) = tempfile.mkstemp()
    with codecs.open(tmpfile, 'wb') as words:
        context = etree.iterparse(in_file,
                                  events=('end', ),
                                  tag=('word', 'markers'),
                                  huge_tree=True)
        context = tqdm(context, total=int(total_words))
        for event, elem in context:
            if elem.tag == 'word':
                # Setting method to html (instead of xml) fixes problems
                # with writing Arabic characters in the value attribute of
                # the word element.
                words.write(
                    etree.tostring(elem, encoding='utf-8', method='html'))
            elif elem.tag == 'markers':
                markers = etree.tostring(elem, encoding='utf-8')

            # make iteration over context fast and consume less memory
            # https://www.ibm.com/developerworks/xml/library/x-hiperfparse
            elem.clear()
            while elem.getprevious() is not None:
                del elem.getparent()[0]
        del context

    # Get the metadata
    md = pd.read_csv(in_file_meta,
                     sep=',|;',
                     index_col='BookURI',
                     encoding='utf-8')
    # make sure the index type is string
    if six.PY2:
        md.index = md.index.map(unicode)
    else:
        md.index = md.index.map(str)

    if '-' in os.path.basename(in_file):
        uri = os.path.basename(in_file).split('-', 1)[0]
    else:
        uri = remove_ext(in_file)

    try:
        md = md.loc[uri]
        metadata = [u'<metadata>']
        for key in md.keys()[1:]:  # skip over order (the old index)
            val = md[key]
            if isinstance(val, six.string_types):
                val = smart_strip(val)
                val = escape(val)
                # Make sure the values aren't too long, because
                # BlackLab doesn't allow values that are too long in dropdowns.
                # The default value of 94 was set empirically. It seems the
                # lengths of strings are calculated differently in Java (the
                # max length in Java is 256).
                if len(val) >= max_len:
                    val = 'X ' + val[:max_len - 2]
            metadata.append(u'<meta name="{}">{}</meta>'.format(key, val))
        metadata.append(u'<meta name="{}">{}</meta>'.format('BookURI', uri))
        metadata.append(u'</metadata>')

        metadata = u'\n'.join(metadata)
    except KeyError:
        metadata = u'<metadata></metadata>'

    # Write output
    click.echo('Writing output')
    xml_out = out_file_name(out_dir, in_file)
    with codecs.open(xml_out, 'wb') as f:
        f.write(b'<?xml version="1.0" encoding="utf-8"?>\n')
        f.write(b'<document>\n')

        f.write(metadata.encode('utf-8'))

        tag = '  <{} total_words="{}">\n'.format(analysis_tag, total_words)
        f.write(tag.encode('utf-8'))

        with codecs.open(tmpfile, 'rb') as words_file:
            for line in tqdm(words_file):
                f.write(line)

        f.write('  </{}>\n'.format(analysis_tag).encode('utf-8'))

        f.write(markers)

        f.write(b'</document>\n')

    os.remove(tmpfile)
def lstm_synced_correct_ocr(model, charset, text, out_dir):
    create_dirs(out_dir)

    # load model
    model = load_model(model)
    conf = model.get_config()
    conf_result = conf[0].get('config').get('batch_input_shape')
    seq_length = conf_result[1]
    char_embedding = False
    if conf[0].get('class_name') == u'Embedding':
        char_embedding = True

    charset = charset.read()
    n_vocab = len(charset)
    char_to_int = get_char_to_int(charset)
    int_to_char = get_int_to_char(charset)
    lowercase = True
    for c in u'ABCDEFGHIJKLMNOPQRSTUVWXYZ':
        if c in charset:
            lowercase = False
            break

    pad = u'\n'

    to_predict = read_text_to_predict(text.read(),
                                      seq_length,
                                      lowercase,
                                      n_vocab,
                                      char_to_int,
                                      padding_char=pad,
                                      char_embedding=char_embedding)

    outputs = []
    inputs = []

    predicted = model.predict(to_predict, verbose=0)
    for i, sequence in enumerate(predicted):
        predicted_indices = [np.random.choice(n_vocab, p=p) for p in sequence]
        pred_str = u''.join([int_to_char[j] for j in predicted_indices])
        outputs.append(pred_str)

        if char_embedding:
            indices = to_predict[i]
        else:
            indices = np.where(to_predict[i:i + 1, :, :] == True)[2]
        inp = u''.join([int_to_char[j] for j in indices])
        inputs.append(inp)

    idx = 0
    counters = {}

    for input_str, output_str in zip(inputs, outputs):
        if pad in output_str:
            output_str2 = align_output_to_input(input_str,
                                                output_str,
                                                empty_char=pad)
        else:
            output_str2 = output_str
        for i, (inp, outp) in enumerate(zip(input_str, output_str2)):
            if idx + i not in counters:
                counters[idx + i] = Counter()
            counters[idx + i][outp] += 1

        idx += 1

    agg_out = []
    for idx, c in counters.items():
        agg_out.append(c.most_common(1)[0][0])

    corrected_text = u''.join(agg_out)
    corrected_text = corrected_text.replace(pad, u'')

    out_file = out_file_name(out_dir, text.name)
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        f.write(corrected_text)
def merge_safar_xml(in_dir, out_dir):
    """Command line tool that merges SAFAR xml files into a single file.
    """
    create_dirs(out_dir)

    in_files = get_files(in_dir)

    analysis_tag = 'morphology_analysis'

    words = []
    metadata = b'<metadata></metadata>'
    markers = {}
    marker_words = {}

    if len(in_files) == 0:
        msg = 'Unable to merge xml files, because the input directory is ' \
              'empty.'
        raise ValueError(msg)
    else:
        num_words = 0

        fname = os.path.basename(in_files[0]).split('-')[0]
        xml_out = out_file_name(out_dir, u'{}.xml'.format(fname))

        click.echo('Reading xml files')
        (fd, tmpfile) = tempfile.mkstemp()
        with codecs.open(tmpfile, 'wb') as words:
            for i, fi in tqdm.tqdm(enumerate(in_files)):
                # Check whether we are dealing with a marker
                m = is_marked(os.path.basename(fi))
                if m:
                    mname = os.path.basename(fi).rsplit('-', 1)[0]

                if i == 0:
                    # check whether the analysis_tag should be stemmer_analysis
                    # and extract the metadata
                    context = etree.iterparse(fi,
                                              events=('end', ),
                                              tag=('stemmer_analysis',
                                                   'metadata'))
                    for event, elem in context:
                        if elem.tag == 'stemmer_analysis':
                            analysis_tag = elem.tag
                        elif elem.tag == 'metadata':
                            metadata = etree.tostring(elem, encoding='utf-8')

                # Check whether we are dealing with a marker
                if m:
                    # initialise the lists for a marker name we haven't seen yet
                    if mname not in markers:
                        markers[mname] = []
                        marker_words[mname] = []
                # Extract the words
                context = etree.iterparse(fi, events=('end', ), tag=('word'))
                for event, elem in context:
                    num_words += 1
                    elem.attrib['w_id'] = str(num_words)

                    if m:
                        markers[mname].append(str(num_words))
                        marker_words[mname].append(elem.attrib['value'])

                    # Setting method to html (instead of xml) fixes problems
                    # with writing Arabic characters in the value attribute of
                    # the word element.
                    words.write(
                        etree.tostring(elem, encoding='utf-8', method='html'))

                    # make iteration over context fast and consume less memory
                    # https://www.ibm.com/developerworks/xml/library/x-hiperfparse
                    elem.clear()
                    while elem.getprevious() is not None:
                        del elem.getparent()[0]
                del context

        # write the output
        click.echo('Writing output')
        with codecs.open(xml_out, 'wb') as f:
            f.write(b'<?xml version="1.0" encoding="utf-8"?>\n')
            f.write(b'<document>\n')

            f.write(metadata)

            tag = '  <{} total_words="{}">\n'.format(analysis_tag, num_words)
            f.write(tag.encode('utf-8'))

            with codecs.open(tmpfile, 'rb') as words_file:
                for line in tqdm.tqdm(words_file):
                    f.write(line)

            f.write('  </{}>\n'.format(analysis_tag).encode('utf-8'))

            f.write(b'<markers>\n')

            for fname, w_ids in markers.items():
                if 'header' in fname:
                    level = fname.rsplit('-', 1)[1]
                    f.write(
                        marker_xml('header', marker_words[fname], w_ids,
                                   'level', level))
                else:
                    if 'QQuote' in fname:
                        typ = 'quran'
                    else:
                        typ = 'hadith'
                    f.write(
                        marker_xml('quote', marker_words[fname], w_ids, 'type',
                                   typ))

            f.write(b'</markers>\n')

            f.write(b'</document>\n')
        os.remove(tmpfile)
def safar_filter_analyses(in_file, out_dir):
    """Tool for filtering duplicate root/stem pairs from SAFAR output.
    """
    analyses = []

    markers = b'<markers></markers>'

    xml_out = out_file_name(out_dir, in_file)
    click.echo(xml_out)
    with codecs.open(xml_out, 'wb') as f:
        f.write('<?xml version="1.0" encoding="utf-8"?>\n'.encode('utf-8'))
        f.write('<document>\n'.encode('utf-8'))
        context = etree.iterparse(in_file,
                                  events=('start', ),
                                  tag=('morphology_analysis'))
        for event, elem in context:
            num_words = elem.attrib['total_words']
            break
        del context

        first_word = True
        context = etree.iterparse(in_file,
                                  events=('end', ),
                                  tag=('word', 'analysis', 'metadata',
                                       'markers'))
        for event, elem in tqdm(context):
            if elem.tag == 'word':
                if first_word:
                    tag = '<morphology_analysis total_words="{}">\n'. \
                          format(num_words)
                    f.write(tag.encode('utf-8'))
                    first_word = False
                analyses = list(set(analyses))
                tag = '<word total_analysis="{}" value="{}" w_id="{}">\n'
                tag = tag.format(len(analyses), elem.attrib['value'],
                                 elem.attrib['w_id'])
                f.write(tag.encode('utf-8'))
                f.write(b''.join(analyses))
                f.write('</word>\n'.encode('utf-8'))

                analyses = []
            elif elem.tag == 'analysis':
                for attribute in ('a_id', 'vowled', 'pattern', 'prefix',
                                  'suffix', 'additional_info', 'caze',
                                  'gender', 'mood', 'pos', 'type', 'impartial',
                                  'transitive', 'number'):
                    try:
                        del elem.attrib[attribute]
                    except KeyError:
                        pass
                # Setting method to html (instead of xml) fixes problems
                # with writing Arabic characters in the value attribute of
                # the word element.
                analyses.append(
                    etree.tostring(elem, encoding='utf-8', method='html'))
            elif elem.tag == 'metadata':
                f.write(etree.tostring(elem, encoding='utf-8'))
                f.write(b'\n')

            elif elem.tag == 'markers':
                markers = etree.tostring(elem, encoding='utf-8')

            # make iteration over context fast and consume less memory
            # https://www.ibm.com/developerworks/xml/library/x-hiperfparse
            elem.clear()
            while elem.getprevious() is not None:
                del elem.getparent()[0]
        del context

        f.write('</morphology_analysis>\n'.encode('utf-8'))
        f.write(markers)
        f.write(b'\n')
        f.write('</document>\n'.encode('utf-8'))
def copy_file(fi, name, out_dir, dest):
    fo = out_file_name(os.path.join(out_dir, dest), name)
    create_dirs(fo, is_file=True)
    shutil.copy2(fi, fo)
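A hypothetical call, copying a file into a subdirectory of the output directory under a new name:

# copy_file('/data/in/scan 1.txt', 'scan-1.txt', '/data/out', 'renamed')
# -> copies the file to /data/out/renamed/scan-1.txt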
def test_out_file_name_same_ext():
    out_dir = '/home/jvdzwaan/data/'
    fname = 'foo.txt'

    assert out_file_name(out_dir, fname) == '/home/jvdzwaan/data/foo.txt'
def test_out_file_name_path():
    out_dir = '/home/jvdzwaan/data/'
    fname = '/other/path/foo.txt'

    assert out_file_name(out_dir, fname) == '/home/jvdzwaan/data/foo.txt'