def test_out_file_name_change_ext():
    out_dir = '/home/jvdzwaan/data/'
    fname = 'foo.txt'

    out_fname = out_file_name(out_dir, fname, ext='csv')

    assert out_fname == '/home/jvdzwaan/data/foo.csv'

def saf_to_text(in_dir, out_dir, mode):
    create_dirs(out_dir)

    if mode not in ('word', 'lemma'):
        raise ValueError("Unknown mode: {mode}, "
                         "please choose either word or lemma"
                         .format(**locals()))

    in_files = get_files(in_dir)
    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            saf = json.load(f)

        s_id = None
        lines = []
        for t in saf['tokens']:
            if s_id is None:
                s_id = t['sentence']
                sentence = []
            elif t['sentence'] != s_id:
                lines.append(u' '.join(sentence))
                sentence = []
                s_id = t['sentence']
            sentence.append(t[mode])
        # don't drop the final sentence of the document
        if s_id is not None:
            lines.append(u' '.join(sentence))

        out_file = out_file_name(out_dir, os.path.basename(fi), ext='txt')
        with codecs.open(out_file, 'wb', encoding='utf-8') as f:
            f.write(u'\n'.join(lines))
            f.write(u'\n')

def normalize_whitespace_punctuation(txt, out_dir):
    create_dirs(out_dir)

    text = txt.read()
    text = normalize_whitespace(text)
    text = normalize_punctuation(text)

    out_file = out_file_name(out_dir, os.path.basename(txt.name))
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        f.write(text)

def command(in_file, rename, out_dir):
    create_dirs(out_dir)

    ext = os.path.splitext(in_file)[1].replace('.', '')
    fname = os.path.basename(in_file)
    if rename == 'spaces':
        fname = fname.replace(' ', '-')
    elif rename == 'random':
        fname = '{}.{}'.format(uuid.uuid4(), ext)

    fo = out_file_name(out_dir, fname)
    shutil.copy2(in_file, fo)

def command(xml_file, element, out_dir):
    create_dirs(out_dir)

    bs = BeautifulSoup(xml_file.read(), 'xml')

    for elem in element:
        to_empty = bs.find_all(elem)
        for t in to_empty:
            t.clear()

    out_file = out_file_name(out_dir, os.path.basename(xml_file.name))
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        f.write(bs.prettify())

def command(in_dir, out_dir, tika_server):
    create_dirs(out_dir)

    in_files = get_files(in_dir)
    for fi in in_files:
        if tika_server:
            parsed = parser.from_file(fi, tika_server)
        else:
            parsed = parser.from_file(fi)

        out_file = out_file_name(out_dir, fi, 'txt')
        with codecs.open(out_file, 'wb', encoding='utf-8') as f:
            f.write(parsed['content'])

def check_file(in_file, convert, out_dir):
    fo = out_file_name(out_dir, in_file)
    try:
        with codecs.open(in_file, encoding='utf-8') as f:
            text = f.read()
        if convert:
            # don't copy if it's the same file
            if os.path.abspath(in_file) != fo:
                shutil.copy2(in_file, fo)
    except UnicodeDecodeError:
        with codecs.open(in_file, 'rb') as f:
            text = f.read()
        dammit = UnicodeDammit(text)
        print('{}: {}'.format(in_file, dammit.original_encoding))
        if convert:
            with codecs.open(fo, 'w', encoding='utf-8') as f:
                f.write(dammit.unicode_markup)

def frog2saf(in_dir, out_dir):
    create_dirs(out_dir)

    in_files = get_files(in_dir)
    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            lines = f.readlines()
            lines = [line.strip() for line in lines]

        saf_data = frog_to_saf(parse_frog(lines))

        head, tail = os.path.split(fi)
        fname = tail.replace(os.path.splitext(tail)[1], '')

        out_file = os.path.join(out_dir, out_file_name(out_dir, fname, 'json'))
        with codecs.open(out_file, 'wb', encoding='utf-8') as f:
            json.dump(saf_data, f, indent=4)

def delete_empty_files(in_dir, out_dir):
    create_dirs(out_dir)

    in_files = get_files(in_dir)
    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            text = f.read()

        if len(text.strip()) > 0:
            fname = out_file_name(out_dir, fi)
            try:
                shutil.copy2(fi, fname)
            except shutil.Error:
                pass
        else:
            print('deleting {}'.format(os.path.basename(fi)))
            if os.path.abspath(in_dir) == os.path.abspath(out_dir):
                os.remove(fi)

def command(ocr_text, gs_text, metadata, out_dir):
    create_dirs(out_dir)

    ocr = ocr_text.read()
    gs = gs_text.read()
    md = json.load(metadata)

    check = True
    # Too many strange characters, so disable sanity check
    if len(set(ocr+gs)) > 127:
        check = False

    ocr_a, gs_a = align_characters(ocr, gs, md['cigar'], sanity_check=check)

    out_file = out_file_name(out_dir, md['doc_id'], 'json')
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        try:
            # json.dump accepts an encoding argument on Python 2 only;
            # fall back without it on Python 3
            json.dump({'ocr': ocr_a, 'gs': gs_a}, f, encoding='utf-8')
        except TypeError:
            json.dump({'ocr': ocr_a, 'gs': gs_a}, f)

def merge2openiti(in_file1, in_file2, out_dir):
    create_dirs(out_dir)

    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')

    lines1 = in_file1.readlines()
    lines2 = in_file2.readlines()

    merged = []
    for l1, l2 in zip(lines1[:10], lines2[:10]):
        merged_sentence = merge_sentences(l1, l2)
        merged.append(merged_sentence)

    out_file = out_file_name(out_dir, in_file1.name)
    print(out_file)
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        f.write(''.join(merged))

def concat_files(in_dir, out_dir):
    in_files = get_files(in_dir)

    counts = Counter()
    for in_file in in_files:
        parts = os.path.basename(in_file).split(u'_')
        prefix = u'_'.join(parts[:2])
        counts[prefix] += 1

        out_file = out_file_name(out_dir, prefix, ext='txt')

        with codecs.open(in_file, 'r', encoding='utf-8') as fi:
            text = fi.read()
            text = text.replace(u'\n', u'')
            text = text.strip()

        with codecs.open(out_file, 'a', encoding='utf-8') as fo:
            fo.write(text)
            fo.write(u'\n')

def xml_to_text(in_dir, out_dir, tag):
    create_dirs(out_dir)

    in_files = get_files(in_dir)
    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            root = etree.ElementTree().parse(f)

        if tag is not None:
            elements = list(root.iter('{*}' + tag))
        else:
            elements = [root]

        texts = []
        for el in elements:
            texts.append(' '.join(
                [e.text for e in el.iterdescendants() if e.text is not None]))

        out_file = out_file_name(out_dir, fi, 'txt')
        with codecs.open(out_file, 'wb', encoding='utf-8') as f:
            f.write('\n'.join(texts))
            f.write('\n')

def safar_add_metadata(in_dir, in_dir_meta, in_file_meta, out_dir):
    in_files = get_files(in_dir)
    metadata_files = {os.path.basename(f): f for f in get_files(in_dir_meta)}

    doc_id = os.path.splitext(os.path.basename(in_file_meta))[0]
    out_dir_sub = os.path.join(out_dir, doc_id)
    if not os.path.exists(out_dir_sub):
        os.mkdir(out_dir_sub)

    with open(in_file_meta) as fn:
        metadata_all = BeautifulSoup(fn, 'xml')

    for in_file in in_files:
        metadata_file = metadata_files[os.path.basename(in_file)]
        with open(metadata_file) as f:
            metadata = BeautifulSoup(f, 'xml')

        with codecs.open(in_file, encoding='utf-8') as f:
            soup = BeautifulSoup(f, 'xml')

        # Make document with a single root element
        document = BeautifulSoup('<document></document>', 'xml')

        md_all = copy.copy(metadata_all.metadata)
        md = copy.copy(metadata.metadata)

        # add common metadata
        for m in md_all.find_all('meta'):
            md.append(m)
        document.document.append(md)

        try:
            document.document.append(soup.morphology_analysis)
        except:
            # fall back to the stemmer output if there is no morphology
            # analysis
            document.document.append(soup.stemmer_analysis)

        xml_out = out_file_name(out_dir_sub, in_file)
        with codecs.open(xml_out, 'wb', encoding='utf-8') as f:
            if six.PY2:
                # six.u doesn't work in Python 2 with non-ascii text
                # See https://pythonhosted.org/six/#six.u
                f.write(unicode(document))
            else:
                f.write(str(document))

def freqs(in_dir, out_dir, name, mode):
    if mode not in ('word', 'lemma'):
        raise ValueError(
            "Unknown mode: {mode}, "
            "please choose either word or lemma".format(**locals()))

    output_file = out_file_name(out_dir, name)
    create_dirs(output_file)

    in_files = get_files(in_dir)

    cnt = Counter()
    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            saf = json.load(f)

        for token in saf['tokens']:
            word = token[mode]
            pos = token['pos1']
            cnt.update({(word, pos): 1})

    data = [(word, pos, count) for ((word, pos), count) in cnt.most_common()]
    vocab_df = pd.DataFrame(data, columns=[mode, 'pos', 'cnt'])
    vocab_df['rank'] = vocab_df.index + 1
    vocab_df.to_csv(output_file, encoding='utf-8', index=False)

def merge_csv(in_dir, out_dir, name):
    create_dirs(out_dir)

    in_files = get_files(in_dir)

    wrote_header = False
    out_file = out_file_name(out_dir, name)
    with codecs.open(out_file, 'wb', encoding='utf-8') as fo:
        for fi in in_files:
            with codecs.open(fi, encoding='utf-8') as f:
                lines = f.readlines()

            if len(lines) > 1:
                header = lines[0]
                data = lines[1:]

                # TODO: check if headers are the same
                if not wrote_header:
                    fo.write(header)
                    wrote_header = True

                for line in data:
                    fo.write(line)

def extract_quotes(in_file, out_dir):
    """Extract Quran quotes from an OpenITI markdown file and write them to a
    text file. The text file contains a single quote per line.
    """
    data = in_file.read()
    qurquotes = re.findall(r'@QB@(.*)@QE@', data)

    fn_out = out_file_name(out_dir,
                           'quotes_' + os.path.basename(in_file.name),
                           ext='txt')
    print(in_file.name, fn_out)
    if os.path.exists(fn_out):
        os.remove(fn_out)

    with open(fn_out, 'w', encoding='utf-8') as f:
        for q in qurquotes:
            q = q.strip()
            if len(q) > 0:
                # Remove annotations
                f.write(re.sub('[^\u0621-\u064A ]', '', q))
                f.write('\n')

def basic_text_statistics(in_dir, out_dir, name):
    create_dirs(out_dir)

    d = {'num_words': [], 'num_sentences': []}
    text_names = []

    in_files = get_files(in_dir)
    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            text = json.load(f, encoding='utf-8')

        text_id = os.path.splitext(os.path.basename(fi))[0]
        text_names.append(text_id)

        d['num_words'].append(len(text['tokens']))

        sentences = [t['sentence'] for t in text['tokens']]
        num_sentences = len(set(sentences))
        d['num_sentences'].append(num_sentences)

    df = pd.DataFrame(d, index=text_names)
    meta_out = out_file_name(out_dir, name)
    df.to_csv(meta_out, encoding='utf-8')

def archive2dir(archive, remove_dir_structure, out_dir):
    if remove_dir_structure:
        result_dir = os.path.join(out_dir, str(uuid.uuid4()))
        create_dirs(result_dir)

        # make temporary directory
        tempdir = tempfile.mkdtemp()

        # extract archive to temporary directory
        patoolib.extract_archive(archive, outdir=tempdir)

        # copy extracted files to output dir
        files = get_files(tempdir, recursive=True)
        for f in files:
            fo = out_file_name(result_dir, f)
            # don't copy if it's the same file
            if os.path.abspath(f) != fo:
                shutil.copy2(f, fo)

        # remove temporary directory and its contents
        shutil.rmtree(tempdir)
    else:
        # extract archive directly to the output directory
        patoolib.extract_archive(archive, outdir=out_dir)

def safar_add_metadata(in_file, in_file_meta, max_len, out_dir):
    """Add metadata from a csv file to a SAFAR XML file.
    """
    create_dirs(out_dir)

    analysis_tag = None
    total_words = None
    markers = b'<markers></markers>'

    # check whether the analysis_tag should be stemmer_analysis
    with codecs.open(in_file, 'r', encoding='utf-8') as xml_file:
        for line in xml_file:
            if re.search('morphology_analysis', line):
                analysis_tag = 'morphology_analysis'
            elif re.search('stemmer_analysis', line):
                analysis_tag = 'stemmer_analysis'

            m = re.search(r'total_words="(\d+)"', line)
            if m:
                total_words = m.group(1)

            if analysis_tag is not None and total_words is not None:
                break

    # Extract the words and markers
    click.echo('Extracting tokens')
    (fd, tmpfile) = tempfile.mkstemp()
    with codecs.open(tmpfile, 'wb') as words:
        context = etree.iterparse(in_file, events=('end', ),
                                  tag=('word', 'markers'), huge_tree=True)
        context = tqdm(context, total=int(total_words))
        for event, elem in context:
            if elem.tag == 'word':
                # Setting method to html (instead of xml) fixes problems
                # with writing Arabic characters in the value attribute of
                # the word element.
                words.write(
                    etree.tostring(elem, encoding='utf-8', method='html'))
            elif elem.tag == 'markers':
                markers = etree.tostring(elem, encoding='utf-8')

            # make iteration over context fast and consume less memory
            # https://www.ibm.com/developerworks/xml/library/x-hiperfparse
            elem.clear()
            while elem.getprevious() is not None:
                del elem.getparent()[0]
    del context

    # Get the metadata
    md = pd.read_csv(in_file_meta, sep=',|;', index_col='BookURI',
                     encoding='utf-8')
    # make sure the index type is string
    if six.PY2:
        md.index = md.index.map(unicode)
    else:
        md.index = md.index.map(str)

    if '-' in os.path.basename(in_file):
        uri = os.path.basename(in_file).split('-', 1)[0]
    else:
        uri = remove_ext(in_file)

    try:
        md = md.loc[uri]
        metadata = [u'<metadata>']
        for key in md.keys()[1:]:  # skip over order (the old index)
            val = md[key]
            if isinstance(val, six.string_types):
                val = smart_strip(val)
                val = escape(val)
                # Make sure the values aren't too long, because BlackLab
                # doesn't allow values that are too long in dropdowns.
                # The default value of 94 was set empirically. It seems the
                # lengths of strings are calculated differently in Java (the
                # max length in Java is 256).
                if len(val) >= max_len:
                    val = 'X ' + val[:max_len - 2]
            metadata.append(u'<meta name="{}">{}</meta>'.format(key, val))
        metadata.append(u'<meta name="{}">{}</meta>'.format('BookURI', uri))
        metadata.append(u'</metadata>')
        metadata = u'\n'.join(metadata)
    except KeyError:
        metadata = u'<metadata></metadata>'

    # Write output
    click.echo('Writing output')
    xml_out = out_file_name(out_dir, in_file)
    with codecs.open(xml_out, 'wb') as f:
        f.write(b'<?xml version="1.0" encoding="utf-8"?>\n')
        f.write(b'<document>\n')
        f.write(metadata.encode('utf-8'))

        tag = ' <{} total_words="{}">\n'.format(analysis_tag, total_words)
        f.write(tag.encode('utf-8'))
        with codecs.open(tmpfile, 'rb') as words_file:
            for line in tqdm(words_file):
                f.write(line)
        f.write(' </{}>\n'.format(analysis_tag).encode('utf-8'))

        f.write(markers)
        f.write(b'</document>\n')

    os.remove(tmpfile)

def lstm_synced_correct_ocr(model, charset, text, out_dir):
    create_dirs(out_dir)

    # load model
    model = load_model(model)

    conf = model.get_config()
    conf_result = conf[0].get('config').get('batch_input_shape')
    seq_length = conf_result[1]

    char_embedding = False
    if conf[0].get('class_name') == u'Embedding':
        char_embedding = True

    charset = charset.read()
    n_vocab = len(charset)
    char_to_int = get_char_to_int(charset)
    int_to_char = get_int_to_char(charset)

    lowercase = True
    for c in u'ABCDEFGHIJKLMNOPQRSTUVWXYZ':
        if c in charset:
            lowercase = False
            break

    pad = u'\n'

    to_predict = read_text_to_predict(text.read(), seq_length, lowercase,
                                      n_vocab, char_to_int, padding_char=pad,
                                      char_embedding=char_embedding)

    outputs = []
    inputs = []

    predicted = model.predict(to_predict, verbose=0)
    for i, sequence in enumerate(predicted):
        predicted_indices = [np.random.choice(n_vocab, p=p) for p in sequence]
        pred_str = u''.join([int_to_char[j] for j in predicted_indices])
        outputs.append(pred_str)

        if char_embedding:
            indices = to_predict[i]
        else:
            indices = np.where(to_predict[i:i + 1, :, :] == True)[2]
        inp = u''.join([int_to_char[j] for j in indices])
        inputs.append(inp)

    idx = 0
    counters = {}
    for input_str, output_str in zip(inputs, outputs):
        if pad in output_str:
            output_str2 = align_output_to_input(input_str, output_str,
                                                empty_char=pad)
        else:
            output_str2 = output_str

        for i, (inp, outp) in enumerate(zip(input_str, output_str2)):
            if not idx + i in counters.keys():
                counters[idx + i] = Counter()
            counters[idx + i][outp] += 1
        idx += 1

    agg_out = []
    for idx, c in counters.items():
        agg_out.append(c.most_common(1)[0][0])

    corrected_text = u''.join(agg_out)
    corrected_text = corrected_text.replace(pad, u'')

    out_file = out_file_name(out_dir, text.name)
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        f.write(corrected_text)

def merge_safar_xml(in_dir, out_dir):
    """Command line tool that merges SAFAR xml files into a single file.
    """
    create_dirs(out_dir)

    in_files = get_files(in_dir)

    analysis_tag = 'morphology_analysis'

    words = []
    metadata = b'<metadata></metadata>'
    markers = {}
    marker_words = {}

    if len(in_files) == 0:
        msg = 'Unable to merge xml files, because the input directory is ' \
              'empty.'
        raise (ValueError(msg))
    else:
        num_words = 0

        fname = os.path.basename(in_files[0]).split('-')[0]
        xml_out = out_file_name(out_dir, u'{}.xml'.format(fname))

        click.echo('Reading xml files')
        (fd, tmpfile) = tempfile.mkstemp()
        with codecs.open(tmpfile, 'wb') as words:
            for i, fi in tqdm.tqdm(enumerate(in_files)):
                # Check whether we are dealing with a marker
                m = is_marked(os.path.basename(fi))
                if m:
                    mname = os.path.basename(fi).rsplit('-', 1)[0]

                if i == 0:
                    # check whether the analysis_tag should be
                    # stemmer_analysis and extract the metadata
                    context = etree.iterparse(fi, events=('end', ),
                                              tag=('stemmer_analysis',
                                                   'metadata'))
                    for event, elem in context:
                        if elem.tag == 'stemmer_analysis':
                            analysis_tag = elem.tag
                        elif elem.tag == 'metadata':
                            metadata = etree.tostring(elem, encoding='utf-8')

                # Check whether we are dealing with a marker
                if m:
                    if mname not in markers.keys():
                        markers[mname] = []
                        marker_words[mname] = []

                # Extract the words
                context = etree.iterparse(fi, events=('end', ), tag=('word'))
                for event, elem in context:
                    num_words += 1
                    elem.attrib['w_id'] = str(num_words)
                    if m:
                        markers[mname].append(str(num_words))
                        marker_words[mname].append(elem.attrib['value'])

                    # Setting method to html (instead of xml) fixes problems
                    # with writing Arabic characters in the value attribute of
                    # the word element.
                    words.write(
                        etree.tostring(elem, encoding='utf-8', method='html'))

                    # make iteration over context fast and consume less memory
                    # https://www.ibm.com/developerworks/xml/library/x-hiperfparse
                    elem.clear()
                    while elem.getprevious() is not None:
                        del elem.getparent()[0]
                del context

        # write the output
        click.echo('Writing output')
        with codecs.open(xml_out, 'wb') as f:
            f.write(b'<?xml version="1.0" encoding="utf-8"?>\n')
            f.write(b'<document>\n')
            f.write(metadata)

            tag = ' <{} total_words="{}">\n'.format(analysis_tag, num_words)
            f.write(tag.encode('utf-8'))
            with codecs.open(tmpfile, 'rb') as words_file:
                for line in tqdm.tqdm(words_file):
                    f.write(line)
            f.write(' </{}>\n'.format(analysis_tag).encode('utf-8'))

            f.write(b'<markers>\n')
            for fname, w_ids in markers.items():
                if 'header' in fname:
                    level = fname.rsplit('-', 1)[1]
                    f.write(marker_xml('header', marker_words[fname], w_ids,
                                       'level', level))
                else:
                    if 'QQuote' in fname:
                        typ = 'quran'
                    else:
                        typ = 'hadith'
                    f.write(marker_xml('quote', marker_words[fname], w_ids,
                                       'type', typ))
            f.write(b'</markers>\n')
            f.write(b'</document>\n')

        os.remove(tmpfile)

def safar_filter_analyses(in_file, out_dir):
    """Tool for filtering duplicate root/stem pairs from SAFAR output.
    """
    analyses = []
    markers = b'<markers></markers>'

    xml_out = out_file_name(out_dir, in_file)
    click.echo(xml_out)
    with codecs.open(xml_out, 'wb') as f:
        f.write('<?xml version="1.0" encoding="utf-8"?>\n'.encode('utf-8'))
        f.write('<document>\n'.encode('utf-8'))

        context = etree.iterparse(in_file, events=('start', ),
                                  tag=('morphology_analysis'))
        for event, elem in context:
            num_words = elem.attrib['total_words']
            break
        del context

        first_word = True
        context = etree.iterparse(in_file, events=('end', ),
                                  tag=('word', 'analysis', 'metadata',
                                       'markers'))
        for event, elem in tqdm(context):
            if elem.tag == 'word':
                if first_word:
                    tag = '<morphology_analysis total_words="{}">\n'. \
                        format(num_words)
                    f.write(tag.encode('utf-8'))
                    first_word = False

                analyses = list(set(analyses))
                tag = '<word total_analysis="{}" value="{}" w_id="{}">\n'
                tag = tag.format(len(analyses), elem.attrib['value'],
                                 elem.attrib['w_id'])
                f.write(tag.encode('utf-8'))
                f.write(b''.join(analyses))
                f.write('</word>\n'.encode('utf-8'))

                analyses = []
            elif elem.tag == 'analysis':
                for attribute in ('a_id', 'vowled', 'pattern', 'prefix',
                                  'suffix', 'additional_info', 'caze',
                                  'gender', 'mood', 'pos', 'type',
                                  'impartial', 'transitive', 'number'):
                    try:
                        del elem.attrib[attribute]
                    except KeyError:
                        pass
                # Setting method to html (instead of xml) fixes problems
                # with writing Arabic characters in the value attribute of
                # the word element.
                analyses.append(
                    etree.tostring(elem, encoding='utf-8', method='html'))
            elif elem.tag == 'metadata':
                f.write(etree.tostring(elem, encoding='utf-8'))
                f.write(b'\n')
            elif elem.tag == 'markers':
                markers = etree.tostring(elem, encoding='utf-8')

            # make iteration over context fast and consume less memory
            # https://www.ibm.com/developerworks/xml/library/x-hiperfparse
            elem.clear()
            while elem.getprevious() is not None:
                del elem.getparent()[0]
        del context

        f.write('</morphology_analysis>\n'.encode('utf-8'))
        f.write(markers)
        f.write(b'\n')
        f.write('</document>\n'.encode('utf-8'))

def copy_file(fi, name, out_dir, dest):
    fo = out_file_name(os.path.join(out_dir, dest), name)
    create_dirs(fo, is_file=True)
    shutil.copy2(fi, fo)

def test_out_file_name_same_ext():
    out_dir = '/home/jvdzwaan/data/'
    fname = 'foo.txt'

    assert out_file_name(out_dir, fname) == '/home/jvdzwaan/data/foo.txt'

def test_out_file_name_path():
    out_dir = '/home/jvdzwaan/data/'
    fname = '/other/path/foo.txt'

    assert out_file_name(out_dir, fname) == '/home/jvdzwaan/data/foo.txt'

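
# The tests above pin down the behaviour of the out_file_name helper used
# throughout these commands: join the output directory with the basename of
# the input file and optionally replace the extension. The helper itself is
# not included in this section; the sketch below is a minimal assumed
# implementation that satisfies the tests, not necessarily the original code.
def out_file_name(out_dir, fname, ext=None):
    """Return the path of fname inside out_dir, optionally with extension ext.

    Only the basename of fname is kept, so input files from other directories
    end up directly in out_dir. (Assumed implementation, for illustration.)
    """
    base = os.path.basename(fname)
    if ext is not None:
        base = '{}.{}'.format(os.path.splitext(base)[0], ext)
    return os.path.join(out_dir, base)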