def pattern_parse(in_file, language, out_dir):
    if language == 'en':
        from pattern.en import parsetree
    elif language == 'es':
        from pattern.es import parsetree
    elif language == 'de':
        from pattern.de import parsetree
    elif language == 'fr':
        from pattern.fr import parsetree
    elif language == 'it':
        from pattern.it import parsetree
    elif language == 'nl':
        from pattern.nl import parsetree

    tokens = parse(in_file.read(), parsetree)

    pattern_version = pattern.__version__
    header = {
        'format': 'SAF',
        'format-version': '0.1',
        'processed': [{
            'module': "pattern.{}".format(language),
            'module-version': pattern_version,
            'started': datetime.date.today().strftime('%Y-%m-%d')
        }]
    }

    out_file = out_file_name(out_dir, in_file.name, 'json')
    create_dirs(out_file, is_file=True)
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        json.dump({'header': header, 'tokens': tokens}, f, indent=4)
def create_word_mappings(saf, alignments, lowercase, out_dir):
    create_dirs(out_dir)

    alignment_data = json.load(alignments)
    aligned1 = alignment_data['gs']
    aligned2 = alignment_data['ocr']

    saf = json.load(saf)
    if lowercase:
        words = [w['word'].lower() for w in saf['tokens']]
        aligned1 = [c.lower() for c in aligned1]
        aligned2 = [c.lower() for c in aligned2]
    else:
        words = [w['word'] for w in saf['tokens']]

    wb = find_word_boundaries(words, aligned1)

    doc_id = remove_ext(alignments.name)

    res = {'gs': [], 'ocr': [], 'doc_id': []}
    for s, e in wb:
        w1 = u''.join(aligned1[s:e])
        w2 = u''.join(aligned2[s:e])

        res['gs'].append(w1.strip())
        res['ocr'].append(w2.strip())
        res['doc_id'].append(doc_id)

    # Use pandas DataFrame to create the csv, so commas and quotes are
    # properly escaped.
    df = pd.DataFrame(res)
    out_file = out_file_name(out_dir, doc_id, ext='csv')
    df.to_csv(out_file, encoding='utf-8')
def command(in_dir, out_dir, out_name):
    """Create a division of the data into train, test, and validation sets.

    The result is stored to a JSON file, so it can be reused.
    """
    # TODO: make seed and percentages options
    SEED = 4
    TEST_PERCENTAGE = 10
    VAL_PERCENTAGE = 10

    create_dirs(out_dir)

    in_files = get_files(in_dir)

    np.random.seed(SEED)
    np.random.shuffle(in_files)

    n_test = int(len(in_files) / 100.0 * TEST_PERCENTAGE)
    n_val = int(len(in_files) / 100.0 * VAL_PERCENTAGE)

    validation_texts = in_files[0:n_val]
    test_texts = in_files[n_val:n_val + n_test]
    train_texts = in_files[n_val + n_test:]

    division = {
        'train': [os.path.basename(t) for t in train_texts],
        'val': [os.path.basename(t) for t in validation_texts],
        'test': [os.path.basename(t) for t in test_texts]
    }

    out_file = os.path.join(out_dir, out_name)
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        json.dump(division, f, indent=4)
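# A hypothetical usage sketch (not part of the source): the division file
# written above stores basenames only, so a consumer has to join them back
# onto the input directory. `load_division` is an assumed name used for
# illustration only.
def load_division(division_file, in_dir):
    """Load a data division and resolve basenames back to full paths."""
    with codecs.open(division_file, encoding='utf-8') as f:
        division = json.load(f)
    return {split: [os.path.join(in_dir, name) for name in names]
            for split, names in division.items()}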
def saf_to_text(in_dir, out_dir, mode):
    create_dirs(out_dir)

    if mode not in ('word', 'lemma'):
        raise ValueError("Unknown mode: {mode}, "
                         "please choose either word or lemma"
                         .format(**locals()))

    in_files = get_files(in_dir)
    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            saf = json.load(f)

        s_id = None
        lines = []
        for t in saf['tokens']:
            if s_id is None:
                s_id = t['sentence']
                sentence = []
            elif t['sentence'] != s_id:
                lines.append(u' '.join(sentence))
                sentence = []
                s_id = t['sentence']
            sentence.append(t[mode])
        # Flush the final sentence; without this the last sentence of every
        # document would be dropped.
        if s_id is not None:
            lines.append(u' '.join(sentence))

        out_file = out_file_name(out_dir, os.path.basename(fi), ext='txt')
        with codecs.open(out_file, 'wb', encoding='utf-8') as f:
            f.write(u'\n'.join(lines))
            f.write(u'\n')
def rmgarbage(in_file, out_dir):
    create_dirs(out_dir)

    text = in_file.read()
    words = text.split()

    doc_id = os.path.basename(in_file.name).split('.')[0]

    result = []
    removed = []
    for word in words:
        errors = get_rmgarbage_errors(word)
        if len(errors) == 0:
            result.append(word)
        else:
            removed.append({
                'word': word,
                'errors': u''.join(errors),
                'doc_id': doc_id
            })

    out_file = out_file_name(out_dir, in_file.name)
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        f.write(u' '.join(result))

    metadata_out = pd.DataFrame(removed)
    fname = '{}-rmgarbage-metadata.csv'.format(doc_id)
    out_file = out_file_name(out_dir, fname)
    metadata_out.to_csv(out_file, encoding='utf-8')
def command(ner_statistics, keep, name, out_dir):
    df = pd.read_csv(ner_statistics, index_col=0, encoding='utf-8')
    df = df.query(u' or '.join([u'ner=="{}"'.format(k) for k in keep]))

    output_file = os.path.join(out_dir, name)
    create_dirs(output_file)
    df.to_csv(output_file, encoding='utf-8')
def check_utf8(in_dir, convert, processes, out_dir):
    create_dirs(out_dir)

    in_files = get_files(in_dir)

    check = partial(check_file, convert=convert, out_dir=out_dir)

    pool = Pool(processes=processes)
    pool.map(check, in_files)
def lowercase(in_file, out_dir):
    create_dirs(out_dir)

    text = in_file.read()
    text = text.lower()

    stdout_text = click.get_text_stream('stdout')
    stdout_text.write(text)
def prettify_xml(in_file, out_dir):
    create_dirs(out_dir)

    bs = BeautifulSoup(in_file.read(), 'xml')

    out_file = out_file_name(out_dir, in_file.name, 'xml')
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        f.write(bs.prettify())
def command(in_dir, datadivision, name, out_dir):
    create_dirs(out_dir)

    div = json.load(datadivision)
    files_out = [cwl_file(f) for f in get_files(in_dir, div, name)]

    stdout_text = click.get_text_stream('stdout')
    stdout_text.write(json.dumps({'out_files': files_out}))
def ocrevaluation_extract(in_file, out_dir):
    create_dirs(out_dir)

    tables = []

    write = False
    (fd, tmpfile) = tempfile.mkstemp()
    with codecs.open(tmpfile, 'w', encoding='utf-8') as tmp:
        for line in in_file:
            if line.startswith('<h2>General'):
                write = True
            if line.startswith('<h2>Difference'):
                write = False
            if line.startswith('<h2>Error'):
                write = True

            if write:
                tmp.write(line)

    with codecs.open(tmpfile, encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'lxml')
        tables = soup.find_all('table')
        assert len(tables) == 2
    os.remove(tmpfile)

    doc = remove_ext(in_file.name)

    t = tables[0]
    table_data = [[cell.text for cell in row('td')] for row in t('tr')]

    # 'transpose' table_data
    lines = {}
    for data in table_data:
        for i, entry in enumerate(data):
            if i not in lines.keys():
                # add doc id to data line (but not to header)
                if i != 0:
                    lines[i] = [doc]
                else:
                    lines[i] = ['']
            lines[i].append(entry)

    out_file = os.path.join(out_dir, '{}-global.csv'.format(doc))
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        for i in range(len(lines.keys())):
            f.write(u','.join(lines[i]))
            f.write(u'\n')

    t = tables[1]
    table_data = [[cell.text for cell in row('td')] for row in t('tr')]

    out_file = os.path.join(out_dir, '{}-character.csv'.format(doc))
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        for data in table_data:
            f.write(u'"{}",'.format(data[0]))
            f.write(u','.join(data[1:]))
            f.write(u'\n')
def normalize_whitespace_punctuation(txt, out_dir):
    create_dirs(out_dir)

    text = txt.read()
    text = normalize_whitespace(text)
    text = normalize_punctuation(text)

    out_file = out_file_name(out_dir, os.path.basename(txt.name))
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        f.write(text)
def command(xml_file, element, out_dir):
    create_dirs(out_dir)

    bs = BeautifulSoup(xml_file.read(), 'xml')

    for elem in element:
        to_empty = bs.find_all(elem)
        for t in to_empty:
            t.decompose()

    out_file = out_file_name(out_dir, os.path.basename(xml_file.name))
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        f.write(bs.prettify())
def remove_newlines(in_file, replacement, out_dir):
    create_dirs(out_dir)

    text = in_file.read()
    if replacement == u'space':
        text = re.sub('\n+', u' ', text)
    else:
        text = text.replace(u'\n', u'')
    text = text.strip()

    stdout_text = click.get_text_stream('stdout')
    stdout_text.write(text)
def command(in_file, rename, out_dir):
    create_dirs(out_dir)

    ext = os.path.splitext(in_file)[1].replace('.', '')
    fname = os.path.basename(in_file)

    if rename == 'spaces':
        fname = fname.replace(' ', '-')
    elif rename == 'random':
        fname = '{}.{}'.format(uuid.uuid4(), ext)

    fo = out_file_name(out_dir, fname)
    shutil.copy2(in_file, fo)
def command(xml_file, element, out_dir):
    create_dirs(out_dir)

    bs = BeautifulSoup(xml_file.read(), 'xml')

    for elem in element:
        to_empty = bs.find_all(elem)
        for t in to_empty:
            t.clear()

    out_file = out_file_name(out_dir, os.path.basename(xml_file.name))
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        f.write(bs.prettify())
def command(in_dir, out_dir, tika_server):
    create_dirs(out_dir)

    in_files = get_files(in_dir)

    for fi in in_files:
        if tika_server:
            parsed = parser.from_file(fi, tika_server)
        else:
            parsed = parser.from_file(fi)

        out_file = out_file_name(out_dir, fi, 'txt')
        with codecs.open(out_file, 'wb', encoding='utf-8') as f:
            f.write(parsed['content'])
def freqs(in_dir, out_dir, name):
    out_file = os.path.join(out_dir, name)
    create_dirs(out_file)

    in_files = get_files(in_dir)

    vectorizer = CountVectorizer(input='filename', tokenizer=split)
    X = vectorizer.fit_transform(in_files)

    freqs = np.array(X.sum(axis=0)).squeeze()
    vocab_df = pd.DataFrame(
        {'word': vectorizer.get_feature_names(), 'freq': freqs})
    vocab_df['rank'] = vocab_df['freq'].rank(method='first', ascending=False)
    # DataFrame.sort() was removed in recent pandas versions; sort_values()
    # is the drop-in replacement.
    vocab_df = vocab_df.sort_values('rank')

    vocab_df.to_csv(out_file, encoding='utf-8', index=False)
def command(ocr_text, gs_text, metadata, out_dir):
    create_dirs(out_dir)

    ocr = ocr_text.read()
    gs = gs_text.read()
    md = json.load(metadata)

    check = True
    # Too many strange characters, so disable sanity check
    if len(set(ocr + gs)) > 127:
        check = False

    ocr_a, gs_a = align_characters(ocr, gs, md['cigar'], sanity_check=check)

    out_file = out_file_name(out_dir, md['doc_id'], 'json')
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        json.dump({'ocr': ocr_a, 'gs': gs_a}, f, encoding='utf-8', indent=4)
def frog2saf(in_dir, out_dir):
    create_dirs(out_dir)

    in_files = get_files(in_dir)

    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            lines = f.readlines()
        lines = [line.strip() for line in lines]
        saf_data = frog_to_saf(parse_frog(lines))

        head, tail = os.path.split(fi)
        fname = tail.replace(os.path.splitext(tail)[1], '')

        # out_file_name already includes out_dir in the path it returns (see
        # its other uses above), so no extra os.path.join is needed.
        out_file = out_file_name(out_dir, fname, 'json')
        with codecs.open(out_file, 'wb', encoding='utf-8') as f:
            json.dump(saf_data, f, indent=4)
def command(meta_in, meta_out):
    create_dirs(meta_out)

    cgn_tags = pd.read_csv(meta_in, index_col=0, encoding='utf-8')

    result = {}
    for tag in cgn_tags.index:
        click.echo(tag)
        for cgn, uni in tag_mapping.items():
            if tag.startswith(cgn):
                result[tag] = {'pos': uni}

    click.echo(result)

    with codecs.open(meta_out, 'wb', encoding='utf-8') as f:
        json.dump(result, f, indent=4, encoding='utf-8')
def delete_empty_files(in_dir, out_dir):
    create_dirs(out_dir)

    in_files = get_files(in_dir)

    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            text = f.read()

        if len(text.strip()) > 0:
            fname = out_file_name(out_dir, fi)
            try:
                shutil.copy2(fi, fname)
            except shutil.Error:
                pass
        else:
            print('deleting {}'.format(os.path.basename(fi)))
            if os.path.abspath(in_dir) == os.path.abspath(out_dir):
                os.remove(fi)
def copy_cwl_files(from_dir=CWL_PATH, to_dir=None):
    """Copy cwl files to a directory where the cwl-runner can find them.

    Args:
        from_dir (str): Path to directory where to copy files from (default:
            the cwl directory of nlppln).
        to_dir (str): Path to directory where the files should be copied to
            (e.g., the CWL working directory).
    """
    cwl_files = glob.glob('{}{}*.cwl'.format(from_dir, os.sep))

    # if no files are found, the output directory should not be created
    if len(cwl_files) > 0:
        create_dirs(to_dir)

    for fi in cwl_files:
        fo = os.path.join(to_dir, os.path.basename(fi))
        shutil.copy2(fi, fo)

    return len(cwl_files)
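# A minimal usage sketch (assumed, not from the source): copy the bundled
# CWL steps into a local working directory and report how many were found.
# 'cwl-working-dir' is a made-up example path.
if __name__ == '__main__':
    n_copied = copy_cwl_files(to_dir='cwl-working-dir')
    print('copied {} .cwl files from {}'.format(n_copied, CWL_PATH))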
def create_chunked_list(in_dir, size, out_dir, out_name):
    """Divide the input files into chunks.

    The result is stored to a JSON file.
    """
    create_dirs(out_dir)

    in_files = get_files(in_dir)
    chunks = chunk(in_files, size)

    division = {}
    for i, files in enumerate(chunks):
        division[i] = [os.path.basename(f) for f in files]

    out_file = os.path.join(out_dir, out_name)
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        json.dump(division, f, indent=4)
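# The chunk() helper used above is not shown in this collection. A minimal
# sketch consistent with its usage (split a list of files into successive
# lists of at most `size` items) could look like this; it is an assumption,
# not the actual implementation.
def chunk(items, size):
    """Yield successive chunks of at most `size` items from a list."""
    for i in range(0, len(items), size):
        yield items[i:i + size]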
def clin2018st_extract_text(json_file, out_dir):
    create_dirs(out_dir)

    corrections = {}
    gs_text = []
    text_with_errors = []

    text = json.load(json_file)

    for w in text['corrections']:
        span = w['span']
        # TODO: fix 'after'
        if 'after' in w.keys():
            print('Found "after" in {}.'.format(
                os.path.basename(json_file.name)))
        for i, w_id in enumerate(span):
            corrections[w_id] = {}
            if i == 0:
                corrections[w_id]['text'] = w['text']
            else:
                corrections[w_id]['text'] = u''
            corrections[w_id]['last'] = False
            if i == (len(span) - 1):
                corrections[w_id]['last'] = True

    for w in text['words']:
        w_id = w['id']
        gs_text.append(w['text'])
        if w_id in corrections.keys():
            text_with_errors.append(corrections[w_id]['text'])
        else:
            text_with_errors.append(w['text'])
        if w['space']:
            gs_text.append(u' ')
            text_with_errors.append(u' ')

    gs_file = remove_ext(json_file.name)
    gs_file = os.path.join(out_dir, '{}-gs.txt'.format(gs_file))
    with codecs.open(gs_file, 'wb', encoding='utf-8') as f:
        f.write(u''.join(gs_text))

    err_file = remove_ext(json_file.name)
    err_file = os.path.join(out_dir, '{}-errors.txt'.format(err_file))
    with codecs.open(err_file, 'wb', encoding='utf-8') as f:
        f.write(u''.join(text_with_errors))
def ocrevaluation_extract(in_file, out_dir):
    create_dirs(out_dir)

    soup = BeautifulSoup(in_file, 'lxml')

    tables = []
    for header in soup.find_all('h2'):
        if (header.text == 'General results'
                or header.text.startswith('Error rate')):
            tables.append(header.find_next('table'))
    assert len(tables) == 2

    doc = remove_ext(in_file.name)

    t = tables[0]
    table_data = [[cell.text for cell in row('td')] for row in t('tr')]

    # 'transpose' table_data
    lines = {}
    for data in table_data:
        for i, entry in enumerate(data):
            if i not in lines.keys():
                # add doc id to data line (but not to header)
                if i != 0:
                    lines[i] = [doc]
                else:
                    lines[i] = ['doc_id']
            lines[i].append(entry.replace(',', '.'))

    out_file = os.path.join(out_dir, '{}-global.csv'.format(doc))
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        for i in range(len(lines.keys())):
            f.write(u';'.join(lines[i]))
            f.write(u'\n')

    t = tables[1]
    table_data = [[cell.text.replace(',', '.') for cell in row('td')]
                  for row in t('tr')]

    out_file = os.path.join(out_dir, '{}-character.csv'.format(doc))
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        for data in table_data:
            f.write(u'"{}";'.format(data[0]))
            f.write(u';'.join(data[1:]))
            f.write(u'\n')
def merge2openiti(in_file1, in_file2, out_dir):
    create_dirs(out_dir)

    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')

    lines1 = in_file1.readlines()
    lines2 = in_file2.readlines()

    merged = []
    # note: only the first 10 lines of each input are merged
    for l1, l2 in zip(lines1[:10], lines2[:10]):
        merged_sentence = merge_sentences(l1, l2)
        merged.append(merged_sentence)

    out_file = out_file_name(out_dir, in_file1.name)
    print(out_file)
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        f.write(''.join(merged))
def nerstats(in_dir, out_dir, name):
    create_dirs(out_dir)

    frames = []

    in_files = get_files(in_dir)
    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            saf = json.load(f)

        data = {}
        data['word'] = [t['word'] for t in saf['tokens'] if 'ne' in t.keys()]
        data['ner'] = [t['ne'] for t in saf['tokens'] if 'ne' in t.keys()]
        data['w_id'] = [t['id'] for t in saf['tokens'] if 'ne' in t.keys()]
        data['text'] = [os.path.basename(fi) for t in saf['tokens']
                        if 'ne' in t.keys()]

        frames.append(pd.DataFrame(data=data))

    df = pd.concat(frames, ignore_index=True)
    df.to_csv(os.path.join(out_dir, name), encoding='utf-8')
def xml_to_text(in_dir, out_dir, tag):
    create_dirs(out_dir)

    in_files = get_files(in_dir)
    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            root = etree.ElementTree().parse(f)

        if tag is not None:
            elements = list(root.iter('{*}' + tag))
        else:
            elements = [root]

        texts = []
        for el in elements:
            texts.append(' '.join(
                [e.text for e in el.iterdescendants() if e.text is not None]))

        out_file = out_file_name(out_dir, fi, 'txt')
        with codecs.open(out_file, 'wb', encoding='utf-8') as f:
            f.write('\n'.join(texts))
            f.write('\n')
def command(in_file, out_dir):
    create_dirs(out_dir)

    lines = in_file.readlines()

    # OCR_toInput: lines[0][:14]
    # OCR_aligned: lines[1][:14]
    # GS_aligned: lines[2][:14]
    ocr = to_character_list(lines[1][14:].strip())
    gs = to_character_list(lines[2][14:].strip())

    # Write texts
    out_file = out_file_name(os.path.join(out_dir, 'ocr'),
                             os.path.basename(in_file.name))
    print(out_file)
    create_dirs(out_file)
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        f.write(u''.join(ocr))

    out_file = out_file_name(os.path.join(out_dir, 'gs'),
                             os.path.basename(in_file.name))
    print(out_file)
    create_dirs(out_file)
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        f.write(u''.join(gs))

    out_file = out_file_name(out_dir, os.path.basename(in_file.name), 'json')
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        json.dump({'ocr': ocr, 'gs': gs}, f, encoding='utf-8', indent=4)
def ocrevaluation_extract(in_file, out_dir):
    create_dirs(out_dir)

    soup = BeautifulSoup(in_file, 'lxml')
    tables = soup.find_all('table')
    assert len(tables) == 3

    doc = remove_ext(in_file.name)

    # global measures: table[0]
    t = tables[0]
    table_data = [[cell.text for cell in row('td')] for row in t('tr')]

    # 'transpose' table_data
    lines = {}
    for data in table_data:
        for i, entry in enumerate(data):
            if i not in lines.keys():
                # add doc id to data line (but not to header)
                if i != 0:
                    lines[i] = [doc]
                else:
                    lines[i] = ['']
            lines[i].append(entry)

    out_file = os.path.join(out_dir, '{}-global.csv'.format(doc))
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        for i in range(len(lines.keys())):
            f.write(u','.join(lines[i]))
            f.write(u'\n')

    # character measures: table[2]
    t = tables[2]
    table_data = [[cell.text for cell in row('td')] for row in t('tr')]

    out_file = os.path.join(out_dir, '{}-character.csv'.format(doc))
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        for data in table_data:
            f.write(u'"{}",'.format(data[0]))
            f.write(u','.join(data[1:]))
            f.write(u'\n')
def freqs(in_dir, out_dir, name, mode):
    if mode not in ('word', 'lemma'):
        raise ValueError("Unknown mode: {mode}, "
                         "please choose either word or lemma"
                         .format(**locals()))

    output_file = out_file_name(out_dir, name)
    create_dirs(output_file)

    in_files = get_files(in_dir)

    cnt = Counter()
    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            saf = json.load(f)
        for token in saf['tokens']:
            word = token[mode]
            pos = token['pos1']
            cnt.update({(word, pos): 1})

    data = [(word, pos, count) for ((word, pos), count) in cnt.most_common()]
    vocab_df = pd.DataFrame(data, columns=[mode, 'pos', 'cnt'])
    vocab_df['rank'] = vocab_df.index + 1

    vocab_df.to_csv(output_file, encoding='utf-8', index=False)
def match_ocr_and_gs(ocr_dir, gs_dir, out_dir):
    create_dirs(out_dir)

    ocr_files = {os.path.basename(f): f for f in get_files(ocr_dir)}
    gs_files = {os.path.basename(f): f for f in get_files(gs_dir)}

    ocr = set(ocr_files.keys())
    gs = set(gs_files.keys())

    if len(ocr) == 0:
        raise ValueError('No ocr files in directory "{}".'.format(ocr_dir))
    if len(gs) == 0:
        raise ValueError('No gs files in directory "{}".'.format(gs_dir))

    keep = ocr.intersection(gs)
    if len(keep) == 0:
        raise ValueError('No matching ocr and gs files.')

    for name in keep:
        copy_file(ocr_files[name], name, out_dir, 'ocr')
        copy_file(gs_files[name], name, out_dir, 'gs')
def basic_text_statistics(in_dir, out_dir, name):
    create_dirs(out_dir)

    d = {'num_words': [], 'num_sentences': []}
    text_names = []

    in_files = get_files(in_dir)
    for fi in in_files:
        with codecs.open(fi, encoding='utf-8') as f:
            text = json.load(f, encoding='utf-8')

        text_id = os.path.splitext(os.path.basename(fi))[0]
        text_names.append(text_id)

        d['num_words'].append(len(text['tokens']))

        sentences = [t['sentence'] for t in text['tokens']]
        num_sentences = len(set(sentences))
        d['num_sentences'].append(num_sentences)

    df = pd.DataFrame(d, index=text_names)

    meta_out = out_file_name(out_dir, name)
    df.to_csv(meta_out, encoding='utf-8')
def archive2dir(archive, remove_dir_structure, out_dir):
    if remove_dir_structure:
        result_dir = os.path.join(out_dir, str(uuid.uuid4()))
        create_dirs(result_dir)

        # make temporary directory
        tempdir = tempfile.mkdtemp()

        # extract archive to temporary directory
        patoolib.extract_archive(archive, outdir=tempdir)

        # copy extracted files to output dir
        files = get_files(tempdir, recursive=True)
        for f in files:
            fo = out_file_name(result_dir, f)
            # don't copy if it's the same file
            if os.path.abspath(f) != fo:
                shutil.copy2(f, fo)

        # remove temporary directory and its contents
        shutil.rmtree(tempdir)
    else:
        # extract archive directly to the output directory
        patoolib.extract_archive(archive, outdir=out_dir)
def test_create_dirs_with_file_name(fs):
    # Uses pyfakefs http://pyfakefs.org
    create_dirs('/test/test/test.txt', is_file=True)

    assert os.path.exists('/test/test/')
    assert not os.path.exists('/test/test/test.txt')
def test_create_dirs_with_dir_name(fs):
    # Uses pyfakefs http://pyfakefs.org
    create_dirs('/test/test/')

    assert os.path.exists('/test/test/')
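# create_dirs() itself is not shown in this collection. A minimal sketch that
# satisfies both tests above (the directories are created, the file itself is
# not) could look like this; it is an assumption, not the actual
# implementation.
def create_dirs(fname, is_file=False):
    """Create the directories in fname.

    If fname points to a file, only its parent directories are created.
    """
    if is_file:
        fname = os.path.dirname(fname)
    if fname and not os.path.exists(fname):
        os.makedirs(fname)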