Example #1
def generate_editions(editions, out_dir, work_name):
    # write each edition to its own folder
    for e in editions:
        path = out_dir / 'editions' / e.replace('་', '།')
        file_name = work_name + '_' + e + '.txt'
        # take the first element of each item; 'item' avoids shadowing the loop variable
        content = ''.join([item[0] for item in editions[e]]).replace('_', ' ')
        write_file(path / file_name, content)
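These snippets all rely on open_file and write_file from PyTib.common (imported explicitly in Example #15). A minimal sketch of the behavior the snippets assume, not PyTib's actual implementation:

from pathlib import Path

def open_file(path):
    # assumed behavior: read a whole file as UTF-8 text
    return Path(path).read_text(encoding='utf-8')

def write_file(path, content):
    # assumed behavior: create missing parent folders, then write UTF-8 text
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(content, encoding='utf-8')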
Example #2
def export_unified_structure(editions,
                             text_name,
                             out_dir=outDir / 'unified_structure'):
    unified = generate_unified_version(editions)
    out = yaml.dump(unified,
                    allow_unicode=True,
                    default_flow_style=False,
                    width=float("inf"))
    write_file(out_dir / f'{text_name}_unified_structure.yaml', out)
Example #3
def copy_derge_layout(derge_layout):
    for f in os.listdir('output'):
        no_layout = True
        if f in [a.replace('_raw_page_reinserted.txt', '') for a in os.listdir(derge_layout)]:
            content = open_file('{}/{}_raw_page_reinserted.txt'.format(derge_layout, f))
            reformatted = re.sub(r'\n-+', '', content).replace('\\', '')
            write_file('output/{}/{}'.format(f, f+'_derge_layout.txt'), reformatted)
            no_layout = False
        if no_layout and f in [a.replace('_with_a.txt', '') for a in os.listdir('../4-a-final_formatting/output/2-0-with_a')]:
            content = open_file('../4-a-final_formatting/output/2-0-with_a/{}_with_a.txt'.format(f))
            # 'a' is the line-break marker in the *_with_a files
            reformatted = content.replace('\n', ' ').replace('a', '\n')
            write_file('output/{}/{}'.format(f, f + '_derge_lines.txt'), reformatted)
Example #4
def reconstruct_version_texts(in_path):
    for f in os.listdir(in_path):
        # str.strip() removes characters, not a suffix; use replace() instead
        text_name = f.replace('_updated_structure.txt', '')
        # only process texts that already have an output folder
        if text_name in os.listdir('output'):
            current_out_folder = 'output/' + text_name
            # open structure file (safe_load avoids yaml.load's unsafe default)
            from_structure = yaml.safe_load(open_file('{}/{}'.format(in_path, f)))
            # reconstruct the editions
            editions = reconstruct_edition_versions(from_structure)
            # write them in the corresponding folder
            for ed, version in editions.items():
                version = version.replace('_', ' ')  # reconstruct spaces
                write_file('{}/{}_{}_layer.txt'.format(current_out_folder, text_name, ed), version)
Example #5
def process(in_path, file_origin, name_end, out_path):
    for f in os.listdir(in_path):
        work_name = f.replace(name_end, '')
        # raw_content = open_file(file_origin.format(work_name.replace('_', ' ')))
        try:
            raw_content = open_file(file_origin.format(work_name))
        except FileNotFoundError:
            continue

        content = re.sub(r'\n?[0-9]+\.\s+', '', raw_content)
        content = re.sub(r' ', '\n', content)
        write_file(out_path[0].format(work_name), content)

        content = content.replace('a', '')
        write_file(out_path[1].format(work_name), content)
Example #6
def reinsert_raw(in_path, out_path, patterns):
    print('raw reinsertion')
    for f in os.listdir(in_path):
        work_name = f.replace('_with_a.txt', '')
        print(work_name)
        if work_name in patterns:
            print('\t', work_name)
            content = open_file('{}/{}'.format(in_path, f))
            lines = deque(content.replace('\n', ' ').split('a'))

            pages = []
            text_pattern = patterns[work_name][2:]
            counter = patterns[work_name][0][1]
            side = patterns[work_name][0][2]

            # beginning pages
            for num in text_pattern[0]:
                pages.append(create_page(lines, num, counter, side))
                counter, side = increment_counter(counter, side)

            # body of the text
            while len(lines) > 0:
                if len(lines) >= text_pattern[1]:
                    pages.append(
                        create_page(lines, text_pattern[1], counter, side))
                else:
                    # fewer lines left than a full page: put them all on the last page
                    # (expected last-page count and any other remainder are handled alike)
                    pages.append(create_page(lines, len(lines), counter, side))
                counter, side = increment_counter(counter, side)

            output = '\n{}\n'.format('-' * 100).join(pages)

            write_file(
                '{}/{}_raw_page_reinserted.txt'.format(out_path, work_name),
                output)
            print('2-2!')
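The dense indexing into patterns above is easier to follow against a concrete entry. The shape below is inferred from the subscripts in the code; every name and number is hypothetical:

# Hypothetical patterns entry (illustration only):
patterns = {
    'some_work': [
        (None, 1, 'a'),  # [0]: (..., starting page counter, starting side)
        None,            # [1]: not used by this function (sliced off by [2:])
        [3, 5],          # text_pattern[0]: lines on each of the beginning pages
        7,               # text_pattern[1]: lines on a regular full page
        4,               # text_pattern[2]: expected line count of the last page
    ],
}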
Example #7
    # replace them with spaces
    text = []
    for line in content[:-1]:  # note: the last line of content is not processed
        for t in to_delete:
            line = line.replace(t, ' ')
        text.append(re.sub(r'\s+', ' ', line))

    lexicon = []
    for t in text:
        lexicon.extend([u.strip('་')+'་' for u in t.split(' ') if u.strip('་') != ''])
    new.extend(lexicon)
new = list(set(new))

oral_corpus_num = 0
extant_lexicon = []
extant_lexicon.extend(open_file('../updateJs/src/TDC.txt').split('\n'))
extant_lexicon.extend(open_file('../updateJs/src/verbs.txt').split('\n'))
extant_lexicon.extend(open_file('../updateJs/src/particles.txt').split('\n'))
for f in os.listdir('../updateJs/src/new_entries/'):
    extant_lexicon.extend(open_file('../updateJs/src/new_entries/'+f).split('\n'))
    number = int(f.split('.')[0].split('_')[2])
    if number > oral_corpus_num:
        oral_corpus_num = number

new_entries = [n for n in new if n not in extant_lexicon]

write_file(out_path+'all_entries{}.txt'.format(oral_corpus_num+1), '\n'.join(tib_sort(new)))
if new_entries:
    write_file('../updateJs/src/new_entries/oral_corpus_{}.txt'.format(oral_corpus_num+1), '\n'.join(tib_sort(new_entries)))
Example #8
def generate_context_versions(editions,
                              file_name,
                              out_dir,
                              left=5,
                              right=5,
                              base_ed='སྡེ་'):
    def calculate_contexts(unified_version, left=5, right=5, base_ed='སྡེ་'):
        all_versions = []
        c = 0
        for num, syl in enumerate(unified_version):
            if type(syl) == dict:
                versions = {}
                for ed in syl:
                    # add left context
                    n_l = num - left
                    if n_l < 0:
                        n_l = 0
                    left_context = unified_version[n_l:num]
                    # add note
                    note = syl[ed]
                    # add right context
                    n_r = num + right + 1
                    # clamp to len() so the final syllable is not dropped
                    if n_r > len(unified_version):
                        n_r = len(unified_version)
                    right_context = unified_version[num + 1:n_r]
                    version = left_context + note + right_context
                    # if there is a note (if version[v] == dict), choose the base_ed version
                    no_note_version = []
                    for v in version:
                        if type(v) == dict:
                            for base_syl in v[base_ed]:
                                no_note_version.append(base_syl)
                        else:
                            no_note_version.append(v)
                    # store this edition's version
                    versions[ed] = ''.join(no_note_version).replace('_', ' ')
                c += 1
                versions[str(c)] = ''
                all_versions.append(versions)
        return all_versions

    unified = generate_unified_version(editions)
    with_context = calculate_contexts(unified,
                                      left=left,
                                      right=right,
                                      base_ed=base_ed)
    for i in range(len(with_context)):
        with_context[i] = [[a, with_context[i][a]]
                           for a in sorted(with_context[i])]
    output = yaml.dump_all(with_context,
                           allow_unicode=True,
                           default_flow_style=False,
                           width=float("inf"))
    # reformat the page number
    output = re.sub(r'\n- -([^\n]+)\n  -', r'\n\1: ', output)
    output = re.sub(r"---\n '([0-9]+)':  ''", r'-\1-', output)
    output = re.sub(r"- - '1'\n  - ''", r'-1-',
                    output).replace(" '", '').replace("'", '')
    output = re.sub(r'\n', r',,,,,,,,,,,,,,,\n', output)  # Todo
    # note: joining a string with a leading '/' would make the path absolute and discard out_dir
    write_file(out_dir / 'conc_yaml' / f'{file_name}_conc.txt', output)
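For reference, calculate_contexts walks a list mixing plain syllables with per-edition note dicts whose values are syllable lists; the data below is invented for illustration:

# Hypothetical unified version (illustration only):
unified_version = [
    'བཀྲ་', 'ཤིས་',                      # syllables common to all editions
    {'སྡེ་': ['བདེ་'], 'ཅོ་': ['བདེར་']},  # a note: one reading per edition, as a syllable list
    'ལེགས་',
]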
Example #9
def copy_cat_json_file(json_path):
    for f in os.listdir('output'):
        write_file('output/{}/{}'.format(f, f+'_cats.json'), open_file('{}/{}_cats.json'.format(json_path, f)))
Example #10
                        output.append(note)
                        notes.append(format_footnote(s, decision, ref))
                        note_map.append('K')
                        stats[decision] += 1
                        if grouped_unified[num] == s:
                            similar_notes += 1
                    else:
                        note_map.append('0')
                        stats[decision] += 1
                        if grouped_unified[num] == s:
                            similar_notes += 1
                        continue

        prepared = ''.join(output).replace(' ', '').replace('#', '').replace(
            '_', ' ').replace(' ', '\n')
        write_file('output/0-1-formatted/{}_formatted.txt'.format(work_name),
                   prepared + '\n\n' + '\n'.join(notes))
        write_file('output/0-3-corrected/{}_corrected.txt'.format(work_name),
                   prepared + '\n\n' + '\n'.join(notes))

        # Stats
        total = sum(stats.values())
        percentages = {}
        for kind, value in stats.items():
            percentages[kind] = (value, value * 100 / total)
        discarded_notes = percentages['D'][0] + percentages['U'][0]
        kept_notes = percentages['C'][0] + percentages['K'][0] + percentages[
            '?'][0]

        statistics = []
Example #11
missing_space = r'^([^ -])'
missing_space_rpl = r' \1'

missing_tsek = r'(ཅོ|སྣར|སྡེ|པེ):'
missing_tsek_rpl = r'\1་:'

files = [a for a in os.listdir('.') if a != 'conc_sanity_check.py']

for f in files:
    print(f)
    raw = open_file('./' + f)
    raw = re.sub(missing_space, missing_space_rpl, raw, flags=re.M)  # '^' must match each line
    raw = re.sub(missing_tsek, missing_tsek_rpl, raw)
    # no-op as written: the search and replacement strings are identical
    raw = raw.replace('-1-,,,,,,,,,,,,,,,', '-1-,,,,,,,,,,,,,,,')
    write_file('./' + f, raw)
    lines = raw.split('\n')
    for num, line in enumerate(lines):
        toprint = False
        if line.startswith('-'):
            pass
        elif line.startswith(r' ཅོ་:'):
            pass
        elif line.startswith(r' སྣར་:'):
            pass
        elif line.startswith(' སྡེ་:'):
            pass
        elif line.startswith(' པེ་:'):
            pass
        elif num == len(lines)-1 and line.strip() == '':
            pass
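To make the two fixes concrete, here is a hypothetical line run through them (the Tibetan content is invented):

import re

line = 'ཅོ: བསྒྲུབས།'                               # missing leading space and tsek
line = re.sub(r'^([^ -])', r' \1', line)           # -> ' ཅོ: བསྒྲུབས།'
line = re.sub(r'(ཅོ|སྣར|སྡེ|པེ):', r'\1་:', line)   # -> ' ཅོ་: བསྒྲུབས།'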
Example #12
def create_base_text(raw_path):
    for f in os.listdir('output'):
        content = open_file('{}/{}_raw.txt'.format(raw_path, f))
        # put back in a single line
        content = content.replace('\n', ' ')
        write_file('output/{}/{}_base.txt'.format(f, f), content)
Example #13
def reinsert(in_path, out_path1, out_path2, patterns):
    print('reinsertion with notes')
    for f in os.listdir(in_path):
        work_name = f.replace('_a_reinserted.txt', '')
        if work_name in patterns:
            print('\t', work_name)
            content = open_file('{}/{}'.format(in_path, f))
            if not re.findall(r'\n\[\^[0-9A-Z]+\]\:', content):
                text = content
                notes = ''
            else:
                text, notes = [
                    a for a in re.split(
                        r'((?:\n?\[\^[0-9A-Z]+\]\:[^\n]+\n?)+)', content)
                    if a != ''
                ]
            lines = deque(text.replace('\n', ' ').split('a'))

            pages = []
            text_pattern = patterns[work_name][2:]
            counter = patterns[work_name][0][1]
            side = patterns[work_name][0][2]

            # beginning pages
            for num in text_pattern[0]:
                pages.append(create_page(lines, num, counter, side))
                counter, side = increment_counter(counter, side)

            # body of the text
            while len(lines) > 0:
                if len(lines) >= text_pattern[1]:
                    pages.append(
                        create_page(lines, text_pattern[1], counter, side))
                    counter, side = increment_counter(counter, side)
                elif text_pattern[2] == len(lines):
                    pages.append(create_page(lines, len(lines), counter, side))
                    counter, side = increment_counter(counter, side)
                else:
                    print(
                        'There is a line number issue: only {} lines were left for the last page.'
                        .format(len(lines)))
                    pages.append(create_page(lines, len(lines), counter, side))
                    counter, side = increment_counter(counter, side)

            output = '\n{}\n'.format('-' * 100).join(pages) + '\n\n' + notes

            write_file(
                '{}/{}_page_reinserted.txt'.format(out_path1, work_name),
                output)

            # also write a copy to 3-2-compared (the "only if not yet there" check is disabled below)
            existing = [
                g.replace('_compared.txt', '') for g in os.listdir(out_path2)
                if g.endswith('.txt')
            ]
            #if work_name not in existing:
            write_file('{}/{}_compared.txt'.format(out_path2, work_name),
                       output)
            text_path = '{}/extra_copies/{}'.format(out_path2, work_name)
            if not os.path.exists(text_path):
                os.makedirs(text_path)
Example #14
# write individual files for each text, presenting the mistakes in total frequency order
len_ordered_mistakes = sorted(total, key=lambda x: len(total[x]), reverse=True)
for f in os.listdir(in_path):
    if f.endswith('txt'):
        current_text = f.replace('.txt', '')
        # filter mistakes of the current file
        output = []
        for mis in len_ordered_mistakes:
            tmp = []
            for occ in total[mis]:
                if current_text == occ[0]:
                    tmp.append(''.join(occ[1][0]) + mis + ''.join(occ[1][1]))
            if tmp:
                output.append('\n'.join([mis, '\n'.join(tmp)]))
        write_file('segmented/{}_segmented.txt'.format(current_text),
                   '\n\n'.join(output))

# write the combined file with all mistakes and their frequencies
total_formatted = []
for mis in len_ordered_mistakes:
    tmp = []
    for occ in total[mis]:
        tmp.append(''.join(occ[1][0]) + mis + ''.join(occ[1][1]))
    if tmp:
        total_formatted.append('\n'.join(
            ['   {} {}'.format(mis, len(total[mis])), '\n'.join(tmp)]))

total_len = ', '.join([m + str(len(total[m]))
                       for m in len_ordered_mistakes]).replace('#', '')
write_file('total_mistakes.txt',
           total_len + '\n' + '\n\n'.join(total_formatted))
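The nested indexing above (occ[0], occ[1][0], occ[1][1]) implies this shape for total; the entry below is hypothetical:

# Hypothetical total entry (illustration only):
total = {
    'མཁེན་': [  # a mistake, mapped to its occurrences
        # (source text, (left-context syllables, right-context syllables))
        ('text_A', (['ཐུགས་', 'རྗེ་'], ['པོ་', 'ལ་'])),
    ],
}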
Example #15
import sys, os

grandParentDir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
sys.path.append(grandParentDir)

from PyTib.common import open_file, write_file

in_path = 'output/antconc_format'
out_path = '../3-b-reviewed_texts'
for f in os.listdir(in_path):
    name = f.replace('_antconc_format.txt', '')
    print(name)

    content = open_file('{}/{}'.format(in_path, f)).strip()
    lines = content.split('\n')

    output = [
        'Left,p,c,d,n,right,new,min_mod,particles,spelling_mistake,sskrt,verb,?,empty,double,profile,ngram_freq,file name,note_num'
    ]
    for line in lines:
        columns = line.split('\t')
        columns[6] = 'K'
        output.append(','.join(columns))
    write_file('{}/{}_DUCKed.csv'.format(out_path, name), '\n'.join(output))
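One caveat on the ','.join() above: a field that itself contains a comma or a quote would corrupt the CSV. If that can happen, the standard csv module handles quoting; a sketch reusing the variables from this script:

import csv

with open('{}/{}_DUCKed.csv'.format(out_path, name), 'w', newline='',
          encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(output[0].split(','))  # header row
    for line in lines:
        columns = line.split('\t')
        columns[6] = 'K'  # same column override as above
        writer.writerow(columns)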
Example #16
def process(in_path, template_path, total_stats):
    global collection_eds, file, debug
    raw_template = open_file(template_path)
    verbs = jp.decode(open_file('./resources/monlam_verbs.json'))
    all_ngrams = open_ngrams()
    files = find_file_path(in_path, '../1-a-reinsert_notes/output/conc_yaml')
    # print(files)
    for filename in files:
        # if 'N5000' not in filename:
        #     continue
        f = filename.split('/')[-1]
        print(f)
        if debug and f != file:
            continue
        work_name = f.replace('_conc.txt', '').replace('.txt', '')

        raw = open_file(filename)
        # set collection_eds for the current file
        collection_eds = list(set(re.findall(r' ([^ ]+): ', raw)))
        if len(collection_eds) > 4:
            print(collection_eds)
        data = prepare_data(raw)
        profiles, profile_cats = find_profiles(data)

        # prepare
        prepared = find_all_parts(data)

        # categorise
        categorised_notes = jp.decode(raw_template)

        # find ngram frequencies
        frequencies = ngram_frequency(prepared, all_ngrams)

        if debug:
            if file == f and note_num != 0:
                for note in prepared:
                    if note[0] == note_num:
                        categorise(note, categorised_notes, verbs)
            elif file == f:
                for note in prepared:
                    categorise(note, categorised_notes, verbs)
        else:
            for note in prepared:
                categorise(note, categorised_notes, verbs)

        # finally write the json file
        stats = {}
        total = 0
        for key1, item1 in sorted(categorised_notes.items()):
            if type(item1) == list:
                if len(item1) != 0:
                    stats[key1] = len(item1)
                    total += len(item1)
            else:
                stats[key1] = {}
                for key2, item2 in sorted(item1.items()):
                    if type(item2) == list:
                        if len(item2) != 0:
                            stats[key1][key2] = len(item2)
                            total += len(item2)
                    else:
                        stats[key1][key2] = {}
                        for key3, item3 in sorted(item2.items()):
                            if type(item3) == list:
                                if len(item3) != 0:
                                    stats[key1][key2][key3] = len(item3)
                                    total += len(item3)
                            else:
                                stats[key1][key2][key3] = {}
                                for key4, item4 in sorted(item3.items()):
                                    if type(item4) == list:
                                        if len(item4) != 0:
                                            stats[key1][key2][key3][
                                                key4] = len(item4)
                                            total += len(item4)
        stats['Notes’ total'] = total
        categorised = total
        dunno = stats.get('dunno', {})
        for key in ('long_diff', 'short_diff', 'no_diff'):
            if key in dunno:
                categorised -= dunno[key]
        if total == 0:
            percentage = 0
            print('the notes were not processed!')
        else:
            percentage = categorised * 100 / total
        stats['Categorised'] = '{} notes ({:02.2f}%)'.format(
            categorised, percentage)
        stats['Profiles'] = profile_cats
        total_stats.append('{}\n{}'.format(work_name, jp.encode(stats)))

        encoded = jp.encode(categorised_notes)
        if encoded != raw_template:
            categorised_notes['Stats'] = stats
            categorised_notes['profile'] = profiles
            categorised_notes['ngram_freq'] = frequencies
            write_file('output/{}_cats.json'.format(work_name),
                       jp.encode(categorised_notes))
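The four nested loops above apply the same rule at every level of the categorised-notes tree. A recursive equivalent is sketched below; the helper name is hypothetical:

def count_notes(node, stats):
    # mirror the nested dicts, keeping the lengths of non-empty lists;
    # returns the number of notes found under this node
    total = 0
    for key, item in sorted(node.items()):
        if isinstance(item, list):
            if item:
                stats[key] = len(item)
                total += len(item)
        else:
            stats[key] = {}
            total += count_notes(item, stats[key])
    return total

With that helper, the whole block would reduce to stats = {} followed by total = count_notes(categorised_notes, stats).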
Example #17
            percentage = 0
            print('the notes were not processed!')
        else:
            percentage = categorised * 100 / total
        stats['Categorised'] = '{} notes ({:02.2f}%)'.format(
            categorised, percentage)
        stats['Profiles'] = profile_cats
        total_stats.append('{}\n{}'.format(work_name, jp.encode(stats)))

        encoded = jp.encode(categorised_notes)
        if encoded != raw_template:
            categorised_notes['Stats'] = stats
            categorised_notes['profile'] = profiles
            categorised_notes['ngram_freq'] = frequencies
            write_file('output/{}_cats.json'.format(work_name),
                       jp.encode(categorised_notes))


if __name__ == '__main__':
    debug = False
    # file = '563_རྒྱུད་ཀྱི་རྒྱལ་པོ་ཆེན་པོ་དཔལ་དགྱེས་པའི་རྡོ་རྗེའི་དཀའ་འགྲེལ་སྤྱན་འབྱེད།_conc.txt'
    file = ''
    note_num = 0

    in_path = '../1-b-manually_corrected_conc/notes_formatted'
    template = 'resources/template.json'
    total_stats = []
    process(in_path, template, total_stats)

    write_file('total_stats.txt', '\n\n'.join(total_stats))
Example #18
def copy_final_version(final_path):
    for f in os.listdir('output'):
        if f+'_final.txt' in os.listdir('../4-a-final_formatting/output/3-3-final'):
            write_file('output/{}/{}'.format(f, f+'_final.txt'), open_file('{}/{}_final.txt'.format(final_path, f)))