Example #1
def insert_comma():
    print('inserting commas')
    rd = workfiles.read_lasttmp_or_output()
    cnt = workfiles.new_tmpfile()
    while True:
        line = rd.readline()
        if not line:
            break
        line = line.replace('\n', '')
        if len(line) < 5:
            workfiles.write_tmpfile(cnt, line + '\n', 'a')
            continue
        terms = line.split('\t')
        if len(terms) == 1:
            workfiles.write_tmpfile(cnt, line + '\n', 'a')
            continue
        words = terms[1].split(',')
        new_words = []
        for word in words:
            new_word = word.replace('    ', ' ')
            new_word = new_word.replace('   ', ' ')
            new_word = new_word.replace('  ', ' ')
            new_word = new_word.replace(' ', ', ')
            word = word.strip()
            if word.lower() == terms[0].lower():
                continue
            if word == '' or word == ',':
                continue
            new_words.append(new_word)
        new_line = ','.join(new_words)
        new_line = new_line.replace(', , ', ',')
        line = terms[0] + '\t' + new_line
        workfiles.write_tmpfile(cnt, line + '\n', 'a')
    rd.close()
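These filters all assume the same line layout: a headword, a tab, then a comma-separated translation field. Below is a simplified, self-contained sketch of the whitespace-to-comma normalization that insert_comma performs on that field; it is not a drop-in replacement and does not use the workfiles helpers.

def normalize_terms_field(line: str) -> str:
    """Collapse whitespace runs in the translation field and turn single
    spaces into ', ' separators, mirroring insert_comma above."""
    word, _, terms = line.rstrip('\n').partition('\t')
    if not terms:
        return line
    new_words = []
    for item in terms.split(','):
        collapsed = ' '.join(item.split())  # collapse any run of whitespace
        if not collapsed or collapsed.lower() == word.lower():
            continue  # drop empties and the headword itself
        new_words.append(collapsed.replace(' ', ', '))
    return word + '\t' + ','.join(new_words) + '\n'

# hypothetical sample line
print(normalize_terms_field('house\tcasa  lar,moradia,house\n'))
# returned line: 'house\tcasa, lar,moradia\n'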
def workfile():
    print('working in file')
    rd = workfiles.read_lasttmp_or_lists()
    cnt = workfiles.new_tmpfile()
    while True:
        line = rd.readline()
        if not line:
            break
        if len(line) < 5:
            workfiles.write_tmpfile(cnt, line, 'a')
            continue
        terms = line.split('\t')
        if len(terms) == 1:
            workfiles.write_tmpfile(cnt, line, 'a')
            continue
        word = terms[0].strip()

        google = str(terms[2])
        reverso = str(terms[3])
        yandex = str(terms[4])
        cambridge = str(terms[5])
        linguee = str(terms[6])

        yandex = accentuation_only_matching(yandex, google, reverso, cambridge,
                                            linguee)
        linguee = only_matching(linguee, [google, reverso, cambridge, yandex])

        yandex = remove_duplicated_terms(yandex)
        linguee = remove_duplicated_terms(linguee)

        line = f'{word}\t{yandex}\t{linguee}\n'

        workfiles.write_tmpfile(cnt, line, 'a')
    rd.close()
Example #3
def remove_same_translations():
    '''It happens when a translation is considered the principal one without a
       frequency, and another carries a frequency, both being the same translation.
    '''
    print('removing two equal translations')
    rd = workfiles.read_lasttmp_or_output()
    cnt = workfiles.new_tmpfile()
    while True:
        line = rd.readline()
        if not line:
            break
        if len(line) < 5:
            workfiles.write_tmpfile(cnt, line, 'a')
            continue
        terms = line.split('\t')
        if len(terms) == 1:
            workfiles.write_tmpfile(cnt, line, 'a')
            continue
        translations = terms[1].split(',')
        new_trs = []
        anterior_translation = ''
        for translation in translations:
            ini = translation.find(')')
            term_translation = translation[ini + 1:].strip().lower()
            if term_translation == anterior_translation:
                continue
            anterior_translation = term_translation
            new_trs.append(translation)
        new_line = ','.join(new_trs)
        if new_line[-1:] != '\n':
            new_line = new_line + '\n'
        line = terms[0] + '\t' + new_line
        workfiles.write_tmpfile(cnt, line, 'a')
    rd.close()
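The '(n)' prefix that these functions locate with translation.find(')') is a frequency marker, e.g. '(3)casa'. A self-contained sketch of the same adjacent-duplicate rule on hypothetical data, without the file plumbing:

def drop_adjacent_duplicates(translations):
    """Keep only the first of consecutive entries whose text (after the
    optional '(n)' frequency prefix) is identical, as in remove_same_translations."""
    kept, previous = [], ''
    for tr in translations:
        text = tr[tr.find(')') + 1:].strip().lower()  # strip '(n)' prefix if present
        if text == previous:
            continue
        previous = text
        kept.append(tr)
    return kept

# hypothetical data: the same translation once without and once with a frequency
print(drop_adjacent_duplicates(['casa', '(3)casa', '(1)lar']))
# -> ['casa', '(1)lar']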
Example #4
def remove_less_half_frequency():
    print('removing terms with less than half the frequency of the previous term')
    rd = workfiles.read_lasttmp_or_output()
    cnt = workfiles.new_tmpfile()
    while True:
        line = rd.readline()
        if not line:
            break
        if len(line) < 5:
            workfiles.write_tmpfile(cnt, line, 'a')
            continue
        terms = line.split('\t')
        if len(terms) == 1:
            workfiles.write_tmpfile(cnt, line, 'a')
            continue
        translations = terms[1].split(',')
        new_trs = []
        anterior_frequency = 2
        frequency = 0
        for translation in translations:
            ini = translation.find(')')
            txt_freq = translation[1:ini]
            if txt_freq != '':
                frequency = int(txt_freq)
            if frequency < (anterior_frequency / 2):
                continue
            anterior_frequency = frequency
            new_trs.append(translation)
        new_line = ','.join(new_trs)
        if new_line[-1:] != '\n':
            new_line = new_line + '\n'
        line = terms[0] + '\t' + new_line
        workfiles.write_tmpfile(cnt, line, 'a')
    rd.close()
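Restated on plain data, the rule above keeps a translation only while its frequency is at least half of the previously kept one; an entry with no '(n)' prefix reuses the last parsed frequency. A minimal sketch with hypothetical values:

def drop_below_half_frequency(translations):
    """Mirror remove_less_half_frequency on a plain list: skip entries whose
    frequency is less than half of the previously kept entry's frequency."""
    kept = []
    anterior, frequency = 2, 0  # same starting values as above
    for tr in translations:
        close = tr.find(')')
        digits = tr[1:close]  # digits between '(' and ')', if any
        if digits:
            frequency = int(digits)
        if frequency < anterior / 2:
            continue
        anterior = frequency
        kept.append(tr)
    return kept

# hypothetical data
print(drop_below_half_frequency(['(5)casa', '(2)lar', '(4)moradia']))
# -> ['(5)casa', '(4)moradia']  ('(2)lar' falls below half of 5)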
Example #5
def remove_same_word():
    print('removing terms equal to the word')
    rd = workfiles.read_lasttmp_or_output()
    cnt = workfiles.new_tmpfile()
    while True:
        line = rd.readline()
        if not line:
            break
        if len(line) < 5:
            workfiles.write_tmpfile(cnt, line, 'a')
            continue
        terms = line.split('\t')
        if len(terms) == 1:
            workfiles.write_tmpfile(cnt, line, 'a')
            continue
        word = terms[0].strip().lower()
        translations = terms[1].split(',')
        new_trs = []
        for translation in translations:
            ini = translation.find(')')
            if ini == -1 or ini + 1 >= len(translation):
                continue
            if translation[ini + 1:][0].isupper():
                continue
            if translation[ini + 1:].strip().lower() == word:
                continue
            new_trs.append(translation)
        new_line = ','.join(new_trs)
        if new_line[-1:] != '\n':
            new_line = new_line + '\n'
        line = terms[0] + '\t' + new_line
        workfiles.write_tmpfile(cnt, line, 'a')
    rd.close()
Example #6
def remove_same_word():
    print('removing translations equal to the word')
    rd = workfiles.read_lasttmp_or_output()
    cnt = workfiles.new_tmpfile()
    while True:
        line = rd.readline()
        if not line:
            break
        if len(line) < 5:
            workfiles.write_tmpfile(cnt, line, 'a')
            continue
        terms = line.split('\t')
        if len(terms) == 1:
            workfiles.write_tmpfile(cnt, line, 'a')
            continue
        word = terms[0].strip()
        translations = terms[2].split(',')
        new_trs = []
        for translation in translations:
            if translation.strip() == '':
                continue
            if translation.strip().lower() == word.lower():
                continue
            new_trs.append(translation.lower())
        translations_result = ','.join(new_trs)
        if translations_result[-1:] != '\n':
            translations_result = translations_result + '\n'
        new_line = terms[0] + '\t' + terms[1] + '\t' + translations_result
        workfiles.write_tmpfile(cnt, new_line, 'a')
    rd.close()
Example #7
def reorganizeTranslations():
    print("""remove a term when both male and female forms are present,
    move the main translation to the front of the translations,
    sort the translations by frequency,
    remove the articles "o", "a", "os", "as" from the beginning of terms
    """)
    rd = workfiles.read_lasttmp_or_output()
    cnt = workfiles.new_tmpfile()
    output = ''
    while True:
        line = rd.readline()
        if not line:
            break
        line = line[:-1]
        fields = line.split('\t')
        word = fields[0]
        if len(fields) == 1:
            terms = ''
        else:
            terms = fields[1].split(',')
        others = {}
        i = 0
        if len(terms) > 1:
            if terms[0][:-1] == terms[1] and terms[0][-1:] == 'a':
                i = 1
            if len(terms[0]) == len(
                    terms[1]
            ) and terms[1][-1:] == 'o' and terms[0][-1:] == 'a':
                i = 1
        while i < len(terms):
            if terms[i][3:4].isupper() and word[0:1].isupper():
                i += 1
                continue
            if terms[i][0:1] == '(':
                others[terms[i][3:].lower()] = terms[i][:3]
            else:
                if terms[i] not in others:
                    others[terms[i].lower()] = '*'
            i += 1
        others = dict(
            sorted(others.items(), key=lambda item: item[1], reverse=True))
        output += word + '\t'
        for term in others:
            if others[term] == '*':
                others[term] = ''
            output += others[term] + term + ','
        output = output.replace(')a ', ')').replace(')o ', ')').replace(
            ')os ', ')').replace(')as ', ')')
        output += '\n'
    rd.close()
    workfiles.write_tmpfile(cnt, output, 'a')
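The sort at the end of reorganizeTranslations relies on plain string comparison of the stored prefixes: the main, untagged translation keeps '*', which compares greater than any '(n)' string, so with reverse=True it always comes first and the tagged entries follow in descending frequency (this only holds while frequencies stay single-digit). A small illustration on hypothetical data:

# hypothetical mapping of translation text -> stored prefix, as built above
others = {'lar': '(1)', 'casa': '(3)', 'moradia': '*'}

ordered = dict(sorted(others.items(), key=lambda item: item[1], reverse=True))
print(list(ordered))
# -> ['moradia', 'casa', 'lar']  because '*' > '(3)' > '(1)' as strings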
def rem_spell_synonyms():
    print('removing fields spell and synonyms')
    rd = workfiles.read_lasttmp_or_lists()
    cnt = workfiles.new_tmpfile()
    while True:
        line = rd.readline()
        if not line:
            break
        line = line.replace('\n', '')
        fields = line.split('\t')
        newline = fields[0] + '\t' + fields[1] + '\t' + fields[
            3] + '\t' + fields[4] + '\n'
        workfiles.write_tmpfile(cnt, newline, 'a')
    rd.close()
Example #9
def organize_definitions_with_n():
    rd = workfiles.read_lasttmp_or_output()
    cnt = workfiles.new_tmpfile()
    while True:
        line = rd.readline()
        if not line:
            break
        line = line.replace('\n', '')
        fields = line.split('\t')
        fields[4] = fields[4].replace('. ,', '. ')
        newline = fields[0] + '\t' + fields[1] + '\t' + fields[
            2] + '\t' + fields[3] + '\t' + fields[4] + '\n'
        workfiles.write_tmpfile(cnt, newline, 'a')
    rd.close()
Example #10
def remove_wordlist_from_soundname():
    print('remove the wordlist prefix from the mp3 sound name')
    rd = workfiles.read_lasttmp_or_output()
    cnt = workfiles.new_tmpfile()
    while True:
        line = rd.readline()
        if not line:
            break
        line = line.replace('\n', '')
        fields = line.split('\t')
        fields[3] = fields[3].replace(workfiles.word_list + '-', '')
        newline = fields[0] + '\t' + fields[1] + '\t' + fields[
            2] + '\t' + fields[3] + '\t' + fields[4] + '\n'
        workfiles.write_tmpfile(cnt, newline, 'a')
    rd.close()
Example #11
def sound_mp3_directory():
    print('change the reference of sound in mp3 field')
    rd = workfiles.read_lasttmp_or_output()
    cnt = workfiles.new_tmpfile()
    while True:
        line = rd.readline()
        if not line:
            break
        line = line.replace('\n', '')
        fields = line.split('\t')
        fields[3] = workfiles.add_wordlist_dictionary_soundmp3(fields[3])
        newline = fields[0] + '\t' + fields[1] + '\t' + fields[
            2] + '\t' + fields[3] + '\t' + fields[4] + '\n'
        workfiles.write_tmpfile(cnt, newline, 'a')
    rd.close()
def add_tabs():
    print('adding tabs')
    rd = workfiles.read_lasttmp_or_lists()
    cnt = workfiles.new_tmpfile()
    while True:
        line = rd.readline()
        if not line:
            break
        if line.count('\t') == 1:
            line = line[:-1] + '\t\t\t\t\t\n'
        if line.count('\t') == 5:
            first_tab = line.find('\t')
            line = line[:first_tab] + '\t' + line[first_tab:]
        workfiles.write_tmpfile(cnt, line, 'a')
    rd.close()
def remove_wordlist_from_name():
    print('remove wordlist from name')
    rd = workfiles.read_lasttmp_or_output()
    cnt = workfiles.new_tmpfile()
    while True:
        line = rd.readline()
        if not line:
            break
        line = line[:-1]
        fields = line.split('\t')
        if len(fields) == 1:
            newline = fields[0] + '\t\n'
        else:
            fields[1] = fields[1].replace(workfiles.word_list + '-', '')
            newline = fields[0] + '\t' + fields[1] + '\n'
        workfiles.write_tmpfile(cnt, newline, 'a')
    rd.close()
Example #14
def clean_ipa():
    print('cleaning ipa, adding space between translations')
    rd = workfiles.read_lasttmp_or_output()
    cnt = workfiles.new_tmpfile()
    while True:
        line = rd.readline()
        if not line:
            break
        line = line[:-1]
        fields = line.split('\t')
        ipa = fields[2]
        ipa = ipa.replace('/<span', '<span')
        ipa = ipa.replace('</span>/', '</span>')
        fields[2] = ipa
        workfiles.write_tmpfile(cnt, '\t'.join(fields) + '\n', 'a')
    rd.close()
def rem_definitions_duplicate():
    rd = workfiles.read_lasttmp_or_lists()
    cnt = workfiles.new_tmpfile()
    while True:
        line = rd.readline()
        if not line:
            break
        # walk to the fourth tab; the definitions field sits between the 4th and 5th tabs
        fourth_tab = line.find('\t')
        fourth_tab = line.find('\t', fourth_tab + 1)
        fourth_tab = line.find('\t', fourth_tab + 1)
        fourth_tab = line.find('\t', fourth_tab + 1)
        fifth_tab = line.find('\t', fourth_tab + 1)
        definitions = line[fourth_tab + 1:fifth_tab]
        list_definitions = definitions.split(',')
        list_uniq = list(dict.fromkeys(list_definitions))
        str_definitions = ','.join(list_uniq)
        line = line[:fourth_tab + 1] + str_definitions + line[fifth_tab:]
        workfiles.write_tmpfile(cnt, line, 'a')
    rd.close()
Example #16
def workfile():
    rd = workfiles.read_lasttmp_or_lists()
    cnt = workfiles.new_tmpfile()
    while True:
        line = rd.readline()
        if not line:
            break
        if len(line) < 5:
            workfiles.write_tmpfile(cnt, line, 'a')
            continue
        terms = line.split('\t')
        if len(terms) == 1:
            workfiles.write_tmpfile(cnt, line, 'a')
            continue
        word = terms[0].strip()
        babla = terms[7].strip()
        babla = only_word_flexion(word, babla)
        line = f'{word}\t{babla}\n'
        workfiles.write_tmpfile(cnt, line, 'a')
    rd.close()
Example #17
def remove_spaces_between_words():
    print('removing spaces between words')
    rd = workfiles.read_lasttmp_or_output()
    cnt = workfiles.new_tmpfile()
    while True:
        line = rd.readline()
        if not line:
            break
        if len(line) < 5:
            workfiles.write_tmpfile(cnt, line, 'a')
            continue
        terms = line.split('\t')
        if len(terms) == 1:
            workfiles.write_tmpfile(cnt, line, 'a')
            continue
        words = terms[1].split(',')
        new_line = ','.join(words)
        new_line = new_line.replace(', ', ',')
        workfiles.write_tmpfile(cnt, terms[0] + '\t' + new_line, 'a')
    rd.close()
Example #18
def rem_spaces_between_translations():
    print('remove spaces between comma in translations')
    rd = workfiles.read_lasttmp_or_output()
    cnt = workfiles.new_tmpfile()
    while True:
        line = rd.readline()
        line = line[:-1]
        if not line:
            break
        line = line.replace('\n', '')
        fields = line.split('\t')
        fields[1] = fields[1].replace(', ', ',')
        if len(fields) >= 5:
            newline = fields[0] + '\t' + fields[1] + '\t' + fields[
                2] + '\t' + fields[3] + '\t' + fields[4] + '\n'
        else:
            newline = fields[0] + '\t' + fields[1] + '\t' + fields[
                2] + '\t' + fields[3] + '\t\n'
        workfiles.write_tmpfile(cnt, newline, 'a')
    rd.close()
Example #19
def organize_definitions_without_n():
    print('organizing definitions without numbers')
    rd = workfiles.read_lasttmp_or_output()
    cnt = workfiles.new_tmpfile()
    while True:
        line = rd.readline()
        if not line:
            break
        line = line.replace('\n', '')
        fields = line.split('\t')
        if fields[4].strip() != '':
            fields[4] = '1. ' + fields[4]
        num = 1
        while fields[4].find('. ,') > -1:
            num += 1
            fields[4] = fields[4].replace('. ,', '. ' + str(num) + '. ', 1)
        newline = fields[0] + '\t' + fields[1] + '\t' + fields[
            2] + '\t' + fields[3] + '\t' + fields[4] + '\n'
        workfiles.write_tmpfile(cnt, newline, 'a')
    rd.close()
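The loop above turns a comma-separated definition field into a numbered list by rewriting each '. ,' boundary in place. A self-contained sketch of that step on a hypothetical field:

def number_definitions(field: str) -> str:
    """Prefix '1. ' and rewrite each '. ,' boundary with the next number,
    mirroring organize_definitions_without_n for a single field."""
    if field.strip() == '':
        return field
    field = '1. ' + field
    num = 1
    while field.find('. ,') > -1:
        num += 1
        field = field.replace('. ,', '. ' + str(num) + '. ', 1)
    return field

# hypothetical definition field
print(number_definitions('a building. ,a family. ,a dynasty.'))
# -> '1. a building. 2. a family. 3. a dynasty.'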
Example #20
def remove_frequency():
    '''remove frequency-1 translations when a frequency-2 or frequency-3 translation
       exists, and frequency-2 translations when a frequency-3 translation exists'''
    print(
        'removing translations with frequency 1 or 2 when higher-frequency translations exist'
    )
    rd = workfiles.read_lasttmp_or_output()
    cnt = workfiles.new_tmpfile()
    while True:
        exclude_all_frequence1 = False
        exclude_all_frequence2 = False
        line = rd.readline()
        if not line:
            break
        if len(line) < 5:
            workfiles.write_tmpfile(cnt, line, 'a')
            continue
        terms = line.split('\t')
        if len(terms) == 1:
            workfiles.write_tmpfile(cnt, line, 'a')
            continue
        if terms[1].find('(2)') >= 0 or terms[1].find('(3)') >= 0:
            exclude_all_frequence1 = True
        if terms[1].find('(3)') >= 0:
            exclude_all_frequence2 = True
        translations = terms[1].split(',')
        new_trs = []
        for translation in translations:
            ini = translation.find(')')
            if translation[1:ini] == '1' and exclude_all_frequence1:
                continue
            if translation[1:ini] == '2' and exclude_all_frequence2:
                continue
            new_trs.append(translation)
        new_line = ','.join(new_trs)
        if new_line[-1:] != '\n':
            new_line = new_line + '\n'
        line = terms[0] + '\t' + new_line
        workfiles.write_tmpfile(cnt, line, 'a')
    rd.close()
Example #21
def only_four_translations():
    print('keeping only the first 4 translations and removing the rest')
    rd = workfiles.read_lasttmp_or_output()
    cnt = workfiles.new_tmpfile()
    while True:
        line = rd.readline()
        if not line:
            break
        if len(line) < 5:
            workfiles.write_tmpfile(cnt, line, 'a')
            continue
        terms = line.split('\t')
        if len(terms) == 1:
            workfiles.write_tmpfile(cnt, line, 'a')
            continue
        translations = terms[1].split(',')
        new_line = ','.join(translations[:4])
        if new_line[-1:] != '\n':
            new_line = new_line + '\n'
        line = terms[0] + '\t' + new_line
        workfiles.write_tmpfile(cnt, line, 'a')
    rd.close()
Example #22
def clean_definitions():
    print(
        'removing repeated numbers (e.g. "1. 1.") and adding a <BR> line break before each following item'
    )
    rd = workfiles.read_lasttmp_or_output()
    cnt = workfiles.new_tmpfile()
    while True:
        line = rd.readline()
        if not line:
            break
        line = line.replace('1. 1. ', '1. ')
        line = line.replace('2. 2. ', '<BR> 2. ')
        line = line.replace('3. 3. ', '<BR> 3. ')
        line = line.replace('4. 4. ', '<BR> 4. ')
        line = line.replace('5. 5. ', '<BR> 5. ')
        line = line.replace('6. 6. ', '<BR> 6. ')
        line = line.replace('7. 7. ', '<BR> 7. ')
        line = line.replace('8. 8. ', '<BR> 8. ')
        line = line.replace('9. 9. ', '<BR> 9. ')
        line = line.replace('..', '.')
        workfiles.write_tmpfile(cnt, line, 'a')
    rd.close()
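The chain of replace calls above handles the digits 1 through 9 one case at a time; the same cleanup can also be expressed with a single regular expression. A hedged sketch of that alternative (not what the example itself uses):

import re

def clean_definition_numbers(line: str) -> str:
    """Collapse duplicated item numbers such as '2. 2. ' and put a <BR> before
    every item after the first, equivalent in spirit to clean_definitions."""
    line = line.replace('1. 1. ', '1. ')
    line = re.sub(r'([2-9])\. \1\. ', r'<BR> \1. ', line)  # '2. 2. ' -> '<BR> 2. '
    return line.replace('..', '.')

print(clean_definition_numbers('1. 1. a home 2. 2. a family..'))
# -> '1. a home <BR> 2. a family.'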
def workfile():
    rd = workfiles.read_lasttmp_or_lists()
    cnt = workfiles.new_tmpfile()
    while True:
        line = rd.readline()
        if not line:
            break
        if len(line) < 5:
            workfiles.write_tmpfile(cnt, line, 'a')
            continue
        terms = line.split('\t')
        if len(terms) == 1:
            workfiles.write_tmpfile(cnt, line, 'a')
            continue
        word = terms[0].strip()
        free = terms[15].strip()
        free = clean_infos(free)
        free = only_word_flexion(word, free)
        if len(free) > 99:
            free = ''
        line = f'{word}\t{free}\n'
        workfiles.write_tmpfile(cnt, line, 'a')
    rd.close()
Example #24
def remove_replication():
    """Sometimes the scraper writes the same translations on consecutive lines."""
    print('removing replications in translations')
    rd = workfiles.read_lasttmp_or_output()
    cnt = workfiles.new_tmpfile()
    a_translations = ''
    while True:
        line = rd.readline()
        if not line:
            break
        if len(line) < 5:
            workfiles.write_tmpfile(cnt, line, 'a')
            continue
        terms = line.split('\t')
        if len(terms) == 1:
            workfiles.write_tmpfile(cnt, line, 'a')
            continue
        word = terms[0].strip()
        translations = terms[1]
        if translations == a_translations:
            workfiles.write_tmpfile(cnt, word + '\t\n', 'a')
        else:
            workfiles.write_tmpfile(cnt, line, 'a')
        a_translations = translations
    rd.close()
Example #25
def remove_same_translations_without_frequency():
    print('removing a translation without frequency when the same translation also appears with a frequency')
    rd = workfiles.read_lasttmp_or_output()
    cnt = workfiles.new_tmpfile()
    while True:
        line = rd.readline()
        if not line:
            break
        if len(line) < 5:
            workfiles.write_tmpfile(cnt, line, 'a')
            continue
        terms = line.split('\t')
        if len(terms) == 1:
            workfiles.write_tmpfile(cnt, line, 'a')
            continue
        translations = terms[1].split(',')
        new_trs = []
        if is_first_translation_without_frequency(translations):
            has_with_frequency = False
            for translation in translations[1:]:
                ini = translation.find(')')
                term_translation = translation[ini + 1:].strip().lower()
                if term_translation == translations[0].strip().lower():
                    has_with_frequency = True
            if has_with_frequency:
                new_trs = translations[1:]
            else:
                new_trs = translations
        else:
            new_trs = translations
        new_line = ','.join(new_trs)
        if new_line[-1:] != '\n':
            new_line = new_line + '\n'
        line = terms[0] + '\t' + new_line
        workfiles.write_tmpfile(cnt, line, 'a')
    rd.close()
Example #26
def clean_debbris():
    print('cleaning leftover gender and part-of-speech tags and stray separators')
    content = workfiles.get_content_tmp_output()
    cnt = workfiles.new_tmpfile()
    # note: many of the pattern strings below appear to contain an invisible
    # zero-width space character copied from the scraped pages
    four_commas = ',​,, ,'
    four_commas_spaces = ', ​,, ,'
    f_commas = ', f ​,, ,'
    endline = ',​,\n'
    dot_comma = ',​,·'
    two_comma_dot = ', ​,·'
    m_comma = ', m ​,·'
    tab_comma = '\t, ,'
    f_dot_comma = ', f ​,·'
    m_commas = ', m ​,, ,'
    three_comma = ' ​,, ,'
    s_dot_comma = ', s ​,·'
    comma_f_dot = ', f ​,·'
    comma_m_comma = ', m ​,'
    comma_prep_comma = ', prep ​,'
    comma_pl_dot = ', pl ​,·'
    comma_f_comma = ', f ​,'
    comma_num_comma = ', num ​,'
    dot_dot_end = ', ​,\n'
    comma_np_f_comma = ', np f ​,'
    comma_prep_comma_dot = ', prep ​,·'
    comma_s_comma = ', s,'
    comma_s = ', s ​,'
    comma_interj_comma = ', interj ​,'
    comma_conj_comma = ', conj ​,'
    num_comma = ', num,'
    comma_new_ln = ',\n'
    comma_sp_prep_comma = ', prep,'
    prop = ', prop.'
    sp_pl = ' pl​,'
    comma_n_comma = ',n,'
    sp_pl_sp = ' pl ​'
    comma_pron_comma = ', pron,'
    comma_conj = ', conj,'
    part_pas = '******'
    np_m_comma = ', np m,'
    content = substitutes(four_commas, ',', content)
    content = substitutes(f_commas, ',', content)
    content = substitutes(endline, '\n', content)
    content = substitutes(dot_comma, ',', content)
    content = substitutes(m_comma, ',', content)
    content = substitutes(tab_comma, '\t', content)
    content = substitutes(f_dot_comma, ',', content)
    content = substitutes(m_commas, ',', content)
    content = substitutes(s_dot_comma, ',', content)
    content = substitutes(four_commas_spaces, ',', content)
    content = substitutes(comma_f_dot, ',', content)
    content = substitutes(three_comma, ',', content)
    content = substitutes(two_comma_dot, ',', content)
    content = substitutes(comma_m_comma, ',', content)
    content = substitutes(comma_prep_comma, ',', content)
    content = substitutes(comma_pl_dot, ',', content)
    content = substitutes(comma_f_comma, ',', content)
    content = substitutes(comma_num_comma, ',', content)
    content = substitutes(dot_dot_end, '\n', content)
    content = substitutes(comma_np_f_comma, ',', content)
    content = substitutes(comma_prep_comma_dot, ',', content)
    content = substitutes(comma_s_comma, ',', content)
    content = substitutes(comma_s, ',', content)
    content = substitutes(comma_interj_comma, ',', content)
    content = substitutes(comma_conj_comma, ',', content)
    content = substitutes(num_comma, ',', content)
    content = substitutes(comma_new_ln, '\n', content)
    content = substitutes(comma_sp_prep_comma, ',', content)
    content = substitutes(prop, ',', content)
    content = substitutes(sp_pl, ',', content)
    content = substitutes(comma_n_comma, ',', content)
    content = substitutes(sp_pl_sp, ' ', content)
    content = substitutes(comma_pron_comma, ',', content)
    content = substitutes(comma_conj, ',', content)
    content = substitutes(part_pas, ',', content)
    content = substitutes(np_m_comma, ',', content)
    content = substitutes(',,', ',', content)
    content = substitutes(', ,', ',', content)
    content = substitutes('·', '', content)
    workfiles.write_tmpfile(cnt, content, 'w')
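Because the order of these substitutions matters (later, broader patterns clean up what earlier ones leave behind), the same pass can be written as an ordered table of (pattern, replacement) pairs, assuming substitutes(old, new, content) simply wraps str.replace. A sketch of that restructuring:

def apply_substitutions(content: str, pairs) -> str:
    """Apply an ordered list of (old, new) replacements to the whole content,
    preserving the order-dependence of the chain above."""
    for old, new in pairs:
        content = content.replace(old, new)
    return content

# hypothetical excerpt of the table; a full version would keep every pattern above, in order
CLEANUP_PAIRS = [
    (', m ​,', ','),
    (', f ​,', ','),
    (',,', ','),
    (', ,', ','),
    ('·', ''),
]

print(apply_substitutions('casa, ,lar·', CLEANUP_PAIRS))
# -> 'casa,lar'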
Example #27
def remove_volume_up():
    print('removing :volume_up and volume_up')
    rd = workfiles.read_lasttmp_or_output()
    cnt = workfiles.new_tmpfile()
    while True:
        line = rd.readline()
        if not line :
            break
        if len(line) < 5 :
            workfiles.write_tmpfile(cnt,line,'a')
            continue
        terms = line.split('\t')
        if len(terms) == 1 :
            workfiles.write_tmpfile(cnt,line,'a')
            continue
        word = terms[0].strip()
        translations = terms[1].replace('\n','')
        translations = translations.replace(':volume_up,','')
        translations = translations.replace('volume_up,','')
        translations = translations.replace(f'{word}:{word}.','')
        translations = translations.replace(f'<u>{word}</u>:{word}.','')
        translations = translations.replace(f'<u><b>{word}</b></u>:{word}.','')
        translations = translations.replace(f'<b>{word}</b>:{word}.','')

        translations = translations.replace(f'{word}:{word},','')
        translations = translations.replace(f'<u>{word}</u>:{word},','')
        translations = translations.replace(f'<u><b>{word}</b></u>:{word},','')
        translations = translations.replace(f'<b>{word}</b>:{word},','')

        translations = translations.replace(f'{word}:{word} ','')
        translations = translations.replace(f'<u>{word}</u>:{word} ','')
        translations = translations.replace(f'<u><b>{word}</b></u>:{word} ','')
        translations = translations.replace(f'<b>{word}</b>:{word} ','')

        if translations.endswith(word):
            translations = translations.replace(f'{word}:{word}','')
            translations = translations.replace(f'<u>{word}</u>:{word}','')
            translations = translations.replace(f'<u><b>{word}</b></u>:{word}','')
            translations = translations.replace(f'<b>{word}</b>:{word}','')

        capitalWord = word.capitalize()
        translations = translations.replace(f'{capitalWord}:{capitalWord}.','')
        translations = translations.replace(f'<u>{capitalWord}</u>:{capitalWord}.','')
        translations = translations.replace(f'<u><b>{capitalWord}</b></u>:{capitalWord}.','')
        translations = translations.replace(f'<b>{capitalWord}</b>:{capitalWord}.','')

        translations = translations.replace(f'{capitalWord}:{capitalWord},','')
        translations = translations.replace(f'<u>{capitalWord}</u>:{capitalWord},','')
        translations = translations.replace(f'<u><b>{capitalWord}</b></u>:{capitalWord},','')
        translations = translations.replace(f'<b>{capitalWord}</b>:{capitalWord},','')

        translations = translations.replace(f'{capitalWord}:{capitalWord} ','')
        translations = translations.replace(f'<u>{capitalWord}</u>:{capitalWord} ','')
        translations = translations.replace(f'<u><b>{capitalWord}</b></u>:{capitalWord} ','')
        translations = translations.replace(f'<b>{capitalWord}</b>:{capitalWord} ','')

        if translations.endswith(capitalWord):
            translations = translations.replace(f'{capitalWord}:{capitalWord}','')
            translations = translations.replace(f'<u>{capitalWord}</u>:{capitalWord}','')
            translations = translations.replace(f'<u><b>{capitalWord}</b></u>:{capitalWord}','')
            translations = translations.replace(f'<b>{capitalWord}</b>:{capitalWord}','')

        lowerWord = word.lower()
        translations = translations.replace(f'{lowerWord}:{lowerWord}.','')
        translations = translations.replace(f'<u>{lowerWord}</u>:{lowerWord}.','')
        translations = translations.replace(f'<u><b>{lowerWord}</b></u>:{lowerWord}.','')
        translations = translations.replace(f'<b>{lowerWord}</b>:{lowerWord}.','')

        translations = translations.replace(f'{lowerWord}:{lowerWord},','')
        translations = translations.replace(f'<u>{lowerWord}</u>:{lowerWord},','')
        translations = translations.replace(f'<u><b>{lowerWord}</b></u>:{lowerWord},','')
        translations = translations.replace(f'<b>{lowerWord}</b>:{lowerWord},','')

        translations = translations.replace(f'{lowerWord}:{lowerWord} ','')
        translations = translations.replace(f'<u>{lowerWord}</u>:{lowerWord} ','')
        translations = translations.replace(f'<u><b>{lowerWord}</b></u>:{lowerWord} ','')
        translations = translations.replace(f'<b>{lowerWord}</b>:{lowerWord} ','')

        if translations.endswith(lowerWord):
            translations = translations.replace(f'{lowerWord}:{lowerWord}','')
            translations = translations.replace(f'<u>{lowerWord}</u>:{lowerWord}','')
            translations = translations.replace(f'<u><b>{lowerWord}</b></u>:{lowerWord}','')
            translations = translations.replace(f'<b>{lowerWord}</b>:{lowerWord}','')

        translations = translations.replace(',,','').replace('..','').replace(', ,','').replace('. .','')
        translations = translations.replace(':,','').replace(':.','').replace(': ,','').replace(': .','')
        translations = translations.replace(';,','').replace(';.','').replace('; ,','').replace('; .','')
        translations = translations.replace('.,','').replace('. ,','').replace(', .','')
        translations = translations.replace(',:','').replace('.:','').replace(', :','').replace('. :','')
        translations = translations.replace(',;','').replace('.;','').replace(', ;','').replace('. ;','')
        if translations.startswith('. '):
            translations = translations[2:]
        new_line = terms[0]+'\t'+translations+'\n'
        workfiles.write_tmpfile(cnt,new_line,'a')
    rd.close()
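The body above spells out the same four markup wrappings for every case and separator variant; the combinations can also be generated, which keeps the pattern list in one place. A sketch of that idea, assuming the same wrappers and separators as the code above (the exact order of the endswith check differs slightly):

def strip_self_references(translations: str, word: str) -> str:
    """Remove 'word:word' echoes in any of the markup wrappings handled by
    remove_volume_up, for the word as given, capitalized and lowercased."""
    wrappers = ['{w}', '<u>{w}</u>', '<u><b>{w}</b></u>', '<b>{w}</b>']
    for variant in (word, word.capitalize(), word.lower()):
        for wrap in wrappers:
            left = wrap.format(w=variant)
            for sep in ('.', ',', ' '):
                translations = translations.replace(f'{left}:{variant}{sep}', '')
            if translations.endswith(variant):
                translations = translations.replace(f'{left}:{variant}', '')
    return translations

# hypothetical input
print(strip_self_references('<b>casa</b>:casa, lar', 'casa'))
# -> ' lar'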
def workfile():
    rd = workfiles.read_lasttmp_or_lists()
    cnt = workfiles.new_tmpfile()
    while True:
        line = rd.readline()
        if not line :
            break
        if len(line) < 5 :
            workfiles.write_tmpfile(cnt,line,'a')
            continue
        terms = line.split('\t')
        if len(terms) == 1 :
            workfiles.write_tmpfile(cnt,line,'a')
            continue
        word = terms[0].strip()

        babla = terms[1].strip()
        cambridge_def = terms[2]
        free_def = terms[3]
        mac_def = terms[4]
        google = terms[5]
        reverso = terms[6]
        yandex = terms[7]
        linguee = terms[8]
        cambridge_pt = terms[9]
        free_pt = terms[10].strip()


        babla = only_word_flexion(word,babla)

        cambridge_def = organize_definitions_with_br(cambridge_def)


        free_def = clean_lengthy_definitions(free_def)

        mac_def = clean_lengthy_definitions(mac_def)



        google = reorganizeTranslations(word,google)
        google = remove_lower_frequency(google)
        google = trim_all(google)
        google = remove_same_word(word,google)
        google = remove_same_translations_without_frequency(google)
        google = only_four_translations(google)
        google = remove_duplicated_terms_freq(google)


        reverso = trim_all(reverso)
        yandex = trim_all(yandex)
        linguee = trim_all(linguee)
        cambridge_pt = trim_all(cambridge_pt)
        
        cambridge_pt = remove_dirt(cambridge_pt)

        reverso = reverso.replace('"','')
        reverso = remove_less_half_frequency(reverso)
        reverso = remove_same_word(word,reverso)
        reverso = only_four_translations(reverso)
        reverso = remove_duplicated_terms_freq(reverso)

        yandex = accentuation_only_matching(yandex,google,reverso,cambridge_pt,linguee)
        linguee = only_matching(linguee,google,reverso,cambridge_pt,yandex)

        yandex = remove_duplicated_terms(yandex)
        linguee = remove_duplicated_terms(linguee)

        free_pt = clean_infos(free_pt)
        free_pt = only_word_flexion_free(word,free_pt)
        free_pt = exclude_lengthy_translation(free_pt)

        line = f'{word}\t{babla}\t{cambridge_def}\t{free_def}\t{mac_def}\t{google}\t{reverso}\t{yandex}\t{linguee}\t{cambridge_pt}\t{free_pt}\t\t\t\t\n'
        workfiles.write_tmpfile(cnt,line,'a')
    rd.close()