Python read_lasttmp_or_output Examples, workfiles.read_lasttmp_or_output Python Examples

Example #1

0

Show file

def remove_same_translations():
    """Drop a translation that repeats the one immediately before it.

    This happens when a translation is considered principal without a
    frequency and the next one carries a frequency, both being the same
    translation.  Entries are compared on the text that follows the first
    ')' (stripped and lowercased).  Lines shorter than 5 characters or
    without a tab are copied through unchanged.
    """
    print('removing two equal translations')
    source = workfiles.read_lasttmp_or_output()
    target = workfiles.new_tmpfile()
    raw = source.readline()
    while raw:
        columns = raw.split('\t')
        if len(raw) < 5 or len(columns) == 1:
            result = raw
        else:
            kept = []
            previous = ''
            for entry in columns[1].split(','):
                # find() yields -1 when there is no ')', so the slice then
                # starts at 0 and the whole entry is compared.
                bare = entry[entry.find(')') + 1:].strip().lower()
                if bare != previous:
                    previous = bare
                    kept.append(entry)
            joined = ','.join(kept)
            # Dropping the final entry also drops the line terminator.
            if not joined.endswith('\n'):
                joined += '\n'
            result = columns[0] + '\t' + joined
        workfiles.write_tmpfile(target, result, 'a')
        raw = source.readline()
    source.close()

Example #2

0

Show file

def insert_comma():
    """Turn the spaces inside each translation entry into ', ' separators.

    Entries that are empty or equal to the headword (case-insensitive) are
    dropped.  Lines shorter than 5 characters (newline excluded) or without
    a tab are copied through unchanged.
    """
    print('inserting commas')
    source = workfiles.read_lasttmp_or_output()
    target = workfiles.new_tmpfile()
    raw = source.readline()
    while raw:
        text = raw.replace('\n', '')
        columns = text.split('\t')
        if len(text) >= 5 and len(columns) > 1:
            headword = columns[0]
            pieces = []
            for chunk in columns[1].split(','):
                trimmed = chunk.strip()
                if trimmed in ('', ',') or trimmed.lower() == headword.lower():
                    continue
                # Collapse runs of 4, 3 and 2 spaces (in that order) on the
                # untrimmed chunk before converting spaces to commas.
                spaced = chunk
                for run in ('    ', '   ', '  '):
                    spaced = spaced.replace(run, ' ')
                pieces.append(spaced.replace(' ', ', '))
            text = headword + '\t' + ','.join(pieces).replace(', , ', ',')
        workfiles.write_tmpfile(target, text + '\n', 'a')
        raw = source.readline()
    source.close()

Example #3

0

Show file

def remove_same_word():
    """Remove translations that equal the headword or start with a capital.

    Each line is expected to be ``word<TAB>translations`` where the
    translations are comma separated.  A translation is dropped when:

    * it contains no ')',
    * the character right after the first ')' is uppercase, or
    * the text after the first ')' equals the headword (case-insensitive).

    Lines shorter than 5 characters or without a tab are copied through
    unchanged.  The reader is closed even if processing fails.
    """
    print('removing term equal word')
    rd = workfiles.read_lasttmp_or_output()
    try:
        cnt = workfiles.new_tmpfile()
        while True:
            line = rd.readline()
            if not line:
                break
            terms = line.split('\t')
            if len(line) < 5 or len(terms) == 1:
                workfiles.write_tmpfile(cnt, line, 'a')
                continue
            word = terms[0].strip().lower()
            new_trs = []
            for translation in terms[1].split(','):
                ini = translation.find(')')
                if ini == -1:
                    continue
                text = translation[ini + 1:]
                # Slice instead of indexing: an entry that ends right after
                # ')' has no following character and must not raise.
                if text[:1].isupper():
                    continue
                if text.strip().lower() == word:
                    continue
                new_trs.append(translation)
            new_line = ','.join(new_trs)
            # Dropping the final entry also drops the line terminator.
            if new_line[-1:] != '\n':
                new_line = new_line + '\n'
            workfiles.write_tmpfile(cnt, terms[0] + '\t' + new_line, 'a')
    finally:
        rd.close()

Example #4

0

Show file

def remove_same_word():
    """Remove translations (third column) that equal the headword.

    Each line is expected to be ``word<TAB>second<TAB>translations``.
    Blank translations and those equal to the headword (case-insensitive)
    are dropped; the remaining ones are lowercased.  The second column is
    written back untouched.

    Lines shorter than 5 characters or with fewer than three tab-separated
    fields are copied through unchanged, since they have no translations
    column to filter.  The reader is closed even if processing fails.
    """
    print('removing translation equal word')
    rd = workfiles.read_lasttmp_or_output()
    try:
        cnt = workfiles.new_tmpfile()
        while True:
            line = rd.readline()
            if not line:
                break
            terms = line.split('\t')
            # Fewer than 3 fields means terms[2] does not exist.
            if len(line) < 5 or len(terms) < 3:
                workfiles.write_tmpfile(cnt, line, 'a')
                continue
            word = terms[0].strip().lower()
            new_trs = []
            for translation in terms[2].split(','):
                stripped = translation.strip()
                if stripped == '' or stripped.lower() == word:
                    continue
                new_trs.append(translation.lower())
            translations_result = ','.join(new_trs)
            # Dropping the final entry also drops the line terminator.
            if translations_result[-1:] != '\n':
                translations_result = translations_result + '\n'
            new_line = terms[0] + '\t' + terms[1] + '\t' + translations_result
            workfiles.write_tmpfile(cnt, new_line, 'a')
    finally:
        rd.close()

Example #5

0

Show file

def remove_less_half_frequency():
    """Drop translations whose frequency is under half of the last kept one.

    Each translation is expected to look like ``(N)text``; ``N`` is read
    from between the first character and the first ')'.  The comparison
    starts from a reference frequency of 2 and the reference moves to the
    frequency of every translation that is kept.

    Frequency handling:

    * empty frequency text (e.g. ``()text`` or a blank entry) reuses the
      frequency of the previous entry;
    * a non-blank entry with no ')' or with a non-numeric frequency is kept
      as-is and does not move the reference (previously this raised
      ValueError or parsed unrelated characters as a number).

    Lines shorter than 5 characters or without a tab are copied through
    unchanged.  The reader is closed even if processing fails.
    """
    print('removing term with less than half frequency from another term')
    rd = workfiles.read_lasttmp_or_output()
    try:
        cnt = workfiles.new_tmpfile()
        while True:
            line = rd.readline()
            if not line:
                break
            terms = line.split('\t')
            if len(line) < 5 or len(terms) == 1:
                workfiles.write_tmpfile(cnt, line, 'a')
                continue
            new_trs = []
            anterior_frequency = 2
            frequency = 0
            for translation in terms[1].split(','):
                ini = translation.find(')')
                # Without ')' there is no frequency prefix to slice out.
                txt_freq = translation[1:ini] if ini != -1 else ''
                if txt_freq:
                    try:
                        frequency = int(txt_freq)
                    except ValueError:
                        # Parenthesised text that is not a number.
                        new_trs.append(translation)
                        continue
                elif ini == -1 and translation.strip():
                    # Real text with no "(N)" prefix: nothing to compare.
                    new_trs.append(translation)
                    continue
                if frequency < (anterior_frequency / 2):
                    continue
                anterior_frequency = frequency
                new_trs.append(translation)
            new_line = ','.join(new_trs)
            # Dropping the final entry also drops the line terminator.
            if new_line[-1:] != '\n':
                new_line = new_line + '\n'
            workfiles.write_tmpfile(cnt, terms[0] + '\t' + new_line, 'a')
    finally:
        rd.close()

Example #6

0

Show file

def definitions_startswith_number():
    """Return True when the fifth field of the first line starts with '1.'.

    Only the first line of the current work file is read.  Returns False
    when that line has fewer than five tab-separated fields (including an
    empty file) instead of raising IndexError.
    """
    print('verifying if definitions field starts with number')
    rd = workfiles.read_lasttmp_or_output()
    try:
        line = rd.readline()
    finally:
        rd.close()
    fields = line.split('\t')
    return len(fields) > 4 and fields[4].startswith('1.')

Example #7

0

Show file

def reorganizeTranslations():
    """Deduplicate, reorder and clean the translations of every line.

    For each ``word<TAB>t1,t2,...`` line:

    * the first term is skipped when it looks like the feminine form of the
      second one (first ends in 'a' and either equals second + 'a', or has
      the same length as a second term ending in 'o');
    * a term is skipped when both the character at index 3 of the term and
      the first character of the word are uppercase;
    * terms starting with '(' are stored under their lowercased text (from
      index 3 on) with the first three characters as frequency tag; other
      terms are stored lowercased with the marker '*' unless the same
      lowercased text is already present;
    * terms are emitted sorted by tag in descending order ('*' sorts before
      any '(' tag), each followed by a comma;
    * ')a ', ')o ', ')os ' and ')as ' are collapsed to ')'.

    The whole result is written to the new temporary file in one call.
    """
    print("""remove a term when have both male and female,
    change the main translation to the front of translations,
    sort the translations by frequency,
    remove terms beggining with "o", "a", "os", "as"
    """)
    rd = workfiles.read_lasttmp_or_output()
    cnt = workfiles.new_tmpfile()
    output = ''
    try:
        while True:
            line = rd.readline()
            if not line:
                break
            # Strip only the terminator; slicing off the last character
            # would eat data when the final line has no newline.
            line = line.rstrip('\n')
            fields = line.split('\t')
            word = fields[0]
            terms = fields[1].split(',') if len(fields) > 1 else []
            start = 0
            if len(terms) > 1:
                first, second = terms[0], terms[1]
                if first[:-1] == second and first[-1:] == 'a':
                    start = 1
                if (len(first) == len(second) and second[-1:] == 'o'
                        and first[-1:] == 'a'):
                    start = 1
            others = {}
            for term in terms[start:]:
                if term[3:4].isupper() and word[0:1].isupper():
                    continue
                if term[0:1] == '(':
                    others[term[3:].lower()] = term[:3]
                else:
                    # Keys are stored lowercased, so the membership test
                    # must use the lowercased term as well.
                    key = term.lower()
                    if key not in others:
                        others[key] = '*'
            ranked = sorted(
                others.items(), key=lambda item: item[1], reverse=True)
            output += word + '\t'
            for term, tag in ranked:
                output += ('' if tag == '*' else tag) + term + ','
            # NOTE: as before, the replacement runs over everything
            # accumulated so far, so earlier lines are processed again.
            output = output.replace(')a ', ')').replace(')o ', ')').replace(
                ')os ', ')').replace(')as ', ')')
            output += '\n'
    finally:
        rd.close()
    workfiles.write_tmpfile(cnt, output, 'a')

Example #8

0

Show file

def organize_definitions_with_n():
    """Replace every '. ,' with '. ' in the fifth field of each line.

    Each line must have at least five tab-separated fields; only the first
    five are written back.
    """
    source = workfiles.read_lasttmp_or_output()
    target = workfiles.new_tmpfile()
    raw = source.readline()
    while raw:
        columns = raw.replace('\n', '').split('\t')
        columns[4] = columns[4].replace('. ,', '. ')
        record = '\t'.join(columns[:5]) + '\n'
        workfiles.write_tmpfile(target, record, 'a')
        raw = source.readline()
    source.close()

Example #9

0

Show file

def sound_mp3_directory():
    """Pass the fourth field of each line through the sound-reference helper.

    The fourth field is replaced by the result of
    ``workfiles.add_wordlist_dictionary_soundmp3``.  Each line must have at
    least five tab-separated fields; only the first five are written back.
    """
    print('change the reference of sound in mp3 field')
    source = workfiles.read_lasttmp_or_output()
    target = workfiles.new_tmpfile()
    raw = source.readline()
    while raw:
        columns = raw.replace('\n', '').split('\t')
        columns[3] = workfiles.add_wordlist_dictionary_soundmp3(columns[3])
        # The fifth column is indexed explicitly so a four-field row still
        # raises IndexError rather than being written short.
        record = '\t'.join(columns[:4] + [columns[4]]) + '\n'
        workfiles.write_tmpfile(target, record, 'a')
        raw = source.readline()
    source.close()

Example #10

0

Show file

def remove_wordlist_from_soundname():
    """Strip the ``<word_list>-`` text from the fourth field of each line.

    ``workfiles.word_list`` followed by '-' is removed wherever it occurs in
    the fourth field.  Each line must have at least five tab-separated
    fields; only the first five are written back.
    """
    print('remove wordlist from name of mp3 sound')
    source = workfiles.read_lasttmp_or_output()
    target = workfiles.new_tmpfile()
    raw = source.readline()
    while raw:
        columns = raw.replace('\n', '').split('\t')
        columns[3] = columns[3].replace(workfiles.word_list + '-', '')
        # The fifth column is indexed explicitly so a four-field row still
        # raises IndexError rather than being written short.
        record = '\t'.join(columns[:4] + [columns[4]]) + '\n'
        workfiles.write_tmpfile(target, record, 'a')
        raw = source.readline()
    source.close()

Example #11

0

Show file

File: howjsay.py Project: danilobatistaqueiroz/node_dictionaries_scrapers

def remove_wordlist_from_name():
    """Strip the ``<word_list>-`` text from the second field of each line.

    ``workfiles.word_list`` followed by '-' is removed wherever it occurs in
    the second field.  A line with a single field is written back as that
    field followed by an empty second field.  Only the first two fields are
    written.  The reader is closed even if processing fails.
    """
    print('remove wordlist from name')
    rd = workfiles.read_lasttmp_or_output()
    try:
        cnt = workfiles.new_tmpfile()
        while True:
            line = rd.readline()
            if not line:
                break
            # Strip only the terminator; slicing off the last character
            # would eat data when the final line has no newline.
            fields = line.rstrip('\n').split('\t')
            if len(fields) == 1:
                newline = fields[0] + '\t\n'
            else:
                name = fields[1].replace(workfiles.word_list + '-', '')
                newline = fields[0] + '\t' + name + '\n'
            workfiles.write_tmpfile(cnt, newline, 'a')
    finally:
        rd.close()

Example #12

0

Show file

def clean_ipa():
    """Remove the slashes that wrap ``<span>`` markup in the third field.

    '/<span' becomes '<span' and '</span>/' becomes '</span>'.  All other
    fields are written back untouched.  Lines with fewer than three
    tab-separated fields are copied through instead of raising IndexError.
    The reader is closed even if processing fails.
    """
    print('cleaning ipa, adding space between translations')
    rd = workfiles.read_lasttmp_or_output()
    try:
        cnt = workfiles.new_tmpfile()
        while True:
            line = rd.readline()
            if not line:
                break
            # Strip only the terminator; slicing off the last character
            # would eat data when the final line has no newline.
            fields = line.rstrip('\n').split('\t')
            if len(fields) > 2:
                ipa = fields[2].replace('/<span', '<span')
                fields[2] = ipa.replace('</span>/', '</span>')
            workfiles.write_tmpfile(cnt, '\t'.join(fields) + '\n', 'a')
    finally:
        rd.close()

Example #13

0

Show file

def rem_spaces_between_translations():
    """Replace ', ' with ',' in the second field of each line.

    Rows with four or more fields are written back as exactly five fields
    (a missing fifth field is written empty, extra fields are dropped).
    Rows with fewer than four fields are written back with the fields they
    have instead of raising IndexError.

    End of input is detected before the newline is stripped, so a blank
    line in the middle of the file no longer stops processing early.  The
    reader is closed even if processing fails.
    """
    print('remove spaces between comma in translations')
    rd = workfiles.read_lasttmp_or_output()
    try:
        cnt = workfiles.new_tmpfile()
        while True:
            line = rd.readline()
            # Test for EOF on the raw line: a blank line is '\n', not ''.
            if not line:
                break
            fields = line.rstrip('\n').split('\t')
            if len(fields) > 1:
                fields[1] = fields[1].replace(', ', ',')
            if len(fields) >= 4:
                # Pad a missing fifth field, then cap at five fields.
                fields = (fields + [''])[:5]
            workfiles.write_tmpfile(cnt, '\t'.join(fields) + '\n', 'a')
    finally:
        rd.close()

Example #14

0

Show file

def organize_definitions_without_n():
    """Number the definitions held in the fifth field of each line.

    A non-blank fifth field is prefixed with '1. '.  Each '. ,' separator is
    then replaced, left to right, by '. N. ' with N counting up from 2.
    Each line must have at least five tab-separated fields; only the first
    five are written back.
    """
    print('organizing definitions without numbers')
    source = workfiles.read_lasttmp_or_output()
    target = workfiles.new_tmpfile()
    raw = source.readline()
    while raw:
        columns = raw.replace('\n', '').split('\t')
        definitions = columns[4]
        if definitions.strip() != '':
            definitions = '1. ' + definitions
        counter = 1
        while '. ,' in definitions:
            counter += 1
            definitions = definitions.replace(
                '. ,', '. ' + str(counter) + '. ', 1)
        record = '\t'.join(columns[:4] + [definitions]) + '\n'
        workfiles.write_tmpfile(target, record, 'a')
        raw = source.readline()
    source.close()

Example #15

0

Show file

def remove_spaces_between_words():
    """Replace ', ' with ',' in the second field of each line.

    Only the first two fields are written back.  The written record always
    ends with a newline, so a row whose second field is not the last one
    (or an unterminated final line) can no longer merge with the next
    record.  Lines shorter than 5 characters or without a tab are copied
    through unchanged.  The reader is closed even if processing fails.
    """
    print('removing spaces between words')
    rd = workfiles.read_lasttmp_or_output()
    try:
        cnt = workfiles.new_tmpfile()
        while True:
            line = rd.readline()
            if not line:
                break
            terms = line.split('\t')
            if len(line) < 5 or len(terms) == 1:
                workfiles.write_tmpfile(cnt, line, 'a')
                continue
            new_line = terms[1].replace(', ', ',')
            # terms[1] only carries the terminator when it is the last field.
            if new_line[-1:] != '\n':
                new_line = new_line + '\n'
            workfiles.write_tmpfile(cnt, terms[0] + '\t' + new_line, 'a')
    finally:
        rd.close()

Example #16

0

Show file

def remove_frequency():
    """Drop low-frequency translations when higher-frequency ones exist.

    If the translations field contains '(3)', entries tagged 1 or 2 are
    removed; otherwise, if it contains '(2)', entries tagged 1 are removed.
    The tag is the text between the first character of an entry and its
    first ')'.  Lines shorter than 5 characters or without a tab are copied
    through unchanged.
    """
    print(
        'removing translations with frequency 1, if there are at least 1 translation with another frequency'
    )
    source = workfiles.read_lasttmp_or_output()
    target = workfiles.new_tmpfile()
    raw = source.readline()
    while raw:
        columns = raw.split('\t')
        if len(raw) < 5 or len(columns) == 1:
            result = raw
        else:
            body = columns[1]
            if '(3)' in body:
                unwanted = ('1', '2')
            elif '(2)' in body:
                unwanted = ('1',)
            else:
                unwanted = ()
            survivors = [
                entry for entry in body.split(',')
                if entry[1:entry.find(')')] not in unwanted
            ]
            joined = ','.join(survivors)
            # Dropping the final entry also drops the line terminator.
            if not joined.endswith('\n'):
                joined += '\n'
            result = columns[0] + '\t' + joined
        workfiles.write_tmpfile(target, result, 'a')
        raw = source.readline()
    source.close()

Example #17

0

Show file

def clean_definitions():
    """Collapse doubled item numbers and insert '<BR>' before items 2-9.

    '1. 1. ' becomes '1. '; for every digit d from 2 to 9, 'd. d. ' becomes
    '<BR> d. '.  Finally each '..' is replaced by '.' in a single pass.
    """
    print(
        'removing repeated numbers ex: 1. 1. and adding a break line for each item'
    )
    source = workfiles.read_lasttmp_or_output()
    target = workfiles.new_tmpfile()
    raw = source.readline()
    while raw:
        cleaned = raw.replace('1. 1. ', '1. ')
        for digit in '23456789':
            doubled = digit + '. ' + digit + '. '
            cleaned = cleaned.replace(doubled, '<BR> ' + digit + '. ')
        cleaned = cleaned.replace('..', '.')
        workfiles.write_tmpfile(target, cleaned, 'a')
        raw = source.readline()
    source.close()

Example #18

0

Show file

def only_four_translations():
    """Keep at most the first four comma-separated translations per line.

    Only the first two tab-separated fields are written back.  Lines shorter
    than 5 characters or without a tab are copied through unchanged.
    """
    print('let only the first 4 translations and remove the other ones')
    source = workfiles.read_lasttmp_or_output()
    target = workfiles.new_tmpfile()
    raw = source.readline()
    while raw:
        columns = raw.split('\t')
        if len(raw) < 5 or len(columns) == 1:
            result = raw
        else:
            leading = ','.join(columns[1].split(',')[:4])
            # Cutting off the final entry also cuts the line terminator.
            if not leading.endswith('\n'):
                leading += '\n'
            result = columns[0] + '\t' + leading
        workfiles.write_tmpfile(target, result, 'a')
        raw = source.readline()
    source.close()

Example #19

0

Show file

def remove_replication():
    """Blank the translations of a line that repeats the previous line's.

    Sometimes the scraper writes the same translation more than once on
    consecutive lines.  When the second field of a line equals the second
    field of the previous processed line, the line is written as the
    headword followed by an empty translations field; otherwise it is
    copied unchanged.  Lines shorter than 5 characters or without a tab are
    copied through and do not update the remembered translations.

    The reader is always closed when the function finishes.
    """
    print('removing replications in translations')
    rd = workfiles.read_lasttmp_or_output()
    try:
        cnt = workfiles.new_tmpfile()
        a_translations = ''
        while True:
            line = rd.readline()
            if not line:
                break
            terms = line.split('\t')
            if len(line) < 5 or len(terms) == 1:
                workfiles.write_tmpfile(cnt, line, 'a')
                continue
            word = terms[0].strip()
            translations = terms[1]
            if translations == a_translations:
                workfiles.write_tmpfile(cnt, word + '\t\n', 'a')
            else:
                workfiles.write_tmpfile(cnt, line, 'a')
            a_translations = translations
    finally:
        # The reader was previously left open by this function.
        rd.close()

Example #20

0

Show file

def remove_same_translations_without_frequency():
    """Drop a frequency-less first translation that reappears with one.

    When ``is_first_translation_without_frequency`` reports that the first
    translation has no frequency, and any later translation has the same
    text after its first ')', the first translation is removed.  Both sides
    of the comparison are stripped and lowercased, so differences in case
    or surrounding whitespace no longer hide a duplicate.

    Lines shorter than 5 characters or without a tab are copied through
    unchanged.  The reader is closed even if processing fails.
    """
    print('removing translation without frequency if there is another')
    rd = workfiles.read_lasttmp_or_output()
    try:
        cnt = workfiles.new_tmpfile()
        while True:
            line = rd.readline()
            if not line:
                break
            terms = line.split('\t')
            if len(line) < 5 or len(terms) == 1:
                workfiles.write_tmpfile(cnt, line, 'a')
                continue
            translations = terms[1].split(',')
            new_trs = translations
            if is_first_translation_without_frequency(translations):
                # Normalise the first entry the same way as the candidates.
                first = translations[0].strip().lower()
                has_with_frequency = any(
                    other[other.find(')') + 1:].strip().lower() == first
                    for other in translations[1:]
                )
                if has_with_frequency:
                    new_trs = translations[1:]
            new_line = ','.join(new_trs)
            if new_line[-1:] != '\n':
                new_line = new_line + '\n'
            workfiles.write_tmpfile(cnt, terms[0] + '\t' + new_line, 'a')
    finally:
        rd.close()

Example #21

0

Show file

def remove_volume_up():
    """Strip 'volume_up' markers and 'word:word' echoes from translations.

    For the headword as written, capitalized and lowercased (in that
    order), the sequences ``w:w``, ``<u>w</u>:w``, ``<u><b>w</b></u>:w`` and
    ``<b>w</b>:w`` are removed when followed by '.', ',' or ' ', and also
    without a suffix when the translations currently end with that form of
    the headword.  A fixed list of leftover punctuation pairs is then
    removed and a leading '. ' is cut off.

    Only the first two tab-separated fields are written back.  Lines shorter
    than 5 characters or without a tab are copied through unchanged.
    """
    print('removing :volume_up and volume_up')
    source = workfiles.read_lasttmp_or_output()
    target = workfiles.new_tmpfile()
    # Removed in exactly this order; '.,' is listed twice, so the second
    # pass may catch a '.,' formed by the first removal.
    leftovers = (
        ',,', '..', ', ,', '. .',
        ':,', ':.', ': ,', ': .',
        ';,', ';.', '; ,', '; .',
        '.,', '.,', '. ,', ', .',
        ',:', '.:', ', :', '. :',
        ',;', '.;', ', ;', '. ;',
    )
    raw = source.readline()
    while raw:
        columns = raw.split('\t')
        if len(raw) < 5 or len(columns) == 1:
            workfiles.write_tmpfile(target, raw, 'a')
            raw = source.readline()
            continue
        headword = columns[0].strip()
        text = columns[1].replace('\n', '')
        for marker in (':volume_up,', 'volume_up,'):
            text = text.replace(marker, '')
        for form in (headword, headword.capitalize(), headword.lower()):
            echoes = (
                f'{form}:{form}',
                f'<u>{form}</u>:{form}',
                f'<u><b>{form}</b></u>:{form}',
                f'<b>{form}</b>:{form}',
            )
            for tail in ('.', ',', ' '):
                for echo in echoes:
                    text = text.replace(echo + tail, '')
            # Checked once per form, before any suffix-less removal.
            if text.endswith(form):
                for echo in echoes:
                    text = text.replace(echo, '')
        for pair in leftovers:
            text = text.replace(pair, '')
        if text.startswith('. '):
            text = text[2:]
        workfiles.write_tmpfile(target, columns[0] + '\t' + text + '\n', 'a')
        raw = source.readline()
    source.close()