def remove_same_translations():
    """Collapse runs of adjacent translations that carry the same text.

    This happens when a translation is listed as the principal one without
    a frequency and is listed again with a frequency. Two neighbouring
    entries count as equal when the text after the first ')' matches once
    stripped and lower-cased; only the first entry of such a run is kept.

    Lines shorter than 5 characters or without a tab are copied unchanged.
    Every other line is written back as ``<field 0>\\t<translations>``.
    """
    print('removing two equal translations')
    rd = workfiles.read_lasttmp_or_output()
    cnt = workfiles.new_tmpfile()
    # readline() yields '' at end of file, which stops the iteration.
    for line in iter(rd.readline, ''):
        fields = line.split('\t')
        if len(line) < 5 or len(fields) == 1:
            workfiles.write_tmpfile(cnt, line, 'a')
            continue
        kept = []
        previous_text = ''
        for entry in fields[1].split(','):
            # find() returns -1 when there is no ')', which makes the slice
            # start at 0 and therefore cover the whole entry.
            text = entry[entry.find(')') + 1:].strip().lower()
            if text != previous_text:
                previous_text = text
                kept.append(entry)
        joined = ','.join(kept)
        # The newline travels with the last entry; restore it if that entry
        # was dropped (or was never terminated).
        if not joined.endswith('\n'):
            joined += '\n'
        workfiles.write_tmpfile(cnt, fields[0] + '\t' + joined, 'a')
    rd.close()
def insert_comma():
    """Turn the spaces inside each translation into ', ' separators.

    For every line with at least 5 characters and a tab, the second field
    is split on ','. Entries that are empty, or that equal the first field
    ignoring case and surrounding whitespace, are dropped. In the remaining
    entries every space becomes ', '. The entries are re-joined with ',' and
    any ', , ' left in the result is collapsed to ','.

    Other lines are copied through (with a single trailing newline).
    """
    print('inserting commas')
    rd = workfiles.read_lasttmp_or_output()
    cnt = workfiles.new_tmpfile()
    for raw in iter(rd.readline, ''):
        line = raw.replace('\n', '')
        terms = line.split('\t')
        if len(line) < 5 or len(terms) == 1:
            workfiles.write_tmpfile(cnt, line + '\n', 'a')
            continue
        headword = terms[0].lower()
        new_words = []
        for word in terms[1].split(','):
            spaced = word
            # NOTE(review): as written these three passes replace a single
            # space with a single space, i.e. they change nothing. They may
            # have been meant to squeeze runs of spaces - confirm the
            # intended literals before relying on this.
            for _ in range(3):
                spaced = spaced.replace(' ', ' ')
            spaced = spaced.replace(' ', ', ')
            bare = word.strip()
            if bare.lower() == headword or bare in ('', ','):
                continue
            new_words.append(spaced)
        new_line = ','.join(new_words).replace(', , ', ',')
        workfiles.write_tmpfile(cnt, terms[0] + '\t' + new_line + '\n', 'a')
    rd.close()
def remove_same_word():
    """Drop translations that repeat the headword or start with a capital.

    For every line with at least 5 characters and a tab, the second field
    is split on ','. An entry is kept only when it contains ')', the
    character right after the first ')' is not upper-case, and the text
    after that ')' (stripped, lower-cased) differs from the stripped,
    lower-cased first field. Other lines are copied unchanged.

    NOTE: another function with this exact name is defined further down in
    this file; since the later ``def`` rebinds the name, this version is not
    the one callers of ``remove_same_word`` reach unless that is changed.
    """
    print('removing term equal word')
    rd = workfiles.read_lasttmp_or_output()
    try:
        cnt = workfiles.new_tmpfile()
        for line in iter(rd.readline, ''):
            terms = line.split('\t')
            if len(line) < 5 or len(terms) == 1:
                workfiles.write_tmpfile(cnt, line, 'a')
                continue
            word = terms[0].strip().lower()
            new_trs = []
            for translation in terms[1].split(','):
                ini = translation.find(')')
                if ini == -1:
                    # No frequency marker at all: the entry is discarded.
                    continue
                text = translation[ini + 1:]
                # text[:1] rather than text[0]: an entry that ends right
                # after ')' has no following character, and indexing it
                # raised IndexError.
                if text[:1].isupper():
                    continue
                if text.strip().lower() == word:
                    continue
                new_trs.append(translation)
            new_line = ','.join(new_trs)
            if not new_line.endswith('\n'):
                new_line += '\n'
            workfiles.write_tmpfile(cnt, terms[0] + '\t' + new_line, 'a')
    finally:
        # Release the reader even when a line fails to process.
        rd.close()
def remove_same_word():
    """Remove, from the third field, translations equal to the headword.

    For every line with at least 5 characters and at least three
    tab-separated fields, the third field is split on ','. Blank entries and
    entries equal to the first field (both stripped, compared ignoring case)
    are dropped; the survivors are lower-cased. The line is written back as
    ``<field 0>\\t<field 1>\\t<translations>``. Shorter lines are copied
    unchanged.

    NOTE: an earlier function in this file has the same name; this later
    definition is the one the name ends up bound to.
    """
    print('removing translation equal word')
    rd = workfiles.read_lasttmp_or_output()
    try:
        cnt = workfiles.new_tmpfile()
        for line in iter(rd.readline, ''):
            terms = line.split('\t')
            # The third field is read below, so a two-field line must be
            # passed through as well (it used to raise IndexError).
            if len(line) < 5 or len(terms) < 3:
                workfiles.write_tmpfile(cnt, line, 'a')
                continue
            word = terms[0].strip().lower()
            new_trs = []
            for translation in terms[2].split(','):
                bare = translation.strip()
                if bare == '' or bare.lower() == word:
                    continue
                new_trs.append(translation.lower())
            translations_result = ','.join(new_trs)
            if not translations_result.endswith('\n'):
                translations_result += '\n'
            new_line = terms[0] + '\t' + terms[1] + '\t' + translations_result
            workfiles.write_tmpfile(cnt, new_line, 'a')
    finally:
        # Release the reader even when a line fails to process.
        rd.close()
def remove_less_half_frequency():
    """Drop translations whose frequency is under half of the previous one.

    Each translation is expected to look like ``(N)text``. Walking the
    second field left to right, an entry is skipped when its frequency is
    lower than half the frequency of the last entry that was kept (the
    comparison starts from 2). An entry with an empty marker reuses the
    frequency of the entry before it (0 at the start of a line).

    An entry whose marker is not a number - typically a translation with no
    ``(N)`` prefix - is kept as is and does not take part in the comparison.

    Lines shorter than 5 characters or without a tab are copied unchanged.
    """
    print('removing term with less than half frequency from another term')
    rd = workfiles.read_lasttmp_or_output()
    try:
        cnt = workfiles.new_tmpfile()
        for line in iter(rd.readline, ''):
            terms = line.split('\t')
            if len(line) < 5 or len(terms) == 1:
                workfiles.write_tmpfile(cnt, line, 'a')
                continue
            new_trs = []
            anterior_frequency = 2
            frequency = 0
            for translation in terms[1].split(','):
                ini = translation.find(')')
                txt_freq = translation[1:ini]
                if txt_freq != '':
                    try:
                        frequency = int(txt_freq)
                    except ValueError:
                        # Previously this aborted the whole run; an entry
                        # without a usable frequency is simply preserved.
                        new_trs.append(translation)
                        continue
                if frequency < anterior_frequency / 2:
                    continue
                anterior_frequency = frequency
                new_trs.append(translation)
            new_line = ','.join(new_trs)
            if not new_line.endswith('\n'):
                new_line += '\n'
            workfiles.write_tmpfile(cnt, terms[0] + '\t' + new_line, 'a')
    finally:
        # Release the reader even when a line fails to process.
        rd.close()
def definitions_startswith_number():
    """Tell whether the first line's fifth field starts with '1.'.

    Only the first line of the current work file is inspected.

    Returns:
        True when that line has at least five tab-separated fields and the
        fifth one starts with '1.'; False otherwise, including for an empty
        file or a first line with fewer than five fields.
    """
    print('verifying if definitions field starts with number')
    rd = workfiles.read_lasttmp_or_output()
    try:
        line = rd.readline()
    finally:
        rd.close()
    fields = line.split('\t')
    # Guard the index: an empty file gives [''] and used to raise IndexError.
    if len(fields) < 5:
        return False
    return fields[4].startswith('1.')
def reorganizeTranslations():
    """Reorder and de-duplicate the translations of every line.

    For each line (first field = word, second field = ','-separated
    translations, each optionally prefixed by a 3-character ``(N)`` marker):

    * When the first translation ends in 'a' and either equals the second
      one plus that 'a', or has the same length as a second one ending in
      'o', the first translation is skipped.
    * A translation whose 4th character is upper-case is skipped when the
      word itself starts with an upper-case letter.
    * Translations are keyed by their lower-cased text. Marked ones store
      their marker; unmarked ones are stored without a marker, unless the
      same text is already present.
    * Unmarked translations are emitted first, then marked ones in
      descending marker order; ties keep their original order.
    * A leading 'a ', 'o ', 'os ' or 'as ' right after a ')' is removed.

    Each output line is ``word\\t`` followed by every translation and a
    trailing ','. All lines are written to the new tmp file in one call.
    """
    print('remove a term when have both male and female, '
          'change the main translation to the front of translations, '
          'sort the translations by frequency, '
          'remove terms beggining with "o", "a", "os", "as" ')
    rd = workfiles.read_lasttmp_or_output()
    try:
        cnt = workfiles.new_tmpfile()
        out_lines = []
        for raw in iter(rd.readline, ''):
            # Only strip a real line terminator; slicing unconditionally cut
            # the last character off a final line that had no newline.
            line = raw[:-1] if raw.endswith('\n') else raw
            fields = line.split('\t')
            word = fields[0]
            terms = fields[1].split(',') if len(fields) > 1 else []

            start = 0
            if len(terms) > 1:
                first, second = terms[0], terms[1]
                same_stem = first[:-1] == second
                o_a_pair = len(first) == len(second) and second[-1:] == 'o'
                if first[-1:] == 'a' and (same_stem or o_a_pair):
                    start = 1

            others = {}
            for term in terms[start:]:
                if term[3:4].isupper() and word[0:1].isupper():
                    continue
                if term[0:1] == '(':
                    others[term[3:].lower()] = term[:3]
                else:
                    # Test the same lower-cased key that is stored; testing
                    # the raw text let a capitalised entry overwrite the
                    # marker of an identical translation already recorded.
                    key = term.lower()
                    if key not in others:
                        others[key] = '*'

            # '*' sorts above '(' so unmarked entries lead the reversed order.
            ranked = sorted(others.items(), key=lambda item: item[1],
                            reverse=True)
            text = word + '\t' + ''.join(
                ('' if mark == '*' else mark) + key + ','
                for key, mark in ranked)

            # Strip leading articles line by line until nothing changes.
            # The earlier code re-ran the four replacements over the whole
            # accumulated output on every iteration, which was quadratic and
            # gave earlier lines more passes than later ones.
            while True:
                reduced = (text.replace(')a ', ')').replace(')o ', ')')
                           .replace(')os ', ')').replace(')as ', ')'))
                if reduced == text:
                    break
                text = reduced
            out_lines.append(text + '\n')
    finally:
        # Release the reader even when a line fails to process.
        rd.close()
    workfiles.write_tmpfile(cnt, ''.join(out_lines), 'a')
def organize_definitions_with_n():
    """Replace every '. ,' in the fifth field with '. '.

    Each line is split on tabs and written back with exactly its first five
    fields. A line with fewer than five fields raises IndexError.
    """
    rd = workfiles.read_lasttmp_or_output()
    cnt = workfiles.new_tmpfile()
    for raw in iter(rd.readline, ''):
        fields = raw.replace('\n', '').split('\t')
        fields[4] = fields[4].replace('. ,', '. ')
        # Fields beyond the fifth are intentionally not written back.
        record = '\t'.join(fields[:5]) + '\n'
        workfiles.write_tmpfile(cnt, record, 'a')
    rd.close()
def sound_mp3_directory():
    """Rewrite the fourth field through the workfiles sound helper.

    Every line is split on tabs, its fourth field is replaced by the result
    of ``workfiles.add_wordlist_dictionary_soundmp3`` and the first five
    fields are written back. A line with fewer than five fields raises
    IndexError.
    """
    print('change the reference of sound in mp3 field')
    rd = workfiles.read_lasttmp_or_output()
    cnt = workfiles.new_tmpfile()
    for raw in iter(rd.readline, ''):
        fields = raw.replace('\n', '').split('\t')
        fields[3] = workfiles.add_wordlist_dictionary_soundmp3(fields[3])
        # Index each field explicitly so a short line still fails loudly.
        record = '\t'.join(fields[i] for i in range(5)) + '\n'
        workfiles.write_tmpfile(cnt, record, 'a')
    rd.close()
def remove_wordlist_from_soundname():
    """Strip the ``<word_list>-`` prefix text from the fourth field.

    Every occurrence of ``workfiles.word_list + '-'`` is removed from the
    fourth tab-separated field, and the first five fields are written back.
    A line with fewer than five fields raises IndexError.
    """
    print('remove wordlist from name of mp3 sound')
    rd = workfiles.read_lasttmp_or_output()
    cnt = workfiles.new_tmpfile()
    for raw in iter(rd.readline, ''):
        fields = raw.replace('\n', '').split('\t')
        fields[3] = fields[3].replace(workfiles.word_list + '-', '')
        # Index each field explicitly so a short line still fails loudly.
        record = '\t'.join(fields[i] for i in range(5)) + '\n'
        workfiles.write_tmpfile(cnt, record, 'a')
    rd.close()
def remove_wordlist_from_name():
    """Strip the ``<word_list>-`` text from the second field of each line.

    A line without a tab is written as ``<line>\\t``. Otherwise every
    occurrence of ``workfiles.word_list + '-'`` is removed from the second
    field and only the first two fields are written back.
    """
    print('remove wordlist from name')
    rd = workfiles.read_lasttmp_or_output()
    try:
        cnt = workfiles.new_tmpfile()
        for raw in iter(rd.readline, ''):
            # Only strip a real line terminator; slicing unconditionally cut
            # the last character off a final line that had no newline.
            line = raw[:-1] if raw.endswith('\n') else raw
            fields = line.split('\t')
            if len(fields) == 1:
                newline = fields[0] + '\t\n'
            else:
                cleaned = fields[1].replace(workfiles.word_list + '-', '')
                newline = fields[0] + '\t' + cleaned + '\n'
            workfiles.write_tmpfile(cnt, newline, 'a')
    finally:
        # Release the reader even when a line fails to process.
        rd.close()
def clean_ipa():
    """Remove the slashes that wrap the ``<span>`` markup of the third field.

    In the third tab-separated field, '/<span' becomes '<span' and
    '</span>/' becomes '</span>'. All fields are written back, tab-joined.
    Lines with fewer than three fields are copied unchanged.

    NOTE(review): the progress message also mentions adding spaces between
    translations, but nothing here modifies the translations field.
    """
    print('cleaning ipa, adding space between translations')
    rd = workfiles.read_lasttmp_or_output()
    try:
        cnt = workfiles.new_tmpfile()
        for raw in iter(rd.readline, ''):
            # Only strip a real line terminator; slicing unconditionally cut
            # the last character off a final line that had no newline.
            line = raw[:-1] if raw.endswith('\n') else raw
            fields = line.split('\t')
            # Guard the index: a short line used to raise IndexError.
            if len(fields) >= 3:
                ipa = fields[2].replace('/<span', '<span')
                fields[2] = ipa.replace('</span>/', '</span>')
            workfiles.write_tmpfile(cnt, '\t'.join(fields) + '\n', 'a')
    finally:
        # Release the reader even when a line fails to process.
        rd.close()
def rem_spaces_between_translations():
    """Replace ', ' with ',' in the second field and emit five fields.

    Lines with four fields get an empty fifth field appended; lines with
    five or more fields are cut to the first five. Lines with fewer than
    four fields (blank lines included) are copied unchanged.
    """
    print('remove spaces between comma in translations')
    rd = workfiles.read_lasttmp_or_output()
    try:
        cnt = workfiles.new_tmpfile()
        # End of file is detected on the raw line. Stripping the newline
        # before that test made a blank line in the middle of the file look
        # like EOF and silently dropped everything after it.
        for raw in iter(rd.readline, ''):
            line = raw.replace('\n', '')
            fields = line.split('\t')
            if len(fields) < 4:
                workfiles.write_tmpfile(cnt, line + '\n', 'a')
                continue
            fields[1] = fields[1].replace(', ', ',')
            fifth = fields[4] if len(fields) >= 5 else ''
            newline = '\t'.join(fields[:4] + [fifth]) + '\n'
            workfiles.write_tmpfile(cnt, newline, 'a')
    finally:
        # Release the reader even when a line fails to process.
        rd.close()
def organize_definitions_without_n():
    """Number the definitions held in the fifth field.

    A non-blank fifth field is prefixed with '1. '. Then each '. ,' found in
    it, left to right, is replaced by '. N. ' with N counting up from 2.
    The first five fields are written back; a line with fewer than five
    fields raises IndexError.
    """
    print('organizing definitions without numbers')
    rd = workfiles.read_lasttmp_or_output()
    cnt = workfiles.new_tmpfile()
    for raw in iter(rd.readline, ''):
        fields = raw.replace('\n', '').split('\t')
        definitions = fields[4]
        if definitions.strip():
            definitions = '1. ' + definitions
        counter = 1
        while '. ,' in definitions:
            counter += 1
            # count=1: each separator gets its own running number.
            definitions = definitions.replace(
                '. ,', '. ' + str(counter) + '. ', 1)
        fields[4] = definitions
        workfiles.write_tmpfile(cnt, '\t'.join(fields[:5]) + '\n', 'a')
    rd.close()
def remove_spaces_between_words():
    """Replace ', ' with ',' in the second field of every line.

    Lines shorter than 5 characters or without a tab are copied unchanged.
    Every other line is written back as ``<field 0>\\t<field 1>`` and is
    always terminated by a newline.
    """
    print('removing spaces between words')
    rd = workfiles.read_lasttmp_or_output()
    try:
        cnt = workfiles.new_tmpfile()
        for line in iter(rd.readline, ''):
            terms = line.split('\t')
            if len(line) < 5 or len(terms) == 1:
                workfiles.write_tmpfile(cnt, line, 'a')
                continue
            new_line = terms[1].replace(', ', ',')
            # The second field only carries the newline when it is the last
            # field. Without this check a line with more fields was written
            # unterminated and ran into the next record.
            if not new_line.endswith('\n'):
                new_line += '\n'
            workfiles.write_tmpfile(cnt, terms[0] + '\t' + new_line, 'a')
    finally:
        # Release the reader even when a line fails to process.
        rd.close()
def remove_frequency():
    '''Drop low-frequency translations when higher-frequency ones exist.

    Within the second field of a line:
    - entries marked (1) are removed if the field contains '(2)' or '(3)';
    - entries marked (2) are removed if the field contains '(3)'.

    Lines shorter than 5 characters or without a tab are copied unchanged.
    '''
    print('removing translations with frequency 1, '
          'if there are at least 1 translation with another frequency')
    rd = workfiles.read_lasttmp_or_output()
    cnt = workfiles.new_tmpfile()
    for line in iter(rd.readline, ''):
        terms = line.split('\t')
        if len(line) < 5 or len(terms) == 1:
            workfiles.write_tmpfile(cnt, line, 'a')
            continue
        has_three = '(3)' in terms[1]
        # Markers (the text between the first character and ')') to discard.
        unwanted = set()
        if has_three or '(2)' in terms[1]:
            unwanted.add('1')
        if has_three:
            unwanted.add('2')
        survivors = [
            entry for entry in terms[1].split(',')
            if entry[1:entry.find(')')] not in unwanted
        ]
        new_line = ','.join(survivors)
        if not new_line.endswith('\n'):
            new_line += '\n'
        workfiles.write_tmpfile(cnt, terms[0] + '\t' + new_line, 'a')
    rd.close()
def clean_definitions():
    """Collapse doubled item numbers and insert '<BR>' before items 2-9.

    On every line '1. 1. ' becomes '1. ', 'N. N. ' becomes '<BR> N. ' for N
    from 2 to 9, and finally each '..' becomes '.'.
    """
    print('removing repeated numbers ex: 1. 1. '
          'and adding a break line for each item')
    rd = workfiles.read_lasttmp_or_output()
    cnt = workfiles.new_tmpfile()
    for line in iter(rd.readline, ''):
        cleaned = line.replace('1. 1. ', '1. ')
        for digit in '23456789':
            doubled = digit + '. ' + digit + '. '
            cleaned = cleaned.replace(doubled, '<BR> ' + digit + '. ')
        cleaned = cleaned.replace('..', '.')
        workfiles.write_tmpfile(cnt, cleaned, 'a')
    rd.close()
def only_four_translations():
    """Keep at most the first four ','-separated translations per line.

    Lines shorter than 5 characters or without a tab are copied unchanged.
    Every other line is written back as ``<field 0>\\t<translations>``.
    """
    print('let only the first 4 translations and remove the other ones')
    rd = workfiles.read_lasttmp_or_output()
    cnt = workfiles.new_tmpfile()
    for line in iter(rd.readline, ''):
        terms = line.split('\t')
        if len(line) < 5 or len(terms) == 1:
            workfiles.write_tmpfile(cnt, line, 'a')
            continue
        head = ','.join(terms[1].split(',')[:4])
        # Cutting the list may have removed the entry holding the newline.
        if not head.endswith('\n'):
            head += '\n'
        workfiles.write_tmpfile(cnt, terms[0] + '\t' + head, 'a')
    rd.close()
def remove_replication():
    """Blank the translations of a line that repeats the previous ones.

    Sometimes the scraper writes the same translations on more than one
    line. When the second field of a line is identical to the second field
    of the previously processed line, the line is written as
    ``<stripped field 0>\\t`` with no translations; otherwise it is copied
    unchanged.

    Lines shorter than 5 characters or without a tab are copied unchanged
    and do not update the remembered translations.
    """
    print('removing replications in translations')
    rd = workfiles.read_lasttmp_or_output()
    try:
        cnt = workfiles.new_tmpfile()
        a_translations = ''
        for line in iter(rd.readline, ''):
            terms = line.split('\t')
            if len(line) < 5 or len(terms) == 1:
                workfiles.write_tmpfile(cnt, line, 'a')
                continue
            translations = terms[1]
            if translations == a_translations:
                workfiles.write_tmpfile(cnt, terms[0].strip() + '\t\n', 'a')
            else:
                workfiles.write_tmpfile(cnt, line, 'a')
            a_translations = translations
    finally:
        # The reader was never closed here; close it on every exit path.
        rd.close()
def remove_same_translations_without_frequency():
    """Drop a frequency-less first translation that is repeated later.

    When ``is_first_translation_without_frequency`` accepts the translation
    list of a line and one of the later entries has the same text after its
    first ')', the first entry is removed. Texts are compared stripped and
    lower-cased on both sides.

    Lines shorter than 5 characters or without a tab are copied unchanged.
    Every other line is written back as ``<field 0>\\t<translations>``.
    """
    print('removing translation without frequency if there is another')
    rd = workfiles.read_lasttmp_or_output()
    try:
        cnt = workfiles.new_tmpfile()
        for line in iter(rd.readline, ''):
            terms = line.split('\t')
            if len(line) < 5 or len(terms) == 1:
                workfiles.write_tmpfile(cnt, line, 'a')
                continue
            translations = terms[1].split(',')
            new_trs = translations
            if is_first_translation_without_frequency(translations):
                # Normalise the first entry the same way as the others; the
                # raw text never matched when it differed in case or had
                # surrounding whitespace (e.g. a trailing newline).
                first_text = translations[0].strip().lower()
                repeated = any(
                    other[other.find(')') + 1:].strip().lower() == first_text
                    for other in translations[1:]
                )
                if repeated:
                    new_trs = translations[1:]
            new_line = ','.join(new_trs)
            if not new_line.endswith('\n'):
                new_line += '\n'
            workfiles.write_tmpfile(cnt, terms[0] + '\t' + new_line, 'a')
    finally:
        # Release the reader even when a line fails to process.
        rd.close()
def remove_volume_up():
    """Strip 'volume_up' residue and echoed headwords from the second field.

    For every line with at least 5 characters and a tab:

    1. ':volume_up,' and 'volume_up,' are removed.
    2. For the stripped first field as written, capitalised and lower-cased
       (in that order), the patterns ``w:w``, ``<u>w</u>:w``,
       ``<u><b>w</b></u>:w`` and ``<b>w</b>:w`` are removed when followed by
       '.', ',' or ' '. If the text then ends with that form of the word,
       the same patterns are removed without a trailing character too.
    3. A fixed list of punctuation pairs is removed, and a leading '. ' is
       dropped.

    The line is written back as ``<field 0>\\t<cleaned field 1>``. Other
    lines are copied unchanged.
    """
    print('removing :volume_up and volume_up')
    # Applied one after another, in exactly this order.
    punctuation_debris = (
        ',,', '..', ', ,', '. .',
        ':,', ':.', ': ,', ': .',
        ';,', ';.', '; ,', '; .',
        '.,', '.,', '. ,', ', .',
        ',:', '.:', ', :', '. :',
        ',;', '.;', ', ;', '. ;',
    )
    rd = workfiles.read_lasttmp_or_output()
    cnt = workfiles.new_tmpfile()
    for line in iter(rd.readline, ''):
        terms = line.split('\t')
        if len(line) < 5 or len(terms) == 1:
            workfiles.write_tmpfile(cnt, line, 'a')
            continue
        word = terms[0].strip()
        text = terms[1].replace('\n', '')
        text = text.replace(':volume_up,', '').replace('volume_up,', '')

        for variant in (word, word.capitalize(), word.lower()):
            echoes = (
                f'{variant}:{variant}',
                f'<u>{variant}</u>:{variant}',
                f'<u><b>{variant}</b></u>:{variant}',
                f'<b>{variant}</b>:{variant}',
            )
            for tail in ('.', ',', ' '):
                for echo in echoes:
                    text = text.replace(echo + tail, '')
            # Bare echoes are only removed when the text ends with the word.
            if text.endswith(variant):
                for echo in echoes:
                    text = text.replace(echo, '')

        for debris in punctuation_debris:
            text = text.replace(debris, '')
        if text.startswith('. '):
            text = text[2:]
        workfiles.write_tmpfile(cnt, terms[0] + '\t' + text + '\n', 'a')
    rd.close()