def prepare_and_process_translation(source_text):
    """Translate *source_text*, analyze the translation, and enumerate
    subsegments of the analyzed translation.

    Returns a 4-tuple:
        translation                 -- translated text (whitespace-normalized,
                                       lowercased when args.ignoreCase)
        analyzed_translation        -- raw analyzer output for the translation
        analyzed_translation_units  -- list of (preceding-text, lexical-unit)
                                       pairs from parse()
        analyzed_tu_subsegments     -- every contiguous run of units up to
                                       args.maxTranslationLength long, as
                                       (units-slice, start_index, last_index)

    Relies on the module-level ``args`` namespace and ``pair`` language tuple.
    """
    translation = translate_text(source_text, pair, directory=args.directory)
    if args.ignoreCase:
        translation = translation.lower()
    analyzed_translation = analyze_text(translation, pair, pair[::-1],
                                        directory=args.directory)
    analyzed_translation_units = list(
        parse(analyzed_translation, withText=True))
    # Raw string: "\s" in a plain literal is an invalid escape sequence
    # (SyntaxWarning on modern CPython); the regex itself is unchanged.
    translation = re.sub(r"\s+", " ", translation)
    analyzed_tu_subsegments = []
    for length in range(1, args.maxTranslationLength + 1):
        for start_ind in range(0, len(analyzed_translation_units) - length + 1):
            last_ind = start_ind + length - 1
            analyzed_tu_subsegments.append(
                (analyzed_translation_units[start_ind:last_ind + 1],
                 start_ind, last_ind))
    return (translation, analyzed_translation, analyzed_translation_units,
            analyzed_tu_subsegments)
def function5(st_subsegment):
    """Translate one source-text subsegment and analyze the translation.

    Returns (translated subsegment, raw analysis string, parsed unit list).
    Uses the module-level ``args`` and ``pair`` globals.
    """
    translated = translate_text(st_subsegment, pair, directory=args.directory)
    translated = translated.lower() if args.ignoreCase else translated
    analysis = analyze_text(translated, pair, pair[::-1],
                            directory=args.directory)
    units = list(parse(analysis, withText=True))
    return translated, analysis, units
def process_source(source_text):
    """Analyze *source_text* and enumerate its analyzed-unit subsegments.

    Returns a 5-tuple: the raw analysis, the parsed unit list, the
    Correspondence namedtuple type, an (initially empty) correspondence
    list, and every contiguous run of units up to args.maxSourceLength
    long as (units-slice, start_index, last_index).
    """
    analysis = analyze_text(source_text, pair, pair, directory=args.directory)
    units = list(parse(analysis, withText=True))
    corr_type = namedtuple('Correspondence', ['s', 't', 'i', 'j', 'k', 'l'])
    subsegments = [
        (units[start:start + length], start, start + length - 1)
        for length in range(1, args.maxSourceLength + 1)
        for start in range(len(units) - length + 1)
    ]
    return analysis, units, corr_type, [], subsegments
def process_subsegments(st_subsegment):
    """Batch-translate and analyze subsegments via scratch files.

    Runs translate_file/analyze_file on 'st_info.txt', then reads the
    translated and analyzed results back (one entry per line).  Note
    that *st_subsegment* itself is not consulted; the input is whatever
    'st_info.txt' already contains.

    Returns (translated lines, analyzed lines, parsed unit lists).
    """
    def _read_lines(path):
        # One entry per line; surrounding newlines trimmed before splitting.
        with open(path, 'r', encoding='utf-8') as handle:
            return handle.read().strip('\n').split('\n')

    translate_file('st_info.txt', 'tt_info.txt')
    analyze_file('tt_info.txt', 'an_tt_info.txt')
    tt_subsegments = _read_lines('tt_info.txt')
    analyzed_tt_subsegments = _read_lines('an_tt_info.txt')
    parsed_units = []
    for analysis in analyzed_tt_subsegments:
        parsed_units.append(list(parse(analysis, withText=True)))
    return tt_subsegments, analyzed_tt_subsegments, parsed_units
def getCorrespondences(sourceLanguage, targetLanguage, ignoreCase, maxSourceLength, directory, maxTranslationLength, s):
    """Find subsegment correspondences between a source text and its translation.

    Every subsegment of the source text (up to *maxSourceLength* lexical
    units) is batch-translated through scratch files, re-analyzed, and then
    matched against every subsegment of the full-sentence translation (up to
    *maxTranslationLength* units).  Each match is recorded as
    Correspondence(s, t, i, j, k, l): s/t are the source/translated
    subsegment strings, i/j the character span in the source text, k/l the
    character span in the translated text.

    Relies on module-level helpers analyzeText/translateText/translate/
    analyze/parse and writes scratch files in the working directory.
    """
    pair = (sourceLanguage, targetLanguage)
    sourceText = s.lower() if ignoreCase else s  #S
    # Analyze the source text into (preceding-text, lexical-unit) pairs.
    analyzedSourceText = analyzeText(sourceText, pair, pair, directory=directory)
    analyzedSourceUnits = list(parse(analyzedSourceText, withText=True))
    Correspondence = collections.namedtuple('Correspondence', ['s', 't', 'i', 'j', 'k', 'l'])
    correspondences = []
    # Every contiguous run of source units up to maxSourceLength long,
    # with its first/last unit indexes.
    analyzedSourceUnitsSubsegments = []
    for length in range(1, maxSourceLength + 1):
        for startIndex in range(0, len(analyzedSourceUnits) - length + 1):
            lastIndex = startIndex + length - 1
            analyzedSourceUnitsSubsegments.append((analyzedSourceUnits[startIndex:lastIndex + 1], startIndex, lastIndex))  #s, i, j (analyzed units forms of them)
    # Translate the whole source text, then analyze the translation.
    translatedText = translateText(sourceText, pair, directory=directory)
    if ignoreCase:
        translatedText = translatedText.lower()
    analyzedTranslation = analyzeText(translatedText, pair, pair[::-1], directory=directory)
    analyzedTranslationUnits = list(parse(analyzedTranslation, withText=True))
    analyzedTranslationUnitsSubsegments = []
    for length in range(1, maxTranslationLength + 1):
        for startIndex in range(0, len(analyzedTranslationUnits) - length + 1):
            lastIndex = startIndex + length - 1
            analyzedTranslationUnitsSubsegments.append((analyzedTranslationUnits[startIndex:lastIndex + 1], startIndex, lastIndex))
    # Reconstruct each source subsegment's surface string together with its
    # character offsets inside the source text.
    startIndexes = []
    lastIndexes = []
    sourceTextSubsegments = []
    for analyzedSourceUnitsSubsegment, startIndexInUnits, lastIndexInUnits in analyzedSourceUnitsSubsegments:
        sourceTextSubsegment = ''  #s
        for i, (analyzedSourceUnitPreceedingText, analyzedSourceLexicalUnit) in enumerate(analyzedSourceUnitsSubsegment):
            sourceTextSubsegment += (analyzedSourceUnitPreceedingText if i != 0 else '') + analyzedSourceLexicalUnit.wordform
        startIndexInSourceText = sum(list(map(lambda x: len(x[0]) + len(x[1].wordform), analyzedSourceUnits[:startIndexInUnits]))) + len(analyzedSourceUnitsSubsegment[0][0])  #i
        lastIndexInSourceText = sum(list(map(lambda x: len(x[0]) + len(x[1].wordform), analyzedSourceUnits[:lastIndexInUnits + 1]))) - 1  #j
        startIndexes.append(startIndexInSourceText)
        lastIndexes.append(lastIndexInSourceText)
        if ignoreCase:
            sourceTextSubsegment = sourceTextSubsegment.lower()
        sourceTextSubsegments.append(sourceTextSubsegment)
    # Batch-translate all subsegments at once; each is wrapped in
    # '(( ... ))' so segment boundaries survive the pipeline.
    with open('source_text_subsegments.txt', 'w', encoding='utf-8') as file:
        for s in sourceTextSubsegments:
            file.write(('(( %s ))\n\n') % (s))
    translate('source_text_subsegments.txt', 'translated_text_subsegments.txt', directory, pair)
    analyze('translated_text_subsegments.txt', 'analyzed_translations.txt', directory, pair)
    with open('translated_text_subsegments.txt', 'r', encoding='utf-8') as file:
        translatedTextSubsegments = file.read().strip('\n').split('\n\n')
    for i in range(len(translatedTextSubsegments)):
        # NOTE(review): str.strip removes any of the listed characters, not a
        # literal prefix/suffix — confirm translated segments never begin or
        # end with '('/')' characters of their own.
        translatedTextSubsegments[i] = translatedTextSubsegments[i].strip('(( ')
        translatedTextSubsegments[i] = translatedTextSubsegments[i].strip(' ))')
        if ignoreCase:
            translatedTextSubsegments[i] = translatedTextSubsegments[i].lower()
    with open('analyzed_translations.txt', 'r', encoding='utf-8') as file:
        analyzedTranslatedTextSubsegments = file.read().strip('\n').split('\n\n')
    for i in range(len(analyzedTranslatedTextSubsegments)):
        analyzedTranslatedTextSubsegment = analyzedTranslatedTextSubsegments[i]
        translatedTextSubsegment = translatedTextSubsegments[i]
        sourceTextSubsegment = sourceTextSubsegments[i]
        startIndexInSourceText = startIndexes[i]
        lastIndexInSourceText = lastIndexes[i]
        # Strip the analyzed '((' / '))' wrapper tokens.  Raw strings: the
        # original plain literals relied on Python leaving unknown escapes
        # (\^, \(, \<, \$, \/) intact, which is a SyntaxWarning on modern
        # CPython; the regex patterns themselves are unchanged.
        analyzedTranslatedTextSubsegment = re.sub(r'\^\(\/\(\<lpar\>\$\^\(\/\(\<lpar\>\$ ', '', analyzedTranslatedTextSubsegment)
        analyzedTranslatedTextSubsegment = re.sub(r' \^\)\/\)\<rpar\>\$\^\)\/\)\<rpar\>\$', '', analyzedTranslatedTextSubsegment)
        analyzedTranslatedTextSubsegmentUnits = list(parse(analyzedTranslatedTextSubsegment, withText=True))
        # A translation subsegment matches when its lexical units (string
        # forms) equal those of the subsegment's own translation.
        subsegmentMatches = list(filter(lambda x: list(map(lambda y: str(y[1]), x[0])) == list(map(lambda z: str(z[1]), analyzedTranslatedTextSubsegmentUnits)), analyzedTranslationUnitsSubsegments))
        if subsegmentMatches:
            startIndexInTranslatedText = sum(list(map(lambda x: len(x[0]) + len(x[1].wordform), analyzedTranslationUnits[:subsegmentMatches[0][1]]))) + len(subsegmentMatches[0][0][0][0])  #k
            lastIndexInTranslatedText = sum(list(map(lambda x: len(x[0]) + len(x[1].wordform), analyzedTranslationUnits[:subsegmentMatches[0][2] + 1]))) - 1  #l
            correspondences.append(Correspondence(
                s=sourceTextSubsegment,
                t=translatedTextSubsegment,
                i=startIndexInSourceText,
                j=lastIndexInSourceText,
                k=startIndexInTranslatedText,
                l=lastIndexInTranslatedText
            ))
    return correspondences
def getCorrespondences(sourceLanguage, targetLanguage, ignoreCase, maxSourceLength, directory, maxTranslationLength, s):
    """Find subsegment correspondences between a source text and its translation.

    Per-subsegment variant: each source subsegment (up to *maxSourceLength*
    lexical units) is translated and re-analyzed individually, then matched
    against every subsegment of the full-sentence translation (up to
    *maxTranslationLength* units).  Each match is recorded as
    Correspondence(s, t, i, j, k, l): s/t are the source/translated
    subsegment strings, i/j the character span in the source text, k/l the
    character span in the translated text.

    Relies on module-level helpers analyzeText/translateText/parse.
    """
    pair = (sourceLanguage, targetLanguage)
    sourceText = s.lower() if ignoreCase else s  #S
    # Analyze the source text into (preceding-text, lexical-unit) pairs.
    analyzedSourceText = analyzeText(sourceText, pair, pair, directory=directory)
    analyzedSourceUnits = list(parse(analyzedSourceText, withText=True))
    Correspondence = collections.namedtuple('Correspondence', ['s', 't', 'i', 'j', 'k', 'l'])
    correspondences = []
    # Every contiguous run of source units up to maxSourceLength long.
    analyzedSourceUnitsSubsegments = []
    for length in range(1, maxSourceLength + 1):
        for startIndex in range(0, len(analyzedSourceUnits) - length + 1):
            lastIndex = startIndex + length - 1
            analyzedSourceUnitsSubsegments.append(
                (analyzedSourceUnits[startIndex:lastIndex + 1], startIndex,
                 lastIndex))  #s, i, j (analyzed units forms of them)
    # Translate the whole source text, then analyze the translation.
    translatedText = translateText(sourceText, pair, directory=directory)
    if ignoreCase:
        translatedText = translatedText.lower()
    analyzedTranslation = analyzeText(translatedText, pair, pair[::-1], directory=directory)
    analyzedTranslationUnits = list(parse(analyzedTranslation, withText=True))
    analyzedTranslationUnitsSubsegments = []
    for length in range(1, maxTranslationLength + 1):
        for startIndex in range(0, len(analyzedTranslationUnits) - length + 1):
            lastIndex = startIndex + length - 1
            analyzedTranslationUnitsSubsegments.append(
                (analyzedTranslationUnits[startIndex:lastIndex + 1],
                 startIndex, lastIndex))
    for analyzedSourceUnitsSubsegment, startIndexInUnits, lastIndexInUnits in analyzedSourceUnitsSubsegments:
        # Reconstruct the subsegment's surface string and its character
        # offsets inside the source text.
        sourceTextSubsegment = ''  #s
        for i, (analyzedSourceUnitPreceedingText, analyzedSourceLexicalUnit
                ) in enumerate(analyzedSourceUnitsSubsegment):
            sourceTextSubsegment += (analyzedSourceUnitPreceedingText
                                     if i != 0 else
                                     '') + analyzedSourceLexicalUnit.wordform
        startIndexInSourceText = sum(
            list(
                map(lambda x: len(x[0]) + len(x[1].wordform),
                    analyzedSourceUnits[:startIndexInUnits]))) + len(
                        analyzedSourceUnitsSubsegment[0][0])  #i
        lastIndexInSourceText = sum(
            list(
                map(lambda x: len(x[0]) + len(x[1].wordform),
                    analyzedSourceUnits[:lastIndexInUnits + 1]))) - 1  #j
        if ignoreCase:
            sourceTextSubsegment = sourceTextSubsegment.lower()
        # Translate and re-analyze this one subsegment.
        translatedTextSubsegment = translateText(sourceTextSubsegment, pair,
                                                 directory=directory)  #t
        if ignoreCase:
            translatedTextSubsegment = translatedTextSubsegment.lower()
        analyzedTranslatedTextSubsegment = analyzeText(
            translatedTextSubsegment, pair, pair[::-1], directory=directory)
        analyzedTranslatedTextSubsegmentUnits = list(
            parse(analyzedTranslatedTextSubsegment, withText=True))
        # A translation subsegment matches when its lexical units (string
        # forms) equal those of the subsegment's own translation.
        subsegmentMatches = list(
            filter(
                lambda x: list(map(lambda y: str(y[1]), x[0])) == list(
                    map(lambda z: str(z[1]),
                        analyzedTranslatedTextSubsegmentUnits)),
                analyzedTranslationUnitsSubsegments))
        if subsegmentMatches:
            startIndexInTranslatedText = sum(
                list(
                    map(lambda x: len(x[0]) + len(x[1].wordform),
                        analyzedTranslationUnits[:subsegmentMatches[0][1]]))
            ) + len(subsegmentMatches[0][0][0][0])  #k
            lastIndexInTranslatedText = sum(
                list(
                    map(lambda x: len(x[0]) + len(x[1].wordform),
                        analyzedTranslationUnits[:subsegmentMatches[0][2] +
                                                 1]))) - 1  #l
            correspondences.append(
                Correspondence(s=sourceTextSubsegment,
                               t=translatedTextSubsegment,
                               i=startIndexInSourceText,
                               j=lastIndexInSourceText,
                               k=startIndexInTranslatedText,
                               l=lastIndexInTranslatedText))
    return correspondences
# sourceText = args.text.lower() if args.ignoreCase else args.text #S if args.source_texts == None: f = sys.stdin else: f = open(args.source_texts, 'r') sen_num = 0 for sourceText in f: sourceText = sourceText.strip() sen_num = sen_num + 1 analyzedSourceText = analyzeText(sourceText, pair, pair, directory=args.directory) analyzedSourceUnits = list(parse(analyzedSourceText, withText=True)) Correspondence = collections.namedtuple('Correspondence', ['s', 't', 'i', 'j', 'k', 'l']) correspondences = [] analyzedSourceUnitsSubsegments = [] for length in range(1, args.maxSourceLength + 1): for startIndex in range(0, len(analyzedSourceUnits) - length + 1): lastIndex = startIndex + length - 1 analyzedSourceUnitsSubsegments.append( (analyzedSourceUnits[startIndex:lastIndex + 1], startIndex, lastIndex)) #s, i, j (analyzed units forms of them) translatedText = translateText(sourceText,
def getCorrespondences(sourceLanguage, targetLanguage, ignoreCase, maxSourceLength, directory, maxTranslationLength, s):
    """Find subsegment correspondences between a source text and its translation.

    TinyDB-cached variant: per-subsegment translations and analyses are
    memoized in 'cache_db2.json' under record types 'stsb_translation' and
    'trsb_analysis'.  Each match is recorded as Correspondence(s, t, i, j,
    k, l): s/t are the source/translated subsegment strings, i/j the
    character span in the source text, k/l the span in the translated text.

    Relies on module-level helpers analyzeText/translateText/parse and on
    the third-party TinyDB/Query names.
    """
    pair = (sourceLanguage, targetLanguage)
    sourceText = s.lower() if ignoreCase else s  #S
    cache_db = TinyDB('cache_db2.json')
    Data = Query()
    # Analyze the source text into (preceding-text, lexical-unit) pairs.
    analyzedSourceText = analyzeText(sourceText, pair, pair, directory=directory)
    analyzedSourceUnits = list(parse(analyzedSourceText, withText=True))
    Correspondence = collections.namedtuple('Correspondence', ['s', 't', 'i', 'j', 'k', 'l'])
    correspondences = []
    # Every contiguous run of source units up to maxSourceLength long.
    analyzedSourceUnitsSubsegments = []
    for length in range(1, maxSourceLength + 1):
        for startIndex in range(0, len(analyzedSourceUnits) - length + 1):
            lastIndex = startIndex + length - 1
            analyzedSourceUnitsSubsegments.append((analyzedSourceUnits[startIndex:lastIndex + 1], startIndex, lastIndex))  #s, i, j (analyzed units forms of them)
    # Translate the whole source text, then analyze the translation.
    translatedText = translateText(sourceText, pair, directory=directory)
    if ignoreCase:
        translatedText = translatedText.lower()
    analyzedTranslation = analyzeText(translatedText, pair, pair[::-1], directory=directory)
    analyzedTranslationUnits = list(parse(analyzedTranslation, withText=True))
    analyzedTranslationUnitsSubsegments = []
    for length in range(1, maxTranslationLength + 1):
        for startIndex in range(0, len(analyzedTranslationUnits) - length + 1):
            lastIndex = startIndex + length - 1
            analyzedTranslationUnitsSubsegments.append((analyzedTranslationUnits[startIndex:lastIndex + 1], startIndex, lastIndex))
    for analyzedSourceUnitsSubsegment, startIndexInUnits, lastIndexInUnits in analyzedSourceUnitsSubsegments:
        # Reconstruct the subsegment's surface string and its character
        # offsets inside the source text.
        sourceTextSubsegment = ''  #s
        for i, (analyzedSourceUnitPreceedingText, analyzedSourceLexicalUnit) in enumerate(analyzedSourceUnitsSubsegment):
            sourceTextSubsegment += (analyzedSourceUnitPreceedingText if i != 0 else '') + analyzedSourceLexicalUnit.wordform
        startIndexInSourceText = sum(list(map(lambda x: len(x[0]) + len(x[1].wordform), analyzedSourceUnits[:startIndexInUnits]))) + len(analyzedSourceUnitsSubsegment[0][0])  #i
        lastIndexInSourceText = sum(list(map(lambda x: len(x[0]) + len(x[1].wordform), analyzedSourceUnits[:lastIndexInUnits + 1]))) - 1  #j
        if ignoreCase:
            sourceTextSubsegment = sourceTextSubsegment.lower()
        # Translate the subsegment, going through the cache.  A single
        # search() per lookup: the original queried the DB twice (once in
        # the condition, once to fetch the value).
        translationHits = cache_db.search((Data.type == 'stsb_translation') & (Data.key == sourceTextSubsegment))
        if translationHits:
            translatedTextSubsegment = translationHits[0]['value']
        else:
            translatedTextSubsegment = translateText(sourceTextSubsegment, pair, directory=directory)  #t
            cache_db.insert({'type': 'stsb_translation', 'key': sourceTextSubsegment, 'value': translatedTextSubsegment})
        if ignoreCase:
            translatedTextSubsegment = translatedTextSubsegment.lower()
        # Analyze the translated subsegment, also through the cache.
        analysisHits = cache_db.search((Data.type == 'trsb_analysis') & (Data.key == translatedTextSubsegment))
        if analysisHits:
            analyzedTranslatedTextSubsegment = analysisHits[0]['value']
        else:
            analyzedTranslatedTextSubsegment = analyzeText(translatedTextSubsegment, pair, pair[::-1], directory=directory)
            cache_db.insert({'type': 'trsb_analysis', 'key': translatedTextSubsegment, 'value': analyzedTranslatedTextSubsegment})
        analyzedTranslatedTextSubsegmentUnits = list(parse(analyzedTranslatedTextSubsegment, withText=True))
        # A translation subsegment matches when its lexical units (string
        # forms) equal those of the subsegment's own translation.
        subsegmentMatches = list(filter(lambda x: list(map(lambda y: str(y[1]), x[0])) == list(map(lambda z: str(z[1]), analyzedTranslatedTextSubsegmentUnits)), analyzedTranslationUnitsSubsegments))
        if subsegmentMatches:
            startIndexInTranslatedText = sum(list(map(lambda x: len(x[0]) + len(x[1].wordform), analyzedTranslationUnits[:subsegmentMatches[0][1]]))) + len(subsegmentMatches[0][0][0][0])  #k
            lastIndexInTranslatedText = sum(list(map(lambda x: len(x[0]) + len(x[1].wordform), analyzedTranslationUnits[:subsegmentMatches[0][2] + 1]))) - 1  #l
            correspondences.append(Correspondence(
                s=sourceTextSubsegment,
                t=translatedTextSubsegment,
                i=startIndexInSourceText,
                j=lastIndexInSourceText,
                k=startIndexInTranslatedText,
                l=lastIndexInTranslatedText
            ))
    return correspondences