def phrase_ext():
    phrase_list = []
    for i in range(length_dataset):
        alignment_list = []
        list_fr = sentences_fr[i].split()
        list_en = sentences_en[i].split()
        for j in range(len(list_en)):
            alignment_tuple = ()
            word = final_dict[list_en[j]]
            for k in range(len(list_fr)):
                if list_fr[k] == word:
                    alignment_tuple = (j, k)
            # skip English words with no French counterpart: an empty
            # tuple here would break phrase_extraction downstream
            if alignment_tuple:
                alignment_list.append(alignment_tuple)
        phrases = sorted(phrase_extraction(sentences_en[i], sentences_fr[i],
                                           alignment_list))
        for phrase in phrases:
            phrase_list.append(list(phrase))
    return phrase_list
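# For reference, a minimal, self-contained run of nltk's phrase_extraction.
# The sentence pair and hand-made alignment below are the ones from the NLTK
# documentation; the output tuples have the shape
# ((e_start, e_end), (f_start, f_end), e_phrase, f_phrase), which is what the
# snippets in this section index with [2] and [3].
from nltk.translate.phrase_based import phrase_extraction

srctext = "michael assumes that he will stay in the house"
trgtext = "michael geht davon aus , dass er im haus bleibt"
alignment = [(0, 0), (1, 1), (1, 2), (1, 3), (2, 5), (3, 6),
             (4, 9), (5, 9), (6, 7), (7, 7), (8, 8)]

for phrase in sorted(phrase_extraction(srctext, trgtext, alignment)):
    print(phrase)  # e.g. ((0, 1), (0, 1), 'michael', 'michael')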
def phrase_bases_extraction(filename, foreign):
    corpus = load(filename)
    srctext = [pair[foreign] for pair in corpus]
    trgtext = [pair['en'] for pair in corpus]
    with open(foreign + '.pickle', 'rb') as infile:
        aligned = pickle.load(infile)

    phrase_list = []
    for i in range(len(srctext)):
        phrases = pb.phrase_extraction(srctext[i], trgtext[i], aligned[i])
        phrase_list.append(phrases)

    # rank each (foreign, english) phrase pair by count(f, e) / count(f)
    ranks = {}
    for phrases in phrase_list:
        for phrase in phrases:
            count_num = 0
            count_den = 0
            fr = phrase[2]
            eng = phrase[3]
            for pair in corpus:
                if fr in pair[foreign]:
                    count_den += 1
                    if eng in pair['en']:
                        count_num += 1
            ranks[(fr, eng)] = count_num / count_den
    sorted_x = sorted(ranks.items(), key=operator.itemgetter(1), reverse=True)
    pprint(sorted_x)
def build_phrases(bitext):
    print("--- Building phrases")
    phrases = []
    for b in bitext:
        bitext_words = ' '.join(word for word in b.words if word != '')
        bitext_mots = ' '.join(mot for mot in b.mots if mot != '')
        # the trailing 2 caps extracted phrases at two words
        phrase = phrase_based.phrase_extraction(bitext_words, bitext_mots,
                                                b.alignment, 2)
        phrases.append(phrase)
    return phrases
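# A possible way to call build_phrases, assuming the bitext holds nltk
# AlignedSent objects whose .alignment fields were filled in by training
# IBM Model 1 (the toy sentence pair is made up for illustration); note that
# alignments containing None may need filtering first, as a later snippet
# in this section does.
from nltk.translate import AlignedSent, IBMModel1

toy_bitext = [AlignedSent(['the', 'house'], ['das', 'haus'])]
IBMModel1(toy_bitext, 5)        # training also sets each sentence's .alignment
phrase_sets = build_phrases(toy_bitext)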
def task_3(parallel_corpus, phrase_extraction_corpus_en,
           phrase_extraction_corpus_fr, alignments_pred):
    """
    Task 3: Utility for calculating phrase based extraction scoring
    :param parallel_corpus: Processed Bitext
    :param phrase_extraction_corpus_en: English sentences for phrase extraction
    :param phrase_extraction_corpus_fr: French sentences for phrase extraction
    :param alignments_pred: Alignment list computed in task 1
    :return: execution time
    """
    start = time.process_time()
    print("Phrase Extraction")
    en_fr_phrases = []
    fr_phrases = []
    for i in range(len(phrase_extraction_corpus_en)):
        phrases = phrase_based.phrase_extraction(
            phrase_extraction_corpus_en[i],
            phrase_extraction_corpus_fr[i],
            alignments_pred[phrase_extraction_corpus_en[i]])
        for _, _, e_ph, f_ph in sorted(phrases):
            en_fr_phrases.append((e_ph, f_ph))
            fr_phrases.append(f_ph)
    # score each pair by count(e, f) / count(f) and print in descending order
    en_fr_phrases_count = Counter(en_fr_phrases)
    fr_phrases_count = Counter(fr_phrases)
    result = []
    for e, f in en_fr_phrases:
        result.append(
            ((en_fr_phrases_count[(e, f)] / fr_phrases_count[f]), (e, f)))
    for i in reversed(sorted(set(result))):
        print(i)
    end = time.process_time()
    exec_time = str(end - start)
    return exec_time
def extract_phrase_table(self, bisent, alignment):
    PT = []
    srctext = bisent[0].tokenized_sentence()
    trgtext = bisent[1].tokenized_sentence()
    phrase_pairs = phrase_extraction(srctext, trgtext, alignment)
    for p in phrase_pairs:
        PT.append((self.apply_offset(p[0], bisent[0].offset),
                   self.apply_offset(p[1], bisent[1].offset)))
    return PT
# examples = get_examples(settings)
# ibm, corpus = use_IBM1(corpus, settings)
# current_probs, corpus = drive(corpus)
# foreign_eng contains the pairs of foreign and english phrases and their corresponding count
# english contains the english phrases and their corresponding count
with open(examples, 'r', encoding='utf-8') as f:
    corpus = json.load(f)
prob_table, aligned_obj = drive(corpus)
foreign_eng = {}
english = {}
for i in range(len(aligned_obj)):
    print(corpus[i][native_lang])
    print(corpus[i][foreign_lang])
    phrases = phrase_based.phrase_extraction(corpus[i][native_lang],
                                             corpus[i][foreign_lang],
                                             aligned_obj[i][2])
    # use a distinct loop variable so the sentence index i is not clobbered
    for phrase in sorted(phrases):
        t = (phrase[2], phrase[3])
        if t not in foreign_eng:
            # first occurrence of this phrase pair
            foreign_eng[t] = 1
        else:
            # phrase pair already seen: increase its count
            foreign_eng[t] += 1
        if phrase[2] not in english:
            # first occurrence of this english phrase
            english[phrase[2]] = 1
        else:
            # english phrase already seen: increase its count
            english[phrase[2]] += 1
# scores of the phrases are determined by the formula score(f, e) = count(f, e) / count(f)
scores = {}
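# A hedged sketch of how the empty scores dict above could be filled in; the
# snippet stops right after declaring it, so this continuation simply applies
# the stated formula to the two count dicts that were just built.
for (eng_phrase, for_phrase), pair_count in foreign_eng.items():
    scores[(eng_phrase, for_phrase)] = pair_count / english[eng_phrase]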
def main():
    start = time.time()
    # parse the json file
    with open(FILE, 'r') as f:
        corpus = json.load(f)
    Model1_table, aligned1 = IBM_Model_1(corpus)
    alignments_of_1 = []
    words_of_1 = []
    mots_of_1 = []
    # store the information obtained from IBM Model 1 in lists
    for test in aligned1:
        alignments_of_1.append(test.alignment)
        words_of_1.append(test.words)
        mots_of_1.append(test.mots)
    print("")
    c = 0
    # traverse the corpus
    for x in corpus:
        srctext = x[SOURCE_LANGUAGE]
        destext = x[DESTINATION_LANGUAGE]
        align = alignments_of_1[c]
        print("Source sentence:")
        print(words_of_1[c])
        print("Destination sentence:")
        print(mots_of_1[c])
        print("Alignment:")
        print(align)
        print("")
        c = c + 1
        sorted_phrase_score = list()
        # call the library function to extract phrases for this sentence pair
        phrases = phrase_extraction(srctext, destext, align)
        for i in phrases:
            SOURCE_phrase = i[2]
            DESTINATION_phrase = i[3]
            count_numerator = 0.0
            count_denominator = 0.0
            for y in corpus:
                # check if both phrases are in the sentence pair
                if (SOURCE_phrase in y[SOURCE_LANGUAGE]
                        and DESTINATION_phrase in y[DESTINATION_LANGUAGE]):
                    count_numerator = count_numerator + 1
                # check if the source phrase is in the source sentence
                if SOURCE_phrase in y[SOURCE_LANGUAGE]:
                    count_denominator = count_denominator + 1
            # calculate the phrase score
            phrase_score = count_numerator / count_denominator
            # add the phrase score to a list
            sorted_phrase_score.append(
                (phrase_score, SOURCE_phrase, DESTINATION_phrase))
        # print the output in descending order of the phrase score
        for values in sorted(sorted_phrase_score, reverse=True):
            print("Source phrase:")
            print(values[1])
            print("Destination phrase:")
            print(values[2])
            print("Phrase Score:")
            print(values[0])
            print("")
    # print runtime
    print("")
    print("Time:")
    print(time.time() - start)
from nltk.translate import phrase_based
from part2 import use_IBM1, get_examples, get_data
import pprint
# from part1 import drive

if __name__ == '__main__':
    corpus, source_set, target_set, settings = get_data()
    examples = get_examples(settings)
    ibm, corpus = use_IBM1(corpus, settings)
    # foreign_eng contains the pairs of source and target phrases and their corresponding count
    # english contains the source phrases and their corresponding count
    foreign_eng = {}
    english = {}
    for i in range(len(examples)):
        phrases = phrase_based.phrase_extraction(
            examples[i][settings['source']],
            examples[i][settings['target']],
            corpus[i].alignment)
        # use a distinct loop variable so the sentence index i is not clobbered
        for phrase in sorted(phrases):
            # tuple containing the source and the translation pair
            t = (phrase[2], phrase[3])
            if t not in foreign_eng:
                # first occurrence of this phrase pair
                foreign_eng[t] = 1
            else:
                # phrase pair already seen: increase its count
                foreign_eng[t] += 1
            if phrase[2] not in english:
                # first occurrence of this source phrase
                english[phrase[2]] = 1
            else:
                # source phrase already seen: increase its count
                english[phrase[2]] += 1
    # scores of the phrases are determined by the formula score(t, s) = count(t, s) / count(s)
    scores = {}
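    # A hedged continuation: this script also ends with scores empty, so the
    # sketch below applies the stated formula and prints the pairs in
    # descending order of score, mirroring the rankings printed by the other
    # snippets in this section.
    for (src_phrase, trg_phrase), pair_count in foreign_eng.items():
        scores[(src_phrase, trg_phrase)] = pair_count / english[src_phrase]
    pprint.pprint(sorted(scores.items(), key=lambda kv: kv[1], reverse=True))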
    for item in tuple(bitext[i].alignment):
        # only keep word pairings where neither of the words is None
        if None not in item:
            newAlignment.append(item)
    bitext[i].alignment = Alignment(newAlignment)

all_phrases = []
for pair in bitext:
    srctext = ' '.join(word for word in pair.words)
    trgtext = ' '.join(word for word in pair.mots)
    alignment = tuple(pair.alignment)
    phrases = phrase_extraction(srctext, trgtext, alignment)
    for phrase in phrases:
        all_phrases.append(phrase)

# build dict matching english phrases to spanish phrases
phrase_occ = {}
for row in all_phrases:
    src = row[2]
    trg = row[3]
    if src not in phrase_occ:
        translations = defaultdict()
        translations[trg] = 1
        phrase_occ[src] = translations
    elif trg not in phrase_occ[src]:
        phrase_occ[src][trg] = 1
    else:
        # pair already seen: increase its count
        phrase_occ[src][trg] += 1
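# A hedged follow-up: once phrase_occ holds raw counts, each translation can
# be turned into a relative frequency. The name phrase_prob is an assumption;
# the original fragment ends at the counting stage.
phrase_prob = {}
for src, translations in phrase_occ.items():
    total = sum(translations.values())
    phrase_prob[src] = {trg: n / total for trg, n in translations.items()}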
# Construct the source and target texts
srctext = ' '.join(e_word for e_word in e)
trgtext = ' '.join(f_word for f_word in f)

# Obtain phrase tuples from the phrase_extraction module
phrases = phrase_extraction(srctext, trgtext, align_ibm)
for phrase in sorted(phrases):
    en_phrase = phrase[2]  # English phrase
    fr_phrase = phrase[3]  # French phrase
    # Increment count of the French phrase
    if fr_phrase not in count_fr_phrase:
        count_fr_phrase[fr_phrase] = 1
    else:
        count_fr_phrase[fr_phrase] += 1
    # Increment count of the pair of English phrase, French phrase
    if (en_phrase, fr_phrase) not in count_en_fr_phrase:
        count_en_fr_phrase[(en_phrase, fr_phrase)] = 1
    else:
        count_en_fr_phrase[(en_phrase, fr_phrase)] += 1
A_text.append(AlignedSent(l_4, l_3))
ib1 = i1.IBMModel1(A_text, 5)

# collect the alignment pairs produced by IBM Model 1 for each sentence;
# phrase_extraction expects (source index, target index) pairs, so keep the
# whole pair and drop alignments involving None
l_align = []
for sent in A_text:
    pairs = []
    for pair in sent.alignment:
        if None not in pair:
            pairs.append(pair)
    l_align.append(pairs)

phrase_pair_count = {}    # (eng phrase, ger phrase, eng word count) -> count
target_phrase_count = {}  # ger phrase -> count
for i in range(len(english)):
    phrases = pb.phrase_extraction(english[i], German[i], l_align[i])
    for j in sorted(phrases):
        k = (j[2], j[3], len(j[2].split()))
        l = j[3]
        if k in phrase_pair_count:
            phrase_pair_count[k] += 1
        else:
            phrase_pair_count[k] = 1
        if l in target_phrase_count:
            target_phrase_count[l] += 1
        else:
            target_phrase_count[l] = 1

# normalise: score = count(pair) / count(target phrase)
for k in phrase_pair_count:
    phrase_pair_count[k] /= target_phrase_count[k[1]]
def main():
    '''
    This is the core logic of our program.
    '''
    test_corpus = True
    custom_corpus = False
    cwd = getcwd()

    # read data from the given datasets
    with open(cwd + '\\data1.json') as f:
        json_data1 = f.read()
    with open(cwd + '\\data2.json') as f:
        json_data2 = f.read()
    with open(cwd + '\\Alternative Corpus\\parallel.json') as f:
        json_data3 = f.read()
    # data is in JSON format and hence needs to be parsed
    data1 = json.loads(json_data1)
    data2 = json.loads(json_data2)
    data3 = json.loads(json_data3)

    # create an aligned corpus for phrase extraction
    bitext = []
    for sentence in data1:
        bitext.append(
            AlignedSent(word_tokenize(sentence['fr'], language='french'),
                        word_tokenize(sentence['en'], language='english')))
    # run the model (model 1) for 10 iterations
    model1 = IBMModel1(bitext, 10)
    # get the word translation table
    translate_table_1 = model1.translation_table
    # extract alignments and show them
    alignments_extracted_1 = [temp.alignment for temp in bitext]
    print(alignments_extracted_1)

    # similarly run model 2 and collect its results
    bitext = []
    for sentence in data1:
        bitext.append(
            AlignedSent(word_tokenize(sentence['fr'], language='french'),
                        word_tokenize(sentence['en'], language='english')))
    model2 = IBMModel2(bitext, 10)
    translate_table_2 = model2.translation_table
    alignments_extracted_2 = [temp.alignment for temp in bitext]
    print('finished\n')

    # if this is true, run the phrase translation model for data2.json
    if test_corpus:
        bitext_test = []
        # get the parallel sentences here
        for sentence in data2:
            bitext_test.append(
                AlignedSent(word_tokenize(sentence['fr'], language='french'),
                            word_tokenize(sentence['en'], language='english')))
        # test model for extracting phrases, MODEL 1, used to extract phrases
        test_model = IBMModel1(bitext_test, 10)
        phrases = phrase_extraction(data2[0]['fr'], data2[0]['en'],
                                    bitext_test[0].alignment)
        for i in phrases:
            print(i, '\n\n')

        # count phrase pairs (countef) and source phrases (countf), then
        # rank each translation by countef / countf
        countef = defaultdict()
        countf = defaultdict()
        for sent in range(len(data2)):
            # use each sentence's own alignment rather than the first one
            phrases = phrase_extraction(data2[sent]['fr'], data2[sent]['en'],
                                        bitext_test[sent].alignment)
            for phrase in phrases:
                pair = (phrase[2], phrase[3])
                print(pair)
                if pair not in countef:
                    countef[pair] = 1
                else:
                    countef[pair] = countef[pair] + 1
        for word in countef:
            fword = word[0]
            if fword not in countf:
                countf[fword] = countef[word]
            else:
                countf[fword] = countf[fword] + countef[word]
        print('ranks: \n\n')
        final = defaultdict(dict)
        for word in countef:
            val = countef[word] / countf[word[0]]
            final[word[0]][word[1]] = val
        for entity in final:
            current = final[entity]
            print(entity)
            d_descending = sorted(current.items(),
                                  key=lambda kv: kv[1],
                                  reverse=True)
            for i in d_descending:
                print(i)
            print("\n")

    # if this is set as true, do the same for the dataset that we generated
    if custom_corpus:
        bitext_test = []
        for sentence in data3:
            bitext_test.append(
                AlignedSent(word_tokenize(sentence['gr'], language='german'),
                            word_tokenize(sentence['en'], language='english')))
        # test model for extracting phrases, MODEL 1
        test_model = IBMModel1(bitext_test, 10)

        # count phrase pairs and source phrases for the custom corpus, then
        # rank each translation by countef / countf, exactly as above
        countef = defaultdict()
        countf = defaultdict()
        for sent in range(len(data3)):
            # use each sentence's own alignment rather than the first one
            phrases = phrase_extraction(data3[sent]['gr'], data3[sent]['en'],
                                        bitext_test[sent].alignment)
            for phrase in phrases:
                pair = (phrase[2], phrase[3])
                if pair not in countef:
                    countef[pair] = 1
                else:
                    countef[pair] = countef[pair] + 1
        for word in countef:
            fword = word[0]
            if fword not in countf:
                countf[fword] = countef[word]
            else:
                countf[fword] = countf[fword] + countef[word]
        print('ranks: \n\n')
        final = defaultdict(dict)
        for word in countef:
            val = countef[word] / countf[word[0]]
            final[word[0]][word[1]] = val
        for entity in final:
            current = final[entity]
            print(entity)
            d_descending = sorted(current.items(),
                                  key=lambda kv: kv[1],
                                  reverse=True)
            for i in d_descending:
                print(i)
            print("\n")