def runCarmelAndGetFracCounts(run_true_training, run_noise_training, fractional_counts_channel, trained_cipher_file):
    total_true_corpus_probability = runCarmel(run_true_training)
    total_noise_corpus_probability = runCarmel(run_noise_training)
    print 'reading language fractional counts from', trained_cipher_file
    emMethods.readCarmelFractionalCounts(trained_cipher_file, fractional_counts_channel, 'channel')
    print 'read the fst'
    return (total_true_corpus_probability, total_noise_corpus_probability)
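
# runCarmel is called throughout this file but not defined in this section.
# A minimal sketch of what it plausibly does, assuming Carmel reports the
# corpus probability in a line matched by the module-level probability_re
# (both names are taken from the surrounding code; the exact output format
# and the two-character prefix stripping are assumptions inferred from the
# parsing done in the main() variants below):
def runCarmelSketch(command):
    """Hypothetical helper: run a Carmel command and return the corpus log
    probability (base e) parsed from its output, or None if no match."""
    (status, output) = commands.getstatusoutput(command)
    prob_match = probability_re.search(output)
    if prob_match is None:
        print 'we should have found a probability'
        return None
    # mirror the main() variants: strip the first two characters of the
    # match (presumably a "2^" prefix) and rescale by ln(2) = 0.693147181
    return 0.693147181 * float(prob_match.group(1)[2:])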
# NOTE: main() is redefined several times in this file; in Python only the
# last definition is bound at call time, so the earlier definitions are
# legacy variants of the training loop.
def main():
    # setting up options
    parser = OptionParser()
    parser.add_option("--num_iter", action="store", type="int", dest="num_iterations", default=100,
                      help="number of iterations you would like to run em+smoothed l0. Default is 100")
    parser.add_option("--initial_alpha", action="store", type="float", dest="initial_alpha", default=0.0,
                      help="initial_alpha, the weight of the smoothed l0 penalty. Default is 0")
    parser.add_option("--final_alpha", action="store", type="float", dest="final_alpha", default=0.0,
                      help="final_alpha, the weight of the smoothed l0 penalty. Default is 0")
    parser.add_option("--beta", action="store", type="float", dest="beta", default=0.5,
                      help="beta, the smoothness of the l0 prior; the smaller the beta, the closer the approximation to the true L0. Default is 0.5")
    parser.add_option("--slack", action="store_true", dest="slack_option", default=False,
                      help="project on the simplex with slack.")
    parser.add_option("--noslack", action="store_false", dest="slack_option", default=False,
                      help="project on the simplex with no slack. This is the regular projection approach")
    parser.add_option("--num_pgd_iterations", action="store", type="int", dest="num_pgd_iterations", default=10,
                      help="Number of Projected Gradient Descent iterations to run. Default is 10")
    parser.add_option("--eta", action="store", type="float", dest="eta", default=0.1,
                      help="Eta, the constant step size for PGD using armijo line search. Default is 0.1")
    parser.add_option("--armijo_beta", action="store", type="float", dest="armijo_beta", default=0.5,
                      help="Set value for Armijo beta, the beta used in armijo line search. Default is 0.5")
    parser.add_option("--armijo_sigma", action="store", type="float", dest="armijo_sigma", default=0.5,
                      help="Set value for Armijo sigma, the sigma used in armijo line search. Lower bound is 0.0001")
    parser.add_option("--lower_bound", action="store", type="float", dest="lower_bound", default=0.000001,
                      help="Set value for the lower bound on the probability. Default is 1e-6")
    parser.add_option("--cipher_data_file", action="store", type="string", dest="cipher_data_file", default='cipher.data',
                      help="Cipher data file for training")
    parser.add_option("--cipher_noq_file", action="store", type="string", dest="cipher_noq_file", default='cipher.noq',
                      help="Cipher data without quotes")
    parser.add_option("--cipher_decode_file", action="store", type="string", dest="cipher_decode_file", default='cipher.decode',
                      help="Cipher file for decoding")
    parser.add_option("--cipher_gold_file", action="store", type="string", dest="cipher_gold_file", default='cipher.gold',
                      help="The correct decipherment")
    parser.add_option("--lm", action="store", type="string", dest="lm", default='lm.carmel',
                      help="The lm file")
    parser.add_option("--noe_lm", action="store", type="string", dest="noe_lm", default='lm.carmel',
                      help="The noe lm file")
    parser.add_option("--gaussian_params_file", action="store", type="string", dest="gaussian_params_file", default=None,
                      help="The means and variances for each gaussian letter")
    parser.add_option("--unigram_probs_file", action="store", type="string", dest="unigram_probs_file", default=None,
                      help="The unigram probabilities for the plaintext letters")
    parser.add_option("--std_mult", action="store", type="float", dest="std_mult", default=1.,
                      help="The multiplier for the std")
    parser.add_option("--u", action="store_true", dest="uniform_init", default=False)
    parser.add_option("--g", action="store_true", dest="gaussian_init", default=False)
    parser.add_option("--i", action="store_true", dest="identity_init", default=False)
    parser.add_option("--hpc", action="store_true", dest="hpc", default=False)
    parser.add_option("--full_fst", action="store_true", dest="full_fst", default=False)
    parser.add_option("--posterior_decoding", action="store_true", dest="posterior_decoding", default=False)
    (options, args) = parser.parse_args()
    print options
    print args

    # getting the values from optparse
    num_iterations = options.num_iterations
    initial_alpha = options.initial_alpha
    final_alpha = options.final_alpha
    beta = options.beta
    slack_option = options.slack_option
    num_pgd_iterations = options.num_pgd_iterations
    eta = options.eta
    armijo_beta = options.armijo_beta
    armijo_sigma = options.armijo_sigma
    lower_bound = options.lower_bound
    cipher_data_file = options.cipher_data_file
    cipher_decode_file = options.cipher_decode_file
    cipher_noq_file = options.cipher_noq_file
    cipher_gold_file = options.cipher_gold_file
    lm = options.lm
    noe_lm = options.noe_lm
    gaussian_params_file = options.gaussian_params_file
    unigram_probs_file = options.unigram_probs_file
    uniform_init = options.uniform_init
    gaussian_init = options.gaussian_init
    identity_init = options.identity_init
    hpc = options.hpc
    std_mult = options.std_mult
    full_fst = options.full_fst
    posterior_decoding_flag = options.posterior_decoding

    # setting up parameters
    fractional_counts_language = {}
    fractional_counts_channel = {}
    probabilities_channel = {}
    probabilities_language = {}
    current_probabilities = {}
    current_fractional_counts = {}
    #constraint_parameters = []
    initial_parameter_args = {}
    current_initial_parameter_args = {}
    parameter_to_index = {}
    parameter_counter = 0
    num_constraints = 1
    # the key is the constraint id and the value is the tag corresponding to that constraint
    constraint_tags_dict = {}
    global_num_parameters = 0
    init_option = ''
    # this will hold the tag of the constraint for which we are doing the optimization
    current_optimization_tag = ''
    '''
    print 'beta is ', beta
    print 'alpha is ', alpha
    print 'eta is ', eta
    raw_input()
    '''
    gold_cipher = emMethods.readCipherFile(cipher_gold_file)
    ciphertext = emMethods.readCipherFile(cipher_noq_file)
    ciphertext_with_spaces = open(cipher_noq_file).readline().strip().split()
    print gold_cipher
    cipher_letter_dict = emMethods.getUniqCipherLetters(cipher_noq_file)
    cipher_probs = defaultdict(float)
    emMethods.getCipherLetterProbs(cipher_probs, cipher_noq_file)
    print 'cipher probs are '
    #gaussians = defaultdict(list)
    #emMethods.readGaussians(gaussians, gaussian_params_file)
    plain_unigram_probs = dict((line.strip().split()[0], float(line.strip().split()[1]))
                               for line in open(unigram_probs_file))
    # the underscore (space) is handled separately from the cipher letters
    del cipher_letter_dict['_']
    plaintext_letters = []
    ciphertext_letters = []
    for k in range(65, 91):
        plaintext_letters.append(chr(k))
        ciphertext_letters.append(chr(k).lower())
    print plaintext_letters
    print 'the number of unique cipher letters is %d' % len(cipher_letter_dict.keys())
    print cipher_letter_dict
    num_cipher_letters = len(cipher_letter_dict.keys())
    num_plain_letters = 26
    free_parameters_channel = defaultdict(lambda: defaultdict(float))
    free_parameters_language = defaultdict(lambda: defaultdict(float))
    print 'starting to create parameters'
    total_language_parameters = 0
    total_channel_parameters = 0
    if full_fst == True:
        emMethods.getFreeCipherParametersChannel(plaintext_letters, free_parameters_channel)
        num_cipher_letters = 26
    else:
        emMethods.getFreeCipherParametersChannel(plaintext_letters, free_parameters_channel,
                                                 cipher_letter_dict=cipher_letter_dict)
    temp = {'_': 0.0}
    free_parameters_channel['_'] = temp
    # now we build all the lattices, creating a special start node and end node for every sentence
    start_node_end_node_list = []
    fractional_counts_channel = copy.deepcopy(free_parameters_channel)
    probabilities_channel = copy.deepcopy(free_parameters_channel)
    #createRandomPoint(probabilities_channel)
    if uniform_init == True:
        print 'uniform initialization'
        emMethods.initUniformProbs(probabilities_channel)
    if gaussian_init == True:
        print 'gaussian initialization'
        #emMethods.initFromGaussians(probabilities_channel, gaussians, cipher_probs, std_mult)
        emMethods.initFromGaussiansSingleStd(probabilities_channel, plain_unigram_probs, cipher_probs, std_mult)
    if identity_init == True:
        emMethods.initIdentity(probabilities_channel)
    emMethods.writeFst('cipher.fst', probabilities_channel)
    final_probabilities_channel = copy.deepcopy(free_parameters_channel)
    run_training = ''
    if hpc == True:
        run_training = "/home/nlg-05/vaswani/graehl/carmel/bin/linux64/carmel --train-cascade -u -M 0 -m -HJ %s %s cipher.fst" % (cipher_data_file, lm)
        run_posterior_decoding = "/home/nlg-05/vaswani/graehl/carmel/bin/linux64/carmel --train-cascade -u -M 0 -m -HJ %s %s posterior_decode.fst" % (cipher_data_file, lm)
    else:
        run_training = "/Users/avaswani/graehl/carmel/bin/macosx/carmel --train-cascade -u -M 0 -m -HJ %s %s cipher.fst" % (cipher_data_file, lm)
        run_posterior_decoding = "/Users/avaswani/graehl/carmel/bin/macosx/carmel --train-cascade -u -M 0 -m -HJ %s %s posterior_decode.fst" % (cipher_data_file, lm)

    # running the EM iterations.
    # We create the parameter indexes here; the probabilities are already
    # initialized, so none of them will be zero.
    index_to_parameter = {}
    createParametersForScaling(probabilities_channel, parameter_to_index, index_to_parameter)
    start_time = time.clock()
    print 'start time was ', start_time
    alpha_delta = (final_alpha - initial_alpha) / (num_iterations - 1)
    current_alpha = initial_alpha
    for i in range(0, num_iterations):
        print 'the iteration number was ', i
        total_corpus_probability = 0.0
        (status, output) = commands.getstatusoutput(run_training)
        print 'we just ran the training'
        print output
        print status
        prob_match = probability_re.search(output)
        print 'current alpha is ', current_alpha
        if prob_match == None:
            print 'we should have found a probability'
        else:
            print 'the probability is %s' % prob_match.group(1)
            # strip the "2^" prefix and convert from log2 to ln: ln(2) = 0.693147181
            temp_corpus_probability = float(prob_match.group(1)[2:])
            total_corpus_probability = 0.693147181 * temp_corpus_probability
        print 'reading language fractional counts'
        emMethods.readCarmelFractionalCounts('cipher.fst.trained', fractional_counts_channel, 'channel')
        print 'read the fst'
        print 'the probability of the corpus was %f' % total_corpus_probability
        print 'we are now checking the accuracies'
        noe_command = 'cat tagging.fsa | sed \'s/*e*//g\' > tagging.fsa.noe'
        (status, output) = commands.getstatusoutput(noe_command)
        print 'we wrote the noe fsa'
        if hpc == True:
            viterbi_command = "cat %s | /home/nlg-05/vaswani/graehl/carmel/bin/linux64/carmel -u -srbk 1 -QEWI %s cipher.fst > decipherment_output" % (cipher_decode_file, noe_lm)
        else:
            viterbi_command = "cat %s | /Users/avaswani/graehl/carmel/bin/macosx/carmel -u -srbk 1 -QEWI %s cipher.fst > decipherment_output" % (cipher_decode_file, noe_lm)
        (status, output) = commands.getstatusoutput(viterbi_command)
        print 'status', status
        print 'output', output
        deciphered_sequence = emMethods.readCipherFile('decipherment_output')
        print 'length of deciphered sequence was ', len(deciphered_sequence)
        accuracy = emMethods.calcAccuracy(gold_cipher, deciphered_sequence)
        print 'The accuracy was %s and the objective function value was %s' % (
            str(accuracy),
            str(evaluateObjectiveFuncValue(total_corpus_probability, probabilities_language,
                                           probabilities_channel, current_alpha, beta)))
        # first populating the expected counts matrix from the channel fractional counts
        expected_counts = zeros(shape=(num_plain_letters, num_plain_letters))
        emMethods.dictionaryToArray(expected_counts, fractional_counts_channel, parameter_to_index)
        '''
        print 'Doing posterior decoding'
        # decode with the Hungarian algorithm; J is not a matrix, it is pairs of coordinates
        m = Munkres()
        J = m.compute(-expected_counts)
        key = dict((index_to_parameter[plaintext_letters[n]][J[n][1]], plaintext_letters[J[n][0]])
                   for n in range(len(ciphertext_letters)))
        print 'the key was ', key
        posterior_decoded_string = []
        for letter in ciphertext:
            if letter == '_':
                posterior_decoded_string.append('_')
            else:
                posterior_decoded_string.append(key[letter])
        posterior_decoded_accuracy = emMethods.calcAccuracy(gold_cipher, posterior_decoded_string)
        print 'The posterior decoded accuracy was', posterior_decoded_accuracy
        '''
        pdgRowAndColumnConstraints(probabilities_channel, parameter_to_index, num_pgd_iterations,
                                   current_alpha, beta, eta, lower_bound, armijo_beta, armijo_sigma,
                                   num_plain_letters, num_cipher_letters, expected_counts=expected_counts)
        # now writing the fst back again
        emMethods.writeFst('cipher.fst', probabilities_channel)
        # if the user has asked for posterior decoding, then we must do it
        if posterior_decoding_flag == True:
            print 'running command', run_posterior_decoding
            emMethods.writePosteriorDecodingFST(ciphertext_with_spaces, probabilities_channel, 'posterior_decode.fst')
            (status, output) = commands.getstatusoutput(run_posterior_decoding)
            print 'we just ran posterior decoding'
            print output
            print status
            prob_match = probability_re.search(output)
            if prob_match == None:
                print 'we should have found a probability in posterior decoding'
            else:
                print 'the posterior decoding probability is %s' % prob_match.group(1)
                temp_corpus_probability = float(prob_match.group(1)[2:])
                total_corpus_probability = 0.693147181 * temp_corpus_probability
                print 'the posterior decoding probability in base e is', total_corpus_probability
            print 'Getting the posterior decode'
            posterior_decode = emMethods.getPosteriorDecode('posterior_decode.fst.trained')
            print 'the posterior decode was ', posterior_decode
            posterior_decode_accuracy = emMethods.calcAccuracy(gold_cipher, posterior_decode)
            print 'The posterior decoded accuracy was ', posterior_decode_accuracy
        fractional_counts_channel = copy.deepcopy(free_parameters_channel)
        final_probabilities_channel = copy.deepcopy(probabilities_channel)
        print 'at the end of the iteration'
        current_alpha += alpha_delta
    elapsed_time = time.clock() - start_time
    print 'the elapsed time was ', elapsed_time
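
# pdgRowAndColumnConstraints (called above) is defined elsewhere. The basic
# building block it needs is a Euclidean projection onto the probability
# simplex; a self-contained sketch of the standard sort-and-threshold
# projection (as in Duchi et al. 2008) follows. The function name and its
# use here are illustrative, not the exact helper this script calls:
def projectOntoSimplexSketch(v, z=1.0):
    """Project the vector v onto the simplex {x : x >= 0, sum(x) = z}."""
    u = sorted(v, reverse=True)
    cumulative = 0.0
    theta = 0.0
    for j in range(len(u)):
        cumulative += u[j]
        # keep the largest j such that u[j] - (sum of top j+1 entries - z)/(j+1) > 0
        if u[j] - (cumulative - z) / (j + 1) > 0:
            theta = (cumulative - z) / (j + 1)
    return [max(x - theta, 0.0) for x in v]
# Example: projectOntoSimplexSketch([0.5, 0.5, 1.0]) -> [0.1667, 0.1667, 0.6667],
# which is nonnegative and sums to 1.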
def main():
    global fractional_counts_language, fractional_counts_channel, probabilities_channel, \
        probabilities_language, sigma, alpha, current_fractional_counts, \
        current_optimization_params, init_option, initial_parameter_args, \
        constraint_tags_dict, parameter_counter, current_optimization_tag
    num_iterations = int(sys.argv[1])
    alpha = float(sys.argv[2])
    sigma = float(sys.argv[3])
    dictionary = emMethods.createDictionary('complete.dict.new-formatted')  # .small
    word_lines = emMethods.readWordLines('test.words.new-formatted')
    #word_list_five = emMethods.readWordList('TEXT.3.linear')
    gold_tag_sequence = emMethods.readWordList('test.tags.new-formatted.linear')
    free_parameters_channel = {}
    free_parameters_language = {}
    print 'starting to create parameters'
    total_language_parameters = 0
    total_channel_parameters = 0
    for line in word_lines:
        (language_parameters, channel_parameters) = emMethods.getFreeParametersBigram(
            line, dictionary, free_parameters_language, free_parameters_channel)
        total_language_parameters += language_parameters
        total_channel_parameters += channel_parameters
    print 'total language parameters is %d' % total_language_parameters
    print 'total channel parameters is %d' % total_channel_parameters
    # now we build all the lattices, creating a special start node and end node for every sentence
    start_node_end_node_list = []
    print 'constraint lengths are being printed'
    for tag in free_parameters_language.keys():
        print len(free_parameters_language[tag].keys())
        #raw_input()
    # NOTE: this sys.exit() makes the rest of this function unreachable as written
    sys.exit()
    # print len(word_list)
    # num_taggings = emMethods.getNumTaggings(word_list, dictionary)
    # print 'num_taggings'
    # print type(num_taggings)
    # print num_taggings
    fractional_counts_language = copy.deepcopy(free_parameters_language)
    fractional_counts_channel = copy.deepcopy(free_parameters_channel)
    probabilities_channel = copy.deepcopy(free_parameters_channel)
    probabilities_language = copy.deepcopy(free_parameters_language)
    emMethods.initUniformProbs(probabilities_channel, probabilities_language)
    emMethods.writeFsa('tagging.fsa', probabilities_language)
    emMethods.writeFst('tagging.fst', probabilities_channel)
    run_training = r'./carmel.static --train-cascade -M 0 -m -HJ test.words.new-formatted.training tagging.fsa tagging.fst'
    # running the EM iterations.
    # We create the indexes for algencan here; the language probabilities are
    # already uniform, so none of them will be zero.
    createParametersForScaling(probabilities_language)
    for i in range(0, num_iterations):
        '''
        print 'checking the initial zeros in language'
        checkZeros(probabilities_language)
        print 'checking the initial zeros in channel'
        checkZeros(probabilities_channel)
        '''
        #best_tag_sequence = emMethods.viterbiSearch(start, end, probabilities_channel, probabilities_language, lattice_skeleton)
        #emMethods.calcAccuracy(gold_tag_sequence, best_tag_sequence)
        total_corpus_probability = 0.0
        (status, output) = commands.getstatusoutput(run_training)
        print 'we just ran the training'
        prob_match = probability_re.search(output)
        if prob_match == None:
            print 'we should have found a probability'
        else:
            print 'the probability is %s' % prob_match.group(1)
            total_corpus_probability = float(prob_match.group(1)[2:])
        print 'reading language fractional counts'
        emMethods.readCarmelFractionalCounts('tagging.fsa.trained', fractional_counts_language, 'bigram')
        print 'read the fsa'
        print 'reading channel fractional counts'
        emMethods.readCarmelFractionalCounts('tagging.fst.trained', fractional_counts_channel, 'channel')
        print 'read the fst'
        print 'the probability of the corpus was %f' % total_corpus_probability
        print 'we are now checking the accuracies'
        noe_command = 'cat tagging.fsa | sed \'s/*e*//g\' > tagging.fsa.noe'
        (status, output) = commands.getstatusoutput(noe_command)
        print 'we wrote the noe fsa'
        viterbi_command = r'cat test.words.new-formatted.quotes | ./carmel.static -srbk -QEWI 1 tagging.fsa.noe tagging.fst > tagging_output'
        (status, output) = commands.getstatusoutput(viterbi_command)
        tagged_sequence = emMethods.readTaggingOutput('tagging_output')
        accuracy = emMethods.calcAccuracy(gold_tag_sequence, tagged_sequence)
        print 'The accuracy was %s and the objective function value was %s' % (
            str(accuracy),
            str(evaluateObjectiveFuncValue(total_corpus_probability, probabilities_language,
                                           probabilities_channel, alpha, sigma)))
        # first optimizing the tag bigrams, one constraint (tag) at a time
        for tag in initial_parameter_args.keys():
            if len(initial_parameter_args[tag].keys()) == 1:
                continue
            current_optimization_tag = tag
            parameter_counter = len(initial_parameter_args[tag].keys())
            current_fractional_counts = fractional_counts_language
            constraint_tags_dict[1] = tag
            temp_language_probs = dict(probabilities_language)
            # optimizing per constraint: first initialized from the current probabilities
            init_option = 'current_prob'
            current_optimization_params = 'tag_bigrams'
            algencan.solvers(evalf, evalg, evalh, evalc, evaljac, evalhc, evalfc, evalgjac, evalhl, evalhlp, inip, endp)
            # check whether the objective function is increasing
            language_probs_after_init_current_prob = copy.deepcopy(probabilities_language)
            total_obj_val1 = evaluateObjectiveFuncValue(total_corpus_probability, probabilities_language, probabilities_channel, alpha, sigma)
            obj_val1 = evaluateOptimizationFunction(initial_parameter_args, probabilities_language, fractional_counts_language, alpha, sigma)
            print 'the function value was obj 1 %f' % obj_val1
            # then initialized from all zeros
            init_option = 'zeros'
            current_optimization_params = 'tag_bigrams'
            algencan.solvers(evalf, evalg, evalh, evalc, evaljac, evalhc, evalfc, evalgjac, evalhl, evalhlp, inip, endp)
            language_probs_after_init_zeros = copy.deepcopy(probabilities_language)
            total_obj_val2 = evaluateObjectiveFuncValue(total_corpus_probability, probabilities_language, probabilities_channel, alpha, sigma)
            obj_val2 = evaluateOptimizationFunction(initial_parameter_args, probabilities_language, fractional_counts_language, alpha, sigma)
            print 'the function value was obj 2 %f' % obj_val2
            # keep whichever initialization gave the better total objective
            if total_obj_val1 >= total_obj_val2:
                if obj_val1 < obj_val2:
                    print 'the final objective function value was opposite'
                probabilities_language = copy.deepcopy(language_probs_after_init_current_prob)
                print 'the final objective function value was obj 1 %f' % total_obj_val1
            else:
                probabilities_language = copy.deepcopy(language_probs_after_init_zeros)
                if obj_val2 < obj_val1:
                    print 'the final objective function value was opposite'
                print 'the final objective function value was obj 2 %f' % total_obj_val2
        emMethods.reEstimateProbabilities(probabilities_channel, probabilities_language,
                                          fractional_counts_channel, fractional_counts_language)
        print 'writing the fsa'
        # now writing the fsa back again
        emMethods.writeFsa('tagging.fsa', probabilities_language)
        emMethods.writeFst('tagging.fst', probabilities_channel)
        '''
        noe_command = 'cat tagging.fsa | sed \'s/*e*//g\' > tagging.fsa.noe'
        (status, output) = commands.getstatusoutput(noe_command)
        print 'we wrote the noe fsa'
        viterbi_command = r'cat test.words.new-formatted.quotes | ./carmel.static -srbk -QEWI 1 tagging.fsa.noe tagging.fst > tagging_output'
        (status, output) = commands.getstatusoutput(viterbi_command)
        tagged_sequence = emMethods.readTaggingOutput('tagging_output')
        emMethods.calcAccuracy(gold_tag_sequence, tagged_sequence)
        '''
        print 'checking the zeros in tag bigram model'
        checkZeros(probabilities_language)
        print 'checking the initial zeros in channel model'
        checkZeros(probabilities_channel)
        fractional_counts_language = copy.deepcopy(free_parameters_language)
        fractional_counts_channel = copy.deepcopy(free_parameters_channel)
        #probabilities_language = copy.deepcopy(free_parameters_language)
        probabilities_channel = copy.deepcopy(free_parameters_channel)
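
# evaluateObjectiveFuncValue and evaluateOptimizationFunction are defined
# elsewhere in this file. For reference, a common smoothed-L0 surrogate
# (e.g. Mohimani et al. 2009) scores a parameter vector by
#     log-likelihood + alpha * sum_i exp(-p_i / beta),
# which rewards probabilities driven toward zero; the smaller beta is, the
# more closely the exponential term approximates the true L0 count of
# nonzero parameters. The exact penalty used by this script is defined
# elsewhere (and the smoothing parameter is called sigma in some of the
# main() variants and beta in others), so this is only a sketch under that
# assumption:
def smoothedL0ObjectiveSketch(corpus_log_prob, probabilities, alpha, beta):
    """Hypothetical: corpus log probability plus a smoothed-L0 bonus for
    every parameter that is (near) zero, over the dict-of-dicts layout
    used throughout this file. Assumes math is imported at module level."""
    penalty = 0.0
    for tag in probabilities:
        for next_tag in probabilities[tag]:
            penalty += math.exp(-probabilities[tag][next_tag] / beta)
    return corpus_log_prob + alpha * penalty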
def main():
    global fractional_counts_language, fractional_counts_channel, probabilities_channel, \
        sigma, alpha, current_fractional_counts, current_optimization_params, init_option
    num_iterations = int(sys.argv[1])
    alpha = float(sys.argv[2])
    sigma = float(sys.argv[3])
    # dictionary = emMethods.createDictionary('DICT2')  # .small
    # word_list = emMethods.readWordList('TEXT.linear')
    # gold_tag_sequence = emMethods.readWordList('GOLD.linear')
    gold_cipher = emMethods.readCipherFile('cipher.gold.noq')
    print gold_cipher
    cipher_letter_dict = emMethods.getUniqCipherLetters('cipher.data.noq')
    #plaintext = map(chr, range(97, 123))
    plaintext = []
    for k in range(65, 91):
        plaintext.append(chr(k))
    print plaintext
    print 'the number of unique cipher letters is %d' % len(cipher_letter_dict.keys())
    print cipher_letter_dict
    free_parameters_channel = {}
    free_parameters_language = {}
    print 'starting to create parameters'
    total_language_parameters = 0
    total_channel_parameters = 0
    emMethods.getFreeCipherParametersChannel(cipher_letter_dict, plaintext, free_parameters_channel)
    temp = {'_': 0.0}
    free_parameters_channel['_'] = temp
    # now we build all the lattices, creating a special start node and end node for every sentence
    start_node_end_node_list = []
    fractional_counts_channel = copy.deepcopy(free_parameters_channel)
    probabilities_channel = copy.deepcopy(free_parameters_channel)
    emMethods.initUniformProbs(probabilities_channel)
    emMethods.writeFst('cipher.fst', probabilities_channel)
    run_training = r'./carmel.static --train-cascade -M 0 -m -HJ cipher.data cipher.wfsa cipher.fst'
    # running the EM iterations
    for i in range(0, num_iterations):
        total_corpus_probability = 0.0
        (status, output) = commands.getstatusoutput(run_training)
        print 'we just ran the training'
        prob_match = probability_re.search(output)
        if prob_match == None:
            print 'we should have found a probability'
        else:
            print 'the probability is %s' % prob_match.group(1)
            total_corpus_probability = float(prob_match.group(1)[2:])
        print 'reading channel fractional counts'
        emMethods.readCarmelFractionalCounts('cipher.fst.trained', fractional_counts_channel, 'channel')
        print 'read the fst'
        print 'the probability of the corpus was %f' % total_corpus_probability
        print 'we are now checking the accuracies'
        noe_command = 'cat tagging.fsa | sed \'s/*e*//g\' > tagging.fsa.noe'
        (status, output) = commands.getstatusoutput(noe_command)
        print 'we wrote the noe fsa'
        viterbi_command = r'cat cipher.data | ./carmel.static -srbk -QEWI 1 cipher.wfsa.noe cipher.fst > decipherment_output'
        (status, output) = commands.getstatusoutput(viterbi_command)
        deciphered_sequence = emMethods.readCipherFile('decipherment_output')
        accuracy = emMethods.calcAccuracy(gold_cipher, deciphered_sequence)
        print 'The accuracy was %s and the objective function value was %s' % (
            str(accuracy),
            str(evaluateObjectiveFuncValue(total_corpus_probability, probabilities_channel, alpha, sigma)))
        # first optimizing the channel
        current_fractional_counts = fractional_counts_channel
        createParameters(probabilities_channel, current_fractional_counts, free_parameters_channel, alpha, sigma)
        temp_channel_probs = dict(probabilities_channel)
        # optimizing first from the current probabilities
        init_option = 'current_prob'
        current_optimization_params = 'channel'
        algencan.solvers(evalf, evalg, evalh, evalc, evaljac, evalhc, evalfc, evalgjac, evalhl, evalhlp, inip, endp)
        # check whether the objective function is increasing
        channel_probs_after_init_current_prob = copy.deepcopy(probabilities_channel)
        total_obj_val1 = evaluateObjectiveFuncValue(total_corpus_probability, probabilities_channel, alpha, sigma)
        # note: initial_parameter_args is used here but never set up in this variant
        obj_val1 = evaluateOptimizationFunction(initial_parameter_args, probabilities_channel, fractional_counts_channel, alpha, sigma)
        print 'the function value was obj 1 %f' % obj_val1
        # then from the all-zeros initialization
        init_option = 'zeros'
        current_optimization_params = 'channel'
        algencan.solvers(evalf, evalg, evalh, evalc, evaljac, evalhc, evalfc, evalgjac, evalhl, evalhlp, inip, endp)
        channel_probs_after_init_zeros = copy.deepcopy(probabilities_channel)
        total_obj_val2 = evaluateObjectiveFuncValue(total_corpus_probability, probabilities_channel, alpha, sigma)
        obj_val2 = evaluateOptimizationFunction(initial_parameter_args, probabilities_channel, fractional_counts_channel, alpha, sigma)
        print 'the function value was obj 2 %f' % obj_val2
        # keep whichever initialization gave the better total objective
        if total_obj_val1 >= total_obj_val2:
            if obj_val1 < obj_val2:
                print 'the final objective function value was opposite'
            probabilities_channel = copy.deepcopy(channel_probs_after_init_current_prob)
            print 'the final objective function value was obj 1 %f' % total_obj_val1
        else:
            probabilities_channel = copy.deepcopy(channel_probs_after_init_zeros)
            if obj_val2 < obj_val1:
                print 'the final objective function value was opposite'
            print 'the final objective function value was obj 2 %f' % total_obj_val2
        # now writing the fst back again
        emMethods.writeFst('cipher.fst', probabilities_channel)
        print 'checking the zeros in channel model'
        checkZeros(probabilities_channel)
        fractional_counts_channel = copy.deepcopy(free_parameters_channel)
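
# checkZeros is called throughout this file but not defined in this section.
# A minimal sketch of the obvious implementation, assuming the two-level
# dict-of-dicts parameter layout used above (the name and report format are
# illustrative, not the script's actual helper):
def checkZerosSketch(probabilities):
    """Count and report how many parameters have been driven to zero."""
    num_zeros = 0
    total = 0
    for tag in probabilities:
        for next_tag in probabilities[tag]:
            total += 1
            if probabilities[tag][next_tag] == 0.0:
                num_zeros += 1
    print 'found %d zero parameters out of %d' % (num_zeros, total)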
def main():
    global fractional_counts_language, fractional_counts_channel, probabilities_channel, \
        probabilities_language, beta, alpha, current_fractional_counts, \
        current_optimization_params, init_option, initial_parameter_args, \
        constraint_tags_dict, parameter_counter, current_optimization_tag, slack_option
    # adding parser options
    parser = OptionParser()
    parser.add_option("--num_iter", action="store", type="int", dest="num_iterations", default=100,
                      help="number of iterations you would like to run em+smoothed l0. Default is 100")
    parser.add_option("--alpha", action="store", type="float", dest="alpha", default=0.0,
                      help="alpha, the weight of the smoothed l0 penalty. Default is 0")
    parser.add_option("--beta", action="store", type="float", dest="beta", default=0.5,
                      help="beta, the smoothness of the l0 prior; the smaller the beta, the closer the approximation to the true L0. Default is 0.5")
    parser.add_option("--slack", action="store_true", dest="slack_option", default=False,
                      help="project on the simplex with slack.")
    parser.add_option("--noslack", action="store_false", dest="slack_option", default=False,
                      help="project on the simplex with no slack. This is the regular projection approach")
    parser.add_option("--num_pgd_iterations", action="store", type="int", dest="num_pgd_iterations", default=50,
                      help="Number of Projected Gradient Descent iterations to run. Default is 50")
    parser.add_option("--eta", action="store", type="float", dest="eta", default=0.5,
                      help="Eta, the constant step size for PGD using armijo line search. Default is 0.5")
    parser.add_option("--armijo_beta", action="store", type="float", dest="armijo_beta", default=0.5,
                      help="Set value for Armijo beta, the beta used in armijo line search. Default is 0.5")
    parser.add_option("--armijo_sigma", action="store", type="float", dest="armijo_sigma", default=0.5,
                      help="Set value for Armijo sigma, the sigma used in armijo line search. Lower bound is 0.0001")
    parser.add_option("--lower_bound", action="store", type="float", dest="lower_bound", default=0.000001,
                      help="Set value for the lower bound on the probability. Default is 1e-6")
    (options, args) = parser.parse_args()
    #print options
    #print args

    # getting the values from optparse
    num_iterations = options.num_iterations
    alpha = options.alpha
    beta = options.beta
    slack_option = options.slack_option
    num_pgd_iterations = options.num_pgd_iterations
    eta = options.eta
    armijo_beta = options.armijo_beta
    armijo_sigma = options.armijo_sigma
    lower_bound = options.lower_bound
    gold_cipher = emMethods.readCipherFile('cipher.gold.noq')
    print gold_cipher
    cipher_letter_dict = emMethods.getUniqCipherLetters('cipher.data.noq')
    #plaintext = map(chr, range(97, 123))
    plaintext = []
    for k in range(65, 91):
        plaintext.append(chr(k))
    print plaintext
    print 'the number of unique cipher letters is %d' % len(cipher_letter_dict.keys())
    print cipher_letter_dict
    free_parameters_channel = {}
    free_parameters_language = {}
    print 'starting to create parameters'
    total_language_parameters = 0
    total_channel_parameters = 0
    emMethods.getFreeCipherParametersChannel(cipher_letter_dict, plaintext, free_parameters_channel)
    temp = {'_': 0.0}
    free_parameters_channel['_'] = temp
    # now we build all the lattices, creating a special start node and end node for every sentence
    start_node_end_node_list = []
    fractional_counts_channel = copy.deepcopy(free_parameters_channel)
    probabilities_channel = copy.deepcopy(free_parameters_channel)
    createRandomPoint(probabilities_channel)
    #emMethods.initUniformProbs(probabilities_channel)
    emMethods.writeFst('cipher.fst', probabilities_channel)
    final_probabilities_channel = copy.deepcopy(free_parameters_channel)
    run_training = r'./carmel --train-cascade -M 0 -m -HJ cipher.data cipher.wfsa cipher.fst'

    # running the EM iterations.
    # We create the parameter indexes here; the probabilities are already
    # initialized, so none of them will be zero.
    createParametersForScaling(probabilities_channel)
    start_time = time.clock()
    print 'start time was ', start_time
    for i in range(0, num_iterations):
        print 'the iteration number was ', i
        '''
        print 'checking the initial zeros in language'
        checkZeros(probabilities_language)
        print 'checking the initial zeros in channel'
        checkZeros(probabilities_channel)
        '''
        total_corpus_probability = 0.0
        (status, output) = commands.getstatusoutput(run_training)
        print 'we just ran the training'
        print output
        prob_match = probability_re.search(output)
        if prob_match == None:
            print 'we should have found a probability'
        else:
            print 'the probability is %s' % prob_match.group(1)
            # strip the "2^" prefix and convert from log2 to ln: ln(2) = 0.693147181
            temp_corpus_probability = float(prob_match.group(1)[2:])
            total_corpus_probability = 0.693147181 * temp_corpus_probability
        print 'reading language fractional counts'
        emMethods.readCarmelFractionalCounts('cipher.fst.trained', fractional_counts_channel, 'channel')
        print 'read the fst'
        print 'the probability of the corpus was %f' % total_corpus_probability
        print 'we are now checking the accuracies'
        noe_command = 'cat tagging.fsa | sed \'s/*e*//g\' > tagging.fsa.noe'
        (status, output) = commands.getstatusoutput(noe_command)
        print 'we wrote the noe fsa'
        viterbi_command = r'cat cipher.data.quotes | ./carmel -srbk 1 -QEWI cipher.wfsa.noe cipher.fst > decipherment_output'
        (status, output) = commands.getstatusoutput(viterbi_command)
        deciphered_sequence = emMethods.readCipherFile('decipherment_output')
        accuracy = emMethods.calcAccuracy(gold_cipher, deciphered_sequence)
        print 'The accuracy was %s and the objective function value was %s' % (
            str(accuracy),
            str(evaluateObjectiveFuncValue(total_corpus_probability, probabilities_channel,
                                           probabilities_language, alpha, beta)))
        # optimizing the channel rows, one tag (plaintext letter) at a time
        for tag in initial_parameter_args.keys():
            print 'we are starting a new tag'
            if len(initial_parameter_args[tag].keys()) == 1:
                continue
            print 'we are currently optimizing for tag', tag
            current_optimization_tag = tag
            parameter_counter = len(initial_parameter_args[tag].keys())
            current_fractional_counts = fractional_counts_channel
            constraint_tags_dict[1] = tag
            temp_language_probs = dict(probabilities_channel)
            # optimizing per constraint
            init_option = 'current_prob'
            current_optimization_params = 'channel'
            # creating the initial vector to pass to the projected gradient descent algorithm
            x = zeros(parameter_counter)
            expected_counts = zeros(parameter_counter)
            expected_counts_sum = 0.
            for next_tag in initial_parameter_args[current_optimization_tag].keys():
                if initial_parameter_args[current_optimization_tag][next_tag] == 'ONE':
                    continue
                else:
                    parameter_number = initial_parameter_args[current_optimization_tag][next_tag]
                    x[parameter_number] = probabilities_channel[current_optimization_tag][next_tag]
                    expected_counts[parameter_number] = current_fractional_counts[current_optimization_tag][next_tag]
                    expected_counts_sum += current_fractional_counts[current_optimization_tag][next_tag]
            print 'expected counts sum was ', expected_counts_sum
            if expected_counts_sum <= 0.5:
                # effectively no expected mass for this row; skip the optimization
                continue
            print 'Doing projected gradient descent'
            new_probabilities = projectedGradientDescentWithArmijoRule(
                x=x, expected_counts=expected_counts, num_pgd_iterations=num_pgd_iterations,
                eta=eta, lower_bound=lower_bound, armijo_beta=armijo_beta, armijo_sigma=armijo_sigma)
            print 'finished projected gradient descent'
            #new_probabilities = newtonProjectedGradientDescentWithArmijoRule(x, num_pgd_iterations, eta, lower_bound, armijo_beta, armijo_sigma)
            if current_optimization_params == 'tag_bigrams':
                assignProbs(new_probabilities, probabilities_language)
                # checkZeros(probabilities_language)
            else:
                print 'we are replacing channel probs'
                assignProbs(new_probabilities, probabilities_channel)
        # now writing the fst back again
        # emMethods.writeFsa('tagging.fsa', probabilities_language)
        # emMethods.writeFst('tagging.fst', probabilities_channel)
        emMethods.writeFst('cipher.fst', probabilities_channel)
        print 'checking the initial zeros in channel model'
        checkZeros(probabilities_channel)
        fractional_counts_channel = copy.deepcopy(free_parameters_channel)
        final_probabilities_channel = copy.deepcopy(probabilities_channel)
        print 'at the end of the iteration'
    elapsed_time = time.clock() - start_time
    print 'the elapsed time was ', elapsed_time
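
# projectedGradientDescentWithArmijoRule (called above) is defined elsewhere.
# A compact sketch of the scheme it names, maximizing the expected-count
# objective f(p) = sum_i c_i * log(p_i) over the probability simplex (the
# smoothed-L0 term is omitted here for brevity): take a gradient step,
# project back onto the simplex, then backtrack along the feasible direction
# until the Armijo sufficient-increase condition holds. The objective form
# and lower-bound handling are assumptions; projectOntoSimplexSketch is the
# sketch given earlier in this file.
def pgdWithArmijoSketch(x, expected_counts, num_pgd_iterations, eta,
                        lower_bound, armijo_beta, armijo_sigma):
    from numpy import array, log, maximum
    p = maximum(array(x, dtype=float), lower_bound)
    c = array(expected_counts, dtype=float)

    def f(q):
        # expected complete-data log likelihood of this row
        return (c * log(maximum(q, lower_bound))).sum()

    for _ in range(num_pgd_iterations):
        grad = c / p
        # gradient step followed by projection gives a feasible direction
        feasible = array(projectOntoSimplexSketch(p + eta * grad))
        direction = feasible - p
        step = 1.0
        # backtracking (Armijo) line search along the feasible direction
        while step > 1e-6 and \
                f(p + step * direction) < f(p) + armijo_sigma * step * (grad * direction).sum():
            step *= armijo_beta
        p = maximum(p + step * direction, lower_bound)
    return p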
def main() : # setting up options parser = OptionParser() parser.add_option("--num_iter", action="store", type="int", dest="num_iterations",default=100,help="number of iterations you would like to run em+smoothed l0. Default is 50") parser.add_option("--num_noise_samples", action="store", type="int", dest="num_noise_samples",default=10,help="number of noise samples. Default is 10") parser.add_option("--initial_alpha", action="store", type="float", dest="initial_alpha",default = 0.0,help="initial_alpha,the weight of the smoothed l0 penalty. Defaut is 0 ") parser.add_option("--final_alpha", action="store", type="float", dest="final_alpha",default = 0.0,help="final_alpha,the weight of the smoothed l0 penalty. Defaut is 0 ") parser.add_option("--beta", action="store", type="float", dest="beta",default = 0.5,help="beta, the smoothness of the l0 prior, smaller the sigma, closer the approximation to true L0. Default is 0.5") parser.add_option("--slack", action="store_true", dest="slack_option",default = False,help="if you want to project on the simplex with slack.") parser.add_option("--noslack", action="store_false", dest="slack_option",default = False, help="if you want to project on the simplex with no slack. This is the regular projection approach") parser.add_option("--num_pgd_iterations", action="store", type="int", dest="num_pgd_iterations",default = 10,help="Number of Projected Gradient Descent to run. Default is 100") parser.add_option("--eta", action="store", type="float", dest="eta",default = 0.1,help="Eta, the constant step size for PGD using armijo line search. Default is 0.1") parser.add_option("--armijo_beta", action="store", type="float", dest="armijo_beta",default = 0.5,help="Set value for Armijo beta, the beta used in in armijo line search. Default value is 0.2") parser.add_option("--armijo_sigma", action="store", type="float", dest="armijo_sigma",default = 0.5,help="Set value for Armijo sigma, the sigma used in in armijo line search. 
Lower bound is 0.0001") parser.add_option("--lower_bound", action="store", type="float", dest="lower_bound",default = 0.000001,help="Set value for the lower bound on the probability.Default is 10E-6") parser.add_option("--cipher_data_file", action="store", type="string", dest="cipher_data_file",default ='cipher.data',help="Cipher data file for training") parser.add_option("--cipher_noq_file", action="store", type="string", dest="cipher_noq_file",default ='cipher.noq',help="Cipher data without quotes") parser.add_option("--cipher_decode_file", action="store", type="string", dest="cipher_decode_file",default ='cipher.decode',help="Cipher file for decoding") parser.add_option("--cipher_gold_file", action="store", type="string", dest="cipher_gold_file",default ='cipher.gold',help="The correct decipherment") parser.add_option("--lm", action="store", type="string", dest="lm",default ='lm.carmel',help="The lm file") parser.add_option("--noe_lm", action="store", type="string", dest="noe_lm",default ='lm.carmel',help="The noe lm file") parser.add_option("--gaussian_params_file", action="store", type="string", dest="gaussian_params_file",default =None,help="The means and variances for each gaussian letter") parser.add_option("--unigram_probs_file", action="store", type="string", dest="unigram_probs_file",default =None,help="The means and variances for each gaussian letter") parser.add_option("--std_mult", action="store", type="float", dest="std_mult",default =1.,help="The multiplier for the std ") parser.add_option("--u", action="store_true", dest="uniform_init",default=False) parser.add_option("--g", action="store_true", dest="gaussian_init",default=False) parser.add_option("--i", action="store_true", dest="identity_init",default=False) parser.add_option("--hpc", action="store_true", dest="hpc",default=False) parser.add_option("--full_fst", action="store_true", dest="full_fst",default=False) parser.add_option("--fst_init", action="store_true", dest="fst_init",default=False) parser.add_option("--noise_lm", action="store", type="string", dest="noise_lm",default ='noise.lm.carmel',help="The noise lm file") parser.add_option("--noise_channel_fst", action="store", type="string", dest="noise_channel_fst",default ='noise.fst.carmel',help="The channel fst") parser.add_option("--noise_probs_file", action="store", type="string", dest="noise_probs_file",default ='noise.probs',help="The noise probs file") parser.add_option("--noise_samples_file", action="store", type="string", dest="noise_samples_file",default ='noise.samples',help="The noise samples file") (options, args) = parser.parse_args() print options print args #getting the values from optparse num_iterations = options.num_iterations initial_alpha = options.initial_alpha final_alpha = options.final_alpha beta = options.beta slack_option = options.slack_option num_pgd_iterations = options.num_pgd_iterations eta = options.eta armijo_beta = options.armijo_beta armijo_sigma = options.armijo_sigma lower_bound = options.lower_bound cipher_data_file = options.cipher_data_file cipher_decode_file = options.cipher_decode_file cipher_noq_file = options.cipher_noq_file cipher_gold_file = options.cipher_gold_file lm = options.lm noe_lm = options.noe_lm gaussian_params_file = options.gaussian_params_file unigram_probs_file = options.unigram_probs_file uniform_init = options.uniform_init gaussian_init = options.gaussian_init identity_init = options.identity_init hpc = options.hpc std_mult = options.std_mult full_fst = options.full_fst num_noise_samples = 
options.num_noise_samples noise_lm = options.noise_lm noise_channel_fst = options.noise_channel_fst fst_init = options.fst_init noise_probs_file = options.noise_probs_file noise_samples_file = options.noise_samples_file noise_probs_file = options.noise_probs_file #setting up paramters fractional_counts_language = {} fractional_counts_channel = {} probabilities_channel = {} probabilities_language = {} current_probabilities = {} current_fractional_counts = {} #constraint_parameters = [] initial_parameter_args = {} current_initial_parameter_args = {} parameter_to_index = {} parameter_counter = 0 num_constraints = 1 constraint_tags_dict = {} #this will hold the tag for the constraint. the key is the constraint id and the value is the tag corresponding to this constraint #beta = float(0) #alpha = float(0) global_num_parameters = 0 init_option = '' current_optimization_tag = '' #this will hold the of the constraint for which we are doing the optimization #adding parser options ''' print 'beta is ',beta print 'alpha is ',alpha print 'eta is ',eta raw_input() ''' gold_cipher = emMethods.readCipherFile(cipher_gold_file) print gold_cipher #dictionary = emMethods.createDictionary('complete.dict.new-formatted')#.small') #word_lines= emMethods.readWordLines('test.words.new-formatted') cipher_letter_dict = emMethods.getUniqCipherLetters(cipher_noq_file) cipher_letters = open(cipher_noq_file).readline().split() #sys.exit() cipher_probs = defaultdict(float) emMethods.getCipherLetterProbs(cipher_probs,cipher_noq_file) print 'cipher probs are ' #gaussians = defaultdict(list) #emMethods.readGaussians(gaussians,gaussian_params_file) plain_unigram_probs = dict((line.strip().split()[0],float(line.strip().split()[1])) for line in open(unigram_probs_file)) #get cipher letter probs del cipher_letter_dict['_'] #word_list_five = emMethods.readWordList('TEXT.3.linear') #plaintext = map(chr, range(97, 123)) plaintext = [] for k in range(65, 91): plaintext.append(chr(k)) print plaintext print 'the number of unique cipher letter is %d'%len(cipher_letter_dict.keys()) print cipher_letter_dict num_cipher_letters = len(cipher_letter_dict.keys()) num_plain_letters = 26 #gold_tag_sequence = emMethods.readWordList('test.tags.new-formatted.linear') free_parameters_channel = defaultdict(lambda:defaultdict(float)) free_parameters_language = defaultdict(lambda:defaultdict(float)) print 'starting to create parameters' total_language_parameters = 0 total_channel_parameters = 0 #for line in cipher_lines : #print 'created parameters for a line' #(language_parameters,channel_parameters) = emMethods.getFreeParametersBigram(line,dictionary,free_parameters_language,free_parameters_channel) if full_fst == True: emMethods.getFreeCipherParametersChannel(plaintext,free_parameters_channel) num_cipher_letters = 26 else : emMethods.getFreeCipherParametersChannel(plaintext,free_parameters_channel,cipher_letter_dict = cipher_letter_dict) temp = {'_':0.0} free_parameters_channel['_'] = temp #now, we will build all the lattices, and create a special start node and end node for every sentence start_node_end_node_list = [] fractional_counts_channel = copy.deepcopy(free_parameters_channel) probabilities_channel = copy.deepcopy(free_parameters_channel) #print 'gaussians' #print gaussians #createRandomPoint(probabilities_channel) if (uniform_init == True) : print 'uniform initialization' emMethods.initUniformProbs(probabilities_channel) # emMethods.initUniformProbs(probabilities_language,probabilities_channel) if (gaussian_init == True) : print 'gaussian 
initialization' #emMethods.initFromGaussians(probabilities_channel,gaussians,cipher_probs,std_mult) emMethods.initFromGaussiansSingleStd(probabilities_channel,plain_unigram_probs,cipher_probs,std_mult) if (identity_init == True) : emMethods.initIdentity(probabilities_channel) if fst_init == True: emMethods.initFromWfst('init.fst',probabilities_channel,'channel') #print 'channel probabilities after weighting are ' #print probabilities_channel #raw_input() emMethods.writeFst('cipher.fst',probabilities_channel) #sys.exit() final_probabilities_channel = copy.deepcopy(free_parameters_channel) #running the EM iterations #we are creating the indexes for algencan . Notice that here, the probabilities language is already uniform and therefore none of them will be zero createParametersForScaling(probabilities_channel,parameter_to_index) start_time = time.clock() print 'start time was ',start_time #fractional_counts_dump_file = open('fractional.params','w') #probabilities_dump_file = open('probs.params','w') #optimized_probabilities_dump_file = open('probs.optimized.params','w') alpha_delta = (final_alpha-initial_alpha)/(num_iterations-1) current_alpha = initial_alpha ###########GENERATING THE NOISE FILE LIST########################################## noise_file_list = [] for k in range(num_noise_samples): noise_file_list.append("%s.noise%d"%(cipher_noq_file,k)) #reading the noise probsa noise_probs = [] for line in open(noise_probs_file): if line.strip() == '': continue else: noise_probs.append(float(line.strip())) print 'number of noise probs is ',len(noise_probs) #this will be needed for computations log_num_noise_samples = math.log(num_noise_samples) #reading noise samples total_noise_samples = 0 noise_samples = [] for line in open(noise_samples_file): if line.strip() == '': continue else: noise_samples.append(line.strip()) total_noise_samples += 1 print 'total number of noise samples is',total_noise_samples ###############GENERATE CARMEL TRAINING AND DECODING COMMANDS############################## print 'Generating the training and decoding commands...' 
run_true_training = '' run_noise_training = '' if hpc == True: run_true_training = "/home/nlg-05/vaswani/graehl/carmel/bin/linux64/carmel --train-cascade -u -M 0 -m -HJ %s %s cipher.fst"%(cipher_data_file,lm) run_noise_training = "/home/nlg-05/vaswani/graehl/carmel/bin/linux64/carmel --train-cascade -u -M 0 -m -HJ %s %s %s"%(cipher_data_file,noise_lm,noise_channel_fst) else : run_true_training = "/Users/avaswani/graehl/carmel/bin/macosx/carmel --train-cascade -u -M 0 -m -HJ %s %s cipher.fst"%(cipher_data_file,lm) run_noise_training = "/Users/avaswani/graehl/carmel/bin/macosx/carmel --train-cascade -u -M 0 -m -HJ %s %s %s"%(cipher_data_file,noise_lm,noise_channel_fst) noise_sample_training_commands_for_model = [] #noise_sample_training_commands_for_noise = [] for k in range(num_noise_samples): run_noise_sample_true_training = '' #run_noise_sample_noise_training = '' if hpc == True: run_noise_sample_true_training = "/home/nlg-05/vaswani/graehl/carmel/bin/linux64/carmel --train-cascade -u -M 0 -m -HJ %s %s cipher.fst"%(noise_file_list[k],lm) #run_noise_sample_noise_training = "/home/nlg-05/vaswani/graehl/carmel/bin/linux64/carmel --train-cascade -u -M 0 -m -HJ %s %s %s"%(noise_file_list[k],noise_lm,noise_channel_fst) else : run_noise_sample_true_training = "/Users/avaswani/graehl/carmel/bin/macosx/carmel --train-cascade -u -M 0 -m -HJ %s %s cipher.fst"%(noise_file_list[k],lm) #run_noise_sample_noise_training = "/Users/avaswani/graehl/carmel/bin/macosx/carmel --train-cascade -u -M 0 -m -HJ %s %s %s"%(noise_file_list[k],noise_lm,noise_channel_fst) noise_sample_training_commands_for_model.append(run_noise_sample_true_training) #noise_sample_training_commands_for_noise.append(run_noise_sample_noise_training) viterbi_command = '' if hpc == True: viterbi_command = "cat %s | /home/nlg-05/vaswani/graehl/carmel/bin/linux64/carmel -u -srbk 1 -QEWI %s cipher.fst > decipherment_output"%(cipher_decode_file,noe_lm) else : viterbi_command = "cat %s | /Users/avaswani/graehl/carmel/bin/macosx/carmel -u -srbk 1 -QEWI %s cipher.fst > decipherment_output"%(cipher_decode_file,noe_lm) print 'Running Training ...' print 'command for running noise training is ',run_noise_training log_prob_under_noise = runCarmel(run_noise_training) ''' log_noise_sample_probs_under_noise = [] for k in range(num_noise_samples): #log_noise_sample_prob_under_noise = runCarmel(noise_sample_training_commands_for_noise[k]) log_noise_sample_probs_under_noise.append(log_noise_sample_prob_under_noise) print 'the log noise sample probs under noise are ',log_noise_sample_probs_under_noise ''' ##########################RUN NCE TRAINING############################### for i in range (0,num_iterations) : ###############GENERATING K NOISE SAMPLES######################### print 'Generating noise samples...' noise_ciphertext = list(cipher_letters) for k in range(num_noise_samples): noise_file = open(noise_file_list[k],'w') #emMethods.generateNoiseCipher(noise_ciphertext) noise_ciphertext = noise_samples[i*num_noise_samples+k] #print 'noise ciphertext is ',' '.join(noise_ciphertext) print 'noise ciphertext is ',noise_ciphertext #noise_file.write("\n%s\n"%' '.join(["\"%s\""%item for item in noise_ciphertext])) noise_file.write("\n%s"%noise_ciphertext) noise_file.close() print 'Getting Accuracy' ################RUNNING VITERBI AND GETTING ACCURCY################### runViterbiAndGetAccuracy(gold_cipher,viterbi_command) #dampening the learning rate eta = eta/(1+i) current_function_value = 0. 
        print 'the iteration number was ',i
        print 'current alpha is ',current_alpha
        print 'Computing expected counts...'
        ###############COMPUTING EXPECTED COUNTS###############################
        log_prob_under_model = runCarmel(run_true_training)
        print 'reading language fractional counts from cipher.fst.trained'
        emMethods.readCarmelFractionalCounts('cipher.fst.trained',fractional_counts_channel,'channel')
        print 'Running the noise model'
        log_prob_under_noise = runCarmel(run_noise_training)
        positive_weight = computePositiveWeight(log_prob_under_model,log_prob_under_noise,num_noise_samples)
        #compute log p(D=1|data) for the true ciphertext
        p_d_equals_one = log_prob_under_model - (logaddexp(log_num_noise_samples+log_prob_under_noise,log_prob_under_model))
        current_function_value += p_d_equals_one
        print 'p of d equals 1 at the current point is ', p_d_equals_one
        print 'The positive weight was',positive_weight
        if positive_weight <= 10E-10:
            #floor the weight (note 10E-10 == 1e-9) to keep the update stable
            positive_weight = 10E-10
        expected_counts = zeros(shape=(num_plain_letters,num_plain_letters))
        emMethods.dictionaryToArray(expected_counts,fractional_counts_channel,parameter_to_index)
        expected_counts *= positive_weight
        total_noise_expected_counts = zeros(shape=(num_plain_letters,num_plain_letters))
        log_noise_sample_probs_under_noise = []
        for k in range(num_noise_samples):
            temp_noise_fractional_counts = copy.deepcopy(free_parameters_channel)
            log_noise_sample_prob_under_model = runCarmel(noise_sample_training_commands_for_model[k])
            print 'reading noise language fractional counts from cipher.fst.trained'
            emMethods.readCarmelFractionalCounts('cipher.fst.trained',temp_noise_fractional_counts,'channel')
            #log_noise_sample_prob_under_noise = runCarmel(noise_sample_training_commands_for_noise[k])
            log_noise_sample_prob_under_noise = noise_probs[i*num_noise_samples+k]
            log_noise_sample_probs_under_noise.append(log_noise_sample_prob_under_noise)
            negative_weight = computeNegativeWeight(log_noise_sample_prob_under_model,log_noise_sample_probs_under_noise[k],num_noise_samples)
            if negative_weight <= 10E-10:
                negative_weight = 10E-10
            print 'The negative weight was',negative_weight
            #compute log p(D=0|noise_k) for this noise sample
            p_d_equals_zero = log_noise_sample_probs_under_noise[k]+log_num_noise_samples -\
                (logaddexp(log_num_noise_samples+log_noise_sample_probs_under_noise[k],log_noise_sample_prob_under_model))
            current_function_value += p_d_equals_zero
            print 'p of d equals 0 at the current point',k,' is ', p_d_equals_zero
            temp_noise_expected_counts = zeros(shape=(num_plain_letters,num_plain_letters))
            emMethods.dictionaryToArray(temp_noise_expected_counts,temp_noise_fractional_counts,parameter_to_index)
            #print 'temp noise expected counts are ',temp_noise_expected_counts
            temp_noise_expected_counts *= negative_weight
            #print 'temp noise expected counts are ',temp_noise_expected_counts
            total_noise_expected_counts += temp_noise_expected_counts
        print 'the log noise sample probs under noise are ',log_noise_sample_probs_under_noise
        #print 'total noise expected counts are ',total_noise_expected_counts
        #getting the final expected counts
        expected_counts -= total_noise_expected_counts
        print 'Current function value is ',current_function_value
        ###############PROJECTING THE POINT ONTO THE DOUBLY STOCHASTIC MATRIX###########################
        new_feasible_point,current_point,grad = nceUpdate(eta,expected_counts,probabilities_channel,parameter_to_index,num_plain_letters)
        print 'Running line search....'
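        #Backtracking line search along d = new_feasible_point - current_point:
        #trial points are x + beta*d, with beta shrunk geometrically by
        #armijo_beta each step. The classical Armijo acceptance test
        #(f(x+beta*d) - f(x) >= sigma*beta*grad.d) is commented out below; the
        #loop simply tries up to 5 step sizes and keeps whichever trial point
        #achieved the best NCE objective value.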
        #############LINE SEARCH#########################
        armijo_bound = 0.0
        #print 'new feasible point is ',new_feasible_point
        #raw_input()
        #print 'gradient is ',grad
        #raw_input()
        armijo_bound = -armijo_sigma * armijo_beta * (grad*(new_feasible_point-current_point)).sum()
        print 'Armijo bound is ',armijo_bound
        terminate_line_srch = False
        num_steps = 1
        #current_beta = 1.0 #armijo_beta
        current_beta = armijo_beta
        final_beta = 0
        current_armijo_bound = armijo_bound
        no_update = True
        best_func_value = current_function_value
        while terminate_line_srch != True:
            #print 'num steps is ',num_steps
            temp_point = current_point * (1.0 - current_beta) + current_beta * new_feasible_point
            #print 'the temp point is '
            #print temp_point
            #evaluating the function at the temp point: first assign the
            #probabilities and write the fst
            temp_probabilities_channel = copy.deepcopy(free_parameters_channel)
            assignProbsMatrix(temp_point,temp_probabilities_channel,parameter_to_index)
            temp_probabilities_channel['_']['_'] = 1.0
            emMethods.writeFst('cipher.fst',temp_probabilities_channel)
            #print 'temp point is ',temp_point
            #sys.exit()
            func_value_at_temp_point = 0.
            temp_log_prob_under_model = runCarmel(run_true_training)
            temp_p_d_equals_one = temp_log_prob_under_model - (logaddexp(log_num_noise_samples+log_prob_under_noise,temp_log_prob_under_model))
            print 'p of d equals 1 at the temp point is ', temp_p_d_equals_one
            func_value_at_temp_point += temp_p_d_equals_one
            #then go over the noise samples
            for k in range(num_noise_samples):
                temp_log_noise_sample_prob_under_model = runCarmel(noise_sample_training_commands_for_model[k])
                temp_p_d_equals_zero = log_noise_sample_probs_under_noise[k]+log_num_noise_samples -\
                    (logaddexp(log_num_noise_samples+log_noise_sample_probs_under_noise[k],temp_log_noise_sample_prob_under_model))
                func_value_at_temp_point += temp_p_d_equals_zero
                print 'p of d equals 0 at the temp point',k,' is ', temp_p_d_equals_zero
            print 'the function value at the temp point is %.16f'%func_value_at_temp_point
            #raw_input()
            if func_value_at_temp_point > best_func_value:
                best_func_value = func_value_at_temp_point
                final_beta = current_beta
                no_update = False
                #print 'we arrived at a better function value'
                #raw_input()
                #print 'we just updated the final beta to ',final_beta
            #if (func_value_at_temp_point - current_function_value >= current_armijo_bound) :
            #    terminate_line_srch = True
            #elif current_function_value - func_value_at_temp_point < 0:
            #    terminate_line_srch = True
            if num_steps >= 5:
                terminate_line_srch = True
            current_beta = armijo_beta * current_beta
            current_armijo_bound = armijo_bound * current_beta
            num_steps += 1
        if no_update == False:
            current_point = (1.0-final_beta)*current_point + final_beta*new_feasible_point
        if no_update == True:
            #no trial step improved the objective, so stop training
            print 'no update was true'
            break
        #print 'the current point is ',current_point
        #raw_input()
        #assigning the probabilities and writing the fsa
        assignProbsMatrix(current_point,probabilities_channel,parameter_to_index)
        emMethods.writeFst('cipher.fst',probabilities_channel)
        #print 'checking the initial zeros in channel model'
        #checkZeros(probabilities_channel)
        fractional_counts_channel = copy.deepcopy(free_parameters_channel)
        final_probabilities_channel = copy.deepcopy(probabilities_channel)
        print 'at the end of the iteration'
        current_alpha += alpha_delta
    elapsed_time = time.clock() - start_time
    print 'the elapsed time was ',elapsed_time