def read_output_thanh():
    '''Parse a debug dump and collect the (en_url -> top candidate) pairs.'''
    fname = './train_debugs/debug_full_05.txt'
    en_url = ''
    can_url = ''
    found_can = False
    cal_pairs = {}
    old_domain = None
    count = 0
    with open(fname, 'rt') as f:
        for line in f:
            line = line.strip()
            ms = line.split()
            if len(ms) != 2:
                continue
            key, val = ms
            if key.startswith('----------'):
                en_url = val
                found_can = False
                domain = get_domain(en_url)
                if old_domain is not None and domain != old_domain:
                    # flush the miss count for the previous domain
                    # (note: the final domain's count is never flushed)
                    print(old_domain, count, sep='\t')
                    count = 0
                old_domain = domain
            elif not found_can and key.find(':') >= 0:
                found_can = True
                can_url = val
                #print(en_url + '\t' + can_url)
                cal_pairs[en_url] = can_url
            elif key.startswith('---gold'):
                rank = int(key.replace('=', ',').split(',')[1])
                if rank == -1:
                    count += 1
    return cal_pairs
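# get_domain() is called throughout this file but is not defined in this
# section. A minimal sketch of what it likely does (an assumption, not the
# original implementation): extract the host part of a URL, matching values
# such as 'www.ec.gc.ca' seen in debug_domains below.
from urllib.parse import urlparse

def get_domain(url):
    # 'http://www.ec.gc.ca/page.html' -> 'www.ec.gc.ca'
    return urlparse(url).netloc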
def run1():  # make clues for the train set
    #debug_domains = ['www.dakar.com', 'www.luontoportti.com', 'www.nauticnews.com', 'www.the-great-adventure.fr']
    debug_domains = ['bugadacargnel.com', 'www.ec.gc.ca']
    with open(train_pairs, 'rt') as f:
        for line in f:
            en_url, gold = line.strip().split('\t')
            en_url, gold = en_url.strip(), gold.strip()
            if debug and get_domain(en_url) not in debug_domains:
                continue
            get_candidates(en_url)
def get_candidates(en_url):
    '''Get all candidates for the given source English URL'''
    domain = get_domain(en_url)
    # TODO: put these comments back when running on train data?? We do all
    # en_urls of a domain together, so load the domain corpus once.
    load_domain_corpus(domain)
    en_page = en_corpus[en_url]
    unique_en_tokens = list(set(en_page.tokens))
    score = get_chance_score(en_page.tokens, unique_en_tokens)
    # columns: min_score, en_len, en_vocab_size, col_size, col_vocab_size, en_url
    print('%f\t%d\t%d\t%d\t%d\t%s' % (score, len(en_page.tokens), len(unique_en_tokens), col_size, col_vocab_size, en_url))
    return  # debug short-circuit: the candidate scoring below is currently unreachable

    doc_col_scores = col_model_for_a_doc(en_page.tokens, unique_en_tokens)
    cans, scores = [], []
    #cans, scores = [''], [0]
    #max_score = float('-inf')
    #pq_result = []
    for fr_url in fr_corpus:
        #lrate = en_page.length / float(fr_corpus[fr_url].length)
        #if use_filter and (lrate < LENGTH_LOWER_BOUND or lrate > LENGTH_UPPER_BOUND):  # filter by length
        #    continue
        #score = score_original(fr_url, en_page.tokens)
        #print('1:' + str(score))
        #score = score_original_optimal(fr_url, unique_en_tokens, doc_col_scores, max_score)
        score = score_original_optimal(fr_url, unique_en_tokens, doc_col_scores)
        #print('2:' + str(score))
        if score is not None:
            cans.append(fr_url)
            scores.append(score)
        #if score > max_score:
        #    max_score = score
        #    cans[0] = fr_url
        #    scores[0] = score
        #if len(pq_result) < output_top:
        #    heapq.heappush(pq_result, (score, fr_url))
        #else:
        #    heapq.heappushpop(pq_result, (score, fr_url))
    cans, scores = sort_candidates(cans, scores)
    #cans, scores = np.array(cans), np.array(scores)
    return cans, scores
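# sort_candidates() is not defined in this section. A plausible sketch
# (an assumption about its behavior): order the candidate URLs by
# descending score while keeping the two lists parallel.
def sort_candidates(cans, scores):
    order = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
    return [cans[i] for i in order], [scores[i] for i in order]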
def load_translation(domain):
    '''Load translations for the given domain, ignoring lines from other domains.'''
    global tran_corpus
    tran_corpus = defaultdict(list)  # assumes: from collections import defaultdict
    domain_found = False
    with open(tran_en, 'rt') as f:
        for line in f:
            # split on the first tab only, in case the text itself contains tabs
            url, line = line.strip().split('\t', 1)
            url, line = url.strip(), line.strip().lower()
            if url == 'unknown_url':
                continue
            line_domain = get_domain(url)
            if line_domain == domain:
                # a domain's translation may span multiple lines
                tran_corpus[url].append(line)
                domain_found = True
            elif domain_found:
                # the file is grouped by domain, so stop once the block ends
                break
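# Expected layout of the tran_en file, inferred from the parsing above
# (an assumption, not documented in this section): one tab-separated
# record per line, grouped by domain, e.g.
#
#   http://www.ec.gc.ca/fr/page1<TAB>translated text of page1 ...
#   http://www.ec.gc.ca/fr/page1<TAB>more translated text of page1 ...
#   http://www.ec.gc.ca/fr/page2<TAB>translated text of page2 ...
#
# so tran_corpus['http://www.ec.gc.ca/fr/page1'] collects every translated
# line seen for that URL.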
def run1():
    #debug_domains = ['www.dakar.com', 'www.luontoportti.com', 'www.nauticnews.com', 'www.the-great-adventure.fr']
    debug_domains = ['eu.blizzard.com']
    with open(train_pairs, 'rt') as f:
        for line in f:
            en_url, gold = line.strip().split('\t')
            en_url, gold = en_url.strip(), gold.strip()
            if debug and get_domain(en_url) not in debug_domains:
                continue
            cans, scores = get_candidates(en_url)
            count_evaluate(en_url, cans, scores, gold)
    print_domain_summary(current_domain)
    print_summary()
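# count_evaluate() is not shown in this section. Based on the '---gold...=rank'
# lines parsed by read_output_thanh() above (where rank -1 means the gold URL
# was never retrieved), a hypothetical sketch might record the gold URL's rank
# among the scored candidates. Names and the output format are guesses:
def count_evaluate(en_url, cans, scores, gold):
    rank = cans.index(gold) if gold in cans else -1
    print('---gold=%d %s' % (rank, gold))
    return rank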
def get_candidates(en_url):
    '''Get all candidates for the given source English URL'''
    domain = get_domain(en_url)
    # TODO: put these comments back when running on train data?? We do all
    # en_urls of a domain together, so load the domain corpus once.
    load_domain_corpus(domain)
    en_page = en_corpus[en_url]
    en_text = ' '.join(en_page.tokens)
    if en_url not in cal_pairs:
        return
    fr_url = cal_pairs[en_url]
    fr_text = ' '.join(tran_corpus[fr_url])
    if len(fr_text) == 0 or len(en_text) == 0:
        # sentinel scores for empty documents
        print('%s\t%s\t%f\t%f\t%f' % (en_url, fr_url, 20, 20, 20))
        return
    #print('debug len:', len(fr_text), 'en_len', len(en_text), sep='\t')
    en_model, en_len = get_pro_model(' '.join(en_page.tokens))
    fr_model, fr_len = get_pro_model(' '.join(tran_corpus[cal_pairs[en_url]]))
    kl = get_kl_divergence(en_model, en_len, fr_model, fr_len)
    cr = get_cross_entropy(en_model, en_len, fr_model, fr_len)
    cosine = get_cosine(en_model, en_len, fr_model, fr_len)
    print('%s\t%s\t%f\t%f\t%f' % (en_url, fr_url, kl, cr, cosine))
    return  # debug short-circuit: everything below is currently unreachable

    import pdb
    pdb.set_trace()

    # clues 1
    en_page = en_corpus[en_url]
    unique_en_tokens = list(set(en_page.tokens))
    score = get_chance_score(en_page.tokens, unique_en_tokens)
    # columns: min_score, en_len, en_vocab_size, col_size, col_vocab_size, en_url
    print('%f\t%d\t%d\t%d\t%d\t%s' % (score, len(en_page.tokens), len(unique_en_tokens), col_size, col_vocab_size, en_url))
    return

    doc_col_scores = col_model_for_a_doc(en_page.tokens, unique_en_tokens)
    cans, scores = [], []
    #cans, scores = [''], [0]
    #max_score = float('-inf')
    #pq_result = []
    for fr_url in fr_corpus:
        #lrate = en_page.length / float(fr_corpus[fr_url].length)
        #if use_filter and (lrate < LENGTH_LOWER_BOUND or lrate > LENGTH_UPPER_BOUND):  # filter by length
        #    continue
        #score = score_original(fr_url, en_page.tokens)
        #print('1:' + str(score))
        #score = score_original_optimal(fr_url, unique_en_tokens, doc_col_scores, max_score)
        score = score_original_optimal(fr_url, unique_en_tokens, doc_col_scores)
        #print('2:' + str(score))
        if score is not None:
            cans.append(fr_url)
            scores.append(score)
        #if score > max_score:
        #    max_score = score
        #    cans[0] = fr_url
        #    scores[0] = score
        #if len(pq_result) < output_top:
        #    heapq.heappush(pq_result, (score, fr_url))
        #else:
        #    heapq.heappushpop(pq_result, (score, fr_url))
    cans, scores = sort_candidates(cans, scores)
    #cans, scores = np.array(cans), np.array(scores)
    return cans, scores
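# get_pro_model / get_kl_divergence / get_cross_entropy / get_cosine are not
# defined in this section. Below are sketches under the assumption that
# get_pro_model returns a unigram count model plus the token total, and that
# the measures are the standard KL divergence, cross-entropy, and cosine
# similarity. Signatures mirror the calls above, but the bodies are guesses,
# not the original implementations.
import math
from collections import Counter

def get_pro_model(text):
    tokens = text.split()
    return Counter(tokens), len(tokens)

def get_kl_divergence(p_model, p_len, q_model, q_len):
    # D(P || Q) = sum_w P(w) * log(P(w) / Q(w)), add-one smoothing for words unseen in Q
    kl = 0.0
    for w, c in p_model.items():
        p = c / p_len
        q = (q_model.get(w, 0) + 1) / (q_len + len(q_model) + 1)
        kl += p * math.log(p / q)
    return kl

def get_cross_entropy(p_model, p_len, q_model, q_len):
    # H(P, Q) = -sum_w P(w) * log Q(w), with the same smoothing
    ce = 0.0
    for w, c in p_model.items():
        p = c / p_len
        q = (q_model.get(w, 0) + 1) / (q_len + len(q_model) + 1)
        ce -= p * math.log(q)
    return ce

def get_cosine(p_model, p_len, q_model, q_len):
    # cosine similarity between the two term-count vectors
    dot = sum(c * q_model.get(w, 0) for w, c in p_model.items())
    norm_p = math.sqrt(sum(c * c for c in p_model.values()))
    norm_q = math.sqrt(sum(c * c for c in q_model.values()))
    return dot / (norm_p * norm_q) if norm_p and norm_q else 0.0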