def stats(data="", path=""): vn_vocab,vn_bare_vocab,vn_long_vocab,vn_long_bare_vocab \ =load_vn_vocab() fixing_map = load_hard_fixing() wrong_words_counters = dict() bigram_counters = dict() if data != "": questions = data elif path != "": from load_products import load_question_from_file load_question_from_file(path) else: from load_products import load_questions questions = load_questions() cdir = os.path.abspath(os.path.dirname(__file__)) for qs in questions: #print qs qs = unicode(qs) qs = qs.lower() qs = norm_fix_common(qs, fixing_map) _tokens = split_sentece(qs) tokens = [] for token in _tokens: token = utils.accent2bare(token) tokens.append(token) for i in xrange(len(tokens)): if not tokens[i] in vn_bare_vocab: utils.add_dict_counter(wrong_words_counters, tokens[i]) if i < len(tokens) - 1: utils.add_dict_counter(bigram_counters, u"%s %s" % (tokens[i], tokens[i + 1])) sorted_wrong_tokens = utils.sort_dict(wrong_words_counters) sorted_bigram_counter = utils.sort_dict(bigram_counters) f_wrong = open("%s/models/data/out/wrong_tokens.dat" % cdir, "w", encoding="utf-8") f_bigram_stats = open("%s/models/data/out/bigram_tokens.dat" % cdir, "w", encoding="utf-8") for kv in sorted_wrong_tokens: ss = DIGIT.search(kv[0]) if ss != None: continue f_wrong.write(u"%s : %s\n" % (kv[0], kv[1])) f_wrong.close() for kv in sorted_bigram_counter: f_bigram_stats.write(u"%s : %s\n" % (kv[0], kv[1])) f_bigram_stats.close()
def get_voca_from_train_data(self, train_data):
    """
    :param train_data: [ seq_sample, ... ]
    :return:
    """
    cuid_count = dict()
    city_count = dict()
    query_count = dict()
    for seq_sample in train_data:
        cuid = seq_sample['cuid']
        cuid_count[cuid] = cuid_count.get(cuid, 0) + 1
        session_queries = seq_sample['session_queries']
        for se_queries in session_queries:
            for query in se_queries:
                loc_city = query['loc_city']
                city_count[loc_city] = city_count.get(loc_city, 0) + 1
                query_key = query['query_key']
                query_count[query_key] = query_count.get(query_key, 0) + 1
    cuid_count = sort_dict(cuid_count)
    city_count = sort_dict(city_count)
    query_count = sort_dict(query_count)
    tmp_voca = get_voca_from_count(cuid_count)
    cuid_voca = {
        'cuid_to_ix': tmp_voca['w_to_ix'],
        'ix_to_cuid': tmp_voca['ix_to_w'],
        'cuid_freq': tmp_voca['w_count']
    }
    tmp_voca = get_voca_from_count(city_count)
    city_voca = {
        'city_to_ix': tmp_voca['w_to_ix'],
        'ix_to_city': tmp_voca['ix_to_w'],
        'city_freq': tmp_voca['w_count']
    }
    tmp_voca = get_voca_from_count(query_count, ['<START>'])
    query_voca = {
        'query_to_ix': tmp_voca['w_to_ix'],
        'ix_to_query': tmp_voca['ix_to_w'],
        'query_freq': tmp_voca['w_count']
    }
    return {
        'cuid_voca': cuid_voca,
        'city_voca': city_voca,
        'query_voca': query_voca
    }

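# `get_voca_from_count` is not shown above. From the way its result is
# unpacked ('w_to_ix', 'ix_to_w', 'w_count') it plausibly turns the sorted
# (word, count) pairs into dense index mappings, with optional special tokens
# such as '<START>' placed first. A hedged sketch under that assumption; the
# real implementation may differ:
def get_voca_from_count(sorted_counts, special_tokens=None):
    w_to_ix, ix_to_w, w_count = {}, {}, {}
    for tok in (special_tokens or []):
        ix = len(w_to_ix)
        w_to_ix[tok], ix_to_w[ix], w_count[tok] = ix, tok, 0
    for word, count in sorted_counts:
        if word in w_to_ix:
            continue
        ix = len(w_to_ix)
        w_to_ix[word], ix_to_w[ix], w_count[word] = ix, word, count
    return {'w_to_ix': w_to_ix, 'ix_to_w': ix_to_w, 'w_count': w_count}
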
def clean_torrent(self):
    cd = self.cleaned_data
    tdata = cd['torrent'].read()
    try:
        tdict = bdecode(tdata)
    except:
        raise forms.ValidationError('.torrent file is not valid')
    tdict['announce'] = 'R'  # need to replace later in process
    cd['torrent'] = bencode(sort_dict(tdict))
    return cd

def check_dict(AllFiles):
    """
    Find incorrect data in AllFiles and remove it
    """
    import collections
    for exp_name in list(AllFiles.keys()):
        for exp_num in list(AllFiles[exp_name].keys()):
            if not AllFiles[exp_name][exp_num]:
                utils.logger.warning(
                    "There is no '2dseq' data in {0}\\{1}".format(
                        exp_name, exp_num))
                del AllFiles[exp_name][exp_num]
        AllFiles[exp_name] = utils.sort_dict(AllFiles[exp_name])
        if not AllFiles[exp_name]:
            utils.logger.warning(
                "There are no experiments with '2dseq' data in {}".format(
                    exp_name))
            del AllFiles[exp_name]
    return AllFiles

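# A hedged usage sketch: AllFiles is assumed to be a nested dict of
# {experiment_name: {experiment_number: scan_data}}, where empty scan data
# marks a missing '2dseq' reconstruction. The names and values below are
# illustrative only, not taken from the source.
AllFiles = {
    'rat_01': {'3': ['2dseq'], '4': []},   # exp 4 has no data -> dropped
    'rat_02': {'1': []},                   # whole experiment dropped
}
AllFiles = check_dict(AllFiles)
# expected result: {'rat_01': {'3': ['2dseq']}}
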
def stats_bigram():
    vn_vocab, vn_bare_vocab, vn_long_vocab, vn_long_bare_vocab \
        = load_vn_vocab()
    vn_special_words = load_special_words()
    f = open("%s/models/data/out/bigram_tokens.dat" % cdir)
    bigram_counters = {}
    while True:
        line = f.readline()
        if line == "":
            break
        line = line.strip()
        parts = line.split(" : ")
        if len(parts) != 2:
            continue
        bigram = parts[0]
        if DIGIT.search(bigram) != None:
            continue
        cc = int(parts[1])
        bigram_counters[bigram] = cc
    f.close()
    wrong_bigram_candidates = {}
    true_bigram_candidates = {}
    for bigram, counter in bigram_counters.iteritems():
        if is_wrong_bare_bigram_candidates(bigram, vn_bare_vocab, vn_special_words):
            if counter > 2:
                wrong_bigram_candidates[bigram] = counter
        else:
            if counter > 8:
                true_bigram_candidates[bigram] = counter

    # Searching for candidates
    f_out = open("%s/models/data/out/bigram_candidates.dat" % cdir, "w", encoding="utf-8")
    f_rules_fix = open("%s/models/data/out/rule_one_fix.dat" % cdir, "w", encoding="utf-8")
    TOP = 10
    # print len(true_bigram_candidates)
    # exit(-1)
    hierachical_true_dict = generate_hierachical_alphabet_dict(
        true_bigram_candidates)
    hierachical_true_first_ab_dict = generate_hierachical_first_alphabet_dict(
        true_bigram_candidates)
    utils.pickle_save(hierachical_true_dict,
                      "%s/models/data/out/hierachical_true_dict.pkl" % cdir)
    utils.pickle_save(
        hierachical_true_first_ab_dict,
        "%s/models/data/out/hierachical_true_first_ab_dict.pkl" % cdir)
    utils.pickle_save(wrong_bigram_candidates,
                      "%s/models/data/out/wrong_bigrams.pkl" % cdir)
    print "Searching for candidates..."
    cc = 0
    for wrong_bigram, counter in wrong_bigram_candidates.iteritems():
        cc += 1
        print "\r%s" % cc,
        d_candidates = {}
        for c in set(wrong_bigram):
            if c == " ":
                continue
            try:
                sub_dict = hierachical_true_dict[c]
            except:
                continue
            for candidate, counter in sub_dict.iteritems():
                try:
                    d_candidates[candidate]
                except:
                    d_candidates[candidate] = cal_sim_score(
                        wrong_bigram, candidate, counter)
        sorted_score = utils.sort_dict(d_candidates)
        f_out.write(u"%s:\n" % wrong_bigram)
        ss = []
        for i in xrange(TOP):
            ss.append("%s:%s " % (sorted_score[i][0], sorted_score[i][1]))
        ss = " ".join(ss)
        if sorted_score[0][1] > 1 and sorted_score[1][1] < 1:
            # print "A Rule"
            f_rules_fix.write(u"%s : %s\n" % (wrong_bigram, sorted_score[0][0]))
            f_rules_fix.flush()
        f_out.write(u"\t%s\n" % ss)
    f_out.close()
    f_rules_fix.close()
    print "\n\n\tDone"

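# `generate_hierachical_alphabet_dict` is not shown. From the lookups above
# (hierachical_true_dict[c] yields a {candidate_bigram: count} sub-dict), it
# plausibly buckets each known-good bigram under every character it contains,
# so a misspelled bigram is only compared against bigrams that share a
# character with it. A rough sketch under that assumption; the actual helper
# in the source may behave differently:
def generate_hierachical_alphabet_dict(bigram_counters):
    by_char = {}
    for bigram, count in bigram_counters.items():
        for ch in set(bigram):
            if ch == " ":
                continue
            by_char.setdefault(ch, {})[bigram] = count
    return by_char
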
session.log.folder = Path(
    session.log.folder).expanduser().resolve().as_posix()
session.checkpoint.folder = session.checkpoint.folder \
    .replace('{name}', experiment.name) \
    .replace('{tags}', '_'.join(experiment.tags)) \
    .replace('{rand}', rand_id)
session.checkpoint.folder = Path(
    session.checkpoint.folder).expanduser().resolve().as_posix()

if 'state_dict' in experiment.model:
    experiment.model.state_dict = Path(
        experiment.model.state_dict).expanduser().resolve().as_posix()
if 'state_dict' in experiment.optimizer:
    experiment.optimizer.state_dict = Path(
        experiment.optimizer.state_dict).expanduser().resolve().as_posix()

sort_dict(
    experiment,
    ['name', 'tags', 'epoch', 'samples', 'model', 'optimizer', 'sessions'])
sort_dict(session, [
    'epochs', 'batch_size', 'losses', 'seed', 'cpus', 'device', 'samples',
    'status', 'datetime_started', 'datetime_completed', 'data', 'log',
    'checkpoint', 'git', 'gpus'
])

experiment.sessions.append(session)
pyaml.pprint(experiment, sort_dicts=False, width=200)
del session
# endregion

# region Building phase
# Seeds (set them after the random run id is generated)
set_seeds(experiment.session.seed)

def fix_wrong_words_heuristic(data="", path=""):
    vn_vocab = load_vn_vocab()
    fixing_map = load_hard_fixing()
    hard_regex = HardRegex()
    hard_regex.load_from_file()
    cc = []
    for c in cc:
        if not c in vn_vocab:
            print "Wrong", c
    # exit(-1)
    wrong_words_counters = dict()
    if data != "":
        questions = data
    elif path != "":
        from load_products import load_question_from_file
        questions = load_question_from_file(path)
    else:
        from load_products import load_questions
        questions = load_questions()
    f_fix = open("%s/models/data/out/fixing" % cdir, "w", encoding="utf-8")
    bi_forward = dict()
    bi_backward = dict()
    question_norm1 = []
    for qs in questions:
        qs = unicode(qs)
        qs = qs.lower()
        qs = hard_regex.replace(qs)
        tokens = split_sentece(qs)
        qq = []
        ii = -1
        for token in tokens:
            ii += 1
            token = norm_token(token)
            try:
                token = fixing_map[token]
                qq.append(token)
                continue
            except:
                pass
            if is_skip_token(token):
                continue
            else:
                if not token in vn_vocab:
                    # if token == u"luc":
                    #     print "LUC here ", qs
                    try:
                        if ii > 0:
                            # if tokens[ii-1] == u"cường":
                            #     print "\t", token
                            try:
                                bi_backward[token][tokens[ii - 1]] += 1
                            except:
                                try:
                                    mm = bi_backward[token]
                                except:
                                    mm = dict()
                                    bi_backward[token] = mm
                                try:
                                    mm[tokens[ii - 1]] += 1
                                except:
                                    mm[tokens[ii - 1]] = 1
                        if ii < len(tokens) - 1:
                            try:
                                mm = bi_forward[token]
                            except:
                                mm = dict()
                                bi_forward[token] = mm
                            try:
                                mm[tokens[ii + 1]] += 1
                            except:
                                mm[tokens[ii + 1]] = 1
                        wrong_words_counters[token] += 1
                    except:
                        wrong_words_counters[token] = 1
                qq.append(token)
        ss = " ".join(qq)
        question_norm1.append(qq)
        f_fix.write(u"%s\n" % ss)
    f_fix.close()

    kvs = []
    for key, value in sorted(wrong_words_counters.iteritems(), key=lambda (k, v): (v, k)):
        kvs.append([key, value])
    TOP = 400
    f = open("%s/models/data/out/popular_wrong_words.dat" % cdir, "w", encoding="utf-8")
    for i in xrange(1, TOP):
        f.write(u"%s\n" % kvs[-i][0])
        # print kvs[-i][0], kvs[-i][1]
    f.close()
    # TOP = 300
    candidates_f = dict()
    candidates_b = dict()
    revert_f = dict()
    revert_b = dict()
    T_TOP = 2
    T_MIN = 8
    f_forward_exist = dict()
    f_backward_exist = dict()
    for i in xrange(1, TOP):
        k = kvs[-i][0]
        # print kvs[-i][0], kvs[-i][1]
        forward_exist = True
        backward_exist = True
        try:
            f_forward = utils.sort_dict(bi_forward[k])
        except:
            forward_exist = False
        try:
            f_backward = utils.sort_dict(bi_backward[k])
        except:
            backward_exist = False
        f_forward_exist[k] = forward_exist
        f_backward_exist[k] = backward_exist
        if forward_exist:
            sz = min(T_TOP, len(f_forward))
            for i in xrange(sz):
                if f_forward[i][1] > T_MIN:
                    try:
                        # print f_forward[i][0]
                        revert_f[f_forward[i][0]].add(k)
                    except:
                        revert_f[f_forward[i][0]] = set()
                        revert_f[f_forward[i][0]].add(k)
        if backward_exist:
            sz = min(T_TOP, len(f_backward))
            for i in xrange(sz):
                if f_backward[i][1] > T_MIN:
                    try:
                        revert_b[f_backward[i][0]].add(k)
                    except:
                        revert_b[f_backward[i][0]] = set()
                        revert_b[f_backward[i][0]].add(k)
    # print revert_b
    # print revert_f
    b_stores = dict()
    f_stores = dict()
    for q in question_norm1:
        i = -1
        for token in q:
            i += 1
            if i < len(q) - 1:
                w_next = q[i + 1]
                if w_next in vn_vocab:
                    try:
                        b_own = revert_b[token]
                        # Saving backward word context
                        try:
                            bb = b_stores[w_next]
                        except:
                            bb = dict()
                            b_stores[w_next] = bb
                        try:
                            bb[token] += 1
                        except:
                            bb[token] = 1
                        # Adding to the backward candidates
                        for w in b_own:
                            try:
                                d_cand = candidates_b[w]
                            except:
                                d_cand = dict()
                                candidates_b[w] = d_cand
                            try:
                                d_cand[w_next] += 1
                            except:
                                d_cand[w_next] = 1
                    except:
                        pass
            if i > 0:
                w_before = q[i - 1]
                if w_before in vn_vocab:
                    try:
                        b_own = revert_f[token]
                        # Saving forward word context
                        try:
                            ff = f_stores[w_before]
                        except:
                            ff = dict()
                            f_stores[w_before] = ff
                        try:
                            ff[token] += 1
                        except:
                            ff[token] = 1
                        # Adding to the forward candidates
                        for w in b_own:
                            try:
                                d_cand = candidates_f[w]
                            except:
                                d_cand = dict()
                                candidates_f[w] = d_cand
                            try:
                                d_cand[w_before] += 1
                            except:
                                d_cand[w_before] = 1
                    except:
                        pass

    f = open("%s/models/data/out/fix_candidates" % cdir, "w", encoding="utf-8")
    one_fix = dict()
    f_one_fix = open("%s/models/data/out/one_fix.dat" % cdir, "w", encoding="utf-8")
    f_multi_fix = open("%s/models/data/out/multi_fix.dat" % cdir, "w", encoding="utf-8")
    N_MULTI = 2
    N_CONTEXT = 3
    THRES_2 = 0.7
    for k, v in b_stores.iteritems():
        v = utils.sort_dict(v)
        b_stores[k] = v
    for k, v in f_stores.iteritems():
        v = utils.sort_dict(v)
        f_stores[k] = v
    for k, v in candidates_b.iteritems():
        if f_backward_exist[k]:
            # print "Error_b: ", k
            ss = utils.sort_dict(v)
            # print "\t", ss
            ll = []
            l_candidates = []
            l_ref_scores = []
            for s in ss:
                ll.append(u"%s:%s " % (s[0], s[1]))
                l_candidates.append(s[0])
                l_ref_scores.append(s[1])
            ll = " ".join(ll)
            f.write(u"%s:\n" % k)
            f.write(u"\t%s\n" % ll)
            true_candidates, sorted_socre = get_candidate(k, l_candidates, l_ref_scores)
            ll2 = []
            for i in xrange(len(true_candidates)):
                ll2.append(u"%s:%s " % (true_candidates[i], sorted_socre[i]))
            f.write(u"\t%s\n" % " ".join(ll2))
            # Write one fix:
            if sorted_socre[1] < 1 and sorted_socre[0] > 1:
                one_fix[k] = true_candidates[0]
            elif sorted_socre[1] > THRES_2:
                for i in reversed(xrange(2)):
                    fix = true_candidates[i]
                    try:
                        ll_context = []
                        back_context = b_stores[fix]
                        for i in xrange(N_CONTEXT):
                            ll_context.append(back_context[i][0])
                        f_multi_fix.write("B\t%s\t%s\t%s\n" % (k, fix, " ".join(ll_context)))
                    except:
                        pass
    f.write(u"\n\n\n")
    for k, v in candidates_f.iteritems():
        if f_forward_exist[k]:
            # print "Error_f: ", k
            ss = utils.sort_dict(v)
            # print "\t", ss
            ll = []
            l_candidates = []
            l_ref_scores = []
            for s in ss:
                ll.append(u"%s:%s " % (s[0], s[1]))
                l_candidates.append(s[0])
                l_ref_scores.append(s[1])
            ll = " ".join(ll)
            f.write(u"%s:\n" % k)
            f.write(u"\t%s\n" % ll)
            true_candidates, sorted_socre = get_candidate(k, l_candidates, l_ref_scores)
            ll2 = []
            for i in xrange(len(true_candidates)):
                ll2.append(u"%s:%s " % (true_candidates[i], sorted_socre[i]))
            f.write(u"\t%s\n" % " ".join(ll2))
            # one fix:
            if sorted_socre[1] < 1 and sorted_socre[0] > 1:
                one_fix[k] = true_candidates[0]
            elif sorted_socre[1] > THRES_2:
                for i in reversed(xrange(2)):
                    fix = true_candidates[i]
                    try:
                        ll_context = []
                        forward_context = f_stores[fix]
                        for i in xrange(N_CONTEXT):
                            ll_context.append(forward_context[i][0])
                        f_multi_fix.write("F\t%s\t%s\t%s\n" % (k, fix, " ".join(ll_context)))
                    except:
                        pass
    f.close()
    for k, v in one_fix.iteritems():
        f_one_fix.write("%s\t%s\n" % (k, v))
    f_one_fix.close()
    f_multi_fix.close()

def fix(self, sen2):
    qs = sen2
    if qs == None:
        return
    try:
        qs = unicode(qs, encoding="utf-8")
    except:
        pass
    qs = qs.lower()
    qs = self.unigram_fixing.fix(qs)
    qs = vnbarenorm.norm_fix_common(qs, self.one_fix_map)
    _tokens = vnbarenorm.split_sentece(qs)
    back_ref = " ".join(_tokens)
    tokens = []
    for token in _tokens:
        token = utils.accent2bare(token)
        tokens.append(token)
    bare_raw_sen = " ".join(tokens)
    for reg, repl in self.one_fix_map.iteritems():
        bare_raw_sen = reg.sub(repl, bare_raw_sen)
    for i in xrange(len(tokens) - 1):
        bigram = u"%s %s" % (tokens[i], tokens[i + 1])
        # print "\t%s" % bigram
        if vnbarenorm.is_wrong_bare_bigram_candidates(bigram, self.vn_bare_vocab, self.vn_special_words):
            # print bigram
            d_candidates = {}
            c = bigram[0]
            # self.vn_true_bare_bigram_f_ddict
            # for c in set(bigram):
            if c == " ":
                continue
            try:
                sub_dict = self.vn_true_bare_bigram_hie_ddict[c]
            except:
                continue
            for candidate, counter in sub_dict.iteritems():
                try:
                    d_candidates[candidate]
                except:
                    sim_score = vnbarenorm.cal_sim_score(bigram, candidate, counter)
                    if sim_score < 0.7:
                        continue
                    d_candidates[candidate] = vnbarenorm.cal_sim_score(bigram, candidate, counter)
            if len(d_candidates) == 0:
                continue
            sorted_score = utils.sort_dict(d_candidates)
            # print sorted_score
            if sorted_score[0][1] > 1:  # and sorted_score[1][1] < 1:
                repl = sorted_score[0][0]
                reg = re.compile(r"\b%s\b" % bigram, re.UNICODE)
                bare_raw_sen = reg.sub(repl, bare_raw_sen)
    return bare_raw_sen, back_ref

def filter_combo_se(fname=config.POLY_ADR_PATH):
    combo2stitch = {}
    combo2se = defaultdict(set)
    se2name = {}
    seCounter = dict()
    drugs = set()
    fin = open(fname)
    print 'Reading: %s' % fname
    fin.readline()
    for line in fin:
        stitch_id1, stitch_id2, se, se_name = line.strip().split(',')
        drugs.add(stitch_id1)
        drugs.add(stitch_id2)
        combo = stitch_id1 + '_' + stitch_id2
        combo2stitch[combo] = [stitch_id1, stitch_id2]
        combo2se[combo].add(se)
        utils.add_dict_counter(seCounter, se)
        se2name[se] = se_name
    fin.close()
    n_interactions = sum([len(v) for v in combo2se.values()])
    print 'Before'
    print 'Drug combinations: %d Side effects: %d' % (len(combo2stitch), len(se2name))
    print 'Drug-drug interactions: %d' % (n_interactions)
    print 'Num drug: %d' % (len(drugs))
    seCounterSorted = utils.sort_dict(seCounter)
    validSe = set()
    se2Id = dict()
    for i in range(config.NUM_SE):
        se = seCounterSorted[i][0]
        validSe.add(seCounterSorted[i][0])
        utils.get_update_dict_index(se2Id, se)
    print validSe
    drug2Id = dict()
    se2Combos = dict()
    nInteractions = 0
    combosCounter = dict()
    for combo, ses in combo2se.iteritems():
        t1, t2 = combo2stitch[combo]
        id1 = utils.get_update_dict_index(drug2Id, t1)
        id2 = utils.get_update_dict_index(drug2Id, t2)
        for se in ses:
            seId = utils.get_dict(se2Id, se, -1)
            if seId != -1:
                combos = utils.get_insert_key_dict(se2Combos, seId, [])
                combos.append([id1, id2])
                nInteractions += 1
                utils.add_dict_counter(combosCounter, "%s_%s" % (id1, id2))
    nDrug = len(drug2Id)
    print 'After'
    print 'Drug combinations: %d Side effects: %d' % (len(combosCounter), config.NUM_SE)
    print 'Drug-drug interactions: %d' % nInteractions
    print 'Num drug: %d' % (nDrug)
    print 'Save to file...'
    drug_drug_adj_list = []
    for se, combos in se2Combos.iteritems():
        drugdrugMatrix = np.zeros((nDrug, nDrug))
        for d1, d2 in combos:
            drugdrugMatrix[d1, d2] = drugdrugMatrix[d2, d1] = 1
        drug_drug_adj_list.append(sp.csr_matrix(drugdrugMatrix))
    utils.save_obj(drug_drug_adj_list, config.PROCESSED_COMBO_ADR)
    utils.save_obj(drug2Id, config.DRUG_MAP)
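
# The index/lookup helpers used above come from the same (unshown) utils
# module. Their behaviour is inferred from how they are called; the sketches
# below are assumptions, not the actual implementation:
def get_update_dict_index(d, key):
    # Return the index assigned to `key`, assigning the next free index on
    # first sight (used to densely number drugs and side effects).
    if key not in d:
        d[key] = len(d)
    return d[key]

def get_dict(d, key, default):
    # Plain lookup with a default, i.e. d.get(key, default).
    return d.get(key, default)

def get_insert_key_dict(d, key, default):
    # Return d[key], inserting `default` first if the key is missing,
    # i.e. d.setdefault(key, default).
    return d.setdefault(key, default)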