Example #1
def stats(data="", path=""):
    vn_vocab,vn_bare_vocab,vn_long_vocab,vn_long_bare_vocab \
        =load_vn_vocab()
    fixing_map = load_hard_fixing()
    wrong_words_counters = dict()
    bigram_counters = dict()
    if data != "":
        questions = data
    elif path != "":
        from load_products import load_question_from_file
        questions = load_question_from_file(path)
    else:
        from load_products import load_questions
        questions = load_questions()
    cdir = os.path.abspath(os.path.dirname(__file__))

    for qs in questions:
        #print qs
        qs = unicode(qs)
        qs = qs.lower()
        qs = norm_fix_common(qs, fixing_map)
        _tokens = split_sentece(qs)

        tokens = []
        for token in _tokens:
            token = utils.accent2bare(token)
            tokens.append(token)

        for i in xrange(len(tokens)):
            if tokens[i] not in vn_bare_vocab:
                utils.add_dict_counter(wrong_words_counters, tokens[i])
            if i < len(tokens) - 1:
                utils.add_dict_counter(bigram_counters,
                                       u"%s %s" % (tokens[i], tokens[i + 1]))

    sorted_wrong_tokens = utils.sort_dict(wrong_words_counters)
    sorted_bigram_counter = utils.sort_dict(bigram_counters)
    # Note: the encoding argument implies "open" here is codecs.open or
    # io.open; the Python 2 builtin open() does not accept encoding.
    f_wrong = open("%s/models/data/out/wrong_tokens.dat" % cdir,
                   "w",
                   encoding="utf-8")
    f_bigram_stats = open("%s/models/data/out/bigram_tokens.dat" % cdir,
                          "w",
                          encoding="utf-8")

    for kv in sorted_wrong_tokens:
        ss = DIGIT.search(kv[0])
        if ss is not None:
            continue
        f_wrong.write(u"%s : %s\n" % (kv[0], kv[1]))
    f_wrong.close()
    for kv in sorted_bigram_counter:
        f_bigram_stats.write(u"%s : %s\n" % (kv[0], kv[1]))
    f_bigram_stats.close()
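
Both helpers above come from the project's utils module and are not shown on this page. A minimal sketch consistent with how this example calls them (add_dict_counter increments a key's count, sort_dict returns (key, count) pairs sorted most-frequent-first) could look like this; note that later examples on this page use same-named helpers from other projects with different semantics:

def add_dict_counter(counter, key, amount=1):
    # Increment a key's count, creating the key on first use.
    counter[key] = counter.get(key, 0) + amount

def sort_dict(counter):
    # Return (key, count) pairs sorted by descending count.
    return sorted(counter.items(), key=lambda kv: kv[1], reverse=True)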
Example #2
    def get_voca_from_train_data(self, train_data):
        """
        :param train_data: [ seq_sample, ... ]
        :return:
        """
        cuid_count = dict()
        city_count = dict()
        query_count = dict()
        for seq_sample in train_data:
            cuid = seq_sample['cuid']
            cuid_count[cuid] = cuid_count.get(cuid, 0) + 1
            session_queries = seq_sample['session_queries']
            for se_queries in session_queries:
                for query in se_queries:
                    loc_city = query['loc_city']
                    city_count[loc_city] = city_count.get(loc_city, 0) + 1
                    query_key = query['query_key']
                    query_count[query_key] = query_count.get(query_key, 0) + 1
        cuid_count = sort_dict(cuid_count)
        city_count = sort_dict(city_count)
        query_count = sort_dict(query_count)

        tmp_voca = get_voca_from_count(cuid_count)
        cuid_voca = {
            'cuid_to_ix': tmp_voca['w_to_ix'],
            'ix_to_cuid': tmp_voca['ix_to_w'],
            'cuid_freq': tmp_voca['w_count']
        }
        tmp_voca = get_voca_from_count(city_count)
        city_voca = {
            'city_to_ix': tmp_voca['w_to_ix'],
            'ix_to_city': tmp_voca['ix_to_w'],
            'city_freq': tmp_voca['w_count']
        }
        tmp_voca = get_voca_from_count(query_count, ['<START>'])
        query_voca = {
            'query_to_ix': tmp_voca['w_to_ix'],
            'ix_to_query': tmp_voca['ix_to_w'],
            'query_freq': tmp_voca['w_count']
        }
        return {
            'cuid_voca': cuid_voca,
            'city_voca': city_voca,
            'query_voca': query_voca
        }
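
get_voca_from_count is likewise not shown. From its call sites above (it consumes the sorted (key, count) pairs and an optional list of special symbols such as '<START>', and returns 'w_to_ix', 'ix_to_w' and 'w_count' maps), a plausible reconstruction:

def get_voca_from_count(sorted_counts, special_symbols=None):
    # Hypothetical sketch: build word<->index maps from (word, count)
    # pairs, reserving the first indices for any special symbols.
    w_to_ix, ix_to_w, w_count = {}, {}, {}
    for sym in (special_symbols or []):
        w_to_ix[sym] = len(w_to_ix)
        ix_to_w[len(ix_to_w)] = sym
        w_count[sym] = 0
    for word, count in sorted_counts:
        w_to_ix[word] = len(w_to_ix)
        ix_to_w[len(ix_to_w)] = word
        w_count[word] = count
    return {'w_to_ix': w_to_ix, 'ix_to_w': ix_to_w, 'w_count': w_count}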
Example #3
	def clean_torrent(self):
		cd = self.cleaned_data
		tdata = cd['torrent'].read()
		try:
			tdict = bdecode(tdata)
		except Exception:
			raise forms.ValidationError('.torrent file is not valid')
		tdict['announce'] = 'R'  # placeholder; replaced later in the process
		cd['torrent'] = bencode(sort_dict(tdict))
		return cd
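
sort_dict serves a different purpose in this example: the bencoding format used by .torrent files requires dictionary keys to be serialized in sorted order, so the decoded metadata is re-sorted before re-encoding. A minimal sketch of such a helper, assuming it returns an insertion-ordered mapping:

from collections import OrderedDict

def sort_dict(d):
    # Hypothetical recursive sort: bencoded dictionaries must list
    # their keys in lexicographic order.
    out = OrderedDict()
    for key in sorted(d):
        value = d[key]
        if isinstance(value, dict):
            value = sort_dict(value)
        out[key] = value
    return out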
Example #4
def check_dict(AllFiles):
    """
Find uncorrect data in AllFiles and remove it
    """
    import collections

    for exp_name in list(AllFiles.keys()):

        for exp_num in list(AllFiles[exp_name].keys()):
            if not AllFiles[exp_name][exp_num]:
                utils.logger.warning(
                    "There is no '2dseq' data in {0}\{1}".format(
                        exp_name, exp_num))
                del AllFiles[exp_name][exp_num]

        AllFiles[exp_name] = utils.sort_dict(AllFiles[exp_name])

        if not AllFiles[exp_name]:
            utils.logger.warning(
                "There is no any experiments with '2dseq' data in {}".format(
                    exp_name))
            del AllFiles[exp_name]

    return AllFiles
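
A quick illustration of the pruning behaviour, with hypothetical data (utils.logger and utils.sort_dict come from the project's own utilities):

AllFiles = {
    'exp_A': {'3': ['2dseq'], '5': []},  # '5' has no '2dseq' data
    'exp_B': {'1': []},                  # every entry is empty
}
AllFiles = check_dict(AllFiles)
# 'exp_A'/'5' is dropped first, then 'exp_B' entirely, leaving
# {'exp_A': {'3': ['2dseq']}}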
Example #5
def stats_bigram():
    vn_vocab, vn_bare_vocab, vn_long_vocab, vn_long_bare_vocab \
        = load_vn_vocab()
    vn_special_words = load_special_words()
    f = open("%s/models/data/out/bigram_tokens.dat" % cdir)
    bigram_counters = {}
    for line in f:
        line = line.strip()

        parts = line.split(" : ")

        if len(parts) != 2:
            continue
        bigram = parts[0]
        if DIGIT.search(bigram) is not None:
            continue
        cc = int(parts[1])
        bigram_counters[bigram] = cc
    f.close()

    wrong_bigram_candidates = {}
    true_bigram_candidates = {}
    for bigram, counter in bigram_counters.iteritems():
        if is_wrong_bare_bigram_candidates(bigram, vn_bare_vocab,
                                           vn_special_words):
            if counter > 2:
                wrong_bigram_candidates[bigram] = counter
        else:
            if counter > 8:
                true_bigram_candidates[bigram] = counter

    #Searching for candidates

    f_out = open("%s/models/data/out/bigram_candidates.dat" % cdir,
                 "w",
                 encoding="utf-8")
    f_rules_fix = open("%s/models/data/out/rule_one_fix.dat" % cdir,
                       "w",
                       encoding="utf-8")
    TOP = 10
    #print len(true_bigram_candidates)
    #exit(-1)
    hierachical_true_dict = generate_hierachical_alphabet_dict(
        true_bigram_candidates)
    hierachical_true_first_ab_dict = generate_hierachical_first_alphabet_dict(
        true_bigram_candidates)
    utils.pickle_save(hierachical_true_dict,
                      "%s/models/data/out/hierachical_true_dict.pkl" % cdir)
    utils.pickle_save(
        hierachical_true_first_ab_dict,
        "%s/models/data/out/hierachical_true_first_ab_dict.pkl" % cdir)
    utils.pickle_save(wrong_bigram_candidates,
                      "%s/models/data/out/wrong_bigrams.pkl" % cdir)
    print "Searching for candidates..."

    cc = 0

    for wrong_bigram, counter in wrong_bigram_candidates.iteritems():
        cc += 1
        print "\r%s" % cc,

        d_candidates = {}

        for c in set(wrong_bigram):
            if c == " ":
                continue
            sub_dict = hierachical_true_dict.get(c)
            if sub_dict is None:
                continue

            for candidate, cand_counter in sub_dict.iteritems():
                if candidate not in d_candidates:
                    d_candidates[candidate] = cal_sim_score(
                        wrong_bigram, candidate, cand_counter)

        sorted_score = utils.sort_dict(d_candidates)

        f_out.write(u"%s:\n" % wrong_bigram)
        ss = []
        for i in xrange(min(TOP, len(sorted_score))):
            ss.append("%s:%s " % (sorted_score[i][0], sorted_score[i][1]))
        ss = " ".join(ss)
        if len(sorted_score) > 1 \
                and sorted_score[0][1] > 1 and sorted_score[1][1] < 1:
            #print "A Rule"
            f_rules_fix.write(u"%s : %s\n" %
                              (wrong_bigram, sorted_score[0][0]))
            f_rules_fix.flush()
        f_out.write(u"\t%s\n" % ss)
    f_out.close()
    f_rules_fix.close()
    print "\n\n\tDone"
Example #6
    session.log.folder = Path(
        session.log.folder).expanduser().resolve().as_posix()
    session.checkpoint.folder = session.checkpoint.folder \
        .replace('{name}', experiment.name) \
        .replace('{tags}', '_'.join(experiment.tags)) \
        .replace('{rand}', rand_id)
    session.checkpoint.folder = Path(
        session.checkpoint.folder).expanduser().resolve().as_posix()
if 'state_dict' in experiment.model:
    experiment.model.state_dict = Path(
        experiment.model.state_dict).expanduser().resolve().as_posix()
if 'state_dict' in experiment.optimizer:
    experiment.optimizer.state_dict = Path(
        experiment.optimizer.state_dict).expanduser().resolve().as_posix()

sort_dict(
    experiment,
    ['name', 'tags', 'epoch', 'samples', 'model', 'optimizer', 'sessions'])
sort_dict(session, [
    'epochs', 'batch_size', 'losses', 'seed', 'cpus', 'device', 'samples',
    'status', 'datetime_started', 'datetime_completed', 'data', 'log',
    'checkpoint', 'git', 'gpus'
])
experiment.sessions.append(session)
pyaml.pprint(experiment, sort_dicts=False, width=200)
del session
# endregion

# region Building phase
# Seeds (set them after the random run id is generated)
set_seeds(experiment.session.seed)
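
Note that sort_dict in this example has a different signature from the counter-sorting helper seen earlier: it takes an explicit key order and rearranges the mapping in place before the YAML pretty-print. A sketch of such a variant, assuming a dict-like mapping that preserves insertion order:

def sort_dict(mapping, key_order):
    # Hypothetical variant: move the known keys to the front in the
    # given order, keeping any remaining keys in their original order.
    keys = [k for k in key_order if k in mapping]
    keys += [k for k in mapping if k not in key_order]
    items = [(k, mapping[k]) for k in keys]
    mapping.clear()
    mapping.update(items)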
Example #7
def fix_wrong_words_heuristic(data="", path=""):
    # load_vn_vocab() returns a 4-tuple (see Example #1); only the
    # accented vocabulary is used below.
    vn_vocab, vn_bare_vocab, vn_long_vocab, vn_long_bare_vocab \
        = load_vn_vocab()
    fixing_map = load_hard_fixing()
    hard_regex = HardRegex()
    hard_regex.load_from_file()
    wrong_words_counters = dict()
    if data != "":
        questions = data
    elif path != "":
        from load_products import load_question_from_file
        questions = load_question_from_file(path)
    else:
        from load_products import load_questions
        questions = load_questions()
    f_fix = open("%s/models/data/out/fixing" % cdir, "w", encoding="utf-8")
    bi_forward = dict()
    bi_backward = dict()
    question_norm1 = []
    for qs in questions:
        qs = unicode(qs)
        qs = qs.lower()
        qs = hard_regex.replace(qs)
        tokens = split_sentece(qs)
        qq = []
        ii = -1
        for token in tokens:
            ii += 1
            token = norm_token(token)
            if token in fixing_map:
                qq.append(fixing_map[token])
                continue
            if is_skip_token(token):
                continue
            if token not in vn_vocab:
                if ii > 0:
                    mm = bi_backward.setdefault(token, dict())
                    mm[tokens[ii - 1]] = mm.get(tokens[ii - 1], 0) + 1
                if ii < len(tokens) - 1:
                    mm = bi_forward.setdefault(token, dict())
                    mm[tokens[ii + 1]] = mm.get(tokens[ii + 1], 0) + 1
                wrong_words_counters[token] = \
                    wrong_words_counters.get(token, 0) + 1
            qq.append(token)


        ss = " ".join(qq)

        question_norm1.append(qq)
        f_fix.write(u"%s\n"%ss)

    f_fix.close()
    kvs = []

    for key, value in sorted(wrong_words_counters.iteritems(), key=lambda (k, v): (v, k)):
        kvs.append([key, value])

    TOP = 400
    f = open("%s/models/data/out/popular_wrong_words.dat" % cdir, "w",
             encoding="utf-8")
    for i in xrange(1, min(TOP, len(kvs) + 1)):
        f.write(u"%s\n" % kvs[-i][0])
        #print kvs[-i][0], kvs[-i][1]
    f.close()
    #TOP = 300
    candidates_f = dict()
    candidates_b = dict()


    revert_f = dict()
    revert_b = dict()
    T_TOP = 2
    T_MIN = 8

    f_forward_exist = dict()
    f_backward_exist = dict()
    for i in xrange(1, min(TOP, len(kvs) + 1)):
        k = kvs[-i][0]

        forward_exist = k in bi_forward
        backward_exist = k in bi_backward
        f_forward_exist[k] = forward_exist
        f_backward_exist[k] = backward_exist

        if forward_exist:
            f_forward = utils.sort_dict(bi_forward[k])
            for j in xrange(min(T_TOP, len(f_forward))):
                if f_forward[j][1] > T_MIN:
                    revert_f.setdefault(f_forward[j][0], set()).add(k)
        if backward_exist:
            f_backward = utils.sort_dict(bi_backward[k])
            for j in xrange(min(T_TOP, len(f_backward))):
                if f_backward[j][1] > T_MIN:
                    revert_b.setdefault(f_backward[j][0], set()).add(k)

    #print revert_b
    #print revert_f

    b_stores = dict()
    f_stores = dict()


    for q in question_norm1:
        for i, token in enumerate(q):
            if i < len(q) - 1:
                w_next = q[i + 1]
                if w_next in vn_vocab and token in revert_b:
                    # Saving backward word context
                    bb = b_stores.setdefault(w_next, dict())
                    bb[token] = bb.get(token, 0) + 1
                    # Adding to the backward candidates
                    for w in revert_b[token]:
                        d_cand = candidates_b.setdefault(w, dict())
                        d_cand[w_next] = d_cand.get(w_next, 0) + 1
            if i > 0:
                w_before = q[i - 1]
                if w_before in vn_vocab and token in revert_f:
                    # Saving forward word context
                    ff = f_stores.setdefault(w_before, dict())
                    ff[token] = ff.get(token, 0) + 1
                    # Adding to the forward candidates
                    for w in revert_f[token]:
                        d_cand = candidates_f.setdefault(w, dict())
                        d_cand[w_before] = d_cand.get(w_before, 0) + 1

    f = open("%s/models/data/out/fix_candidates" % cdir, "w",
             encoding="utf-8")
    one_fix = dict()
    f_one_fix = open("%s/models/data/out/one_fix.dat" % cdir, "w",
                     encoding="utf-8")
    f_multi_fix = open("%s/models/data/out/multi_fix.dat" % cdir, "w",
                       encoding="utf-8")
    N_MULTI = 2
    N_CONTEXT = 3
    THRES_2 = 0.7

    for k,v in b_stores.iteritems():
        v = utils.sort_dict(v)
        b_stores[k] = v
    for k,v in f_stores.iteritems():
        v = utils.sort_dict(v)
        f_stores[k] = v

    for k,v in candidates_b.iteritems():
        if f_backward_exist[k]:
            #print "Error_b: ",k

            ss = utils.sort_dict(v)
            #print "\t",ss

            ll = []
            l_candidates = []
            l_ref_scores = []
            for s in ss:
                ll.append(u"%s:%s " % (s[0], s[1]))
                l_candidates.append(s[0])
                l_ref_scores.append(s[1])

            ll = " ".join(ll)
            f.write(u"%s:\n" % k)
            f.write(u"\t%s\n" % ll)

            true_candidates, sorted_score = get_candidate(
                k, l_candidates, l_ref_scores)

            ll2 = []
            for i in xrange(len(true_candidates)):
                ll2.append(u"%s:%s " % (true_candidates[i], sorted_score[i]))
            f.write(u"\t%s\n" % " ".join(ll2))

            # Write one fix:
            if len(sorted_score) > 1 \
                    and sorted_score[1] < 1 and sorted_score[0] > 1:
                one_fix[k] = true_candidates[0]
            elif len(sorted_score) > 1 and sorted_score[1] > THRES_2:
                for i in reversed(xrange(2)):
                    fix = true_candidates[i]
                    if fix not in b_stores:
                        continue
                    back_context = b_stores[fix]
                    ll_context = []
                    for j in xrange(min(N_CONTEXT, len(back_context))):
                        ll_context.append(back_context[j][0])
                    f_multi_fix.write("B\t%s\t%s\t%s\n" %
                                      (k, fix, " ".join(ll_context)))

    f.write(u"\n\n\n")
    for k,v in candidates_f.iteritems():
        if f_forward_exist[k]:
            #print "Error_f: ",k
            ss = utils.sort_dict(v)
            #print "\t",ss

            ll = []
            l_candidates = []
            l_ref_scores = []
            for s in ss:
                ll.append(u"%s:%s " % (s[0], s[1]))
                l_candidates.append(s[0])
                l_ref_scores.append(s[1])
            ll = " ".join(ll)
            f.write(u"%s:\n" % k)
            f.write(u"\t%s\n" % ll)

            true_candidates, sorted_score = get_candidate(
                k, l_candidates, l_ref_scores)
            ll2 = []
            for i in xrange(len(true_candidates)):
                ll2.append(u"%s:%s " % (true_candidates[i], sorted_score[i]))
            f.write(u"\t%s\n" % " ".join(ll2))
            # one fix:
            if len(sorted_score) > 1 \
                    and sorted_score[1] < 1 and sorted_score[0] > 1:
                one_fix[k] = true_candidates[0]
            elif len(sorted_score) > 1 and sorted_score[1] > THRES_2:
                for i in reversed(xrange(2)):
                    fix = true_candidates[i]
                    if fix not in f_stores:
                        continue
                    forward_context = f_stores[fix]
                    ll_context = []
                    for j in xrange(min(N_CONTEXT, len(forward_context))):
                        ll_context.append(forward_context[j][0])
                    f_multi_fix.write("F\t%s\t%s\t%s\n" %
                                      (k, fix, " ".join(ll_context)))

    f.close()
    for k,v in one_fix.iteritems():
        f_one_fix.write("%s\t%s\n"%(k,v))
    f_one_fix.close()
    f_multi_fix.close()
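
get_candidate, called for both the backward and forward candidate tables above, is not shown on this page either. Its call sites pass the wrong word plus candidate words with their context co-occurrence counts and expect (true_candidates, sorted_score) back, with scores above 1 treated as reliable; a plausible reconstruction, mirroring the bigram scorer sketched under Example #5:

import difflib
import math

def get_candidate(wrong_word, candidates, ref_scores):
    # Hypothetical sketch: rank candidate replacements by string
    # similarity to the wrong word, boosted by context support.
    scored = {}
    for cand, ref in zip(candidates, ref_scores):
        sim = difflib.SequenceMatcher(None, wrong_word, cand).ratio()
        scored[cand] = sim * math.log(ref + 1, 10)
    ranked = sorted(scored.items(), key=lambda kv: kv[1], reverse=True)
    return [kv[0] for kv in ranked], [kv[1] for kv in ranked]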
Example #8
    def fix(self, sen2):
        qs = sen2
        if qs is None:
            return
        try:
            qs = unicode(qs,encoding="utf-8")
        except:
            pass
        qs = qs.lower()
        qs = self.unigram_fixing.fix(qs)

        qs = vnbarenorm.norm_fix_common(qs,self.one_fix_map)
        _tokens = vnbarenorm.split_sentece(qs)

        back_ref = " ".join(_tokens)

        tokens = []
        for token in _tokens:
            token = utils.accent2bare(token)
            tokens.append(token)

        bare_raw_sen = " ".join(tokens)
        for reg,repl in self.one_fix_map.iteritems():
            bare_raw_sen = reg.sub(repl,bare_raw_sen)

        for i in xrange(len(tokens)-1):
            bigram = u"%s %s"%(tokens[i],tokens[i+1])
            #print "\t%s"%bigram
            if vnbarenorm.is_wrong_bare_bigram_candidates(bigram,self.vn_bare_vocab,self.vn_special_words):

                #print bigram
                d_candidates = {}

                c = bigram[0]
                #self.vn_true_bare_bigram_f_ddict

                #for c in set(bigram):
                if c == " ":
                    continue
                sub_dict = self.vn_true_bare_bigram_hie_ddict.get(c)
                if sub_dict is None:
                    continue

                for candidate, counter in sub_dict.iteritems():
                    if candidate in d_candidates:
                        continue
                    sim_score = vnbarenorm.cal_sim_score(
                        bigram, candidate, counter)
                    if sim_score < 0.7:
                        continue
                    d_candidates[candidate] = sim_score

                if len(d_candidates) == 0:
                    continue
                sorted_score = utils.sort_dict(d_candidates)

                #print sorted_score

                if sorted_score[0][1] > 1:  # and sorted_score[1][1] < 1:
                    repl = sorted_score[0][0]
                    reg = re.compile(r"\b%s\b" % bigram, re.UNICODE)
                    bare_raw_sen = reg.sub(repl, bare_raw_sen)
        return bare_raw_sen,back_ref
Example #9
def filter_combo_se(fname=config.POLY_ADR_PATH):
    combo2stitch = {}
    combo2se = defaultdict(set)
    se2name = {}
    seCounter = dict()
    drugs = set()
    fin = open(fname)
    print 'Reading: %s' % fname
    fin.readline()  # skip the CSV header line
    for line in fin:
        stitch_id1, stitch_id2, se, se_name = line.strip().split(',')
        drugs.add(stitch_id1)
        drugs.add(stitch_id2)
        combo = stitch_id1 + '_' + stitch_id2
        combo2stitch[combo] = [stitch_id1, stitch_id2]
        combo2se[combo].add(se)
        utils.add_dict_counter(seCounter, se)
        se2name[se] = se_name
    fin.close()
    n_interactions = sum([len(v) for v in combo2se.values()])

    print 'Before'
    print 'Drug combinations: %d Side effects: %d' % (len(combo2stitch),
                                                      len(se2name))
    print 'Drug-drug interactions: %d' % (n_interactions)
    print 'Num drug: %d' % (len(drugs))

    seCounterSorted = utils.sort_dict(seCounter)
    validSe = set()
    se2Id = dict()
    for i in range(config.NUM_SE):
        se = seCounterSorted[i][0]
        validSe.add(seCounterSorted[i][0])
        utils.get_update_dict_index(se2Id, se)
    print validSe
    drug2Id = dict()
    se2Combos = dict()
    nInteractions = 0
    combosCounter = dict()
    for combo, ses in combo2se.iteritems():
        t1, t2 = combo2stitch[combo]
        id1 = utils.get_update_dict_index(drug2Id, t1)
        id2 = utils.get_update_dict_index(drug2Id, t2)
        for se in ses:
            seId = utils.get_dict(se2Id, se, -1)
            if seId != -1:
                combos = utils.get_insert_key_dict(se2Combos, seId, [])
                combos.append([id1, id2])
                nInteractions += 1
                utils.add_dict_counter(combosCounter, "%s_%s" % (id1, id2))

    nDrug = len(drug2Id)

    print 'After'
    print 'Drug combinations: %d Side effects: %d' % (len(combosCounter),
                                                      config.NUM_SE)
    print 'Drug-drug interactions: %d' % nInteractions
    print 'Num drug: %d' % (nDrug)

    print 'Save to file...'

    drug_drug_adj_list = []
    for se, combos in se2Combos.iteritems():
        drugdrugMatrix = np.zeros((nDrug, nDrug))
        for d1, d2 in combos:
            drugdrugMatrix[d1, d2] = drugdrugMatrix[d2, d1] = 1
        drug_drug_adj_list.append(sp.csr_matrix(drugdrugMatrix))

    utils.save_obj(drug_drug_adj_list, config.PROCESSED_COMBO_ADR)
    utils.save_obj(drug2Id, config.DRUG_MAP)
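
Example #9 also leans on a few small utils helpers whose behaviour can be read off the call sites above; minimal sketches consistent with that usage:

def get_update_dict_index(d, key):
    # Return the index assigned to key, assigning len(d) on first use.
    if key not in d:
        d[key] = len(d)
    return d[key]

def get_dict(d, key, default):
    # Plain lookup with an explicit fallback value.
    return d.get(key, default)

def get_insert_key_dict(d, key, default):
    # Insert default on first sight and return the stored value.
    return d.setdefault(key, default)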