def prepare_PMI(self, texts, tokens_valid, thr):
    n_tokens = len(tokens_valid)
    n_samples = len(texts)

    # count marginal occurrences and pairwise co-occurrences over the texts
    p_margin = np.zeros(n_tokens)
    p = np.zeros((n_tokens, n_tokens))

    pbar = progbar.start(n_samples)
    l = 0
    for tokens in texts:
        tids = [tokens_valid[token] for token in set(tokens) if tokens_valid.has_key(token)]
        for tid in tids:
            p_margin[tid] += 1
        for i in range(len(tids) - 1):
            t1 = tids[i]
            for j in range(i + 1, len(tids)):
                t2 = tids[j]
                if t1 < t2:
                    p[t1][t2] += 1
                else:
                    p[t2][t1] += 1
        l += 1
        pbar.update(l)
    pbar.finish()

    # keep token pairs whose co-occurrence score exceeds thr
    pmi_list = []
    n = (n_tokens - 1) * n_tokens / 2
    pbar = progbar.start(n)
    l = 0
    vs = []
    for i in range(n_tokens - 1):
        if p_margin[i] == 0.:
            print i
        for j in range(i + 1, n_tokens):
            v = p[i][j] / (p_margin[i] * p_margin[j])
            vs.append(v)
            if v > thr:
                pmi_list.append(((i, j), v))
            l += 1
            pbar.update(l)
    pbar.finish()

    print 'sim_value_range: [%f, %f]'%(np.min(vs), np.max(vs))
    return pmi_list
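# Illustration only (not part of the pipeline above): prepare_PMI scores a pair
# as count(t1, t2) / (count(t1) * count(t2)), i.e. PMI without the log and the
# 1/n_samples normalization. A textbook PMI over the same counts would look
# roughly like the sketch below; the argument names are hypothetical.
def pmi_score(pair_count, margin_count_1, margin_count_2, n_samples):
    # PMI(t1, t2) = log( p(t1, t2) / (p(t1) * p(t2)) )
    p_joint = float(pair_count) / n_samples
    p1 = float(margin_count_1) / n_samples
    p2 = float(margin_count_2) / n_samples
    return np.log(p_joint / (p1 * p2))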
def export_vote(thr_rate, ofname):
    n_batch = 90
    yhists = []
    pbar = progbar.start(n_batch)
    for batch_id in range(n_batch):
        fname = 'data/simrecord_90_%d.pkl'%(batch_id)
        records = cPickle.load(open(fname, 'r'))
        for y, x_len, record in records:
            thr = x_len * thr_rate
            ys = [yi for yi, d in record if d <= thr]
            yhist = {}
            for yi in ys:
                if yhist.has_key(yi):
                    yhist[yi] += 1
                else:
                    yhist[yi] = 1.
            yhist = sorted(yhist.items(), key = lambda k: -k[1])
            yhists.append(yhist)
        pbar.update(batch_id + 1)
    pbar.finish()

    cPickle.dump(yhists, open(ofname, 'w'))
def main():
    con = db.connect()
    cur = con.cursor()

    emo_mids = {}
    limit = 25000000

    pbar = progbar.start(limit)
    i = 0
    cur.execute("SELECT mid, text FROM microblogs WHERE comments_count > 0 AND comments_count < 100 LIMIT %d"%(limit))
    for mid, text in cur:
        text, emos = blogger.extract(text)
        if len(emos) > 0 and len(emos) < 6:
            samples = blogger.prepare_sample(text, emos)
            for e, t in samples:
                if emo_mids.has_key(e):
                    emo_mids[e].append(mid)
                else:
                    emo_mids[e] = [mid, ]
        i += 1
        pbar.update(i)
    pbar.finish()

    cPickle.dump(emo_mids, open('../output/emo_mids.pkl', 'w'))
def sample():
    blogs = commdatica.load('output/umtc.txt')
    has_emo = []
    no_emo = []
    target = 1000
    i = 0
    pbar = progbar.start(target)
    for blog in blogs:
        if blogger.is_valid(blog.text):
            if not len(has_emo) >= 500:
                has_emo.append(blog)
                i += 1
        elif blogger.is_valid(blog.text, check_emo = False):
            if not len(no_emo) >= 500:
                no_emo.append(blog)
                i += 1
        pbar.update(i)
    pbar.finish()

    print 'writing to umtc_yes_emo.txt ....',
    open('output/umtc_yes_emo.txt', 'w').write('\n'.join([repr(blog) for blog in has_emo]))
    print 'OK'

    print 'writing to umtc_no_emo.txt ....',
    open('output/umtc_no_emo.txt', 'w').write('\n'.join([repr(blog) for blog in no_emo]))
    print 'OK'

    bs = commdatica.load('output/umtc_yes_emo.txt')
    print len(bs)
def prepare():
    '''
    prepare data for LSTM
    data are structured as [[codes, codes, ...], ...], which are ordered by eid
    '''
    import blogger
    from utils import progbar

    coder = cPickle.load(open(PKL_TFCODER, 'r'))
    datalist = []
    pbar = progbar.start(N_EMO)
    for eid in range(N_EMO):
        lines = open(DIR_TEXT + '%d.txt'%(eid), 'r').read().split('\n')
        data = []
        for line in lines:
            text, emo = blogger.extract(line)
            codes = coder.code(text)
            data.append(codes)
        datalist.append(data)
        pbar.update(eid + 1)
    pbar.finish()

    cPickle.dump(datalist, open(PKL_TFDATA, 'w'))
def main():
    con = db.connect()
    cur = con.cursor()

    emos = open('../data/emo_top100.txt', 'r').read().decode('utf8').split('\n')
    limit = int(sys.argv[1])

    pbar = progbar.start(limit)
    i = 0
    cur.execute("SELECT text FROM microblogs WHERE comments_count > 0 AND comments_count < 100 LIMIT %d"%(limit))
    fobjs = dict([(emo, open('../corpus_raw/%s.txt'%(emo), 'w')) for emo in emos])
    for res in cur:
        blog = res[0]
        text, emos = blogger.extract(blog)
        n_emo = len(emos)
        if n_emo > 0 and n_emo < 6:
            samples = blogger.prepare_sample(text, emos)
            for e, t in samples:
                if not e in fobjs:
                    continue
                fobjs[e].write(t + '\n')
        i += 1
        pbar.update(i)
    pbar.finish()

    for fobj in fobjs.values():
        fobj.close()
def main():
    optparser = OptionParser()
    optparser.add_option('-x', '--dname_x', action='store', type='str', dest='dname_x')
    optparser.add_option('-s', '--dname_xsup', action='store', type='str', dest='dname_xsup')
    optparser.add_option('-k', '--value_k', dest='value_k', type='float', action='store', default=1.)
    optparser.add_option('-y', '--ydim', action='store', type='int', dest='ydim', default=N_EMO)
    opts, args = optparser.parse_args()

    print >> sys.stderr, 'nbdatica: [info] loading data for training NaiveBayes ... ',
    train, valid, test = datica.load_data(opts.dname_x, opts.ydim, valid_rate = 0.)
    print >> sys.stderr, 'OK'

    print >> sys.stderr, 'nbdatica: [info] training NaiveBayes ... ',
    classifier = NaiveBayesClassifier()
    classifier.train(train[0], train[1], opts.value_k)
    print >> sys.stderr, 'OK'

    if not os.path.exists(opts.dname_xsup):
        os.mkdir(opts.dname_xsup)

    pbar = progbar.start(opts.ydim)
    for eid in range(opts.ydim):
        ifname = opts.dname_x + '%d.pkl'%(eid)
        seqs = cPickle.load(open(ifname, 'r'))

        ofname = opts.dname_xsup + '%d.pkl'%(eid)
        proba = [classifier.classify(seq) for seq in seqs]
        cPickle.dump(proba, open(ofname, 'w'))

        pbar.update(eid + 1)
    pbar.finish()
def main():
    dir_root = '../'
    dir_corpus = dir_root + 'corpus/'
    dir_token = dir_root + 'token/'
    fname_emo = dir_corpus + 'emo.txt'

    emos = open(fname_emo, 'r').read().decode('utf8').split('\n')[:-1]

    if not os.path.isdir(dir_token):
        os.mkdir(dir_token)

    for emo in emos:
        fname_output = dir_token + '%s.txt'%(emo)
        fname_input = dir_corpus + '%s.txt'%(emo)

        fobj_out = open(fname_output, 'w')
        with open(fname_input, 'r') as fobj_in:
            print >> sys.stderr, 'main: [info] processing %s'%(fname_input)
            pbar = progbar.start(wc_l(fname_input))
            i = 0
            for line in fobj_in:
                line = line[:-1].decode('utf8')
                toks = zhtokenizer.tokenize(line)
                fobj_out.write(' '.join(toks) + '\n')
                i += 1
                pbar.update(i)
            pbar.finish()
        fobj_out.close()
def main():
    key_wemb = sys.argv[1]
    ydim = int(sys.argv[2])
    size = int(sys.argv[3])
    key_dataset = key_wemb + '%dy%d'%(ydim, size)

    dir_root = '../'
    dir_tokid = dir_root + 'tokid/%s/'%(key_wemb)
    dir_dataset = dir_root + 'dataset/'
    fname_dataset = dir_dataset + '%s.pkl'%(key_dataset)
    fname_emo = dir_tokid + 'emo.txt'

    emos = open(fname_emo, 'r').read().decode('utf8').split('\n')[:-1]

    dataset = []
    pbar = progbar.start(ydim)
    i = 0
    for emo in emos[:ydim]:
        fname_input = dir_tokid + '%s.txt'%(emo)
        ids = range(wc_l(fname_input))
        np.random.shuffle(ids)
        dataset.append((emo, ids[:size]))
        i += 1
        pbar.update(i)
    pbar.finish()

    cPickle.dump(dataset, open(fname_dataset, 'w'))
def classify(train, test, ubm, gamma = 1., r = 16., w = 1., m = 1., v = 1., n_components = 8):
    '''
    MAP-adapt the UBM to the training vectors of each label, then score the
    test vectors with every adapted GMM (one column of scores per label)
    '''
    x, y = train
    xdim = x.shape[1]
    ydim = np.unique(y).shape[0]
    M = n_components

    gs = []
    print >> sys.stderr, 'classify: [info] building gmm for each label ...'
    pbar = progbar.start(ydim)
    for i in range(ydim):
        xi = x[y == i]                 # matrix[T x xdim]
        T = xi.shape[0]

        weights = ubm.weights_         # matrix[M, ]
        probs = ubm.predict_proba(xi)  # matrix[T x M]

        Pr_t_i = probs * weights
        Pr_t_i = Pr_t_i / np.asmatrix(Pr_t_i.sum(axis = 1)).T  # matrix[T x M]

        n_i = np.asarray(Pr_t_i.sum(axis = 0)).flatten()  # matrix[M, ]
        Ex_i = np.asarray([(np.asarray(Pr_t_i[:, k]) * xi).sum(axis = 0) / n_i[k]
                           if not n_i[k] == 0. else np.zeros(xdim)
                           for k in range(M)])   # matrix[M x xdim]
        Ex2_i = np.asarray([(np.asarray(Pr_t_i[:, k]) * (xi ** 2)).sum(axis = 0) / n_i[k]
                            if not n_i[k] == 0. else np.zeros(xdim)
                            for k in range(M)])  # matrix[M x xdim]

        alpha = lambda p: n_i / (n_i + r ** p)
        alpha_w = alpha(w)
        alpha_m = alpha(m)
        alpha_v = alpha(v)

        # weights: matrix[M, ]
        new_weights = (alpha_w * n_i / T + (1. - alpha_w) * ubm.weights_) * gamma

        # means: matrix[M x xdim]
        alpha_m = np.asarray(np.asmatrix(alpha_m).T)
        new_means = alpha_m * Ex_i + (1. - alpha_m) * ubm.means_

        # covars: matrix[M x xdim]
        alpha_v = np.asarray(np.asmatrix(alpha_v).T)
        new_covars = alpha_v * Ex2_i + (1. - alpha_v) * (ubm.covars_ + ubm.means_ ** 2) - new_means ** 2

        g = GMM(n_components = M)
        g.means_ = new_means
        g.weights_ = new_weights
        g.covars_ = new_covars
        gs.append(g)

        pbar.update(i + 1)
    pbar.finish()

    x, y = test
    scores = [g.score(x) for g in gs]
    proba = np.column_stack(scores)  # not probability really
    return proba
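# Usage sketch (assumptions: GMM is the old sklearn.mixture.GMM, which exposes
# the weights_/means_/covars_/predict_proba API used above; the random data
# below only stands in for real feature vectors).
def _demo_classify_with_ubm():
    from sklearn.mixture import GMM  # deprecated API, consistent with covars_ above

    xdim, ydim = 20, 3
    train_x = np.random.randn(300, xdim)
    train_y = np.random.randint(0, ydim, 300)
    test_x = np.random.randn(50, xdim)
    test_y = np.random.randint(0, ydim, 50)

    # fit the universal background model on all training vectors;
    # classify() then MAP-adapts one GMM per label and scores the test set
    ubm = GMM(n_components = 8, covariance_type = 'diag')
    ubm.fit(train_x)

    proba = classify((train_x, train_y), (test_x, test_y), ubm)
    y_pred = proba.argmax(axis = 1)
    print 'demo accuracy: %.2f'%(np.mean(y_pred == test_y))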
def __iter__(self):
    pbar = progbar.start(self.n_emo)
    for i in range(self.n_emo):
        seqs = cPickle.load(open('data/dataset/unigram/%d.pkl'%(i), 'r'))
        for seq in seqs:
            yield seq
        pbar.update(i + 1)
    pbar.finish()
def extract(ifname, ofname):
    n_lines = int(commands.getoutput('grep -cF "" %s'%(ifname)))
    if n_lines == 0:
        return

    pbar = progbar.start(n_lines)
    l = 0

    ofobj = open(ofname, 'w')
    with open(ifname, 'r') as ifobj:
        for line in ifobj:
            blog = json.loads(line)
            n_text = len(blog['text'])
            for i in range(1, n_text):
                res = blogger.extract(blog['text'][i])
                if res is not None:
                    datum = {}
                    datum['text'] = res[0]
                    datum['emo'] = res[1]

                    # above: follow the reply chain back towards the root
                    above = []
                    re_id = i
                    while blog['re'][re_id] is not None:
                        re_id = blog['re'][re_id]
                        above.append(blog['text'][re_id])
                    datum['above'] = above

                    # follow: chain of direct replies continuing from this message
                    follow = []
                    last_id = i
                    for j in range(i + 1, n_text):
                        if blog['re'][j] == last_id:
                            follow.append(blog['text'][j])
                            last_id = j
                    datum['follow'] = follow

                    ofobj.write(json.dumps(datum) + '\n')
            l += 1
            pbar.update(l)
    pbar.finish()
    ofobj.close()
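# Input format assumed by extract() above (inferred from the field accesses and
# shown only as an illustration): each line of ifname is a JSON object such as
#
#   {"text": ["root post", "comment 1", "comment 2", ...],
#    "re":   [null, 0, 1, ...]}
#
# where re[i] gives the index of the message that text[i] replies to
# (null for the root post), so 'above' is the reply chain back to the root
# and 'follow' is the chain of direct replies that continues from text[i].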
def main():
    import db
    import blogger
    from utils import progbar

    n_label = len(emos_list)
    emo_map = {}
    for label, emos in enumerate(emos_list):
        for emo in emos:
            emo_map[emo] = label

    odname = 'data/dataset_emo/'
    if not os.path.isdir(odname):
        os.mkdir(odname)
    odname += 'raw/'
    if not os.path.isdir(odname):
        os.mkdir(odname)

    fobjs = [open(odname + '%d.txt'%(i), 'w') for i in range(n_label)]
    counts = [0 for i in range(n_label)]
    N = 70000
    all_N = N * n_label

    con = db.connect()
    cur = con.cursor()

    pbar = progbar.start(all_N)
    l = 0
    cur.execute('SELECT text FROM microblogs')
    for t in cur:
        res = blogger.extract(t[0])
        if res is None or not emo_map.has_key(res[1]):
            continue
        label = emo_map[res[1]]
        if counts[label] < N:
            counts[label] += 1
            fobjs[label].write(res[0] + '\n')
            l += 1
            pbar.update(l)
            if counts[label] == N and sum(counts) == all_N:
                break
    pbar.finish()

    cur.close()
    con.close()
    for fobj in fobjs:
        fobj.close()
def extract(dname_dataset, idx):
    idname = 'data/blogs/mtr/'
    dir_dataset = 'data/blogs/%s/'%(dname_dataset)
    odname = dir_dataset + 'tea/'
    init_folders([dir_dataset, odname])

    ifname = idname + '%d.txt'%(idx)
    ofname = odname + '%d.txt'%(idx)

    n_lines = int(commands.getoutput('grep -cF "" %s'%(ifname)))
    if n_lines == 0:
        print >> sys.stderr, '%s is empty'%(ifname)
        return

    pbar = progbar.start(n_lines)
    l = 0

    ofobj = open(ofname, 'w')
    with open(ifname, 'r') as ifobj:
        for line in ifobj:
            blog = json.loads(line)
            n_text = len(blog['text'])
            for i in range(1, n_text):
                res = blogger.extract(blog['text'][i])
                if res is not None:
                    datum = {}
                    datum['text'] = res[0]
                    datum['emo'] = res[1]

                    # above_s: structural context, the reply chain back to the root
                    above_s = []
                    re_id = i
                    while blog['re'][re_id] is not None:
                        re_id = blog['re'][re_id]
                        above_s.append(blog['text'][re_id])
                    datum['above_s'] = above_s

                    # above_t: temporal context, all earlier messages in reverse order
                    above_t = []
                    re_id = i - 1
                    while re_id >= 0:
                        above_t.append(blog['text'][re_id])
                        re_id -= 1
                    datum['above_t'] = above_t

                    ofobj.write(json.dumps(datum) + '\n')
            l += 1
            pbar.update(l)
    pbar.finish()
    ofobj.close()
def embed(xy):
    seqs, y = xy
    x_vec = []
    pbar = progbar.start(len(seqs))
    for i, seq in enumerate(seqs):
        x_vec.append(np.mean(embedder.embed(seq), axis = 0))
        pbar.update(i + 1)
    pbar.finish()
    return (x_vec, y)
def build(train_x_y, ydim):
    probwe = {}
    x, y = train_x_y
    n_y = np.zeros(ydim)
    n_samples = len(x)

    print >> sys.stderr, 'scanning train dataset'
    pbar = progbar.start(n_samples)
    loop = 0
    for seq, yi in zip(x, y):
        n_y[yi] += 1
        for token in set(seq):
            if not probwe.has_key(token):
                probwe[token] = np.zeros(ydim)
            probwe[token][yi] += 1
        loop += 1
        pbar.update(loop)
    pbar.finish()

    print >> sys.stderr, 'normalization'
    pbar = progbar.start(len(probwe))
    loop = 0
    for k in probwe.keys():
        probwe[k] /= n_y
        loop += 1
        pbar.update(loop)
    pbar.finish()

    return probwe
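# Usage sketch (assumption about intent: probwe[token] approximates
# p(token present | class) for each of the ydim classes, so a token sequence
# can be embedded as the mean of its tokens' class-probability vectors).
def embed_probwe(seq, probwe, ydim):
    vecs = [probwe[t] for t in seq if t in probwe]
    if len(vecs) == 0:
        return np.zeros(ydim)
    return np.mean(vecs, axis = 0)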
def main():
    blogs = commdatica.load('output/umtc.txt')
    print '%d in total'%(len(blogs))

    pbar = progbar.start(len(blogs))
    c = 0
    for i, blog in enumerate(blogs):
        if blogger.is_valid(blog.text, check_emo = False):
            c += 1
        pbar.update(i + 1)
    pbar.finish()

    print '%.2f%%'%(100. * c / len(blogs))
def prepare(eids = range(N_EMO)):
    if not os.path.exists(DIR_TOKEN):
        os.mkdir(DIR_TOKEN)

    pbar = progbar.start(len(eids) * 4000)
    c = 0
    for eid in eids:
        seqs = []
        lines = open(DIR_TEXT + '%d.txt'%(eid), 'r').read().split('\n')
        for line in lines:
            seqs.append(zhpr.tokenize(line))
            c += 1
            pbar.update(c)
        cPickle.dump(seqs, open(DIR_TOKEN + '%d.pkl'%(eid), 'w'))
    pbar.finish()
def reform(idir, odir):
    if not os.path.isdir(odir):
        os.mkdir(odir)

    n_emo = 90
    pbar = progbar.start(n_emo)
    for i in range(n_emo):
        seqs = cPickle.load(open(idir + '%d.pkl'%(i), 'r'))
        fobj = open(odir + '%d.txt'%(i), 'w')
        content = u'\n'.join([u' '.join(seq) for seq in seqs])
        fobj.write(content)
        fobj.close()
        pbar.update(i + 1)
    pbar.finish()
def test():
    import cPickle
    import zhprocessor
    from utils import logger, progbar

    FNAME_TEXT = 'output/0.txt'
    PKL_TOKENS = 'data/tokens.pkl'

    if not os.path.exists(PKL_TOKENS):
        logger.debug('loading lines...')
        lines = open(FNAME_TEXT, 'r').read().split('\n')

        logger.debug('preparing tokens..')
        tokens = []
        pbar = progbar.start(len(lines))
        for i, line in enumerate(lines):
            res = blogger.extract(line)
            if res == None:
                continue
            text, emo = res
            tokens.extend(zhprocessor.tokenize(text))
            pbar.update(i + 1)
        pbar.finish()
        cPickle.dump(tokens, open(PKL_TOKENS, 'w'))
    else:
        tokens = cPickle.load(open('data/tokens.pkl', 'r'))

    cclassifier = ConceptClassifier()

    logger.debug('analysing tokens..')
    d = 10
    vecs = cclassifier.analyse(tokens, d)
    print len(vecs)

    vecs = vecs.items()
    repr_vecs = [vecs[0] for i in range(d)]
    for vec in vecs[1:]:
        for i in range(d):
            if abs(repr_vecs[i][1][i]) < abs(vec[1][i]):
                repr_vecs[i] = vec

    for k, v in repr_vecs:
        print k, v
def load(fname_dataset, dir_tokid, valid = 0.2, test = 0.1):
    emo_ids = cPickle.load(open(fname_dataset, 'r'))
    dataset = [[[], []] for i in range(3)]

    def add_samples(idxs, lines, y, split_id):
        for idx in idxs:
            tids = map(int, lines[idx][:-1].split(' '))
            dataset[split_id][0].append(tids)
            dataset[split_id][1].append(y)

    print >> sys.stderr, 'load: [info] loading data...'
    pbar = progbar.start(len(emo_ids))
    for i, item in enumerate(emo_ids):
        emo, ids = item
        n_total = len(ids)
        n_valid = int(valid * n_total)
        n_test = int(test * n_total)
        n_train = n_total - n_valid - n_test

        fname_tokid = dir_tokid + '%s.txt'%(emo)
        lines = open(fname_tokid, 'r').readlines()

        add_samples(ids[:n_train], lines, i, 0)
        add_samples(ids[n_train:(-n_test)], lines, i, 1)
        add_samples(ids[-n_test:], lines, i, 2)

        pbar.update(i + 1)
    pbar.finish()

    def shuffle(subset):
        x, y = subset
        ids = range(len(x))
        np.random.shuffle(ids)
        x = [x[i] for i in ids]
        y = [y[i] for i in ids]
        return (x, y)

    dataset = [shuffle(tuple(subset)) for subset in dataset]
    dataset = tuple(dataset)
    return dataset
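# Usage sketch (the paths are hypothetical examples): load() expects the
# (emo, ids) list written by the dataset-sampling script above, plus one
# whitespace-separated token-id file per emoticon under dir_tokid.
#
#   train, valid, test = load('../dataset/example.pkl', '../tokid/example/')
#   train_x, train_y = train   # lists of token-id sequences and integer labels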
def load_blogs():
    def load_blog_lines():
        lines = []
        for i in range(3):
            fname = 'data/blogs/blogs_subset_%d.txt'%(i)
            lines.extend(open(fname, 'r').readlines())
        return lines

    lines = load_blog_lines()
    blogs = []
    pbar = progbar.start(len(lines))
    for i, line in enumerate(lines):
        blogs.append(json.loads(line))
        pbar.update(i + 1)
    pbar.finish()
    return blogs
def load(fname_blogs = FNAME_BLOGS_FILTERED):
    '''
    load data/blogs_subset.txt as a list of BlogInfo
    '''
    lines = open(fname_blogs, 'r').readlines()
    blogs = []
    pbar = progbar.start(len(lines))
    for i, l in enumerate(lines):
        blogs.append(BlogInfo.loads(l))
        pbar.update(i + 1)
    pbar.finish()
    return blogs
def main():
    import db
    import datica
    import blogger
    from utils import progbar

    con = db.connect()
    cur = con.cursor()

    maxN = 70000
    odname = 'data/dataset_emo/'
    if not os.path.isdir(odname):
        os.mkdir(odname)

    config = datica.load_config('data/config4.txt')
    for label, eids in enumerate(config):
        for eid in eids:
            print >> sys.stderr, 'loading LABEL %d - EID %d'%(label, eid)

            ifname = 'data/eid_mids/%d.txt'%(eid)
            ofname = odname + '%d.txt'%(eid)
            ofobj = open(ofname, 'w')

            mids = open(ifname, 'r').read().split('\n')
            if len(mids) > maxN:
                mids = mids[:maxN]

            pbar = progbar.start(len(mids))
            l = 0
            for mid in mids:
                t = load_by_mid(cur, mid)
                res = blogger.extract(t)
                if res is not None:
                    text, emo = res
                    ofobj.write(text + '\n')
                l += 1
                pbar.update(l)
            pbar.finish()
            ofobj.close()

    cur.close()
    con.close()
def calculate(self, x, y, deduplicate = True):
    vecs = {}
    ydim = np.max(y) + 1
    y_count = np.zeros(ydim)
    n_samples = len(x)

    for xi, yi in zip(x, y):
        y_count[yi] += 1
        if deduplicate:
            xi = set(xi)
        for token in xi:
            if not vecs.has_key(token):
                vecs[token] = np.zeros(ydim)
            vecs[token][yi] += 1

    chis = {}
    pbar = progbar.start(len(vecs))
    l = 0
    for token, vec in vecs.items():
        chi_values = []
        vec_sum = np.sum(vec)
        for i in range(ydim):
            a = vec[i]                        # samples of class i containing the token
            b = vec_sum - a                   # samples of other classes containing the token
            c = y_count[i] - a                # samples of class i without the token
            d = (n_samples - y_count[i]) - b  # samples of other classes without the token
            chi_values.append((a * d - b * c) ** 2 / ((a + b) * (c + d)))
        chis[token] = chi_values
        l += 1
        pbar.update(l)
    pbar.finish()
    return chis
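# Note (illustration, not from the original code): the per-class score above is
# (a*d - b*c)^2 / ((a+b)*(c+d)), a simplification of the standard chi-square
# statistic for the 2x2 token/class contingency table, which also divides by
# (a+c)*(b+d) and multiplies by the total count N:
#
#   chi2(token, class) = N * (a*d - b*c)^2 / ((a+b)*(c+d)*(a+c)*(b+d))
#
# with a, b, c, d as annotated in the loop above.
def chi2_full(a, b, c, d):
    n = a + b + c + d
    return float(n) * (a * d - b * c) ** 2 / ((a + b) * (c + d) * (a + c) * (b + d))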
def prepare(eids = range(N_EMO)):
    '''
    tokenize and unigramize the text under data/dataset/text
    '''
    import blogger
    import zhtokenizer
    from utils import progbar, zhprocessor

    if not os.path.isdir(DIR_UNIGRAM):
        os.mkdir(DIR_UNIGRAM)
    if not os.path.isdir(DIR_TOKEN):
        os.mkdir(DIR_TOKEN)

    for eid in eids:
        lines = open(DIR_TEXT + '%d.txt'%(eid), 'r').read().split('\n')
        unigram_list = []
        token_list = []

        print 'preparing data for EID-%d'%(eid)
        pbar = progbar.start(len(lines))
        for i, line in enumerate(lines):
            text, emo = blogger.extract(line)
            text = zhprocessor.simplify(text)

            unigrams = zhtokenizer.unigramize(text)
            tokens = zhtokenizer.tokenize(text)
            unigram_list.append(unigrams)
            token_list.append(tokens)

            pbar.update(i + 1)
        pbar.finish()

        cPickle.dump(unigram_list, open(DIR_UNIGRAM + '%d.pkl'%(eid), 'w'))
        cPickle.dump(token_list, open(DIR_TOKEN + '%d.pkl'%(eid), 'w'))
def build(seqs, N):
    tf = {}
    df = {}
    pbar = progbar.start(len(seqs))
    l = 0
    for seq in seqs:
        for t in seq:
            if tf.has_key(t):
                tf[t] += 1
            else:
                tf[t] = 1.
        for t in set(seq):
            if df.has_key(t):
                df[t] += 1
            else:
                df[t] = 1
        l += 1
        pbar.update(l)
    pbar.finish()

    tf = sorted(tf.items(), key = lambda k: -k[1])
    n_seq = len(seqs)
    nums = set('0123456789')

    Widx = {}
    Widf = {}
    idx = 0
    for t, f in tf:
        if not t in nums:
            Widf[t] = np.log((1. + n_seq) / df[t])
            Widx[t] = idx
            idx += 1
            if idx == N:
                break
    return Widx, Widf
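# Usage sketch (assumption about intent: Widx maps the N most frequent
# non-digit tokens to column indices and Widf holds their idf weights, so a
# token sequence can be turned into a dense tf-idf vector like this).
def vectorize(seq, Widx, Widf):
    vec = np.zeros(len(Widx))
    for t in seq:
        if t in Widx:
            vec[Widx[t]] += Widf[t]  # add idf once per occurrence, i.e. tf * idf overall
    return vec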
def prepare_tokenize(ifname, ofname):
    import zhtokenizer
    from utils import zhprocessor, progbar

    lines = open(ifname, 'r').readlines()
    pbar = progbar.start(len(lines))
    l = 0
    seqs = []
    for line in lines:
        line = line.decode('utf8')
        line = zhprocessor.simplify(line)
        tokens = zhtokenizer.tokenize(line)
        seqs.append(tokens)
        l += 1
        pbar.update(l)
    pbar.finish()

    cPickle.dump(seqs, open(ofname, 'w'))
def main():
    dlist = []
    x_set = {}
    m_set = {}
    for i in range(90):
        fname = 'data/dataset/text/%d.txt'%(i)
        lines = open(fname, 'r').read().split('\n')
        dlist.append(lines)

        pbar = progbar.start(len(lines))
        for j, line in enumerate(lines):
            l = zhp.simplify(line.decode('utf8'))
            res = zht.segment(l, True)
            for w in res:
                if w.flag == 'x':
                    if x_set.has_key(w.word):
                        x_set[w.word] += 1
                    else:
                        x_set[w.word] = 1
                elif w.flag == 'm':
                    if m_set.has_key(w.word):
                        m_set[w.word] += 1
                    else:
                        m_set[w.word] = 1
            pbar.update(j + 1)
        pbar.finish()

    fobj = open('output/set_x.txt', 'w')
    x_set = sorted(x_set.items(), key = lambda k: -k[1])
    for k, v in x_set:
        fobj.write('%s (%d)\n'%(k, v))
    fobj.close()

    fobj = open('output/set_m.txt', 'w')
    m_set = sorted(m_set.items(), key = lambda k: -k[1])
    for k, v in m_set:
        fobj.write('%s (%d)\n'%(k, v))
    fobj.close()