def __init__(self, cluid, twarr, prob, hot, level, geo_list, time_list, keywords):
    self.cluid = cluid
    self.twarr = twarr
    self.prob = prob
    self.hot = hot
    self.level = level
    self.geo_list = geo_list
    self.time_list = time_list
    self.keywords = keywords
    self.od = Od()
    summary = Od(zip(
        ['cluster_id', 'prob', 'level', 'hot', 'keywords'],
        [cluid, prob, level, hot, keywords],
    ))
    geo_table = [Od(zip(['quality', 'address', 'country', 'bbox'], geo)) for geo in geo_list]
    time_table = Od(zip(['most_possible_time', 'earliest_time', 'latest_time'], time_list))
    self.od.update(zip(
        ['summary', 'geo_infer', 'time_infer', 'tweet_list'],
        [summary, geo_table, time_table, twarr],
    ))
def chain_to_info(**blockchain):
    """
    Convert the blocks and open transactions of a blockchain into
    JSON-serializable ordered dictionaries.
    :param blockchain: optional keyword arguments 'blocks' and 'txs'
    :return: (blocks_info, txs_info)
    """
    temp_chain = cp.deepcopy(blockchain['blocks']) if 'blocks' in blockchain else []
    temp_txs = cp.deepcopy(blockchain['txs']) if 'txs' in blockchain else []
    # Converting the blocks in the chain to ordered dictionaries
    blocks_info = [
        Od([('previous_hash', block.previous_hash),
            ('index', block.index),
            ('proof', block.proof),
            ('transactions', [
                Od([('sender', tx.sender), ('recipient', tx.recipient),
                    ('amount', tx.amount), ('timestamp', tx.timestamp),
                    ('signature', tx.signature)])
                for tx in block.transactions
            ]),
            ('timestamp', block.timestamp)])
        for block in temp_chain
    ]
    # Converting the transactions in the chain to ordered dictionaries
    txs_info = [
        Od([('sender', tx.sender), ('recipient', tx.recipient),
            ('amount', tx.amount), ('timestamp', tx.timestamp),
            ('signature', tx.signature)])
        for tx in temp_txs
    ]
    return blocks_info, txs_info
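# Usage sketch (not part of the original source): exercises chain_to_info with
# hypothetical Block/Transaction containers that only carry the attributes the
# function reads; the real project's classes may look different.
from collections import namedtuple

Transaction = namedtuple('Transaction', 'sender recipient amount timestamp signature')
Block = namedtuple('Block', 'previous_hash index proof transactions timestamp')

tx = Transaction('alice', 'bob', 1.5, 1590000000.0, 'sig')
genesis = Block('', 0, 100, [tx], 1590000000.0)
blocks_info, txs_info = chain_to_info(blocks=[genesis], txs=[tx])
print(blocks_info[0]['transactions'][0]['recipient'])  # 'bob'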
def main():
    # the printed results should also be written to a file
    base = '/home/cdong/works/research/clu/data/'
    files = [
        '20ng_tf.pkl', '20ng_tfidf.pkl', 'Event_tf.pkl', 'Event_tfidf.pkl',
        'Google_tf.pkl', 'Google_tfidf.pkl', 'Reuters_tf.pkl', 'Reuters_tfidf.pkl',
        'TREC_tf.pkl', 'TREC_tfidf.pkl',
    ]
    for file in files:
        file = base + file
        print(file)
        scores = list()
        for i in range(3):
            features, y_true = pickle.load(open(file, 'rb'))
            y_pred = GANMM(features)
            score = Od((s, get_score(y_true, y_pred, s)) for s in ('acc', 'ari', 'nmi'))
            scores.append(score)
            print(score)
        name2scores = Od()
        for score in scores:
            for score_name, score_value in score.items():
                name2scores.setdefault(score_name, list()).append(score_value)
        for score_name, score_values in name2scores.items():
            print('mean {}: {}\n'.format(score_name, np.mean(score_values)))
def vec2dict(idx_vec):
    od = Od(zip(names, [None] * len(names)))
    for n_idx, v_idx in enumerate(idx_vec):
        n, v = names[n_idx], values_list[n_idx][v_idx]
        if not callable(v):
            od[n] = v
    return od
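# Usage sketch (not part of the original source): vec2dict reads `names` and
# `values_list` from its enclosing scope; the grid below is a hypothetical
# stand-in that only shows the shapes it expects (callable values are skipped).
names = ['lr', 'batch_size', 'act']
values_list = [[0.1, 0.01], [32, 64], [lambda x: x]]
print(vec2dict([1, 0, 0]))
# OrderedDict([('lr', 0.01), ('batch_size', 32), ('act', None)])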
def group_iters():
    groups_ = Od()
    for p_, o_ in param2iter.items():
        k_, v_ = (p_[0], p_[1]), (p_[2], o_)
        groups_.setdefault(k_, list())
        # append mutates the list in place; list.__add__ would discard its result
        groups_[k_].append(v_)
    return groups_
def construct_od(self):
    """
    Build an ordered dict from the current cluster information; the dict
    maintains a tree structure describing the cluster. See lines 182-209 of
    this file for the meaning and value type of each field.
    :return: OrderedDict holding the structured cluster information
    """
    summary = Od(zip(
        ['cluster_id', 'prob', 'level', 'hot', 'keywords'],
        [self.cluid, self.prob, self.level, self.hot, self.keywords],
    ))
    od = Od(zip(
        ['summary', 'geo_infer', 'time_infer', 'tweet_list'],
        [summary, self.geo_table, self.time_table, self.twarr],
    ))
    return od
def _init_months():
    temp = [
        'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
        'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'
    ]
    months = Od()
    for i in range(1, 13):
        months[i] = temp[i - 1]
    return months
def get_scores(pred_target, add_on):
    preds, trues = list(), list()
    for batch in batches:
        c_probs = self.sess.run(pred_target, feed_dict=self.get_fd_by_batch(batch))
        preds.extend(np.argmax(c_probs, axis=1).reshape(-1))
        trues.extend(d.topic for d in batch)
    od = au.scores(trues, preds, au.eval_scores)
    return Od((k + add_on, v) for k, v in od.items())
def group_data_frame_columns(data_frame, columns):
    groups = [(Od(), data_frame)]
    for col in columns:
        for _ in range(len(groups)):
            p_od, p_df = groups.pop(0)
            for n_od, n_df in group_data_frame(p_df, col):
                p_od = p_od.copy()
                p_od.update(n_od)
                groups.append((p_od, n_df))
    return groups
def platformPulizie_old(self, db=None):
    if db is None:
        db = self.DataBase
    # todo: avoid loading past dates?
    # print(os.getcwdb())
    self.datePrenotazioni.clear()
    # a, m, g = self.amg(self._dataIn)
    for anno in db.keys():
        for mese in db[anno].keys():
            for giorno in db[anno][mese].keys():
                data = QtCore.QDate(anno, mese, giorno)
                plat = db[anno][mese][giorno]["checkIn"]["platform"]
                pulizie = db[anno][mese][giorno]["checkOut"]['data partenza']
                if 'platforms' not in self.datePrenotazioni:
                    self.datePrenotazioni['platforms'] = Od()
                if plat != '':
                    if plat not in self.datePrenotazioni['platforms']:
                        dat = {'date': [data]}
                        self.datePrenotazioni['platforms'][plat] = Od(dat)
                        # self.datePrenotazioni['platforms'][plat]['colore'] = QtGui.QColor
                    else:
                        print("PLAT: ", plat, "##")
                        self.datePrenotazioni['platforms'][plat]['date'].append(data)
                # if plat == "Booking":
                #     if data not in self.dateBooking:
                #         self.dateBooking.append(data)
                # elif plat == "AirB&B":
                #     if data not in self.dateAirbb:
                #         self.dateAirbb.append(data)
                # elif plat == 'Privati':
                #     if data not in self.datePrivati:
                #         self.datePrivati.append(data)
                if pulizie != '':
                    if pulizie not in self.datePulizie:
                        self.datePulizie.append(pulizie)
    return self.datePrenotazioni, self.datePulizie
def print_groups(use_max, sort_group, files):
    def group_score(frame, col):
        group_top = 5
        mean = np.mean(frame[col].values[1:group_top])
        return mean

    board = (read_board_multi if len(files) >= 60 else read_board_files)(use_max, files)
    board.fillna('_', inplace=True)
    for s in set(board.columns).intersection(set(group_by)):
        print(s, sorted(Counter(board[s])))
    sort_by = 'acc'
    board.sort_values(by=sort_by, ascending=False, inplace=True)
    ret_dfs = list()
    for dn_bv, dn_df in au.group_data_frame_columns(board, columns=[dn_]):
        data_name = dn_bv[dn_]
        print(data_name)
        bv_df_list = au.group_data_frame_columns(dn_df, columns=group_by)
        bv_df_sc_list = list()
        for bv, df in bv_df_list:
            evals = ['nmi', 'ari', 'acc']
            # evals += [s + '_nis' for s in evals]
            sc = Od((s, group_score(df, s)) for s in evals)
            bv_df_sc_list.append((bv, df, sc))
        if sort_group:
            bv_df_sc_list = sorted(
                bv_df_sc_list,
                key=lambda item: (item[-1]['acc'], item[-1]['ari'], item[-1]['nmi']))
        ret_df = pd.DataFrame()
        for bv, df, sc in bv_df_sc_list:
            # if not (bv[l1_] == '1.0' and bv[l2_] == '1.0'
            #         and bv[l3_] == '0.0'):
            #     continue
            print('<{}>'.format(len(df)), end=' ')
            print(' '.join(['{}={:7}'.format(*x) for x in bv.items()]), end='| ')
            print(' '.join(['{} {:.4f}'.format(s, g) for s, g in sc.items()]))
            print(df.iloc[:4, :], '\n')
            # print(','.join(df[gid_].tolist()))
            i = len(ret_df)
            for b, v in bv.items():
                ret_df.loc[i, b] = v
            for s, c in sc.items():
                ret_df.loc[i, s] = round(float(c), 4)
            ret_df.loc[i, 'LEN'] = len(df)
            ret_df.loc[i, dn_] = data_name
        print(data_name, '\n' * 3)
        # if comment is not None:
        #     ret_df.to_csv('{}_{}.csv'.format(comment, data_name))
        ret_dfs.append(ret_df)
    pd.concat(ret_dfs).to_csv('{}.csv'.format(comment))
def ddd(thing):
    stack = traceback.extract_stack()
    for s in stack:
        filename, lineno, function_name, code = s
        if filename == '/var/src/conkyconfpy/conkyconfpy.py':
            d = Od({
                'lineno': lineno,
                'function_name': function_name,
                'code': code,
            })
            pprint(d)
def platformPulizie(self, db=None):
    if db is None:
        db = self.DataBase
    # todo: avoid loading past dates?
    # print(os.getcwdb())
    self.datePrenotazioni.clear()
    # a, m, g = self.amg(self._dataIn)
    for anno in db.keys():
        for mese in db[anno].keys():
            for giorno in db[anno][mese].keys():
                data = QtCore.QDate(anno, mese, giorno)
                checkIn = db[anno][mese][giorno]["checkIn"]
                plat = checkIn["platform"]
                pulizie = db[anno][mese][giorno]["checkOut"]['data partenza']
                note = checkIn['note']
                if 'platforms' not in self.datePrenotazioni:
                    self.datePrenotazioni['platforms'] = Od()
                if plat != '':
                    if plat not in self.datePrenotazioni['platforms']:
                        dat = {'date': [data]}
                        self.datePrenotazioni['platforms'][plat] = Od(dat)
                        # self.datePrenotazioni['platforms'][plat]['colore'] = QtGui.QColor
                    else:
                        # print("not in self.datePrenotazioni (managerprenotazioni) PLAT: ", plat, "##")
                        self.datePrenotazioni['platforms'][plat]['date'].append(data)
                if pulizie != '':
                    if pulizie not in self.datePulizie:
                        print('pulizie', pulizie)
                        self.datePulizie.append(pulizie)
                # if spese > 0:
                #     print('spese append: ', spese)
                #     if data not in self.dateSpese:
                #         self.dateSpese.append(data)
                if len(note) > 0:
                    if data not in self.dateNote:
                        self.dateNote.append(data)
    return self.datePrenotazioni, self.datePulizie, self.dateNote
def print_coherent_topics():
    from collections import Counter
    from data.make_embeddings import load_pretrain_google_news
    word2vec = load_pretrain_google_news()
    print('word2vec load over')
    for file in res_files:
        topic_list, c_alpha_list, w_alpha_list, doc_embed, word_embed, clu_embed, d_name = np.load(file)
        print('c_alpha_list.shape', np.array(c_alpha_list).shape)
        cluid_list = np.argmax(c_alpha_list, axis=1)
        cluid2counter = Od((cluid, Counter()) for cluid in range(len(clu_embed)))
        for topic, cluid in zip(topic_list, cluid_list):
            cluid2counter[cluid][topic] += 1
        print(d_name)
        ifd, docarr = name2object[d_name].load_ifd_and_docarr()
        w_num = 31
        c_num = 100
        cw_sim = au.cosine_similarity(clu_embed, word_embed)
        cw_sim_sort = np.sort(cw_sim, axis=1)[:, :-w_num:-1]
        top_sim_clu = np.argsort(np.mean(cw_sim_sort, axis=1).reshape(-1))[::-1][:c_num]
        for cluid in sorted(list(top_sim_clu)):
            print(cluid)
            topic_distrib = cluid2counter[cluid].most_common()
            if len(topic_distrib) == 0:
                continue
            topic = topic_distrib[0][0]
            print('cluid: {}, guess topic: {}, distribution: {}'.format(
                cluid, topic, topic_distrib[:5]))
            cw_sim_top = cw_sim[cluid]
            top_word_id = np.argsort(cw_sim_top)[:-w_num:-1]
            valid_words = [
                ifd.id2word(wid) for wid in top_word_id if ifd.id2word(wid) in word2vec
            ]
            print(' '.join(valid_words))
        # cw_sim_top = cw_sim[top_sim_clu]
        # top_word_id = np.argsort(cw_sim_top, axis=1)[:, :-w_num:-1]
        # print(np.sort(np.mean(cw_sim_sort, axis=1).reshape(-1))[:-c_num:-1])
        # print(np.array([cw_sim[wid] for wid, cw_sim in zip(top_word_id, cw_sim_top)]))
        #
        # ifd = name2class[d_name].load_ifd()
        # for idx, wid_list in enumerate(top_word_id):
        #     valid_words = [ifd.id2word(wid) for wid in wid_list if ifd.id2word(wid) in word2vec]
        #     print('{}: '.format(idx) + ('{} ' * len(valid_words)).format(*valid_words))
        print('\n----\n')
def __init__(self, cluid, twarr, prob, hot, level, geo_list, time_list, keywords):
    self.cluid = cluid
    self.twarr = twarr
    self.prob = prob
    self.hot = hot
    self.level = level
    self.geo_list = geo_list
    self.time_list = time_list
    self.keywords = keywords
    self.geo_table = [
        Od(zip(['quality', 'address', 'country', 'bbox', 'freq'], g))
        for g in self.geo_list
    ]
    self.s_geo_table = [
        row for row in self.geo_table if row['quality'] not in self.large_geo
    ]
    self.time_table = Od(zip(['most_possible_time', 'earliest_time', 'latest_time'], self.time_list)) \
        if self.time_list is not None else None
def analyze_refine_mean_and_stderr(result_file, mean_std_file):
    using_scores = ['nmi', 'homo', 'cmplt', 'ari']
    arg_tpc_clu_list = iu.load_array(result_file)
    rows = list()
    for kwargs, topics, clusters in arg_tpc_clu_list:
        scores = [au.score(topics, clusters, s) for s in using_scores]
        res_dict = Od(zip(using_scores, scores))
        row = Od(list(kwargs.items()) + list(res_dict.items()))
        rows.append(row)
    rows = sorted(rows, key=lambda item: item['nmi'], reverse=True)
    df = pd.DataFrame(data=rows)
    print(df)
    score_array = df[using_scores].values
    mean = np.mean(score_array, axis=0)
    std = np.std(score_array, axis=0, ddof=1)
    table = list(zip(*[using_scores, mean, std]))
    lines = [
        '{}: {} ± {}'.format(name, round(mean, 4), round(std, 4))
        for name, mean, std in table
    ]
    iu.write_lines(mean_std_file, lines)
def print_top_words(self, ifd):
    from collections import OrderedDict as Od
    cluid2counter = Od((twh.cluster.cluid, Counter()) for twh in self.twharr)
    for twh in self.twharr:
        cluid2counter[twh.cluster.cluid][twh.topic] += 1
    print('total cluster number: {}'.format(len(self.cludict)))
    clu2distrib = Od()
    for cluid in sorted(cluid2counter.keys()):
        topic_distrib = cluid2counter[cluid].most_common()
        topic = topic_distrib[0][0]
        print('cluid: {}, topic: {}, t distrib: {}'.format(cluid, topic, topic_distrib[:10]))
        word_distrib = self.cludict[cluid].tokens.most_common()[:60]
        valid_words = [ifd.id2word(wid) for wid, cnt in word_distrib]
        print(' '.join(valid_words))
        clu2distrib[cluid] = {
            'topic_distrib': topic_distrib,
            'word_distrib': valid_words
        }
    return clu2distrib
def read_excel_onesheet(fd, sheet_name, same_line_debug=False):
    """
    The first column must be unique and in English; it is used as the first-level key.
    The first row must be unique and in English; it is used as the second-level key.
    Returns a dict.
    """
    if not os.path.exists(fd):
        print('@Error:We can not find file :%s' % fd)
    excel_hash = Od()
    workbook = xlrd.open_workbook(fd)
    worksheets = workbook.sheet_names()
    worksheet = workbook.sheet_by_name(sheet_name)
    num_rows = worksheet.nrows
    num_cols = worksheet.ncols
    for rown in range(1, num_rows):
        row_name = worksheet.cell_value(rown, 0)
        if same_line_debug:
            if row_name not in excel_hash.keys():
                excel_hash[row_name] = Od()
                key1 = row_name
            else:
                print('Same line %s' % row_name)
                exit()
        else:
            if rown not in excel_hash.keys():
                excel_hash[rown] = Od()
                key1 = rown
        for coln in range(0, num_cols):
            col_name = str(worksheet.cell_value(0, coln))
            if col_name in excel_hash[key1].keys():
                print('The same column %s' % col_name)
            excel_hash[key1][col_name] = worksheet.cell_value(rown, coln)
    return excel_hash
def __init__(self, info=None, shortcut=0):
    # self._dataIn = dataIn
    # self._domani = dataIn.addDays(1)
    # self._nome = nome
    # self._cognome = cognome
    # option to correct the path during tests
    self.shortcut = shortcut
    self._info = info
    # if self._info is not None:
    manageErr = False
    try:
        self.info = self.getInfo
        self._dataIn = self.info['data arrivo']
        self._dataOut = self.info['data partenza']
        self._domani = self._dataIn.addDays(1)
        self._nome = self.info['nome']
        self._cognome = self.info['cognome']
    except AttributeError:
        manageErr = True
    except TypeError:
        manageErr = True
    if manageErr:
        self._dataIn = QtCore.QDate().currentDate()
        self._domani = self._dataIn.addDays(1)
        self._dataOut = self._domani
        self._nome = None
        self._cognome = None
    self.occupate = []
    # old
    # self.DataBase = deepc(self.getDb(self._dataIn))
    self.DataBase = deepc(self.getDb())
    self.counter = 0
    # self.DataBase = Od()
    datePren = {'platforms': {}}
    self.datePrenotazioni = Od(datePren)
    self.dateBooking = []
    self.dateAirbb = []
    self.datePrivati = []
    self.datePulizie = []
    self.dateSpese = []
    self.dateNote = []
    self.platformDict = {
        'Booking': self.dateBooking,
        'AirB&B': self.dateAirbb,
        'Privati': self.datePrivati,
        'pulizie': self.datePulizie
    }
def analyze_mean_and_stderr(result_file):
    arg_tpc_clu_list = fu.load_array(result_file)
    rows = list()
    for kwargs, topics, clusters in arg_tpc_clu_list:
        s2v = Od((s, au.score(topics, clusters, s)) for s in au.eval_scores)
        row = Od(list(kwargs.items()) + list(s2v.items()))
        rows.append(row)
    rows = sorted(rows, key=lambda item: item['nmi'], reverse=True)
    df = pd.DataFrame(data=rows)
    # print(df)
    groups = au.group_data_frame(df, column='n_components')
    nmi_list, ari_list, acc_list = list(), list(), list()
    for _, df_ in groups:
        print(result_file)
        print(df_)
        nmis = df_['nmi'].values[0:6]
        aris = df_['ari'].values[0:6]
        accs = df_['acc'].values[0:6]
        nmi_list.append(au.mean_std(nmis))
        ari_list.append(au.mean_std(aris))
        acc_list.append(au.mean_std(accs))
    print(au.transpose(nmi_list))
    print(au.transpose(ari_list))
    print(au.transpose(acc_list))
def read_scores_from_file(file):
    e_flag = ''
    score_od = Od()
    for line in iu.read_lines(file):
        if line.startswith('b'):
            continue
        elif line.startswith('e'):
            e_flag = line[line.find('-') + 1:]
        elif line.startswith('{') and 'nmi' in line:
            for k, v in iu.loads(line).items():
                score_od.setdefault(k, list()).append(v)
    if len(score_od) == 0:
        fname = iu.base_name(file)
        print('{} - empty'.format(fname[:fname.find(',')]))
    epoch = len(list(score_od.values())[0]) if len(score_od) > 0 else 0
    return score_od, epoch, e_flag
def evaluate(self, batches):
    def get_scores(pred_target, add_on):
        preds, trues = list(), list()
        for batch in batches:
            c_probs = self.sess.run(pred_target, feed_dict=self.get_fd_by_batch(batch))
            preds.extend(np.argmax(c_probs, axis=1).reshape(-1))
            trues.extend(d.topic for d in batch)
        od = au.scores(trues, preds, au.eval_scores)
        return Od((k + add_on, v) for k, v in od.items())

    from collections import OrderedDict as Od
    scores = Od()
    scores.update(get_scores(self.pc_probs, add_on=''))
    if self.use_adv_nis:
        scores.update(get_scores(self.pc_probs_nis, add_on='_nis'))
    return scores
def _write_cytoscape_files(self, noa_path, eda_path, pp_path, idx_selection, label=None):
    """
    Compute some node and edge attributes and write these to files
    that can be loaded in cytoscape.
    """
    df = self.interactions_.loc[idx_selection, ]
    edges = [
        sorted([p1, p2]) for (p1, p2) in zip(df[P1].values, df[P2].values)
    ]

    # Compute some selected node-attributes and
    # write the noa (node-attribute) file.
    accessions = sorted(set([p for tup in edges for p in tup]))
    gene_names = [self.gene_names_[a] for a in accessions]
    node_in_training = [self.node_in_training_set(a) for a in accessions]
    cyto_n_attrs = pd.DataFrame(
        Od([('name', accessions),
            ('node in training', node_in_training),
            ('gene name', gene_names)]))
    cyto_n_attrs.to_csv(noa_path, sep=self.sep, index=False)

    # Compute some selected edge-attributes and
    # write the eda (edge-attribute) file.
    columns = ['source', 'target', 'name', 'edge in training', 'max-pr']
    cyto_e_attrs = dict()
    cyto_e_attrs['source'] = [p1 for p1, _ in edges]
    cyto_e_attrs['target'] = [p2 for _, p2 in edges]
    cyto_e_attrs['name'] = ['{} pp {}'.format(p1, p2) for p1, p2 in edges]
    cyto_e_attrs['edge in training'] = [
        self.edge_in_training_set(e, label) for e in edges
    ]
    cyto_e_attrs['max-pr'] = list(df['max-pr'].values)
    for label in self.labels:
        column = self._label_to_column(label)
        cyto_e_attrs[column] = df[column].values
        columns.append(column)
    cyto_interactions = pd.DataFrame(cyto_e_attrs, columns=columns)
    cyto_interactions.to_csv(pp_path, sep=self.sep, index=False)
    return self
def read_excel_onesheet2(fd, sheet_name, seq_name='测序编号', same_line_debug=False):
    """
    The first column must be unique and in English; it is used as the first-level key.
    The first row must be unique and in English; it is used as the second-level key.
    Returns a dict.
    """
    if not os.path.exists(fd):
        print('@Error:We can not find file :%s' % fd)
    excel_hash = Od()
    workbook = xlrd.open_workbook(fd)
    worksheets = workbook.sheet_names()
    # print(worksheets)  # .decode('utf-8')
    worksheet = workbook.sheet_by_name(sheet_name)
    num_rows = worksheet.nrows
    num_cols = worksheet.ncols
    num = 0
    for rown in range(0, num_rows):
        one_line_info = []
        for coln in range(0, num_cols):
            col_name = str(worksheet.cell_value(0, coln))
            if col_name == seq_name:
                seq_col_loc = coln
            one_line_info.append(worksheet.cell_value(rown, coln))
        seq_name_line = worksheet.cell_value(rown, seq_col_loc)
        if seq_name_line != "pass":
            num += 1
            excel_hash[num] = {}
            excel_hash[num]['seq_name'] = seq_name_line
            excel_hash[num]['one_line'] = one_line_info
    # print(excel_hash.keys())
    return excel_hash
def scores(y_true, y_pred, using_scores=eval_scores):
    return Od((s, score(y_true, y_pred, s)) for s in using_scores)
def merge_blast_rdp_file(blast_fp, rdp_fp, result_fp):
    blast_info = Od()
    rdp_info = {}
    data3 = open(result_fp, 'w')
    with open(rdp_fp) as data2:
        header2 = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus2', 'rdp_value']
        for each_line2 in data2:
            if each_line2.strip() == '':
                continue
            cnt2 = each_line2.strip().split('\t')
            seq2 = cnt2[0]
            taxon_speies = cnt2[1].split(';')
            rdp_value = cnt2[2]
            taxon_num = len(taxon_speies)
            rdp_kindom = '.'
            rdp_phylum = '.'
            rdp_class = '.'
            rdp_order = '.'
            rdp_family = '.'
            rdp_genus = '.'
            if taxon_num == 1:
                rdp_kindom = taxon_speies[0].replace('k__', '')
            elif taxon_num == 2:
                rdp_kindom = taxon_speies[0].replace('k__', '')
                rdp_phylum = taxon_speies[1].replace('p__', '')
            elif taxon_num == 3:
                rdp_kindom = taxon_speies[0].replace('k__', '')
                rdp_phylum = taxon_speies[1].replace('p__', '')
                rdp_class = taxon_speies[2].replace('c__', '')
            elif taxon_num == 4:
                rdp_kindom = taxon_speies[0].replace('k__', '')
                rdp_phylum = taxon_speies[1].replace('p__', '')
                rdp_class = taxon_speies[2].replace('c__', '')
                rdp_order = taxon_speies[3].replace('o__', '')
            elif taxon_num == 5:
                rdp_kindom = taxon_speies[0].replace('k__', '')
                rdp_phylum = taxon_speies[1].replace('p__', '')
                rdp_class = taxon_speies[2].replace('c__', '')
                rdp_order = taxon_speies[3].replace('o__', '')
                rdp_family = taxon_speies[4].replace('f__', '').replace('[', '').replace(']', '')
            elif taxon_num >= 6:
                rdp_kindom = taxon_speies[0].replace('k__', '')
                rdp_phylum = taxon_speies[1].replace('p__', '')
                rdp_class = taxon_speies[2].replace('c__', '')
                rdp_order = taxon_speies[3].replace('o__', '')
                rdp_family = taxon_speies[4].replace('f__', '').replace('[', '').replace(']', '')
                rdp_genus = taxon_speies[5].replace('g__', '')
            if rdp_kindom == '':
                rdp_kindom = '.'
            if rdp_phylum == '':
                rdp_phylum = '.'
            if rdp_class == '':
                rdp_class = '.'
            if rdp_order == '':
                rdp_order = '.'
            if rdp_family == '':
                rdp_family = '.'
            if rdp_genus == '':
                rdp_genus = '.'
            rdp_info[seq2] = [
                rdp_kindom, rdp_phylum, rdp_class, rdp_order, rdp_family, rdp_genus, rdp_value
            ]
    with open(blast_fp) as data1:
        for each_line in data1:
            # fixed: strip was referenced without calling it, so blank lines were never skipped
            if each_line.strip() == '':
                continue
            elif each_line.startswith('#'):
                header1 = each_line.strip().split('\t')
                header = header1 + header2
                data3.write('%s\n' % '\t'.join(header))
                continue
            cnt = each_line.strip().split('\t')
            seq = cnt[0]
            blast_info[seq] = cnt[1:]
            if seq in rdp_info.keys():
                info = cnt + rdp_info[seq]
            else:
                info = cnt + ['.', '.', '.', '.', '.', '.', '.']
            data3.write('%s\n' % '\t'.join(info))
    data3.close()
def group_data_frame(data_frame, column):
    value_set = sorted(set(data_frame[column]))
    return [(Od([(column, v)]), data_frame[data_frame[column] == v]) for v in value_set]
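# Usage sketch (not part of the original source): assumes pandas is imported as pd
# and that group_data_frame / group_data_frame_columns from this collection live in
# the same module. Each group is an (OrderedDict of fixed column values, sub-frame) pair.
frame = pd.DataFrame({'dn': ['TREC', 'TREC', 'Event'], 'l3': [0.0, 0.1, 0.0], 'nmi': [0.5, 0.6, 0.4]})
for fixed, sub in group_data_frame_columns(frame, ['dn', 'l3']):
    print(dict(fixed), len(sub))
# {'dn': 'Event', 'l3': 0.0} 1
# {'dn': 'TREC', 'l3': 0.0} 1
# {'dn': 'TREC', 'l3': 0.1} 1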
def count_occurence(y1, y2):
    y1_to_counter = Od((y, Counter()) for y in set(y1))
    for v1, v2 in zip(y1, y2):
        y1_to_counter[v1][v2] += 1
    return y1_to_counter
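# Usage sketch (not part of the original source): count_occurence maps each label
# in y1 to a Counter over the co-occurring labels in y2, e.g. cluster ids vs. topics.
# Assumes Od/Counter are imported as in the function above.
print(count_occurence([0, 0, 1, 1], ['a', 'b', 'a', 'a']))
# OrderedDict([(0, Counter({'a': 1, 'b': 1})), (1, Counter({'a': 2}))])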
def read_l3_influence():
    from me.analyze import group_data_frame_columns
    idx = 0
    df = pd.DataFrame()
    log_base = Nodes.select(n1702='./logging_tge/', ngpu='./logging')
    for file in iu.list_children(log_base, full_path=True, pattern='.txt$'):
        scores_od = Od()
        early = False
        for line in iu.read_lines(file):
            if 'early' in line:
                early = True
            if 'nmi' not in line:
                continue
            for s_type, s_value in name2entries(line, inter=' ', intra=':'):
                s_value = float(s_value)
                if s_type not in scores_od:
                    scores_od[s_type] = [s_value]
                else:
                    # append mutates the list; __add__ would discard its result
                    scores_od[s_type].append(s_value)
        file_name = file[file.rfind('/') + 1:]
        if len(scores_od) == 0:
            print('{} is empty'.format(file_name))
            continue
        for k, v in name2entries(file_name):
            df.loc[idx, k] = v
        # dict.pop takes the default positionally, not as a keyword argument
        epoch = scores_od.pop('e', None)
        for s_type, s_values in scores_od.items():
            top_k = 10
            top_value = np.mean(sorted(s_values, reverse=True)[:top_k])
            last_value = np.mean(s_values[::-1][:top_k])
            # df.loc[idx, s_type] = top_value
            df.loc[idx, s_type] = last_value
        df.loc[idx, 'epoch'] = str(max(epoch) + 1 if epoch is not None else 0) + (
            ' e.s.' if early else '')
        idx += 1
    df = df.sort_values(by='nmi', ascending=False)
    influence = Od()
    df_list = group_data_frame_columns(df, ['dn', 'l3'])
    for bv_list, d in df_list:
        print(' '.join(['{}={}'.format(*bv) for bv in bv_list]))
        nmi_mean, nmi_std = au.mean_std(d['nmi'].values[0:])
        ari_mean, ari_std = au.mean_std(d['ari'].values[0:])
        print('nmi:{:.4f}+{:.4f}'.format(nmi_mean, nmi_std))
        print('ari:{:.4f}+{:.4f}'.format(ari_mean, ari_std))
        print(d.iloc[:10, :])
        dn, l3 = dict(bv_list)['dn'], float(dict(bv_list)['l3'])
        influence.setdefault(dn, list())
        # if l3 <= 1e-4:
        #     nmi_mean -= 0.01
        #     ari_mean -= 0.01
        influence[dn].append(
            [round(v, 6) for v in [l3, nmi_mean, ari_mean, nmi_std, ari_std]])
        print()
    for dn, values in influence.items():
        influence[dn] = [[round(v, 6) for v in value]
                         for value in np.array(values, dtype=float).T]
        # arr = np.array(influence[dn][1])
        # print(arr)
        # print(arr - arr[0])
        # print()
    with open('influence.json', mode='w') as fp:
        iu.json.dump(influence, fp)
def read_iterations():
    def append_score_line_to_od(od_, score_line_):
        for score_name, score_value in name2entries(score_line_, inter=' ', intra=':'):
            score_value = float(score_value)
            if score_name not in od_:
                od_[score_name] = [score_value]
            else:
                # append mutates the list; __add__ would discard its result
                od_[score_name].append(score_value)

    def append_score_od_to_iter(file_name_, od_):
        entries = dict(name2entries(file_name_))
        param_as_key = tuple(entries[k] for k in desired_keys)
        if param_as_key not in param2iter:
            param2iter[param_as_key] = [od_]
        else:
            param2iter[param_as_key].append(od_)

    def group_iters():
        groups_ = Od()
        for p_, o_ in param2iter.items():
            k_, v_ = (p_[0], p_[1]), (p_[2], o_)
            groups_.setdefault(k_, list())
            groups_[k_].append(v_)
        return groups_

    param2iter = Od()
    log_base = Nodes.select(n1702='./logging_half_r/', ngpu='./logging_r')
    for file in iu.list_children(log_base, full_path=True, pattern='gid.+\.txt$'):
        file_name = file[file.rfind('/') + 1:]
        # if dict(name2entries(file_name))['vs'] in {'3', 3}:
        #     continue
        score_dict = Od()
        for line in iu.read_lines(file):
            if 'nmi' not in line:
                continue
            append_score_line_to_od(score_dict, line)
        if len(score_dict) == 0:
            print('{} is empty'.format(file_name))
            continue
        append_score_od_to_iter(file_name, score_dict)
    # for param in sorted(param2iter.keys(), key=lambda i: i[0]):
    params = Nodes.select(n1702=[
        ('TREC', 0.01, 0.1),
        ('Google', 0.1, 0.1),
        ('Event', 0.01, 0.01),
        ('20ng', 0.1, 0.1),
    ], ngpu=[
        ('Reuters', 0.001, 0.1),
    ])
    params = list([tuple(map(str, param)) for param in params])
    array = list()
    for param in param2iter.keys():
        od_list = param2iter[param]
        for score_dict in od_list:
            # dict.pop takes the default positionally, not as a keyword argument
            score_dict.pop('e', None)
        array.extend([od_list])
    print(len(array))