    def __init__(self, cluid, twarr, prob, hot, level, geo_list, time_list,
                 keywords):
        self.cluid = cluid
        self.twarr = twarr
        self.prob = prob
        self.hot = hot
        self.level = level
        self.geo_list = geo_list
        self.time_list = time_list
        self.keywords = keywords

        self.od = Od()
        summary = Od(
            zip(
                ['cluster_id', 'prob', 'level', 'hot', 'keywords'],
                [cluid, prob, level, hot, keywords],
            ))
        geo_table = [
            Od(zip(['quality', 'address', 'country', 'bbox'], geo))
            for geo in geo_list
        ]
        time_table = Od(
            zip(['most_possible_time', 'earliest_time', 'latest_time'],
                time_list))
        self.od.update(
            zip(
                ['summary', 'geo_infer', 'time_infer', 'tweet_list'],
                [summary, geo_table, time_table, twarr],
            ))
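
Because every level of this structure is an OrderedDict, the serialized output keeps a stable key order. A minimal sketch of that property, assuming only the standard library:

import json
from collections import OrderedDict as Od

od = Od()
summary = Od(zip(['cluster_id', 'prob'], [3, 0.97]))
od.update(zip(['summary', 'tweet_list'], [summary, []]))
# Keys serialize in insertion order: 'summary' first, then 'tweet_list'.
print(json.dumps(od, indent=2))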
Example No. 2
def chain_to_info(**blockchain):
    """
    Convert a blockchain given as keyword arguments into plain data:
    ordered dicts for its blocks and for its pending transactions.
    :param blockchain: optional 'blocks' and 'txs' keyword entries
    :return: (blocks_info, txs_info), two lists of OrderedDicts
    """
    # assumes: import copy as cp; from collections import OrderedDict as Od
    temp_chain = cp.deepcopy(
        blockchain['blocks']) if 'blocks' in blockchain else []
    temp_txs = cp.deepcopy(blockchain['txs']) if 'txs' in blockchain else []
    # Convert the blocks in the chain to ordered dictionaries
    blocks_info = [
        Od([('previous_hash', block.previous_hash), ('index', block.index),
            ('proof', block.proof),
            ('transactions', [
                Od([('sender', tx.sender), ('recipient', tx.recipient),
                    ('amount', tx.amount), ('timestamp', tx.timestamp),
                    ('signature', tx.signature)])
                for tx in block.transactions
            ]), ('timestamp', block.timestamp)]) for block in temp_chain
    ]
    # Convert the transactions in the chain to ordered dictionaries
    txs_info = [
        Od([('sender', tx.sender), ('recipient', tx.recipient),
            ('amount', tx.amount), ('timestamp', tx.timestamp),
            ('signature', tx.signature)]) for tx in temp_txs
    ]
    return blocks_info, txs_info
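
A minimal usage sketch for chain_to_info, with hypothetical namedtuple stand-ins for the real block and transaction classes (only the attributes the function reads):

import copy as cp
from collections import namedtuple, OrderedDict as Od

# Hypothetical stand-ins; the real classes only need these attributes.
Tx = namedtuple('Tx', 'sender recipient amount timestamp signature')
Block = namedtuple('Block', 'previous_hash index proof transactions timestamp')

tx = Tx('alice', 'bob', 5, 1234567890, 'sig')
block = Block('0' * 64, 0, 100, [tx], 1234567890)
blocks_info, txs_info = chain_to_info(blocks=[block], txs=[tx])
print(blocks_info[0]['index'], txs_info[0]['sender'])  # -> 0 alice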
Example No. 3
def main():
    # ideally, the printed results should also be written to a file
    base = '/home/cdong/works/research/clu/data/'
    files = [
        '20ng_tf.pkl',
        '20ng_tfidf.pkl',
        'Event_tf.pkl',
        'Event_tfidf.pkl',
        'Google_tf.pkl',
        'Google_tfidf.pkl',
        'Reuters_tf.pkl',
        'Reuters_tfidf.pkl',
        'TREC_tf.pkl',
        'TREC_tfidf.pkl',
    ]
    for file in files:
        file = base + file
        print(file)
        scores = list()
        for i in range(3):
            features, y_true = pickle.load(open(file, 'rb'))
            y_pred = GANMM(features)
            score = Od((s, get_score(y_true, y_pred, s))
                       for s in ('acc', 'ari', 'nmi'))
            scores.append(score)
            print(score)
        name2scores = Od()
        for score in scores:
            for score_name, score_value in score.items():
                name2scores.setdefault(score_name, list()).append(score_value)
        # report each metric's mean over the runs once per file
        for score_name, score_values in name2scores.items():
            print('mean {}: {}\n'.format(score_name, np.mean(score_values)))
Example No. 4
def vec2dict(idx_vec):
    od = Od(zip(names, [None] * len(names)))
    for n_idx, v_idx in enumerate(idx_vec):
        n, v = names[n_idx], values_list[n_idx][v_idx]
        if not callable(v):
            od[n] = v
    return od
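
vec2dict closes over names and values_list from the enclosing scope; a self-contained sketch with hypothetical values (callable entries are skipped, leaving None):

from collections import OrderedDict as Od

names = ['lr', 'batch_size', 'init_fn']         # hypothetical parameter names
values_list = [[0.01, 0.1], [32, 64], [print]]  # `print` stands in for a callable

print(vec2dict([1, 0, 0]))
# -> Od([('lr', 0.1), ('batch_size', 32), ('init_fn', None)])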
Example No. 5
def group_iters():
    groups_ = Od()
    for p_, o_ in param2iter.items():
        k_, v_ = (p_[0], p_[1]), (p_[2], o_)
        # list.__add__ returns a new list and discards it; append instead
        groups_.setdefault(k_, list()).append(v_)
    return groups_
Example No. 6
    def construct_od(self):
        """
        Build an order-preserving dict from the current cluster info; the
        dict maintains a tree structure that describes the cluster. See
        lines 182-209 of this file for each field's meaning and value type.
        :return: OrderedDict holding the structured cluster information
        """
        summary = Od(
            zip(
                ['cluster_id', 'prob', 'level', 'hot', 'keywords'],
                [self.cluid, self.prob, self.level, self.hot, self.keywords],
            ))
        od = Od(
            zip(
                ['summary', 'geo_infer', 'time_infer', 'tweet_list'],
                [summary, self.geo_table, self.time_table, self.twarr],
            ))
        return od
Example No. 7
def _init_months():
    temp = [
        'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct',
        'Nov', 'Dec'
    ]
    months = Od()
    for i in range(1, 13):
        months[i] = temp[i - 1]
    return months
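
The same table can be built in one expression from the standard library; a sketch using calendar.month_abbr (locale-aware, so it matches the hard-coded list above only under an English locale):

from calendar import month_abbr
from collections import OrderedDict as Od

months = Od((i, month_abbr[i]) for i in range(1, 13))
print(months[1], months[12])  # -> Jan Dec (under an English locale)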
Example No. 8
def get_scores(pred_target, add_on):
    preds, trues = list(), list()
    for batch in batches:
        c_probs = self.sess.run(pred_target,
                                feed_dict=self.get_fd_by_batch(batch))
        preds.extend(np.argmax(c_probs, axis=1).reshape(-1))
        trues.extend(d.topic for d in batch)
    od = au.scores(trues, preds, au.eval_scores)
    return Od((k + add_on, v) for k, v in od.items())
Example No. 9
def group_data_frame_columns(data_frame, columns):
    groups = [(Od(), data_frame)]
    for col in columns:
        for _ in range(len(groups)):
            p_od, p_df = groups.pop(0)
            for n_od, n_df in group_data_frame(p_df, col):
                p_od = p_od.copy()
                p_od.update(n_od)
                groups.append((p_od, n_df))
    return groups
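
A toy run of group_data_frame_columns, assuming pandas and the group_data_frame helper shown in Example No. 27 below; each result pairs an OrderedDict of fixed column values with its matching sub-frame:

import pandas as pd

df = pd.DataFrame({'a': [1, 1, 2], 'b': ['x', 'y', 'x'], 'v': [10, 20, 30]})
for od, sub in group_data_frame_columns(df, columns=['a', 'b']):
    print(dict(od), list(sub['v']))
# -> {'a': 1, 'b': 'x'} [10]
#    {'a': 1, 'b': 'y'} [20]
#    {'a': 2, 'b': 'x'} [30]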
Example No. 10
    def platformPulizie_old(self, db=None):
        if db is None:
            db = self.DataBase

        # todo: avoid loading past dates?
        # print(os.getcwdb())
        self.datePrenotazioni.clear()
        # a, m, g = self.amg(self._dataIn)
        for anno in db.keys():
            for mese in db[anno].keys():
                for giorno in db[anno][mese].keys():
                    data = QtCore.QDate(anno, mese, giorno)
                    plat = db[anno][mese][giorno]["checkIn"]["platform"]
                    pulizie = db[anno][mese][giorno]["checkOut"][
                        'data partenza']
                    if 'platforms' not in self.datePrenotazioni:
                        self.datePrenotazioni['platforms'] = Od()
                    if plat != '':
                        if plat not in self.datePrenotazioni['platforms']:
                            dat = {'date': [data]}
                            self.datePrenotazioni['platforms'][plat] = Od(dat)
                            # self.datePrenotazioni['platforms'][plat]['colore'] = QtGui.QColor
                        else:
                            print("PLAT: ", plat, "##")
                            self.datePrenotazioni['platforms'][plat][
                                'date'].append(data)
                    # if plat == "Booking":
                    #     if data not in self.dateBooking:
                    #         self.dateBooking.append(data)
                    # elif plat == "AirB&B":
                    #     if data not in self.dateAirbb:
                    #         self.dateAirbb.append(data)
                    # elif plat == 'Privati':
                    #     if data not in self.datePrivati:
                    #         self.datePrivati.append(data)
                    if pulizie != '':
                        if pulizie not in self.datePulizie:
                            self.datePulizie.append(pulizie)

        return self.datePrenotazioni, self.datePulizie
Example No. 11
def print_groups(use_max, sort_group, files):
    def group_score(frame, col):
        group_top = 5
        mean = np.mean(frame[col].values[1:group_top])
        return mean

    board = (read_board_multi if len(files) >= 60 else read_board_files)(
        use_max, files)
    board.fillna('_', inplace=True)
    for s in set(board.columns).intersection(set(group_by)):
        print(s, sorted(Counter(board[s])))
    sort_by = 'acc'
    board.sort_values(by=sort_by, ascending=False, inplace=True)

    ret_dfs = list()
    for dn_bv, dn_df in au.group_data_frame_columns(board, columns=[dn_]):
        data_name = dn_bv[dn_]
        print(data_name)
        bv_df_list = au.group_data_frame_columns(dn_df, columns=group_by)
        bv_df_sc_list = list()
        for bv, df in bv_df_list:
            evals = ['nmi', 'ari', 'acc']
            # evals += [s + '_nis' for s in evals]
            sc = Od((s, group_score(df, s)) for s in evals)
            bv_df_sc_list.append((bv, df, sc))
        if sort_group:
            bv_df_sc_list = sorted(
                bv_df_sc_list,
                key=lambda item:
                (item[-1]['acc'], item[-1]['ari'], item[-1]['nmi']))
        ret_df = pd.DataFrame()
        for bv, df, sc in bv_df_sc_list:
            # if not (bv[l1_] == '1.0' and bv[l2_] == '1.0'
            #         and bv[l3_] == '0.0'):
            #     continue
            print('<{}>'.format(len(df)), end='    ')
            print(' '.join(['{}={:7}'.format(*x) for x in bv.items()]),
                  end='|  ')
            print('    '.join(
                ['{} {:.4f}'.format(s, g) for s, g in sc.items()]))
            print(df.iloc[:4, :], '\n')
            # print(','.join(df[gid_].tolist()))
            i = len(ret_df)
            for b, v in bv.items():
                ret_df.loc[i, b] = v
            for s, c in sc.items():
                ret_df.loc[i, s] = round(float(c), 4)
            ret_df.loc[i, 'LEN'] = len(df)
            ret_df.loc[i, dn_] = data_name
        print(data_name, '\n' * 3)
        # if comment is not None:
        #     ret_df.to_csv('{}_{}.csv'.format(comment, data_name))
        ret_dfs.append(ret_df)
    # `comment` is assumed to come from the enclosing module scope
    pd.concat(ret_dfs).to_csv('{}.csv'.format(comment))
Example No. 12
def ddd(thing):
    stack = traceback.extract_stack()
    for s in stack:
        filename, lineno, function_name, code = s
        if filename == '/var/src/conkyconfpy/conkyconfpy.py':
            d = Od({
                'lineno': lineno,
                'function_name': function_name,
                'code': code,
            })
            pprint(d)
Example No. 13
    def platformPulizie(self, db=None):
        if db is None:
            db = self.DataBase

        # todo: avoid loading past dates?
        # print(os.getcwdb())
        self.datePrenotazioni.clear()
        # a, m, g = self.amg(self._dataIn)
        for anno in db.keys():
            for mese in db[anno].keys():
                for giorno in db[anno][mese].keys():
                    data = QtCore.QDate(anno, mese, giorno)
                    checkIn = db[anno][mese][giorno]["checkIn"]
                    plat = checkIn["platform"]
                    pulizie = db[anno][mese][giorno]["checkOut"][
                        'data partenza']
                    note = checkIn['note']
                    if 'platforms' not in self.datePrenotazioni:
                        self.datePrenotazioni['platforms'] = Od()
                    if plat != '':
                        if plat not in self.datePrenotazioni['platforms']:
                            dat = {'date': [data]}
                            self.datePrenotazioni['platforms'][plat] = Od(dat)
                            # self.datePrenotazioni['platforms'][plat]['colore'] = QtGui.QColor
                        else:
                            # print("not in self.datePrenotazioni (managerprenotazioni) PLAT: ", plat, "##")
                            self.datePrenotazioni['platforms'][plat][
                                'date'].append(data)
                    if pulizie != '':
                        if pulizie not in self.datePulizie:
                            print('pulizie', pulizie)
                            self.datePulizie.append(pulizie)
                    # if spese > 0:
                    #     print('spese append: ', spese)
                    #     if data not in self.dateSpese:
                    #         self.dateSpese.append(data)
                    if len(note) > 0:
                        if data not in self.dateNote:
                            self.dateNote.append(data)

        return self.datePrenotazioni, self.datePulizie, self.dateNote
Example No. 14
def print_coherent_topics():
    from collections import Counter
    from data.make_embeddings import load_pretrain_google_news
    word2vec = load_pretrain_google_news()
    print('word2vec load over')

    for file in res_files:
        topic_list, c_alpha_list, w_alpha_list, doc_embed, word_embed, clu_embed, d_name = np.load(
            file)

        print('c_alpha_list.shape', np.array(c_alpha_list).shape)
        cluid_list = np.argmax(c_alpha_list, axis=1)
        cluid2counter = Od(
            (cluid, Counter()) for cluid in range(len(clu_embed)))
        for topic, cluid in zip(topic_list, cluid_list):
            cluid2counter[cluid][topic] += 1

        print(d_name)
        ifd, docarr = name2object[d_name].load_ifd_and_docarr()

        w_num = 31
        c_num = 100
        cw_sim = au.cosine_similarity(clu_embed, word_embed)
        cw_sim_sort = np.sort(cw_sim, axis=1)[:, :-w_num:-1]
        top_sim_clu = np.argsort(np.mean(cw_sim_sort,
                                         axis=1).reshape(-1))[::-1][:c_num]
        for cluid in sorted(list(top_sim_clu)):
            print(cluid)
            topic_distrib = cluid2counter[cluid].most_common()
            if len(topic_distrib) == 0:
                continue
            topic = topic_distrib[0][0]
            print('cluid: {}, guess topic: {}, distribution: {}'.format(
                cluid, topic, topic_distrib[:5]))

            cw_sim_top = cw_sim[cluid]
            top_word_id = np.argsort(cw_sim_top)[:-w_num:-1]
            valid_words = [
                ifd.id2word(wid) for wid in top_word_id
                if ifd.id2word(wid) in word2vec
            ]
            print(' '.join(valid_words))

        # cw_sim_top = cw_sim[top_sim_clu]
        # top_word_id = np.argsort(cw_sim_top, axis=1)[:, :-w_num:-1]
        # print(np.sort(np.mean(cw_sim_sort, axis=1).reshape(-1))[:-c_num:-1])
        # print(np.array([cw_sim[wid] for wid, cw_sim in zip(top_word_id, cw_sim_top)]))
        #
        # ifd = name2class[d_name].load_ifd()
        # for idx, wid_list in enumerate(top_word_id):
        #     valid_words = [ifd.id2word(wid) for wid in wid_list if ifd.id2word(wid) in word2vec]
        #     print('{}: '.format(idx) + ('{} ' * len(valid_words)).format(*valid_words))
        print('\n----\n')
Example No. 15
    def __init__(self, cluid, twarr, prob, hot, level, geo_list, time_list,
                 keywords):
        self.cluid = cluid
        self.twarr = twarr
        self.prob = prob
        self.hot = hot
        self.level = level
        self.geo_list = geo_list
        self.time_list = time_list
        self.keywords = keywords

        self.geo_table = [
            Od(zip(['quality', 'address', 'country', 'bbox', 'freq'], g))
            for g in self.geo_list
        ]
        self.s_geo_table = [
            row for row in self.geo_table
            if row['quality'] not in self.large_geo
        ]
        self.time_table = Od(
            zip(['most_possible_time', 'earliest_time', 'latest_time'],
                self.time_list)) if self.time_list is not None else None
Example No. 16
def analyze_refine_mean_and_stderr(result_file, mean_std_file):
    using_scores = ['nmi', 'homo', 'cmplt', 'ari']
    arg_tpc_clu_list = iu.load_array(result_file)
    rows = list()
    for kwargs, topics, clusters in arg_tpc_clu_list:
        scores = [au.score(topics, clusters, s) for s in using_scores]
        res_dict = Od(zip(using_scores, scores))
        row = Od(list(kwargs.items()) + list(res_dict.items()))
        rows.append(row)
    rows = sorted(rows, key=lambda item: item['nmi'], reverse=True)
    df = pd.DataFrame(data=rows)
    print(df)
    score_array = df[using_scores].values
    mean = np.mean(score_array, axis=0)
    std = np.std(score_array, axis=0, ddof=1)
    table = list(zip(*[using_scores, mean, std]))
    lines = [
        '{}: {} ± {}'.format(name, round(mean, 4), round(std, 4))
        for name, mean, std in table
    ]
    iu.write_lines(mean_std_file, lines)
Example No. 17
    def print_top_words(self, ifd):
        from collections import OrderedDict as Od
        cluid2counter = Od(
            (twh.cluster.cluid, Counter()) for twh in self.twharr)
        for twh in self.twharr:
            cluid2counter[twh.cluster.cluid][twh.topic] += 1
        print('total cluster number: {}'.format(len(self.cludict)))
        clu2distrib = Od()
        for cluid in sorted(cluid2counter.keys()):
            topic_distrib = cluid2counter[cluid].most_common()
            topic = topic_distrib[0][0]
            print('cluid: {}, topic: {}, t distrib: {}'.format(
                cluid, topic, topic_distrib[:10]))
            word_distrib = self.cludict[cluid].tokens.most_common()[:60]
            valid_words = [ifd.id2word(wid) for wid, cnt in word_distrib]
            print(' '.join(valid_words))
            clu2distrib[cluid] = {
                'topic_distrib': topic_distrib,
                'word_distrib': valid_words
            }
        return clu2distrib
Example No. 18
    def read_excel_onesheet(fd, sheet_name, same_line_debug=False):
        """:
        第一列必须唯一,且为英语,作为第一层键;
        第一行必须唯一,且为英语,作为二层键
        返回dict
        """
        if os.path.exists(fd):
            pass
        else:
            print '@Error:We can not find file :%s' % fd

        excel_hash = Od()
        workbook = xlrd.open_workbook(fd)
        worksheets = workbook.sheet_names()

        worksheet = workbook.sheet_by_name(sheet_name)
        num_rows = worksheet.nrows
        num_cols = worksheet.ncols
        for rown in range(1, num_rows):
            row_name = worksheet.cell_value(rown, 0)
            if same_line_debug:
                if row_name not in excel_hash.keys():
                    excel_hash[row_name] = Od()
                    key1 = row_name
                else:
                    print('Same line %s' % row_name)
                    exit()
            else:
                if rown not in excel_hash.keys():
                    excel_hash[rown] = Od()
                    key1 = rown
            for coln in range(0, num_cols):
                col_name = str(worksheet.cell_value(0, coln))
                if col_name in excel_hash[key1]:
                    print('Duplicate column %s' % col_name)
                excel_hash[key1][col_name] = worksheet.cell_value(rown, coln)
        return excel_hash
Example No. 19
    def __init__(self, info=None, shortcut=0):

        # self._dataIn = dataIn
        # self._domani = dataIn.addDays(1)
        # self._nome = nome
        # self._cognome = cognome

        # option to correct the path during tests
        self.shortcut = shortcut
        self._info = info
        # if self._info is not None:
        manageErr = False
        try:
            self.info = self.getInfo
            self._dataIn = self.info['data arrivo']
            self._dataOut = self.info['data partenza']
            self._domani = self._dataIn.addDays(1)
            self._nome = self.info['nome']
            self._cognome = self.info['cognome']
        except AttributeError:
            manageErr = True
        except TypeError:
            manageErr = True
        if manageErr:
            self._dataIn = QtCore.QDate().currentDate()
            self._domani = self._dataIn.addDays(1)
            self._dataOut = self._domani
            self._nome = None
            self._cognome = None
        self.occupate = []
        # old
        # self.DataBase = deepc(self.getDb(self._dataIn))
        self.DataBase = deepc(self.getDb())
        self.counter = 0
        # self.DataBase = Od()
        datePren = {'platforms': {}}
        self.datePrenotazioni = Od(datePren)
        self.dateBooking = []
        self.dateAirbb = []
        self.datePrivati = []
        self.datePulizie = []
        self.dateSpese = []
        self.dateNote = []
        self.platformDict = {
            'Booking': self.dateBooking,
            'AirB&B': self.dateAirbb,
            'Privati': self.datePrivati,
            'pulizie': self.datePulizie
        }
Example No. 20
def analyze_mean_and_stderr(result_file):
    arg_tpc_clu_list = fu.load_array(result_file)
    rows = list()
    for kwargs, topics, clusters in arg_tpc_clu_list:
        s2v = Od((s, au.score(topics, clusters, s)) for s in au.eval_scores)
        row = Od(list(kwargs.items()) + list(s2v.items()))
        rows.append(row)
    rows = sorted(rows, key=lambda item: item['nmi'], reverse=True)
    df = pd.DataFrame(data=rows)
    # print(df)
    groups = au.group_data_frame(df, column='n_components')
    nmi_list, ari_list, acc_list = list(), list(), list()
    for _, df_ in groups:
        print(result_file)
        print(df_)
        nmis = df_['nmi'].values[0:6]
        aris = df_['ari'].values[0:6]
        accs = df_['acc'].values[0:6]
        nmi_list.append(au.mean_std(nmis))
        ari_list.append(au.mean_std(aris))
        acc_list.append(au.mean_std(accs))
    print(au.transpose(nmi_list))
    print(au.transpose(ari_list))
    print(au.transpose(acc_list))
Example No. 21
def read_scores_from_file(file):
    e_flag = ''
    score_od = Od()
    for line in iu.read_lines(file):
        if line.startswith('b'):
            continue
        elif line.startswith('e'):
            e_flag = line[line.find('-') + 1:]
        elif line.startswith('{') and 'nmi' in line:
            for k, v in iu.loads(line).items():
                score_od.setdefault(k, list()).append(v)
    if len(score_od) == 0:
        fname = iu.base_name(file)
        print('{} - empty'.format(fname[:fname.find(',')]))
    epoch = len(list(score_od.values())[0]) if len(score_od) > 0 else 0
    return score_od, epoch, e_flag
Example No. 22
    def evaluate(self, batches):
        def get_scores(pred_target, add_on):
            preds, trues = list(), list()
            for batch in batches:
                c_probs = self.sess.run(pred_target,
                                        feed_dict=self.get_fd_by_batch(batch))
                preds.extend(np.argmax(c_probs, axis=1).reshape(-1))
                trues.extend(d.topic for d in batch)
            od = au.scores(trues, preds, au.eval_scores)
            return Od((k + add_on, v) for k, v in od.items())

        from collections import OrderedDict as Od
        scores = Od()
        scores.update(get_scores(self.pc_probs, add_on=''))
        if self.use_adv_nis:
            scores.update(get_scores(self.pc_probs_nis, add_on='_nis'))
        return scores
Example No. 23
    def _write_cytoscape_files(self,
                               noa_path,
                               eda_path,
                               pp_path,
                               idx_selection,
                               label=None):
        """
        Compute some node and edge attributes and write these to
        files that can be loaded in cytoscape.
        """
        df = self.interactions_.loc[idx_selection, ]
        edges = [
            sorted([p1, p2]) for (p1, p2) in zip(df[P1].values, df[P2].values)
        ]

        # Compute some selected node-attributes and
        # write the noa (node-attribute) file.
        accessions = sorted(set([p for tup in edges for p in tup]))
        gene_names = [self.gene_names_[a] for a in accessions]
        node_in_training = [self.node_in_training_set(a) for a in accessions]
        cyto_n_attrs = pd.DataFrame(
            Od([('name', accessions), ('node in training', node_in_training),
                ('gene name', gene_names)]))
        cyto_n_attrs.to_csv(noa_path, sep=self.sep, index=False)

        # Compute some selected edge-attributes and
        # write the eda (edge-attribute) file.
        columns = ['source', 'target', 'name', 'edge in training', 'max-pr']
        cyto_e_attrs = dict()
        cyto_e_attrs['source'] = [p1 for p1, _ in edges]
        cyto_e_attrs['target'] = [p2 for _, p2 in edges]
        cyto_e_attrs['name'] = ['{} pp {}'.format(p1, p2) for p1, p2 in edges]
        cyto_e_attrs['edge in training'] = [
            self.edge_in_training_set(e, label) for e in edges
        ]
        cyto_e_attrs['max-pr'] = list(df['max-pr'].values)
        for label in self.labels:
            column = self._label_to_column(label)
            cyto_e_attrs[column] = df[column].values
            columns.append(column)

        cyto_interactions = pd.DataFrame(cyto_e_attrs, columns=columns)
        cyto_interactions.to_csv(pp_path, sep=self.sep, index=False)
        return self
Example No. 24
    def read_excel_onesheet2(fd,
                             sheet_name,
                             seq_name='测序编号',  # column header: 'sequencing ID'
                             same_line_debug=False):
        """:
        第一列必须唯一,且为英语,作为第一层键;
        第一行必须唯一,且为英语,作为二层键
        返回dict
        """
        if os.path.exists(fd):
            pass
        else:
            print '@Error:We can not find file :%s' % fd

        excel_hash = Od()
        workbook = xlrd.open_workbook(fd)
        worksheets = workbook.sheet_names()

        # print worksheets #.decode('utf-8')

        worksheet = workbook.sheet_by_name(sheet_name)
        num_rows = worksheet.nrows
        num_cols = worksheet.ncols
        num = 0
        for rown in range(0, num_rows):
            one_line_info = []
            for coln in range(0, num_cols):
                col_name = str(worksheet.cell_value(0, coln))
                if col_name == seq_name:
                    seq_col_loc = coln
                one_line_info.append(worksheet.cell_value(rown, coln))

            seq_name_line = worksheet.cell_value(rown, seq_col_loc)
            if seq_name_line != "pass":
                num += 1
                excel_hash[num] = {}
                excel_hash[num]['seq_name'] = seq_name_line
                excel_hash[num]['one_line'] = one_line_info

        # print excel_hash.keys()
        return excel_hash
Example No. 25
def scores(y_true, y_pred, using_scores=eval_scores):
    return Od((s, score(y_true, y_pred, s)) for s in using_scores)
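
A usage sketch, assuming au.score dispatches on the metric names in eval_scores (e.g. 'acc', 'ari', 'nmi'):

y_true = [0, 0, 1, 1]
y_pred = [1, 1, 0, 0]
print(scores(y_true, y_pred, using_scores=('ari', 'nmi')))
# e.g. -> Od([('ari', 1.0), ('nmi', 1.0)]) for perfectly permuted labels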
Example No. 26
def merge_blast_rdp_file(blast_fp, rdp_fp, result_fp):
    blast_info = Od()
    rdp_info = {}
    data3 = open(result_fp, 'w')
    with open(rdp_fp) as data2:
        header2 = [
            'kingdom', 'phylum', 'class', 'order', 'family', 'genus2',
            'rdp_value'
        ]
        for each_line2 in data2:
            if each_line2.strip() == '':
                continue
            cnt2 = each_line2.strip().split('\t')

            seq2 = cnt2[0]
            taxon_species = cnt2[1].split(';')
            rdp_value = cnt2[2]

            # Strip the rank prefixes (k__, p__, c__, o__, f__, g__) level
            # by level; missing or empty levels are recorded as '.'.
            prefixes = ['k__', 'p__', 'c__', 'o__', 'f__', 'g__']
            ranks = ['.'] * len(prefixes)
            for i in range(min(len(taxon_species), len(prefixes))):
                value = taxon_species[i].replace(prefixes[i], '')
                if prefixes[i] == 'f__':
                    value = value.replace('[', '').replace(']', '')
                ranks[i] = value if value != '' else '.'

            rdp_info[seq2] = ranks + [rdp_value]

    with open(blast_fp) as data1:
        for each_line in data1:
            if each_line.strip() == '':
                continue
            elif each_line.startswith('#'):
                header1 = each_line.strip().split('\t')
                header = header1 + header2
                data3.write('%s\n' % '\t'.join(header))
                continue
            cnt = each_line.strip().split('\t')
            seq = cnt[0]
            blast_info[seq] = cnt[1:]
            if seq in rdp_info.keys():
                info = cnt + rdp_info[seq]
            else:
                info = cnt + ['.', '.', '.', '.', '.', '.', '.']
            data3.write('%s\n' % '\t'.join(info))

    data3.close()
Example No. 27
def group_data_frame(data_frame, column):
    value_set = sorted(set(data_frame[column]))
    return [(Od([(column, v)]), data_frame[data_frame[column] == v])
            for v in value_set]
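
A minimal check with pandas; each group is keyed by a single-entry OrderedDict:

import pandas as pd
from collections import OrderedDict as Od

df = pd.DataFrame({'n_components': [2, 2, 3], 'nmi': [0.5, 0.6, 0.7]})
for od, sub in group_data_frame(df, 'n_components'):
    print(od, list(sub['nmi']))
# -> OrderedDict([('n_components', 2)]) [0.5, 0.6]
#    OrderedDict([('n_components', 3)]) [0.7]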
Example No. 28
def count_occurence(y1, y2):
    y1_to_counter = Od((y, Counter()) for y in set(y1))
    for v1, v2 in zip(y1, y2):
        y1_to_counter[v1][v2] += 1
    return y1_to_counter
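
A quick demonstration of count_occurence (note the outer keys come from set(y1), so their order is not guaranteed):

from collections import Counter, OrderedDict as Od

y1 = [0, 0, 1, 1]
y2 = ['a', 'b', 'a', 'a']
print(count_occurence(y1, y2))
# e.g. -> Od([(0, Counter({'a': 1, 'b': 1})), (1, Counter({'a': 2}))])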
Example No. 29
def read_l3_influence():
    from me.analyze import group_data_frame_columns
    idx = 0
    df = pd.DataFrame()
    log_base = Nodes.select(n1702='./logging_tge/', ngpu='./logging')
    for file in iu.list_children(log_base, full_path=True, pattern='.txt$'):
        scores_od = Od()
        early = False
        for line in iu.read_lines(file):
            if 'early' in line:
                early = True
            if 'nmi' not in line:
                continue
            for s_type, s_value in name2entries(line, inter=' ', intra=':'):
                s_value = float(s_value)
                if s_type not in scores_od:
                    scores_od[s_type] = [s_value]
                else:
                    scores_od[s_type].append(s_value)

        file_name = file[file.rfind('/') + 1:]
        if len(scores_od) == 0:
            print('{} is empty'.format(file_name))
            continue
        for k, v in name2entries(file_name):
            df.loc[idx, k] = v
        epoch = scores_od.pop('e', None)
        for s_type, s_values in scores_od.items():
            top_k = 10
            top_value = np.mean(sorted(s_values, reverse=True)[:top_k])
            last_value = np.mean(s_values[::-1][:top_k])
            # df.loc[idx, s_type] = top_value
            df.loc[idx, s_type] = last_value
        df.loc[idx,
               'epoch'] = str(max(epoch) + 1 if epoch is not None else 0) + (
                   ' e.s.' if early else '')
        idx += 1

    df = df.sort_values(by='nmi', ascending=False)
    influence = Od()
    df_list = group_data_frame_columns(df, ['dn', 'l3'])
    for bv_list, d in df_list:
        print(' '.join(['{}={}'.format(*bv) for bv in bv_list.items()]))
        nmi_mean, nmi_std = au.mean_std(d['nmi'].values[0:])
        ari_mean, ari_std = au.mean_std(d['ari'].values[0:])
        print('nmi:{:.4f}+{:.4f}'.format(nmi_mean, nmi_std))
        print('ari:{:.4f}+{:.4f}'.format(ari_mean, ari_std))
        print(d.iloc[:10, :])
        dn, l3 = dict(bv_list)['dn'], float(dict(bv_list)['l3'])
        influence.setdefault(dn, list())
        # if l3 <= 1e-4:
        #     nmi_mean -= 0.01
        #     ari_mean -= 0.01
        influence[dn].append(
            [round(v, 6) for v in [l3, nmi_mean, ari_mean, nmi_std, ari_std]])
        print()
    for dn, values in influence.items():
        influence[dn] = [[round(v, 6) for v in value]
                         for value in np.array(values, dtype=float).T]
    #     arr = np.array(influence[dn][1])
    #     print(arr)
    #     print(arr - arr[0])
    #     print()
    with open('influence.json', mode='w') as fp:
        iu.json.dump(influence, fp)
Example No. 30
def read_iterations():
    def append_score_line_to_od(od_, score_line_):
        for score_name, score_value in name2entries(score_line_,
                                                    inter=' ',
                                                    intra=':'):
            score_value = float(score_value)
            if score_name not in od_:
                od_[score_name] = [score_value]
            else:
                od_[score_name].append(score_value)

    def append_score_od_to_iter(file_name_, od_):
        entries = dict(name2entries(file_name_))
        param_as_key = tuple(entries[k] for k in desired_keys)
        if param_as_key not in param2iter:
            param2iter[param_as_key] = [od_]
        else:
            param2iter[param_as_key].append(od_)

    def group_iters():
        groups_ = Od()
        for p_, o_ in param2iter.items():
            k_, v_ = (p_[0], p_[1]), (p_[2], o_)
            groups_.setdefault(k_, list()).append(v_)
        return groups_

    param2iter = Od()
    log_base = Nodes.select(n1702='./logging_half_r/', ngpu='./logging_r')
    for file in iu.list_children(log_base,
                                 full_path=True,
                                 pattern=r'gid.+\.txt$'):
        file_name = file[file.rfind('/') + 1:]
        # if dict(name2entries(file_name))['vs'] in {'3', 3}:
        #     continue
        score_dict = Od()
        for line in iu.read_lines(file):
            if 'nmi' not in line:
                continue
            append_score_line_to_od(score_dict, line)
        if len(score_dict) == 0:
            print('{} is empty'.format(file_name))
            continue
        append_score_od_to_iter(file_name, score_dict)

    # for param in sorted(param2iter.keys(), key=lambda i: i[0]):
    params = Nodes.select(n1702=[
        ('TREC', 0.01, 0.1),
        ('Google', 0.1, 0.1),
        ('Event', 0.01, 0.01),
        ('20ng', 0.1, 0.1),
    ],
                          ngpu=[
                              ('Reuters', 0.001, 0.1),
                          ])
    params = list([tuple(map(str, param)) for param in params])
    array = list()
    for param in param2iter.keys():
        od_list = param2iter[param]
        for score_dict in od_list:
            score_dict.pop('e', None)
        array.append(od_list)
    print(len(array))