def _test():
    tp = TextProcessor(10, 2192, 100)
    corpus = [
        '你好 中国', '打印 每类 文本 for', 'for', '遍历 所有 文本 第二个 for 便利 某一类 文本 下 的 词语 权重'
    ]
    print(tp.w2v_transform([c.split() for c in corpus]))
    corpus = [
        '你好 中国'.split(), '打印 每类 文本 for'.split(), 'for'.split(),
        '遍历 所有 文本 第二个 for 便利 某一类 文本 下 的 词语 权重'.split()
    ]
    ctf = tp.tf_idf_transform(corpus)
    clsi = tp.lda_transform(ctf)

    for i in clsi:
        print(i)
    aa = tp.ldaModel.print_topics(num_topics=500, num_words=50)
    for i in aa:
        print(i)

    path = conf.get_filename_via_tpl('model',
                                     model_type='lsi',
                                     n_users=conf.N_USERS,
                                     n_samples=conf.N_SAMPLES,
                                     n_dims=conf.N_DIMS)
    tp.load_model('lsi')
    tp.w2v_transform([['你好啊', 'hell0'], ['123', 'forfor']])
    def tf_idf_transform(self, doc):
        """
        Perform tf-idf transformation on doc.
        """
        self.dictionary = corpora.Dictionary(doc)
        corpus = [self.dictionary.doc2bow(text) for text in doc]
        self.tfIdfModel = TfidfModel(corpus)

        conf.mk_dir(self.tfIdfPath)

        self.dictionary.save(self.dictPath)
        logger.info('Dictionary has been saved in %s.' % self.dictPath)

        self.tfIdfModel.save(self.tfIdfPath)
        logger.info('TF-IDF model has been saved in %s.' % self.tfIdfPath)

        tfidf_corpus = self.tfIdfModel[corpus]
        tfidf_corpus_path = conf.get_filename_via_tpl('tfidf',
                                                      n_users=self.nUsers,
                                                      postfix='mm',
                                                      n_samples=self.nSamples)
        corpora.MmCorpus.serialize(tfidf_corpus_path, tfidf_corpus)
        logger.info('TF-IDF corpus with a shape of %s has been saved in %s.' %
                    (np.array(tfidf_corpus).shape, tfidf_corpus_path))

        return tfidf_corpus
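    # A toy sketch (not from the original repo) of what tf_idf_transform consumes and
    # produces with gensim: `doc` is a list of token lists, the bow corpus holds
    # (token_id, count) pairs, and the returned tf-idf corpus yields (token_id, weight)
    # pairs per document, e.g.
    #     tp = TextProcessor(10, 2192, 100)              # hypothetical sizes
    #     tfidf = tp.tf_idf_transform([['你好', '中国'], ['你好']])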
    def lsi_transform(self, corpus_tf_idf):
        logger.info('Training lsi model with a n_dims of %d...' % self.nDims)
        if self.dictionary is None and os.path.exists(self.dictPath):
            self.dictionary = corpora.Dictionary.load(self.dictPath)

        self.lsiModel = LsiModel(corpus=corpus_tf_idf,
                                 num_topics=self.nDims,
                                 id2word=self.dictionary)
        # print self.lsiModel[corpus]

        conf.mk_dir(self.lsiPath)

        self.lsiModel.save(self.lsiPath)
        logger.info('Lsi model has been saved in %s.' % self.lsiPath)

        lsi_corpus = self.lsiModel[corpus_tf_idf]
        lsi_corpus_path = conf.get_filename_via_tpl('lsi',
                                                    n_users=self.nUsers,
                                                    n_samples=self.nSamples,
                                                    n_dims=self.nDims,
                                                    postfix='mm')
        conf.mk_dir(lsi_corpus_path)
        corpora.MmCorpus.serialize(lsi_corpus_path, lsi_corpus)
        logger.info('Lsi corpus with a shape of %s has been saved in %s.' %
                    (np.array(lsi_corpus).shape, lsi_corpus_path))

        return lsi_corpus
    def save_sequences(self, new_mblog_info=None):
        mblog_info = new_mblog_info if new_mblog_info else self.mblogInfo
        uid_fname = conf.get_filename_via_tpl('uid',
                                              n_users=len(self.uidList),
                                              n_samples=mblog_info.seqLen)
        seq_fname = conf.get_filename_via_tpl('seq',
                                              n_users=len(self.uidList),
                                              n_samples=mblog_info.seqLen)
        # Save user ids
        with open(uid_fname, 'w') as fp:
            csv_writer = csv.writer(fp)
            csv_writer.writerow(self.uidList)
            logger.info('User ids have been saved in %s. ' % uid_fname)
        # Save sequences
        with open(seq_fname, 'w') as fp:
            csv_writer = csv.writer(fp)
            csv_writer.writerows(self.sequences)
            logger.info('Sequences have been saved in %s. ' % seq_fname)
def calculate_te(data, vec_type, lag=1, normalised=True):
    """
    Compute transfer entropy for each pair of samples to uncover causal relationships.
    """
    data = np.array(data)
    n_nodes, n_samples, n_dims = data.shape
    cn = np.zeros((n_nodes, n_nodes))
    te_mat = np.zeros((n_nodes, n_nodes))
    #
    if normalised:
        H_0 = np.zeros(n_nodes)
        for i in range(n_nodes):
            max_min = np.max(data[i], 0) - np.min(data[i], 0)
            H_0[i] = np.sum(np.log2(max_min))
        # for i in range(n_nodes):
        #     H_0[i] = entropy(data[i])

    logger.info('Calculating te...')
    # Calculate te and fill in the causal network.
    for i in range(n_nodes):
        sample_i = data[i]
        for j in range(i, n_nodes):
            sample_j = data[j]
            # Construct variables XP, YP and X/YF for te estimator.
            sample_i_p = sample_i[lag:]
            sample_i_f = sample_i[:-lag]
            sample_j_p = sample_j[lag:]
            te_j_i = cmi(sample_i_f, sample_j_p, sample_i_p)

            if normalised:
                te_j_i = te_j_i / (H_0[i] - cond_entropy(
                    sample_i_f, np.concatenate((sample_i_p, sample_j_p), 1)))
                # te_j_i = te_j_i / H_0[i]
            te_mat[j][i] = te_j_i

            if i != j:
                sample_j_f = sample_j[:-lag]
                te_i_j = cmi(sample_j_f, sample_i_p, sample_j_p)
                if normalised:
                    te_i_j = te_i_j / (H_0[j] - cond_entropy(
                        sample_j_f, np.concatenate(
                            (sample_i_p, sample_j_p), 1)))
                    # te_i_j = te_i_j / H_0[j]
                te_mat[i][j] = te_i_j
    te_path = conf.get_filename_via_tpl('te_' + vec_type,
                                        n_users=n_nodes,
                                        n_samples=n_samples,
                                        n_dims=n_dims,
                                        lag=lag)
    np.savetxt(te_path, te_mat, delimiter=',', fmt='%f')
    logger.info('Te result has been saved in %s. ' % te_path)
    return cn, te_mat
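# A hypothetical usage sketch (not from the original repo): `data` is expected to be
# shaped (n_nodes, n_samples, n_dims), e.g. one dense text vector per user and sample,
# with vec_type naming the representation ('text' mirrors the te_text result read by
# evaluate() further down in this file).
#     vectors = np.random.rand(12, 2192, 100)            # toy stand-in for a real corpus
#     cn, te_mat = calculate_te(vectors, 'text', lag=1, normalised=True)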
    def lda_transform(self,
                      corpus_tf_idf,
                      train_separated=False,
                      is_update=False):
        """
        Init a lda model with a n_topics whose default is 500, then fit it with corpus_tf_idf and transform it.
        :param corpus_tf_idf: Corpus which has been transformed into tf-idf matrix.
        :param train_separated: The model is going to be train with all corpus one time or some of them separately one time.
        :param is_update: Whether the training to be perform is to construct a new model or update one existed.
        :return: lda corpus.
        """
        logger.info('Training lda model with a n_dims of %d...' % self.nDims)
        if self.dictionary is None and os.path.exists(self.dictPath):
            self.dictionary = corpora.Dictionary.load(self.dictPath)

        if is_update:
            # An ldaModel was trained before; update it with the new corpus.
            if self.ldaModel is None:
                self.load_model('lda')
            self.ldaModel.update(corpus_tf_idf)
            logger.info('Lda model has been updated successfully.')
            return self.ldaModel[corpus_tf_idf]

        if train_separated:
            # corpus = []
            # spacing = 10000
            # for i in range(int(len(corpus_tf_idf)/spacing)):
            #     corpus.append(corpus_tf_idf[i*spacing: i])
            # self.ldaModel = LdaModel()
            pass

        self.ldaModel = LdaModel(corpus=corpus_tf_idf,
                                 num_topics=self.nDims,
                                 id2word=self.dictionary)

        conf.mk_dir(self.ldaPath)
        self.ldaModel.save(self.ldaPath)
        logger.info('lda model has been saved in %s' % self.ldaPath)

        lda_corpus = self.ldaModel[corpus_tf_idf]
        lda_corpus_path = conf.get_filename_via_tpl('lda',
                                                    n_users=self.nUsers,
                                                    n_samples=self.nSamples,
                                                    n_dims=self.nDims,
                                                    postfix='mm')
        conf.mk_dir(lda_corpus_path)
        corpora.MmCorpus.serialize(lda_corpus_path, lda_corpus)
        logger.info('Lda corpus with a shape of %s has been saved in %s.' %
                    (np.array(lda_corpus).shape, lda_corpus_path))

        return lda_corpus
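    # A hypothetical usage sketch, mirroring the _test() flow at the top of this file:
    # lda_transform is fed the tf-idf corpus returned by tf_idf_transform and yields a
    # list of (topic_id, probability) pairs per document.
    #     ctf = tp.tf_idf_transform(corpus)
    #     lda_corpus = tp.lda_transform(ctf)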
    def load_corpus(self, model_type, dense=False):
        corpus = None
        try:
            if model_type == 'tfidf':
                corpus = corpora.MmCorpus(
                    conf.get_filename_via_tpl('tfidf',
                                              n_users=self.nUsers,
                                              postfix='mm',
                                              n_samples=self.nSamples))
            elif model_type in ['lsi', 'lda']:
                corpus = corpora.MmCorpus(
                    conf.get_filename_via_tpl(model_type,
                                              n_users=self.nUsers,
                                              n_samples=self.nSamples,
                                              n_dims=self.nDims,
                                              postfix='mm'))
            elif model_type == 'w2v':
                corpus = np.loadtxt(conf.get_filename_via_tpl(
                    model_type,
                    n_users=self.nUsers,
                    n_samples=self.nSamples,
                    n_dims=self.nDims),
                                    dtype=np.float,
                                    delimiter=',')

            logger.info('%s corpus with a shape of %s has been loaded. ' %
                        (model_type, np.array(corpus).shape))

            if dense and model_type in ['tfidf', 'lsi', 'lda']:
                corpus = matutils.corpus2dense(corpus,
                                               self.nDims,
                                               self.nSamples * self.nUsers,
                                               dtype=np.float).T
            else:
                corpus = np.array(corpus)
        except Exception as e:
            raise e
        return corpus
def recover_text_list(n_users, n_samples, debug=False):
    text_list = []

    with open(
            conf.get_filename_via_tpl('uid',
                                      n_users=n_users,
                                      n_samples=n_samples)) as fp:
        uid_list = [int(i) for i in fp.readline().split(',')]

    debug_flag = 0
    for uid in uid_list:
        if debug and debug_flag > 0:
            break

        csv.field_size_limit(sys.maxsize)
        with open(conf.get_filename_via_tpl('text',
                                            user_id=uid,
                                            n_samples=n_samples),
                  encoding='utf-8') as fp:
            csv_reader = csv.reader(fp)
            for line in csv_reader:
                if not len(line) or line[0] == '':
                    text = []
                else:
                    text = line[0].strip().split(' ')
                    while '' in text:
                        text.remove('')
                text_list.append(text)
            logger.info(
                'Successfully recovered user %d\'s data with %d samples. ' %
                (uid, n_samples))
        debug_flag += 1
    # for i in range(10):
    #     print(text_list[i])
    if debug:
        print(text_list[:100])
    return text_list
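# A hypothetical end-to-end sketch tying the helpers above together: recover the
# tokenised texts and feed them to TextProcessor (names as used elsewhere in this file).
#     texts = recover_text_list(conf.N_USERS, conf.N_SAMPLES)
#     tp = TextProcessor(conf.N_USERS, conf.N_SAMPLES, conf.N_DIMS)
#     tfidf_corpus = tp.tf_idf_transform(texts)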
    def __init__(self, n_users, n_samples, n_dims):
        self.nUsers, self.nSamples, self.nDims = n_users, n_samples, n_dims
        self.tfIdfModel = self.lsiModel = self.ldaModel = self.w2vModel = self.dictionary = None

        self.dictPath = conf.get_filename_via_tpl(
            'model', model_type='tfidf', n_users=n_users, n_samples=n_samples, model_filename='dict')
        self.tfIdfPath = conf.get_filename_via_tpl(
            'model', model_type='tfidf', n_users=n_users, n_samples=n_samples, model_filename='tfidf')
        self.lsiPath = conf.get_filename_via_tpl(
            'model', model_type='lsi', n_users=n_users, n_samples=n_samples, n_dims=n_dims, model_filename='lsi_model')
        self.ldaPath = conf.get_filename_via_tpl(
            'model', model_type='lda', n_users=n_users, n_samples=n_samples, n_dims=n_dims, model_filename='lda_model')
        self.w2vPath = conf.get_filename_via_tpl(
            'model', model_type='w2v', n_users=n_users, n_samples=n_samples, n_dims=n_dims, model_filename='w2vmodel')
        self.w2vVecPath = conf.get_filename_via_tpl(
            'model', model_type='w2v', n_users=n_users, n_samples=n_samples, n_dims=n_dims, model_filename='vec.txt')
    def w2v_transform(self, sentences):
        """
        Perform word2vec on texts and obtain a w2v model.
        :param sentences: A list of sentences, each of which is a list of the words of one text.
        :return: W2v corpus (one summed document vector per sentence).
        """
        logger.info('Training w2v model with a dim of %d...' % self.nDims)
        # file = open(infile_path, 'r', encoding='utf-8') if infile_path.find('\n') < 0 else StringIO(infile_path)
        # sentences = []
        # for sen in file.readlines():
        #     sentences.append(sen.strip().split(' '))
        # print(sentences)
        self.w2vModel = Word2Vec(sentences, size=self.nDims, min_count=0)

        conf.mk_dir(self.w2vPath)
        self.w2vModel.save(self.w2vPath)
        self.w2vModel.wv.save_word2vec_format(self.w2vVecPath, binary=False)
        # print(model['['])

        # Construct w2v corpus
        w2v_corpus = []
        for sen in sentences:
            vec = [0] * self.nDims
            if len(sen) > 0:
                for word in sen:
                    vec = list(
                        map(lambda m, n: m + n, vec, self.w2vModel[word]))
                    # vec += self.w2vModel[word]
            w2v_corpus.append(vec)

        w2v_corpus_path = conf.get_filename_via_tpl('w2v',
                                                    n_users=self.nUsers,
                                                    n_samples=self.nSamples,
                                                    n_dims=self.nDims)
        conf.mk_dir(w2v_corpus_path)

        with open(w2v_corpus_path, 'w') as fp:
            csv_writer = csv.writer(fp)
            for line in w2v_corpus:
                csv_writer.writerow(line)
        logger.info('W2v corpus has been saved in %s. ' % w2v_corpus_path)

        return w2v_corpus
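    # A hypothetical usage sketch, mirroring _test() above: each returned row is the
    # element-wise sum of the word vectors of one document and has length nDims.
    #     w2v_corpus = tp.w2v_transform([['你好', '中国'], ['for']])
    #     assert len(w2v_corpus[0]) == tp.nDims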
# coding: utf-8
import datetime
import numpy as np
import csv

import utils.config_util as conf
from utils import model_object as mo
from utils.log import get_console_logger

np.set_printoptions(threshold=np.inf)
logger = get_console_logger(__name__)

n_users, n_samples = 12, 2192
original_seq = np.loadtxt(conf.get_filename_via_tpl('seq', n_users=12, n_samples=2192), delimiter=',')
print(original_seq.shape)


def segment_ts_enum():
    """
    Segment the time series with time steps constructed by inserting indices one by one into the time-steps list.
    """
    times_steps = {original_seq.shape[1]}
    results = [['obj_func', 'h_seq', 'it', 'penalty']]

    while len(times_steps) < original_seq.shape[1]:
        max_obj = max_idx = -1
        max_line = None

        for i in range(1, original_seq.shape[1]):
            if i not in times_steps:
                temp_steps = [i] + list(times_steps)
def segment_ts_bottom_up(pre_compute=True):
    """
    Segment the time series using a bottom-up method; candidate objectives are calculated from the inner products of every two neighbouring time points.
    """
    start_time = datetime.datetime.now()

    # seq = original_seq.copy()
    seq = original_seq[:, :100].copy()
    # print(seq)
    results = [['time_steps', 'obj_func', 'h_vars', 'h_tps', 'it', 'regularization']]
    # lamb = -0.01
    lamb = -100 / seq.shape[1]

    # Initial
    last_time_steps, last_idx = np.array(range(1, seq.shape[1] + 1)), -1
    last_obj, last_seq, last_obj_details = mo.object_function(seq, last_time_steps, lamb, True)
    results.append([last_time_steps, last_obj] + last_obj_details)
    print('result: ', last_obj)
    print('details:', last_obj_details)
    # return

    if pre_compute:
        mo.pre_compute(seq, last_time_steps)

    # Maximize object function iteratively
    while True:
        max_idx, max_time_steps, max_candidate_obj, max_candidate_seq, max_candidate_obj_details = -1, None, -1, None, None
        next_length = last_time_steps.shape[0] - 1

        # Find the maximum obj of the next level
        for i in range(next_length):
            new_time_steps = np.delete(last_time_steps, i)
            temp_candidate_obj, temp_candidate_seq, temp_candidate_obj_details = \
                mo.object_function(seq, new_time_steps, lamb, True, i, last_seq)
            # print('----')
            # print('time_steps:', new_time_steps)
            # print('temp result: ', temp_candidate_obj)
            # print('temp result terms:', temp_candidate_obj_details)

            if temp_candidate_obj > max_candidate_obj:
                max_idx, max_time_steps, max_candidate_obj, max_candidate_seq, max_candidate_obj_details = \
                    i, new_time_steps, temp_candidate_obj, temp_candidate_seq, temp_candidate_obj_details

        # Loop ending condition: none of the next level's obj results is greater than the last level's
        if max_idx == -1 or max_candidate_obj < last_obj or last_seq.shape[1] < 4:
            break

        # Merge time points i and (i+1) to boost seq's obj result
        if pre_compute:
            mo.pre_compute(seq, max_time_steps, max_idx, last_seq)

        last_idx, last_time_steps, last_obj, last_seq, last_obj_details = \
            max_idx, max_time_steps, max_candidate_obj, max_candidate_seq, max_candidate_obj_details
        results.append([last_time_steps, last_obj] + last_obj_details)
        # print('time steps: ', last_time_steps)
        print('----\nlevel %d' % (seq.shape[1] - last_seq.shape[1]))
        print('result: ', last_obj)
        print('details:', last_obj_details)

        # break
    # return

    finish_time = datetime.datetime.now()
    print('%s elapsed: %f\n--------' % ('precompute' if pre_compute else 'not-precompute', (finish_time - start_time).total_seconds()))

    # Save results
    filename = conf.get_filename_via_tpl(
        'Obj', n_users=seq.shape[0], n_samples=seq.shape[1], date=datetime.datetime.now().strftime('%y%m%d%H%M%S'))
    with open(filename, 'w', newline='') as fp:
        csv_writer = csv.writer(fp)
        csv_writer.writerow(['lambda: %f' % lamb])
        csv_writer.writerows(results)

    return results
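# A toy illustration (not from the original repo) of the bottom-up merge step used
# above: deleting index i from the time-step array merges segments i and i+1.
#     >>> np.delete(np.array([20, 40, 60, 80, 100]), 2)
#     array([ 20,  40,  80, 100])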
    def construct_time_series_data(self):
        mblog_info = self.mblogInfo

        def sequence_idx(time_str):
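            # Map a publish time to a sequence slot, counted backwards from
            # mblog_info.endTime: either in fixed units of timeStep (int) or via a
            # list of cumulative sub-step offsets within a coarser period.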
            if isinstance(mblog_info.timeStep, int):
                return int((mblog_info.endTime - datetime.strptime(
                    time_str, mblog_info.timeFormat)).total_seconds() /
                           mblog_info.timeStep)
            else:
                steps_count = len(mblog_info.timeStep)
                total_seconds = (mblog_info.endTime - datetime.strptime(
                    time_str, mblog_info.timeFormat)).total_seconds()
                index = int(total_seconds / mblog_info.timeStep[-1])
                rest = total_seconds - index * mblog_info.timeStep[-1]
                for i in range(steps_count):
                    if rest < mblog_info.timeStep[i]:
                        return index * steps_count + i
                return (index + 1) * steps_count

        with open(conf.get_absolute_path('data') + 'default_users.txt',
                  encoding='utf-8') as fp:
            lines = fp.readlines()

        user_mblogs_dir = conf.get_absolute_path('data') + 'user_mblogs/'

        self.uidList = []
        for line in lines:
            try:
                uid, count = int(line.split('|')[0]), int(line.split('|')[1])
            except ValueError as e:
                logger.error('Invalid uid or count. %s' % e)
                continue
            filename = '{}-{}.csv'.format(uid, count)
            self.uidList.append(uid)
            absolute_mblogs_fname = user_mblogs_dir + filename

            sequence = [0] * mblog_info.seqLen
            text_list = [''] * mblog_info.seqLen
            with open(absolute_mblogs_fname) as fp:
                csv_reader = csv.reader(fp)
                next(csv_reader)
                try:
                    while True:
                        line = next(csv_reader)
                        pub_time = line[2]
                        text = content_filter(line[3]).strip() + ' '
                        idx = sequence_idx(pub_time)
                        try:
                            sequence[idx] = 1
                            text_list[idx] += text
                        except IndexError:
                            # logger.info('Index(%d) out of range. Cause: pub_time(%s) is out of range. ' % (idx, pub_time))
                            print('%d, %d' % (len(sequence), idx))
                except StopIteration:
                    self.sequences.append(sequence)
                    text_list = tokenize(text_list)
                    # self.textList.extend(text_list)
                    logger.info('Successfully generated user %d\'s data. ' % uid)

            # Text of one user is saved to one file.
            with open(
                    conf.get_filename_via_tpl('text',
                                              user_id=uid,
                                              n_samples=mblog_info.seqLen),
                    'w') as fp:
                csv_writer = csv.writer(fp)
                for row in text_list:
                    csv_writer.writerow([row])
        # print self.uidList
        self.save_sequences()
    #     except ValueError as msg:
    #         logger.error('Not a valid file. Skip it. ' + str(msg))
    # uid_pairs = []
    # for uid in uid_list:
    #     for uid_2 in uid_list:
    #         if uid != uid_2:
    #             uid_pairs.append((uid, uid_2))
    # users_retweet = check_retweet(uid_pairs)
    # with open(conf.get_absolute_path('DATA_ROOT') + '/users_retweet.csv', 'w', encoding='utf-8') as fp:
    #     csv_writer = csv.writer(fp)
    #     for idx in range(len(uid_pairs)):
    #         csv_writer.writerow([str(uid_pairs[idx][0]) + '-->' + str(uid_pairs[idx][1]), str(users_retweet[idx])])

    # Make transfer
    uid_list = np.loadtxt(conf.get_filename_via_tpl('uid',
                                                    n_users=conf.N_USERS,
                                                    n_samples=conf.N_SAMPLES),
                          delimiter=',',
                          dtype=np.int)
    uid_dict = {}
    idx = 0
    for uid in uid_list:
        uid_dict[uid] = idx
        idx += 1
    print(uid_list)
    transfer = np.zeros((conf.N_USERS, conf.N_USERS))

    with open(conf.get_absolute_path('data') + 'users_retweet.csv',
              encoding='utf-8') as fp:
        csv_reader = csv.reader(fp)
        for r in csv_reader:
import os
import time

import numpy as np

import utils.config_util as conf
from temp.te import cmidd

os.system('cls')
lagmax = 6
tmax = conf.N_SAMPLES
nnode = conf.N_USERS
resampleTime = 100

length = tmax - lagmax - 1
lag_max_pre = np.eye(nnode, dtype=int)
lag_max_late = np.zeros((nnode, nnode), dtype=np.int)
lag_te = np.zeros((nnode, nnode))
sample = np.loadtxt(conf.get_filename_via_tpl('seq',
                                              n_users=nnode,
                                              n_samples=conf.N_SAMPLES),
                    delimiter=',')
# sample = sample.T
t = time.time()
for n in range(nnode):  #xrange(0):
    con = {}
    con[(n, 1)] = sample[n, lagmax:tmax - 1]
    x = sample[n, lagmax + 1:tmax]
    redundance = 0
    nodeset = range(nnode)

    for j in range(lagmax):
        n_teList = []

        for i in nodeset:
def evaluate(n_users, n_samples, n_dims):
    """
    Evaluate the result via precision, recall and F-value.
    :return: Accuracy, precision, recall and F1 (printed to stdout).
    """
    result = np.loadtxt(conf.get_filename_via_tpl('te_text',
                                                  n_users=n_users,
                                                  n_samples=n_samples,
                                                  n_dims=n_dims),
                        delimiter=',')
    # print(result)
    new_result = np.zeros(result.shape)
    new_result_state = np.zeros(result.shape).astype(int)
    for i in range(n_users):
        for j in range(n_users):
            if result[i][j] > 0.1:
                new_result[i][j] = result[i][j]
                new_result_state[i][j] = 1
    # for i in range(n_users):
    #     for j in range(i, n_users):
    #         if abs(result[i][j]-result[j][i]) < 0.1:
    #             new_result[i][j] = new_result[j][i] = 0
    # print(new_result)
    with open(
            conf.get_filename_via_tpl('re',
                                      n_users=n_users,
                                      n_samples=n_samples,
                                      n_dims=n_dims), 'w') as fp:
        csv_writer = csv.writer(fp)
        csv_writer.writerows(new_result)

    comparison = np.loadtxt(conf.RESULT_DIR + '/transfer',
                            delimiter=',',
                            dtype=int)
    # print(comparison)

    print('----Evaluation of %d users, %d samples, %d dims. ----' %
          (n_users, n_samples, n_dims))
    acc_rate = np.sum(new_result_state == comparison) * 1. / np.power(
        n_users, 2)
    predict_result = np.zeros((2, 2)).astype(int)

    for i in range(n_users):
        for j in range(n_users):
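            # ~1 == -2 and ~0 == -1, so actual/predicted links index row/column 0 and
            # absent links index row/column 1: [0][0]=TP, [0][1]=FN, [1][0]=FP, [1][1]=TN.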
            predict_result[~comparison[i][j]][~new_result_state[i][j]] += 1
    print(predict_result)

    p = 1. * predict_result[0][0] / (np.sum(predict_result[:, 0]))
    r = 1. * predict_result[0][0] / (np.sum(predict_result[0, :]))
    f1 = (2 * p * r) / (p + r)
    print('Accuracy: %.3f' % acc_rate)
    print('Precision: %.3f' % p)
    print('Recall: %.3f' % r)
    print('F1: %.3f' % f1)

    print('---ROC-AUC---')
    fpr, tpr, thresholds = roc_curve(comparison.reshape(n_users * n_users),
                                     new_result.reshape(n_users * n_users))
    print('fpr: ')
    print(fpr)
    print('tpr: ')
    print(tpr)
    print('thresholds: ')
    print(thresholds)

    roc_auc = auc(fpr, tpr)
    print('roc-auc:')
    print(roc_auc)

    print('\n\n')
def segment_ts_bottom_up_test(pre_compute=True):
    start_time = datetime.datetime.now()

    # seq = original_seq.copy()
    seq = original_seq[:, :100].copy()
    results = [['time_steps', 'obj_func', 'h_vars', 'h_tps', 'it', 'regularization']]
    # lamb = -0.01
    lamb = -100 / seq.shape[1]

    # Initial
    last_time_steps, last_idx = np.array(range(1, seq.shape[1] + 1)), -1
    last_obj, last_seq, last_obj_details = mo.object_function(seq, last_time_steps, lamb, True)
    results.append([last_time_steps, last_obj] + last_obj_details)
    print('result: ', last_obj)
    print('details:', last_obj_details)
    # return

    if pre_compute:
        mo.pre_compute(seq, last_time_steps)

    levels = []
    stop_level = 97
    levels_idx = 0

    max_rst = 0.
    while True:
        next_length = last_time_steps.shape[0] - 1

        # Find the maximum obj of the next level
        for i in range(last_idx + 1, next_length):
            new_time_steps = np.delete(last_time_steps, i)
            temp_candidate_obj, temp_candidate_seq, temp_candidate_obj_details = \
                mo.object_function(seq, new_time_steps, lamb, True, i, last_seq)

            if temp_candidate_obj > max_rst:
                max_rst = temp_candidate_obj

            if next_length > stop_level:
                levels.append([i, new_time_steps, temp_candidate_obj, temp_candidate_seq, temp_candidate_obj_details, last_seq])
            results.append([new_time_steps, temp_candidate_obj] + temp_candidate_obj_details)

        results.append([''])
        if len(levels) <= levels_idx:
            break

        curr_level = levels[levels_idx]
        levels_idx += 1
        last_idx, last_time_steps, last_obj, last_seq, last_obj_details, ll_seq = \
            curr_level[0], curr_level[1], curr_level[2], curr_level[3], curr_level[4], curr_level[5]
        # Merge time points i and (i+1) to boost seq's obj result
        if pre_compute:
            mo.pre_compute(seq, last_time_steps)

        # break
    # return

    finish_time = datetime.datetime.now()
    print('%s elapsed: %f\n--------' % ('precompute' if pre_compute else 'not-precompute', (finish_time - start_time).total_seconds()))
    print('max value: ', max_rst)

    # Save results
    filename = conf.get_filename_via_tpl(
        'Obj', n_users=seq.shape[0], n_samples=seq.shape[1], date=datetime.datetime.now().strftime('%y%m%d%H%M%S'))
    with open(filename, 'w', newline='') as fp:
        csv_writer = csv.writer(fp)
        csv_writer.writerow(['lambda: %f' % lamb])
        csv_writer.writerows(results)

    return results
# coding: utf-8
import numpy as np

import utils.config_util as conf
import utils.entropy_estimators as ee
from gen_data import DataGenerator, MblogInfo

# Read sequences of all users which have been processed with the smallest time step.
data_info = {'n_users': 12, 'n_samples': 2192, 'n_dims': 100}
seq_filename = conf.get_filename_via_tpl('seq', n_users=data_info['n_users'], n_samples=data_info['n_samples'])
sequences = np.loadtxt(seq_filename, np.int, delimiter=',')
# print(sequences)
# print(sequences.shape)

# Set an active rate and find the optimal time steps that maximise the joint entropy.
active_rate = .4
active_count = active_rate * data_info['n_users']

hist = sequences.sum(0)
active_status = np.zeros(sequences.shape[1])
active_status[np.where(hist > active_count)] = 1
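# With n_users = 12 and active_rate = 0.4, active_count = 4.8, so a time step counts as
# active only when at least 5 of the 12 users posted in it.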

# print(active_status)
# print(len(active_status))
# print(sum(active_status))

last_joint_entropy = None


def merge_sub_sequence(s, e, seqs):
    global last_joint_entropy
def testTE(lagmax, tmax, resampleTime, nnode, sample, indegreeaverage):
    """
    Transfer entropy (TE) based causal test.
    """
    length = tmax - lagmax - 1
    lag_max_pre = np.eye(nnode, dtype=int)
    lag_max_late = np.zeros((nnode, nnode), dtype=np.int)
    lag_te = np.zeros((nnode, nnode))
    # sample = sample.T
    for n in range(nnode):
        con = {}
        con[(n, 1)] = sample[n, lagmax:tmax - 1]
        x = sample[n, lagmax + 1:tmax]
        nodeset = range(nnode)
        ####find lag&pc####
        for i in nodeset:
            if n == i:
                temp_te = cmidd(x, sample[i, (lagmax - 1):(tmax - 2)], con)
                temp_te_2 = np.array([
                    cmidd(
                        x,
                        random.sample(
                            sample[i, (lagmax - 1):(tmax - 2)].tolist(),
                            length), con) for m in range(resampleTime)
                ])
                if len(temp_te_2[temp_te_2 > temp_te]) / 100.00 < 0.01:
                    lag_max_late[i, n] += 1
                    lag_max_pre[i, n] += 1
                    lag_te[n, i] = temp_te
            else:
                temp_te = cmidd(x, sample[i, (lagmax + 1):(tmax)], con)
                temp_te_2 = np.array([
                    cmidd(
                        x,
                        random.sample(sample[i, (lagmax + 1):(tmax)].tolist(),
                                      length), con)
                    for m in range(resampleTime)
                ])
                if len(temp_te_2[temp_te_2 > temp_te]) / 100.00 < 0.01:
                    lag_max_late[i, n] += 1
                    lag_max_pre[i, n] += 1
                    lag_te[n, i] = temp_te

    for n in range(nnode):
        for i in range(nnode):
            if n != i:
                if (lag_te[n, i] >= lag_te[i, n]):
                    lag_max_late[i, n] = 0

                elif lag_te[i, n]:
                    lag_max_late[n, i] = 0
            else:
                break
    np.savetxt(conf.get_filename_via_tpl('te_lag_max_pre',
                                         n_users=nnode,
                                         n_samples=tmax),
               lag_max_pre,
               fmt='%d',
               delimiter=',')
    np.savetxt(conf.get_filename_via_tpl('te_lag_max_late',
                                         n_users=nnode,
                                         n_samples=tmax),
               lag_max_late,
               fmt='%d',
               delimiter=',')
    np.savetxt(conf.get_filename_via_tpl('te_lag_te',
                                         n_users=nnode,
                                         n_samples=tmax),
               lag_te,
               delimiter=',')
    logger.info('TE results have been saved in result folder.')
    return lag_max_pre, lag_max_late, lag_te
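# A hypothetical call sketch (arguments mirror the module-level setup earlier in this
# file); note that `indegreeaverage` is accepted but not used inside testTE itself.
#     lag_pre, lag_late, te = testTE(lagmax=6, tmax=conf.N_SAMPLES, resampleTime=100,
#                                    nnode=conf.N_USERS, sample=sample, indegreeaverage=None)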
def testMNR(lagmax, tmax, resampleTime, nnode, sample, indegreeaverage):
    """
    MNR-based causal test (multi-lag TE with a conditioning set).
    """
    length = tmax - lagmax - 1
    lag_max_pre = np.eye(nnode, dtype=int)
    lag_max_late = np.zeros((nnode, nnode), dtype=np.int)
    lag_te = np.zeros((nnode, nnode))
    # sample = sample.T
    for n in range(nnode):
        con = {}
        con[(n, 1)] = sample[n, lagmax:tmax - 1]
        x = sample[n, lagmax + 1:tmax]
        nodeset = range(nnode)

        ####find lag&pc####
        for j in range(lagmax):
            n_teList = []
            for i in nodeset:
                if n == i:
                    temp_te = cmidd(x, sample[i,
                                              (lagmax - j - 1):(tmax - j - 2)],
                                    con)
                    temp_te_2 = np.array([
                        cmidd(
                            x,
                            random.sample(
                                sample[i, (lagmax - j - 1):(tmax - j -
                                                            2)].tolist(),
                                length), con) for m in range(resampleTime)
                    ])
                    if len(temp_te_2[temp_te_2 > temp_te]) / 100.00 < 0.01:
                        n_teList.append(i)
                        con[(i,
                             j + 2)] = sample[i,
                                              (lagmax - j - 1):(tmax - j - 2)]
                else:
                    temp_te = cmidd(x, sample[i, (lagmax - j + 1):(tmax - j)],
                                    con)
                    temp_te_2 = np.array([
                        cmidd(
                            x,
                            random.sample(
                                sample[i,
                                       (lagmax - j + 1):(tmax - j)].tolist(),
                                length), con) for m in range(resampleTime)
                    ])
                    if len(temp_te_2[temp_te_2 > temp_te]) / 100.00 < 0.01:
                        n_teList.append(i)
                        con[(i, j + 1)] = sample[i,
                                                 (lagmax - j + 1):(tmax - j)]

            lag_max_pre[n, n_teList] += 1

            if len(n_teList):
                nodeset = n_teList[:]
            else:
                break

        ####remove lag&pc####
        nodeindex = lag_max_pre[n].nonzero()[0]
        for i in nodeindex:
            tem_con = deepcopy(con)
            j = lag_max_pre[n, i]
            while (j > 0) and bool(len(con)):
                tem_con = deepcopy(con)
                y_next = tem_con.pop((i, j))
                temp_te = cmidd(x, y_next, tem_con)
                temp_te_2 = np.array([
                    cmidd(x, random.sample(list(y_next), length), tem_con)
                    for m in range(resampleTime)
                ])
                if len(temp_te_2[temp_te_2 > temp_te]) / 100.00 < 0.01:
                    break
                else:
                    con = tem_con
                    j -= 1
            if j and len(con):
                for l in range(1, j + 1):
                    tem_con = deepcopy(con)
                    lag_te[n, i] += cmidd(x, tem_con.pop((i, l)), tem_con)
                lag_max_late[n, i] = j

    for n in range(nnode):
        for i in range(nnode):
            if n != i:
                if (lag_te[n, i] >= lag_te[i, n]):
                    lag_max_late[i, n] = 0
                elif lag_te[i, n]:
                    lag_max_late[n, i] = 0
            else:
                break
    np.savetxt(conf.get_filename_via_tpl('mnr_lag_max_pre',
                                         n_users=nnode,
                                         n_samples=tmax),
               lag_max_pre,
               fmt='%d',
               delimiter=',')
    np.savetxt(conf.get_filename_via_tpl('mnr_lag_max_late',
                                         n_users=nnode,
                                         n_samples=tmax),
               lag_max_late,
               fmt='%d',
               delimiter=',')
    np.savetxt(conf.get_filename_via_tpl('mnr_lag_te',
                                         n_users=nnode,
                                         n_samples=tmax),
               lag_te,
               delimiter=',')
    logger.info('MNR results have been saved in result folder.')
    return lag_max_pre, lag_max_late, lag_te
    def construct_with_diff_ts(self, new_mblog_info):
        n_users, n_samples = 12, 2192
        uid_list = np.loadtxt(conf.get_filename_via_tpl('uid',
                                                        n_users=n_users,
                                                        n_samples=n_samples),
                              delimiter=',',
                              dtype=np.int)
        original_seq = np.loadtxt(conf.get_filename_via_tpl(
            'seq', n_users=n_users, n_samples=n_samples),
                                  delimiter=',',
                                  dtype=np.int)
        original_text_list = []
        for uid in uid_list:
            with open(conf.get_filename_via_tpl('text',
                                                user_id=uid,
                                                n_samples=n_samples),
                      encoding='utf-8') as fp:
                csv_reader = csv.reader(fp)
                text = [
                    line[0].strip() if len(line) > 0 else ''
                    for line in csv_reader
                ]
                original_text_list.append(text)
                assert len(text) == 2192, \
                    'Texts of user %d are not enough. ' % uid

        time_steps = new_mblog_info.timeStep
        if n_samples == len(new_mblog_info.timeStep):
            return original_seq, original_text_list

        new_seq = np.zeros((original_seq.shape[0], len(time_steps)), np.int)
        # Use a comprehension so each row is an independent list (a repeated-list literal would alias every row).
        new_text_list = [[''] * len(time_steps) for _ in range(original_seq.shape[0])]

        nidx = oidx = 0
        for time_point in time_steps:
            step = int(time_point / (24 * 3600)) - oidx
            # if step == 1:
            #     new_seq[:, nidx] = original_seq[:, oidx]
            #     new_text_list[:, nidx] = original_text_list[:, oidx]
            # else:
            for r in range(original_seq.shape[0]):
                new_seq[r, nidx] = sum(original_seq[r, oidx:oidx + step])
                new_text_list[r][nidx] = original_text_list[r][oidx]
                for c in range(1, step):
                    new_text_list[r][nidx] = new_text_list[r][
                        nidx] + ' ' + original_text_list[r][oidx + c]
                    # print(original_text_list[r][oidx + c])
            new_seq[new_seq[:, nidx] > 0, nidx] = 1

            oidx += step
            nidx += 1
        assert nidx == len(time_steps), 'nidx != len(time_steps)'
        assert oidx == original_seq.shape[
            1], 'Total amount of time steps is smaller than sample length.'

        self.uidList = uid_list
        self.sequences = new_seq
        self.save_sequences(new_mblog_info)

        for uid, texts in zip(uid_list, new_text_list):
            with open(
                    conf.get_filename_via_tpl('text',
                                              user_id=uid,
                                              n_samples=new_mblog_info.seqLen),
                    'w') as fp:
                csv_writer = csv.writer(fp)
                for row in texts:
                    csv_writer.writerow([row])

        return new_seq, new_text_list