Example No. 1
 def save_smat(features, ft_pt):
     """
     save features to disk in SMAT format
     :param features: the matrix of features
     :param ft_pt: features file path
     :return: none
     """
     (row_num, col_num) = features.shape
     data = features.data
     indice = features.indices
     indptr = features.indptr
     f = open(ft_pt, 'w')
     f.write("%d %d\n" % (row_num, col_num))
     ind_indptr = 1
     begin_line = True
     for ind_data in range(len(data)):
         while ind_data == indptr[ind_indptr]:
             f.write('\n')
             begin_line = True
             ind_indptr += 1
         if (data[ind_data] < 1e-12) and (data[ind_data] > -1e-12):
             continue
         if (not begin_line) and (ind_data != indptr[ind_indptr - 1]):
             f.write(' ')
         f.write("%d:%s" % (indice[ind_data], data[ind_data]))
         begin_line = False
     while ind_indptr < len(indptr):
         f.write("\n")
         ind_indptr += 1
     LogUtil.log("INFO", "save smat feature file done (%s)" % ft_pt)
     f.close()
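For orientation: the SMAT text layout written above is a "row_num col_num" header followed by one line per row of space-separated index:value pairs (near-zero entries dropped). A minimal, self-contained round-trip sketch of the same layout, using scipy directly and skipping the LogUtil dependency:

    from scipy.sparse import csr_matrix

    m = csr_matrix([[0.0, 1.5], [2.0, 0.0]])
    lines = ['%d %d' % m.shape]
    for i in range(m.shape[0]):
        begin, end = m.indptr[i], m.indptr[i + 1]
        lines.append(' '.join('%d:%s' % (j, v)
                              for j, v in zip(m.indices[begin:end], m.data[begin:end])))
    print('\n'.join(lines))
    # 2 2
    # 1:1.5
    # 0:2.0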
Example No. 2
    def load_all(feature_pt, feature_names, rawset_name, will_save=False):
        index_begin = 0
        features = None
        for index in reversed(range(1, len(feature_names))):
            f_names_s = '|'.join(
                feature_names[0:index + 1]) + '|' + rawset_name
            f_names_md5 = hashlib.md5(f_names_s.encode('utf-8')).hexdigest()
            if isfile('%s/md5_%s.smat.npz' % (feature_pt, f_names_md5)):
                index_begin = index
                features = Feature.load('%s/md5_%s.smat' %
                                        (feature_pt, f_names_md5))
                break
        LogUtil.log(
            'INFO', 'load %s features [%s, %s)' %
            (rawset_name, feature_names[0], feature_names[index_begin]))

        if 1 > index_begin:
            features = Feature.load(
                '%s/%s.%s.smat' % (feature_pt, feature_names[0], rawset_name))
        for index in range(index_begin + 1, len(feature_names)):
            features = Feature.merge_col(
                features,
                Feature.load('%s/%s.%s.smat' %
                             (feature_pt, feature_names[index], rawset_name)))

        features = features.tocsr()

        if will_save and (index_begin < len(feature_names) - 1):
            f_names_s = '|'.join(feature_names) + '|' + rawset_name
            f_names_md5 = hashlib.md5(f_names_s.encode('utf-8')).hexdigest()
            Feature.save_npz(features,
                             '%s/md5_%s.smat' % (feature_pt, f_names_md5))
        return features
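The cache lookup above hinges on a deterministic key: the md5 digest of the '|'-joined feature names plus the raw set name. A sketch of the naming scheme (the feature names here are hypothetical):

    import hashlib

    feature_names = ['tfidf_sim', 'len_diff']  # hypothetical feature names
    rawset_name = 'train'
    key = '|'.join(feature_names) + '|' + rawset_name
    print('md5_%s.smat.npz' % hashlib.md5(key.encode('utf-8')).hexdigest())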
Example No. 3
 def load_smat(ft_fp):
     """
     load features from disk, the format:
         row_num col_num
         f1_index:f1_value f2_index:f2_value ...
     """
     data = []
     indice = []
     indptr = [0]
     f = open(ft_fp)
     [row_num, col_num] = [int(num) for num in f.readline().strip().split()]
     for line in f:
         line = line.strip()
         subs = line.split()
         for sub in subs:
             [f_index, f_value] = sub.split(":")
             f_index = int(f_index)
             f_value = float(f_value)
             data.append(f_value)
             indice.append(f_index)
         indptr.append(len(data))
     f.close()
     features = csr_matrix((data, indice, indptr),
                           shape=(row_num, col_num),
                           dtype=float)
     LogUtil.log("INFO", "load smat feature file done (%s)" % ft_fp)
     return features
 def save_smat(features, ft_pt):
     """
     save features to disk in SMAT format
     :param features: the matrix of features
     :param ft_pt: features file path
     :return: none
     """
     (row_num, col_num) = features.shape
     data = features.data
     indice = features.indices
     indptr = features.indptr
     f = open(ft_pt, 'w')
     f.write("%d %d\n" % (row_num, col_num))
     ind_indptr = 1
     begin_line = True
     for ind_data in range(len(data)):
         while ind_data == indptr[ind_indptr]:
             f.write('\n')
             begin_line = True
             ind_indptr += 1
         if (data[ind_data] < 1e-12) and (data[ind_data] > -1e-12):
             continue
         if (not begin_line) and (ind_data != indptr[ind_indptr - 1]):
             f.write(' ')
         f.write("%d:%s" % (indice[ind_data], data[ind_data]))
         begin_line = False
     while ind_indptr < len(indptr):
         f.write("\n")
         ind_indptr += 1
     LogUtil.log("INFO", "save smat feature file done (%s)" % ft_pt)
     f.close()
Example No. 5
 def load_npz(ft_fp):
     loader = np.load('%s.npz' % ft_fp)
     features = csr_matrix(
         (loader['data'], loader['indices'], loader['indptr']),
         shape=loader['shape'])
     LogUtil.log("INFO", "load npz feature file done (%s)" % ft_fp)
     return features
 def load_smat(ft_fp):
     """
     load features from disk, the format:
         row_num col_num
         f1_index:f1_value f2_index:f2_value ...
     """
     data = []
     indice = []
     indptr = [0]
     f = open(ft_fp)
     [row_num, col_num] = [int(num) for num in f.readline().strip().split()]
     for line in f:
         line = line.strip()
         subs = line.split()
         for sub in subs:
             [f_index, f_value] = sub.split(":")
             f_index = int(f_index)
             f_value = float(f_value)
             data.append(f_value)
             indice.append(f_index)
         indptr.append(len(data))
     f.close()
     features = csr_matrix((data, indice, indptr), shape=(row_num, col_num), dtype=float)
     LogUtil.log("INFO", "load smat feature file done (%s)" % ft_fp)
     return features
 def init_powerful_word_dside(pword, thresh_num, thresh_rate):
     pword_dside = []
     pword = filter(lambda x: x[1][0] * x[1][5] >= thresh_num, pword)
     pword_sort = sorted(pword, key=lambda d: d[1][6], reverse=True)
     pword_dside.extend(map(lambda x: x[0], filter(lambda x: x[1][6] >= thresh_rate, pword_sort)))
     LogUtil.log('INFO', 'Double side power words(%d): %s' % (len(pword_dside), str(pword_dside)))
     return pword_dside
    def load_all(feature_pt, feature_names, rawset_name, will_save=False):
        index_begin = 0
        features = None
        for index in reversed(range(1, len(feature_names))):
            f_names_s = '|'.join(feature_names[0:index + 1]) + '|' + rawset_name
            f_names_md5 = hashlib.md5(f_names_s.encode('utf-8')).hexdigest()
            if isfile('%s/md5_%s.smat.npz' % (feature_pt, f_names_md5)):
                index_begin = index
                features = Feature.load('%s/md5_%s.smat' % (feature_pt, f_names_md5))
                break
        LogUtil.log('INFO', 'load %s features [%s, %s)' % (rawset_name, feature_names[0], feature_names[index_begin]))

        if 1 > index_begin:
            features = Feature.load('%s/%s.%s.smat' % (feature_pt, feature_names[0], rawset_name))
        for index in range(index_begin + 1, len(feature_names)):
            features = Feature.merge_col(features,
                                         Feature.load(
                                             '%s/%s.%s.smat' % (feature_pt, feature_names[index], rawset_name)))

        features = features.tocsr()

        if will_save and (index_begin < len(feature_names) - 1):
            f_names_s = '|'.join(feature_names) + '|' + rawset_name
            f_names_md5 = hashlib.md5(f_names_s.encode('utf-8')).hexdigest()
            Feature.save_npz(features, '%s/md5_%s.smat' % (feature_pt, f_names_md5))
        return features
Example No. 9
    def __generate_index(self, row_num):
        index_all = [[] for _ in range(self.cv_num)]  # one independent list per part; [list()] * n would alias a single list
        for i in range(row_num):
            index_all[int(random.random() * self.cv_num)].append(i)
        for i in range(self.cv_num):
            LogUtil.log(
                'INFO',
                'generate cv index, size(part%d)=%d' % (i, len(index_all[i])))

        index_pt = self.config.get('DEFAULT', 'index_pt')
        for i in range(self.cv_num):
            fold_id = i
            # train
            fp = '%s/cv_tag%s_n%d_f%d_train.%s.index' % (
                index_pt, self.cv_tag, self.cv_num, fold_id,
                self.config.get('MODEL', 'offline_rawset_name'))
            DataUtil.save_vector(fp, list(), 'w')
            for j in range(self.cv_num - 2):
                part_id = (i + j) % self.cv_num
                DataUtil.save_vector(fp, index_all[part_id], 'a')
            # valid
            fp = '%s/cv_tag%s_n%d_f%d_valid.%s.index' % (
                index_pt, self.cv_tag, self.cv_num, fold_id,
                self.config.get('MODEL', 'offline_rawset_name'))
            part_id = (fold_id + self.cv_num - 2) % self.cv_num
            DataUtil.save_vector(fp, index_all[part_id], 'w')
            # test
            fp = '%s/cv_tag%s_n%d_f%d_test.%s.index' % (
                index_pt, self.cv_tag, self.cv_num, fold_id,
                self.config.get('MODEL', 'offline_rawset_name'))
            part_id = (fold_id + self.cv_num - 1) % self.cv_num
            DataUtil.save_vector(fp, index_all[part_id], 'w')
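The fold layout this produces: every row lands in a random one of cv_num parts, and fold f then trains on cv_num - 2 consecutive parts (mod cv_num), validates on the next part, and tests on the last. A quick enumeration for cv_num = 5:

    cv_num = 5
    for fold_id in range(cv_num):
        train = [(fold_id + j) % cv_num for j in range(cv_num - 2)]
        valid = (fold_id + cv_num - 2) % cv_num
        test = (fold_id + cv_num - 1) % cv_num
        print(fold_id, train, valid, test)
    # 0 [0, 1, 2] 3 4
    # 1 [1, 2, 3] 4 0  ... each part serves as valid and as test exactly once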
Example No. 10
    def extract(self, data_set_name, part_num=1, part_id=0):
        """
        Extract the feature from original data set
        :param data_set_name: name of data set
        :param part_num: number of partitions of data
        :param part_id: partition ID which will be extracted
        :return:
        """
        # load data set from disk
        data = pd.read_csv('%s/%s.csv' % (self.config.get('DEFAULT', 'source_pt'), data_set_name)).fillna(value="")
        begin_id = int(1. * len(data) / part_num * part_id)
        end_id = int(1. * len(data) / part_num * (part_id + 1))

        # set feature file path
        feature_pt = self.config.get('DEFAULT', 'feature_pt')
        if 1 == part_num:
            self.data_feature_fp = '%s/%s.%s.smat' % (feature_pt, self.feature_name, data_set_name)
        else:
            self.data_feature_fp = '%s/%s.%s.smat.%03d_%03d' % (feature_pt,
                                                                self.feature_name,
                                                                data_set_name,
                                                                part_num,
                                                                part_id)

        feature_file = open(self.data_feature_fp, 'w')
        feature_file.write('%d %d\n' % (end_id - begin_id, int(self.get_feature_num())))
        # extract feature
        for index, row in data[begin_id:end_id].iterrows():
            feature = self.extract_row(row)
            Feature.save_feature(feature, feature_file)
        feature_file.close()

        LogUtil.log('INFO',
                    'save features (%s, %s, %d, %d) done' % (self.feature_name, data_set_name, part_num, part_id))
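The begin_id/end_id arithmetic splits the data set into part_num contiguous slices whose union covers every row; the int() truncation lets the last slice absorb the remainder. For example:

    n, part_num = 10, 3  # hypothetical sizes
    for part_id in range(part_num):
        begin_id = int(1. * n / part_num * part_id)
        end_id = int(1. * n / part_num * (part_id + 1))
        print(part_id, begin_id, end_id)
    # 0 0 3
    # 1 3 6
    # 2 6 10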
 def init_powerful_word_oside(pword, thresh_num, thresh_rate):
     pword_oside = []
     pword = filter(lambda x: x[1][0] * x[1][3] >= thresh_num, pword)
     pword_oside.extend(
         map(lambda x: x[0], filter(lambda x: x[1][4] >= thresh_rate, pword)))
     LogUtil.log('INFO', 'One side power words(%d): %s' % (
         len(pword_oside), str(pword_oside)))
     return pword_oside
 def load_npz(ft_fp):
     loader = np.load('%s.npz' % ft_fp)
     features = csr_matrix((loader['data'],
                            loader['indices'],
                            loader['indptr']),
                           shape=loader['shape'])
     LogUtil.log("INFO", "load npz feature file done (%s)" % ft_fp)
     return features
Example No. 13
 def __unlock(self):
     lock_name = self.config.get('MODEL', 'lock_name')
     lock_pt = self.config.get('MODEL', 'lock_pt')
     lock_fp = '%s/%s.lock' % (lock_pt, lock_name)
     if isfile(lock_fp):
         os.remove(lock_fp)
         LogUtil.log('INFO', 'delete lock, lock_name=%s' % lock_name)
     else:
         LogUtil.log('WARNING', 'missing lock, lock_name=%s' % lock_name)
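Paired with __lock (shown in a later example), this implements a crude file-based mutex: spin while the lock file exists, touch it to acquire, delete it to release. A compressed sketch of the pattern, with a hypothetical lock path:

    import os
    import time

    lock_fp = '/tmp/featwheel.lock'  # hypothetical lock path
    while os.path.isfile(lock_fp):   # another run holds the lock
        time.sleep(1)
    open(lock_fp, 'w').close()       # acquire
    try:
        pass                         # ... run the job ...
    finally:
        if os.path.isfile(lock_fp):
            os.remove(lock_fp)       # release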
Example No. 14
 def get_labels(df):
     """
     Get labels of data set
     :param df: original data set
     :return: label list of data set
     """
     labels = df['is_duplicate'].tolist()
     LogUtil.log("INFO", "num(1)=%d, num(0)=%d" % (sum(labels), len(labels) - sum(labels)))
     return labels
 def __unlock(self):
     lock_name = self.config.get('MODEL', 'lock_name')
     lock_pt = self.config.get('MODEL', 'lock_pt')
     lock_fp = '%s/%s.lock' % (lock_pt, lock_name)
     if isfile(lock_fp):
         os.remove(lock_fp)
         LogUtil.log('INFO', 'delete lock, lock_name=%s' % lock_name)
     else:
         LogUtil.log('WARNING', 'missing lock, lock_name=%s' % lock_name)
Example No. 16
 def init_powerful_word_oside(pword, thresh_num, thresh_rate):
     pword_oside = []
     pword = filter(lambda x: x[1][0] * x[1][3] >= thresh_num, pword)
     pword_oside.extend(
         map(lambda x: x[0], filter(lambda x: x[1][4] >= thresh_rate,
                                    pword)))
     LogUtil.log(
         'INFO', 'One side power words(%d): %s' %
         (len(pword_oside), str(pword_oside)))
     return pword_oside
    def generate_powerful_word(data, subset_indexs):
        """
        Compute the influence ("power") of each word in the data. Format:
            word --> [0. num of pairs containing the word, 1. rate of pairs containing it,
                      2. rate of correctly labeled pairs, 3. rate of one-side pairs,
                      4. correct rate among one-side pairs, 5. rate of double-side pairs,
                      6. correct rate among double-side pairs]
        """
        words_power = {}
        train_subset_data = data.iloc[subset_indexs, :]
        for index, row in train_subset_data.iterrows():
            label = int(row['is_duplicate'])
            q1_words = str(row['question1']).lower().split()
            q2_words = str(row['question2']).lower().split()
            all_words = set(q1_words + q2_words)
            q1_words = set(q1_words)
            q2_words = set(q2_words)
            for word in all_words:
                if word not in words_power:
                    words_power[word] = [0. for i in range(7)]
                # count pairs containing the word
                words_power[word][0] += 1.
                words_power[word][1] += 1.

                if ((word in q1_words) and (word not in q2_words)) or ((word not in q1_words) and (word in q2_words)):
                    # count one-side pairs
                    words_power[word][3] += 1.
                    if 0 == label:
                        # count correctly labeled pairs
                        words_power[word][2] += 1.
                        # count correct one-side pairs
                        words_power[word][4] += 1.
                if (word in q1_words) and (word in q2_words):
                    # count double-side pairs
                    words_power[word][5] += 1.
                    if 1 == label:
                        # count correctly labeled pairs
                        words_power[word][2] += 1.
                        # count correct double-side pairs
                        words_power[word][6] += 1.
        for word in words_power:
            # rate of pairs containing the word
            words_power[word][1] /= len(subset_indexs)
            # rate of correctly labeled pairs
            words_power[word][2] /= words_power[word][0]
            # correct rate among one-side pairs
            if words_power[word][3] > 1e-6:
                words_power[word][4] /= words_power[word][3]
            # rate of one-side pairs
            words_power[word][3] /= words_power[word][0]
            # correct rate among double-side pairs
            if words_power[word][5] > 1e-6:
                words_power[word][6] /= words_power[word][5]
            # rate of double-side pairs
            words_power[word][5] /= words_power[word][0]
        sorted_words_power = sorted(words_power.items(), key=lambda d: d[1][0], reverse=True)
        LogUtil.log("INFO", "power words calculation done, len(words_power)=%d" % len(sorted_words_power))
        return sorted_words_power
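A toy run to make the seven statistics concrete: with the two labeled pairs below, 'india' is one-sided in the first pair (label 0) and double-sided in the second (label 1), so its vector works out as in the comment (values hand-computed from the definitions above):

    import pandas as pd

    toy = pd.DataFrame({
        'question1': ['capital of india', 'india population'],
        'question2': ['capital of france', 'population of india'],
        'is_duplicate': [0, 1],
    })
    # 'india' --> [2 pairs, pair rate 1.0, correct rate 1.0,
    #              one-side rate 0.5, one-side correct rate 1.0,
    #              double-side rate 0.5, double-side correct rate 1.0]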
 def merge_col(features_1, features_2):
     """
     merge two feature matrices that were split by column
     :param features_1: the first part of features
     :param features_2: the second part of features
     :return: feature matrix
     """
     features = hstack([features_1, features_2])
     (row_num, col_num) = features.shape
     LogUtil.log("INFO", "merge col done, shape=(%d,%d)" % (row_num, col_num))
     return features
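Note that scipy's hstack returns a COO matrix, which is why load_all calls tocsr() after its merge loop. A minimal sketch:

    from scipy.sparse import csr_matrix, hstack

    a = csr_matrix([[1., 0.], [0., 2.]])
    b = csr_matrix([[3.], [4.]])
    merged = hstack([a, b]).tocsr()  # hstack yields COO; convert back for row slicing
    print(merged.shape)              # (2, 3)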
Example No. 19
    def extract(self):
        version = self.config.get('INFO', 'version')
        cv_num = self.config.getint('INFO', 'cv_num')  # read as int: used below in range() and '%d' formats
        offline_rawset_name = self.config.get('MODEL', 'offline_rawset_name')
        index_fp = self.config.get('DIRECTORY', 'index_pt')  # index files live under index_pt (cf. run_offline), not feature_pt
        feature_name = '%s_%s' % (self.__class__.__name__, version)

        # load prediction of offline tests
        offline_test_pred_all_fp = '%s/pred/cv_n%d_test.%s.pred' % (
            self.config.get('DIRECTORY',
                            'out_pt'), cv_num, offline_rawset_name)
        offline_test_pred_all_origin = PostProcessor.read_result_list(
            offline_test_pred_all_fp)
        offline_test_pred_all = [0] * len(offline_test_pred_all_origin)
        # load index of offline tests
        offline_test_index_all = list()
        for fold_id in range(cv_num):
            offline_test_indexs_fp = '%s/cv_n%d_f%d_test.%s.index' % (
                index_fp, cv_num, fold_id, offline_rawset_name)
            offline_test_indexs = Feature.load_index(offline_test_indexs_fp)
            offline_test_index_all.extend(offline_test_indexs)
        for index in range(len(offline_test_pred_all)):
            offline_test_pred_all[offline_test_index_all[
                index]] = offline_test_pred_all_origin[index]

        # load prediction of online data set
        online_preds = list()
        for fold_id in range(cv_num):
            online_pred_fp = '%s/cv_n%d_f%d_online.%s.pred' % (
                self.config.get('DIRECTORY', 'pred_pt'), cv_num, fold_id,
                self.config.get('MODEL', 'online_test_rawset_name'))
            online_pred_one = PostProcessor.read_result_list(online_pred_fp)
            online_preds.append(online_pred_one)
        # sample for online prediction
        online_pred = []
        for i in range(len(online_preds[0])):
            cv_id = int(random.random() * cv_num)
            online_pred.append(online_preds[cv_id][i])

        offline_pred = [[fv] for fv in offline_test_pred_all]
        online_pred = [[fv] for fv in online_pred]

        # directory of features
        feature_pt = self.config.get('DIRECTORY', 'feature_pt')
        train_feature_fp = '%s/%s.train.smat' % (feature_pt, feature_name)
        test_feature_fp = '%s/%s.test.smat' % (feature_pt, feature_name)

        train_features = sparse.csr_matrix(np.array(offline_pred))
        Feature.save_smat(train_features, train_feature_fp)
        LogUtil.log('INFO', 'save train features (%s) done' % feature_name)

        test_features = sparse.csr_matrix(np.array(online_pred))
        Feature.save_smat(test_features, test_feature_fp)
        LogUtil.log('INFO', 'save test features (%s) done' % feature_name)
Example No. 20
 def init_powerful_word_dside(pword, thresh_num, thresh_rate):
     pword_dside = []
     pword = filter(lambda x: x[1][0] * x[1][5] >= thresh_num, pword)
     pword_sort = sorted(pword, key=lambda d: d[1][6], reverse=True)
     pword_dside.extend(
         map(lambda x: x[0],
             filter(lambda x: x[1][6] >= thresh_rate, pword_sort)))
     LogUtil.log(
         'INFO', 'Double side power words(%d): %s' %
         (len(pword_dside), str(pword_dside)))
     return pword_dside
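To see the two thresholds at work: a word qualifies as double-side "powerful" when its pair count times its double-side rate clears thresh_num and its double-side correct rate clears thresh_rate. A toy filter over hypothetical stats vectors:

    pword = [('india', [50., .10, .80, .20, .60, .60, .90]),  # hypothetical stats
             ('rare',  [ 2., .01, .50, .50, .50, .50, 1.0])]
    thresh_num, thresh_rate = 10, 0.8
    dside = [w for w, s in pword if s[0] * s[5] >= thresh_num and s[6] >= thresh_rate]
    print(dside)  # ['india']: 50 * 0.6 = 30 >= 10 and 0.9 >= 0.8; 'rare' fails the count test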
Example No. 21
 def stat_dul_question(df):
     """
     Compute duplication statistics over the questions
     :param df: original data set
     :return: none
     """
     questions = df['question1'].tolist() + df['question2'].tolist()
     len_questions = len(questions)
     len_uniq_questions = len(set(questions))
     LogUtil.log("INFO", "len(questions)=%d, len(unique_questions)=%d, rate=%f" % (
         len_questions, len_uniq_questions, 1.0 * len_uniq_questions / len_questions))
    def init_tfidf(self):
        train_data = pd.read_csv('%s/train.csv' % self.config.get('DIRECTORY', 'origin_pt')).fillna(value="")  # [:100]
        test_data = pd.read_csv('%s/test.csv' % self.config.get('DIRECTORY', 'origin_pt')).fillna(value="")  # [:100]

        tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
        tfidf_txt = pd.Series(
            train_data['question1'].tolist() + train_data['question2'].tolist() + test_data['question1'].tolist() +
            test_data['question2'].tolist()).astype(str)
        tfidf.fit_transform(tfidf_txt)
        LogUtil.log("INFO", "init tfidf done ")
        return tfidf
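The vectorizer is fit on the union of all train and test questions so the vocabulary and IDF weights cover both sets. A self-contained sketch (the corpus is hypothetical; get_feature_names_out assumes scikit-learn >= 1.0):

    from sklearn.feature_extraction.text import TfidfVectorizer

    corpus = ['what is machine learning', 'what is deep learning']  # hypothetical questions
    tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
    matrix = tfidf.fit_transform(corpus)  # sparse (n_questions, n_terms)
    print(tfidf.get_feature_names_out())  # vocabulary learned from the corpus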
Example No. 23
 def merge_col(features_1, features_2):
     """
     merge two feature matrices that were split by column
     :param features_1: the first part of features
     :param features_2: the second part of features
     :return: feature matrix
     """
     features = hstack([features_1, features_2])
     (row_num, col_num) = features.shape
     LogUtil.log("INFO",
                 "merge col done, shape=(%d,%d)" % (row_num, col_num))
     return features
 def __lock(self):
     lock_name = self.config.get('MODEL', 'lock_name')
     lock_time = self.config.getint('MODEL', 'lock_time')
     lock_pt = self.config.get('MODEL', 'lock_pt')
     if '' != lock_name:
         lock_fp = '%s/%s.lock' % (lock_pt, lock_name)
         while isfile(lock_fp):
             LogUtil.log('INFO', 'model is running, lock_name=%s, waiting %d ...' % (lock_name, lock_time))
             time.sleep(lock_time)
         f = open(lock_fp, 'w')
         f.close()
     LogUtil.log('INFO', 'generate lock, lock_name=%s' % lock_name)
Example No. 25
 def generate_graph_clique(G):
     n2clique = {}
     cliques = []
     for clique in nx.find_cliques(G):
         for n in clique:
             if n not in n2clique:
                 n2clique[n] = []
             n2clique[n].append(len(cliques))
         cliques.append(clique)
     LogUtil.log('INFO',
                 'init graph cliques done, len(cliques)=%d' % len(cliques))
     return n2clique, cliques
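nx.find_cliques enumerates maximal cliques, so n2clique maps each question node to every maximal clique containing it. A small networkx example:

    import networkx as nx

    G = nx.Graph([(1, 2), (2, 3), (1, 3), (3, 4)])
    print(list(nx.find_cliques(G)))  # e.g. [[1, 2, 3], [3, 4]]; node 3 sits in two cliques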
Example No. 26
 def generate_graph_cc(G):
     n2cc = {}
     ccs = []
     for cc in nx.connected_components(G):
         for n in cc:
             if n in n2cc:
                 LogUtil.log('WARNING',
                             '%d already in n2cc(=%d)' % (n, n2cc[n]))
             n2cc[n] = len(ccs)
         ccs.append(cc)
     LogUtil.log('INFO', 'init graph cc done, len(ccs)=%d' % len(ccs))
     return n2cc, ccs
 def generate_idf(data_fp):
     data = pd.read_csv(data_fp)  # csv.reader has no iterrows()/len(); load with pandas instead
     idf = {}
     for index, row in data.iterrows():
         words = str(row['question']).lower().strip().split() if WordEmbedding.to_lower else str(
             row['question']).strip().split()
         for word in words:
             idf[word] = idf.get(word, 0) + 1
     num_docs = len(data)
     for word in idf:
         idf[word] = math.log(num_docs / (idf[word] + 1.)) / math.log(2.)
     LogUtil.log("INFO", "IDF calculation done, len(idf)=%d" % len(idf))
     return idf
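The weighting is a base-2 smoothed IDF: idf(w) = log2(num_docs / (df(w) + 1)). A one-line check:

    import math

    num_docs, df = 8, 3                                   # word appears in 3 of 8 questions
    print(math.log(num_docs / (df + 1.)) / math.log(2.))  # log2(8 / 4) = 1.0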
Example No. 28
 def get_qid2question(df):
     """
     Get map(qid, question)
     :param df: original data set
     :return: map(qid, question)
     """
     qid2question = {}
     qids = df['qid1'].tolist() + df['qid2'].tolist()
     questions = df['question1'].tolist() + df['question2'].tolist()
     for ind in range(len(qids)):
         qid2question[qids[ind]] = questions[ind]
     LogUtil.log("INFO", "len(qids)=%d, len(unique_qids)=%d" % (len(qids), len(qid2question)))
     return qid2question
 def save_npz(features, ft_fp):
     """
     save features to disk in binary format
     :param features:
     :param ft_fp:
     :return:
     """
     np.savez(ft_fp,
              data=features.data,
              indices=features.indices,
              indptr=features.indptr,
              shape=features.shape)
     LogUtil.log('INFO', 'save npz feature file done (%s)' % ft_fp)
Example No. 30
 def save_npz(features, ft_fp):
     """
     save features to disk in binary format
     :param features:
     :param ft_fp:
     :return:
     """
     np.savez(ft_fp,
              data=features.data,
              indices=features.indices,
              indptr=features.indptr,
              shape=features.shape)
     LogUtil.log('INFO', 'save npz feature file done (%s)' % ft_fp)
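np.savez appends '.npz' to the target path, which is why load_npz opens '%s.npz' % ft_fp. A round-trip sketch (the /tmp path is hypothetical):

    import numpy as np
    from scipy.sparse import csr_matrix

    m = csr_matrix([[0., 1.], [2., 0.]])
    np.savez('/tmp/feat.smat', data=m.data, indices=m.indices,
             indptr=m.indptr, shape=m.shape)  # written as /tmp/feat.smat.npz
    loader = np.load('/tmp/feat.smat.npz')
    m2 = csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                    shape=loader['shape'])
    print((m != m2).nnz)  # 0 -> the matrices round-tripped intact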
Example No. 31
 def __lock(self):
     lock_name = self.config.get('MODEL', 'lock_name')
     lock_time = self.config.getint('MODEL', 'lock_time')
     lock_pt = self.config.get('MODEL', 'lock_pt')
     if '' != lock_name:
         lock_fp = '%s/%s.lock' % (lock_pt, lock_name)
         while isfile(lock_fp):
             LogUtil.log(
                 'INFO', 'model is running, lock_name=%s, waiting %d ...' %
                 (lock_name, lock_time))
             time.sleep(lock_time)
         f = open(lock_fp, 'w')
         f.close()
     LogUtil.log('INFO', 'generate lock, lock_name=%s' % lock_name)
Example No. 32
 def generate_idf(data_fp):
     data = pd.read_csv(data_fp)  # csv.reader has no iterrows()/len(); load with pandas instead
     idf = {}
     for index, row in data.iterrows():
         words = str(row['question']).lower().strip().split() if WordEmbedding.to_lower else str(
             row['question']).strip().split()
         for word in words:
             idf[word] = idf.get(word, 0) + 1
     num_docs = len(data)
     for word in idf:
         idf[word] = math.log(num_docs / (idf[word] + 1.)) / math.log(2.)
     LogUtil.log("INFO", "IDF calculation done, len(idf)=%d" % len(idf))
     return idf
    def merge_file(feature_pt, feature_name, data_set_name, part_num):
        features = None
        for part_id in range(part_num):
            features_part_fp = '%s/%s.%s.smat.%03d_%03d' % (feature_pt, feature_name, data_set_name, part_num, part_id)
            features_part = Feature.load(features_part_fp)
            if features is None:
                features = features_part
            else:
                features = Feature.merge_row(features, features_part)

        features_fp = '%s/%s.%s.smat' % (feature_pt, feature_name, data_set_name)
        Feature.save_smat(features, features_fp)
        LogUtil.log('INFO',
                    'merge features (%s, %s, %d) done' % (feature_name, data_set_name, part_num))
Example No. 34
    def init_tfidf(self):
        train_data = pd.read_csv(
            '%s/train.csv' % self.config.get('DEFAULT', 'origin_pt')).fillna(
                value="")  # [:100]
        test_data = pd.read_csv(
            '%s/test.csv' % self.config.get('DEFAULT', 'origin_pt')).fillna(
                value="")  # [:100]

        tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
        tfidf_txt = pd.Series(train_data['question1'].tolist() +
                              train_data['question2'].tolist() +
                              test_data['question1'].tolist() +
                              test_data['question2'].tolist()).astype(str)
        tfidf.fit_transform(tfidf_txt)
        LogUtil.log("INFO", "init tfidf done ")
        return tfidf
Example No. 35
    def generate_cv_subset_index(cf, argv):
        """
        Generate index used for 5-fold cross validation
        :param cf: configuration file
        :param argv: parameter list
        :return: none
        """
        tag = argv[0]
        cv_num = 5
        cv_rawset_name = 'train_with_swap'
        train_data_size = 404290

        index_all = []
        for i in range(cv_num):
            index_all.append([])
        for i in range(train_data_size):
            index_all[int(random.random() * cv_num)].append(i)

        for i in range(cv_num):
            LogUtil.log('INFO', 'size(part%d)=%d' % (i, len(index_all[i])))

        index_fp = cf.get('DEFAULT', 'feature_index_pt')
        for i in range(cv_num):
            fold_id = i
            # train
            fp = '%s/cv_tag%s_n%d_f%d_train.%s.index' % (
                index_fp, tag, cv_num, fold_id, cv_rawset_name)
            for j in range(cv_num - 2):
                part_id = (i + j) % cv_num
                DataUtil.save_vector(fp, index_all[part_id], 'a')
            for j in range(cv_num - 2):
                part_id = (i + j) % cv_num
                DataUtil.save_vector(
                    fp,
                    [index + train_data_size
                     for index in index_all[part_id]], 'a')
            # valid
            fp = '%s/cv_tag%s_n%d_f%d_valid.%s.index' % (
                index_fp, tag, cv_num, fold_id, cv_rawset_name)
            part_id = (fold_id + cv_num - 2) % cv_num
            DataUtil.save_vector(fp, index_all[part_id], 'w')
            # test
            fp = '%s/cv_tag%s_n%d_f%d_test.%s.index' % (
                index_fp, tag, cv_num, fold_id, cv_rawset_name)
            part_id = (fold_id + cv_num - 1) % cv_num
            DataUtil.save_vector(fp, index_all[part_id], 'w')
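The 'train_with_swap' raw set stacks a question1/question2-swapped copy of the training data beneath the original, so each train part is written twice: once with the raw row ids and once shifted by train_data_size. In miniature:

    train_data_size = 4  # hypothetical
    part = [0, 2]        # row ids in one cv part
    train_index = part + [i + train_data_size for i in part]
    print(train_index)   # [0, 2, 4, 6]: original rows plus their swapped copies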
Example No. 36
    def merge_file(feature_pt, feature_name, data_set_name, part_num):
        features = None
        for part_id in range(part_num):
            features_part_fp = '%s/%s.%s.smat.%03d_%03d' % (
                feature_pt, feature_name, data_set_name, part_num, part_id)
            features_part = Feature.load(features_part_fp)
            if features is None:
                features = features_part
            else:
                features = Feature.merge_row(features, features_part)

        features_fp = '%s/%s.%s.smat' % (feature_pt, feature_name,
                                         data_set_name)
        Feature.save_smat(features, features_fp)
        LogUtil.log(
            'INFO', 'merge features (%s, %s, %d) done' %
            (feature_name, data_set_name, part_num))
 def __init_out_dir(self):
     # generate output tag
     self.out_tag = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime(time.time()))
      self.config.set('DIRECTORY', 'out_tag', str(self.out_tag))  # ConfigParser.set takes (section, option, value)
     # generate output directory
     out_pt = self.config.get('DIRECTORY', 'out_pt')
     out_pt_exists = os.path.exists(out_pt)
     if out_pt_exists:
         LogUtil.log("ERROR", 'out path (%s) already exists ' % out_pt)
         return
     else:
         os.mkdir(out_pt)
         os.mkdir(self.config.get('DIRECTORY', 'pred_pt'))
         os.mkdir(self.config.get('DIRECTORY', 'model_pt'))
         os.mkdir(self.config.get('DIRECTORY', 'fault_pt'))
         os.mkdir(self.config.get('DIRECTORY', 'conf_pt'))
         os.mkdir(self.config.get('DIRECTORY', 'score_pt'))
         LogUtil.log('INFO', 'out path (%s) created ' % out_pt)
     # save config
     self.config.write(open(self.config.get('DIRECTORY', 'conf_pt') + 'featwheel.conf', 'w'))
Example No. 38
 def balance_index(indexs, labels, positive_rate):
     """
     balance indexes to reach the target positive rate
     :param indexs: index vector to sample raw data set
     :param labels: label vector of raw data set
     :param positive_rate: positive rate
     :return: index vector after balancing
     """
     if positive_rate < 1e-6 or positive_rate > 1. - 1e-6:
         return indexs
     pos_indexs = [index for index in indexs if labels[index] == 1.]
     neg_indexs = [index for index in indexs if labels[index] == 0.]
     origin_rate = 1.0 * len(pos_indexs) / len(indexs)
     LogUtil.log(
         "INFO", "original: len(pos)=%d, len(neg)=%d, rate=%.2f%%" %
         (len(pos_indexs), len(neg_indexs), 100.0 * origin_rate))
     if origin_rate < positive_rate:
         pos_indexs, neg_indexs = neg_indexs, pos_indexs
         origin_rate = 1.0 - origin_rate
         positive_rate = 1.0 - positive_rate
         LogUtil.log("INFO", "increase positive instances ...")
     else:
         LogUtil.log("INFO", "increase negative instances ...")
     k = (1. - positive_rate) * origin_rate / positive_rate / (1 -
                                                               origin_rate)
     LogUtil.log("INFO", "k=%.4f" % k)
     balance_indexs = pos_indexs
     while k > 1e-6:
         if k > 1. - 1e-6:
             balance_indexs.extend(neg_indexs)
         else:
             balance_indexs.extend(
                 random.sample(neg_indexs, int(k * len(neg_indexs))))
         k -= 1.
     pos_indexs = [index for index in balance_indexs if labels[index] == 1.]
     neg_indexs = [index for index in balance_indexs if labels[index] == 0.]
     balanced_rate = 1.0 * len(pos_indexs) / len(balance_indexs)
     LogUtil.log(
         "INFO", "balanced: len(pos)=%d, len(neg)=%d, rate=%.2f%%" %
         (len(pos_indexs), len(neg_indexs), 100.0 * balanced_rate))
     return balance_indexs
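The replication factor k follows from solving p / (p + k * n) = positive_rate for k, which gives k = (1 - r) / r * origin / (1 - origin). With a hypothetical prior of 0.37 and target of 0.165:

    origin_rate, positive_rate = 0.37, 0.165  # hypothetical prior and target
    k = (1. - positive_rate) * origin_rate / positive_rate / (1. - origin_rate)
    print(round(k, 2))  # 2.97: keep every positive, add ~2.97x the negatives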
 def init_idf(data):
     idf = {}
     q_set = set()
     for index, row in data.iterrows():
         q1 = str(row['question1'])
         q2 = str(row['question2'])
         if q1 not in q_set:
             q_set.add(q1)
             words = q1.lower().split()
             for word in words:
                 idf[word] = idf.get(word, 0) + 1
         if q2 not in q_set:
             q_set.add(q2)
             words = q2.lower().split()
             for word in words:
                 idf[word] = idf.get(word, 0) + 1
     num_docs = len(data)
     for word in idf:
         idf[word] = math.log(num_docs / (idf[word] + 1.)) / math.log(2.)
     LogUtil.log("INFO", "idf calculation done, len(idf)=%d" % len(idf))
     return idf
Example No. 40
 def init_idf(data):
     idf = {}
     q_set = set()
     for index, row in data.iterrows():
         q1 = str(row['question1'])
         q2 = str(row['question2'])
         if q1 not in q_set:
             q_set.add(q1)
             words = q1.lower().split()
             for word in words:
                 idf[word] = idf.get(word, 0) + 1
         if q2 not in q_set:
             q_set.add(q2)
             words = q2.lower().split()
             for word in words:
                 idf[word] = idf.get(word, 0) + 1
     num_docs = len(data)
     for word in idf:
         idf[word] = math.log(num_docs / (idf[word] + 1.)) / math.log(2.)
     LogUtil.log("INFO", "idf calculation done, len(idf)=%d" % len(idf))
     return idf
 def balance_index(indexs, labels, positive_rate):
     """
     balance indexes to reach the target positive rate
     :param indexs: index vector to sample raw data set
     :param labels: label vector of raw data set
     :param positive_rate: positive rate
     :return: index vector after balancing
     """
     if positive_rate < 1e-6 or positive_rate > 1. - 1e-6:
         return indexs
     pos_indexs = [index for index in indexs if labels[index] == 1.]
     neg_indexs = [index for index in indexs if labels[index] == 0.]
     origin_rate = 1.0 * len(pos_indexs) / len(indexs)
     LogUtil.log("INFO", "original: len(pos)=%d, len(neg)=%d, rate=%.2f%%" % (
         len(pos_indexs), len(neg_indexs), 100.0 * origin_rate))
     if origin_rate < positive_rate:
         pos_indexs, neg_indexs = neg_indexs, pos_indexs
         origin_rate = 1.0 - origin_rate
         positive_rate = 1.0 - positive_rate
         LogUtil.log("INFO", "increase positive instances ...")
     else:
         LogUtil.log("INFO", "increase negative instances ...")
     k = (1. - positive_rate) * origin_rate / positive_rate / (1 - origin_rate)
     LogUtil.log("INFO", "k=%.4f" % k)
     balance_indexs = pos_indexs
     while k > 1e-6:
         if k > 1. - 1e-6:
             balance_indexs.extend(neg_indexs)
         else:
             balance_indexs.extend(random.sample(neg_indexs, int(k * len(neg_indexs))))
         k -= 1.
     pos_indexs = [index for index in balance_indexs if labels[index] == 1.]
     neg_indexs = [index for index in balance_indexs if labels[index] == 0.]
     balanced_rate = 1.0 * len(pos_indexs) / len(balance_indexs)
     LogUtil.log("INFO", "balanced: len(pos)=%d, len(neg)=%d, rate=%.2f%%" % (
         len(pos_indexs), len(neg_indexs), 100.0 * balanced_rate))
     return balance_indexs
    def __generate_index(self, row_num):
        index_all = [[] for _ in range(self.cv_num)]  # one independent list per part; [list()] * n would alias a single list
        for i in range(row_num):
            index_all[int(random.random() * self.cv_num)].append(i)
        for i in range(self.cv_num):
            LogUtil.log('INFO', 'generate cv index, size(part%d)=%d' % (i, len(index_all[i])))

        index_pt = self.config.get('DEFAULT', 'index_pt')
        for i in range(self.cv_num):
            fold_id = i
            # train
            fp = '%s/cv_tag%s_n%d_f%d_train.%s.index' % (index_pt,
                                                         self.cv_tag,
                                                         self.cv_num,
                                                         fold_id,
                                                         self.config.get('MODEL', 'offline_rawset_name'))
            DataUtil.save_vector(fp, list(), 'w')
            for j in range(self.cv_num - 2):
                part_id = (i + j) % self.cv_num
                DataUtil.save_vector(fp, index_all[part_id], 'a')
            # valid
            fp = '%s/cv_tag%s_n%d_f%d_valid.%s.index' % (index_pt,
                                                         self.cv_tag,
                                                         self.cv_num,
                                                         fold_id,
                                                         self.config.get('MODEL', 'offline_rawset_name'))
            part_id = (fold_id + self.cv_num - 2) % self.cv_num
            DataUtil.save_vector(fp, index_all[part_id], 'w')
            # test
            fp = '%s/cv_tag%s_n%d_f%d_test.%s.index' % (index_pt,
                                                        self.cv_tag,
                                                        self.cv_num,
                                                        fold_id,
                                                        self.config.get('MODEL', 'offline_rawset_name'))
            part_id = (fold_id + self.cv_num - 1) % self.cv_num
            DataUtil.save_vector(fp, index_all[part_id], 'w')
Example No. 43
 def __init_out_dir(self):
     # generate output tag
     self.out_tag = time.strftime("%Y-%m-%d_%H-%M-%S",
                                  time.localtime(time.time()))
      self.config.set('DIRECTORY', 'out_tag', str(self.out_tag))  # ConfigParser.set takes (section, option, value)
     # generate output directory
     out_pt = self.config.get('DIRECTORY', 'out_pt')
     out_pt_exists = os.path.exists(out_pt)
     if out_pt_exists:
         LogUtil.log("ERROR", 'out path (%s) already exists ' % out_pt)
         return
     else:
         os.mkdir(out_pt)
         os.mkdir(self.config.get('DIRECTORY', 'pred_pt'))
         os.mkdir(self.config.get('DIRECTORY', 'model_pt'))
         os.mkdir(self.config.get('DIRECTORY', 'fault_pt'))
         os.mkdir(self.config.get('DIRECTORY', 'conf_pt'))
         os.mkdir(self.config.get('DIRECTORY', 'score_pt'))
         LogUtil.log('INFO', 'out path (%s) created ' % out_pt)
     # save config
     self.config.write(
         open(
             self.config.get('DIRECTORY', 'conf_pt') + 'featwheel.conf',
             'w'))
Example No. 44
    def run_offline(self):
        LogUtil.log('INFO', 'cv_tag(%s)' % self.cv_tag)
        # load feature matrix
        offline_features = Feature.load_all(
            self.config.get('DIRECTORY', 'feature_pt'),
            self.config.get('FEATURE', 'feature_selected').split(),
            self.config.get('MODEL', 'offline_rawset_name'),
            self.config.get('FEATURE', 'will_save'))
        # load labels
        offline_labels = DataUtil.load_vector(
            '%s/%s.label' % (self.config.get('DIRECTORY', 'label_pt'),
                             self.config.get('MODEL', 'offline_rawset_name')),
            True)
        # generate index file
        if '' == self.cv_tag:
            self.cv_tag = self.out_tag
            self.__generate_index(offline_features.shape[0])
        # cross validation
        offline_valid_preds_all = [0.] * offline_features.shape[0]
        offline_test_preds_all = [0.] * offline_features.shape[0]
        for fold_id in range(self.cv_num):
            LogUtil.log('INFO', 'cross validation fold_id(%d) begin' % fold_id)

            # generate training data set
            offline_train_pos_rate = float(
                self.config.get('MODEL', 'train_pos_rate'))
            offline_train_indexs_fp = '%s/cv_tag%s_n%d_f%d_train.%s.index' % (
                self.config.get('DIRECTORY',
                                'index_pt'), self.cv_tag, self.cv_num, fold_id,
                self.config.get('MODEL', 'offline_rawset_name'))
            offline_train_indexs = DataUtil.load_vector(
                offline_train_indexs_fp, 'int')
            offline_train_features, offline_train_labels, offline_train_balanced_indexs = \
                CrossValidation.__generate_data(offline_train_indexs,
                                                offline_labels,
                                                offline_features,
                                                offline_train_pos_rate)
            LogUtil.log('INFO', 'offline train data generation done')

            # generate validation data set
            offline_valid_pos_rate = float(
                self.config.get('MODEL', 'valid_pos_rate'))
            offline_valid_indexs_fp = '%s/cv_tag%s_n%d_f%d_valid.%s.index' % (
                self.config.get('DIRECTORY',
                                'index_pt'), self.cv_tag, self.cv_num, fold_id,
                self.config.get('MODEL', 'offline_rawset_name'))
            offline_valid_indexs = DataUtil.load_vector(
                offline_valid_indexs_fp, 'int')
            offline_valid_features, offline_valid_labels, offline_valid_balanced_indexs = \
                CrossValidation.__generate_data(offline_valid_indexs,
                                                offline_labels,
                                                offline_features,
                                                offline_valid_pos_rate)
            LogUtil.log('INFO', 'offline valid data generation done')

            # generate test data set
            offline_test_pos_rate = float(
                self.config.get('MODEL', 'test_pos_rate'))
            offline_test_indexs_fp = '%s/cv_tag%s_n%d_f%d_test.%s.index' % (
                self.config.get('DIRECTORY',
                                'index_pt'), self.cv_tag, self.cv_num, fold_id,
                self.config.get('MODEL', 'offline_rawset_name'))
            offline_test_indexs = DataUtil.load_vector(offline_test_indexs_fp,
                                                       'int')
            offline_test_features, offline_test_labels, offline_test_balanced_indexs = \
                CrossValidation.__generate_data(offline_test_indexs,
                                                offline_labels,
                                                offline_features,
                                                offline_test_pos_rate)
            LogUtil.log('INFO', 'offline test data generation done')

            model = Model.new(self.config.get('MODEL', 'model_name'),
                              self.config)
            model_fp = self.config.get('DIRECTORY', 'model_pt') + '/cv_n%d_f%d.%s.model' % \
                                                                  (self.cv_num,
                                                                   fold_id,
                                                                   self.config.get('MODEL', 'model_name'))
            model.save(model_fp)
            offline_train_preds, offline_valid_preds, offline_test_preds = model.fit(
                offline_train_features, offline_train_labels,
                offline_valid_features, offline_valid_labels,
                offline_test_features, offline_test_labels)
            offline_train_score = Evaluator.evaluate(
                self.config.get('MODEL', 'evaluator_name'),
                offline_train_labels, offline_train_preds)
            offline_valid_score = Evaluator.evaluate(
                self.config.get('MODEL', 'evaluator_name'),
                offline_valid_labels, offline_valid_preds)
            offline_test_score = Evaluator.evaluate(
                self.config.get('MODEL', 'evaluator_name'),
                offline_test_labels, offline_test_preds)
            score_fp = '%s/%s.score' % (self.config.get(
                'DIRECTORY', 'score_pt'), 'cv')
            score_file = open(score_fp, 'a')
            score_file.write('fold:%d\ttrain:%s\tvalid:%s\ttest:%s\n' %
                             (fold_id, offline_train_score,
                              offline_valid_score, offline_test_score))
            score_file.close()
            # merge prediction results
            for index in range(len(offline_valid_balanced_indexs)):
                offline_valid_preds_all[offline_valid_balanced_indexs[
                    index]] = offline_valid_preds[index]
            for index in range(len(offline_test_balanced_indexs)):
                offline_test_preds_all[offline_test_balanced_indexs[
                    index]] = offline_test_preds[index]
            LogUtil.log('INFO', 'cross test fold_id(%d) done' % fold_id)
        # save prediction results
        offline_valid_preds_all_fp = '%s/cv_n%d_valid.%s.pred' % (
            self.config.get('DIRECTORY', 'pred_pt'), self.cv_num,
            self.config.get('MODEL', 'offline_rawset_name'))
        DataUtil.save_vector(offline_valid_preds_all_fp,
                             offline_valid_preds_all, 'w')
        offline_test_preds_all_fp = '%s/cv_n%d_test.%s.pred' % (
            self.config.get('DIRECTORY', 'pred_pt'), self.cv_num,
            self.config.get('MODEL', 'offline_rawset_name'))
        DataUtil.save_vector(offline_test_preds_all_fp, offline_test_preds_all,
                             'w')
        # evaluate
        offline_valid_score = Evaluator.evaluate(
            self.config.get('MODEL', 'evaluator_name'), offline_labels,
            offline_valid_preds_all)
        offline_test_score = Evaluator.evaluate(
            self.config.get('MODEL', 'evaluator_name'), offline_labels,
            offline_test_preds_all)
        score_fp = '%s/%s.score' % (self.config.get('DIRECTORY',
                                                    'score_pt'), 'cv')
        score_file = open(score_fp, 'a')
        score_file.write('cross_validation\tvalid:%s\ttest:%s\n' %
                         (offline_valid_score, offline_test_score))
        score_file.close()
 def sample_col(features, indexs):
     features_sampled = features[:, indexs]
     (row_num, col_num) = features_sampled.shape
     LogUtil.log("INFO", "col sample done, shape=(%d,%d)" % (row_num, col_num))
     return features_sampled
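sample_col relies on scipy's CSR fancy indexing (a list of column ids on axis 1); the matching sample_row in the next example does the same on axis 0. For instance:

    from scipy.sparse import csr_matrix

    m = csr_matrix([[1., 2., 3.], [4., 5., 6.]])
    print(m[:, [0, 2]].toarray())  # column sample -> [[1. 3.] [4. 6.]]
    print(m[[1], :].toarray())     # row sample    -> [[4. 5. 6.]]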
Example No. 46
    def run_offline(self):
        # load feature matrix
        offline_features = Feature.load_all(
            self.config.get('DIRECTORY', 'feature_pt'),
            self.config.get('FEATURE', 'feature_selected').split(),
            self.config.get('MODEL', 'offline_rawset_name'),
            self.config.get('FEATURE', 'will_save'))
        # load labels
        offline_labels = DataUtil.load_vector(
            '%s/%s.label' % (self.config.get('DIRECTORY', 'label_pt'),
                             self.config.get('MODEL', 'offline_rawset_name')),
            True)
        # generate index file
        if '' == self.se_tag:
            self.se_tag = self.out_tag
            self.__generate_index(offline_features.shape[0])
        index_pt = self.config.get('DIRECTORY', 'index_pt')
        # generate training data set
        offline_train_pos_rate = float(
            self.config.get('MODEL', 'train_pos_rate'))
        offline_train_indexs_fp = '%s/se_tag%s_train.%s.index' % (
            index_pt, self.se_tag,
            self.config.get('MODEL', 'offline_rawset_name'))
        offline_train_indexs = DataUtil.load_vector(offline_train_indexs_fp,
                                                    'int')
        offline_train_features, offline_train_labels, offline_train_balanced_indexs = \
            SingleExec.__generate_data(offline_train_indexs,
                                       offline_labels,
                                       offline_features,
                                       offline_train_pos_rate)
        LogUtil.log('INFO', 'offline train data generation done')

        # generate validation data set
        offline_valid_pos_rate = float(
            self.config.get('MODEL', 'valid_pos_rate'))
        offline_valid_indexs_fp = '%s/se_tag%s_valid.%s.index' % (
            index_pt, self.se_tag,
            self.config.get('MODEL', 'offline_rawset_name'))
        offline_valid_indexs = DataUtil.load_vector(offline_valid_indexs_fp,
                                                    'int')
        offline_valid_features, offline_valid_labels, offline_valid_balanced_indexs = \
            SingleExec.__generate_data(offline_valid_indexs,
                                       offline_labels,
                                       offline_features,
                                       offline_valid_pos_rate)
        LogUtil.log('INFO', 'offline valid data generation done')

        # generate test data set
        offline_test_pos_rate = float(self.config.get('MODEL',
                                                      'test_pos_rate'))
        offline_test_indexs_fp = '%s/se_tag%s_test.%s.index' % (
            index_pt, self.se_tag,
            self.config.get('MODEL', 'offline_rawset_name'))
        offline_test_indexs = DataUtil.load_vector(offline_test_indexs_fp,
                                                   'int')
        offline_test_features, offline_test_labels, offline_test_balanced_indexs = \
            SingleExec.__generate_data(offline_test_indexs,
                                       offline_labels,
                                       offline_features,
                                       offline_test_pos_rate)
        LogUtil.log('INFO', 'offline test data generation done')

        model = Model.new(self.config.get('MODEL', 'model_name'), self.config)
        model_fp = self.config.get(
            'DIRECTORY', 'model_pt') + '/se.%s.model' % self.config.get(
                'MODEL', 'model_name')
        model.save(model_fp)
        offline_train_preds, offline_valid_preds, offline_test_preds = model.fit(
            offline_train_features, offline_train_labels,
            offline_valid_features, offline_valid_labels,
            offline_test_features, offline_test_labels)
        offline_train_score = Evaluator.evaluate(
            self.config.get('MODEL', 'evaluator_name'), offline_train_labels,
            offline_train_preds)
        offline_valid_score = Evaluator.evaluate(
            self.config.get('MODEL', 'evaluator_name'), offline_valid_labels,
            offline_valid_preds)
        offline_test_score = Evaluator.evaluate(
            self.config.get('MODEL', 'evaluator_name'), offline_test_labels,
            offline_test_preds)
        score_fp = '%s/%s.score' % (self.config.get('DIRECTORY',
                                                    'score_pt'), 'cv')
        score_file = open(score_fp, 'a')
        score_file.write(
            'single_exec\ttrain:%s\tvalid:%s\ttest:%s\n' %
            (offline_train_score, offline_valid_score, offline_test_score))
        score_file.close()
        # save prediction results
        offline_valid_preds_fp = '%s/se_valid.%s.pred' % (self.config.get(
            'DIRECTORY',
            'pred_pt'), self.config.get('MODEL', 'offline_rawset_name'))
        DataUtil.save_vector(offline_valid_preds_fp, offline_valid_preds, 'w')
        offline_test_preds_fp = '%s/se_test.%s.pred' % (self.config.get(
            'DIRECTORY',
            'pred_pt'), self.config.get('MODEL', 'offline_rawset_name'))
        DataUtil.save_vector(offline_test_preds_fp, offline_test_preds, 'w')
 def sample_row(features, indexs):
     features_sampled = features[indexs, :]
     (row_num, col_num) = features_sampled.shape
     LogUtil.log("INFO", "row sample done, shape=(%d,%d)" % (row_num, col_num))
     return features_sampled
    def run_offline(self):
        # load feature matrix
        offline_features = Feature.load_all(self.config.get('DIRECTORY', 'feature_pt'),
                                            self.config.get('FEATURE', 'feature_selected').split(),
                                            self.config.get('MODEL', 'offline_rawset_name'),
                                            self.config.get('FEATURE', 'will_save'))
        # load labels
        offline_labels = DataUtil.load_vector('%s/%s.label' % (self.config.get('DIRECTORY', 'label_pt'),
                                                               self.config.get('MODEL', 'offline_rawset_name')),
                                              True)
        # generate index file
        if '' == self.se_tag:
            self.se_tag = self.out_tag
            self.__generate_index(offline_features.shape[0])
        index_pt = self.config.get('DIRECTORY', 'index_pt')
        # generate training data set
        offline_train_pos_rate = float(self.config.get('MODEL', 'train_pos_rate'))
        offline_train_indexs_fp = '%s/se_tag%s_train.%s.index' % (index_pt,
                                                                  self.se_tag,
                                                                  self.config.get('MODEL', 'offline_rawset_name'))
        offline_train_indexs = DataUtil.load_vector(offline_train_indexs_fp, 'int')
        offline_train_features, offline_train_labels, offline_train_balanced_indexs = \
            SingleExec.__generate_data(offline_train_indexs,
                                       offline_labels,
                                       offline_features,
                                       offline_train_pos_rate)
        LogUtil.log('INFO', 'offline train data generation done')

        # generate validation data set
        offline_valid_pos_rate = float(self.config.get('MODEL', 'valid_pos_rate'))
        offline_valid_indexs_fp = '%s/se_tag%s_valid.%s.index' % (index_pt,
                                                                  self.se_tag,
                                                                  self.config.get('MODEL', 'offline_rawset_name'))
        offline_valid_indexs = DataUtil.load_vector(offline_valid_indexs_fp, 'int')
        offline_valid_features, offline_valid_labels, offline_valid_balanced_indexs = \
            SingleExec.__generate_data(offline_valid_indexs,
                                       offline_labels,
                                       offline_features,
                                       offline_valid_pos_rate)
        LogUtil.log('INFO', 'offline valid data generation done')

        # generate test data set
        offline_test_pos_rate = float(self.config.get('MODEL', 'test_pos_rate'))
        offline_test_indexs_fp = '%s/se_tag%s_test.%s.index' % (index_pt,
                                                                self.se_tag,
                                                                self.config.get('MODEL', 'offline_rawset_name'))
        offline_test_indexs = DataUtil.load_vector(offline_test_indexs_fp, 'int')
        offline_test_features, offline_test_labels, offline_test_balanced_indexs = \
            SingleExec.__generate_data(offline_test_indexs,
                                       offline_labels,
                                       offline_features,
                                       offline_test_pos_rate)
        LogUtil.log('INFO', 'offline test data generation done')

        model = Model.new(self.config.get('MODEL', 'model_name'), self.config)
        model_fp = self.config.get('DIRECTORY', 'model_pt') + '/se.%s.model' % self.config.get('MODEL', 'model_name')
        model.save(model_fp)
        offline_train_preds, offline_valid_preds, offline_test_preds = model.fit(offline_train_features,
                                                                                 offline_train_labels,
                                                                                 offline_valid_features,
                                                                                 offline_valid_labels,
                                                                                 offline_test_features,
                                                                                 offline_test_labels)
        offline_train_score = Evaluator.evaluate(self.config.get('MODEL', 'evaluator_name'),
                                                 offline_train_labels,
                                                 offline_train_preds)
        offline_valid_score = Evaluator.evaluate(self.config.get('MODEL', 'evaluator_name'),
                                                 offline_valid_labels,
                                                 offline_valid_preds)
        offline_test_score = Evaluator.evaluate(self.config.get('MODEL', 'evaluator_name'),
                                                offline_test_labels,
                                                offline_test_preds)
        score_fp = '%s/%s.score' % (self.config.get('DIRECTORY', 'score_pt'), 'cv')
        score_file = open(score_fp, 'a')
        score_file.write('single_exec\ttrain:%s\tvalid:%s\ttest:%s\n' % (offline_train_score,
                                                                         offline_valid_score,
                                                                         offline_test_score))
        score_file.close()
        # save prediction results
        offline_valid_preds_fp = '%s/se_valid.%s.pred' % (self.config.get('DIRECTORY', 'pred_pt'),
                                                          self.config.get('MODEL', 'offline_rawset_name'))
        DataUtil.save_vector(offline_valid_preds_fp, offline_valid_preds, 'w')
        offline_test_preds_fp = '%s/se_test.%s.pred' % (self.config.get('DIRECTORY', 'pred_pt'),
                                                        self.config.get('MODEL', 'offline_rawset_name'))
        DataUtil.save_vector(offline_test_preds_fp, offline_test_preds, 'w')

    def run_offline(self):
        LogUtil.log('INFO', 'cv_tag(%s)' % self.cv_tag)
        # load feature matrix
        offline_features = Feature.load_all(self.config.get('DIRECTORY', 'feature_pt'),
                                            self.config.get('FEATURE', 'feature_selected').split(),
                                            self.config.get('MODEL', 'offline_rawset_name'),
                                            self.config.get('FEATURE', 'will_save'))
        # load labels
        offline_labels = DataUtil.load_vector('%s/%s.label' % (self.config.get('DIRECTORY', 'label_pt'),
                                                               self.config.get('MODEL', 'offline_rawset_name')),
                                              True)
        # generate index file
        if '' == self.cv_tag:
            self.cv_tag = self.out_tag
            self.__generate_index(offline_features.shape[0])
        # cross validation
        offline_valid_preds_all = [0.] * offline_features.shape[0]
        offline_test_preds_all = [0.] * offline_features.shape[0]
        for fold_id in range(self.cv_num):
            LogUtil.log('INFO', 'cross validation fold_id(%d) begin' % fold_id)

            # generate training data set
            offline_train_pos_rate = float(self.config.get('MODEL', 'train_pos_rate'))
            offline_train_indexs_fp = '%s/cv_tag%s_n%d_f%d_train.%s.index' % (self.config.get('DIRECTORY', 'index_pt'),
                                                                              self.cv_tag,
                                                                              self.cv_num,
                                                                              fold_id,
                                                                              self.config.get('MODEL',
                                                                                              'offline_rawset_name'))
            offline_train_indexs = DataUtil.load_vector(offline_train_indexs_fp, 'int')
            offline_train_features, offline_train_labels, offline_train_balanced_indexs = \
                CrossValidation.__generate_data(offline_train_indexs,
                                                offline_labels,
                                                offline_features,
                                                offline_train_pos_rate)
            LogUtil.log('INFO', 'offline train data generation done')

            # generate validation data set
            offline_valid_pos_rate = float(self.config.get('MODEL', 'valid_pos_rate'))
            offline_valid_indexs_fp = '%s/cv_tag%s_n%d_f%d_valid.%s.index' % (self.config.get('DIRECTORY', 'index_pt'),
                                                                              self.cv_tag,
                                                                              self.cv_num,
                                                                              fold_id,
                                                                              self.config.get('MODEL',
                                                                                              'offline_rawset_name'))
            offline_valid_indexs = DataUtil.load_vector(offline_valid_indexs_fp, 'int')
            offline_valid_features, offline_valid_labels, offline_valid_balanced_indexs = \
                CrossValidation.__generate_data(offline_valid_indexs,
                                                offline_labels,
                                                offline_features,
                                                offline_valid_pos_rate)
            LogUtil.log('INFO', 'offline valid data generation done')

            # generate test data set
            offline_test_pos_rate = float(self.config.get('MODEL', 'test_pos_rate'))
            offline_test_indexs_fp = '%s/cv_tag%s_n%d_f%d_test.%s.index' % (self.config.get('DIRECTORY', 'index_pt'),
                                                                            self.cv_tag,
                                                                            self.cv_num,
                                                                            fold_id,
                                                                            self.config.get('MODEL',
                                                                                            'offline_rawset_name'))
            offline_test_indexs = DataUtil.load_vector(offline_test_indexs_fp, 'int')
            offline_test_features, offline_test_labels, offline_test_balanced_indexs = \
                CrossValidation.__generate_data(offline_test_indexs,
                                                offline_labels,
                                                offline_features,
                                                offline_test_pos_rate)
            LogUtil.log('INFO', 'offline test data generation done')

            model = Model.new(self.config.get('MODEL', 'model_name'), self.config)
            model_fp = self.config.get('DIRECTORY', 'model_pt') + '/cv_n%d_f%d.%s.model' % \
                                                                  (self.cv_num,
                                                                   fold_id,
                                                                   self.config.get('MODEL', 'model_name'))
            model.save(model_fp)
            offline_train_preds, offline_valid_preds, offline_test_preds = model.fit(offline_train_features,
                                                                                     offline_train_labels,
                                                                                     offline_valid_features,
                                                                                     offline_valid_labels,
                                                                                     offline_test_features,
                                                                                     offline_test_labels)
            offline_train_score = Evaluator.evaluate(self.config.get('MODEL', 'evaluator_name'),
                                                     offline_train_labels,
                                                     offline_train_preds)
            offline_valid_score = Evaluator.evaluate(self.config.get('MODEL', 'evaluator_name'),
                                                     offline_valid_labels,
                                                     offline_valid_preds)
            offline_test_score = Evaluator.evaluate(self.config.get('MODEL', 'evaluator_name'),
                                                    offline_test_labels,
                                                    offline_test_preds)
            score_fp = '%s/%s.score' % (self.config.get('DIRECTORY', 'score_pt'), 'cv')
            score_file = open(score_fp, 'a')
            score_file.write('fold:%d\ttrain:%s\tvalid:%s\ttest:%s\n' % (fold_id,
                                                                         offline_train_score,
                                                                         offline_valid_score,
                                                                         offline_test_score))
            score_file.close()
            # merge prediction results
            for index in range(len(offline_valid_balanced_indexs)):
                offline_valid_preds_all[offline_valid_balanced_indexs[index]] = offline_valid_preds[index]
            for index in range(len(offline_test_balanced_indexs)):
                offline_test_preds_all[offline_test_balanced_indexs[index]] = offline_test_preds[index]
            LogUtil.log('INFO', 'cross validation fold_id(%d) done' % fold_id)
        # save prediction results
        offline_valid_preds_all_fp = '%s/cv_n%d_valid.%s.pred' % (self.config.get('DIRECTORY', 'pred_pt'),
                                                                  self.cv_num,
                                                                  self.config.get('MODEL', 'offline_rawset_name'))
        DataUtil.save_vector(offline_valid_preds_all_fp, offline_valid_preds_all, 'w')
        offline_test_preds_all_fp = '%s/cv_n%d_test.%s.pred' % (self.config.get('DIRECTORY', 'pred_pt'),
                                                                self.cv_num,
                                                                self.config.get('MODEL', 'offline_rawset_name'))
        DataUtil.save_vector(offline_test_preds_all_fp, offline_test_preds_all, 'w')
        # evaluate
        offline_valid_score = Evaluator.evaluate(self.config.get('MODEL', 'evaluator_name'),
                                                 offline_labels,
                                                 offline_valid_preds_all)
        offline_test_score = Evaluator.evaluate(self.config.get('MODEL', 'evaluator_name'),
                                                offline_labels,
                                                offline_test_preds_all)
        score_fp = '%s/%s.score' % (self.config.get('DIRECTORY', 'score_pt'), 'cv')
        score_file = open(score_fp, 'a')
        score_file.write('cross_validation\tvalid:%s\ttest:%s\n' % (offline_valid_score, offline_test_score))
        score_file.close()
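
The balancing helper __generate_data, called through both SingleExec and CrossValidation above, is not part of this excerpt. The sketch below is only a guess at its contract, assuming it trims the negative rows so the sample hits the requested positive rate and returns the sliced features, the matching labels, and the row indexes it kept; the body, including the down-sampling rule, is an assumption, not the author's code:

    @staticmethod
    def __generate_data(indexs, labels, features, pos_rate):
        # hypothetical sketch: split the candidate rows by label, then trim
        # the negatives so positives make up roughly pos_rate of the sample
        pos = [i for i in indexs if 1 == int(labels[i])]
        neg = [i for i in indexs if 1 != int(labels[i])]
        if 0. < pos_rate < 1.:
            neg = neg[:int(len(pos) * (1. - pos_rate) / pos_rate)]
        balanced_indexs = pos + neg
        balanced_features = features[balanced_indexs, :]  # csr row slicing
        balanced_labels = [labels[i] for i in balanced_indexs]
        return balanced_features, balanced_labels, balanced_indexs

Whatever the real implementation does, the returned balanced_indexs are what run_offline uses to scatter each fold's predictions back into offline_valid_preds_all and offline_test_preds_all.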
Ejemplo n.º 50
0
 def generate_pagerank(G, alpha, max_iter):
     # compute a PageRank score for every node of the question graph
     pr = nx.pagerank(G, alpha=alpha, max_iter=max_iter)
     LogUtil.log('INFO', 'graph PageRank calculation done')
     return pr
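
For context, a minimal stand-alone sketch of the same PageRank call, assuming networkx is installed; the toy graph and parameter values are illustrative only, and LogUtil is dropped so the snippet runs on its own:

    import networkx as nx

    G = nx.Graph()
    G.add_edge(0, 1, weight=0.9)
    G.add_edge(1, 2, weight=0.1)
    # the same call generate_pagerank wraps above
    pr = nx.pagerank(G, alpha=0.85, max_iter=100)
    print(pr)  # {node_id: PageRank score}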
Ejemplo n.º 51
0
    def generate_graph(config, weight_feature_name, weight_feature_id,
                       reverse):
        q2id = {}      # question text -> node id
        e2weight = {}  # (node id, node id) -> edge weight, stored both ways
        G = nx.Graph()

        train_wfs_fs = None
        test_wfs_fs = None
        if weight_feature_name is not None:
            train_wfs_fs = Feature.load(
                '%s/%s.train.smat' %
                (config.get('DIRECTORY', 'feature_question_pair_pt'),
                 weight_feature_name)).toarray()
            test_wfs_fs = Feature.load(
                '%s/%s.test.smat' %
                (config.get('DIRECTORY', 'feature_question_pair_pt'),
                 weight_feature_name)).toarray()
            if 'True' == reverse:
                LogUtil.log('INFO', 'will reverse edge weights (w -> 1 - w)')
                for index in range(len(train_wfs_fs)):
                    train_wfs_fs[index][weight_feature_id] = 1. - train_wfs_fs[
                        index][weight_feature_id]
                for index in range(len(test_wfs_fs)):
                    test_wfs_fs[index][weight_feature_id] = 1. - test_wfs_fs[
                        index][weight_feature_id]

        fin = csv.reader(
            open('%s/train.csv' % config.get('DIRECTORY', 'origin_pt')))
        next(fin)  # skip the CSV header row
        index = 0
        for p in fin:
            q1 = str(p[3]).strip()
            q2 = str(p[4]).strip()
            weight = 0 if train_wfs_fs is None else train_wfs_fs[index][
                weight_feature_id]
            if q1 not in q2id:
                q2id[q1] = len(q2id)
            if q2 not in q2id:
                q2id[q2] = len(q2id)
            G.add_edge(q2id[q1], q2id[q2], weight=weight)
            e2weight[(q2id[q1], q2id[q2])] = weight
            e2weight[(q2id[q2], q2id[q1])] = weight
            index += 1

        fin = csv.reader(
            open('%s/test.csv' % config.get('DIRECTORY', 'origin_pt')))
        next(fin)  # skip the CSV header row
        index = 0
        for p in fin:
            q1 = str(p[1]).strip()
            q2 = str(p[2]).strip()
            weight = 0 if test_wfs_fs is None else test_wfs_fs[index][
                weight_feature_id]
            if q1 not in q2id:
                q2id[q1] = len(q2id)
            if q2 not in q2id:
                q2id[q2] = len(q2id)
            G.add_edge(q2id[q1], q2id[q2], weight=weight)
            e2weight[(q2id[q1], q2id[q2])] = weight
            e2weight[(q2id[q2], q2id[q1])] = weight
            index += 1
        LogUtil.log('INFO', 'Graph constructed.')

        return q2id, e2weight, G
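
A hedged sketch of how generate_graph might be invoked, assuming a Python 2 ConfigParser-style config like the one queried above; the config path and the commented-out weight feature name are hypothetical:

    import ConfigParser  # Python 2 module name, matching the code above

    config = ConfigParser.ConfigParser()
    config.read('conf/featwheel.conf')  # hypothetical config path
    # unweighted graph: no weight feature, so every edge gets weight 0
    q2id, e2weight, G = generate_graph(config, None, 0, 'False')
    # weighted graph from a pre-computed pair feature, weights flipped:
    # q2id, e2weight, G = generate_graph(config, 'my_pair_feature', 0, 'True')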
Ejemplo n.º 52
0
 def generate_hits(G, max_iter):
     # compute HITS hub and authority scores for every node of the graph
     hits_h, hits_a = nx.hits(G, max_iter=max_iter)
     LogUtil.log('INFO', 'graph HITS calculation done')
     return hits_h, hits_a
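
And the matching stand-alone sketch for HITS; the toy graph is illustrative, and nx.hits returns one hub-score dict and one authority-score dict, both keyed by node:

    import networkx as nx

    G = nx.Graph()
    G.add_edge(0, 1)
    G.add_edge(1, 2)
    # the same call generate_hits wraps above
    hits_h, hits_a = nx.hits(G, max_iter=100)
    print(hits_h)  # {node_id: hub score}
    print(hits_a)  # {node_id: authority score}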