Example #1
    def __cal_matrix(self, file_name=""):
        log.info("extract feature from pre-defined setting!")
        if self.__feature_method == "QD":
            paper_len = len(self.__paper)
            matrix_l = np.zeros([paper_len, paper_len])
            for i in range(paper_len):
                for j in range(paper_len):
                    # print "element %d, %d" % (i, j)
                    num = self.__cal_matrix_element(i, j)
                    matrix_l[i][j] = num
                    matrix_l[j][i] = num
        elif self.__feature_method == "DM":
            #            if file_name == "":
            #                log.error("file name is empty, please check!")
            #                return []
            #            file_path = os.path.join("./data/word2vec/remove_stop/%s.vec" % file_name)
            file_path = self.__child_path + "word_segment.vec"
            matrix_l = self.__get_doc2vec_matrix(file_path)
        else:
            log.error("self.__feature_method is " + self.__feature_method)
            return []
#        matrix_l = self.__feature_normalization(matrix_l)
        if self.summary_method == "hDPP":
            self.__doc_matrix_ = matrix_l
        return matrix_l
Example #2
 def __quality_initial_coverage(self):
     """
     initial quality list use sentence coverage feature
     :return: null
     """
     if self.feature_merge.split("-")[3] == "0":
         return
     log.info("quality calculation: sentence coverage")
     if self.quality_method__ == "":
         self.quality_method__ += "cov"
     else:
         self.quality_method__ += "-cov"
     tmp_quality = np.zeros([len(self.__paper)])
     sen_num = len(self.__paper)
     union_paper = " ".join(
         [" ".join(set(sen.split(" "))) for sen in self.__paper]).split(" ")
     for i in range(sen_num):
         word_list = self.__paper[i].split(" ")
         word_in_sen = [
             union_paper.count(cur_word) / float(sen_num)
             for cur_word in word_list
         ]
         tmp_quality[i] = np.sum(word_in_sen) / len(word_list)
     tmp_quality = self.__feature_normalization(tmp_quality)
     self.__quality += tmp_quality * float(self.feature_merge.split("-")[3])
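The coverage score above rates each sentence by how often its words appear across the other sentences of the document. A minimal standalone sketch of the same idea, with hypothetical toy sentences that are not from the original project:

    import numpy as np

    def coverage_quality(sentences):
        """Score each sentence by the average document frequency of its words."""
        sen_num = len(sentences)
        # count each word at most once per sentence, as in the snippet above
        union_words = " ".join(" ".join(set(s.split(" "))) for s in sentences).split(" ")
        quality = np.zeros(sen_num)
        for i, sen in enumerate(sentences):
            words = sen.split(" ")
            per_word = [union_words.count(w) / float(sen_num) for w in words]
            quality[i] = np.sum(per_word) / len(words)
        return quality

    print(coverage_quality(["the cat sat", "the dog ran", "a cat ran fast"]))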
Example #3
 def launch_multiling_single_summary(self, dic_path):
     self.__rouge_path = ini_rouge_data(name_suffix=self.feature_merge +
                                        "-" + "-" + self.summary_method)
     path_dir = os.listdir(dic_path)
     for cur_lang in path_dir:
         if cur_lang not in ["zh", "en"]:
             continue
         lang_dir = os.path.join("%s/%s" % (dic_path, cur_lang))
         self.__all_conf = []
         # get target length of current language
         self.__target_len = dict()
         for line in read_file(self.__target_len_dir + cur_lang + ".txt"):
             self.__target_len[line.split("_")[0]] = int(line.split(",")[1])
         # get summary of current file(cur_file)
         for cur_file in os.listdir(lang_dir):
             self.max_sum_len__ = self.__target_len[cur_file]
             child_path = os.path.join('%s/%s/%s/' %
                                       (dic_path, cur_lang, cur_file))
             self.__child_path = child_path
             log.info(child_path)
             self.get_mss_paper_summary(cur_lang, cur_file)
         # write_file(self.__all_conf, self.__rouge_path + cur_lang + "/configure/.configure_all_" + cur_lang + ".txt", False)
         # if not os.path.exists(self.__rouge_path + cur_lang + "/output"):
         #     os.makedirs(self.__rouge_path + cur_lang + "/output")
     return self.__rouge_path
Example #4
    def __initial(self, start_idx):
        #        doc2vec_file = open("data/input_data/qualityForTrain.vec", "r")
        doc2vec_file = open("data/input_data/qualityForTrain.vec", "r")
        i = 0
        for vector in doc2vec_file.readlines():
            if i < start_idx:
                i += 1
                continue
            list_vector = vector.strip().split(" ")
            self.__f_x_test[i - start_idx][:] = np.array(list_vector[:])
            i += 1
            if i - start_idx >= self.__data_size:
                break
        doc2vec_file.close()

        file = open("./data/input_data/taskAAForDoc2VecTrainData.txt", "r")
        #        file = open("./data/input_data_test/all_test_data.txt", "r")
        datas = [
            sentence.strip().decode("utf-8").split(" ")
            for sentence in file.readlines()
        ]
        self.__test_data = [
            datas[i + start_idx] for i in range(self.__data_size)
        ]
        log.info(len(self.__test_data))
        #        log.info(self.__test_data)
        file.close()
        file = open("./data/input_data/taskAAForDoc2VecTrainLabel.txt", "r")
        datas = [sentence.strip() for sentence in file.readlines()]
        self.__test_label = [
            datas[i + start_idx] for i in range(self.__data_size)
        ]
        file.close()
Example #5
 def __feature_normalization(tmp_array):
     log.info("feature normalization: sigmoid")
     # sigmoid
     if isinstance(tmp_array, list):
         return (np.array(tmp_array) / np.max(tmp_array)).tolist()
         # return [np.exp(-1.0 * x) for x in tmp_array]
     else:
         return tmp_array / np.max(tmp_array)
Example #6
 def __similarity_calculating(self, idx_i, idx_j):
     if idx_i + idx_j <= 0:
         log.info("distance calculation: JACCARD")
         self.distance_method__ = "jaccard"
     inter_ = set(self.__paper[idx_i].split(" ")).intersection(
         self.__paper[idx_j].split(" "))
     union_ = set(self.__paper[idx_i].split(" ")).union(
         self.__paper[idx_j].split(" "))
     return float(len(inter_)) / float(len(union_))
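The value returned here is the standard Jaccard coefficient between the two sentences' word sets. A self-contained sketch of that computation (the toy sentences are made up for illustration):

    def jaccard(sent_a, sent_b):
        """Jaccard similarity between the word sets of two whitespace-tokenized sentences."""
        set_a, set_b = set(sent_a.split(" ")), set(sent_b.split(" "))
        return float(len(set_a & set_b)) / float(len(set_a | set_b))

    # 2 shared words out of 4 distinct words -> 0.5
    print(jaccard("the cat sat", "the cat ran"))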
Example #7
 def __get_doc2vec_matrix(self, path):
     log.info('use word2vec')
     self.quality_method__ = "word2vec"
     self.distance_method__ = "100"
     word2vec_matrix = read_file(path)
     word2vec_matrix = word2vec_matrix[2:len(word2vec_matrix) - 1]
     self.__key_word = [vec_.split(u" ")[0] for vec_ in word2vec_matrix]
     log.debug("word2vec key words: \n" + "\t".join(self.__key_word))
     word2vec = np.array([(vec_.encode("utf-8")).split(" ")[1:]
                          for vec_ in word2vec_matrix])
     word2vec = word2vec.astype(np.float64)
     return word2vec.dot(word2vec.transpose()) * 1000
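The method drops the first two lines and the last line of the .vec file, parses each remaining line as "word v1 v2 ...", and builds the kernel as the scaled Gram matrix of the word vectors. A minimal sketch of that last step, with made-up 2-dimensional vectors in place of the parsed file:

    import numpy as np

    # hypothetical word vectors; in the snippet above they are parsed from word_segment.vec
    key_words = ["alpha", "beta", "gamma"]
    vectors = np.array([[0.1, 0.3],
                        [0.2, 0.1],
                        [0.4, 0.2]], dtype=np.float64)

    # L = V * V^T scaled by 1000, as in __get_doc2vec_matrix
    matrix_l = vectors.dot(vectors.T) * 1000
    print(matrix_l.shape)  # (3, 3)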
Example #8
    def __learning_single_attitude(self, attitude):
        """
        learning parameters __parameter_theta, which is the parameters of quality model
        :param input_x: input data of X
        :param label_y: label y
        :return: L-matrix
        """
        log.info('learning: ' + attitude)
        self.__parameter_theta = np.random.random(size=self.__feature_size)
        self.__sample = sp.DppSampling(100, self.__feature_size)
        self.__initiate_f_x()
        grad = self.calculate_gradient(attitude)
        best_f = 0.0
        iter_count = 0
        log.debug("grad")
        log.info(grad)
        while (not self.__whether_end(grad)) and iter_count < 1000:
            log.info("interation: " + str(iter_count))
            new_f, ignore, ignore_value = self.__sample.sampling(self.__parameter_theta, attitude)
            if new_f > best_f:
                best_f = new_f
                self.__final_parameter[attitude] = copy.deepcopy(self.__parameter_theta)
            log.debug("grad")
            log.info(grad)
            log.debug("parameter")
            log.debug(self.__parameter_theta)
            self.__parameter_theta = self.__parameter_theta - self.__step * grad
            grad = self.calculate_gradient(attitude)
            iter_count += 1

        self.__tmp_eigenvalue[attitude] = copy.deepcopy(self.__sample.get_best_eigenvalue())
        self.__tmp_answer[attitude] = copy.deepcopy(self.__sample.get_best_answer())
Example #9
    def __quality_initial_position(self):
        if self.feature_merge.split("-")[0] == "0":
            return
        log.info("quality calculation: position")
        if self.quality_method__ == "":
            self.quality_method__ += "pos"
        else:
            self.quality_method__ += "-pos"

        tmp_quality = np.zeros([len(self.__paper)])
        for i in range(len(self.__paper)):
            tmp_quality[i] = 1.0 - float(i) / float(len(self.__paper))
        # tmp_quality = self.__feature_normalization(tmp_quality)
        self.__quality += tmp_quality * float(self.feature_merge.split("-")[0])
Example #10
 def __quality_initial_level(self):
     if self.feature_merge.split("-")[4] == "0":
         return
     log.info("quality calculation: hLDA level")
     log.info("get level score: " + self.__child_path)
     if self.quality_method__ == "":
         self.quality_method__ += "lev"
     else:
         self.quality_method__ += "-lev"
     if self.__level_tmp is None:
         self.__level_tmp = LevelScore()
     tmp_level = self.__level_tmp.get_paper_level_score(self.__child_path)
     tmp_level = self.__feature_normalization(tmp_level)
     self.__quality += (np.array(tmp_level) *
                        float(self.feature_merge.split("-")[4]))
Example #11
 def __calculate_similarity(self, idx_i, idx_j):
     '''
     if self.__similarity is None:
         log.info("initiating similarity")
         self.__similarity = self.__f_x_test.dot(
             self.__f_x_test.transpose())
     return self.__similarity[idx_i][idx_j]
     '''
     if self.__similarity is None:
         log.info("initiating similarity")
         self.__similarity = np.zeros([self.__data_size, self.__data_size])
         for i in range(self.__data_size):
             for j in range(i, self.__data_size):
                 self.__similarity[i][j] = self.__similarity__method(i, j)
                 self.__similarity[j][i] = self.__similarity[i][j]
     return self.__similarity[idx_i][idx_j]
Example #12
 def launch_multiling_single_summary(self, dic_path):
     # MMS-2015
     #test = PreProcessing()
     #self.__rouge_path = test.ini_rouge_data(name_suffix=self.feature_merge)
     print dic_path
     path_dir = os.listdir(dic_path)
     print path_dir
     for cur_cluster in path_dir:
         print cur_cluster
         cluster_dir = os.path.join("%s/%s/" % (dic_path, cur_cluster))
         self.__all_conf = []
         # get target length of current language
         # get summary of current file(cur_file)
         self.__child_path = cluster_dir
         log.info(cluster_dir)
         self.get_mss_paper_summary(cur_cluster)
     return ""
Example #13
    def __init__(self, data_size, feature_size):
        log.info("dpp learning module")
        self.__data_size = data_size
        self.__feature_size = feature_size
        self.__train_label = []
        self.__train_data = []
#        self.__parameter_theta = np.zeros(feature_size)
        self.__parameter_theta = None
        self.__matrix_l = np.zeros([data_size, data_size])
        self.__similarity = None
        self.__f_x = np.zeros([data_size, self.__feature_size])
        self.__label = np.zeros(data_size)
        self.__step = 0.01
        self.__tmp_answer = dict()
        self.__final_parameter = dict()
        self.__tmp_eigenvalue = dict()
        self.__sample = None
Example #14
    def get_mss_paper_summary(self, file_name, if_write_file=True):
        """
        generate summary for one paper, single document summarization
        :param lang:
        :param file_name: current file name, used for write summary answer
        :param if_write_file: whether write generated summary to answer file named file_name
        :return:
        """
        # initial
        self.__quality, self.__paper_name = None, file_name
        self.quality_method__ = ""

        if self.stop_word_method == "remove_stop":
            self.__paper = read_file(self.__child_path + "RemoveStop.temp")
        elif self.stop_word_method == "with_stop":
            self.__paper = read_file(self.__child_path + "word_segment.temp")
        self.__titles = read_file(self.__child_path + "titles.temp")
        self.__paper_original = read_file(self.__child_path +
                                          "word_segment.temp")
        self.__sub_paper_len = [
            int(i) for i in read_file(self.__child_path + "sec_idx.temp")
        ]

        # extract sentence
        feature_subset, eig = self.__cal_candidate_set()
        #print len(feature_subset)
        #        feature_subset = range(len(self.__paper_original))
        #        eig = []
        log.info(feature_subset)
        log.debug(eig)
        # use feature list to extract summary
        summary = self.__construct_summary(feature_subset, eig)

        if if_write_file:
            if file_name == '':
                log.error("file name is empty")
                return ""
            # write answer to file for ROUGE
            #print self.__rouge_path
            answer_path = self.__child_path
            write_file(
                summary,
                os.path.join('%s/%s.txt' %
                             (answer_path, file_name + '_result')), False)

        return "".join(summary)
Example #15
    def __initiate_f_x(self):
        log.info("initial f(x)")
        doc2vec_file = open("data/input_data/qualityForTrain.vec", "r")
        i = 0
        for vector in doc2vec_file.readlines():
            list_vector = vector.strip().split(" ")
            self.__f_x[i][:] = np.array(list_vector[:])
            i += 1
            if i >= self.__data_size:
                break
        doc2vec_file.close()

        sentence_file = open("data/input_data/taskAAForDoc2VecTrainData.txt")
        self.__train_data = [sentence.strip().decode("utf-8").split(" ") for sentence in sentence_file.readlines()]
        sentence_file.close()
        train_label_file = open("data/input_data/taskAAForDoc2VecTrainLabel.txt")
        self.__train_label = [sentence.strip() for sentence in train_label_file.readlines()]
        train_label_file.close()
Example #16
 def __quality_initial_length(self):
     if self.feature_merge.split("-")[1] == "0":
         return
     log.info("quality calculation: length")
     if self.quality_method__ == "":
         self.quality_method__ += "len"
     else:
         self.quality_method__ += "-len"
     tmp_quality = np.zeros([len(self.__paper)])
     for i in range(len(self.__paper)):
         tmp_quality[i] = len(self.__paper[i].replace(" ", ""))
     mean = tmp_quality.sum() / float(len(self.__paper))
     var = np.cov(tmp_quality)
     for i in range(len(self.__paper)):
         tmp_quality[i] = np.exp(
             (-1 * np.square(tmp_quality[i] - mean)) / var)
     # tmp_quality = self.__feature_normalization(tmp_quality)
     self.__quality += tmp_quality * float(self.feature_merge.split("-")[1])
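The transformation above is a Gaussian-shaped length score: with l_i the character length of sentence i (spaces removed), mean the average length, and var the variance returned by np.cov on the 1-D length array, each score becomes exp(-(l_i - mean)^2 / var). Sentences of near-average length therefore score close to 1 (a sentence whose length equals the mean gets exp(0) = 1), while unusually short or long sentences are pushed toward 0.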
Example #17
 def __cal_candidate_set(self):
     matrix_l = self.__cal_matrix()
     subset_ = []
     eigenvalue = []
     try:
         if self.candidate_method == "DR":
             subset_, eigenvalue = ds.sample(matrix_l)
         elif self.candidate_method == "CLU-DPP":
             cluster = hlda_analysis.sentence_cluster(
                 self.__child_path, "run000")
             # debug hLDA info: total number of clusters and the sentences in each cluster
             i = 0
             tmp = ""
             log.info("cluster number: " + str(len(cluster)))
             for sen_list in cluster:
                 tmp += "\n cluster: " + str(
                     i) + "\tsentence_num is " + str(len(sen_list)) + "\n"
                 tmp += "\n".join(np.array(self.__paper_original)[sen_list])
                 i += 1
             log.debug(tmp)
             # begin calculate and get sentence
             for i in range(len(cluster) / 2):
                 sen_list = cluster[i]
                 tmp_matrix = matrix_l[sen_list][:, sen_list]
                 tmp_set, eig = ds.sample(tmp_matrix)
                 if len(sen_list) < 10:
                     subset_.append(sen_list)
                     eigenvalue.append(eig)
                     continue
                 subset_.append(np.array(sen_list)[tmp_set].tolist())
                 eigenvalue.append(np.array(eig)[tmp_set].tolist())
         elif self.candidate_method == "RANDOM":
             for i in range(20):
                 subset_.append(
                     np.random.randint(0, len(self.__paper_original)))
         else:
             raise RuntimeError("value error: " + self.candidate_method)
     except RuntimeError as e:
         log.error(e)
     finally:
         return subset_, eigenvalue
Example #18
    def __similarity_calculating_wordnet(self, idx_i, idx_j):
        if idx_i + idx_j <= 0:
            log.info("distance calculation: WordNet")
            self.distance_method__ = "WordNet"
        list1 = self.__paper[idx_i].split()
        list2 = self.__paper[idx_j].split()
        # list1 = ['RAM','keeps','things','being','worked','with']
        # list2 = ['The','CPU','uses','RAM','as','a','short-term','memory','store']
        vec = []
        bylist = [list1, list2]
        totallist = list(set(list1 + list2))
        for i in range(len(bylist)):
            vector = np.zeros(len(totallist))
            for j in range(len(totallist)):
                if totallist[j] in bylist[i]:
                    vector[j] = 1
                    continue
                tmp_vec = []
                synsets1 = wn.synsets(totallist[j])

                if len(synsets1) == 0:
                    vector[j] = 0.0
                    continue
                for word in bylist[i]:
                    synsets2 = wn.synsets(word)
                    if len(synsets2) == 0:
                        continue
                    tmp_score = synsets1[0].path_similarity(synsets2[0])
                    if tmp_score is not None:
                        tmp_vec.append(tmp_score)
                if len(tmp_vec) == 0:
                    vector[j] = 0.0
                else:
                    vector[j] = max(tmp_vec)
            vec.append(vector)

        l_1 = np.sqrt(vec[0].dot(vec[0]))
        l_2 = np.sqrt(vec[1].dot(vec[1]))
        cos_angle = vec[0].dot(vec[1]) / (l_1 * l_2)
        return cos_angle
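The last three lines are the ordinary cosine similarity cos = (v1 . v2) / (|v1| |v2|) between the two sentence vectors, where each entry is either 1 (the word occurs in that sentence) or the best WordNet path similarity between that word's first synset and any word of the sentence.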
Example #19
    def __cal_matrix(self, file_name=""):
        log.info("extract feature from pre-defined setting!")
        if self.__feature_method == "QD":
            paper_len = len(self.__paper)
            matrix_l = np.zeros([paper_len, paper_len])
            for i in range(paper_len):
                for j in range(paper_len):
                    if i > j:
                        continue
                    num = self.__cal_matrix_element(i, j)
                    matrix_l[i][j] = num
                    matrix_l[j][i] = num

        elif self.__feature_method == "DM":
            file_path = self.__child_path + "word_segment.vec"
            matrix_l = self.__get_doc2vec_matrix(file_path)
        else:
            log.error("self.__feature_method is " + self.__feature_method)
            return []
#        matrix_l = self.__feature_normalization(matrix_l)
        if self.summary_method == "hDPP":
            self.__doc_matrix_ = matrix_l
        return matrix_l
Example #20
 def __quality_initial_similarity(self):
     if self.feature_merge.split("-")[2] == "0":
         return
     log.info("quality calculation: similarity")
     if self.quality_method__ == "":
         self.quality_method__ += "sim"
     else:
         self.quality_method__ += "-sim"
     tmp_quality = np.zeros([len(self.__paper)])
     '''
     calculate quality use similarity
     '''
     title = " ".join(self.__titles)
     vectorizer = CountVectorizer()
     transformer = TfidfTransformer()
     corpus = [title]
     corpus.extend(self.__paper)
     tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
     # word = vectorizer.get_feature_names() # all words
     weight = tfidf.toarray()
     for i in range(len(self.__paper)):
         tmp_quality[i] = np.array(weight[i + 1]).dot(np.array(weight[0]))
     tmp_quality = self.__feature_normalization(tmp_quality)
     self.__quality += tmp_quality * float(self.feature_merge.split("-")[2])
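The scikit-learn part of this snippet fits a CountVectorizer followed by a TfidfTransformer on the title plus all sentences, then scores each sentence by the dot product of its TF-IDF row with the title's row. A minimal standalone sketch of that pattern, with invented documents:

    import numpy as np
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

    title = "graph based summarization"
    sentences = ["summarization with graphs is popular",
                 "the weather was sunny today"]

    corpus = [title] + sentences
    tfidf = TfidfTransformer().fit_transform(CountVectorizer().fit_transform(corpus))
    weight = tfidf.toarray()

    # similarity of each sentence to the title (row 0)
    scores = [np.array(weight[i + 1]).dot(np.array(weight[0])) for i in range(len(sentences))]
    print(scores)  # the first sentence shares a term with the title, the second does not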
Example #21
    def calculate_gradient(self, attitude):
        # compute L(x; theta) as in equation (155)
        log.debug(self.__parameter_theta)
        log.info("computing matrix L")
        for row_i in range(self.__data_size):
            for col_j in range(self.__data_size):
                self.__matrix_l[row_i][col_j] = self.__calculate_matrix_element(row_i, col_j)

        # Eigendecompose L(x; theta)
        log.info("engendecomposing")
        log.debug("matrix value: " + str(np.linalg.det(self.__matrix_l)))
        (eigenvalue, feature_vector) = np.linalg.eig(self.__matrix_l)
        for i in range(len(eigenvalue)):
            eigenvalue[i] = float(eigenvalue[i])
            if np.abs(eigenvalue[i]) < 0.000000001:
                eigenvalue[i] = 0.0
        log.debug("eigenvalue")
        log.debug(eigenvalue)

        # calculate K_ii
        log.info("calculating Kii")
        vector_k = np.zeros(self.__data_size)
        log.debug("feature value matrix")
        for i in range(self.__data_size):
            for j in range(self.__data_size):
                vector_k[i] += ((eigenvalue[j] / (eigenvalue[j] + 1)) * (feature_vector[i][j] ** 2))

#        log.debug("Kii: " + str(vector_k))
        # calculate gradient
        log.info("calculating gradient")
        sigma_sub_f_x = np.zeros(self.__feature_size)
        for index in range(self.__data_size):
            if self.__train_label[index] == attitude:
                sigma_sub_f_x += self.__f_x[index]

        sigma_kii_f_x = np.zeros(self.__feature_size)
        for i in range(self.__data_size):
            sigma_kii_f_x += vector_k[i] * self.__f_x[i]

        return sigma_sub_f_x - sigma_kii_f_x
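In this gradient step, the diagonal of the marginal kernel is computed from the eigendecomposition of L, and the gradient with respect to theta is the difference between the summed features of the items carrying the target attitude and the K_ii-weighted sum over all items. In terms of the eigenpairs (lambda_j, v_j) of L, the two loops implement roughly:

    K_ii = sum_j (lambda_j / (lambda_j + 1)) * v_j[i]^2
    grad = sum_{i : y_i = attitude} f(x_i) - sum_i K_ii * f(x_i)

which matches the accumulation into vector_k, sigma_sub_f_x and sigma_kii_f_x above.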
Example #22
 def __calculate_similarity(self, idx_i, idx_j):
     if self.__similarity is None:
         log.info("initiating similarity")
         # f_x is (data_size, feature_size); its Gram matrix gives pairwise similarities
         self.__similarity = self.__f_x.dot(self.__f_x.transpose())
     return self.__similarity[idx_i][idx_j]
Example #23
    def get_mss_paper_summary(self, lang, file_name, if_write_file=True):
        """
        generate summary for one paper, single document summarization
        :param lang:
        :param file_name: current file name, used for write summary answer
        :param if_write_file: whether write generated summary to answer file named file_name
        :return:
        """
        # initial
        self.__quality, self.__paper_name = None, file_name
        self.quality_method__ = ""
        '''
        if DATA == "mms2015":
            self.__all_file.merge_mms_2015(os.path.dirname(self.__child_path), "chinese")
        elif DATA == "mss2017":
            if lang in ["vi", "ka"]:
                self.__all_file.merge_mss_2017(os.path.dirname(self.__child_path))
            else:
                self.__all_file.merge_mss_2017_ros(os.path.dirname(self.__child_path))
        self.__paper_original = self.__all_file.get_merged_paper()
        if self.stop_word_method == "remove_stop":
            self.__paper = self.__all_file.get_filtered_paper()
        elif self.stop_word_method == "with_stop":
            self.__paper = self.__all_file.get_merged_paper()
        self.__titles = self.__all_file.get_titles()
        # used for generate hLDA input file and calculate level method.
        if (not os.path.exists(self.__child_path + "model.temp")) or False:
            write_file(self.__paper, self.__child_path + "RemoveStop.temp", False)
            write_file(self.__paper_original, self.__child_path + "word_segment.temp", False)
            model_temp(self.__paper, self.__child_path)
            return ""
        '''
        if self.stop_word_method == "remove_stop":
            self.__paper = read_file(self.__child_path + "RemoveStop.temp")
        elif self.stop_word_method == "with_stop":
            self.__paper = read_file(self.__child_path + "word_segment.temp")
        self.__titles = read_file(self.__child_path + "titles.temp")
        self.__paper_original = read_file(self.__child_path +
                                          "word_segment.temp")
        self.__sub_paper_len = [
            int(i) for i in read_file(self.__child_path + "sec_idx.temp")
        ]
        # extract sentence
        feature_subset, eig = self.__cal_candidate_set()
        #        feature_subset = range(len(self.__paper_original))
        #        eig = []
        log.error("results is: ")
        log.info(feature_subset)
        log.debug(eig)
        # use feature list to extract summary
        summary = self.__construct_summary(feature_subset, eig, lang)
        if if_write_file:
            if file_name == '':
                log.error("file name is empty")
                return ""
            # write answer to file for ROUGE
            answer_path = self.__rouge_path + lang + "/systems/"
            write_file(summary,
                       os.path.join('%s%s.txt' % (answer_path, file_name)),
                       False)
            '''
            # generate gold summary split by CHAR
            gold_path = self.__rouge_path + lang + "/models/"
            if not os.path.exists(gold_path):
                os.makedirs(gold_path)
            tmp_name = lang + "/" + file_name + "_summary.txt"
            abs_human = read_file('./data/MultiLing2015-MSS/multilingMss2015Eval/summary/' + tmp_name)
            if not os.path.exists(gold_path + file_name + "_summary.txt") and lang != "vi" and lang != 'ka':
                write_file([" ".join(api.tokenize("\n".join(abs_human)))],
                           gold_path + file_name + "_summary.txt", False)
            if lang == "vi":
                write_file(abs_human, gold_path + file_name + "_summary.txt", False)
            # generate configure file of each document for ROUGE
            conf_path = self.__rouge_path + lang + "/configure/"
            if not os.path.exists(conf_path):
                os.makedirs(conf_path)
            tmp_conf_ = answer_path + file_name + ".txt " + gold_path + file_name + "_summary.txt"
            self.__all_conf.append(tmp_conf_)
            write_file([tmp_conf_], os.path.join('%s/%s.txt' % (conf_path, file_name)), False)
            '''

        return "".join(summary)
Example #24
    def sample_k_test(self, parameter, sentiment_test):
        self.__parameter_theta = parameter
        matrix_l = np.zeros([self.__data_size, self.__data_size])
        for row_i in range(self.__data_size):
            for col_j in range(row_i, self.__data_size):
                num = self.__calculate_matrix_element(row_i, col_j)
                print num
                matrix_l[row_i][col_j] = num
                matrix_l[col_j][row_i] = num
        size_ = self.__data_size
        ans = range(size_)
        '''
        matrix_k = matrix_l
        diff = range(size_)
        for i in range(10):
            center_idx, vec = ds.sample_k(matrix_k, 2)
            tmp = range(size_)
            log.info("center_idx: " + str(diff[center_idx[0]]) + " " + str(diff[center_idx[1]]))
            if i > 2:
                ans.append(diff[center_idx[0]])
                ans.append(diff[center_idx[1]])
                log.info("".join(self.__test_data[diff[center_idx[0]]]))
                log.info(self.__test_label[diff[center_idx[0]]])
                log.info("".join(self.__test_data[diff[center_idx[1]]]))
                log.info(self.__test_label[diff[center_idx[1]]])
            diff = list(set(tmp)-set(center_idx))
            self.__f_x_test = self.__f_x_test[diff][:]
            matrix_k = matrix_k[diff][:, diff]
            size_ = len(diff)
        log.info(matrix_l.shape)
        '''
        center_idx, vec = ds.sample_k(matrix_l, 3)
        log.debug(vec)
        final_answer_ = np.array(ans)[center_idx]
        log.info(final_answer_)
        log.info("".join(self.__test_data[final_answer_[0]]))
        log.info(self.__test_label[final_answer_[0]])
        log.info("".join(self.__test_data[final_answer_[1]]))
        log.info(self.__test_label[final_answer_[1]])
        log.info("".join(self.__test_data[final_answer_[2]]))
        log.info(self.__test_label[final_answer_[2]])
        cluster_1 = []
        cluster_2 = []
        cluster_3 = []
        score_0 = sentiment_test[center_idx[0]]
        score_1 = sentiment_test[center_idx[1]]
        score_2 = sentiment_test[center_idx[2]]
        label_1 = ''
        label_0 = ''
        label_2 = ''
        if score_0 > score_1:
            if score_0 > score_2:
                label_0 = 'FAVOR'
                if score_1 > score_2:
                    label_1 = 'NONE'
                    label_2 = 'AGAINST'
                else:
                    label_1 = 'AGAINST'
                    label_2 = 'NONE'
            else:
                label_2 = 'FAVOR'
                label_0 = 'NONE'
                label_1 = 'AGAINST'
        else:
            if score_1 > score_2:
                label_1 = 'FAVOR'
                if score_0 > score_2:
                    label_0 = 'NONE'
                    label_2 = 'AGAINST'
                else:
                    label_0 = 'AGAINST'
                    label_2 = 'NONE'
            else:
                label_2 = 'FAVOR'
                label_0 = 'AGAINST'
                label_1 = 'NONE'

        cluster_1.append(center_idx[0])
        cluster_2.append(center_idx[1])
        cluster_3.append(center_idx[2])
        for i in range(self.__data_size):
            #            sim_1 = matrix_l[i][i] * matrix_l[center_idx[0]][center_idx[0]] - matrix_l[i][center_idx[0]] ** 2
            #            sim_2 = matrix_l[i][i] * matrix_l[center_idx[1]][center_idx[1]] - matrix_l[i][center_idx[1]] ** 2
            #            sim_3 = matrix_l[i][i] * matrix_l[center_idx[2]][center_idx[2]] - matrix_l[i][center_idx[2]] ** 2
            #            sim_1 = np.sum(np.square(vec[:][i] - vec[:][center_idx[0]]))
            #            sim_2 = np.sum(np.square(vec[:][i] - vec[:][center_idx[1]]))
            #            sim_3 = np.sum(np.square(vec[:][i] - vec[:][center_idx[2]]))
            sim_1 = self.__similarity[i][center_idx[0]]
            sim_2 = self.__similarity[i][center_idx[1]]
            sim_3 = self.__similarity[i][center_idx[2]]
            if sim_1 > sim_2:
                if sim_1 > sim_3:
                    cluster_1.append(i)
                else:
                    cluster_3.append(i)
            elif sim_2 > sim_3:
                cluster_2.append(i)
            else:
                cluster_3.append(i)
        ans = []
        for i in range(self.__data_size):
            print "i: " + str(i)
            if i in cluster_1:
                #                ans.append(self.__test_label[center_idx[0]])
                ans.append(label_0)
            elif i in cluster_2:
                ans.append(label_1)
            else:
                ans.append(label_2)

        log.info('FINAL' + str(ans))
        for label in ["FAVOR", "AGAINST", "NONE"]:
            self.get_f_score(ans, label)
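The nested comparisons over score_0, score_1 and score_2 rank the three cluster centers by sentiment score and map the highest to FAVOR, the middle to NONE, and the lowest to AGAINST. A compact equivalent sketch of that ranking (a hypothetical helper, not part of the original class):

    import numpy as np

    def rank_labels(scores):
        """Map the highest score to FAVOR, the middle to NONE, and the lowest to AGAINST."""
        labels = [''] * 3
        order = np.argsort(scores)          # indices from lowest to highest score
        labels[order[2]] = 'FAVOR'
        labels[order[1]] = 'NONE'
        labels[order[0]] = 'AGAINST'
        return labels

    print(rank_labels([0.2, 0.9, 0.5]))     # ['AGAINST', 'FAVOR', 'NONE']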
Example #25
    def learning(self):
        for label in ["FAVOR", "AGAINST", "NONE"]:
            self.__learning_single_attitude(label)
        right = 0.0
        ans = []
        log.info("answer")
        print self.__tmp_answer
        log.info(self.__tmp_answer["FAVOR"])
        log.info(self.__tmp_eigenvalue["FAVOR"])
        log.info(self.__tmp_answer["AGAINST"])
        log.info(self.__tmp_eigenvalue["AGAINST"])
        log.info(self.__tmp_answer["NONE"])
        log.info(self.__tmp_eigenvalue["NONE"])
        tmp = []
        for i in range(100):
            a = list()
            a.append(self.__tmp_answer["FAVOR"][i])
            a.append(self.__tmp_answer["AGAINST"][i])
            a.append(self.__tmp_answer["NONE"][i])
            count = 0
            label = ""
            for j in range(3):
                if a[j] == "NONEs":
                    count += 1
                else:
                    label = a[j]
            tmp.append(str(self.__tmp_eigenvalue["FAVOR"][i]) + '\t' + str(self.__tmp_eigenvalue["AGAINST"][i]) +
                       '\t' + str(self.__tmp_eigenvalue["NONE"][i]))
            if count == 2:
                right += 1
                ans.append(label)
            else:
                favor_value = self.__tmp_eigenvalue["FAVOR"][i]
                against_value = self.__tmp_eigenvalue["AGAINST"][i]
                none_value = self.__tmp_eigenvalue["NONE"][i]
                if favor_value > against_value:
                    if favor_value > none_value:
                        ans.append("FAVOR")
                    else:
                        ans.append("NONE")
                else:
                    if against_value > none_value:
                        ans.append("AGAINST")
                    else:
                        ans.append("NONE")

        log.info(tmp)
        log.info("final_answer: " + str(ans))
        log.info("final parameters: ")
        log.info(self.__final_parameter)
        for label in ["FAVOR", "AGAINST", "NONE"]:
            self.__sample.get_f_score(ans, label)