def init(self):
        """
        Initialize the project: (re)create the project directory with
        plain_text/ and cipher_text/ subdirectories and persist the
        configuration file.
        :return: None
        """

        def _create_project():
            # Recursively wipe any pre-existing project directory first.
            if os.path.isdir(self.proj_name):
                shutil.rmtree(self.proj_name)
            os.mkdir(self.proj_name)
            os.mkdir(self.proj_dir_path + 'plain_text')
            os.mkdir(self.proj_dir_path + 'cipher_text')
            # Persist the project configuration parameters.
            with open(self.proj_dir_path + 'config', 'wb') as f:
                pickle.dump([self.k, self.l, self.s, self.file_cnt], f)

        # Fresh directory: just initialize and return.
        if not os.path.isdir(self.proj_name):
            printer.print_info("正在初始化项目中...")
            _create_project()
            printer.print_success("初始化项目完成!")
            return

        # A directory with the same name exists — ask before clobbering it.
        printer.print_warning("发现已经存在同名目录,是否需要清除该目录下所有内容? (Y/N)")
        answer = input()
        if answer in ('Y', 'y'):
            printer.print_info("正在清空并初始化中...")
            _create_project()
            printer.print_success("清空完成!")
        else:
            printer.print_info("用户已拒绝操作,程序退出...")
    def upload(self):
        """
        Upload the ciphertext set, the encrypted index and the config file
        to the server; on success, delete the local ciphertext copies.
        :return: None
        """
        # Refuse to upload while the project is in an intermediate state.
        if 2 <= get_status_by_bits(self.status_bits) < 6:
            printer.print_error('操作失败,理由: ')
            self.status()
            return

        result = upload_manager.upload_to_server(self.proj_name, 'Y')
        if result == 'success':
            printer.print_success('上传成功!')
            # The server now holds the data; the local ciphertext set and
            # encrypted index are no longer needed.
            shutil.rmtree(self.proj_dir_path + 'cipher_text')
            os.remove(self.proj_dir_path + 'index.enc')
        else:
            printer.print_error('上传失败!服务器返回信息如下:')
            printer.print_error(result)
Ejemplo n.º 3
0
def run_selector_parallel_min(partitions, n_partition):
    """Run the positive-example selector over one partition of the train set."""
    selector = ParallelSelectorMin(
        root_path + 'train-set_all.csv',
        root_path + 'usable_authors_to_train_val.npy',
        root_path + f'train_positive/partition_{n_partition}.csv',
        partitions,
        n_partition,
        None,
    )

    printer.print_success('START')
    selector.run()
    printer.print_success('FINISHED')
 def _load_data(self, train_data_path):
     """Load the training comments CSV, keeping only the needed columns."""
     if not os.path.exists(train_data_path):
         printer.print_error('Train data path does not exist!')
         return
     # Only these columns are used downstream.
     wanted = ['timestamp', 'author_id', 'comment_id', 'article_id',
               'parent_comment_id']
     self.train_data_df_top_comments = pd.read_csv(train_data_path)[wanted]
     printer.print_success('Input Data loaded')
Ejemplo n.º 5
0
def run_selector_parallel_min_validation(partitions, n_partition):
    """Run the positive-example selector over one partition of the validation set."""
    selector = ParallelSelectorMin(
        root_path + 'validation-set_all.csv',
        root_path + 'usable_authors_validation.npy',
        root_path + f'val_positive/partition-{n_partition}.csv',
        partitions,
        n_partition,
    )

    printer.print_success('START')
    selector.run()
    printer.print_success('FINISHED')
Ejemplo n.º 6
0
def run_selector_negative_paralallel_validation(partitions, n_partition):
    """Run the negative-example selector over one partition of the validation set."""
    selector = NegativeExampleSelector(
        root_path + 'validation-set_all.csv',
        root_path + 'usable_authors_validation.npy',
        root_path + 'c_articles_dates.csv',
        root_path + f'val_negative/partition-{n_partition}.csv',
        partitions,
        n_partition,
        1,
        False,
        False,
    )

    printer.print_success('START')
    selector.run()
    printer.print_success('FINISHED')
Ejemplo n.º 7
0
def run_selector_parallel_min_test(partitions, n_partition):
    """Run the positive-example selector over one partition of the test set
    (deterministic: random sampling disabled)."""
    selector = ParallelSelectorMin(
        root_path + 'test-set_all.csv',
        root_path + 'usable_authors_test.npy',
        root_path + f'test_positive/partition-{n_partition}.csv',
        partitions,
        n_partition,
        random_sample=False,
    )

    printer.print_success('START')
    selector.run()
    printer.print_success('FINISHED')
 def generate_keys_action():
     """Generate the four secret keys, print them base64-encoded, and save them."""
     keys = self.gen()
     banner = '========THE KEY========'
     print(banner)
     # One base64-encoded key per line, in generation order.
     print('\n'.join(base64.b64encode(key).decode(encoding='UTF-8')
                     for key in keys))
     print(banner)
     # Persist the freshly generated keys locally.
     self.save_keys()
     printer.print_success('密钥文件已保存至本地.')
Ejemplo n.º 9
0
def run_selector_negative_paralallel_test(partitions, n_partition):
    """Run the negative-example selector over one partition of the test set."""
    # Fixed seed keeps the negative sampling reproducible across runs.
    np.random.seed(123)
    selector = NegativeExampleSelector(
        root_path + 'test-set_all.csv',
        root_path + 'usable_authors_test.npy',
        root_path + 'c_articles_dates.csv',
        root_path + f'test_negative/partition-{n_partition}.csv',
        partitions,
        n_partition,
        50,
        True,
        False,
    )

    printer.print_success('START')
    selector.run()
    printer.print_success('FINISHED')
Ejemplo n.º 10
0
def run_selector_negative_paralallel(partitions, n_partition):
    """Run the negative-example selector over one partition of the train set."""
    negatives_per_positive = 1
    # Fixed seed keeps the negative sampling reproducible across runs.
    np.random.seed(123)
    selector = NegativeExampleSelector(
        root_path + 'train-set_all.csv',
        root_path + 'usable_authors_to_train_val.npy',
        root_path + 'c_articles_dates.csv',
        root_path + f'train_negative/partition-{n_partition}.csv',
        partitions,
        n_partition,
        negatives_per_positive,
    )

    printer.print_success('START')
    selector.run()
    printer.print_success('FINISHED')
    def _load_data(self):
        """Load article publication dates and the input data set into DataFrames."""
        self.article_dates_df = pd.read_csv(self.article_dates_path)
        # Parse the raw date strings into proper timestamps.
        self.article_dates_df['timestamp'] = pd.to_datetime(
            self.article_dates_df['date'])

        if not os.path.exists(self.data_path):
            printer.print_error('Train data path does not exist!')
            return
        # Only these columns are used downstream.
        wanted = ['timestamp', 'author_id', 'comment_id', 'article_id',
                  'parent_comment_id']
        self.data_df = pd.read_csv(self.data_path)[wanted]
        printer.print_success('Input Data loaded ')
 def encrypt_action():
     """
     Verify (and optionally auto-fix) the filename format in the plaintext
     directory, then encrypt the index and the documents.
     :return: None
     """
     printer.print_info('检查明文目录下文件名格式是否符合要求...')
     if not scanner.check_filename_format(self.proj_dir_path):
         printer.print_info('不符合文件命名格式,请问是否需要执行自动格式化文件名操作? (Y/N)')
         ok = input()
         if ok == 'y' or ok == 'Y':
             scanner.reformat_filename(self.proj_dir_path)
             printer.print_success('格式化文件名成功!')
         else:
             printer.print_error('软件终止...请自行更改文件名以满足要求!')
             # BUG FIX: previously this fell through and encrypted anyway,
             # even though the user declined the rename and the message
             # announced an abort. Stop here instead.
             return
     else:
         printer.print_success('检查完毕,文件名符合要求!')
     printer.print_info('开始加密索引和文档...')
     self.enc()
     self.save_encrypted_index()  # remember to persist the encrypted index
     printer.print_success('加密索引和文档成功')
Ejemplo n.º 13
0
def tokenize_texts(input_path, output_path):
    """Tokenize the comments at input_path and write the result to output_path."""
    printer.print_success('START')
    CommentTokenizer(input_path, output_path).run()
    printer.print_success('FINISHED')
Ejemplo n.º 14
0
 def make_tf_idf(self):
     """Build the TF-IDF model from the prepared corpus."""
     printer.print_progress('Run TFIDF Model')
     self.model = TfidfModel(self.corpus, normalize=False)
     # BUG FIX: message said 'Finished to create corpus' — copy-pasted from
     # make_corpus(); this step builds the TF-IDF model, not the corpus.
     printer.print_success('Finished to create TFIDF model')
    def enc(self):
        """
        Build the encrypted searchable index — node array A and look-up
        table T (SSE-1-style construction) — and encrypt all documents.

        :return: (A, T) — the encrypted node array and the look-up table.
        """
        def initialization():
            # step1. scan D and generate the set of distinct keywords δ(D)
            self.distinct_word_set = scanner.generate_the_set_of_distinct_keywords_for_docs(self.proj_dir_path)[1]
            # step2. for all w ∈ δ(D), generate D(w)
            self.D_ = scanner.generate_Dw_for_each_keyword(self.proj_dir_path)
            # step3. initialize a global counter ctr = 1 ---> see __init__()

        def building_the_array_A():
            # step4. for 1<=i<=|δ(D)|, build a list Li with nodes Ni,j and store it in array A as follows:
            for i in range(1, len(self.distinct_word_set) + 1):
                keyword = self.distinct_word_set[i - 1]  # NOTE: the paper's i is 1-based, Python's is 0-based — hence the minus one
                Ki = [None] * (len(self.D_[keyword]) + 1)
                Ni = [None] * (len(self.D_[keyword]) + 1)
                # sample a key Ki,0 <-$- {0, 1}^k
                Ki[0] = Random.new().read(int(self.k / 8))
                self.k0_for_each_keyword[keyword] = Ki[0]
                # for 1<=j<=|D(wi)|-1
                j = 0
                for j in range(1, len(self.D_[keyword])):
                    # let id(Di,j) be the jth identifier in D(wi)
                    id_Dij = self.D_[keyword][j - 1]  # todo
                    # generate a key Ki,j <- SKE1.Gen(1^k)
                    Ki[j] = Random.new().read(int(self.k / 8))
                    # if j == 1:
                    #    self.k0_for_each_keyword[keyword] = Ki[j]
                    # Ni[j] = str(id_Dij) + "|||" + str(Ki[j]) + "|||" + self.mu(Ki[j - 1], Ni[j])
                    # Node layout: <doc id || next-node key || address of next node>
                    Ni[j] = id_Dij.to_bytes(self.file_cnt_byte, byteorder="big") + Ki[j] + self.mu(self.k1, num2byte(
                        self.ctr + 1, int(self.s / 8)))
                    index = self.mu(self.k1, num2byte(self.ctr, int(self.s / 8)))
                    if j == 1:
                        self.addrA[keyword] = index  # remember the head node's address in the dict
                    index = int.from_bytes(index, byteorder="big")
                    self.A[index] = self.SKEEnc(Ki[j - 1], Ni[j])

                    # All entries of A share one size; record it from the first entry written.
                    if self.entry_size_of_A == -1:
                        self.entry_size_of_A = len(self.A[index])

                    self.ctr += 1
                # for the last node of Li
                # set the address of the next node to NULL: Ni,|D(wi)| = <id(Di,|D(wi)|) || 0^k || NULL>
                j += 1  # ...
                id_Dij = self.D_[keyword][len(self.D_[keyword]) - 1]
                Ni[len(self.D_[keyword])] = id_Dij.to_bytes(self.file_cnt_byte, byteorder="big") + b"\x00" * int(
                    self.k / 8) + b"\x00" * int(math.ceil(self.s / 8))  # todo
                index = self.mu(self.k1, num2byte(self.ctr, int(self.s / 8)))

                # Single-document keyword list: this last node is also the head node.
                if j == 1:
                    self.addrA[keyword] = index  # remember the head node's address in the dict
                    # self.k0_for_each_keyword[keyword] = Ki[j]

                index = int.from_bytes(index, byteorder="big")
                self.A[index] = self.SKEEnc(Ki[j - 1], Ni[len(self.D_[keyword])])

                # encrypt the node Ni,|D(wi)| under the key Ki,|D(wi)-1| and store it in A
                self.ctr += 1

            # step5. set the remaining s - s' entries of A to random values of the same size
            # as the existing s' entries of A
            for i in range(len(self.A)):
                if self.A[i] is None:
                    self.A[i] = Random.new().read(self.entry_size_of_A)

        def building_the_look_up_table_T():
            size = -1  # size of each look-up table entry, needed for padding in step 7

            # step6. for all wi ∈ δ(D), set T[π_K3(wi)] = <addr_A(N_i,1 || K_i,0)> ⊕ f_K2(wi)
            for w in self.distinct_word_set:
                index = self.pi(self.k3, str2byte(w))
                index = int.from_bytes(index, byteorder="big")
                self.T[index] = self.xor(self.addrA[w] + self.k0_for_each_keyword[w],
                                         self.f(self.k2, str2byte(w)))
                if size == -1:
                    size = len(self.T[index])

            # step7. if |δ(D)| < |△|, then set the remaining |△| - |δ(D)| entries of T to random values of the
            # same size as the existing |δ(D)| entries of T
            for i in range(2 ** self.l):
                if self.T[i] is None:
                    self.T[i] = Random.new().read(size)

        def enc_docs():
            # step8. for 1 <= i <= n, let ci <- SKE2.Enc_K4(Di)
            DIR = 'plain_text'
            file_count = len([name for name in os.listdir(DIR) if os.path.isfile(os.path.join(DIR, name))])
            for i in range(file_count):
                self.enc_doc(i, self.k4)

        printer.print_info('创建索引中...')
        initialization()

        printer.print_info('加密索引中...')
        building_the_array_A()
        building_the_look_up_table_T()

        printer.print_info('加密文档中...')
        enc_docs()
        printer.print_success('已就绪.')
        # step9. output
        return self.A, self.T
Ejemplo n.º 16
0
def collect_texts(raw_comments_path, selection_path, offset_path, output_path):
    """Collect the selected comment texts from the raw comments file."""
    printer.print_success('START')
    CommentSelector(raw_comments_path, selection_path,
                    offset_path, output_path).run()
    printer.print_success('FINISHED')
Ejemplo n.º 17
0
 def _save(self):
     """Write the accumulated output rows to the configured CSV path."""
     frame = pd.DataFrame(self.output, columns=self.output_header)
     frame.to_csv(self.output_path, index=False)
     printer.print_success(f'Saved to {self.output_path}')
Ejemplo n.º 18
0
 def make_corpus(self):
     """Turn each tokenized line into a bag-of-words and append it to the corpus."""
     for tokens in tqdm(self.data, total=self.number_of_lines):
         self.corpus.append(self.dictionary.doc2bow(tokens))
     printer.print_success('Finished to create corpus')