Example #1
0
    def __init__(self, wordindex2emb, gd , HIN_path='', features=None, semantic_name='', if_text_sem=True,
                 if_tag_sem=True, if_mashup_sem=True, if_api_sem=True):
        """Initialize the HIN similarity helper.

        :param wordindex2emb: word-index -> embedding mapping fed to word_sim
        :param gd: encoder object; its encodings are used later
        :param HIN_path: directory where similarity files are stored
            (e.g. kcvIndex/CIModelPath/HIN_sims/); created if missing
        :param features: externally computed text/tag features; either a
            2-tuple or a 4-tuple (see below)
        :param semantic_name: name of the input features; empty means they
            come from the CI model, otherwise e.g. HDP features
        :param if_text_sem/if_tag_sem/if_mashup_sem/if_api_sem: flags that
            select how a 2-element `features` tuple is interpreted
        """

        self.ws = word_sim(wordindex2emb)  # embedding-layer parameters
        self.num_users = data_repository.get_md().mashup_num
        self.num_items = data_repository.get_md().api_num
        self.semantic_name = semantic_name  # feature-source name; empty => CI model, otherwise e.g. HDP
        self.gd = gd  # encoder whose encodings are used downstream

        # Compute similarities from externally supplied content/tag features.
        # NOTE(review): with len(features)==2 the two branches below are not
        # exhaustive — if both (or neither) flag conditions hold, nothing is
        # assigned; confirm callers always pass a matching flag combination.
        if features is not None:
            if len(features) == 2:
                if if_text_sem and not if_tag_sem:  # text semantics only (used by e.g. PasRec)
                    self.mashup_texts_features, self.api_texts_features = features
                if if_mashup_sem and not if_api_sem:  # mashup semantics only (used by HIN)
                    self.mashup_texts_features, self.mashup_tag_features = features
            elif len(features) == 4:
                self.mashup_texts_features, self.mashup_tag_features, self.api_texts_features, self.api_tag_features = features

        self.mashup_apis_dict = list2dict(data_repository.get_md().mashup_api_list)
        self.api_id2provider = data_repository.get_md().api_df['API_Provider']

        # self.path= os.path.join(HIN_path,self.name) # stored under the CI folder! data_repository.get_ds().data_root  no_kcv_root_path
        self.path = HIN_path  # directory holding the similarity files: kcvIndex/CIModelPath/HIN_sims/

        if not os.path.exists(self.path):
            os.makedirs(self.path)

        # Cached pairwise similarities for the six meta-paths (lazy, None until computed).
        self.p1_sims, self.p2_sims, self.p3_sims, self.p4_sims, self.p5_sims, self.p6_sims = None, None, None, None, None, None
        # Semantic-feature variants of the path similarities.
        self.p1_sims_sem, self.p2_sims_sem, self.p3_sims_sem, self.p4_sims_sem = None, None, None, None

        # Flags recording whether each similarity cache has been filled.
        self.flag1, self.flag2, self.flag3, self.flag4, self.flag5, self.flag6 = False, False, False, False, False, False
        self.flag1_sem, self.flag2_sem, self.flag4_sem = False, False, False
Example #2
0
    def set_others(self):
        """Derive training-set views and model paths.

        Must be called after set_data() or read_data() so that
        ``self.train_df`` and ``self.data_root`` are populated.
        """
        # Sorted unique mashup ids that appear in the training split.
        self.his_mashup_ids = np.unique(self.train_df['mashup'].values)
        self.his_mashup_ids_set = set(self.his_mashup_ids)
        print('mashup num in training set :{}'.format(len(self.his_mashup_ids)))

        # Keep only the (mashup, api) pairs whose mashup is in the training set.
        self.train_mashup_api_list = [
            pair for pair in data_repository.get_md().mashup_api_list
            if pair[0] in self.his_mashup_ids_set
        ]
        self.train_mashup_api_dict = list2dict(self.train_mashup_api_list)

        # Models depend on the data split, so they live under the data folder.
        # The '{}' placeholders are filled later with the model name / epoch.
        self.model_path = os.path.join(self.data_root, '{}')  # .format(simple_model_name) -> CI path
        self.new_best_epoch_path = os.path.join('{}', 'best_epoch.dat')  # .format(model_dir)
        self.new_model_para_path = os.path.join('{}', 'weights_{}.h5')  # .format(model_dir, epoch)
        self.new_best_NDCG_path = os.path.join('{}', 'best_NDCG.dat')  # .format(model_dir)
Example #3
0
def split_dataset_for_oldScene_KCV(data_dir,
                                   num_negatives=6,
                                   test_candidates_nums=50,
                                   kcv=5):
    """Split the mashup-api interactions into K folds for the old scenario.

    :param data_dir: root directory under which the split is stored
    :param num_negatives: negative-sampling ratio (negatives per positive)
    :param test_candidates_nums: number of candidate apis evaluated per test
        mashup, or 'all' to rank every api
    :param kcv: number of folds
    :return: yields one dataset object per fold
    """
    name = 'neg_{}_testCandi{}'.format(num_negatives, test_candidates_nums)
    result_path = os.path.join(data_dir, 'split_data_oldScene', name)
    mashup_api_list = meta_data.mashup_api_list
    mashup_api_dict = list2dict(mashup_api_list)

    # If the last fold already exists on disk, every fold does: just re-read.
    test_data_obj = dataset(result_path, name, kcv - 1)
    if os.path.exists(test_data_obj.train_instances_path):
        print('has splited data in kcv mode before,read them!')
        for i in range(kcv):
            data = dataset(result_path, name, i)
            data.read_data()  # restore the fold from its files
            yield data
    else:  # not split yet
        mashup_ids = list(mashup_api_dict.keys())
        all_apis = set(range(meta_data.api_num))  # ids of every api

        # 1. Fix, per mashup, its positive/negative training examples and its
        #    test candidates.  {mashup_id: [api_id, ...]}
        mid2true_instances, mid2false_instances, mid2candidate_instances = {}, {}, {}
        for mashup_id, api_ids in mashup_api_dict.items():  # api_ids is a set
            unobserved_apis_list = list(all_apis - api_ids)
            random.shuffle(unobserved_apis_list)

            api_ids_list = list(api_ids)  # observed apis are the positives
            mid2true_instances[mashup_id] = api_ids_list

            # Negatives: a shuffled sample of unobserved apis, capped at api_num.
            all_neg_num = min(meta_data.api_num,
                              num_negatives * len(api_ids_list))
            mid2false_instances[mashup_id] = unobserved_apis_list[:all_neg_num]

            if test_candidates_nums == 'all':  # rank every api at test time
                mid2candidate_instances[mashup_id] = list(all_apis)
            else:  # true components plus a sample of unobserved apis
                mid2candidate_instances[mashup_id] = (
                    api_ids_list + unobserved_apis_list[:test_candidates_nums])

        # 2. Partition the mashups into K folds and build each fold's data.
        random.shuffle(mashup_ids)
        batch = len(mashup_ids) // kcv
        for i in range(kcv):
            start_index = i * batch
            batch_stopindex = len(mashup_ids) if i == kcv - 1 else (i + 1) * batch
            test_mashups = mashup_ids[start_index:batch_stopindex]
            # BUGFIX: the old slice [batch_stopindex:-1] silently dropped the
            # last mashup from every fold's training split.
            train_mashups = (mashup_ids[:start_index] +
                             mashup_ids[batch_stopindex:])

            test_mashup_ids, all_candidate_api_ids, all_ground_api_ids = [], [], []
            train_mashup_api_list, train_labels = [], []

            for mashup_id in train_mashups:
                for true_api_id in mid2true_instances[mashup_id]:
                    train_mashup_api_list.append((mashup_id, true_api_id))
                    train_labels.append(1)
                for false_api_id in mid2false_instances[mashup_id]:
                    train_mashup_api_list.append((mashup_id, false_api_id))
                    train_labels.append(0)

            # Test layout differs from train: one row per test mashup holds all
            # of its candidate apis, and the ground-truth rows align with them.
            for mashup_id in test_mashups:
                candidate_api_ids = mid2candidate_instances[mashup_id]
                all_candidate_api_ids.append(candidate_api_ids)
                test_mashup_ids.append([mashup_id] * len(candidate_api_ids))
                all_ground_api_ids.append(mid2true_instances[mashup_id])

            train_mashup_id_instances, train_api_id_instances = zip(
                *train_mashup_api_list)

            data = dataset(result_path, name, i)
            data.set_data_oldScene(train_mashup_id_instances,
                                   train_api_id_instances, train_labels,
                                   test_mashup_ids, all_candidate_api_ids,
                                   all_ground_api_ids)
            print('{}/{} dataset, build done!'.format(i, kcv))
            yield data
Example #4
0
def split_dataset_for_newScene_New_KCV(data_dir,
                                       num_negatives=6,
                                       slt_num=3,
                                       combination_num=3,
                                       train_positive_samples=50,
                                       test_candidates_nums=50,
                                       kcv=5):
    """Split the data into K folds for the new scenario.

    :param data_dir: directory whose data is to be split
    :param num_negatives: negative-sampling ratio
    :param slt_num: maximum number of already-selected services
    :param combination_num: keep only this many combinations of selected
        services per size, to ease data imbalance ('all' keeps every one)
    :param train_positive_samples: per training mashup, how many positives to
        keep besides the selected services, so component-heavy mashups do not
        dominate ('all' keeps every one)
    :param test_candidates_nums: number of candidate items evaluated per
        mashup, or 'all'
    :param kcv: number of folds
    :return: yields one dataset object per fold
    """
    name = 'neg_{}_sltNum{}_com_{}_trainPos_{}_testCandi{}'.format(
        num_negatives, slt_num, combination_num, train_positive_samples,
        test_candidates_nums)
    result_path = os.path.join(data_dir, 'split_data_newScene', name)

    # If the last fold already exists on disk, every fold does: just re-read.
    test_data_obj = dataset(result_path, name, kcv - 1)
    if os.path.exists(test_data_obj.train_instances_path):
        print('has splited data in kcv mode before,read them!')
        for i in range(kcv):
            data = dataset(result_path, name, i)
            data.read_data()  # restore the fold from its files
            yield data
    else:  # not split yet
        mashup_api_list = meta_data.mashup_api_list
        mashup_api_dict = list2dict(mashup_api_list)

        mashup_ids = list(mashup_api_dict.keys())
        all_apis = set(range(meta_data.api_num))  # ids of every api

        # 1. Per mashup, fix the selected services and the matching
        #    positive/negative (train) and candidate (test) apis.
        #    {mashup_id: {slt_api_ids_tuple: [api_id, ...]}}
        mid2true_instances, mid2false_instances, mid2candidate_instances = {}, {}, {}
        for mashup_id, api_ids in mashup_api_dict.items():  # api_ids is a set
            unobserved_apis_list = list(all_apis - api_ids)
            random.shuffle(unobserved_apis_list)

            mid2true_instances[mashup_id] = {}
            mid2false_instances[mashup_id] = {}
            mid2candidate_instances[mashup_id] = {}

            api_ids_list = list(api_ids)
            # e.g. up to 3 selected services wanted, but only 2 components exist
            max_slt_num = min(slt_num, len(api_ids_list) - 1)
            for act_slt_num in range(1, max_slt_num + 1):  # select 1, then 2, ...
                combinations = list(
                    itertools.combinations(api_ids_list, act_slt_num))
                if combination_num != 'all':
                    # Keep only a few combinations to ease data imbalance.
                    # BUGFIX: the old code reassigned combination_num via
                    # min(), permanently shrinking the cap for every later
                    # mashup once a short combination list was seen.  Slicing
                    # is already safe when fewer combinations exist.
                    combinations = combinations[:combination_num]

                # Each combination acts as the set of already-selected apis,
                # enlarging the data set.
                for slt_api_ids in combinations:
                    # Masked observed interactions: positives for train/test.
                    train_api_ids = list(api_ids - set(slt_api_ids))

                    if train_positive_samples != 'all':  # keep a subset of positives
                        train_positive_samples_num = min(
                            len(train_api_ids), train_positive_samples)
                        train_api_ids = train_api_ids[:train_positive_samples_num]

                    # slt_api_ids is a tuple, hence hashable as a dict key.
                    mid2true_instances[mashup_id][slt_api_ids] = train_api_ids

                    num_negative_instances = min(
                        num_negatives * len(train_api_ids),
                        len(unobserved_apis_list))
                    mid2false_instances[mashup_id][slt_api_ids] = (
                        unobserved_apis_list[:num_negative_instances])

                    if test_candidates_nums == 'all':  # rank every remaining api
                        test_candidates_list = list(all_apis - set(slt_api_ids))
                    else:
                        test_candidates_list = (
                            unobserved_apis_list[:test_candidates_nums] +
                            train_api_ids)
                    mid2candidate_instances[mashup_id][slt_api_ids] = (
                        test_candidates_list)

        # 2. Partition the mashups into K folds and build each fold's data.
        random.shuffle(mashup_ids)
        batch = len(mashup_ids) // kcv
        for i in range(kcv):
            start_index = i * batch
            batch_stopindex = len(mashup_ids) if i == kcv - 1 else (i + 1) * batch
            test_mashups = mashup_ids[start_index:batch_stopindex]
            # BUGFIX: the old slice [batch_stopindex:-1] silently dropped the
            # last mashup from every fold's training split.
            train_mashups = (mashup_ids[:start_index] +
                             mashup_ids[batch_stopindex:])

            test_mashup_ids, all_candidate_api_ids, test_slt_ids, all_ground_api_ids = [], [], [], []
            train_mashup_api_list, train_slt_ids, train_labels = [], [], []

            for mashup_id in train_mashups:
                for slt_api_ids, true_api_instances in mid2true_instances[
                        mashup_id].items():
                    for true_api_id in true_api_instances:
                        train_slt_ids.append(slt_api_ids)
                        train_mashup_api_list.append((mashup_id, true_api_id))
                        train_labels.append(1)
                for slt_api_ids, false_api_instances in mid2false_instances[
                        mashup_id].items():
                    for false_api_id in false_api_instances:
                        train_slt_ids.append(slt_api_ids)
                        train_mashup_api_list.append((mashup_id, false_api_id))
                        train_labels.append(0)

            # Test layout differs from train: train is one sample per row,
            # while each test row lists many candidate apis for one mashup
            # and shares one ground-truth list and one slt_api_ids tuple.
            for mashup_id in test_mashups:
                for slt_api_ids, candidate_api_instances in mid2candidate_instances[
                        mashup_id].items():
                    all_candidate_api_ids.append(candidate_api_instances)
                    test_mashup_ids.append([mashup_id] *
                                           len(candidate_api_instances))
                    all_ground_api_ids.append(
                        mid2true_instances[mashup_id][slt_api_ids])
                    test_slt_ids.append(slt_api_ids)

            train_mashup_id_instances, train_api_id_instances = zip(
                *train_mashup_api_list)

            # 3. Initialize the dataset object from the train/test split.
            data = dataset(result_path, name, i)
            data.set_data(train_mashup_id_instances, train_api_id_instances,
                          train_labels, train_slt_ids, test_mashup_ids,
                          all_candidate_api_ids, all_ground_api_ids,
                          test_slt_ids)
            print('{}/{} dataset, build done!'.format(i, kcv))
            yield data

    print('you  have splited and saved them!')
Example #5
0
def split_dataset_for_oldScene_KCV(args):
    """Split the mashup-api data into K folds for the old scenario.

    Reads ``cur_data_dir``, ``num_negatives``, ``test_candidates_nums`` and
    ``kcv`` from *args* and yields one initialized dataset object per fold,
    registering each as the unique instance via set_ds().
    """
    data_dir = args.cur_data_dir
    num_negatives = args.num_negatives
    test_candidates_nums = args.test_candidates_nums
    kcv = args.kcv

    name = 'oldScene_neg{}_testCandi{}'.format(num_negatives,
                                               test_candidates_nums)
    result_root = os.path.join(data_dir, 'split_data_oldScene', name)
    mashup_api_list = meta_data(args).mashup_api_list
    mashup_api_dict = list2dict(mashup_api_list)

    # If the last fold already exists on disk, every fold does: just re-read.
    if os.path.exists(dataset(args, result_root, name,
                              kcv - 1).train_df_path):
        print('has splited data in kcv mode before,read them!')
        for i in range(kcv):
            data = dataset(args, result_root, name, i)
            data.initialize()  # restore the fold from its files
            set_ds(data)  # register the unique instance
            yield data
    else:  # not split yet
        mashup_ids = list(mashup_api_dict.keys())
        all_apis = set(meta_data(args).api_df.index.tolist())  # every api id

        # Coerce once up front; args may carry the count as a string.
        if test_candidates_nums != 'all':
            test_candidates_nums = int(test_candidates_nums)

        # 1. Fix, per mashup, its positive/negative training examples and its
        #    test candidates.  {mashup_id: [api_id, ...]}
        mid2true_instances, mid2false_instances, mid2candidate_instances = {}, {}, {}
        for mashup_id, api_ids in mashup_api_dict.items():  # api_ids is a set
            unobserved_apis_list = list(all_apis - api_ids)
            random.shuffle(unobserved_apis_list)

            api_ids_list = list(api_ids)  # observed apis are the positives
            mid2true_instances[mashup_id] = api_ids_list

            # Negatives: a shuffled sample of unobserved apis, capped at api_num.
            all_neg_num = min(
                meta_data(args).api_num, num_negatives * len(api_ids_list))
            mid2false_instances[mashup_id] = unobserved_apis_list[:all_neg_num]

            if test_candidates_nums == 'all':  # rank every api at test time
                mid2candidate_instances[mashup_id] = list(all_apis)
            else:  # true components plus a sample of unobserved apis
                mid2candidate_instances[mashup_id] = (
                    api_ids_list + unobserved_apis_list[:test_candidates_nums])

        # 2. Partition the mashups into K folds and build each fold's frames.
        random.shuffle(mashup_ids)
        batch = len(mashup_ids) // kcv
        for i in range(kcv):
            start_index = i * batch
            batch_stopindex = len(mashup_ids) if i == kcv - 1 else (i + 1) * batch
            test_mashups = mashup_ids[start_index:batch_stopindex]
            # BUGFIX: the old slice [batch_stopindex:-1] silently dropped the
            # last mashup from every fold's training split.
            train_mashups = (mashup_ids[:start_index] +
                             mashup_ids[batch_stopindex:])

            # BUGFIX: the old code called train_df.append(...) without
            # assigning the result (DataFrame.append returns a new frame), so
            # both frames always stayed empty.  Accumulate plain row dicts and
            # build each DataFrame once — also avoids quadratic appends.
            train_rows = []
            for mashup_id in train_mashups:
                for true_api_id in mid2true_instances[mashup_id]:
                    train_rows.append({
                        'mashup': mashup_id,
                        'api': true_api_id,
                        'label': 1
                    })
                for false_api_id in mid2false_instances[mashup_id]:
                    train_rows.append({
                        'mashup': mashup_id,
                        'api': false_api_id,
                        'label': 0
                    })
            train_df = pd.DataFrame(train_rows,
                                    columns=['mashup', 'api', 'label'])

            # Test layout differs from train: one row per test mashup holds
            # all candidate apis and the ground-truth list.  The 'slt_apis'
            # column is kept (unfilled) so the schema matches the new scenario.
            test_rows = []
            for mashup_id in test_mashups:
                test_rows.append({
                    'mashup': mashup_id,
                    'candidate_apis': mid2candidate_instances[mashup_id],
                    'all_ground_api_ids': mid2true_instances[mashup_id]
                })
            test_df = pd.DataFrame(test_rows,
                                   columns=[
                                       'mashup', 'slt_apis', 'candidate_apis',
                                       'all_ground_api_ids'
                                   ])

            data = dataset(args, result_root, name, i)
            data.initialize(train_df, test_df)
            set_ds(data)  # register the unique instance
            print('{}/{} dataset, build done!'.format(i, kcv))
            yield data
Example #6
0
def split_dataset_for_newScene_KCV(args):
    """Split the data into K folds for the new scenario.

    Reads from *args*:
      - cur_data_dir: directory whose data is to be split
      - num_negatives: negative-sampling ratio
      - slt_item_num: maximum number of already-selected services
      - combination_num: keep only this many combinations of real component
        services as the selected set, to ease data imbalance (e.g. C(10,3));
        'all' keeps every combination
      - test_candidates_nums: number of candidate negatives evaluated per
        mashup; 'all' evaluates every api
      - kcv: number of folds
    :return: yields the dataset object of each fold
    """
    data_dir = args.cur_data_dir
    num_negatives = args.num_negatives
    slt_num = args.slt_item_num
    slt_combination_num = args.combination_num
    test_candidates_nums = args.test_candidates_nums
    kcv = args.kcv

    name = 'newScene_neg{}_sltNum{}_com{}_testCandi{}'.format(
        num_negatives, slt_num, slt_combination_num, test_candidates_nums)
    result_root = os.path.join(data_dir, 'split_data_newScene', name)

    # If the last fold already exists on disk, every fold does: just re-read.
    if os.path.exists(dataset(args, result_root, name,
                              kcv - 1).train_df_path):
        print('data has been splited in kcv mode before,read them!')
        for i in range(kcv):
            data = dataset(args, result_root, name, i)
            data.initialize()  # restore the fold from its files
            set_ds(data)  # register the unique instance
            yield data
    else:
        mashup_api_list = meta_data(args).mashup_api_list
        mashup_api_dict = list2dict(mashup_api_list)
        mashup_ids = meta_data(args).mashup_df.index.tolist()
        mashup_ids.remove(0)  # index 0 is a placeholder
        all_apis = set(meta_data(args).api_df.index.tolist())  # every api id
        all_apis.remove(0)  # index 0 is a placeholder

        # Coerce once up front; args may carry the count as a string.
        if test_candidates_nums != 'all':
            test_candidates_nums = int(test_candidates_nums)

        # 1. Per mashup, fix the selected services and the matching
        #    positive/negative (train) and candidate (test) apis.
        #    {mashup_id: {slt_api_ids_tuple: [api_id, ...]}}
        mid2true_instances, mid2false_instances, mid2candidate_instances = {}, {}, {}
        for mashup_id, api_ids in mashup_api_dict.items():  # api_ids is a set
            unobserved_apis_list = list(all_apis - api_ids)
            random.shuffle(unobserved_apis_list)

            mid2true_instances[mashup_id] = {}
            mid2false_instances[mashup_id] = {}
            mid2candidate_instances[mashup_id] = {}

            api_ids_list = list(api_ids)
            # e.g. up to 3 selected services wanted, but only 2 components exist
            max_slt_num = min(slt_num, len(api_ids_list) - 1)
            for act_slt_num in range(1, max_slt_num + 1):  # select 1, then 2, ...
                combinations = list(
                    itertools.combinations(api_ids_list, act_slt_num))
                if slt_combination_num != 'all':
                    # Keep only a few combinations to ease data imbalance.
                    # BUGFIX: the old code reassigned slt_combination_num via
                    # min(), permanently shrinking the cap for every later
                    # mashup once a short combination list was seen.  Slicing
                    # is already safe when fewer combinations exist.
                    combinations = combinations[:slt_combination_num]

                # Each combination acts as the set of already-selected apis,
                # enlarging the data set.
                for slt_api_ids in combinations:
                    # Masked observed interactions: positives for train/test.
                    train_api_ids = list(api_ids - set(slt_api_ids))

                    # if train_positive_samples != 'all':  # keep a subset of positives
                    #     train_positive_samples_num = min(len(train_api_ids), train_positive_samples)
                    #     train_api_ids = train_api_ids[:train_positive_samples_num]

                    # slt_api_ids is a tuple, hence hashable as a dict key.
                    mid2true_instances[mashup_id][slt_api_ids] = train_api_ids

                    num_negative_instances = min(
                        num_negatives * len(train_api_ids),
                        len(unobserved_apis_list))
                    mid2false_instances[mashup_id][slt_api_ids] = (
                        unobserved_apis_list[:num_negative_instances])

                    if test_candidates_nums == 'all':  # rank every remaining api
                        test_candidates_list = list(all_apis - set(slt_api_ids))
                    else:
                        test_candidates_list = (
                            unobserved_apis_list[:test_candidates_nums] +
                            train_api_ids)
                    mid2candidate_instances[mashup_id][slt_api_ids] = (
                        test_candidates_list)

        # 2. Partition the mashups into K folds and build each fold's frames.
        random.shuffle(mashup_ids)
        batch = len(mashup_ids) // kcv
        for i in range(kcv):
            start_index = i * batch
            batch_stopindex = len(mashup_ids) if i == kcv - 1 else (i + 1) * batch
            test_mashups = mashup_ids[start_index:batch_stopindex]
            # BUGFIX: the old slice [batch_stopindex:-1] silently dropped the
            # last mashup from every fold's training split.
            train_mashups = (mashup_ids[:start_index] +
                             mashup_ids[batch_stopindex:])

            # Accumulate plain row dicts and build each DataFrame once:
            # replaces the deprecated, quadratic per-row DataFrame.append.
            train_rows = []
            for mashup_id in train_mashups:
                for slt_api_ids, true_api_instances in mid2true_instances[
                        mashup_id].items():
                    for true_api_id in true_api_instances:
                        train_rows.append({
                            'mashup': mashup_id,
                            'slt_apis': slt_api_ids,
                            'api': true_api_id,
                            'label': 1
                        })
                for slt_api_ids, false_api_instances in mid2false_instances[
                        mashup_id].items():
                    for false_api_id in false_api_instances:
                        train_rows.append({
                            'mashup': mashup_id,
                            'slt_apis': slt_api_ids,
                            'api': false_api_id,
                            'label': 0
                        })
            train_df = pd.DataFrame(
                train_rows, columns=['mashup', 'slt_apis', 'api', 'label'])

            # Test layout differs from train (one sample per train row): the
            # candidate apis are too many, so each test row holds one mashup,
            # its candidate apis, one ground-truth list and one slt_apis tuple.
            test_rows = []
            for mashup_id in test_mashups:
                for slt_api_ids, candidate_api_instances in mid2candidate_instances[
                        mashup_id].items():
                    test_rows.append({
                        'mashup': mashup_id,
                        'slt_apis': slt_api_ids,
                        'candidate_apis': candidate_api_instances,
                        'all_ground_api_ids':
                        mid2true_instances[mashup_id][slt_api_ids]
                    })
            test_df = pd.DataFrame(test_rows,
                                   columns=[
                                       'mashup', 'slt_apis', 'candidate_apis',
                                       'all_ground_api_ids'
                                   ])

            # 3. Initialize the dataset object from the train/test split.
            data = dataset(args, result_root, name, i)
            data.initialize(train_df, test_df)
            set_ds(data)  # register the unique instance
            print('{}/{} dataset, build done!'.format(i, kcv))
            yield data

    print('you have splited and saved them!')