Esempio n. 1
0
    def txtSave(self, inputFile, outputFile):
        """Collect the text files below ``inputFile`` into a Bunch and pickle it.

        Every entry of ``inputFile`` is treated as a category directory and
        every entry inside it as one sample of that category.

        :param inputFile: root directory holding one sub-directory per category
        :param outputFile: path of the pickle file to write
        :return: None; any exception is logged and swallowed
        """
        try:
            Logger().get_log().error('将文件信息转换为bunch对象')
            catelist = os.listdir(inputFile)
            bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])
            bunch.target_name.extend(catelist)  # remember the category names
            for eachDir in catelist:
                print(eachDir, inputFile)
                # os.path.join uses the separator of the running platform; the
                # previous hard-coded backslashes only worked on Windows.
                eachPath = os.path.join(inputFile, eachDir)
                for eachFile in os.listdir(eachPath):  # files of this category
                    fullName = os.path.join(eachPath, eachFile)
                    bunch.label.append(eachDir)  # category label of the sample
                    bunch.filenames.append(fullName)  # full path of the sample
                    # stripped text returned by TXTFile(...).read()
                    bunch.contents.append(TXTFile(fullName).read().strip())
            # pickle writes bytes, so the file must be opened in binary mode
            with open(outputFile, 'wb') as file_obj:
                pickle.dump(bunch, file_obj)
        except Exception as ex:
            Logger('error').get_log().error(ex)
Esempio n. 2
0
 def normalReader(self, line_format='user item rating', sep=','):
     """Load the ratings file named by ``self.config['path']``.

     The Reader built for the load is also stored on ``self.reader``.

     :param line_format: field layout handed to Reader
     :param sep: field separator handed to Reader
     :return: result of Dataset.load_from_file, or None after a logged error
     """
     try:
         parser = Reader(sep=sep, line_format=line_format)
         self.reader = parser
         dataset = Dataset.load_from_file(self.config['path'], reader=parser)
     except Exception as ex:
         Logger('error').get_log().error(ex)
         Logger('error').clear()
     else:
         return dataset
Esempio n. 3
0
 def evaluate(self, data, measures=None):
     """Cross-validate ``self.algo`` on ``data``.

     :param data: dataset forwarded to cross_validate
     :param measures: measure names to compute; defaults to ['rmse', 'mae']
     :return: result of cross_validate, or None after a logged error
     """
     # Build the default per call: a list literal in the signature is a single
     # object shared by every call.
     if measures is None:
         measures = ['rmse', 'mae']
     try:
         print('-----------------' + str(self.algo) +
               '-----------------------')
         return cross_validate(self.algo, data, measures=measures)
     except Exception as ex:
         Logger('error').get_log().error(ex)
         Logger('error').clear()
Esempio n. 4
0
 def adapter(self, sep, name_list):
     """Number the distinct names of ``name_list`` and build both lookups.

     Ids start at 1 and follow the order of first appearance.

     :param sep: not used by this method
     :param name_list: iterable of hashable names
     :return: (rid_to_name, name_to_rid), or None after a logged error
     """
     try:
         Logger().get_log().error('adapter is built')
         name_to_rid = {}
         for label in name_list:
             if label not in name_to_rid:
                 # next free id == number of names seen so far + 1
                 name_to_rid[label] = len(name_to_rid) + 1
         rid_to_name = {rid: label for label, rid in name_to_rid.items()}
         return rid_to_name, name_to_rid
     except Exception as ex:
         Logger('error').get_log().error(ex)
Esempio n. 5
0
    def getRS(self):
        """Instantiate the models selected by ``self.config['type']``.

        Each model is constructed with ``self.config['path']`` and the list is
        stored on ``self.models``. An unrecognised type leaves ``self.models``
        untouched. Exceptions are logged and swallowed.
        """
        try:
            print('获取推荐系统')
            path = self.config['path']
            family = self.config['type']
            if family == 'KNN':
                factories = (KNNBaselineRS, KNNBasicRS)
            elif family == 'BaselineAlgorithms':
                factories = (BaselineOnlyRS, NormalPredictorRS)
            elif family == 'MatrixFactorization':
                factories = (SVDRS, SVDppRS, NMFRS)
            else:
                factories = None
            if factories is not None:
                self.models = [make(path) for make in factories]
        except Exception as ex:
            Logger('error').get_log().error(ex)
            Logger().clear()
Esempio n. 6
0
    def getNeighbors(self, namelist: list, target, n):
        """Return the labels of the ``n`` nearest neighbours of ``target``.

        :param namelist: names of all users, forwarded to read_item_names
        :param target: label of the target user
        :param n: number of neighbours to fetch
        :return: list of neighbour labels, or None after a logged error
        """
        try:
            # name -> raw id and raw id -> name lookups
            rid_to_name, name_to_rid = self.read_item_names(namelist)
            trainset = self.algo.trainset
            target_inner_id = trainset.to_inner_iid(name_to_rid[target])
            neighbor_inner_ids = self.algo.get_neighbors(target_inner_id, k=n)

            # Materialise as a list: the result is printed and then returned,
            # and a generator would already be exhausted by the print loop.
            neighbors = [rid_to_name[trainset.to_raw_iid(inner_id)]
                         for inner_id in neighbor_inner_ids]

            # format() accepts an int n; 'The' + n raised TypeError
            print('The {} nearest neighbors of {} are:'.format(n, target))
            for user in neighbors:
                print(user)
            return neighbors
        except Exception as ex:
            Logger('error').get_log().error(ex)
0
 def reader(self, line_format='user item rating', sep=','):
     """Load the file at ``self.path`` into ``self.data``.

     Note that the Reader is stored on ``self.reader``, i.e. under the same
     attribute name as this method. The type of the loaded data is printed.

     :param line_format: field layout handed to Reader
     :param sep: field separator handed to Reader
     :return: None; any exception is logged and swallowed
     """
     try:
         parser = Reader(sep=sep, line_format=line_format)
         self.reader = parser
         loaded = Dataset.load_from_file(self.path, reader=parser)
         self.data = loaded
         print(type(self.data))
     except Exception as ex:
         Logger('error').get_log().error(ex)
Esempio n. 8
0
    def add(self, index, config=None):
        """Create the Elasticsearch index ``index`` unless it already exists.

        :param index: name of the index
        :param config: optional dict with 'number_of_shards' and
            'number_of_replicas'; keys that are missing fall back to
            5 shards and 0 replicas
        :return: None; any exception is logged and swallowed
        """
        # Defaults are built per call instead of living in the signature, and
        # a partial config no longer raises KeyError.
        shard_config = {'number_of_shards': 5, 'number_of_replicas': 0}
        if config:
            shard_config.update(config)
        settings = {
            "settings": {
                "number_of_shards": shard_config['number_of_shards'],
                "number_of_replicas": shard_config['number_of_replicas']
            },
            "mappings": {
                "Document": {
                    # per the Elasticsearch mapping docs, "strict" rejects
                    # documents that contain fields missing from this mapping
                    "dynamic": "strict",
                    "properties": {
                        "content": {
                            "type": "text"
                        },
                        "file_name": {
                            "type": "text"
                        },
                        "Date": {
                            "type": "date"
                        }
                    }
                }
            }
        }
        try:
            if not self._es.indices.exists(index):
                # ignore=400 is forwarded unchanged from the original call
                self._es.indices.create(index=index, ignore=400, body=settings)
                print('Created Index')
        except Exception as ex:
            # One format string plus its argument; error(ex, '创建失败') handed
            # the text over as a stray formatting argument (a problem if
            # get_log() returns a standard logging.Logger).
            Logger('error').get_log().error('%s 创建失败', ex)
Esempio n. 9
0
 def reader(self, line_format='user item rating', sep=','):
     """Load ``self.path`` into ``self.data`` and build the name lookups.

     The Reader is stored on ``self.reader`` (same attribute name as this
     method); the two mappings returned by ``read_item_names(sep)`` are stored
     on ``self.rid_to_name`` and ``self.name_to_rid``.

     :param line_format: field layout handed to Reader
     :param sep: field separator handed to Reader and to read_item_names
     :return: None; any exception is logged and swallowed
     """
     try:
         parser = Reader(sep=sep, line_format=line_format)
         self.reader = parser
         self.data = Dataset.load_from_file(self.path, reader=parser)
         lookups = self.read_item_names(sep)
         self.rid_to_name, self.name_to_rid = lookups
     except Exception as ex:
         Logger('error').get_log().error(ex)
Esempio n. 10
0
 def filter(self):
     """Evaluate up to four models in worker processes and return the best.

     The data is loaded once per model through the reader chosen by
     ``self.config['reader']``, each model's ``evaluate`` runs in a process
     pool, and the model with the lowest mean error is returned. RMSE is
     used when a result contains it, otherwise MAE.

     Side effects: sets ``self.reader``, ``self.data`` (index -> dataset)
     and ``self.result`` (index -> async result).

     :return: the selected model (model 0 when no result is usable),
         or None after a logged error
     """
     try:
         print('run start', datetime.datetime.now())
         self.result, self.data = {}, {}
         print('筛选推荐系统')
         reader_kind = self.config['reader']
         if reader_kind == 'json':
             self.reader = self.jsonReader
             # NOTE(review): the jsonReader visible in this code base only
             # accepts rating_scale, so line_format/sep are not forwarded.
             reader_args = ()
         else:
             if reader_kind == 'normal':
                 self.reader = self.normalReader
             reader_args = (self.config['line_format'], self.config['sep'])

         candidates = self.models[:4]
         if not candidates:
             raise ValueError('no models to evaluate')

         pool = multiprocessing.Pool(processes=4)
         try:
             for index, model in enumerate(candidates):
                 print('model' + str(index) + '读取数据')
                 self.data[index] = pool.apply(self.reader, reader_args)
         finally:
             # always release the workers, even when loading fails
             pool.close()
             pool.join()
         print('读取数据完毕')

         pool = multiprocessing.Pool(processes=min(4, len(candidates)))
         try:
             for index, model in enumerate(candidates):
                 print('模型' + str(model) + '评估')
                 self.result[index] = pool.apply_async(
                     model.evaluate, (self.data[index], ['rmse', 'mae']))
         finally:
             pool.close()
             pool.join()

         best_index, best_error = 0, None
         for key, pending in self.result.items():
             res = pending.get()
             print(res)
             if not res:
                 # evaluate() returned nothing usable for this model
                 continue
             error = None
             for metric in ('test_rmse', 'test_mae'):
                 values = res.get(metric)
                 if values is not None and len(values):
                     error = float(np.mean(values))
                     break
             if error is None:
                 continue
             # RMSE and MAE are error measures: smaller is better
             if best_error is None or error < best_error:
                 best_index, best_error = key, error
         print('模型评估完毕, 最终选择' + str(best_index) + '号模型')
         print('run finish', datetime.datetime.now())
         return self.models[best_index]
     except Exception as ex:
         Logger('error').get_log().error(ex)
         return None
Esempio n. 11
0
 def connect(self, name):
     """Open a MongoDB client and select the database ``name``.

     Host and port are read from the ``mongo_config`` mapping defined
     outside this method. The client is stored on ``self.myclient`` and the
     database handle on ``self.db``.

     :param name: database name
     :return: None; any exception is logged and swallowed
     """
     try:
         # format() accepts the port as int or str; '+' raised on an int port
         uri = 'mongodb://{}:{}'.format(mongo_config['host'],
                                        mongo_config['port'])
         self.myclient = pymongo.MongoClient(uri)
         self.db = self.myclient[name]
     except Exception as e:
         Logger('error').get_log().error(e)
Esempio n. 12
0
 def insert(self, index, datas: list, doc_type=None):
     """Bulk-index ``datas`` into the Elasticsearch index ``index``.

     :param index: target index name
     :param datas: documents to write, one ``_source`` each; nothing is sent
         when the list is empty
     :param doc_type: optional mapping type; ``_type`` is only sent when given
     :return: None; bulk errors are logged and swallowed
     """
     if not datas:
         return
     actions = []
     for data in datas:
         # No "_id" key is sent, so Elasticsearch assigns the id itself. The
         # former '"_type": type' put the builtin ``type`` into the action.
         action = {"_index": index, "_source": data}
         if doc_type is not None:
             action["_type"] = doc_type
         actions.append(action)
     startime = datetime.datetime.now()
     try:
         helpers.bulk(self._es, actions, request_timeout=100)
         # format() renders the datetime; 'datetime + str' raised TypeError
         # and turned every successful write into a logged failure.
         Logger().get_log().error(
             '{}开始本次共写入{}条数据'.format(startime, len(actions)))
     except Exception as ex:
         Logger('error').get_log().error(ex)
Esempio n. 13
0
 def insert(self, table):
     """Return the collection ``table`` of ``self.db``.

     A notice is printed when a collection of that name is already listed.

     :param table: collection name
     :return: ``self.db[table]``, or None after a logged error
     """
     try:
         already_listed = table in self.db.list_collection_names()
         if already_listed:
             print('数据表已经存在,无需重复创建')
         collection = self.db[table]
         return collection
     except Exception as e:
         Logger('error').get_log().error(e)
Esempio n. 14
0
 def drop(self, table):
     """Drop the collection ``table`` when it is listed in ``self.db``.

     A success or failure message is printed in either case.

     :param table: collection name
     :return: None; any exception is logged and swallowed
     """
     try:
         existing = self.db.list_collection_names()
         if table in existing:
             self.db[table].drop()
             print("删除{}表成功".format(table))
         else:
             print('删除{}表失败'.format(table))
     except Exception as ex:
         Logger('error').get_log().error(ex)
Esempio n. 15
0
 def predict(self, k=4):
     """Return the labels of the objects most similar to ``self.target``.

     :param k: number of neighbours requested from ``self.algo``
     :return: list of at most 10 neighbour labels, or None after a logged error
     """
     try:
         trainset = self.algo.trainset
         raw_id = self.name_to_rid[self.target]
         inner_id = trainset.to_inner_iid(raw_id)
         neighbor_ids = self.algo.get_neighbors(inner_id, k=k)
         # Build a real list: the former generator supported neither len()
         # nor slicing, so the return statement always raised TypeError.
         neighbors = [self.rid_to_name[trainset.to_raw_iid(neighbor_id)]
                      for neighbor_id in neighbor_ids]
         return neighbors[:10]
     except PredictionImpossible as pl:
         Logger('error').get_log().error(pl)
     except Exception as ex:
         Logger('error').get_log().error(ex)
Esempio n. 16
0
 def rank(self, person, n=5):
     """Return the ``n`` entries of ``self.prefs`` most similar to ``person``.

     :param person: key of the reference user in ``self.prefs``
     :param n: maximum number of entries to return
     :return: list of (similarity, other) tuples sorted from most to least
         similar, or None after a logged error
     """
     try:
         # similarity between the reference user and every other user
         scores = [(self.sim_distance(person, other), other)
                   for other in self.prefs if other != person]
         # tuples sort by their first element, i.e. by similarity
         scores.sort(reverse=True)
         # Slicing already copes with short lists; the former condition was
         # inverted and never truncated the result.
         return scores[:n]
     except Exception as ex:
         Logger('error').get_log().error(ex)
Esempio n. 17
0
 def read(self):
     """Read the CSV file at ``self.path``.

     :return: list of rows (each a list of strings), or None after a
         logged error
     """
     try:
         # newline='' lets the csv module do its own line-ending handling, so
         # quoted fields containing line breaks are returned unchanged.
         with open(self.path, 'r', encoding='utf-8', newline='') as f:
             return list(csv.reader(f))
     except Exception as ex:
         Logger('error').get_log().error(ex)
Esempio n. 18
0
 def read(self, path):
     """Parse the JSON file at ``path``.

     :param path: file path
     :return: the decoded JSON value, or None after a logged error
     """
     try:
         # Explicit UTF-8: without it the platform default encoding is used
         # and non-ASCII JSON can fail to decode.
         with open(path, 'r', encoding='utf-8') as f:
             return json.load(fp=f)
     except Exception as ex:
         Logger('error').get_log().error(ex)
Esempio n. 19
0
 def dbSave(self, dbname='test'):
     """Collect the documents of database ``dbname`` into a Bunch.

     The collection names are the directory entries of the 'DataBasePath'
     entry returned by ``Path(dbname).get_PathDict()``.

     :param dbname: database name
     :return: the Bunch; it holds whatever was collected before a logged
         error occurred
     """
     # Created before the try block so the final return always has a value.
     bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])
     try:
         Logger().get_log().error('将数据库信息转化为bunch对象')
         pathdict = Path(dbname).get_PathDict()
         catelist = os.listdir(pathdict['DataBasePath'])
         bunch.target_name.extend(catelist)  # remember the category names
         mongoDB = MongoDB()
         mongoDB.connect(dbname)
         for table in catelist:
             collection = mongoDB.insert(table)
             for item in collection.find():
                 bunch.label.append(item['type'])
                 bunch.filenames.append(item['file_name'])
                 bunch.contents.append(item['content'].strip())
     except Exception as ex:
         Logger('error').get_log().error(ex)
     # A plain return instead of 'finally: return', which also swallowed
     # non-Exception errors such as KeyboardInterrupt.
     return bunch
Esempio n. 20
0
 def jsonReader(self, rating_scale=(1, 5)):
     """Load ratings from the JSON file named by ``self.config['path']``.

     Example of the JSON layout given by the original author:
     {'itemID': [1, 1, 1, 2, 2],
      'userID': [9, 32, 2, 45, 'user_foo'],
      'rating': [3, 2, 4, 3, 1]}

     The Reader that is built is also stored on ``self.reader``.

     :param rating_scale: rating range handed to Reader
     :return: result of Dataset.load_from_df, or None after a logged error
     """
     try:
         payload = Json(self.config['path']).read()
         columns = [*payload.keys()]
         frame = pd.DataFrame(payload)
         parser = Reader(rating_scale=rating_scale)
         self.reader = parser
         # Original author's note: the columns passed in must correspond to
         # userID, itemID and rating, strictly in that order.
         selected = frame[columns]
         return Dataset.load_from_df(selected, reader=parser)
     except Exception as ex:
         Logger('error').get_log().error(ex)
         Logger('error').clear()
Esempio n. 21
0
    def read(self, line_format, sep=','):
        """Load the file at ``self.path`` as a Dataset.

        :param line_format: field layout, e.g. 'user item rating'
        :param sep: field separator
        :return: result of Dataset.load_from_file, or None after a logged error
        """
        try:
            parser = Reader(sep=sep, line_format=line_format)
            dataset = Dataset.load_from_file(self.path, reader=parser)
        except Exception as ex:
            Logger('error').get_log().error(ex)
        else:
            return dataset
Esempio n. 22
0
 def documents_Init(self, DataBasePath, index_name='test'):
     '''
     Read the text documents through Folder/TXTFile and pass a batch to
     self.insert for the index ``index_name``.

     NOTE(review): as written, the batch handed to self.insert is always
     empty - the only statement that fills save_dict is commented out.

     :param DataBasePath: root directory of the data files; only listed
         here, the listing is not used afterwards
     :param index_name:   name of the target index
     :return: None
     '''
     print('将所有文档存储到elasticsearch')
     folder, txtFile = Folder(), TXTFile()
     # NOTE(review): indexnames is never read again; os.listdir still raises
     # here (outside the try below) when DataBasePath does not exist.
     indexnames = os.listdir(DataBasePath)
     save_dict = []
     content = folder.read('txt', txtFile)
     for item in content:
         # NOTE(review): str(content) stringifies the whole collection on every
         # iteration; str(item) was presumably intended - confirm.
         result = (str(content)).replace("\r\n", "").strip()  # drop CRLF pairs and surrounding whitespace
         cutResult = jieba.cut(result)  # jieba segmentation; the result is currently unused
         # save_dict.append(
         #         {'file_name': ChildPath, 'content': result, 'type': name, 'keywords': ' '.join(cutResult)})
     try:
         self.insert(index_name, save_dict)  # bulk-import the batch via self.insert
     except Exception as ex:
         Logger('error').get_log().error(ex)
     else:
         # runs only when self.insert raised nothing
         Logger().get_log().error('索引初始化完成')
Esempio n. 23
0
    def build(self):
        """Instantiate and run the class named by ``self.opts['option']``.

        Only acts when ``self.opts['engine']`` is 'RSEngine' and a non-empty
        'option' is present. The class is looked up by name in the sibling
        module 'Recommand' (relative import), constructed with ``self.opts``
        and its ``run()`` is called. Exceptions are logged and swallowed.
        """
        try:
            if self.opts['engine'] != 'RSEngine':
                return  # 'TPEngine' and unknown engines do nothing
            if 'option' not in self.opts.keys():
                return
            option = self.opts['option']
            if not option:
                return
            module_meta = __import__('Recommand', globals(), locals(),
                                     [option], level=1)
            class_meta = getattr(module_meta, option)
            class_meta(self.opts).run()
        except Exception as ex:
            Logger('error').get_log().error(ex)
Esempio n. 24
0
 def bunchSave(self, config, dbname='test'):
     """Collect the documents of ``self.db`` into a Bunch.

     The collection names are the directory entries of
     ``config['DataBasePath']``.

     :param config: mapping that provides 'DataBasePath'
     :param dbname: not used by this method
     :return: the filled Bunch, or None after a logged error
     """
     try:
         print('将数据库信息转换为bunch对象')
         tables = os.listdir(config['DataBasePath'])
         bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])
         bunch.target_name.extend(tables)  # remember the category names
         for table in tables:
             for document in self.db[table].find():
                 bunch.label.append(document['type'])
                 bunch.filenames.append(document['file_name'])
                 bunch.contents.append(document['content'].strip())
         return bunch
     except Exception as ex:
         Logger('error').get_log().error(ex)
Esempio n. 25
0
 def init_Path(self):
     """Register the path layout of ``self.database`` in ``Path_Dict``.

     Only path strings are built below ``DataBasePath/<database>``; nothing
     is created on disk.

     :return: the dict stored under ``Path_Dict[self.database]``; the final
         lookup happens outside the try block, so it is not protected by
         the error logging
     """
     # imported here so the method does not rely on a module-level import
     import os

     try:
         # os.path.join uses the separator of the running platform instead
         # of hard-coded backslashes, which only form paths on Windows.
         root = os.path.join(DataBasePath, self.database)
         Path_Dict[self.database] = {
             "DataBasePath": os.path.join(root, "data"),
             "TestBasePath": os.path.join(root, "test"),
             "inputpath": os.path.join(root, "data"),
             "outputpath": os.path.join(root, "segResult"),
             "trainset": os.path.join(root, "trainset.dat"),
             "tfidfspace": os.path.join(root, "tfidfspace.dat"),
             "testbunch": os.path.join(root, "test_set.dat"),
             "predictspace": os.path.join(root, "predict.dat"),
             "stopwords": os.path.join(root, "stopword.txt"),
         }
     except Exception as ex:
         Logger('error').get_log().error(ex)
     return Path_Dict[self.database]
Esempio n. 26
0
def bunchSave(inputFile, outputFile):
    """Collect the text files below ``inputFile`` into a Bunch and pickle it.

    Every entry of ``inputFile`` is treated as a category directory and every
    entry inside it as one sample of that category.

    :param inputFile: root directory holding one sub-directory per category
    :param outputFile: path of the pickle file; open() creates the file but
        not its parent directory, which has to exist already
    :return: None
    """
    print('run bunchSave', time.time())
    start = time.time()
    catelist = os.listdir(inputFile)

    bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])
    bunch.target_name.extend(catelist)  # remember the category names

    for eachDir in catelist:
        eachPath = inputFile + "/" + eachDir + "/"
        for eachFile in os.listdir(eachPath):  # files of this category
            fullName = eachPath + eachFile  # full path of the sample
            bunch.label.append(eachDir)  # category label of the sample
            bunch.filenames.append(fullName)
            # stripped text returned by TXTFile(...).read()
            bunch.contents.append(TXTFile(fullName).read().strip())
    # pickle writes bytes, so the file must be opened in binary mode
    with open(outputFile, 'wb') as file_obj:
        pickle.dump(bunch, file_obj)
    finished = time.time()
    # One pre-formatted message: the former call passed extra positional
    # arguments without placeholders and computed a negative duration
    # (start - now).
    Logger().get_log().error('finish bunch save + {}, use time :{}'.format(
        finished, finished - start))
Esempio n. 27
0
 def train(self):
     """Fit ``self.algo`` on the full training set built from ``self.data``.

     :return: None; any exception is logged and swallowed
     """
     try:
         full_trainset = self.data.build_full_trainset()
         self.algo.fit(full_trainset)
     except Exception as ex:
         Logger('error').get_log().error(ex)
Esempio n. 28
0
 def search_many(self, collection, target: dict):
     """Run ``find(target)`` on the collection ``collection`` of ``self.db``.

     :param collection: collection name
     :param target: query document handed to find()
     :return: whatever find() returns, or None after a logged error
     """
     try:
         selected = self.db[collection]
         return selected.find(target)
     except Exception as ex:
         Logger('error').get_log().error(ex)
Esempio n. 29
0
 def run(self):
     """Call ``self.getRS()`` and then ``self.filter()``.

     The return values of both calls are discarded.

     :return: None; any exception is logged and swallowed
     """
     try:
         # attributes are looked up one at a time, right before each call
         for step_name in ('getRS', 'filter'):
             getattr(self, step_name)()
     except Exception as ex:
         Logger('error').get_log().error(ex)
Esempio n. 30
0
 def update(self, collection, filter, update):
     """Apply ``update`` as a ``$set`` to one document matching ``filter``.

     :param collection: collection name in ``self.db``
     :param filter: query document selecting the record
     :param update: mapping of fields to set
     :return: whatever find_one_and_update returns, or None after a
         logged error
     """
     try:
         change = {'$set': update}
         selected = self.db[collection]
         return selected.find_one_and_update(filter, change)
     except Exception as ex:
         Logger('error').get_log().error(ex)