def run():
    """Format every metaModel section: read raw docs from qcc_original,
    hand each batch to its model, and let the model write into qcc_format."""
    source_db = BaseModel(tn='qcc_original')
    target_db = BaseModel(tn='qcc_format')
    # One formatter per metaModel section.
    models = {
        '基本信息': Enterprise(),
        '企业发展': Develop(),
        '法律诉讼': Judicature(),
        '经营风险': Risk(),
        '经营状况': Operating(),
        '公司新闻': News(),
        '知识产权': Right()
    }
    metaModels = ['基本信息', '企业发展', '法律诉讼', '经营风险',
                  '经营状况', '公司新闻', '知识产权']
    for meta in metaModels:
        # NOTE(review): limit=1000 caps each section — presumably a test run;
        # confirm before relying on full coverage.
        batch = source_db.query(
            sql={'metaModel': meta},
            limit=1000,
        )
        print('\ndeal metaModel({})...'.format(meta))
        models[meta].run(batch, target_db)
    pass
def __init__(self, name):
    """Bind the backing collection, remember the entity name, and build
    the timeline sorted ascending by its first element (the date key)."""
    self.bm = BaseModel(tn='qcc', location='gcxy', dbname='data')
    self.name = name
    self.timeline = []
    self.getTimeline()  # fills self.timeline
    self.timeline.sort(key=lambda entry: entry[0], reverse=False)
    pass
def __init__(self, **kwargs):
    """Initialize the graph base class and attach the cq_all collection."""
    BaseGraph.__init__(self, **kwargs)
    self.base = BaseModel(
        tn='cq_all',
    )
    pass
def f1():
    """Scan all 基本信息 documents and count key/value pair frequencies.

    Streams enterprises from MongoDB, extracts 'key:value' strings via
    get_keys, and aggregates pair counts in chunks of 1000 records to
    keep memory bounded.

    :return: DataFrame with columns ['k', 'v', 'f'] (f = frequency)
    """
    def _count_pairs(rows):
        # One (k, v, frequency) aggregation step; was duplicated three
        # times in the original body.
        d = pd.DataFrame(rows, columns=['k', 'v'])
        d['f'] = 1
        return d.groupby(['k', 'v'], as_index=False).agg({'f': 'sum'})

    bm = BaseModel(tn='qcc_spider_all_4_14', location='server', dbname='prod')
    enterprises = bm.query(sql={'metaModel': '基本信息'}, no_cursor_timeout=True)
    ds = []
    data = []
    keep = []  # optional substring whitelist for keys; empty = keep all
    i = 0
    for etp in enterprises:
        i += 1
        cs = get_keys(etp, '基本信息', return_value=True, filter_key=[
            '_id', 'metaModel', 'source', 'url', 'headers', 'get', 'date',
            '序号', '日期', '链接', '时间'
        ])
        for c in cs:
            # NOTE(review): plain split(':') drops everything after a second
            # colon in the value — confirm values never contain ':'.
            _ = c.split(':')
            if keep:
                if any(kp in _[0] for kp in keep):
                    data.append([_[0], _[1]])
            else:
                data.append([_[0], _[1]])
        if i % 1000 == 0:
            # flush a partial aggregate to bound memory
            ds.append(_count_pairs(data))
            data.clear()
    ds.append(_count_pairs(data))  # remainder
    ds = pd.concat(ds)
    # merge the chunk aggregates into the final frequency table
    ds = ds.groupby(['k', 'v'], as_index=False).agg({'f': 'sum'})
    return ds
def __init__(self, **kwargs):
    """Initialize the graph base class and attach the cq_api collection."""
    BaseGraph.__init__(self, **kwargs)
    self.base = BaseModel(
        tn='cq_api',
    )
    pass
def insert():
    """Bulk-load every JSON file under the qcc_20200423 dump directory
    into the qcc_original collection, best-effort per file.

    Files whose batch insert fails are skipped so one bad dump does not
    abort the whole load.
    """
    bm = BaseModel(tn='qcc_original')
    fs = File.get_all_file('D:\graph_data\data\qcc_20200423\\')
    for f in fs:
        js = read_json(f)
        try:
            bm.insert_batch(js)
        except Exception:
            # was a bare `except:` — that also swallowed KeyboardInterrupt /
            # SystemExit; keep the deliberate best-effort skip but only for
            # real errors
            continue
    pass
def remove_data(cls, table_name, **kw):
    """
    Delete documents matching the keyword filter from a table.

    :param table_name: target collection name
    :param kw: equality filter, e.g. name='x'
    :return: None
    :raises MongoIOError: if the underlying delete fails
    """
    try:
        BaseModel(table_name, cls.location, cls.dbname).remove(kw)
    except Exception as e:
        # chain the cause so the original Mongo error is not lost
        raise MongoIOError('Failed with delete data by MongoDB') from e
def duplication():
    """Export the names of all 基本信息 documents to qcc_names.csv.

    (The original de-duplication pass is retired; this now only dumps the
    name column for offline inspection.)
    """
    import time
    import pandas as pd
    bm = BaseModel(tn='qcc')
    metaModels = [
        '基本信息',
    ]
    for m in metaModels:
        names = bm.aggregate(pipeline=[
            {'$match': {'metaModel': m}},
            {'$project': {'_id': 0, 'name': 1}},
        ])
        frame = pd.DataFrame(list(names))
        frame.to_csv('qcc_names.csv', index=False)
    pass
def field(cls, table_name, field_name):
    """
    Query the distinct values of a field in the database.

    :param table_name: the database's table name
    :param field_name: the table's field name
    :return: all distinct values of that field
    :raises MongoIOError: if the query fails
    """
    try:
        return BaseModel(table_name, cls.location,
                         cls.dbname).distinct(field_name)
    except Exception as e:
        # chain the cause so the original Mongo error is not lost
        raise MongoIOError('query the field raise a error') from e
def remove_data(self, table_name, **kw):
    """
    Delete documents matching the keyword filter from a table.

    :param table_name: target collection name
    :param kw: equality filter, e.g. name='x'
    :return: the driver's delete result
    :raises MongoIOError: if the underlying delete fails
    """
    try:
        r = BaseModel(table_name, self.location, self.dbname).remove(kw)
        return r
    except Exception as e:
        # chain the cause so the original Mongo error is not lost
        raise MongoIOError('Failed with delete data by MongoDB') from e
def update_date(cls, table_name, condition, **kw):
    """
    Batch-update rows of ``table_name`` that match ``condition``.

    :param table_name: target collection name
    :param condition: filter dict, e.g. {'date': datetime.datetime(2018, 1, 1)}
    :param kw: field assignments, e.g. close=0 (passed as one dict)
    :return: None
    :raises MongoIOError: if the underlying update fails
    """
    try:
        BaseModel(table_name, cls.location,
                  cls.dbname).update_batch(condition, kw)
    except Exception as e:
        # chain the cause so the original Mongo error is not lost
        raise MongoIOError('Failed with update by MongoDB') from e
def insert_data(cls, table_name, data):
    """
    Simple bulk-insert interface: writes a DataFrame as records.

    Empty frames are silently ignored.

    :param table_name: target collection name
    :param data: pandas DataFrame; one document per row
    :return: None
    :raises MongoIOError: if the underlying insert fails
    """
    try:
        if len(data):
            d = data.to_dict(orient='records')
            BaseModel(table_name, cls.location, cls.dbname).insert_batch(d)
    except Exception as e:
        # chain the cause so the original Mongo error is not lost
        raise MongoIOError('Failed with insert data by MongoDB') from e
def get_old_keys():
    """Collect every flattened key path ('$'-joined) appearing in the
    公司新闻 documents and write them, comma-joined, to 字段.csv.

    Records that dictToDim2 cannot flatten are logged and skipped.
    """
    bm = BaseModel(tn='qcc_original')
    metaModel = '公司新闻'
    enterprises = bm.query(
        sql={'metaModel': metaModel},
        field={'content': 1, '_id': 0, 'name': 1},
        no_cursor_timeout=True)
    i = 0
    exit_filed = set()
    for etp in enterprises:
        i += 1
        name = etp.pop('name')
        try:
            cs = dictToDim2(etp, metaModel, '$')
        except Exception as e:
            print(e)
            print(name)
            # BUG FIX: the original fell through here with `cs` unbound
            # (first record) or stale from the previous iteration — skip
            # the unparseable record instead.
            continue
        for c in cs:
            exit_filed.add(c)
    data = []
    for s in exit_filed:
        # drop empty segments produced by leading/trailing '$'
        segments = [seg for seg in s.split('$') if len(seg)]
        data.append(','.join(segments) + '\n')
    fp = workspace + '{}\\'.format(metaModel)
    File.check_file(fp)
    with open(fp + '字段.csv', 'w', encoding='gbk') as f:
        f.writelines(data)
    pass
def insert_one(self, table_name, data, add_id=False):
    """
    Insert one record.

    :param table_name: target collection name
    :param data: a dict (mutated in place when add_id is True)
    :param add_id: when True, stamp a fresh ObjectId into data['_id']
    :return: None
    :raises MongoIOError: if the underlying insert fails
    """
    try:
        if add_id:
            data['_id'] = ObjectId()
        BaseModel(table_name, self.location, self.dbname).insert(data)
    except Exception as e:
        # chain the cause so the original Mongo error is not lost
        raise MongoIOError('Failed with insert data by MongoDB') from e
def update_data(self, table_name, condition, **kw):
    """
    Batch-update rows of ``table_name`` that match ``condition``.

    :param table_name: target collection name
    :param condition: filter dict, e.g. {'date': datetime.datetime(2018, 1, 1)}
    :param kw: field assignments, e.g. close=0 (passed as one dict)
    :return: the driver's update result
    :raises MongoIOError: if the underlying update fails
    """
    try:
        r = BaseModel(table_name, self.location,
                      self.dbname).update_batch(condition, kw)
        return r
    except Exception as e:
        ExceptionInfo(e)
        # chain the cause so the original Mongo error is not lost
        raise MongoIOError('Failed with update by MongoDB') from e
def read_one(self, table_name, field=None, **kw):
    """
    Read a single record (lighter than read_data when one row suffices).

    :param table_name: target collection name
    :param field: optional projection dict
    :param kw: equality filter
    :return: a dict, or None when nothing matches or the query fails
    """
    try:
        return BaseModel(table_name, self.location,
                         self.dbname).query_one(kw, field)
    except Exception as e:
        # was `return` inside `finally`, which would also swallow any
        # exception raised here; log and fall back to None explicitly
        ExceptionInfo(e)
        return None
def insert_data(self, table_name, data, add_id=False):
    """
    Simple bulk-insert interface: writes a DataFrame as records.

    Empty frames are silently ignored.

    :param table_name: target collection name
    :param data: pandas DataFrame; one document per row
    :param add_id: when True, stamp a fresh ObjectId per row into '_id'
    :return: None
    :raises MongoIOError: if the underlying insert fails
    """
    try:
        if add_id:
            data['_id'] = data.index.map(lambda x: ObjectId())
        if len(data):
            d = data.to_dict(orient='records')
            BaseModel(table_name, self.location, self.dbname).insert_batch(d)
    except Exception as e:
        # chain the cause so the original Mongo error is not lost
        raise MongoIOError('Failed with insert data by MongoDB') from e
def aggregate(self, table_name, pipeline):
    """
    Run an aggregation pipeline and return the result as a DataFrame.

    :param table_name: target collection name
    :param pipeline: list of pipeline stages, e.g. '$match', '$project'
    :return: DataFrame of results; empty DataFrame on failure
    """
    try:
        cursor = BaseModel(table_name, self.location,
                           self.dbname).aggregate(pipeline)
        try:
            data = pd.DataFrame(list(cursor))
        finally:
            # close even if DataFrame construction fails — the original
            # leaked the cursor on that path
            cursor.close()
        return data
    except Exception as e:
        ExceptionInfo(e)
        return pd.DataFrame()
def read_data(cls, table_name, field=None, **kw):
    """
    Simple read interface: query a table into a DataFrame.

    :param table_name: target collection name
    :param field: optional projection dict
    :param kw: equality filter
    :return: DataFrame of matches; empty DataFrame on no match or failure
    """
    # BUG FIX: the original `finally` referenced `cursor` and `data`,
    # which are unbound when query() itself raises (UnboundLocalError
    # masking the real error); it also returned from inside `finally`.
    cursor = None
    data = pd.DataFrame()
    try:
        cursor = BaseModel(table_name, cls.location,
                           cls.dbname).query(kw, field)
        # NOTE(review): cursor.count() is deprecated in modern pymongo —
        # confirm the driver version before upgrading.
        if cursor.count():
            data = pd.DataFrame(list(cursor))
    except Exception as e:
        ExceptionInfo(e)
    finally:
        if cursor is not None:
            cursor.close()
    return data
def min(cls, table_name, field='_id', **kw):
    """
    Return the minimum value of ``field`` over rows matching ``kw``.

    :param table_name: target collection name
    :param field: column to minimize; must be a str
    :param kw: equality filter
    :return: the minimum value, or None when nothing matches
    :raises TypeError: if field is not a str
    :raises Exception: re-raises any underlying query error
    """
    if not isinstance(field, str):
        raise TypeError('field must be an instance of str')
    cursor = BaseModel(table_name, cls.location,
                       cls.dbname).query(sql=kw, field={field: True})
    try:
        # NOTE(review): cursor.count() is deprecated in modern pymongo.
        if cursor.count():
            d = pd.DataFrame(list(cursor))
            m = d.loc[:, [field]].min()[field]
        else:
            m = None
    finally:
        # the original leaked the cursor when an exception occurred here;
        # it also used `raise e`, which rewrites the traceback — letting
        # the exception propagate naturally preserves it
        cursor.close()
    return m
def lasted_ticker(cls, code, date, table_name='ticker'):
    """Fetch the latest tick (within the minute ending at ``date``) for
    one or more stock codes, keeping one newest row per code.

    :param code: a stock code (str) or list of codes
    :param date: a datetime pinpointing the moment of interest
    :param table_name: tick collection name, default 'ticker'
    :return: DataFrame with at most one row per stock_code; empty on error
    """
    try:
        # normalize the code filter
        if isinstance(code, str):
            code_filter = code
        elif isinstance(code, list):
            code_filter = {'$in': code}
        else:
            raise TypeError("'code' must be str or list of str")
        # day bucket + one-minute lookback window
        if isinstance(date, dt.datetime):
            day = dt.datetime(date.year, date.month, date.day)
            window = {'$gte': date - dt.timedelta(minutes=1), '$lte': date}
        else:
            raise TypeError("this 'date' must be datetime")
        cursor = BaseModel(table_name, cls.location, cls.dbname).aggregate([
            {'$match': {'stock_code': code_filter, 'date': day}},
            {'$match': {'datetime': window}},
        ])
        result = pd.DataFrame(list(cursor))
        if len(result):
            # newest first, then keep the first (latest) row per code
            result = (result
                      .sort_values(['stock_code', 'datetime'], ascending=False)
                      .drop_duplicates(['stock_code'], keep='first')
                      .reset_index(drop=True))
        cursor.close()
        return result
    except Exception as e:
        ExceptionInfo(e)
        return pd.DataFrame()
def read_data(cls, code, start_date, end_date, field=None,
              timemerge=False, **kw):
    """
    Read kline_tick rows for a stock code over a date range.

    :param code: stock code filter
    :param start_date: inclusive range start
    :param end_date: inclusive range end
    :param field: optional projection dict
    :param timemerge: when True, post-process via cls.merge_time
    :param kw: extra filter terms (may override stock_code/date)
    :return: DataFrame of matches; empty DataFrame on no match or failure
    """
    try:
        sql = dict(stock_code=code, date={
            '$gte': start_date,
            '$lte': end_date
        })
        sql.update(kw)  # kw may deliberately override the defaults
        cursor = BaseModel('kline_tick', cls.location,
                           cls.dbname).query(sql, field)
        try:
            # NOTE(review): cursor.count() is deprecated in modern pymongo.
            if cursor.count():
                data = pd.DataFrame(list(cursor))
                return cls.merge_time(data) if timemerge else data
            return pd.DataFrame()
        finally:
            # single close point; the original leaked the cursor when
            # DataFrame construction or merge_time raised
            cursor.close()
    except Exception as e:
        ExceptionInfo(e)
        return pd.DataFrame()
def f1():
    """One-off loader: read a single crawled JSON dump and bulk-insert it
    into the qcc_cq_new_test collection."""
    from Calf.data import BaseModel
    model = BaseModel(tn='qcc_cq_new_test')
    records = read_json('D:\graph_data\基本信息\\115.238.252.22_owakrjctbm.json')
    model.insert_batch(records)
def __init__(self):
    """Initialize the graph base class and attach the 重庆裁决文书(内容)
    collection."""
    BaseGraph.__init__(self)
    self.base = BaseModel(tn='重庆裁决文书(内容)')
    pass
def check():
    """Pre-import sanity pass over the neo4j CSV export directory.

    func1 (currently disabled) drops Enterprise nodes that already exist
    in the 基本信息 collection; func2 drops Related nodes whose URL:ID is
    shared by >3 rows with suspiciously short (<4 char) names.
    """
    fps = File.get_all_file(import_path)
    n_fps = []
    r_fps = []
    for p in fps:
        if 'nodes' in p:
            n_fps.append(p)
        # BUG FIX: original tested `if 'relationships':` (a constant,
        # always truthy) so every file landed in r_fps
        if 'relationships' in p:
            r_fps.append(p)
    from Calf.data import BaseModel
    base = BaseModel(tn='cq_all',
                     )

    def func1():
        # Drop Enterprise nodes (outside 基本信息 exports) that already
        # exist in the database.
        etp_fps = []
        for p in n_fps:
            if 'Enterprise' in p and 'EtpGraph' not in p:
                etp_fps.append(p)
        etp_fps = set([os.path.join(*p.split('\\')[:-1]) for p in etp_fps])
        etp = entities('Enterprise')
        etp_data = []
        for ep in etp_fps:
            ed = etp.read_csv(ep, ep)
            etp_data.append(ed)
        etp_data = pd.concat(etp_data)
        etp_data.drop_duplicates(['URL:ID(Enterprise)'], inplace=True)
        etp_data.reset_index(drop=True, inplace=True)
        total = len(etp_data)
        etp_data['exist'] = False
        for i, r in etp_data.iterrows():
            try:
                _ = base.query_one(sql={
                    'name': r['NAME'],
                    'metaModel': '基本信息'
                }, field={
                    'name': 1,
                    '_id': 0
                })
                if _ is not None:
                    etp_data.loc[i, ['exist']] = True
                if i % 100 == 0:
                    progress_bar(total, i, 'check')
            except Exception as e:
                print(e)
        etp_data = etp_data[~etp_data['exist']]
        etp.to_csv(etp_data, import_path, split_header=True)

    # func1()

    def func2():
        # Drop Related nodes that look like merge artifacts: one URL:ID
        # shared by many rows whose NAME is very short.
        rel_fps = []
        for p in n_fps:
            if 'Related' in p:
                rel_fps.append(p)
        rel_fps = set([os.path.join(*p.split('\\')[:-1]) for p in rel_fps])
        rel = entities('Related')
        rel_data = []
        for ep in rel_fps:
            ed = rel.read_csv(ep, ep)
            rel_data.append(ed)
        rel_data = pd.concat(rel_data)
        drop = rel_data.loc[:, ['URL:ID(Related)', 'NAME']]
        drop['count'] = 1
        drop = drop.groupby(['URL:ID(Related)'], as_index=False).agg({
            'count': 'count',
            'NAME': 'first'
        })
        drop = drop[(drop['count'] > 3) & (drop['NAME'].str.len() < 4)]
        drop = drop['URL:ID(Related)']
        if len(drop):
            rel_data = rel_data[~rel_data['URL:ID(Related)'].isin(drop)]
        rel.to_csv(rel_data, import_path, split_header=True)

    func2()
    pass