Exemple #1
0
def run():
    """Pull raw QCC records per meta-model and feed them to the matching formatter."""
    source = BaseModel(tn='qcc_original')
    target = BaseModel(tn='qcc_format')

    # One formatter instance per meta-model section (keys are the section names).
    handlers = {
        '基本信息': Enterprise(),
        '企业发展': Develop(),
        '法律诉讼': Judicature(),
        '经营风险': Risk(),
        '经营状况': Operating(),
        '公司新闻': News(),
        '知识产权': Right()
    }
    order = ['基本信息', '企业发展', '法律诉讼', '经营风险', '经营状况', '公司新闻', '知识产权']
    for meta in order:
        records = source.query(
            sql={
                'metaModel': meta,
                # 'name': '重庆斯麦尔酒店有限公司'
            },
            # field={'content': 1, '_id': 0},
            # no_cursor_timeout=True
            limit=1000,
            # skip=0
        )
        print('\ndeal metaModel({})...'.format(meta))
        handlers[meta].run(records, target)
Exemple #2
0
 def __init__(self, name):
     """Build the named enterprise's event timeline, sorted oldest-first."""
     self.bm = BaseModel(tn='qcc', location='gcxy', dbname='data')
     self.name = name
     self.timeline = []
     # getTimeline() fills self.timeline; each entry's first element is its date.
     self.getTimeline()
     self.timeline.sort(key=lambda event: event[0])
Exemple #3
0
 def __init__(self, **kwargs):
     """Set up the graph base class and bind the 'cq_all' collection as source."""
     BaseGraph.__init__(self, **kwargs)
     # Alternate deployments kept for reference:
     # location='local2', dbname='data'
     self.base = BaseModel(tn='cq_all')
Exemple #4
0
def f1():
    """Collect key/value frequency statistics over all '基本信息' documents.

    Streams every enterprise record, flattens it into 'key:value' strings
    via get_keys, and aggregates occurrence counts in chunks of 1000
    documents so the in-memory row list stays bounded.

    :return: DataFrame with columns ['k', 'v', 'f'] (key, value, frequency)
    """
    def _summarize(rows):
        # Collapse [key, value] rows into per-(k, v) occurrence counts.
        # (This logic was duplicated twice in the original body.)
        d = pd.DataFrame(rows, columns=['k', 'v'])
        d['f'] = 1
        return d.groupby(['k', 'v'], as_index=False).agg({'f': 'sum'})

    bm = BaseModel(tn='qcc_spider_all_4_14', location='server', dbname='prod')
    enterprises = bm.query(sql={'metaModel': '基本信息'}, no_cursor_timeout=True)

    ds = []
    data = []
    keep = []  # optional key-substring whitelist; empty means keep everything
    i = 0
    for etp in enterprises:
        i += 1
        cs = get_keys(etp,
                      '基本信息',
                      return_value=True,
                      filter_key=[
                          '_id', 'metaModel', 'source', 'url', 'headers',
                          'get', 'date', '序号', '日期', '链接', '时间'
                      ])
        for c in cs:
            parts = c.split(':')
            # With an empty whitelist keep all keys; otherwise require a match.
            if not keep or any(kp in parts[0] for kp in keep):
                data.append([parts[0], parts[1]])

        # Flush a partial summary every 1000 documents to bound memory.
        if i % 1000 == 0:
            ds.append(_summarize(data))
            data.clear()

    ds.append(_summarize(data))  # final partial chunk
    ds = pd.concat(ds)
    # Merge the per-chunk counts into global totals.
    ds = ds.groupby(['k', 'v'], as_index=False).agg({'f': 'sum'})
    # ds.to_csv(workspace + 'flss-all.csv', index=False)
    return ds
Exemple #5
0
 def __init__(self, **kwargs):
     """Set up the graph base class and bind the 'cq_api' collection as source."""
     BaseGraph.__init__(self, **kwargs)
     # Alternate sources kept for reference:
     # tn='relationsDetail.1.0', location='gcxy', dbname='data'
     self.base = BaseModel(tn='cq_api')
Exemple #6
0
def insert():
    """Bulk-load every JSON file from the QCC export directory into Mongo.

    Files that fail to insert (e.g. duplicate _id) are reported and skipped
    so one bad file does not abort the whole import.
    """
    bm = BaseModel(tn='qcc_original')
    fs = File.get_all_file('D:\graph_data\data\qcc_20200423\\')
    for f in fs:
        js = read_json(f)
        try:
            bm.insert_batch(js)
        except Exception as e:
            # BUG FIX: was a bare `except:` that silently swallowed
            # everything, including KeyboardInterrupt/SystemExit.
            print('insert failed for {}: {}'.format(f, e))
            continue
Exemple #7
0
 def remove_data(cls, table_name, **kw):
     """Delete the documents of *table_name* matching the **kw filters.

     :param table_name: target collection name
     :param kw: equality conditions selecting the documents to remove
     :return: None
     """
     try:
         model = BaseModel(table_name, cls.location, cls.dbname)
         model.remove(kw)
     except Exception:
         raise MongoIOError('Failed with delete data by MongoDB')
Exemple #8
0
def duplication():
    """Export the names of every '基本信息' document to qcc_names.csv.

    The commented-out tail of the loop is a one-off duplicate-removal pass
    (sort, mark duplicates, delete one by one); re-enable it — and re-add
    ``import time`` — if it is ever needed again.
    """
    # BUG FIX: removed unused `import time` (only referenced in the
    # commented-out de-duplication code below).
    import pandas as pd

    bm = BaseModel(tn='qcc')

    # Only '基本信息' is processed; the other meta-models stay disabled.
    metaModels = [
        '基本信息',
        # '企业发展',
        # '法律诉讼',
        # '经营风险',
        # '经营状况',
        # '公司新闻',
        # '知识产权'
    ]

    for m in metaModels:
        data = bm.aggregate(pipeline=[
            {
                '$match': {
                    'metaModel': m
                }
            },
            {
                '$project': {
                    '_id': 0,
                    'name': 1,
                    # 'recall': 1,
                    # 'date': 1
                }
            }
        ])
        data = pd.DataFrame(list(data))
        # NOTE: the same file is rewritten on each iteration; harmless
        # while only one meta-model is active.
        data.to_csv('qcc_names.csv', index=False)
        # data = data.sort_values(['name', 'recall', 'date'], ascending=False)
        # data['dup'] = data['name'].duplicated(keep='first')
        # total = len(data)
        # dup = data[data['dup']]['_id']
        # dup_count = len(dup)
        # print('\nduplicate({}): {}/{}'.format(m, dup_count, total))
        # i = 0
        # start = time.time()
        # for _ in dup:   # duplicate: 356454/1691737
        #     # bm.remove(_id=i)
        #     dc = bm.mc.delete_one({'_id': _})
        #     i += dc.deleted_count
        #     if i % 10 == 0:
        #         progress_bar(
        #             dup_count, i, 'drop duplicate data and spend {} '
        #                           'seconds'.format(int(time.time() - start)))


# duplication()
Exemple #9
0
 def field(cls, table_name, field_name):
     """Return every distinct value stored under *field_name* in *table_name*.

     :param table_name: the database's table name
     :param field_name: the table's field name
     :return: all distinct values found in the database
     """
     try:
         model = BaseModel(table_name, cls.location, cls.dbname)
         return model.distinct(field_name)
     except Exception:
         raise MongoIOError('query the field raise a error')
Exemple #10
0
 def remove_data(self, table_name, **kw):
     """Delete the documents of *table_name* matching the **kw filters.

     :param table_name: target collection name
     :param kw: equality conditions selecting the documents to delete
     :return: the driver's remove result
     """
     try:
         model = BaseModel(table_name, self.location, self.dbname)
         return model.remove(kw)
     except Exception:
         raise MongoIOError('Failed with delete data by MongoDB')
Exemple #11
0
 def update_date(cls, table_name, condition, **kw):
     """Batch-update the rows of *table_name* that match *condition*.

     :param table_name: target collection name
     :param condition: filter dict, e.g. {'date': datetime.datetime(2018, 1, 1)}
     :param kw: field assignments, e.g. close=0
     :return: None
     """
     try:
         model = BaseModel(table_name, cls.location, cls.dbname)
         model.update_batch(condition, kw)
     except Exception:
         raise MongoIOError('Failed with update by MongoDB')
Exemple #12
0
 def insert_data(cls, table_name, data):
     """Insert a DataFrame into *table_name* (no-op when *data* is empty).

     :param table_name: target collection name
     :param data: pandas DataFrame whose rows become documents
     :return: None
     """
     try:
         if len(data):
             records = data.to_dict(orient='records')
             model = BaseModel(table_name, cls.location, cls.dbname)
             model.insert_batch(records)
     except Exception:
         raise MongoIOError('Failed with insert data by MongoDB')
Exemple #13
0
def get_old_keys():
    """Dump the set of flattened field paths used by '公司新闻' documents.

    Every document is flattened into '$'-separated key paths via dictToDim2;
    the union of all paths is written, one comma-separated path per line,
    to <workspace>/公司新闻/字段.csv (gbk-encoded).
    """
    bm = BaseModel(tn='qcc_original')

    metaModel = '公司新闻'

    enterprises = bm.query(
        sql={
            'metaModel': metaModel,
            # 'name': '重庆导宇科技有限公司'
        },
        field={
            'content': 1,
            '_id': 0,
            'name': 1
        },
        no_cursor_timeout=True)
    exist_fields = set()
    for etp in enterprises:
        name = etp.pop('name')
        try:
            cs = dictToDim2(etp, metaModel, '$')
        except Exception as e:
            # BUG FIX: on failure the original fell through and reused `cs`
            # from the previous iteration (NameError on the first one).
            print(e)
            print(name)
            continue
        exist_fields.update(cs)

    data = []
    for path in exist_fields:
        # Drop empty segments produced by leading/doubled separators.
        segments = [seg for seg in path.split('$') if len(seg)]
        data.append(','.join(segments) + '\n')
    fp = workspace + '{}\\'.format(metaModel)
    File.check_file(fp)
    with open(fp + '字段.csv', 'w', encoding='gbk') as f:
        f.writelines(data)
    # exit_filed = pd.DataFrame(data=[f for f in exit_filed], columns=['key'])
    # fp = workspace + '{}\\'.format(metaModel)
    # File.check_file(fp)
    # exit_filed.to_csv(fp + '字段.csv', index=False)
Exemple #14
0
 def insert_one(self, table_name, data, add_id=False):
     """Insert a single document into *table_name*.

     :param table_name: target collection name
     :param data: a dict representing one document
     :param add_id: when True, stamp a fresh ObjectId into data['_id']
     :return: None
     """
     try:
         if add_id:
             data['_id'] = ObjectId()
         model = BaseModel(table_name, self.location, self.dbname)
         model.insert(data)
     except Exception:
         raise MongoIOError('Failed with insert data by MongoDB')
Exemple #15
0
 def update_data(self, table_name, condition, **kw):
     """Batch-update the documents of *table_name* that match *condition*.

     :param table_name: target collection name
     :param condition: filter dict, e.g. {'date': datetime.datetime(2018, 1, 1)}
     :param kw: field assignments, e.g. close=0
     :return: the driver's update result
     """
     try:
         model = BaseModel(table_name, self.location, self.dbname)
         return model.update_batch(condition, kw)
     except Exception as e:
         ExceptionInfo(e)
         raise MongoIOError('Failed with update by MongoDB')
Exemple #16
0
 def read_one(self, table_name, field=None, **kw):
     """Fetch a single document; lighter than read_data when one row suffices.

     :param table_name: target collection name
     :param field: optional projection dict
     :param kw: equality filters
     :return: a dict, or None when nothing matched or the query failed
     """
     try:
         return BaseModel(table_name, self.location,
                          self.dbname).query_one(kw, field)
     except Exception as e:
         # BUG FIX: the original returned from a `finally` block, which
         # silently swallowed even non-Exception errors (e.g.
         # KeyboardInterrupt) raised inside the try body.
         ExceptionInfo(e)
         return None
Exemple #17
0
 def insert_data(self, table_name, data, add_id=False):
     """Insert a DataFrame into *table_name* (skipped entirely when empty).

     :param table_name: target collection name
     :param data: pandas DataFrame whose rows become documents
     :param add_id: when True, attach a fresh ObjectId per row as '_id'
     :return: None
     """
     try:
         if add_id:
             data['_id'] = data.index.map(lambda x: ObjectId())
         if len(data):
             records = data.to_dict(orient='records')
             model = BaseModel(table_name, self.location, self.dbname)
             model.insert_batch(records)
     except Exception:
         raise MongoIOError('Failed with insert data by MongoDB')
Exemple #18
0
 def aggregate(self, table_name, pipeline):
     """Run an aggregation *pipeline* and collect the result as a DataFrame.

     :param table_name: target collection name
     :param pipeline: list of pipeline stages, commonly '$match' (filtering)
         and '$project' (field selection)
     :return: a DataFrame of the results; empty DataFrame on failure
     """
     try:
         cursor = BaseModel(table_name, self.location,
                            self.dbname).aggregate(pipeline)
         frame = pd.DataFrame(list(cursor))
         cursor.close()
         return frame
     except Exception as e:
         ExceptionInfo(e)
         return pd.DataFrame()
Exemple #19
0
 def read_data(cls, table_name, field=None, **kw):
     """Read the documents matching **kw into a DataFrame.

     :param table_name: target collection name
     :param field: optional projection dict
     :param kw: equality filters
     :return: a DataFrame (empty when nothing matched or the query failed)
     """
     # BUG FIX: pre-bind `data` and `cursor` — in the original, a failure
     # inside the try left both unbound and the finally block raised
     # NameError on `cursor.close()`, masking the real error.
     data = pd.DataFrame()
     cursor = None
     try:
         cursor = BaseModel(table_name, cls.location,
                            cls.dbname).query(kw, field)
         if cursor.count():
             data = pd.DataFrame(list(cursor))
     except Exception as e:
         ExceptionInfo(e)
     finally:
         if cursor is not None:
             cursor.close()
     return data
Exemple #20
0
 def min(cls, table_name, field='_id', **kw):
     """Return the smallest value of *field* among documents matching **kw.

     :param table_name: target collection name
     :param field: column to minimise over (must be a str)
     :param kw: equality filters
     :return: the minimum value, or None when nothing matched
     """
     try:
         if not isinstance(field, str):
             raise TypeError('field must be an instance of str')
         cursor = BaseModel(table_name, cls.location,
                            cls.dbname).query(sql=kw, field={field: True})
         result = None
         if cursor.count():
             frame = pd.DataFrame(list(cursor))
             result = frame.loc[:, [field]].min()[field]
         cursor.close()
         return result
     except Exception as e:
         raise e
Exemple #21
0
 def lasted_ticker(cls, code, date, table_name='ticker'):
     """Fetch the latest tick within the minute ending at *date* per stock.

     :param code: one stock code (str) or a list of codes
     :param date: a datetime; ticks are searched in [date - 1min, date]
     :param table_name: tick collection name, defaults to 'ticker'
     :return: DataFrame with one row per stock code (its newest tick),
         or an empty DataFrame on error
     """
     try:
         if isinstance(code, str):
             sc = code
         elif isinstance(code, list):
             # Multiple codes become a Mongo $in clause.
             sc = {'$in': code}
         else:
             raise TypeError("'code' must be str or list of str")
         if isinstance(date, dt.datetime):
             # `d` is the trading day at midnight; `t` the one-minute
             # window on the tick timestamp ending at `date`.
             d = dt.datetime(date.year, date.month, date.day)
             t = {'$gte': date - dt.timedelta(minutes=1), '$lte': date}
             pass
         else:
             raise TypeError("this 'date' must be datetime")
         # First $match narrows by code and day, the second by time window.
         cursor = BaseModel(table_name, cls.location,
                            cls.dbname).aggregate([{
                                '$match': {
                                    'stock_code': sc,
                                    'date': d
                                }
                            }, {
                                '$match': {
                                    'datetime': t
                                }
                            }])
         data = pd.DataFrame(list(cursor))
         if len(data):
             # Sort newest-first, then keep only the first (newest) tick
             # per stock code.
             data = data.sort_values(['stock_code', 'datetime'],
                                     ascending=False)
             data = data.drop_duplicates(['stock_code'], keep='first')
             data = data.reset_index(drop=True)
         cursor.close()
         return data
         pass
     except Exception as e:
         ExceptionInfo(e)
         return pd.DataFrame()
Exemple #22
0
    def read_data(cls,
                  code,
                  start_date,
                  end_date,
                  field=None,
                  timemerge=False,
                  **kw):
        """Read tick k-line rows for *code* between the two dates (inclusive).

        :param code: stock code filter
        :param start_date: inclusive lower bound on 'date'
        :param end_date: inclusive upper bound on 'date'
        :param field: optional projection dict
        :param timemerge: when True, post-process rows via cls.merge_time
        :param kw: extra filters merged into the query (may override defaults)
        :return: a DataFrame (empty when nothing matched or on error)
        """
        try:
            query = {
                'stock_code': code,
                'date': {'$gte': start_date, '$lte': end_date}
            }
            query.update(kw)
            cursor = BaseModel('kline_tick', cls.location,
                               cls.dbname).query(query, field)
            if not cursor.count():
                cursor.close()
                return pd.DataFrame()
            frame = pd.DataFrame(list(cursor))
            if timemerge:
                frame = cls.merge_time(frame)
            cursor.close()
            return frame
        except Exception as e:
            ExceptionInfo(e)
            return pd.DataFrame()
Exemple #23
0
def f1():
    """Load a single exported JSON file into the qcc_cq_new_test collection."""
    from Calf.data import BaseModel

    model = BaseModel(tn='qcc_cq_new_test')
    payload = read_json('D:\graph_data\基本信息\\115.238.252.22_owakrjctbm.json')
    model.insert_batch(payload)
Exemple #24
0
 def __init__(self):
     """Initialise the graph and bind the Chongqing adjudication collection."""
     BaseGraph.__init__(self)
     self.base = BaseModel(tn='重庆裁决文书(内容)')
Exemple #25
0
def check():
    """Validate generated graph-import CSVs against the Mongo source.

    Splits the export files into node and relationship lists, then runs
    two optional scrubbing passes:
      * func1 — drop Enterprise nodes already present in Mongo ('基本信息')
      * func2 — drop Related nodes whose URL repeats often with a short name
    """
    fps = File.get_all_file(import_path)
    n_fps = []
    r_fps = []
    for p in fps:
        if 'nodes' in p:
            n_fps.append(p)
        # BUG FIX: was `if 'relationships':` — a constant-true string test
        # that put every file into r_fps.
        if 'relationships' in p:
            r_fps.append(p)

    from Calf.data import BaseModel
    base = BaseModel(tn='cq_all',
                     # tn='qcc.1.1',
                     # location='gcxy',
                     # dbname='data'
                     )

    def func1():
        # Drop Enterprise nodes (outside EtpGraph) that already exist in Mongo.
        etp_fps = []
        for p in n_fps:
            if 'Enterprise' in p and 'EtpGraph' not in p:
                etp_fps.append(p)
        etp_fps = set([os.path.join(*p.split('\\')[:-1]) for p in etp_fps])
        etp = entities('Enterprise')
        etp_data = []
        for ep in etp_fps:
            ed = etp.read_csv(ep, ep)
            etp_data.append(ed)
        etp_data = pd.concat(etp_data)
        etp_data.drop_duplicates(['URL:ID(Enterprise)'], inplace=True)
        etp_data.reset_index(drop=True, inplace=True)
        total = len(etp_data)

        etp_data['exist'] = False
        for i, r in etp_data.iterrows():
            try:
                _ = base.query_one(sql={
                    'name': r['NAME'],
                    'metaModel': '基本信息'
                },
                                   field={
                                       'name': 1,
                                       '_id': 0
                                   })
                if _ is not None:
                    etp_data.loc[i, ['exist']] = True
                if i % 100 == 0:
                    progress_bar(total, i, 'check')
            except Exception as e:
                print(e)
        etp_data = etp_data[~etp_data['exist']]
        # etp_data.drop(['exist'], axis=1)
        etp.to_csv(etp_data, import_path, split_header=True)

    # func1()

    def func2():
        # Drop Related nodes whose URL appears > 3 times with a short
        # (< 4 chars) name — almost certainly mis-extracted entities.
        rel_fps = []
        for p in n_fps:
            if 'Related' in p:
                rel_fps.append(p)
        rel_fps = set([os.path.join(*p.split('\\')[:-1]) for p in rel_fps])
        rel = entities('Related')
        rel_data = []
        for ep in rel_fps:
            ed = rel.read_csv(ep, ep)
            rel_data.append(ed)
        rel_data = pd.concat(rel_data)

        drop = rel_data.loc[:, ['URL:ID(Related)', 'NAME']]
        drop['count'] = 1
        drop = drop.groupby(['URL:ID(Related)'], as_index=False).agg({
            'count': 'count',
            'NAME': 'first'
        })
        drop = drop[(drop['count'] > 3) & (drop['NAME'].str.len() < 4)]
        drop = drop['URL:ID(Related)']
        if len(drop):
            rel_data = rel_data[~rel_data['URL:ID(Related)'].isin(drop)]
        rel.to_csv(rel_data, import_path, split_header=True)

    func2()