Ejemplos de BaseModel en Python

Lenguaje de programación: Python

Namespace/Package Name: Calf.data

Clase / Tipo: BaseModel

Ejemplos en hotexamples.com: 30

Python BaseModel - 30 ejemplos encontrados. Estos son los ejemplos en Python del mundo real mejor valorados de Calf.data.BaseModel extraídos de proyectos de código abierto. Puedes valorar ejemplos para ayudarnos a mejorar la calidad de los ejemplos.

Métodos usados con frecuencia

Mostrar Ocultar

BaseModel(25)

query(12)

query_one(8)

close(5)

count(3)

insert_batch(2)

aggregate(1)

Ejemplo n.º 1

Mostrar archivo

Archivo: run.py Proyecto: CharsLeung/zlr-etl

def run():
    """Feed the raw records of every meta-model to its handler.

    For each meta-model name, up to 1000 documents whose ``metaModel`` field
    equals that name are queried from the ``qcc_original`` table and passed,
    together with a model bound to ``qcc_format``, to the handler's ``run``.
    """
    raw_model = BaseModel(tn='qcc_original')
    format_model = BaseModel(tn='qcc_format')

    # One handler per meta-model name; dict insertion order is the
    # processing order.
    handlers = {
        '基本信息': Enterprise(),
        '企业发展': Develop(),
        '法律诉讼': Judicature(),
        '经营风险': Risk(),
        '经营状况': Operating(),
        '公司新闻': News(),
        '知识产权': Right()
    }
    for meta_model, handler in handlers.items():
        records = raw_model.query(sql={'metaModel': meta_model}, limit=1000)
        print('\ndeal metaModel({})...'.format(meta_model))
        handler.run(records, format_model)

Ejemplo n.º 2

Mostrar archivo

 def __init__(self, name):
     """Bind the ``qcc`` model, remember ``name`` and prepare the timeline.

     ``self.timeline`` starts as an empty list; ``self.getTimeline()`` is
     then called (presumably it fills the list - confirm in that method)
     and the entries are sorted ascending by their first element.

     :param name: stored unchanged on ``self.name``
     """
     self.bm = BaseModel(tn='qcc', location='gcxy', dbname='data')
     self.name = name
     self.timeline = []
     self.getTimeline()
     # Ascending is list.sort's default direction.
     self.timeline.sort(key=lambda entry: entry[0])

Ejemplo n.º 3

Mostrar archivo

Archivo: enterprise_graph.py Proyecto: CharsLeung/zlr

 def __init__(self, **kwargs):
     """Initialise the ``BaseGraph`` part and attach the ``cq_all`` model.

     :param kwargs: forwarded unchanged to ``BaseGraph.__init__``
     """
     BaseGraph.__init__(self, **kwargs)
     # Only the table name is passed; location and dbname are not given.
     self.base = BaseModel(tn='cq_all')

Ejemplo n.º 4

Mostrar archivo

Archivo: industry_graph.py Proyecto: CharsLeung/zlr

 def __init__(self, **kwargs):
     """Initialise the ``BaseGraph`` part and attach the ``cq_api`` model.

     :param kwargs: forwarded unchanged to ``BaseGraph.__init__``
     """
     BaseGraph.__init__(self, **kwargs)
     # Only the table name is passed; location and dbname are not given.
     self.base = BaseModel(tn='cq_api')

Ejemplo n.º 5

Mostrar archivo

Archivo: t1.py Proyecto: CharsLeung/zlr

def f1():
    """Count how often each (key, value) pair occurs in '基本信息' records.

    Every record of ``qcc_spider_all_4_14`` whose ``metaModel`` is
    '基本信息' is flattened with ``get_keys``; each returned item is split on
    ':' and its first two segments are taken as key and value.  Pairs are
    aggregated every 1000 records to bound memory, and the partial counts
    are summed at the end.

    :return: a DataFrame with columns ``k`` (key), ``v`` (value) and ``f``
        (number of occurrences)
    """

    def _count_pairs(pairs):
        # Collapse raw [key, value] rows into one row per pair plus its count.
        frame = pd.DataFrame(pairs, columns=['k', 'v'])
        frame['f'] = 1
        return frame.groupby(['k', 'v'], as_index=False).agg({'f': 'sum'})

    bm = BaseModel(tn='qcc_spider_all_4_14', location='server', dbname='prod')
    enterprises = bm.query(sql={'metaModel': '基本信息'}, no_cursor_timeout=True)

    # Substrings a key must contain to be counted; empty means "keep all".
    keep = []
    partial_counts = []
    data = []
    try:
        for i, etp in enumerate(enterprises, start=1):
            cs = get_keys(etp,
                          '基本信息',
                          return_value=True,
                          filter_key=[
                              '_id', 'metaModel', 'source', 'url', 'headers',
                              'get', 'date', '序号', '日期', '链接', '时间'
                          ])
            for c in cs:
                parts = c.split(':')
                if not keep or any(kp in parts[0] for kp in keep):
                    data.append([parts[0], parts[1]])

            if i % 1000 == 0:
                partial_counts.append(_count_pairs(data))
                data.clear()
    finally:
        # The cursor was opened with no_cursor_timeout, so it has to be
        # released explicitly even when the loop fails.
        enterprises.close()

    # Remaining rows of the last, incomplete chunk.
    partial_counts.append(_count_pairs(data))
    ds = pd.concat(partial_counts)
    # The same pair can appear in several chunks: sum the partial counts.
    ds = ds.groupby(['k', 'v'], as_index=False).agg({'f': 'sum'})
    return ds

Ejemplo n.º 6

Mostrar archivo

Archivo: t1.py Proyecto: CharsLeung/zlr-etl

def insert():
    """Insert the content of every file of the 2020-04-23 dump directory.

    Each path returned by ``File.get_all_file`` is read with ``read_json``
    and handed to ``insert_batch`` on the ``qcc_original`` model.  A failed
    batch is reported and skipped so the remaining files are still loaded.
    """
    bm = BaseModel(tn='qcc_original')
    # Backslashes are escaped explicitly; the resulting path is unchanged but
    # no longer relies on invalid escape sequences such as '\g' or '\q'.
    fs = File.get_all_file('D:\\graph_data\\data\\qcc_20200423\\')
    for f in fs:
        js = read_json(f)
        try:
            bm.insert_batch(js)
        except Exception as e:
            # Best effort: keep going, but leave a trace of what was skipped.
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            print('insert_batch failed for {}: {}'.format(f, e))
            continue

Ejemplo n.º 7

Mostrar archivo

Archivo: run.py Proyecto: CharsLeung/zlr-etl

def duplication():
    """Dump the ``name`` of every '基本信息' record of ``qcc`` to a CSV file.

    For each enabled meta-model, an aggregation pipeline matches the
    meta-model and projects only ``name``; the result is written to
    ``qcc_names.csv`` in the current working directory.

    NOTE: a de-duplication pass used to follow the export (sort by
    name/recall/date, flag later duplicates of ``name`` and delete them one
    by one through ``bm.mc.delete_one`` with a progress bar).  It is
    disabled, so this function currently only exports names.
    """
    import time  # only used by the disabled de-duplication pass
    import pandas as pd

    bm = BaseModel(tn='qcc')

    # The other meta-models (企业发展, 法律诉讼, 经营风险, 经营状况, 公司新闻,
    # 知识产权) are currently disabled.
    metaModels = ['基本信息']

    for m in metaModels:
        pipeline = [
            {'$match': {'metaModel': m}},
            {'$project': {'_id': 0, 'name': 1}},
        ]
        names = pd.DataFrame(list(bm.aggregate(pipeline=pipeline)))
        names.to_csv('qcc_names.csv', index=False)


# duplication()

Ejemplo n.º 8

Mostrar archivo

Archivo: t1.py Proyecto: CharsLeung/zlr-etl

def get_old_keys():
    """Collect the distinct field paths of '公司新闻' records into a CSV.

    Every matching record of ``qcc_original`` is flattened with
    ``dictToDim2`` using '$' as separator.  The distinct results are split on
    '$', empty segments are dropped, and one comma-joined line per path is
    written to ``<workspace>公司新闻\\字段.csv`` (GBK encoded).
    """
    bm = BaseModel(tn='qcc_original')

    metaModel = '公司新闻'

    enterprises = bm.query(
        sql={
            'metaModel': metaModel,
        },
        field={
            'content': 1,
            '_id': 0,
            'name': 1
        },
        no_cursor_timeout=True)
    exit_filed = set()
    try:
        for etp in enterprises:
            name = etp.pop('name')
            try:
                cs = dictToDim2(etp, metaModel, '$')
            except Exception as e:
                print(e)
                print(name)
                # Skip this record: without the `continue` the loop below
                # would hit an unbound `cs` or re-add the previous record's.
                continue
            exit_filed.update(cs)
    finally:
        # Opened with no_cursor_timeout, so it must be closed explicitly.
        enterprises.close()

    data = []
    for s in exit_filed:
        segments = [seg for seg in s.split('$') if seg]
        data.append(','.join(segments) + '\n')
    fp = workspace + '{}\\'.format(metaModel)
    File.check_file(fp)
    with open(fp + '字段.csv', 'w', encoding='gbk') as f:
        f.writelines(data)

Ejemplo n.º 9

Mostrar archivo

Archivo: modeldata.py Proyecto: CharsLeung/zlr

 def aggregate(self, table_name, pipeline):
     """
     Run an aggregation pipeline on ``table_name`` and return a DataFrame.

     :param table_name: table to run the pipeline against
     :param pipeline: a list; every element is one pipeline stage, common
         stages being matching ('$match') and field selection ('$project')
     :return: a DataFrame of the resulting documents; an empty DataFrame
         when any error occurs (the error is passed to ``ExceptionInfo``)
     """
     try:
         cursor = BaseModel(table_name, self.location,
                            self.dbname).aggregate(pipeline)
         try:
             return pd.DataFrame(list(cursor))
         finally:
             # Release the cursor even when materialising it fails.
             cursor.close()
     except Exception as e:
         ExceptionInfo(e)
         return pd.DataFrame()

Ejemplo n.º 10

Mostrar archivo

 def read_data(cls, table_name, field=None, **kw):
     """
     A simple data-reading interface.

     :param table_name: table to query
     :param field: forwarded to ``BaseModel.query`` as its second argument
     :param kw: query conditions, forwarded as the query's first argument
     :return: a DataFrame of the matching documents; empty when nothing
         matches or when an error occurs (the error is passed to
         ``ExceptionInfo``)
     """
     # Bound up front so the cleanup and the return below are always valid,
     # even when building the model or running the query fails.
     data = pd.DataFrame()
     cursor = None
     try:
         cursor = BaseModel(table_name, cls.location,
                            cls.dbname).query(kw, field)
         if cursor.count():
             data = pd.DataFrame(list(cursor))
     except Exception as e:
         ExceptionInfo(e)
     finally:
         if cursor is not None:
             cursor.close()
     # Returning after (not inside) `finally` keeps KeyboardInterrupt and
     # SystemExit from being silently discarded.
     return data

Ejemplo n.º 11

Mostrar archivo

 def remove_data(cls, table_name, **kw):
     """
     Delete data.

     :param table_name: table to delete from
     :param kw: conditions passed as one dict to ``BaseModel.remove``
     :return: None
     :raises MongoIOError: when the removal fails; the original error is
         attached as ``__cause__``
     """
     try:
         BaseModel(table_name, cls.location, cls.dbname).remove(kw)
     except Exception as err:
         # Chain explicitly so the traceback shows the real cause.
         raise MongoIOError('Failed with delete data by MongoDB') from err

Ejemplo n.º 12

Mostrar archivo

Archivo: modeldata.py Proyecto: CharsLeung/zlr

 def remove_data(self, table_name, **kw):
     """
     Delete data.

     :param table_name: table to delete from
     :param kw: conditions passed as one dict to ``BaseModel.remove``
     :return: whatever ``BaseModel.remove`` returns
     :raises MongoIOError: when the removal fails; the original error is
         attached as ``__cause__``
     """
     try:
         return BaseModel(table_name, self.location,
                          self.dbname).remove(kw)
     except Exception as err:
         # Chain explicitly so the traceback shows the real cause.
         raise MongoIOError('Failed with delete data by MongoDB') from err

Ejemplo n.º 13

Mostrar archivo

 def field(cls, table_name, field_name):
     """
     Query the distinct values of a field in the database.

     :param table_name: the database's table name
     :param field_name: the table's field name
     :return: the result of ``BaseModel.distinct(field_name)``
     :raises MongoIOError: when the query fails; the original error is
         attached as ``__cause__``
     """
     try:
         return BaseModel(table_name, cls.location,
                          cls.dbname).distinct(field_name)
     except Exception as err:
         # Chain explicitly so the traceback shows the real cause.
         raise MongoIOError('query the field raise a error') from err

Ejemplo n.º 14

Mostrar archivo

 def insert_data(cls, table_name, data):
     """
     A simple data-insertion interface.

     :param table_name: table to insert into
     :param data: object supporting ``len()`` and
         ``to_dict(orient='records')`` (presumably a pandas DataFrame);
         nothing is inserted when it is empty
     :return: None
     :raises MongoIOError: when conversion or insertion fails; the original
         error is attached as ``__cause__``
     """
     try:
         # len() rather than truthiness: a DataFrame has no unambiguous
         # boolean value.
         if len(data):
             d = data.to_dict(orient='records')
             BaseModel(table_name, cls.location, cls.dbname).insert_batch(d)
     except Exception as err:
         # Chain explicitly so the traceback shows the real cause.
         raise MongoIOError('Failed with insert data by MongoDB') from err

Ejemplo n.º 15

Mostrar archivo

 def update_date(cls, table_name, condition, **kw):
     """
     Update the data of ``table_name`` that matches ``condition``.

     NOTE(review): the name looks like a typo of ``update_data``; it is kept
     because callers depend on it.

     :param table_name: table to update
     :param condition: a dict such as {'date': datetime.datetime(2018, 1, 1)}
     :param kw: keyword arguments such as ``close=0``, passed as one dict
         to ``BaseModel.update_batch``
     :return: None
     :raises MongoIOError: when the update fails; the original error is
         attached as ``__cause__``
     """
     try:
         BaseModel(table_name, cls.location,
                   cls.dbname).update_batch(condition, kw)
     except Exception as err:
         # Chain explicitly so the traceback shows the real cause.
         raise MongoIOError('Failed with update by MongoDB') from err

Ejemplo n.º 16

Mostrar archivo

 def min(cls, table_name, field='_id', **kw):
     """
     Find the minimum value of column ``field`` among the records that
     satisfy the ``kw`` conditions.

     :param table_name: table to query
     :param field: name of the column to minimise; must be a ``str``
     :param kw: query conditions
     :return: the minimum value, or None when no record matches
     :raises TypeError: when ``field`` is not a ``str``
     """
     if not isinstance(field, str):
         raise TypeError('field must be an instance of str')
     cursor = BaseModel(table_name, cls.location,
                        cls.dbname).query(sql=kw, field={field: True})
     try:
         if not cursor.count():
             return None
         d = pd.DataFrame(list(cursor))
         return d.loc[:, [field]].min()[field]
     finally:
         # Close the cursor on every path, including errors, which still
         # propagate to the caller unchanged.
         cursor.close()

Ejemplo n.º 17

Mostrar archivo

Archivo: modeldata.py Proyecto: CharsLeung/zlr

 def insert_one(self, table_name, data, add_id=False):
     """
     Insert one record.

     :param table_name: table to insert into
     :param data: a dict; when ``add_id`` is true it is modified in place
         by setting ``data['_id']`` to a new ``ObjectId``
     :param add_id: whether to generate the ``_id`` here
     :return: None
     :raises MongoIOError: when the insertion fails; the original error is
         attached as ``__cause__``
     """
     try:
         if add_id:
             data['_id'] = ObjectId()
         BaseModel(table_name, self.location, self.dbname).insert(data)
     except Exception as err:
         # Chain explicitly so the traceback shows the real cause.
         raise MongoIOError('Failed with insert data by MongoDB') from err

Ejemplo n.º 18

Mostrar archivo

 def lasted_ticker(cls, code, date, table_name='ticker'):
     """
     Return the most recent ticker row per stock in the minute before
     ``date``.

     Documents are matched on ``stock_code``, on ``date`` equal to midnight
     of ``date``'s day, and on ``datetime`` within
     ``[date - 1 minute, date]``.  For each stock code only the row with
     the greatest ``datetime`` is kept.

     :param code: one stock code (``str``) or a ``list`` of codes
     :param date: a ``datetime.datetime``
     :param table_name: table to query
     :return: a DataFrame with at most one row per stock code; an empty
         DataFrame when nothing matches or on any error.  The TypeErrors
         raised for bad arguments are also caught here and reported
         through ``ExceptionInfo``.
     """
     try:
         if isinstance(code, str):
             sc = code
         elif isinstance(code, list):
             sc = {'$in': code}
         else:
             raise TypeError("'code' must be str or list of str")
         if not isinstance(date, dt.datetime):
             raise TypeError("this 'date' must be datetime")
         d = dt.datetime(date.year, date.month, date.day)
         t = {'$gte': date - dt.timedelta(minutes=1), '$lte': date}
         cursor = BaseModel(table_name, cls.location,
                            cls.dbname).aggregate([{
                                '$match': {
                                    'stock_code': sc,
                                    'date': d
                                }
                            }, {
                                '$match': {
                                    'datetime': t
                                }
                            }])
         try:
             data = pd.DataFrame(list(cursor))
         finally:
             # Release the cursor even when materialising it fails.
             cursor.close()
         if len(data):
             # Newest first, then keep the first (newest) row per stock.
             data = data.sort_values(['stock_code', 'datetime'],
                                     ascending=False)
             data = data.drop_duplicates(['stock_code'], keep='first')
             data = data.reset_index(drop=True)
         return data
     except Exception as e:
         ExceptionInfo(e)
         return pd.DataFrame()

Ejemplo n.º 19

Mostrar archivo

Archivo: modeldata.py Proyecto: CharsLeung/zlr

 def update_data(self, table_name, condition, **kw):
     """
     Update the data of ``table_name`` that matches ``condition``.

     :param table_name: table to update
     :param condition: a dict such as {'date': datetime.datetime(2018, 1, 1)}
     :param kw: keyword arguments such as ``close=0``, passed as one dict
         to ``BaseModel.update_batch``
     :return: whatever ``BaseModel.update_batch`` returns
     :raises MongoIOError: when the update fails; the error is first passed
         to ``ExceptionInfo`` and then attached as ``__cause__``
     """
     try:
         return BaseModel(table_name, self.location,
                          self.dbname).update_batch(condition, kw)
     except Exception as e:
         ExceptionInfo(e)
         # Chain explicitly so the traceback shows the real cause.
         raise MongoIOError('Failed with update by MongoDB') from e

Ejemplo n.º 20

Mostrar archivo

Archivo: modeldata.py Proyecto: CharsLeung/zlr

 def read_one(self, table_name, field=None, **kw):
     """
     Read a single record; sometimes one record is all that is needed and
     ``read_data`` would be overkill.

     :param table_name: table to query
     :param field: forwarded to ``BaseModel.query_one`` as its second
         argument
     :param kw: query conditions, forwarded as the first argument
     :return: the result of ``query_one`` (a dict or None); None when an
         error occurs (the error is passed to ``ExceptionInfo``)
     """
     # No `return` inside `finally`: that form silently discarded
     # KeyboardInterrupt/SystemExit raised while the query was running.
     try:
         return BaseModel(table_name, self.location,
                          self.dbname).query_one(kw, field)
     except Exception as e:
         ExceptionInfo(e)
         return None

Ejemplo n.º 21

Mostrar archivo

Archivo: modeldata.py Proyecto: CharsLeung/zlr

 def insert_data(self, table_name, data, add_id=False):
     """
     A simple data-insertion interface.

     :param table_name: table to insert into
     :param data: object with ``index``, ``len()`` and
         ``to_dict(orient='records')`` (presumably a pandas DataFrame);
         when ``add_id`` is true an ``_id`` column is added to it in place
     :param add_id: whether to generate one new ``ObjectId`` per row
     :return: None
     :raises MongoIOError: when conversion or insertion fails; the original
         error is attached as ``__cause__``
     """
     try:
         if add_id:
             data['_id'] = data.index.map(lambda x: ObjectId())
         # len() rather than truthiness: a DataFrame has no unambiguous
         # boolean value.
         if len(data):
             d = data.to_dict(orient='records')
             BaseModel(table_name, self.location,
                       self.dbname).insert_batch(d)
     except Exception as err:
         # Chain explicitly so the traceback shows the real cause.
         raise MongoIOError('Failed with insert data by MongoDB') from err

Ejemplo n.º 22

Mostrar archivo

    def read_data(cls,
                  code,
                  start_date,
                  end_date,
                  field=None,
                  timemerge=False,
                  **kw):
        """
        Read the ``kline_tick`` records of one stock within a date range.

        :param code: value matched against ``stock_code``
        :param start_date: inclusive lower bound for ``date``
        :param end_date: inclusive upper bound for ``date``
        :param field: forwarded to ``BaseModel.query`` as its second argument
        :param timemerge: when true, the frame is passed through
            ``cls.merge_time`` before being returned
        :param kw: extra query conditions; on a key clash they override the
            generated ``stock_code``/``date`` entries
        :return: a DataFrame; empty when nothing matches or when any error
            occurs (the error is passed to ``ExceptionInfo``)
        """
        try:
            sql = dict(stock_code=code,
                       date={
                           '$gte': start_date,
                           '$lte': end_date
                       })
            sql = dict(sql, **kw)
            cursor = BaseModel('kline_tick', cls.location,
                               cls.dbname).query(sql, field)
            try:
                if not cursor.count():
                    return pd.DataFrame()
                data = pd.DataFrame(list(cursor))
            finally:
                # Release the cursor on every path, including failures while
                # the result is being materialised.
                cursor.close()
            return cls.merge_time(data) if timemerge else data
        except Exception as e:
            ExceptionInfo(e)
            return pd.DataFrame()

Ejemplo n.º 23

Mostrar archivo

Archivo: enterprise_graph.py Proyecto: CharsLeung/zlr

class EtpGraph(BaseGraph):

    def __init__(self, **kwargs):
        """Initialise the ``BaseGraph`` part and attach the ``cq_all`` model.

        :param kwargs: forwarded unchanged to ``BaseGraph.__init__``
        """
        BaseGraph.__init__(self, **kwargs)
        # Only the table name is passed; location and dbname are not given.
        self.base = BaseModel(tn='cq_all')

    def create_index_and_constraint(self):
        """
        Create uniqueness constraints and indexes for the entities involved.
        Per the original note, a unique key carries an index automatically,
        so no separate index has to be created for it.
        :return:
        """
        # Entity labels in use.  ShareHolder, Branch, HeadCompany, Invested
        # and Related are currently disabled.
        labels = (
            'Enterprise',
            'Person',
            'Telephone',
            'Address',
            'Email',
            'ConstructionProject',
            'Certificate',
        )
        constraints = {}
        indexes = {}
        for label in labels:
            constraints[label] = [entities(label).primarykey]
            extra_index = entities(label).index
            # Only labels that actually declare indexes are registered.
            if len(extra_index):
                indexes[label] = extra_index
        self.add_index_and_constraint(indexes, constraints)

    def create_nodes_from_enterprise_baseinfo(self, eb):
        """
        Create all nodes derived from an enterprise's basic information:
        1. the enterprise
        2. its legal representative
        3. its managers
        4. its address
        Per the original note, more entities derive from the basic
        information, but those are created later together with their
        relationships.

        :param eb: the basic-information record; ``eb['name']`` is used as
            the log context
        :return: the list of created nodes, or None when the enterprise node
            itself cannot be built
        """
        etp = Enterprise(eb)
        enterprise_node = etp.get_neo_node(primarykey=etp.primarykey)
        if enterprise_node is None:
            self.to_logs('filed initialize enterprise Neo node',
                         'ERROR', eb['name'])
            return None
        nodes = [enterprise_node]

        # Legal representative
        try:
            legal_rep = etp.get_legal_representative()
            legal_rep_node = legal_rep.get_neo_node(
                primarykey=legal_rep.primarykey)
            if legal_rep_node is not None:
                nodes.append(legal_rep_node)
            else:
                self.to_logs(
                    'filed initialize legal representative Neo node',
                    'ERROR', eb['name'])
        except Exception as e:
            self.to_logs(
                'deal legal representative raise ({})'.format(e),
                'EXCEPTION', eb['name'])

        # Managers
        try:
            managers = etp.get_manager()
            if len(managers):
                for manager in managers:
                    person = manager['person']
                    person_node = person.get_neo_node(
                        primarykey=person.primarykey)
                    if person_node is not None:
                        nodes.append(person_node)
                        continue
                    self.to_logs(
                        'filed initialize major manager Neo node',
                        'ERROR', eb['name'])
        except Exception as e:
            self.to_logs('deal major managers raise ({})'.format(e),
                         'EXCEPTION', eb['name'])

        # Address
        try:
            address = etp.get_address()
            address_node = address.get_neo_node(
                primarykey=address.primarykey)
            if address_node is not None:
                nodes.append(address_node)
            else:
                self.to_logs('filed initialize address Neo node',
                             'ERROR', eb['name'])
        except Exception as e:
            self.to_logs('deal address raise ({})'.format(e),
                         'EXCEPTION', eb['name'])

        return nodes

    def get_all_nodes_from_enterprise(self, etp):
        """
        Gather the entity objects reachable from one enterprise.

        The result starts with ``etp`` itself and is extended, in this
        order, with: the legal representative (if a person), the managers,
        the address, the telephone number, the e-mail, the construction
        projects, the construction certificates, and the person-type share
        holders, branch principals and head-company principals.

        Each group is collected in its own ``try`` block, so a failure in
        one group is logged with ``etp['NAME']`` as context and does not
        prevent the following groups from being collected.

        :param etp: the enterprise object providing the ``get_*`` accessors
        :return: a list of entity objects (not Neo nodes); it may contain
            None entries - presumably when an accessor has nothing to
            return, confirm against the accessors
        """
        nodes = [etp]
        # Legal representative: only kept when it is a person.
        try:
            lr = etp.get_legal_representative()
            if lr.isPerson():
                nodes.append(lr)
        except Exception as e:
            self.to_logs('deal legal representative raise ({})'
                         ''.format(e), 'EXCEPTION', etp['NAME'])
        # Managers: the 'person' entry of every manager record.
        try:
            ms = etp.get_manager()
            if len(ms):
                nodes += [m['person'] for m in ms]
        except Exception as e:
            self.to_logs('deal major managers raise ({})'.format(e),
                         'EXCEPTION', etp['NAME'])
        # Address, telephone and e-mail are appended without a None check.
        try:
            nodes.append(etp.get_address())
        except Exception as e:
            self.to_logs('deal address raise ({})'.format(e),
                         'EXCEPTION', etp['NAME'])
        try:
            nodes.append(etp.get_telephone_number())
            pass
        except Exception as e:
            ExceptionInfo(e)
            self.to_logs('deal telephone number raise ({})'.format(e),
                         'EXCEPTION', etp['NAME'])
        try:
            nodes.append(etp.get_email())
            pass
        except Exception as e:
            ExceptionInfo(e)
            self.to_logs('deal email raise ({})'.format(e),
                         'EXCEPTION', etp['NAME'])
        # Construction projects: pop() removes 'project' from each record.
        try:
            cps = etp.get_construction_project()
            if len(cps):
                nodes += [
                    c.pop('project') for c in cps
                ]
        except Exception as e:
            ExceptionInfo(e)
            self.to_logs('deal construction project raise ({})'.format(e),
                         'EXCEPTION', etp['NAME'])
        # Construction certificates: pop() removes 'ctf' from each record.
        try:
            ccs = etp.get_construction_certificate()
            nodes += [c.pop('ctf') for c in ccs]
        except Exception as e:
            ExceptionInfo(e)
            self.to_logs('deal construction certificate raise ({})'.format(e),
                         'EXCEPTION', etp['NAME'])
        # Share holders: popped from their records, only persons are kept.
        try:
            sh = etp.get_share_holder()
            if len(sh):
                _nds_ = []
                for s in sh:
                    _s_ = s.pop('share_holder')
                    if _s_.isPerson():
                        _nds_.append(_s_)
                nodes += _nds_
        except Exception as e:
            ExceptionInfo(e)
            self.to_logs('deal share holder raise ({})'.format(e),
                         'EXCEPTION', etp['NAME'])
        # Branches: the principal is read (not popped), only persons are kept.
        try:
            brs = etp.get_branch()
            if len(brs):
                _nds_ = []
                for b in brs:
                    _p_ = b['principal']
                    if _p_.isPerson():
                        _nds_.append(_p_)
                nodes += _nds_
        except Exception as e:
            ExceptionInfo(e)
            self.to_logs('deal branch raise ({})'.format(e),
                         'EXCEPTION', etp['NAME'])
        # Head companies: same treatment as branches.
        try:
            hcs = etp.get_head_company()
            if len(hcs):
                _nds_ = []
                for h in hcs:
                    _p_ = h['principal']
                    if _p_.isPerson():
                        _nds_.append(_p_)
                nodes += _nds_
        except Exception as e:
            ExceptionInfo(e)
            self.to_logs('deal head company raise ({})'.format(e),
                         'EXCEPTION', etp['NAME'])
        return nodes

    def get_all_nodes(self):
        """
        Collect the entities of up to 10000 '基本信息' records, grouped by
        label.

        Every record is wrapped in ``Enterprise``, expanded with
        ``get_all_nodes_from_enterprise`` and each non-None entity is
        converted with ``to_dict()``; the dicts are grouped under their
        ``'label'`` value.  A progress line is printed every 1000 records.

        :return: a dict mapping label -> list of entity dicts
        """
        cursor = self.base.query(
            sql={'metaModel': '基本信息'},
            limit=10000,
            no_cursor_timeout=True)
        # NOTE(review): hard-coded total used only in the progress message;
        # it does not match the query limit of 10000.
        etp_count = 1000
        nodes = {}
        rounds = 0
        for handled, record in enumerate(cursor, start=1):
            enterprise = Enterprise(record)
            for entity in self.get_all_nodes_from_enterprise(enterprise):
                if entity is None:
                    continue
                props = entity.to_dict()
                nodes.setdefault(props['label'], []).append(props)
            if handled % 1000 == 0:
                rounds += 1
                print(SuccessMessage(
                    '{}:success merge nodes to database '
                    'round {} and deal {}/{} enterprise'
                    ''.format(dt.datetime.now(), rounds, handled, etp_count)
                ))
        return nodes

    def create_all_nodes(self):
        """
        Create all nodes derived from the enterprises' basic information.

        Up to 1000 '基本信息' records are read; the nodes built by
        ``create_nodes_from_enterprise_baseinfo`` are accumulated and, once
        more than 1000 have piled up (and once more at the end for the
        remainder), a progress line is printed and the batch is cleared.

        NOTE: the actual merge (``self.graph_merge_nodes(nodes)``) and the
        index/constraint creation are currently disabled, so batches are
        only reported, not written.
        :return:
        """

        def _report(batch_no, handled, total, batch):
            # Print the progress line for one finished batch.
            print(SuccessMessage('{}:success merge nodes to database '
                                 'round {} and deal {}/{} enterprise,and'
                                 ' merge {} nodes.'.format(
                dt.datetime.now(), batch_no, handled, total, len(batch)
            )))

        enterprises = self.base.query(
            sql={
                'metaModel': '基本信息',
            },
            limit=1000,
            no_cursor_timeout=True)
        i, j = 0, 0
        etp_count = enterprises.count()
        nodes = []
        for e in enterprises:
            j += 1
            nds = self.create_nodes_from_enterprise_baseinfo(e)
            if nds is None:
                # The enterprise node could not be built (already logged by
                # the helper); `nodes += None` would raise TypeError.
                continue
            nodes += nds
            if len(nodes) > 1000:
                i += 1
                _report(i, j, etp_count, nodes)
                nodes.clear()
        if len(nodes):
            i += 1
            _report(i, j, etp_count, nodes)
            nodes.clear()

    def get_all_relationships_from_enterprise(self, etp):
        """
        创建从公司基本信息可以看出的关系：
        1.person-[lr]->enterprise
        2.person-[be_in_office]->enterprise
        3.enterprise-[located]->address
        4.person|enterprise-[holding]->enterprise
        5.enterprise-[have]->telephone
        6.enterprise-[have]->email
        :param :
        :return:
        """
        # 如果关系上的节点不存在，数据库同样会补充创建节点，这一点很重要
        rps = []
        etp_n = etp.get_neo_node(primarykey=etp.primarykey)
        if etp_n is None:
            self.to_logs('filed initialize enterprise Neo node',
                         'ERROR', etp['NAME'])
            return rps
        try:
            lr = etp.get_legal_representative()
            # 法定代表人有可能会是以下这些对象
            lr_n = self.match_node(
                *['Person'] + legal,
                cypher='_.URL = "{}"'.format(lr['URL'])
            )
            if lr_n is None:
                lr_n = lr.get_neo_node(primarykey=lr.primarykey)
            if lr_n is None:
                self.to_logs('filed initialize legal representative Neo node',
                             'ERROR', etp['NAME'])
            else:
                rps.append(LegalRep(lr_n, etp_n))
        except Exception as e:
            ExceptionInfo(e)
            self.to_logs('deal legal representative raise ({})'.format(e),
                         'EXCEPTION', etp['NAME'])
        try:
            ms = etp.get_manager()
            if len(ms):
                for m in ms:
                    # 主要人员 下面必然是人
                    m_n = m.pop('person')
                    m_n = m_n.get_neo_node(primarykey=m_n.primarykey)
                    if m_n is None:
                        self.to_logs('filed initialize major manager Neo node',
                                     'ERROR', etp['NAME'])
                    else:
                        rps.append(BeInOffice(m_n, etp_n, **m))
        except Exception as e:
            self.to_logs('deal major managers raise ({})'.format(e),
                         'EXCEPTION', etp['NAME'])
        try:
            dz = etp.get_address()
            dz_n = dz.get_neo_node(primarykey=dz.primarykey)
            if dz_n is None:
                self.to_logs('filed initialize address Neo node',
                             'ERROR', etp['NAME'])
            else:
                rps.append(Located(etp_n, dz_n))
        except Exception as e:
            self.to_logs('deal address raise ({})'.format(e),
                         'EXCEPTION', etp['NAME'])

        try:
            sh = etp.get_share_holder()
            if len(sh):
                for s in sh:
                    s_ = s.pop('share_holder')
                    # 股东有可能会是以下这些对象
                    sh_n = self.match_node(
                        'Person',
                        cypher='_.URL = "{}"'.format(s_['URL'])
                    )
                    if sh_n is None:
                        sh_n = self.match_node(
                            *legal,
                            cypher='_.URL = "{}" OR _.NAME = "{}"'.format(
                                s_['URL'], s_['NAME'])
                        )
                    if sh_n is None:  # 在以有的对象里面没找到这个股东
                        # 创建这个意外的股东
                        sh_n = s_.get_neo_node(primarykey=s_.primarykey)
                        if sh_n is None:
                            self.to_logs('filed initialize unexpected share '
                                         'holder Neo node', 'ERROR', etp['NAME'])
                    if sh_n is not None:
                        rps.append(Share(sh_n, etp_n, **s))
        except Exception as e:
            ExceptionInfo(e)
            self.to_logs('deal share holder raise ({})'.format(e),
                         'EXCEPTION', etp['NAME'])

        try:
            tel = etp.get_telephone_number()
            if tel is None:
                # self.to_logs('there is not valid telephone for'
                #              ' this enterprise.', 'ERROR', eb['name'])
                pass
            else:
                tel_n = tel.get_neo_node(primarykey=tel.primarykey)
                if tel_n is None:
                    self.to_logs('filed initialize telephone Neo node',
                                 'ERROR', etp['NAME'])
                else:
                    rps.append(Have(etp_n, tel_n))
            pass
        except Exception as e:
            ExceptionInfo(e)
            self.to_logs('deal telephone number raise ({})'.format(e),
                         'EXCEPTION', etp['NAME'])

        try:
            eml = etp.get_email()
            if eml is None:
                # self.to_logs('there is not valid email for'
                #              ' this enterprise.', 'ERROR', eb['name'])
                pass
            else:
                eml_n = eml.get_neo_node(primarykey=eml.primarykey)
                if eml_n is None:
                    self.to_logs('filed initialize email Neo node',
                                 'ERROR', etp['NAME'])
                else:
                    rps.append(Have(etp_n, eml_n))
            pass
        except Exception as e:
            ExceptionInfo(e)
            self.to_logs('deal email raise ({})'.format(e),
                         'EXCEPTION', etp['NAME'])
        try:
            ivs = etp.get_invest_outer()
            if len(ivs):
                for iv in ivs:
                    iv_ = iv.pop('invested')
                    # 被投资企业可能是下面这些对象
                    iv_n = self.match_node(
                        *legal,
                        cypher='_.URL = "{}" OR _.NAME = "{}"'.format(
                            iv_['URL'], iv_['NAME'])
                    )
                    if iv_n is None:
                        iv_n = iv_.get_neo_node(primarykey=iv_.primarykey)
                        if iv_n is None:
                            self.to_logs('filed initialize unexpected invested '
                                         'Neo node', 'ERROR', etp['NAME'])
                            continue
                    rps.append(Investing(etp_n, iv_n, **iv))
        except Exception as e:
            ExceptionInfo(e)
            self.to_logs('deal invest raise ({})'.format(e),
                         'EXCEPTION', etp['NAME'])
        try:
            brs = etp.get_branch()
            if len(brs):
                for b in brs:
                    b_ = b.pop('branch')
                    # 分支机构可能是下面这些对象
                    b_n = self.match_node(
                        *legal,
                        cypher='_.URL = "{}" OR _.NAME = "{}"'.format(
                            b_['URL'], b_['NAME'])
                    )
                    if b_n is None:
                        b_n = b_.get_neo_node(primarykey=b_.primarykey)
                        if b_n is None:
                            self.to_logs('filed initialize unexpected branch '
                                         'Neo node', 'ERROR', etp['NAME'])
                            continue
                        p_ = b['principal']
                        p_n = p_.get_neo_node(primarykey=p_.primarykey)
                        if p_n is not None:
                            rps.append(Principal(p_n, b_n))
                    b.pop('principal')
                    rps.append(BranchAgency(
                        etp_n, b_n, **b
                    ))
        except Exception as e:
            ExceptionInfo(e)
            self.to_logs('deal branch raise ({})'.format(e),
                         'EXCEPTION', etp['NAME'])
        try:
            hcs = etp.get_head_company()
            if len(hcs):
                for h in hcs:
                    h_ = h.pop('head')
                    # 总公司可能是下面这些对象
                    h_n = self.match_node(
                        *legal,
                        cypher='_.URL = "{}" OR _.NAME = "{}"'.format(
                            h_['URL'], h_['NAME'])
                    )
                    if h_n is None:
                        h_n = h_.get_neo_node(primarykey=h_.primarykey)
                        if h_n is None:
                            self.to_logs('filed initialize unexpected head '
                                         'company Neo node', 'ERROR', etp['NAME'])
                            continue
                        p_ = h['principal']
                        p_n = p_.get_neo_node(primarykey=p_.primarykey)
                        if p_n is not None:
                            rps.append(Principal(p_n, h_n))
                    h.pop('principal')
                    rps.append(SuperiorAgency(
                        etp_n, h_n, **h
                    ))
        except Exception as e:
            ExceptionInfo(e)
            self.to_logs('deal head company raise ({})'.format(e),
                         'EXCEPTION', etp['NAME'])
        try:
            cps = etp.get_construction_project()
            if len(cps):
                for c in cps:
                    c_ = c.pop('project')
                    c_n = c_.get_neo_node(primarykey=c_.primarykey)
                    if c_n is None:
                        self.to_logs('filed initialize unexpected construction '
                                     'project Neo node', 'ERROR', etp['NAME'])
                        continue
                    jsdw = c.pop('jsdw')
                    # 查询这个建设单位是否已经存在
                    j_n = self.match_node(
                        *legal,
                        cypher='_.URL = "{}" OR _.NAME = "{}"'.format(
                            jsdw['URL'], jsdw['NAME'])
                    )
                    if j_n is None:
                        j_n = jsdw.get_neo_node(primarykey=jsdw.primarykey)
                        if j_n is None:
                            self.to_logs('filed initialize unexpected construction '
                                         'agency Neo node', 'ERROR', etp['NAME'])
                            continue
                    # TODO(lj):需要考虑是否将承建、建设单独列为一种关系
                    rps.append(Have(
                        etp_n, c_n, **dict(角色='承建单位', **c)
                    ))
                    rps.append(Have(
                        j_n, c_n, **dict(角色='建设单位', **c)
                    ))
        except Exception as e:
            ExceptionInfo(e)
            self.to_logs('deal construction project raise ({})'.format(e),
                         'EXCEPTION', etp['NAME'])
        try:
            ccs = etp.get_construction_certificate()
            if len(ccs):
                for c in ccs:
                    c_ = c.pop('ctf')
                    c_n = c_.get_neo_node(primarykey=c_.primarykey)
                    if c_n is None:
                        self.to_logs('filed initialize unexpected construction '
                                     'certificate Neo node', 'ERROR', etp['NAME'])
                        continue
                    rps.append(Have(etp_n, c_n, **c))
        except Exception as e:
            ExceptionInfo(e)
            self.to_logs('deal construction certificate raise ({})'.format(e),
                         'EXCEPTION', etp['NAME'])
        return rps

    def create_all_relationship(self):
        """
        Merge into the graph the relationships derived from the
        enterprise basic-information records, for example:
        1.person-[lr]->enterprise
        2.person-[be_in_office]->enterprise
        3.enterprise-[located]->address
        4.person|enterprise-[holding]->enterprise

        Relationships are buffered; the buffer is merged whenever it
        holds more than 1000 items and once more after the last record.
        :return:
        """
        cursor = self.base.query(
            sql={'metaModel': '基本信息'},
            limit=1000,
            no_cursor_timeout=True)
        total = cursor.count()
        pending = []
        rounds, handled = 0, 0

        def flush():
            # Merge the buffer, make sure index/constraint exist, report
            # progress and finally empty the buffer in place.
            self.graph_merge_relationships(pending)
            if not self.index_and_constraint_statue:
                self.create_index_and_constraint()
            message = ('{}:success merge relationships to database '
                       'round {} and deal {}/{} enterprise,and'
                       ' merge {} relationships.')
            print(SuccessMessage(message.format(
                dt.datetime.now(), rounds, handled, total, len(pending)
            )))
            pending.clear()

        for record in cursor:
            handled += 1
            enterprise = Enterprise(record)
            pending += self.get_relationship_from_enterprise(enterprise)
            if len(pending) > 1000:
                rounds += 1
                flush()
        if pending:
            # Whatever is left after the final record.
            rounds += 1
            flush()

    def get_all_relationships(self):
        """
        Collect the relationships of the queried enterprises, grouped by
        relationship label.

        Progress is printed after every 1000 processed enterprises.
        :return: dict mapping a label to the list of relationship dicts
            (as produced by ``to_dict()``) carrying that label
        """
        cursor = self.base.query(
            sql={'metaModel': '基本信息'},
            limit=10000,
            no_cursor_timeout=True)
        total = cursor.count()
        grouped = {}
        rounds = 0
        for seen, record in enumerate(cursor, start=1):
            enterprise = Enterprise(record)
            found = self.get_all_relationships_from_enterprise(enterprise)
            for rel in found:
                as_dict = rel.to_dict()
                grouped.setdefault(as_dict['label'], []).append(as_dict)
            if seen % 1000 != 0:
                continue
            rounds += 1
            print(SuccessMessage(
                '{}:success merge nodes to database '
                'round {} and deal {}/{} enterprise'
                ''.format(dt.datetime.now(), rounds, seen, total)
            ))
        return grouped

    def get_all_nodes_and_relationships_from_enterprise(self, etp):
        """
        Collect the Neo nodes and relationships that can be derived from
        one enterprise's basic information:
        1.LegalRep(legal representative, enterprise)
        2.BeInOffice(manager, enterprise)
        3.Located(enterprise, address)
        4.Share(enterprise, share holder)
        5.Have(enterprise, telephone)
        6.Have(enterprise, email)
        7.Investing(enterprise, invested enterprise)
        8.BranchAgency(enterprise, branch) / Principal(person, branch)
        9.SuperiorAgency(enterprise, head company) / Principal(person, head)
        10.Have(enterprise, construction project) and
           Have(construction agency, construction project)
        11.Have(enterprise, construction certificate)

        Every section runs in its own ``try`` so that a failure in one of
        them is logged and does not prevent the others from running.

        :param etp: the enterprise object; it is indexed with ``'NAME'``
            and queried through its ``get_*`` accessors
        :return: tuple ``(nodes, rps)`` - the list of Neo nodes and the
            list of relationship objects; both are empty when the
            enterprise node itself cannot be initialised
        """
        # If a node referenced by a relationship does not exist yet, the
        # database creates it as well - this is important.
        nodes, rps = [], []
        etp_n = self.get_neo_node(etp)
        if etp_n is None:
            self.logger.debug('{} filed initialize enterprise '
                              'Neo node'.format(etp['NAME']))
            return nodes, rps
        nodes.append(etp_n)
        try:
            lr = etp.get_legal_representative()
            # The legal representative may be any of these kinds of object
            lr_n = self.match_node(
                *['Person'] + legal,
                cypher='_.URL = "{}"'.format(lr['URL'])
            )
            if lr_n is None:
                lr_n = self.get_neo_node(lr)
            if lr_n is None:
                self.logger.debug('{} filed initialize legal representative '
                                  'Neo node'.format(etp['NAME']))
            else:
                nodes.append(lr_n)
                rps.append(LegalRep(lr_n, etp_n))
        except Exception as e:
            # NOTE(review): ExceptionInfo is only called in this handler and
            # in the last one; its purpose is not visible here - confirm.
            ExceptionInfo(e)
            self.logger.error('{} deal legal representative raise '
                              '({})'.format(etp['NAME'], e),
                              exc_info=True)
        try:
            ms = etp.get_manager()
            if len(ms):
                for m in ms:
                    # Key personnel: entries here are always persons
                    m_n = m.pop('person')
                    m_n = self.get_neo_node(m_n)
                    if m_n is None:
                        self.logger.debug('{} filed initialize major manager '
                                          'Neo node'.format(etp['NAME']))
                    else:
                        nodes.append(m_n)
                        # The remaining keys of m become relationship properties
                        rps.append(BeInOffice(m_n, etp_n, **m))
        except Exception as e:
            self.logger.error('{} deal major managers raise '
                              '({})'.format(etp['NAME'], e),
                              exc_info=True)
        try:
            dz = etp.get_address()
            dz_n = self.get_neo_node(dz)
            if dz_n is None:
                self.logger.debug('{} filed initialize address '
                                  'Neo node'.format(etp['NAME']))
            else:
                nodes.append(dz_n)
                rps.append(Located(etp_n, dz_n))
        except Exception as e:
            self.logger.error('{} deal address raise '
                              '({})'.format(etp['NAME'], e),
                              exc_info=True)

        try:
            sh = etp.get_share_holder()
            if len(sh):
                for s in sh:
                    s_ = s.pop('share_holder')
                    # A share holder may be any of these kinds of object:
                    # look for a person first, then for a legal entity.
                    sh_n = self.match_node(
                        'Person',
                        cypher='_.URL = "{}"'.format(s_['URL'])
                    )
                    if sh_n is None:
                        sh_n = self.match_node(
                            *legal,
                            cypher='_.URL = "{}" OR _.NAME = "{}"'.format(
                                s_['URL'], s_['NAME'])
                        )
                    if sh_n is None:  # not found among the existing objects
                        # Create this unexpected share holder
                        sh_n = self.get_neo_node(s_)
                        if sh_n is None:
                            self.logger.debug('{} filed initialize unexpected share '
                                              'holder Neo node'.format(etp['NAME']))
                    if sh_n is not None:
                        nodes.append(sh_n)
                        rps.append(Share(etp_n, sh_n, **s))
        except Exception as e:
            self.logger.error('{} deal share holder raise '
                              '({})'.format(etp['NAME'], e),
                              exc_info=True)

        try:
            tel = etp.get_telephone_number()
            if tel is None:
                # No valid telephone for this enterprise: nothing to add.
                pass
            else:
                tel_n = self.get_neo_node(tel)
                if tel_n is None:
                    self.logger.debug('{} filed initialize telephone '
                                      'Neo node'.format(etp['NAME']))
                else:
                    nodes.append(tel_n)
                    rps.append(Have(etp_n, tel_n))
            pass
        except Exception as e:
            self.logger.error('{} deal telephone number raise '
                              '({})'.format(etp['NAME'], e),
                              exc_info=True)

        try:
            eml = etp.get_email()
            if eml is None:
                # No valid email for this enterprise: nothing to add.
                pass
            else:
                eml_n = self.get_neo_node(eml)
                if eml_n is None:
                    self.logger.debug('{} filed initialize email '
                                      'Neo node'.format(etp['NAME']))
                else:
                    nodes.append(eml_n)
                    rps.append(Have(etp_n, eml_n))
            pass
        except Exception as e:
            # Logged at error level like every other section's failure.
            self.logger.error('{} deal email raise ({})'
                              ''.format(etp['NAME'], e),
                              exc_info=True)
        try:
            ivs = etp.get_invest_outer()
            if len(ivs):
                for iv in ivs:
                    iv_ = iv.pop('invested')
                    # The invested enterprise may be any of these kinds of object
                    iv_n = self.match_node(
                        *legal,
                        cypher='_.URL = "{}" OR _.NAME = "{}"'.format(
                            iv_['URL'], iv_['NAME'])
                    )
                    if iv_n is None:
                        iv_n = self.get_neo_node(iv_)
                        if iv_n is None:
                            self.logger.debug('{} filed initialize unexpected invested '
                                              'Neo node'.format(etp['NAME']))
                            continue
                    nodes.append(iv_n)
                    rps.append(Investing(etp_n, iv_n, **iv))
        except Exception as e:
            self.logger.error('{} deal invest raise ({})'
                              ''.format(etp['NAME'], e),
                              exc_info=True)
        try:
            brs = etp.get_branch()
            if len(brs):
                for b in brs:
                    b_ = b.pop('branch')
                    # The branch may be any of these kinds of object
                    b_n = self.match_node(
                        *legal,
                        cypher='_.URL = "{}" OR _.NAME = "{}"'.format(
                            b_['URL'], b_['NAME'])
                    )
                    if b_n is None:
                        b_n = self.get_neo_node(b_)
                        if b_n is None:
                            self.logger.debug('{} filed initialize unexpected branch '
                                              'Neo node'.format(etp['NAME']))
                            continue
                        # The principal is linked only when the branch node
                        # had to be created here (no existing match).
                        p_ = b['principal']
                        p_n = self.get_neo_node(p_)
                        if p_n is not None:
                            nodes.append(p_n)
                            rps.append(Principal(p_n, b_n))
                    # 'principal' must not end up as a relationship property
                    b.pop('principal')
                    nodes.append(b_n)
                    rps.append(BranchAgency(
                        etp_n, b_n, **b
                    ))
        except Exception as e:
            self.logger.error('{} deal branch raise ({})'
                              ''.format(etp['NAME'], e),
                              exc_info=True)
        try:
            hcs = etp.get_head_company()
            if len(hcs):
                for h in hcs:
                    h_ = h.pop('head')
                    # The head company may be any of these kinds of object
                    h_n = self.match_node(
                        *legal,
                        cypher='_.URL = "{}" OR _.NAME = "{}"'.format(
                            h_['URL'], h_['NAME'])
                    )
                    if h_n is None:
                        h_n = self.get_neo_node(h_)
                        if h_n is None:
                            self.logger.debug('{} filed initialize unexpected head '
                                              'company Neo node'.format(etp['NAME']))
                            continue
                        # Same rule as for branches: principal only for a
                        # head-company node created here.
                        p_ = h['principal']
                        p_n = self.get_neo_node(p_)
                        if p_n is not None:
                            nodes.append(p_n)
                            rps.append(Principal(p_n, h_n))
                    h.pop('principal')
                    nodes.append(h_n)
                    rps.append(SuperiorAgency(
                        etp_n, h_n, **h
                    ))
        except Exception as e:
            self.logger.error('{} deal head company raise ({})'
                              ''.format(etp['NAME'], e),
                              exc_info=True)
        try:
            cps = etp.get_construction_project()
            if len(cps):
                for c in cps:
                    c_ = c.pop('project')
                    c_n = self.get_neo_node(c_)
                    if c_n is None:
                        self.logger.debug('{} filed initialize unexpected construction '
                                          'project Neo node'.format(etp['NAME']))
                        continue
                    jsdw = c.pop('jsdw')
                    # Check whether this construction agency already exists
                    j_n = self.match_node(
                        *legal,
                        cypher='_.URL = "{}" OR _.NAME = "{}"'.format(
                            jsdw['URL'], jsdw['NAME'])
                    )
                    if j_n is None:
                        j_n = self.get_neo_node(jsdw)
                        if j_n is None:
                            self.logger.debug('{} filed initialize unexpected construction '
                                              'agency Neo node'.format(etp['NAME']))
                            continue
                    # TODO(lj): consider modelling "contractor" and
                    # "construction agency" as separate relationship types
                    nodes.append(c_n)
                    rps.append(Have(
                        etp_n, c_n, **dict(角色='承建单位', **c)
                    ))
                    nodes.append(j_n)
                    rps.append(Have(
                        j_n, c_n, **dict(角色='建设单位', **c)
                    ))
        except Exception as e:
            self.logger.error('{} deal construction project raise ({})'
                              ''.format(etp['NAME'], e),
                              exc_info=True)
        try:
            ccs = etp.get_construction_certificate()
            if len(ccs):
                for c in ccs:
                    c_ = c.pop('ctf')
                    c_n = self.get_neo_node(c_)
                    if c_n is None:
                        self.logger.debug('{} filed initialize unexpected construction '
                                          'certificate Neo node'.format(etp['NAME']))
                        continue
                    nodes.append(c_n)
                    rps.append(Have(etp_n, c_n, **c))
        except Exception as e:
            ExceptionInfo(e)
            self.logger.error('{} deal construction certificate raise ({})'
                              ''.format(etp['NAME'], e),
                              exc_info=True)
        return nodes, rps

    def get_all_nodes_and_relationships(
            self, save_folder=None, **kwargs):
        """
        Walk over the basic-information records and gather every node
        and relationship, grouped by label.

        When ``save_folder`` is given, the grouped data is handed to
        ``save_graph`` after every 10000 records and once more at the
        end, and the in-memory groups are emptied after each save.

        :param save_folder: forwarded to ``save_graph``; ``None`` disables
            saving
        :param kwargs: forwarded to ``save_graph``
        :return: tuple ``(nodes, relationships)`` of label -> list dicts
        """
        cursor = self.base.query(
            sql={'metaModel': '基本信息'},
            no_cursor_timeout=True)
        rounds, seen = 0, 0
        saved_nodes, saved_rels = 0, 0
        total = cursor.count()
        nodes, relationships = {}, {}

        def persist():
            # Write the current groups out, then empty them in place.
            n_written, r_written = self.save_graph(
                save_folder, nodes,
                relationships, **kwargs)
            nodes.clear()
            relationships.clear()
            return n_written, r_written

        started = time.time()
        for ep in cursor:
            try:
                seen += 1
                enterprise = Enterprise(ep)
                found_nodes, found_rels = \
                    self.get_all_nodes_and_relationships_from_enterprise(
                        enterprise)
                for node in found_nodes:
                    if node is None:
                        continue
                    # A node is stored as its properties plus its first label.
                    row = dict(label=list(node.labels)[0], **node)
                    nodes.setdefault(row['label'], []).append(row)
                for rel in found_rels:
                    row = rel.to_dict()
                    relationships.setdefault(row['label'], []).append(row)
            except Exception as e:
                self.logger.error('{} {}'.format(e, ep['name']),
                                  exc_info=True)
                continue
            if seen % 10000 != 0:
                continue
            rounds += 1
            if save_folder is not None:
                n_written, r_written = persist()
                saved_nodes += n_written
                saved_rels += r_written
            self.logger.info(SuccessMessage(
                'success trans data to csv round {} and '
                'deal {}/{} enterprise spend {} seconds.'
                ''.format(rounds, seen, total, int(time.time() - started))
            ))
            started = time.time()
        if save_folder is not None:
            n_written, r_written = persist()
            saved_nodes += n_written
            saved_rels += r_written
            self.logger.info('Summary:')
            self.logger.info(' save graph data:')
            self.logger.info('   {} nodes'.format(saved_nodes))
            self.logger.info('   {} relationships'.format(saved_rels))
        return nodes, relationships

Ejemplo n.º 24

Mostrar archivo

Archivo: industry_graph.py Proyecto: CharsLeung/zlr

class IndGraph(BaseGraph):
    """Graph builder linking enterprises to their industry hierarchy.

    Source records are read through ``BaseModel`` from the ``cq_api``
    table; each record is expected to carry ``value.Result.Name``,
    ``value.Result.KeyNo`` and ``value.Result.IndustryV3``.
    """

    # Projection used by every query issued by this class.
    _API_FIELDS = {
        '_id': 0,
        'value.Result.Name': 1,
        'value.Result.KeyNo': 1,
        'value.Result.IndustryV3': 1
    }

    # (name key, code key, level tag) for the four industry levels,
    # ordered from the broadest level to the most specific one.
    _INDUSTRY_LEVELS = (
        ('Industry', 'IndustryCode', '一级'),
        ('SubIndustry', 'SubIndustryCode', '二级'),
        ('MiddleCategory', 'MiddleCategoryCode', '三级'),
        ('SmallCategory', 'SmallCategoryCode', '四级'),
    )

    def __init__(self, **kwargs):
        BaseGraph.__init__(self, **kwargs)
        # Alternative sources tried previously: tn='relationsDetail.1.0',
        # location='gcxy', dbname='data'.
        self.base = BaseModel(
            tn='cq_api',
        )

    def _normalize_api_record(self, record):
        """
        Turn a raw query record into the mapping expected by
        ``get_all_nodes_and_relationships_from_api``.

        The ``Name`` key is renamed to ``name`` and a ``url`` key is
        built from ``KeyNo``.

        :param record: mapping with a ``['value']['Result']`` sub-mapping
        :return: the (mutated) ``Result`` mapping, or ``None`` when
            ``KeyNo`` is ``None`` (an info line is logged in that case)
        """
        ep = record['value']['Result']
        uc = ep['KeyNo']
        ep['name'] = ep.pop('Name')
        if uc is None:
            self.logger.info('{}:mismatch url'.format(ep['name']))
            return None
        ep['url'] = '/firm_' + uc + '.html'
        return ep

    def get_all_nodes_and_relationships_from_api(self, etp):
        """
        Create the industry entities of one enterprise. The enterprise
        data is passed in from outside because industry may be treated
        as a fairly independent research domain that does not exactly
        match the industry stored with the enterprise basic information
        in the database.

        :param etp: mapping with the keys ``url``, ``name`` and
            ``IndustryV3``
        :return: tuple ``(nodes, relationships)``; both lists are empty
            when no enterprise node can be matched or created
        """
        etp_n = self.match_node(
            'Enterprise',
            cypher='_.URL = "{}" OR _.NAME = "{}"'
                   ''.format(Enterprise.parser_url(etp['url']),
                             etp['name']))
        if etp_n is None:
            # No existing node: build one from the raw url and name.
            etp_n = self.get_neo_node(
                Enterprise(URL=etp['url'], NAME=etp['name']))
        if etp_n is None:
            return [], []
        nodes, relationships = [etp_n], []
        ind = etp['IndustryV3']
        if ind is None:
            return nodes, relationships
        level_nodes = [
            self.get_neo_node(Industry(**{
                'name': ind[name_key],
                'code': ind[code_key],
                '类别': level
            }))
            for name_key, code_key, level in self._INDUSTRY_LEVELS
        ]
        # Most specific level first, skipping levels without a node, so
        # the enterprise attaches to the finest available level and each
        # level attaches to the next broader one.
        chain = [n for n in reversed(level_nodes) if n is not None]
        nodes.extend(chain)
        if chain:
            relationships.append(Belong(etp_n, chain[0]))
            relationships.extend(
                Belong(child, parent)
                for child, parent in zip(chain, chain[1:]))
        return nodes, relationships

    def merge_all_nodes_and_relationships(self):
        """
        Query up to 10000 records and build nodes and relationships for
        each of them.

        NOTE(review): the nodes and relationships returned for each
        record are discarded and nothing is merged here - this method
        looks unfinished; confirm the intended behaviour.
        """
        enterprises = self.base.query(
            field=dict(self._API_FIELDS),
            limit=10000,
            no_cursor_timeout=True)
        for ep in enterprises:
            ep = self._normalize_api_record(ep)
            if ep is None:
                continue
            self.get_all_nodes_and_relationships_from_api(ep)

    def get_all_nodes_and_relationships(
            self, save_folder=None, enterprises=None, **kwargs):
        """
        Gather the enterprise/industry nodes and relationships, grouped
        by label, optionally saving them through ``save_graph``.

        :param save_folder: forwarded to ``save_graph``; when ``None``
            nothing is saved and the grouped data is only returned
        :param enterprises: optional sized iterable of mappings with a
            ``name`` key; each one is looked up by name in the table and
            skipped when not found. When ``None`` up to 10000 records
            are queried directly.
        :param kwargs: forwarded to ``save_graph``
        :return: tuple ``(nodes, relationships)`` of label -> list dicts;
            both are emptied after each save
        """
        if enterprises is None:
            enterprises_data = self.base.query(
                field=dict(self._API_FIELDS),
                limit=10000,
                no_cursor_timeout=True)
            # Matches the query limit above; the real count is not fetched.
            etp_count = 10000
        else:
            enterprises_data = enterprises
            etp_count = len(enterprises)
        i, j = 0, 0
        nc, rc = 0, 0
        nodes, relationships = {}, {}

        def save_batch():
            # Write the current groups out, then empty them in place.
            _nc_, _rc_ = self.save_graph(
                save_folder, nodes,
                relationships, **kwargs)
            nodes.clear()
            relationships.clear()
            return _nc_, _rc_

        _st_ = time.time()
        for ep in enterprises_data:
            i += 1
            if enterprises is not None:
                # Caller supplied names only: fetch the stored record.
                ep = self.base.query_one(
                    sql={'value.Result.Name': ep['name']},
                    field=dict(self._API_FIELDS),
                )
                if ep is None:
                    continue

            ep = self._normalize_api_record(ep)
            if ep is None:
                continue
            nds, rps = self.get_all_nodes_and_relationships_from_api(ep)
            for _nds_ in nds:
                if _nds_ is None:
                    continue
                # A node is stored as its properties plus its first label.
                label = list(_nds_.labels)[0]
                _nds_ = dict(label=label, **_nds_)
                nodes.setdefault(_nds_['label'], []).append(_nds_)
            for _rps_ in rps:
                _rps_ = _rps_.to_dict()
                relationships.setdefault(_rps_['label'], []).append(_rps_)
            if i % 10000 == 0:
                j += 1
                if save_folder is not None:
                    _nc_, _rc_ = save_batch()
                    nc += _nc_
                    rc += _rc_
                self.logger.info(SuccessMessage(
                    'success trans data to csv round {} and '
                    'deal {}/{} enterprise spend {} seconds.'
                    ''.format(j, i, etp_count, int(time.time() - _st_))
                ))
                _st_ = time.time()
        if save_folder is not None:
            _nc_, _rc_ = save_batch()
            nc += _nc_
            rc += _rc_
            self.logger.info('Summary:')
            self.logger.info(' save graph data:')
            self.logger.info('   {} nodes'.format(nc))
            self.logger.info('   {} relationships'.format(rc))
        return nodes, relationships

Ejemplo n.º 25

Mostrar archivo

class JusRulingTextGraph(BaseGraph):
    """Graph builder attaching ruling texts to existing ruling nodes."""

    def __init__(self):
        BaseGraph.__init__(self)
        self.base = BaseModel(tn='重庆裁决文书(内容)')

    def create_index_and_constraint(self):
        """
        Create the uniqueness constraints and indexes for the entities
        involved. A unique key is indexed automatically, so it needs no
        separate index.
        :return:
        """
        # TODO(leung): keep the labels accurate at all times
        unique_keys = {
            'RulingText': ['CASE_NUM'],
        }
        # No plain indexes for now, e.g. 'Enterprise': [('NAME',)]
        plain_indexes = {}
        self.add_index_and_constraint(plain_indexes, unique_keys)

    def create_all_relationship(self):
        """
        1.ruling -[have]->ruling_text

        Records whose case number matches no existing ruling node are
        skipped. Relationships are buffered; the buffer is merged
        whenever it holds more than 1000 items and once more at the end.
        :return:
        """
        cursor = self.base.query(
            sql={'metaModel': '裁判文书'},
            no_cursor_timeout=True)
        rounds, seen = 0, 0
        total = cursor.count()
        pending = []
        ruling = Ruling()

        def flush():
            # Merge the buffer, make sure index/constraint exist, report
            # progress and finally empty the buffer in place.
            self.graph_merge_relationships(pending)
            if not self.index_and_constraint_statue:
                self.create_index_and_constraint()
            message = ('{}:success merge relationships to database '
                       'round {} and deal {}/{} enterprise,and'
                       ' merge {} relationships.')
            print(SuccessMessage(message.format(
                dt.datetime.now(), rounds, seen, total, len(pending)
            )))
            pending.clear()

        for doc in cursor:
            seen += 1
            text = RulingText.create_from_original_text(
                doc['content'], **{'链接': doc['url']}
            )
            # Look the ruling up by case number only.
            condition = '_.CASE_NUM="{}"'.format(
                text.BaseAttributes['CASE_NUM'])
            ruling_node = self.NodeMatcher.match(
                ruling.label).where(condition).first()
            if ruling_node is None:
                continue
            text_node = text.get_neo_node(primarykey=text.primarykey)
            pending.append(Have(ruling_node, text_node).get_relationship())

            if len(pending) > 1000:
                rounds += 1
                flush()
        if pending:
            rounds += 1
            flush()


# rtg = JusRulingTextGraph()
# rtg.create_all_relationship()

Ejemplo n.º 26

Mostrar archivo

def check():
    """Post-process the exported graph CSV files found under ``import_path``.

    The file list returned by ``File.get_all_file(import_path)`` is split
    into node files (path contains ``'nodes'``) and relationship files (path
    contains ``'relationships'``).  Two nested helpers are defined; only
    ``func2`` is actually invoked, ``func1`` is kept for manual use.

    NOTE(review): ``r_fps`` is collected but not consumed by either helper.
    """
    fps = File.get_all_file(import_path)
    n_fps = []
    r_fps = []
    for p in fps:
        if 'nodes' in p:
            n_fps.append(p)
        # Fixed: the original tested the bare literal ``'relationships'``,
        # which is always truthy, so every path was appended to ``r_fps``.
        if 'relationships' in p:
            r_fps.append(p)

    from Calf.data import BaseModel
    base = BaseModel(tn='cq_all',
                     # tn='qcc.1.1',
                     # location='gcxy',
                     # dbname='data'
                     )

    def func1():
        """Drop Enterprise rows that already have a basic-info record.

        Handles the Enterprise nodes exported by modules other than the
        basic-info one: every row whose ``NAME`` is found by ``base`` with
        ``metaModel == '基本信息'`` is removed, and the remainder is written
        back with ``etp.to_csv``.
        """
        etp_fps = []
        for p in n_fps:
            if 'Enterprise' in p and 'EtpGraph' not in p:
                etp_fps.append(p)
        # Keep only the distinct parent folders of the matching files.
        etp_fps = {os.path.join(*p.split('\\')[:-1]) for p in etp_fps}
        etp = entities('Enterprise')
        etp_data = []
        for ep in etp_fps:
            ed = etp.read_csv(ep, ep)
            etp_data.append(ed)
        etp_data = pd.concat(etp_data)
        etp_data.drop_duplicates(['URL:ID(Enterprise)'], inplace=True)
        etp_data.reset_index(drop=True, inplace=True)
        total = len(etp_data)

        etp_data['exist'] = False
        for i, r in etp_data.iterrows():
            # Best effort: a failing lookup is printed and the row is kept.
            try:
                _ = base.query_one(sql={
                    'name': r['NAME'],
                    'metaModel': '基本信息'
                },
                                   field={
                                       'name': 1,
                                       '_id': 0
                                   })
                if _ is not None:
                    etp_data.loc[i, ['exist']] = True
                if i % 100 == 0:
                    progress_bar(total, i, 'check')
            except Exception as e:
                print(e)
        etp_data = etp_data[~etp_data['exist']]
        # NOTE: the helper column 'exist' is still present when writing.
        etp.to_csv(etp_data, import_path, split_header=True)

    # func1()

    def func2():
        """Filter suspicious Related rows and rewrite the Related CSV.

        An id is dropped when it occurs more than 3 times and the first
        ``NAME`` seen for it is shorter than 4 characters.
        """
        rel_fps = []
        for p in n_fps:
            if 'Related' in p:
                rel_fps.append(p)
        # Keep only the distinct parent folders of the matching files.
        rel_fps = {os.path.join(*p.split('\\')[:-1]) for p in rel_fps}
        rel = entities('Related')
        rel_data = []
        for ep in rel_fps:
            ed = rel.read_csv(ep, ep)
            rel_data.append(ed)
        rel_data = pd.concat(rel_data)

        # Per id: number of rows and the first name encountered.
        drop = rel_data.loc[:, ['URL:ID(Related)', 'NAME']]
        drop['count'] = 1
        drop = drop.groupby(['URL:ID(Related)'], as_index=False).agg({
            'count':
            'count',
            'NAME':
            'first'
        })
        drop = drop[(drop['count'] > 3) & (drop['NAME'].str.len() < 4)]
        drop = drop['URL:ID(Related)']
        if len(drop):
            rel_data = rel_data[~rel_data['URL:ID(Related)'].isin(drop)]
        rel.to_csv(rel_data, import_path, split_header=True)

    func2()

Ejemplo n.º 27

Mostrar archivo

 def __init__(self):
     """Run the ``BaseGraph`` initialiser and attach the source model.

     ``self.base`` is a ``BaseModel`` opened with the fixed ``tn`` value
     used for the ruling-text content.
     """
     BaseGraph.__init__(self)
     table_name = '重庆裁决文书(内容)'
     self.base = BaseModel(tn=table_name)

Ejemplo n.º 28

Mostrar archivo

Archivo: develop_graph.py Proyecto: CharsLeung/zlr

class DvpGraph(BaseGraph):
    """Graph builder for the '企业发展' (enterprise development) records.

    Documents are read through ``BaseModel(tn='cq_all')`` and the
    '竞品信息' (competitor information) section of each one is turned into
    nodes and relationships.
    """

    def __init__(self, **kwargs):
        """Initialise the base graph and open the source data model.

        :param kwargs: forwarded unchanged to ``BaseGraph.__init__``.
        """
        BaseGraph.__init__(self, **kwargs)
        self.base = BaseModel(
            tn='cq_all',
            # tn='qcc.1.1',
            # location='gcxy',
            # dbname='data'
        )

    def create_index_and_constraint(self):
        """
        Create uniqueness constraints and indexes for the entities involved.
        A unique key is indexed automatically, so it needs no separate index.
        Both mappings are currently empty, so nothing is actually requested.
        :return:
        """
        # TODO(leung): always make sure the labels are accurate
        constraint = {
            # 'News': [News.primarykey],
            # 'Possession': [Possession.primarykey],
            # 'Involveder': ['HASH_ID'],
        }
        index = {
            # 'Enterprise': [('NAME',)]
        }
        self.add_index_and_constraint(index, constraint)

    def create_all_relationship(self):
        """
        Build and merge the competitor relationships into the graph.

        1.enterprise -[compete]->enterprise

        Relationships are accumulated and merged in batches once more than
        1000 are pending; the remainder is merged after the loop.
        :return:
        """
        ops = self.base.query(sql={'metaModel': '企业发展'},
                              field={
                                  'name': 1,
                                  'url': 1,
                                  'content.竞品信息': 1
                              },
                              limit=1000,
                              no_cursor_timeout=True)
        # i counts merge rounds, k counts processed documents.
        i, k = 0, 0
        eg = EtpGraph()
        etp_count = ops.count()
        relationships = []
        for o in ops:
            k += 1
            # TODO(leung): note that outside the basic-info module the url
            # cannot identify the company, hence the match by name.
            etp_n = self.match_node(*legal,
                                    cypher='_.NAME = "{}"'.format(o['name']))
            if etp_n is None:
                # The company is not in the database yet, so create it.
                _ = self.base.query_one(sql={
                    'metaModel': '基本信息',
                    'name': o['name']
                })
                if _ is not None:
                    etp = Enterprise(_)
                    etp_n = self.get_neo_node(etp)
                    # Creating judicial relationships also creates missing
                    # enterprises, but not their basic relationships, so
                    # those have to be added here.
                    relationships += eg.create_relationship_from_enterprise_baseinfo(
                        _)
                else:
                    # No information on this company: create one with
                    # incomplete information.
                    etp = Related()
                    etp['NAME'] = o['name']
                    etp['URL'] = o['url']
                    etp_n = self.get_neo_node(etp)

            if '竞品信息' in o['content'].keys():
                data = self.get_format_dict(o['content']['竞品信息'])
                for d in data:
                    etp_2 = d.pop('关联企业')
                    if etp_2['名称'] is not None and len(etp_2['名称']) > 1:
                        # Fixed: call parser_url on the Enterprise class (as
                        # the sibling method does); the local ``etp`` may be
                        # a Related instance left over from an earlier
                        # iteration.
                        etp_2['链接'] = Enterprise.parser_url(etp_2['链接'])
                        etp_n_2 = self.match_node(*legal,
                                                  cypher='_.URL = "{}"'.format(
                                                      etp_2['链接']))
                        # Fixed: the original additionally evaluated
                        # ``etp_2['名称'] > 1`` (str vs int), which raises
                        # TypeError on Python 3.  The length was already
                        # checked by the enclosing condition.
                        if etp_n_2 is None:
                            _ = {
                                'URL': etp_2['链接'],
                                'NAME': etp_2['名称'],
                                '简介': d.pop('产品介绍'),
                                '成立日期': d.pop('成立日期'),
                                '融资信息': d.pop('融资信息'),
                                '所属地': d.pop('所属地'),
                            }
                            etp_n_2 = Related(**_)
                            etp_n_2 = self.get_neo_node(etp_n_2)
                        relationships.append(
                            Compete(etp_n, etp_n_2, **d).get_relationship())

            if len(relationships) > 1000:
                i += 1
                self.graph_merge_relationships(relationships)
                if not self.index_and_constraint_statue:
                    self.create_index_and_constraint()
                print(
                    SuccessMessage(
                        '{}:success merge relationships to database '
                        'round {} and deal {}/{} enterprise,and'
                        ' merge {} relationships.'.format(
                            dt.datetime.now(), i, k, etp_count,
                            len(relationships))))
                relationships.clear()
        # Merge whatever is left after the last full batch.
        if len(relationships):
            i += 1
            self.graph_merge_relationships(relationships)
            if not self.index_and_constraint_statue:
                self.create_index_and_constraint()
            print(
                SuccessMessage('{}:success merge relationships to database '
                               'round {} and deal {}/{} enterprise,and'
                               ' merge {} relationships.'.format(
                                   dt.datetime.now(), i, k, etp_count,
                                   len(relationships))))
            relationships.clear()

    def get_all_nodes_and_relationships_from_enterprise(self, etp):
        """Extract nodes and relationships from one enterprise document.

        :param etp: mapping that is read for 'url', 'name' and 'content'.
        :return: ``(nodes, relationships)``; two empty lists when no node
            can be obtained for the enterprise itself.
        """
        etp_n = Enterprise(URL=etp['url'], NAME=etp['name'])
        etp_n = self.get_neo_node(etp_n)
        if etp_n is None:
            return [], []
        nodes, relationships = [], []
        nodes.append(etp_n)
        if '竞品信息' in etp['content'].keys():
            data = self.get_format_dict(etp['content']['竞品信息'])
            data = Product.create_from_dict(data)
            for d in data:
                p = d.pop('product')
                p_n = self.get_neo_node(p)
                if p_n is None:
                    continue
                nodes.append(p_n)
                relationships.append(Compete(etp_n, p_n))
                etp_2 = d.pop('关联企业')
                etp_2['链接'] = Enterprise.parser_url(etp_2['链接'])
                if etp_2['名称'] is not None and len(etp_2['名称']) > 1:
                    etp_n_2 = self.match_node(*legal,
                                              cypher='_.URL = "{}"'.format(
                                                  etp_2['链接']))
                    if etp_n_2 is None and len(etp_2['名称']) > 1:
                        etp_n_2 = Enterprise(**etp_2)
                        if not etp_n_2.isEnterprise():
                            # NOTE(review): ``_`` is built but never used;
                            # only the pops (which require these keys to be
                            # present in ``d``) have an effect.
                            _ = {
                                'URL': etp_2['链接'],
                                'NAME': etp_2['名称'],
                                '简介': d.pop('产品介绍'),
                                '成立日期': d.pop('成立日期'),
                                '融资信息': d.pop('融资信息'),
                                '所属地': d.pop('所属地'),
                            }
                            etp_n_2 = Related(**{
                                '链接': etp_2['链接'],
                                '名称': etp_2['名称']
                            })
                        etp_n_2 = self.get_neo_node(etp_n_2)
                    # May append None; callers skip None nodes.
                    nodes.append(etp_n_2)
                    relationships.append(Produce(etp_n_2, p_n))
        return nodes, relationships

    def get_all_nodes_and_relationships(self, save_folder=None, **kwargs):
        """Collect nodes and relationships for every '企业发展' document.

        Results are grouped by label.  When ``save_folder`` is given, the
        groups are handed to ``self.save_graph`` every 10000 documents and
        once more at the end, and are cleared after each save.

        :param save_folder: target passed to ``self.save_graph``; ``None``
            keeps everything in memory.
        :param kwargs: forwarded to ``self.save_graph``.
        :return: ``(nodes, relationships)`` dicts keyed by label (empty
            after a final save).
        """
        enterprises = self.base.query(
            sql={'metaModel': '企业发展'},
            field={
                'name': 1,
                'url': 1,
                'content.竞品信息': 1
            },
            # limit=100000,
            # skip=2000,
            no_cursor_timeout=True)
        # i counts processed documents, j counts reporting rounds.
        i, j = 0, 0
        nc, rc = 0, 0
        etp_count = enterprises.count()
        nodes, relationships = {}, {}
        # Raw string: avoids the invalid-escape warning for ``\w``.
        unique_code_pattern = re.compile(r'(?<=unique=)\w{32}')

        def getUniqueCode(url):
            """Return the 32-character code following 'unique=' or None."""
            _uc_ = re.search(unique_code_pattern, url)
            if _uc_ is not None:
                return _uc_.group(0)
            else:
                return None

        _st_ = time.time()
        for ep in enterprises:
            i += 1
            uc = getUniqueCode(ep['url'])
            if uc is None:
                self.logger.info('{}:mismatch url'.format(ep['name']))
                continue
            ep['url'] = '/firm_' + uc + '.html'
            nds, rps = self.get_all_nodes_and_relationships_from_enterprise(ep)
            for _nds_ in nds:
                if _nds_ is None:
                    continue
                label = list(_nds_.labels)[0]
                _nds_ = dict(label=label, **_nds_)
                nodes.setdefault(_nds_['label'], []).append(_nds_)
            for _rps_ in rps:
                _rps_ = _rps_.to_dict()
                relationships.setdefault(_rps_['label'], []).append(_rps_)
            if i % 10000 == 0:
                j += 1
                if save_folder is not None:
                    _nc_, _rc_ = self.save_graph(save_folder, nodes,
                                                 relationships, **kwargs)
                    nc += _nc_
                    rc += _rc_
                    nodes.clear()
                    relationships.clear()
                self.logger.info(
                    SuccessMessage('success trans data to csv round {} and '
                                   'deal {}/{} enterprise spend {} seconds.'
                                   ''.format(j, i, etp_count,
                                             int(time.time() - _st_))))
                _st_ = time.time()
        if save_folder is not None:
            _nc_, _rc_ = self.save_graph(save_folder, nodes, relationships,
                                         **kwargs)
            nc += _nc_
            rc += _rc_
            nodes.clear()
            relationships.clear()
            self.logger.info('Summary:')
            self.logger.info(' save graph data:')
            self.logger.info('   {} nodes'.format(nc))
            self.logger.info('   {} relationships'.format(rc))
        return nodes, relationships

Ejemplo n.º 29

Mostrar archivo

Archivo: news_graph.py Proyecto: CharsLeung/zlr

class NewsGraph(BaseGraph):
    """Graph builder for the '公司新闻' (company news) records.

    Documents are read through ``BaseModel(tn='cq_all')`` and the
    '新闻舆情' section of each one is turned into News nodes linked to the
    enterprise.
    """

    def __init__(self, **kwargs):
        """Initialise the base graph and open the source data model.

        :param kwargs: forwarded unchanged to ``BaseGraph.__init__``.
        """
        BaseGraph.__init__(self, **kwargs)
        self.base = BaseModel(
            tn='cq_all',
            # tn='qcc.1.1',
            # location='gcxy',
            # dbname='data'
        )

    def create_index_and_constraint(self):
        """
        Create uniqueness constraints and indexes for the entities involved.
        A unique key is indexed automatically, so it needs no separate index.
        :return:
        """
        # TODO(leung): always make sure the labels are accurate
        used_entity = [
            'News',
        ]
        constraint = {}
        index = {}
        for l in used_entity:
            constraint[l] = [entities(l).primarykey]
            idx = entities(l).index
            if len(idx):
                index[l] = idx
        self.add_index_and_constraint(index, constraint)

    def create_all_relationship(self):
        """
        Build and merge the enterprise-to-news relationships into the graph.

        1.enterprise -[have or x]->x

        Relationships are accumulated and merged in batches once more than
        1000 are pending; the remainder is merged after the loop.
        :return:
        """
        # NOTE(review): skip=2020 looks like a leftover resume offset;
        # confirm before running against a fresh database.
        ops = self.base.query(
            sql={'metaModel': '公司新闻'},
            # limit=10,
            skip=2020,
            no_cursor_timeout=True)
        # i counts merge rounds, k counts processed documents.
        i, k = 0, 0
        eg = EtpGraph()
        etp_count = ops.count()
        relationships = []
        s_t = time.time()
        for o in ops:
            k += 1
            # TODO(leung): note that outside the basic-info module the url
            # cannot identify the company, hence the match by name.
            etp_n = self.match_node(
                *legal,
                cypher='_.NAME = "{}"'.format(o['name'])
            )
            if etp_n is None:
                # The company is not in the database yet, so create it.
                _ = self.base.query_one(
                    sql={'metaModel': '基本信息', 'name': o['name']}
                )
                if _ is not None:
                    etp = Enterprise(_)
                    etp_n = self.get_neo_node(etp)
                    # Creating judicial relationships also creates missing
                    # enterprises, but not their basic relationships, so
                    # those have to be added here.
                    relationships += eg.create_relationship_from_enterprise_baseinfo(_)
                else:
                    # No information on this company: create one with
                    # incomplete information.
                    etp = Related(**{'名称': o['name'], '链接': o['url']})
                    etp_n = self.get_neo_node(etp)
                    if etp_n is None:
                        continue

            if '新闻舆情' in o['content'].keys():
                data = self.get_format_dict(o['content']['新闻舆情'])
                ns = News.create_from_dict(data)
                for n in ns:
                    n_ = n.pop('news')
                    n_n = self.get_neo_node(n_)
                    if n_n is not None:
                        relationships.append(
                            Have(etp_n, n_n, **n).get_relationship()
                        )
            if len(relationships) > 1000:
                i += 1
                # Seconds spent since the previous merge round.
                sp = int(time.time() - s_t)
                s_t = time.time()
                self.graph_merge_relationships(relationships)
                if not self.index_and_constraint_statue:
                    self.create_index_and_constraint()
                print(SuccessMessage('{}:success merge relationships to database '
                                     'round {} and deal {}/{} enterprise and spend {} '
                                     'seconds,and merge {} relationships.'.format(
                    dt.datetime.now(), i, k, etp_count, sp, len(relationships)
                )))
                relationships.clear()
        # Merge whatever is left after the last full batch.
        if len(relationships):
            i += 1
            self.graph_merge_relationships(relationships)
            if not self.index_and_constraint_statue:
                self.create_index_and_constraint()
            print(SuccessMessage('{}:success merge relationships to database '
                                 'round {} and deal {}/{} enterprise,and'
                                 ' merge {} relationships.'.format(
                dt.datetime.now(), i, k, etp_count, len(relationships)
            )))
            relationships.clear()

    def get_all_nodes_and_relationships_from_enterprise(self, etp):
        """Extract nodes and relationships from one enterprise document.

        :param etp: mapping that is read for 'url', 'name' and 'content'.
        :return: ``(nodes, relationships)``; two empty lists when no node
            can be obtained for the enterprise itself.
        """
        etp_n = Enterprise(URL=etp['url'], NAME=etp['name'])
        etp_n = self.get_neo_node(etp_n)
        if etp_n is None:
            return [], []
        nodes, relationships = [], []
        nodes.append(etp_n)

        if '新闻舆情' in etp['content'].keys():
            data = self.get_format_dict(etp['content']['新闻舆情'])
            ns = News.create_from_dict(data)
            for n in ns:
                n_ = n.pop('news')
                n_n = self.get_neo_node(n_)
                if n_n is not None:
                    nodes.append(n_n)
                    relationships.append(
                        Have(etp_n, n_n, **n)
                    )
        return nodes, relationships

    def get_all_nodes_and_relationships(
            self, save_folder=None, **kwargs):
        """Collect nodes and relationships for every '公司新闻' document.

        Results are grouped by label.  When ``save_folder`` is given, the
        groups are handed to ``self.save_graph`` every 10000 documents and
        once more at the end, and are cleared after each save.

        :param save_folder: target passed to ``self.save_graph``; ``None``
            keeps everything in memory.
        :param kwargs: forwarded to ``self.save_graph``.
        :return: ``(nodes, relationships)`` dicts keyed by label (empty
            after a final save).
        """
        enterprises = self.base.query(
            sql={
                'metaModel': '公司新闻',
                # 'name': '重庆轩烽建材有限公司'
            },
            # limit=10000,
            # skip=100000,
            no_cursor_timeout=True)
        # i counts processed documents, j counts reporting rounds.
        i, j = 0, 0
        nc, rc = 0, 0
        etp_count = enterprises.count()
        nodes, relationships = {}, {}
        # Raw string: avoids the invalid-escape warning for ``\w``.
        unique_code_pattern = re.compile(r'(?<=unique=)\w{32}')

        def getUniqueCode(url):
            """Return the 32-character code following 'unique=' or None."""
            _uc_ = re.search(unique_code_pattern, url)
            if _uc_ is not None:
                return _uc_.group(0)
            else:
                return None

        _st_ = time.time()
        for ep in enterprises:
            i += 1
            uc = getUniqueCode(ep['url'])
            if uc is None:
                self.logger.info('{}:mismatch url'.format(ep['name']))
                continue
            ep['url'] = '/firm_' + uc + '.html'
            nds, rps = self.get_all_nodes_and_relationships_from_enterprise(ep)
            for _nds_ in nds:
                if _nds_ is None:
                    continue
                label = list(_nds_.labels)[0]
                _nds_ = dict(label=label, **_nds_)
                nodes.setdefault(_nds_['label'], []).append(_nds_)
            for _rps_ in rps:
                _rps_ = _rps_.to_dict()
                relationships.setdefault(_rps_['label'], []).append(_rps_)
            if i % 10000 == 0:
                j += 1
                if save_folder is not None:
                    _nc_, _rc_ = self.save_graph(
                        save_folder, nodes,
                        relationships, **kwargs)
                    nc += _nc_
                    rc += _rc_
                    nodes.clear()
                    relationships.clear()
                # Fixed: elapsed time is now - start; the original had the
                # operands reversed and logged a negative duration.
                self.logger.info(SuccessMessage(
                    'success trans data to csv round {} and '
                    'deal {}/{} enterprise spend {} seconds.'
                    ''.format(j, i, etp_count, int(time.time() - _st_))
                ))
                _st_ = time.time()
        if save_folder is not None:
            _nc_, _rc_ = self.save_graph(
                save_folder, nodes,
                relationships, **kwargs)
            nc += _nc_
            rc += _rc_
            nodes.clear()
            relationships.clear()
            self.logger.info('Summary:')
            self.logger.info(' save graph data:')
            self.logger.info('   {} nodes'.format(nc))
            self.logger.info('   {} relationships'.format(rc))
        return nodes, relationships

Ejemplo n.º 30

Mostrar archivo

Archivo: operating_graph.py Proyecto: CharsLeung/zlr

class OptGraph(BaseGraph):
    def __init__(self, **kwargs):
        """Run the ``BaseGraph`` initialiser and attach the source model.

        :param kwargs: passed straight through to ``BaseGraph.__init__``.
        """
        BaseGraph.__init__(self, **kwargs)
        # Other sources used previously: tn='qcc.1.1', location='gcxy',
        # dbname='data'.
        source_table = 'cq_all'
        self.base = BaseModel(tn=source_table)

    def create_index_and_constraint(self):
        """
        Register a uniqueness constraint (on the primary key) and, where the
        entity declares any, the indexes for every entity label used here.
        A unique key is indexed automatically, so it needs no extra index.
        :return:
        """
        # TODO(leung): always make sure the labels are accurate
        # Entity labels in use; 'Client', 'Supplier' and 'Possession' are
        # deliberately left out.
        labels = (
            'License',
            'Bidding',
            'Check',
            'RandomCheck',
            'TaxCredit',
            'IAE',
            'Position',
            'Plot',
        )
        unique_keys, extra_indexes = {}, {}
        for label in labels:
            unique_keys[label] = [entities(label).primarykey]
            declared = entities(label).index
            if len(declared) > 0:
                extra_indexes[label] = declared
        self.add_index_and_constraint(extra_indexes, unique_keys)

    def get_all_nodes_from_enterprise(self, etp):
        """Gather the entity objects described by one enterprise document.

        The list starts with an ``Enterprise`` built from ``etp['url']`` and
        ``etp['name']``; each known section of ``etp['content']`` then
        contributes its entities in a fixed order.  The '产权交易' section
        is recognised but not processed.

        :param etp: mapping that is read for 'url', 'name' and 'content'.
        :return: list of collected entities.
        """
        found = [Enterprise(URL=etp['url'], NAME=etp['name'])]
        content = etp['content']
        present = content.keys()

        if '行政许可' in present:
            licences = content['行政许可']
            # Same processing for both publishing authorities, in this order.
            for authority in ('工商局', '信用中国'):
                if authority in licences.keys():
                    rows = self.get_format_dict(licences[authority])
                    for item in License.create_from_dict(rows, authority):
                        found.append(item.pop('license'))

        if '招投标信息' in present:
            # Published bidding information is usually the result; co-bidders
            # generally cannot be found unless they won jointly.
            rows = self.get_format_dict(content['招投标信息'])
            for item in Bidding.create_from_dict(rows):
                found.append(item.pop('bidding'))

        if '抽查检查' in present:
            rows = self.get_format_dict(content['抽查检查'])
            for item in Check.create_from_dict(rows):
                found.append(item.pop('check'))

        if '双随机抽查' in present:
            rows = self.get_format_dict(content['双随机抽查'])
            for item in RandomCheck.create_from_dict(rows):
                # TODO(leung): random checks have no result
                found.append(item.pop('check'))

        if '税务信用' in present:
            rows = self.get_format_dict(content['税务信用'])
            for item in TaxCredit.create_from_dict(rows):
                found.append(item.pop('TaxCredit'))

        if '进出口信用' in present:
            rows = self.get_format_dict(content['进出口信用'])
            for item in IAE.create_from_dict(rows):
                found.append(item.pop('iae'))

        if '招聘' in present:
            rows = self.get_format_dict(content['招聘'])
            for item in Position.create_from_dict(rows):
                found.append(item.pop('position'))

        if '客户' in present:
            rows = self.get_format_dict(content['客户'])
            for item in Client.create_from_dict(rows):
                found.append(item.pop('client'))

        if '供应商' in present:
            rows = self.get_format_dict(content['供应商'])
            for item in Supplier.create_from_dict(rows):
                found.append(item.pop('supplier'))

        if '信用评级' in present:
            for row in self.get_format_dict(content['信用评级']):
                found.append(row.pop('评级公司'))

        if '土地转让' in present:
            for row in self.get_format_dict(content['土地转让']):
                # Both holder entries are removed before the remaining
                # fields are handed to Plot.
                row.pop('原土地使用权人')
                row.pop('现有土地使用权人')
                found.append(Plot(**row))

        return found

    def get_all_nodes(self):
        """Collect the nodes of every '经营状况' document, grouped by label.

        The query is limited to 1000 documents.  Documents whose url does
        not contain a ``unique=<32 word chars>`` code are skipped; for the
        others the url is rewritten to ``/firm_<code>.html`` before the
        nodes are extracted.

        :return: dict mapping a label to the list of node dicts
            (``to_dict()`` output) carrying that label.
        """
        enterprises = self.base.query(
            sql={
                'metaModel': '经营状况',
                # 'name': '重庆轩烽建材有限公司'
            },
            limit=1000,
            # skip=2000,
            no_cursor_timeout=True)
        # i counts processed documents, j counts reporting rounds.
        i, j = 0, 0
        etp_count = enterprises.count()
        nodes = {}
        # Raw string: avoids the invalid-escape warning for ``\w``.
        unique_code_pattern = re.compile(r'(?<=unique=)\w{32}')

        def getUniqueCode(url):
            """Return the 32-character code following 'unique=' or None."""
            _uc_ = re.search(unique_code_pattern, url)
            if _uc_ is not None:
                return _uc_.group(0)
            else:
                return None

        for ep in enterprises:
            i += 1
            uc = getUniqueCode(ep['url'])
            if uc is None:
                continue
            ep['url'] = '/firm_' + uc + '.html'
            nds = self.get_all_nodes_from_enterprise(ep)
            for _nds_ in nds:
                if _nds_ is None:
                    continue
                _nds_ = _nds_.to_dict()
                nodes.setdefault(_nds_['label'], []).append(_nds_)
            if i % 1000 == 0:
                j += 1
                # Fixed: the round number (j) and the processed count (i)
                # were passed in swapped order.
                print(
                    SuccessMessage('{}:success merge nodes to database '
                                   'round {} and deal {}/{} enterprise'
                                   ''.format(dt.datetime.now(), j, i,
                                             etp_count)))
        return nodes

    def get_all_relationships_from_enterprise(self, etp):
        etp_n = Enterprise(URL=etp['url'], NAME=etp['name'])
        etp_n = self.get_neo_node(etp_n)
        if etp_n is None:
            return []
        relationships = []
        if '产权交易' in etp['content'].keys():
            # data = self.get_format_dict(etp['content']['产权交易'])
            # for d in data:
            #     bd = d.pop('标的')
            #     bd_n =
            pass

        if '行政许可' in etp['content'].keys():
            data = etp['content']['行政许可']
            if '工商局' in data.keys():
                d1 = self.get_format_dict(data['工商局'])
                ls = License.create_from_dict(d1, '工商局')
                for l in ls:
                    l_ = l.pop('license')
                    l_n = self.get_neo_node(l_)
                    if l_n is None:
                        continue
                    relationships.append(Have(etp_n, l_n, **l))
                pass
            if '信用中国' in data.keys():
                d2 = self.get_format_dict(data['信用中国'])
                ls = License.create_from_dict(d2, '信用中国')
                for l in ls:
                    l_ = l.pop('license')
                    l_n = self.get_neo_node(l_)
                    if l_n is None:
                        continue
                    relationships.append(Have(etp_n, l_n, **l))
                pass
            pass
        if '招投标信息' in etp['content'].keys():
            # 公示的招投标信息一般都是结果，一般情况下是找不到
            # 共同投标的单位，除非是共同中标
            data = self.get_format_dict(etp['content']['招投标信息'])
            bs = Bidding.create_from_dict(data)
            for b in bs:
                _ = b.pop('bidding')
                b_n = self.get_neo_node(_)
                if b_n is None:
                    continue
                # TODO(leung):项目分类用作了招投标结果
                relationships.append(
                    TakePartIn(etp_n, b_n, **dict(b, **{'RESULT':
                                                        b_n['TYPE']})))
            pass
        if '抽查检查' in etp['content'].keys():
            data = self.get_format_dict(etp['content']['抽查检查'])
            cs = Check.create_from_dict(data)
            for c in cs:
                _ = c.pop('check')
                n = self.get_neo_node(_)
                if n is None:
                    continue
                relationships.append(
                    Have(etp_n, n, **dict(c, **{'RESULT': n['RESULT']})))
            pass
        if '双随机抽查' in etp['content'].keys():
            data = self.get_format_dict(etp['content']['双随机抽查'])
            rcs = RandomCheck.create_from_dict(data)
            # rcs_n = self.get_neo_node(rcs)
            for rc in rcs:
                # TODO(leung):随机抽查没有结果
                _ = rc.pop('check')
                n = self.get_neo_node(_)
                if n is None:
                    continue
                relationships.append(Have(etp_n, n, **rc))
            pass
        if '税务信用' in etp['content'].keys():
            data = self.get_format_dict(etp['content']['税务信用'])
            ts = TaxCredit.create_from_dict(data)
            # ts_n = self.get_neo_node(ts)
            for t in ts:
                _ = t.pop('TaxCredit')
                n = self.get_neo_node(_)
                if n is None:
                    continue
                # TODO(leung):纳税信用等级作为税务信用评级结果
                relationships.append(
                    Have(etp_n, n, **dict(RESULT=n['GRADE'], **t)))
            pass
        if '进出口信用' in etp['content'].keys():
            data = self.get_format_dict(etp['content']['进出口信用'])
            ies = IAE.create_from_dict(data)
            # ies_n = self.get_neo_node(ies)
            for ie in ies:
                _ = ie.pop('iae')
                n = self.get_neo_node(_)
                if n is None:
                    continue
                relationships.append(Have(etp_n, n, **ie))
            pass
        if '招聘' in etp['content'].keys():
            data = self.get_format_dict(etp['content']['招聘'])
            rs = Position.create_from_dict(data)
            for r in rs:
                _ = r.pop('position')
                n = self.get_neo_node(_)
                if n is None:
                    continue
                relationships.append(Recruit(etp_n, n, **r))
            pass
        if '客户' in etp['content'].keys():
            data = self.get_format_dict(etp['content']['客户'])
            cs = Client.create_from_dict(data)
            for c in cs:
                _ = c.pop('client')
                n = self.match_node(
                    *legal,
                    cypher='_.URL = "{}" OR _.NAME = "{}"'.format(
                        _['URL'], _['NAME']))
                if n is None:
                    n = self.get_neo_node(_)
                    if n is None:
                        continue
                relationships.append(SellTo(etp_n, n, **c))
            pass
        if '供应商' in etp['content'].keys():
            data = self.get_format_dict(etp['content']['供应商'])
            ss = Supplier.create_from_dict(data)
            for s in ss:
                _ = s.pop('supplier')
                n = self.match_node(
                    *legal,
                    cypher='_.URL = "{}" OR _.NAME = "{}"'.format(
                        _['URL'], _['NAME']))
                if n is None:
                    n = self.get_neo_node(_)
                    if n is None:
                        continue
                relationships.append(BuyFrom(etp_n, n, **s))
            pass
        if '信用评级' in etp['content'].keys():
            data = self.get_format_dict(etp['content']['信用评级'])
            for d in data:
                _ = d.pop('评级公司')
                n = self.match_node(
                    *legal,
                    cypher='_.URL = "{}" OR _.NAME = "{}"'.format(
                        _['链接'], _['名称']))
                if n is None:
                    n = Enterprise(**_)
                    n = self.get_neo_node(n)
                    if n is None:
                        continue
                __ = d.pop('内容')
                d['评级内容'] = __['内容']
                d['评级链接'] = __['链接']
                relationships.append(Appraise(n, etp_n, **d))
            pass
        if '土地转让' in etp['content'].keys():
            data = self.get_format_dict(etp['content']['土地转让'])
            for d in data:
                e1 = d.pop('原土地使用权人')
                e2 = d.pop('现有土地使用权人')
                p = Plot(**d)
                p_n = self.get_neo_node(p)
                if p_n is None:
                    continue
                if e1['名称'] == etp['name'] or e1['链接'] == etp['url']:
                    n1 = etp_n
                else:
                    # 有可能是人
                    n1 = self.match_node(*legal,
                                         cypher='_.URL = "{}"'.format(
                                             e1['链接']))
                    if n1 is None:
                        n1 = Enterprise(**e1)
                        if not n1.isEnterprise():
                            n1 = Person(**e1)
                            if not n1.isPerson():
                                n1 = Related(**e1)
                        n1 = self.get_neo_node(n1)
                if n1 is not None:
                    relationships.append(Sell(n1, p_n))
                if e2['名称'] == etp['name'] or e2['链接'] == etp['url']:
                    n2 = etp_n
                else:
                    n2 = self.match_node(*legal,
                                         cypher='_.URL = "{}"'.format(
                                             e2['链接']))
                    if n2 is None:
                        n2 = Enterprise(**e2)
                        if not n2.isEnterprise():
                            n2 = Person(**e2)
                            if not n2.isPerson():
                                n2 = Related(**e2)
                        n2 = self.get_neo_node(n2)
                if n2 is not None:
                    relationships.append(Buy(n2, p_n))
            pass
        return relationships

    def get_all_relationships(self):
        """
        Collect the relationships of every '经营状况' document, grouped by
        relationship label.

        Documents whose url carries no ``unique=<32 word characters>`` code
        are skipped; for the others the url is rewritten to
        ``/firm_<code>.html`` before the relationships are extracted.

        :return: dict mapping each relationship label to the list of
            relationship dicts (the ``to_dict()`` results) with that label
        """
        enterprises = self.base.query(
            sql={
                'metaModel': '经营状况',
            },
            limit=1000,
            no_cursor_timeout=True)
        etp_count = enterprises.count()
        relationships = {}
        # Raw string: '\w' inside a plain literal is an invalid escape.
        unique_code_pattern = re.compile(r'(?<=unique=)\w{32}')
        rounds = 0

        for dealt, ep in enumerate(enterprises, start=1):
            matched = unique_code_pattern.search(ep['url'])
            if matched is None:
                continue
            ep['url'] = '/firm_' + matched.group(0) + '.html'
            for rel in self.get_all_relationships_from_enterprise(ep):
                rel = rel.to_dict()
                relationships.setdefault(rel['label'], []).append(rel)
            if dealt % 1000 == 0:
                rounds += 1
                # Placeholders are: timestamp, round, dealt, total.
                print(
                    SuccessMessage('{}:success merge relationship to database '
                                   'round {} and deal {}/{} enterprise'
                                   ''.format(dt.datetime.now(), rounds, dealt,
                                             etp_count)))
        return relationships

    def create_all_relationship(self):
        """
        Create the relationships described by every '经营状况' document and
        merge them into the graph database in batches.

        1.enterprise -[have or x]->x

        The enterprise node is matched by name. When nothing matches, a node
        is created from the enterprise's '基本信息' document (its basic
        relationships are queued as well) or, when that document is missing
        too, as a ``Related`` node holding only name and url. Queued
        relationships are merged as soon as more than 1000 are pending and
        once more after the last document.

        :return:
        """
        ops = self.base.query(
            sql={
                'metaModel': '经营状况',
            },
            limit=1000,
            no_cursor_timeout=True)
        i, k = 0, 0
        eg = EtpGraph()
        etp_count = ops.count()
        relationships = []

        def merge_pending(round_no, dealt):
            # Merge the pending batch, call create_index_and_constraint()
            # while index_and_constraint_statue is falsy, report progress
            # and empty the batch.
            self.graph_merge_relationships(relationships)
            if not self.index_and_constraint_statue:
                self.create_index_and_constraint()
            print(
                SuccessMessage(
                    '{}:success merge relationships to database '
                    'round {} and deal {}/{} enterprise,and'
                    ' merge {} relationships.'.format(
                        dt.datetime.now(), round_no, dealt, etp_count,
                        len(relationships))))
            relationships.clear()

        for o in ops:
            k += 1
            # TODO(leung): note that the url found in modules other than
            # the basic information cannot identify the company
            etp_n = self.match_node(*legal,
                                    cypher='_.NAME = "{}"'.format(o['name']))
            if etp_n is None:
                # The company is not in the database yet, so create it.
                _ = self.base.query_one(sql={
                    'metaModel': '基本信息',
                    'name': o['name']
                })
                if _ is not None:
                    etp_n = self.get_neo_node(Enterprise(_))
                    # Creating judicial relationships may create enterprises
                    # that are not in the database yet, but not their basic
                    # relationships, so those are added here.
                    relationships.extend(
                        eg.create_relationship_from_enterprise_baseinfo(_))
                else:
                    # Nothing is known about this company: create an
                    # incomplete one. A company stored in neo4j with only
                    # name and url has no '基本信息'.
                    etp = Related()
                    etp['NAME'] = o['name']
                    etp['URL'] = o['url']
                    etp_n = self.get_neo_node(etp)

            # Without a start node no relationship can be anchored.
            if etp_n is not None:
                relationships.extend(
                    self._relationships_from_operating_doc(o, etp_n))

            if len(relationships) > 1000:
                i += 1
                merge_pending(i, k)
        if len(relationships):
            i += 1
            merge_pending(i, k)

    def _relationships_from_operating_doc(self, o, etp_n):
        """
        Build the graph relationships described by one '经营状况' document.

        :param o: the queried document; its 'name', 'url' and 'content'
            entries are read
        :param etp_n: graph node of the enterprise the document belongs to
        :return: list with the ``get_relationship()`` result of every
            relationship found
        """
        # Sections are read from the queried document itself.
        content = o['content']
        relationships = []
        url_or_name = '_.URL = "{}" OR _.NAME = "{}"'

        # NOTE: the '产权交易' section is not handled yet.

        if '行政许可' in content:
            data = content['行政许可']
            for source in ('工商局', '信用中国'):
                if source not in data:
                    continue
                formatted = self.get_format_dict(data[source])
                for lic in License.create_from_dict(formatted, source):
                    l_n = self.get_neo_node(lic.pop('license'))
                    if l_n is None:
                        continue
                    relationships.append(
                        Have(etp_n, l_n, **lic).get_relationship())
        if '招投标信息' in content:
            # Published bidding information is usually the result; co-bidders
            # generally cannot be found unless they won jointly.
            data = self.get_format_dict(content['招投标信息'])
            for b in Bidding.create_from_dict(data):
                b_n = self.get_neo_node(b.pop('bidding'))
                if b_n is None:
                    continue
                # TODO(leung): the project category is used as the bidding
                # result
                relationships.append(
                    TakePartIn(etp_n, b_n,
                               **dict(b, **{'RESULT':
                                            b_n['TYPE']})).get_relationship())
        if '抽查检查' in content:
            data = self.get_format_dict(content['抽查检查'])
            for c in Check.create_from_dict(data):
                n = self.get_neo_node(c.pop('check'))
                if n is None:
                    continue
                relationships.append(
                    Have(etp_n, n,
                         **dict(c, **{'RESULT':
                                      n['RESULT']})).get_relationship())
        if '双随机抽查' in content:
            data = self.get_format_dict(content['双随机抽查'])
            for rc in RandomCheck.create_from_dict(data):
                # TODO(leung): random checks carry no result
                n = self.get_neo_node(rc.pop('check'))
                if n is None:
                    continue
                relationships.append(Have(etp_n, n, **rc).get_relationship())
        if '税务信用' in content:
            data = self.get_format_dict(content['税务信用'])
            for t in TaxCredit.create_from_dict(data):
                n = self.get_neo_node(t.pop('TaxCredit'))
                if n is None:
                    continue
                # TODO(leung): the tax credit grade is used as the tax
                # credit rating result
                relationships.append(
                    Have(etp_n, n, **dict(RESULT=n['GRADE'],
                                          **t)).get_relationship())
        if '进出口信用' in content:
            data = self.get_format_dict(content['进出口信用'])
            for ie in IAE.create_from_dict(data):
                n = self.get_neo_node(ie.pop('iae'))
                if n is None:
                    continue
                relationships.append(Have(etp_n, n, **ie).get_relationship())
        if '招聘' in content:
            data = self.get_format_dict(content['招聘'])
            for r in Position.create_from_dict(data):
                n = self.get_neo_node(r.pop('position'))
                if n is None:
                    continue
                relationships.append(
                    Recruit(etp_n, n, **r).get_relationship())
        if '客户' in content:
            data = self.get_format_dict(content['客户'])
            for c in Client.create_from_dict(data):
                _ = c.pop('client')
                n = self.match_node(
                    *legal, cypher=url_or_name.format(_['URL'], _['NAME']))
                if n is None:
                    n = self.get_neo_node(_)
                    if n is None:
                        continue
                relationships.append(SellTo(etp_n, n, **c).get_relationship())
        if '供应商' in content:
            data = self.get_format_dict(content['供应商'])
            for s in Supplier.create_from_dict(data):
                _ = s.pop('supplier')
                n = self.match_node(
                    *legal, cypher=url_or_name.format(_['URL'], _['NAME']))
                if n is None:
                    n = self.get_neo_node(_)
                    if n is None:
                        continue
                relationships.append(
                    BuyFrom(etp_n, n, **s).get_relationship())
        if '信用评级' in content:
            data = self.get_format_dict(content['信用评级'])
            for d in data:
                _ = d.pop('评级公司')
                n = self.match_node(
                    *legal, cypher=url_or_name.format(_['链接'], _['名称']))
                if n is None:
                    n = Related()
                    n['NAME'] = _['名称']
                    n['URL'] = _['链接']
                    n = self.get_neo_node(n)
                    if n is None:
                        continue
                __ = d.pop('内容')
                d['评级内容'] = __['内容']
                d['评级链接'] = __['链接']
                # The rating company is the start node of the relationship.
                relationships.append(
                    Appraise(n, etp_n, **d).get_relationship())
        if '土地转让' in content:

            def party_node(party):
                # Node of one side of a land transfer: the enterprise itself
                # when name or url match, otherwise a node matched by url
                # or, failing that, a new Related node (may be a person).
                if party['名称'] == o['name'] or party['链接'] == o['url']:
                    return etp_n
                node = self.match_node(*legal,
                                       cypher='_.URL = "{}"'.format(
                                           party['链接']))
                if node is None:
                    node = self.get_neo_node(Related(**party))
                return node

            data = self.get_format_dict(content['土地转让'])
            for d in data:
                e1 = d.pop('原土地使用权人')
                e2 = d.pop('现有土地使用权人')
                p_n = self.get_neo_node(Plot(**d))
                if p_n is None:
                    continue
                n1 = party_node(e1)
                if n1 is not None:
                    relationships.append(Sell(n1, p_n).get_relationship())
                n2 = party_node(e2)
                if n2 is not None:
                    relationships.append(Buy(n2, p_n).get_relationship())
        return relationships

    def get_all_nodes_and_relationships_from_enterprise(self, etp):
        """
        Derive the graph nodes and relationships described by one
        enterprise document.

        :param etp: enterprise document; its 'url', 'name' and 'content'
            entries are read
        :return: ``(nodes, relationships)``; both lists are empty when no
            node can be obtained for the enterprise itself. The enterprise
            node is always the first entry of ``nodes``.
        """
        enterprise_node = self.get_neo_node(
            Enterprise(URL=etp['url'], NAME=etp['name']))
        if enterprise_node is None:
            return [], []
        nodes = [enterprise_node]
        relationships = []
        content = etp['content']
        by_url_or_name = '_.URL = "{}" OR _.NAME = "{}"'
        by_url = '_.URL = "{}"'

        def counterparty_node(entity):
            # Node of a client/supplier: an existing node matched by url or
            # name, otherwise a node obtained for the entity (as an
            # Enterprise when the entity reports being one).
            found = self.match_node(
                *legal,
                cypher=by_url_or_name.format(entity['URL'], entity['NAME']))
            if found is not None:
                return found
            if entity.isEnterprise():
                entity = Enterprise(**entity.to_dict(with_label=False))
            return self.get_neo_node(entity)

        def land_party_node(party):
            # Node of one side of a land transfer. The party's link is
            # normalised in place first; the party may be a person.
            party['链接'] = Enterprise.parser_url(party['链接'])
            if party['名称'] == etp['name'] or party['链接'] == etp['url']:
                return enterprise_node
            found = self.match_node(*legal,
                                    cypher=by_url.format(party['链接']))
            if found is not None:
                return found
            candidate = Enterprise(**party)
            if not candidate.isEnterprise():
                candidate = Person(**party)
                if not candidate.isPerson():
                    candidate = Related(**party)
            return self.get_neo_node(candidate)

        # NOTE: the '产权交易' section is not handled yet.

        if '行政许可' in content:
            permits = content['行政许可']
            for authority in ('工商局', '信用中国'):
                if authority not in permits:
                    continue
                formatted = self.get_format_dict(permits[authority])
                for props in License.create_from_dict(formatted, authority):
                    target = self.get_neo_node(props.pop('license'))
                    if target is None:
                        continue
                    nodes.append(target)
                    relationships.append(
                        Have(enterprise_node, target, **props))

        if '招投标信息' in content:
            # Published bidding information is usually the result; co-bidders
            # generally cannot be found unless they won jointly.
            formatted = self.get_format_dict(content['招投标信息'])
            for props in Bidding.create_from_dict(formatted):
                target = self.get_neo_node(props.pop('bidding'))
                if target is None:
                    continue
                # TODO(leung): the project category is used as the bidding
                # result
                nodes.append(target)
                merged = dict(props, RESULT=target['TYPE'])
                relationships.append(
                    TakePartIn(enterprise_node, target, **merged))

        if '抽查检查' in content:
            formatted = self.get_format_dict(content['抽查检查'])
            for props in Check.create_from_dict(formatted):
                target = self.get_neo_node(props.pop('check'))
                if target is None:
                    continue
                nodes.append(target)
                merged = dict(props, RESULT=target['RESULT'])
                relationships.append(Have(enterprise_node, target, **merged))

        if '双随机抽查' in content:
            formatted = self.get_format_dict(content['双随机抽查'])
            for props in RandomCheck.create_from_dict(formatted):
                # TODO(leung): random checks carry no result
                target = self.get_neo_node(props.pop('check'))
                if target is None:
                    continue
                nodes.append(target)
                relationships.append(Have(enterprise_node, target, **props))

        if '税务信用' in content:
            formatted = self.get_format_dict(content['税务信用'])
            for props in TaxCredit.create_from_dict(formatted):
                target = self.get_neo_node(props.pop('TaxCredit'))
                if target is None:
                    continue
                # TODO(leung): the tax credit grade is used as the tax
                # credit rating result
                nodes.append(target)
                merged = dict(RESULT=target['GRADE'], **props)
                relationships.append(Have(enterprise_node, target, **merged))

        if '进出口信用' in content:
            formatted = self.get_format_dict(content['进出口信用'])
            for props in IAE.create_from_dict(formatted):
                target = self.get_neo_node(props.pop('iae'))
                if target is None:
                    continue
                nodes.append(target)
                relationships.append(Have(enterprise_node, target, **props))

        if '招聘' in content:
            formatted = self.get_format_dict(content['招聘'])
            for props in Position.create_from_dict(formatted):
                target = self.get_neo_node(props.pop('position'))
                if target is None:
                    continue
                nodes.append(target)
                relationships.append(
                    Recruit(enterprise_node, target, **props))

        if '客户' in content:
            formatted = self.get_format_dict(content['客户'])
            for props in Client.create_from_dict(formatted):
                target = counterparty_node(props.pop('client'))
                if target is None:
                    continue
                nodes.append(target)
                relationships.append(SellTo(enterprise_node, target, **props))

        if '供应商' in content:
            formatted = self.get_format_dict(content['供应商'])
            for props in Supplier.create_from_dict(formatted):
                target = counterparty_node(props.pop('supplier'))
                if target is None:
                    continue
                nodes.append(target)
                relationships.append(
                    BuyFrom(enterprise_node, target, **props))

        if '信用评级' in content:
            for rating in self.get_format_dict(content['信用评级']):
                agency = rating.pop('评级公司')
                agency['链接'] = Enterprise.parser_url(agency['链接'])
                target = self.match_node(
                    *legal,
                    cypher=by_url_or_name.format(agency['链接'],
                                                 agency['名称']))
                if target is None:
                    target = self.get_neo_node(Enterprise(**agency))
                    if target is None:
                        continue
                detail = rating.pop('内容')
                rating['评级内容'] = detail['内容']
                rating['评级链接'] = detail['链接']
                nodes.append(target)
                # The rating company is the start node of the relationship.
                relationships.append(
                    Appraise(target, enterprise_node, **rating))

        if '土地转让' in content:
            for transfer in self.get_format_dict(content['土地转让']):
                former = transfer.pop('原土地使用权人')
                current = transfer.pop('现有土地使用权人')
                plot_node = self.get_neo_node(Plot(**transfer))
                if plot_node is None:
                    continue
                # Former holder sells the plot, current holder buys it; the
                # plot node is recorded once per relationship.
                for party, relation in ((former, Sell), (current, Buy)):
                    party_node = land_party_node(party)
                    if party_node is None:
                        continue
                    nodes.append(party_node)
                    nodes.append(plot_node)
                    relationships.append(relation(party_node, plot_node))

        return nodes, relationships

    def get_all_nodes_and_relationships(self, save_folder=None, **kwargs):
        """
        Collect the nodes and relationships of every '经营状况' document,
        grouped by label, optionally handing them to ``save_graph``.

        Documents whose url carries no ``unique=<32 word characters>`` code
        are logged and skipped; for the others the url is rewritten to
        ``/firm_<code>.html`` before extraction.

        :param save_folder: when not None, the grouped data is passed to
            ``self.save_graph`` each time the document counter reaches a
            multiple of 10000 and once more at the end, and the in-memory
            groups are emptied after every save
        :param kwargs: forwarded unchanged to ``self.save_graph``
        :return: ``(nodes, relationships)``, two dicts mapping a label to
            the list of node / relationship dicts carrying it; both are
            empty when ``save_folder`` was given
        """
        enterprises = self.base.query(
            sql={
                'metaModel': '经营状况',
            },
            no_cursor_timeout=True)
        i, j = 0, 0
        nc, rc = 0, 0
        etp_count = enterprises.count()
        nodes, relationships = {}, {}
        # Raw string: '\w' inside a plain literal is an invalid escape.
        unique_code_pattern = re.compile(r'(?<=unique=)\w{32}')

        _st_ = time.time()
        for ep in enterprises:
            i += 1
            matched = unique_code_pattern.search(ep['url'])
            if matched is None:
                self.logger.info('{}:mismatch url'.format(ep['name']))
                continue
            ep['url'] = '/firm_' + matched.group(0) + '.html'
            nds, rps = self.get_all_nodes_and_relationships_from_enterprise(ep)
            for node in nds:
                if node is None:
                    continue
                # The first label of the node serves as its group key.
                label = list(node.labels)[0]
                record = dict(label=label, **node)
                nodes.setdefault(label, []).append(record)
            for rel in rps:
                rel = rel.to_dict()
                relationships.setdefault(rel['label'], []).append(rel)
            if i % 10000 == 0:
                j += 1
                if save_folder is not None:
                    _nc_, _rc_ = self.save_graph(save_folder, nodes,
                                                 relationships, **kwargs)
                    nc += _nc_
                    rc += _rc_
                    nodes.clear()
                    relationships.clear()
                self.logger.info(
                    SuccessMessage('success trans data to csv round {} and '
                                   'deal {}/{} enterprise spend {} seconds.'
                                   ''.format(j, i, etp_count,
                                             int(time.time() - _st_))))
                _st_ = time.time()
        if save_folder is not None:
            # Save whatever is left over from the last partial round.
            _nc_, _rc_ = self.save_graph(save_folder, nodes, relationships,
                                         **kwargs)
            nc += _nc_
            rc += _rc_
            nodes.clear()
            relationships.clear()
            self.logger.info('Summary:')
            self.logger.info(' save graph data:')
            self.logger.info('   {} nodes'.format(nc))
            self.logger.info('   {} relationships'.format(rc))
        return nodes, relationships