def __init__(self, pregranted_canopies, granted_canopies, config):
    """Keep the canopy collections and obtain the four database handles.

    Args:
        pregranted_canopies: canopy data for pregranted records; stored
            unchanged on the instance.
        granted_canopies: canopy data for granted records; stored unchanged
            on the instance.
        config: configuration object handed to every ``pvdb`` factory call.
    """
    # Canopy data supplied by the caller.
    self.pregranted_canopies = pregranted_canopies
    self.granted_canopies = granted_canopies

    # Handles for the full granted / pregranted tables.
    self.cnx_g = pvdb.granted_table(config)
    self.cnx_pg = pvdb.pregranted_table(config)

    # Handles for the incremental granted / pregranted tables.
    self.cnx_g_inc = pvdb.incremental_granted_table(config)
    self.cnx_pg_inc = pvdb.incremental_pregranted_table(config)
def drop_tables(config):
    """Empty ``temp_assignee_disambiguation_mapping`` in both databases.

    Despite the function name, the table is truncated, not dropped.

    Args:
        config: configuration object forwarded to ``pvdb.granted_table`` and
            ``pvdb.pregranted_table``.
    """
    statement = "TRUNCATE TABLE temp_assignee_disambiguation_mapping"

    # Obtain both connections before touching any table so that a connection
    # failure surfaces before the granted table has been modified.
    cnx_g = pvdb.granted_table(config)
    cnx_pg = pvdb.pregranted_table(config)

    # NOTE(review): the connections themselves are left open, as before;
    # confirm whether pvdb hands out shared connections before closing them.
    for cnx in (cnx_g, cnx_pg):
        cursor = cnx.cursor()
        try:
            cursor.execute(statement)
        finally:
            # Release the cursor even when the statement fails.
            cursor.close()
def drop_tables(config):
    """Drop ``location_disambiguation_mapping`` from both databases.

    Args:
        config: configuration object forwarded to ``pvdb.granted_table`` and
            ``pvdb.pregranted_table``.
    """
    statement = "DROP TABLE location_disambiguation_mapping"

    # Obtain both connections before touching any table so that a connection
    # failure surfaces before the granted table has been dropped.
    cnx_g = pvdb.granted_table(config)
    cnx_pg = pvdb.pregranted_table(config)

    # NOTE(review): the connections themselves are left open, as before;
    # confirm whether pvdb hands out shared connections before closing them.
    for cnx in (cnx_g, cnx_pg):
        cursor = cnx.cursor()
        try:
            cursor.execute(statement)
        finally:
            # Release the cursor even when the statement fails.
            cursor.close()
def create_tables(config):
    """Create ``location_disambiguation_mapping`` in both databases.

    The table has two ``VARCHAR(255)`` columns: ``uuid`` and ``location_id``.

    Args:
        config: configuration object forwarded to ``pvdb.granted_table`` and
            ``pvdb.pregranted_table``.
    """
    statement = (
        "CREATE TABLE location_disambiguation_mapping "
        "(uuid VARCHAR(255), location_id VARCHAR(255))")

    # Obtain both connections before creating anything so that a connection
    # failure surfaces before the granted database has been modified.
    cnx_g = pvdb.granted_table(config)
    cnx_pg = pvdb.pregranted_table(config)

    # NOTE(review): the connections themselves are left open, as before;
    # confirm whether pvdb hands out shared connections before closing them.
    for cnx in (cnx_g, cnx_pg):
        cursor = cnx.cursor()
        try:
            cursor.execute(statement)
        finally:
            # Release the cursor even when the statement fails.
            cursor.close()
def build_pregrants(canopy2mentions, pregranted_uuid2canopy, config):
    """Attach pregranted location mentions to their canopies.

    Every row of ``rawlocation`` in the pregranted database is turned into a
    ``LocationMention``; mentions whose ``uuid`` is a key of
    ``pregranted_uuid2canopy`` are appended to the matching canopy list.

    Args:
        canopy2mentions: mapping from canopy to a list of mentions; mutated
            in place (presumably a ``defaultdict(list)`` - verify against
            the caller).
        pregranted_uuid2canopy: mapping from mention uuid to canopy.
        config: configuration object forwarded to ``pvdb.pregranted_table``.

    Returns:
        The same ``canopy2mentions`` object that was passed in.
    """
    connection = pvdb.pregranted_table(config)
    rows = connection.cursor()
    rows.execute(
        "SELECT id, city, state, country, latitude, longitude, filename, "
        "created_date, updated_date FROM rawlocation;")

    # `total` is only a hard-coded progress-bar hint.
    for row in tqdm(rows, 'working on pregrants', total=10866744):
        mention = LocationMention.from_application_sql_record(row)
        if mention.uuid not in pregranted_uuid2canopy:
            # Mentions without a known canopy are ignored.
            continue
        target = pregranted_uuid2canopy[mention.uuid]
        canopy2mentions[target].append(mention)

    return canopy2mentions
def build_pregrants(config):
    """Group pregranted raw-location ids by disambiguated entity id.

    The entity id for each ``rawinventor`` row comes from
    ``load_disambiguation(config)``, looked up by the row's ``id``. A row
    whose ``id`` is missing from that mapping raises ``KeyError``.

    Args:
        config: configuration object forwarded to ``load_disambiguation``
            and ``pvdb.pregranted_table``.

    Returns:
        A ``(canopy2uuids, uuid2canopy)`` tuple: entity id -> list of
        ``rawlocation_id`` values, and ``rawlocation_id`` -> entity id.
    """
    uuid2entityid = load_disambiguation(config)
    connection = pvdb.pregranted_table(config)
    rows = connection.cursor()
    rows.execute("SELECT id, rawlocation_id FROM rawinventor;")

    canopy2uuids = collections.defaultdict(list)
    uuid2canopy = {}
    # `total` is only a hard-coded progress-bar hint.
    for inventor_uuid, rawlocation_id in tqdm(rows, 'process', total=8100000):
        entity_id = uuid2entityid[inventor_uuid]
        canopy2uuids[entity_id].append(rawlocation_id)
        uuid2canopy[rawlocation_id] = entity_id

    return canopy2uuids, uuid2canopy
def create_tables(config):
    """Create the temporary inventor disambiguation tables.

    ``tmp_inventor_disambiguation_granted2`` is created through the granted
    connection and ``tmp_inventor_disambiguation_pregranted2`` through the
    pregranted connection. Both have two ``VARCHAR(255)`` columns: ``uuid``
    and ``disambiguated_id``.

    Args:
        config: configuration object forwarded to ``pvdb.granted_table`` and
            ``pvdb.pregranted_table``.
    """
    # Obtain both connections before creating anything so that a connection
    # failure surfaces before the granted database has been modified.
    cnx_g = pvdb.granted_table(config)
    cnx_pg = pvdb.pregranted_table(config)

    statements = (
        (cnx_g,
         "CREATE TABLE tmp_inventor_disambiguation_granted2 (uuid VARCHAR(255), disambiguated_id VARCHAR(255))"),
        (cnx_pg,
         "CREATE TABLE tmp_inventor_disambiguation_pregranted2 (uuid VARCHAR(255), disambiguated_id VARCHAR(255))"),
    )

    # NOTE(review): the connections themselves are left open, as before;
    # confirm whether pvdb hands out shared connections before closing them.
    for cnx, statement in statements:
        cursor = cnx.cursor()
        try:
            cursor.execute(statement)
        finally:
            # Release the cursor even when the statement fails.
            cursor.close()
def build_pregrants(config):
    """Index pregranted assignee mentions by their first name feature.

    Args:
        config: configuration object forwarded to ``pvdb.pregranted_table``.

    Returns:
        A ``defaultdict(list)`` mapping ``name_features()[0]`` of each
        ``AssigneeMention`` to the list of mentions sharing that feature.
    """
    # rawassignee columns:
    # | id | document_number | sequence | name_first | name_last | organization | type | rawlocation_id | city | state | country | filename | created_date | updated_date |
    connection = pvdb.pregranted_table(config)
    rows = connection.cursor()
    rows.execute(
        "SELECT id, document_number, sequence -1 as sequence, name_first, "
        "name_last, organization, type, rawlocation_id, city, state, country "
        "FROM rawassignee")

    feature_map = collections.defaultdict(list)
    for processed, row in enumerate(rows, start=1):
        mention = AssigneeMention.from_application_sql_record(row)
        key = mention.name_features()[0]
        feature_map[key].append(mention)
        logging.log_every_n(logging.INFO, 'Processed %s pregrant records - %s features', 10000, processed,
                            len(feature_map))
    return feature_map
def create_uuid_map(config):
    """Map assignee mention keys to the row identifiers stored in the database.

    Granted keys have the form ``<patent_id>-<sequence>`` and map to the
    ``uuid`` column; pregranted keys have the form
    ``pg-<document_number>-<sequence - 1>`` and map to the ``id`` column.

    Args:
        config: configuration object forwarded to ``pvdb.granted_table`` and
            ``pvdb.pregranted_table``.

    Returns:
        A ``(granted_uuids, pgranted_uuids)`` tuple of dictionaries.
    """
    cnx_g = pvdb.granted_table(config)
    cnx_pg = pvdb.pregranted_table(config)

    granted_rows = cnx_g.cursor()
    granted_rows.execute("SELECT uuid, patent_id, sequence FROM rawassignee;")
    granted_uuids = {
        '%s-%s' % (patent_id, seq): row_uuid
        for row_uuid, patent_id, seq in tqdm(granted_rows, 'granted uuids')
    }

    pregranted_rows = cnx_pg.cursor()
    pregranted_rows.execute("SELECT id, document_number, sequence-1 as sequence FROM rawassignee;")
    pgranted_uuids = {
        'pg-%s-%s' % (doc_id, seq): row_uuid
        for row_uuid, doc_id, seq in tqdm(pregranted_rows, 'pregranted uuids')
    }

    return granted_uuids, pgranted_uuids
def _insert_assignee_mapping(cnx, pairs, description, batch_size=100000):
    """Insert ``(uuid, assignee_id)`` pairs into the mapping table in batches.

    Each batch is sent with a parameterised ``executemany`` and committed on
    its own, so values are escaped by the driver instead of being spliced
    into the SQL text.
    """
    # NOTE(review): "%s" placeholders assume a MySQL driver using the
    # format/pyformat paramstyle (mysql.connector, PyMySQL, MySQLdb) - confirm.
    sql = ("INSERT INTO temp_assignee_disambiguation_mapping "
           "(uuid, assignee_id, version_indicator) VALUES (%s, %s, %s)")
    cursor = cnx.cursor()
    try:
        for start in tqdm(range(0, len(pairs), batch_size), description):
            rows = [(uuid, assignee_id, "20201229")
                    for uuid, assignee_id in pairs[start:start + batch_size]]
            cursor.executemany(sql, rows)
            cnx.commit()
    finally:
        cursor.close()


def upload(granted_ids, pregranted_ids, config):
    """Load assignee disambiguation results into the temporary mapping tables.

    The tab-separated file at ``config['ASSIGNEE_UPLOAD']['input']`` is read
    line by line. The first field is looked up in ``pregranted_ids`` and then
    in ``granted_ids``; on a hit, the mapped uuid and the second field are
    queued for insertion into ``temp_assignee_disambiguation_mapping`` of the
    corresponding database. Lines matching neither mapping are ignored.

    Args:
        granted_ids: mapping from the first field of an input line to the
            uuid to store for granted records.
        pregranted_ids: same, for pregranted records; checked first.
        config: configuration providing the input path and forwarded to
            ``pvdb.granted_table`` / ``pvdb.pregranted_table``.
    """
    pairs_pregranted = []
    pairs_granted = []
    with open(config['ASSIGNEE_UPLOAD']['input'], 'r') as fin:
        for line in fin:
            splt = line.strip().split('\t')
            if splt[0] in pregranted_ids:
                pairs_pregranted.append((pregranted_ids[splt[0]], splt[1]))
            elif splt[0] in granted_ids:
                pairs_granted.append((granted_ids[splt[0]], splt[1]))

    # Both connections are opened before any insert, as in the original flow.
    cnx_g = pvdb.granted_table(config)
    cnx_pg = pvdb.pregranted_table(config)
    try:
        try:
            _insert_assignee_mapping(cnx_g, pairs_granted, 'adding granted')
        finally:
            cnx_g.close()
        _insert_assignee_mapping(cnx_pg, pairs_pregranted, 'adding pregranted')
    finally:
        # Closed even when the granted upload fails, so no connection leaks.
        cnx_pg.close()
def create_uuid_map(config):
    """Collect every ``rawlocation.id`` from the granted and pregranted databases.

    Args:
        config: configuration object forwarded to ``pvdb.granted_table`` and
            ``pvdb.pregranted_table``.

    Returns:
        A ``(granted_uuids, pgranted_uuids)`` tuple of sets of ids.
    """
    cnx_g = pvdb.granted_table(config)
    cnx_pg = pvdb.pregranted_table(config)

    granted_rows = cnx_g.cursor()
    granted_rows.execute("SELECT id FROM rawlocation;")
    # Each row is a one-column record; keep only the id itself.
    granted_uuids = {row[0] for row in tqdm(granted_rows, 'granted uuids')}

    pregranted_rows = cnx_pg.cursor()
    pregranted_rows.execute("SELECT id FROM rawlocation;")
    pgranted_uuids = {row[0] for row in tqdm(pregranted_rows, 'pregranted uuids')}

    return granted_uuids, pgranted_uuids
def build_pregrants(config):
    """Map pregranted application ids to their invention titles.

    Args:
        config: configuration object forwarded to ``pvdb.pregranted_table``.

    Returns:
        A dict from ``'pg-<document_number>'`` to ``invention_title``; empty
        when ``pvdb.pregranted_table`` returns ``None``.
    """
    feature_map = {}
    connection = pvdb.pregranted_table(config)
    if connection is None:
        return feature_map

    rows = connection.cursor()
    rows.execute("select document_number,invention_title from application;")

    processed = 0  # stays 0 when the query yields no rows
    for processed, row in enumerate(rows, start=1):
        feature_map['pg-%s' % row[0]] = row[1]
        logging.log_every_n(logging.INFO, 'Processed %s pregrant records - %s features', 10000, processed,
                            len(feature_map))

    # Final summary once every row has been consumed.
    logging.log(logging.INFO, 'Processed %s pregrant records - %s features', processed, len(feature_map))
    return feature_map
def build_pregrants(config):
    """Group pregranted inventor ids into canopies.

    The canopy key of each ``rawinventor`` row is whatever
    ``first_letter_last_name`` returns for an ``InventorMention`` built from
    the row's id and names (missing names are replaced by ``''``).

    Args:
        config: configuration object forwarded to ``pvdb.pregranted_table``.

    Returns:
        A ``defaultdict(list)`` from canopy key to the list of row ids; empty
        when ``pvdb.pregranted_table`` returns ``None``.
    """
    canopy2uuids = collections.defaultdict(list)
    connection = pvdb.pregranted_table(config)
    # No pregranted table configured: nothing to group.
    if connection is None:
        return canopy2uuids

    rows = connection.cursor()
    rows.execute("SELECT id, name_first, name_last FROM rawinventor;")

    processed = 0  # stays 0 when the query yields no rows
    for processed, (uuid, name_first, name_last) in enumerate(rows, start=1):
        mention = InventorMention(uuid, '0', '', name_first or '', name_last or '', '', '', '')
        canopy2uuids[first_letter_last_name(mention)].append(uuid)
        logging.log_every_n(logging.INFO, 'Processed %s pregrant records - %s canopies', 10000, processed,
                            len(canopy2uuids))

    # Final summary once every row has been consumed.
    logging.log(logging.INFO, 'Processed %s pregrant records - %s canopies', processed, len(canopy2uuids))
    return canopy2uuids
def _insert_inventor_mapping(cnx, table, pairs, description, batch_size=100000):
    """Insert ``(uuid, disambiguated_id)`` pairs into ``table`` and key it on uuid.

    Each batch is sent with a parameterised ``executemany`` and committed on
    its own; once all rows are in, a primary key on ``uuid`` is added.
    ``table`` must be a trusted identifier: it is spliced into the SQL text.
    """
    # NOTE(review): "%s" placeholders assume a MySQL driver using the
    # format/pyformat paramstyle (mysql.connector, PyMySQL, MySQLdb) - confirm.
    sql = "INSERT INTO " + table + " (uuid, disambiguated_id) VALUES (%s, %s)"
    cursor = cnx.cursor()
    try:
        for start in tqdm(range(0, len(pairs), batch_size), description):
            cursor.executemany(sql, pairs[start:start + batch_size])
            cnx.commit()
        cursor.execute('alter table ' + table + ' add primary key (uuid)')
    finally:
        cursor.close()


def upload(config):
    """Load inventor disambiguation results into the temporary tables.

    The tab-separated file at ``config['INVENTOR_UPLOAD']['input']`` must hold
    ``<uuid>\\t<disambiguated_id>`` lines. Lines whose uuid belongs to a
    pregranted canopy go to ``tmp_inventor_disambiguation_pregranted2``, those
    belonging to a granted canopy go to
    ``tmp_inventor_disambiguation_granted2``; other uuids are ignored.

    Lines that do not have exactly two fields are reported on stdout and
    skipped. (Previously they were reported in a separate first pass and then
    processed anyway, which could raise ``IndexError`` or upload a line that
    had just been flagged as an error.)

    Args:
        config: configuration providing the input path and forwarded to
            ``load_mysql.Loader.from_config``, ``pvdb.granted_table`` and
            ``pvdb.pregranted_table``.
    """
    loader = load_mysql.Loader.from_config(config)
    pregranted_ids = {uuid for canopy in loader.pregranted_canopies.values() for uuid in canopy}
    granted_ids = {uuid for canopy in loader.granted_canopies.values() for uuid in canopy}

    pairs_pregranted = []
    pairs_granted = []
    with open(config['INVENTOR_UPLOAD']['input'], 'r') as fin:
        for line in fin:
            splt = line.strip().split('\t')
            if len(splt) != 2:
                print('error %s' % str(splt))
                continue
            uuid, disambiguated_id = splt
            if uuid in pregranted_ids:
                pairs_pregranted.append((uuid, disambiguated_id))
            elif uuid in granted_ids:
                pairs_granted.append((uuid, disambiguated_id))

    # Both connections are opened before any insert, as in the original flow.
    cnx_g = pvdb.granted_table(config)
    cnx_pg = pvdb.pregranted_table(config)
    try:
        try:
            _insert_inventor_mapping(cnx_g, 'tmp_inventor_disambiguation_granted2', pairs_granted,
                                     'adding granted')
        finally:
            cnx_g.close()
        _insert_inventor_mapping(cnx_pg, 'tmp_inventor_disambiguation_pregranted2', pairs_pregranted,
                                 'adding pregranted')
    finally:
        # Closed even when the granted upload fails, so no connection leaks.
        cnx_pg.close()