def upload(granted_ids, pregranted_ids, config):
    """Upload location disambiguation results to the granted and pregranted databases.

    Reads the tab-separated file at ``config['LOCATION_UPLOAD']['input']``; the
    first column is a mention id and the second column the location id
    assigned to it. Rows whose id is in ``pregranted_ids`` are inserted
    through the pregranted connection and rows whose id is in ``granted_ids``
    through the granted connection (pregranted membership is tested first).
    Ids found in neither collection are logged and skipped.

    Args:
        granted_ids: Collection of granted mention ids (used with ``in`` and ``len``).
        pregranted_ids: Collection of pregranted mention ids (same usage).
        config: Configuration mapping; also handed to the ``pvdb`` connection
            factories.
    """
    logging.info('granted_ids size %s', len(granted_ids))
    logging.info('pregranted_ids size %s', len(pregranted_ids))

    pairs_pregranted = []
    pairs_granted = []
    with open(config['LOCATION_UPLOAD']['input'], 'r') as fin:
        for line in fin:
            splt = line.strip().split('\t')
            if splt[0] in pregranted_ids:
                target = pairs_pregranted
            elif splt[0] in granted_ids:
                target = pairs_granted
            else:
                logging.warning('missing id %s', splt[0])
                continue
            if len(splt) < 2:
                # A known id without a second column would otherwise raise
                # IndexError and abort the whole upload.
                logging.warning('missing location for id %s', splt[0])
                continue
            target.append((splt[0], splt[1]))
    logging.info('pairs granted size %s', len(pairs_granted))
    logging.info('pairs pregranted size %s', len(pairs_pregranted))

    batch_size = 100000

    def _insert_batches(cnx, pairs, row_template, description):
        """Insert ``pairs`` in multi-row batches through ``cnx`` and commit."""
        cursor = cnx.cursor()
        starts = range(0, len(pairs), batch_size)
        for start in tqdm(starts, description, total=len(starts)):
            # NOTE(review): values are interpolated straight into the
            # statement; an id containing a double quote will break it.
            # Consider driver-side parameter binding once the driver's
            # paramstyle is confirmed.
            sql = ("INSERT INTO location_disambiguation_mapping "
                   "(uuid, location_id, in_granted, in_pregrant) VALUES "
                   + ', '.join(row_template % pair
                               for pair in pairs[start:start + batch_size]))
            cursor.execute(sql)
        cnx.commit()

    # Both connections are opened before any insert so that a connection
    # problem is detected before either database is modified.
    cnx_g = pvdb.granted_table(config)
    # Pregranted rows must go through the pregranted connection; opening a
    # second granted connection here would write them to the wrong database.
    cnx_pg = pvdb.pregranted_table(config)
    try:
        try:
            _insert_batches(cnx_g, pairs_granted, '("%s", "%s", 1, 0)',
                            'adding granted')
        finally:
            cnx_g.close()
        _insert_batches(cnx_pg, pairs_pregranted, '("%s", "%s", 0, 1)',
                        'adding pregranted')
    finally:
        cnx_pg.close()
Ejemplo n.º 2
0
 def __init__(self, pregranted_canopies, granted_canopies, config):
     """Store the canopy collections and obtain the four database handles.

     Args:
         pregranted_canopies: Stored unchanged as ``self.pregranted_canopies``.
         granted_canopies: Stored unchanged as ``self.granted_canopies``.
         config: Passed to each ``pvdb`` factory below.
     """
     self.pregranted_canopies = pregranted_canopies
     self.granted_canopies = granted_canopies
     # The factories are called eagerly, in this order, at construction time.
     self.cnx_g = pvdb.granted_table(config)
     self.cnx_pg = pvdb.pregranted_table(config)
     # NOTE(review): presumably connections to the incremental-update
     # databases - confirm against pvdb.
     self.cnx_g_inc = pvdb.incremental_granted_table(config)
     self.cnx_pg_inc = pvdb.incremental_pregranted_table(config)
Ejemplo n.º 3
0
def build_granted(fout, config):
    """Dump every ``rawinventor`` row to ``fout`` as ``uuid<TAB>patent_id-sequence``.

    Args:
        fout: Writable text object; one line is written per row.
        config: Passed to ``pvdb.granted_table`` to obtain the connection.
    """
    connection = pvdb.granted_table(config)
    db_cursor = connection.cursor()
    db_cursor.execute("SELECT uuid, patent_id, sequence FROM rawinventor;")
    # ``total`` is only a progress-bar estimate of the table size.
    rows = tqdm(db_cursor, 'process', total=17000000)
    for mention_uuid, patent, position in rows:
        output_line = '%s\t%s-%s\n' % (mention_uuid, patent, position)
        fout.write(output_line)
def collection_location_mentions_granted(config):
    """Group ``rawlocation_id`` values of ``rawinventor`` rows by disambiguated entity.

    Args:
        config: Passed to ``load_disambiguation`` and ``pvdb.granted_table``.

    Returns:
        A ``defaultdict(list)`` keyed by the entity id that
        ``load_disambiguation`` maps each row's uuid to; values are lists of
        ``rawlocation_id`` in cursor order.
    """
    entity_of = load_disambiguation(config)
    grouped = collections.defaultdict(list)
    db_cursor = pvdb.granted_table(config).cursor()
    db_cursor.execute("SELECT uuid, rawlocation_id FROM rawinventor;")
    # ``total`` is only a progress-bar estimate of the table size.
    for mention_uuid, location in tqdm(db_cursor, 'process', total=18000000):
        entity = entity_of[mention_uuid]
        grouped[entity].append(location)
    return grouped
def drop_tables(config):
    """Empty ``temp_assignee_disambiguation_mapping`` through both connections.

    Despite the name, the table is truncated rather than dropped.

    Args:
        config: Passed to the ``pvdb`` connection factories.
    """
    statement = "TRUNCATE TABLE temp_assignee_disambiguation_mapping"
    granted_cnx = pvdb.granted_table(config)
    pregranted_cnx = pvdb.pregranted_table(config)

    cursors = []
    for connection in (granted_cnx, pregranted_cnx):
        cur = connection.cursor()
        cur.execute(statement)
        cursors.append(cur)
    # Cursors are closed only after both statements have run.
    for cur in cursors:
        cur.close()
def create_tables(config):
    """Create ``location_disambiguation_mapping`` through both connections.

    The same two-column table definition (``uuid``, ``location_id``) is
    executed on the granted and then the pregranted connection.

    Args:
        config: Passed to the ``pvdb`` connection factories.
    """
    ddl = "CREATE TABLE location_disambiguation_mapping (uuid VARCHAR(255), location_id VARCHAR(255))"
    granted_cnx = pvdb.granted_table(config)
    pregranted_cnx = pvdb.pregranted_table(config)
    cursors = []
    for connection in (granted_cnx, pregranted_cnx):
        cur = connection.cursor()
        cur.execute(ddl)
        cursors.append(cur)
    # Cursors are closed only after both statements have run.
    for cur in cursors:
        cur.close()
Ejemplo n.º 7
0
def build_granted(canopy2mentions, granted_uuid2canopy, config):
    """Add granted location mentions to ``canopy2mentions``, grouped by canopy.

    Every ``rawlocation`` row is turned into a ``LocationMention``; mentions
    whose ``uuid`` is not a key of ``granted_uuid2canopy`` are ignored.

    Args:
        canopy2mentions: Mapping from canopy to list of mentions; mutated in
            place. Indexing a new canopy must yield a list (e.g. a
            ``defaultdict(list)``).
        granted_uuid2canopy: Mapping from mention uuid to canopy.
        config: Passed to ``pvdb.granted_table``.

    Returns:
        The same ``canopy2mentions`` object.
    """
    db_cursor = pvdb.granted_table(config).cursor()
    db_cursor.execute(
        "SELECT id , location_id , city  , state, country, country_transformed, location_id_transformed  FROM rawlocation;")
    # ``total`` is only a progress-bar estimate of the table size.
    progress = tqdm(db_cursor, 'working on granted patents', total=29032921)
    for record in progress:
        mention = LocationMention.from_granted_sql_record(record)
        if mention.uuid not in granted_uuid2canopy:
            continue
        canopy2mentions[granted_uuid2canopy[mention.uuid]].append(mention)
    return canopy2mentions
def drop_tables(config):
    """Drop ``location_disambiguation_mapping`` through both connections.

    Args:
        config: Passed to the ``pvdb`` connection factories.
    """
    statement = "DROP TABLE location_disambiguation_mapping"
    granted_cnx = pvdb.granted_table(config)
    pregranted_cnx = pvdb.pregranted_table(config)
    cursors = []
    for connection in (granted_cnx, pregranted_cnx):
        cur = connection.cursor()
        cur.execute(statement)
        cursors.append(cur)
    # Cursors are closed only after both statements have run.
    for cur in cursors:
        cur.close()
def build_granted(granted_uuid2canopy, config):
    """Collect granted location mentions grouped by canopy.

    Every ``rawlocation`` row is turned into a ``LocationMention``; mentions
    whose ``uuid`` is not a key of ``granted_uuid2canopy`` are ignored.

    Args:
        granted_uuid2canopy: Mapping from mention uuid to canopy.
        config: Passed to ``pvdb.granted_table``.

    Returns:
        A ``defaultdict(list)`` from canopy to its mentions, in cursor order.
    """
    by_canopy = collections.defaultdict(list)
    db_cursor = pvdb.granted_table(config).cursor()
    db_cursor.execute(
        "SELECT id  , location_id  , city , state , country , country_transformed , location_id_transformed FROM rawlocation;")
    # ``total`` is only a progress-bar estimate of the table size.
    for record in tqdm(db_cursor, 'process', total=18000000):
        mention = LocationMention.from_granted_sql_record(record)
        if mention.uuid not in granted_uuid2canopy:
            continue
        by_canopy[granted_uuid2canopy[mention.uuid]].append(mention)
    return by_canopy
Ejemplo n.º 10
0
def build_granted(granted_uuids, pgranted_uuids, config):
    """Map ``rawassignee`` location ids to and from disambiguated entities.

    Args:
        granted_uuids: Forwarded to ``load_disambiguation``.
        pgranted_uuids: Forwarded to ``load_disambiguation``.
        config: Forwarded to ``load_disambiguation`` and ``pvdb.granted_table``.

    Returns:
        A pair ``(canopy2uuids, uuid2canopy)``: a ``defaultdict(list)`` from
        entity id to the ``rawlocation_id`` values of its rows, and a dict
        from ``rawlocation_id`` back to the entity id (a later row with the
        same ``rawlocation_id`` overwrites an earlier one).
    """
    entity_of = load_disambiguation(granted_uuids, pgranted_uuids, config)
    locations_by_entity = collections.defaultdict(list)
    entity_by_location = {}
    db_cursor = pvdb.granted_table(config).cursor()
    db_cursor.execute("SELECT uuid, rawlocation_id FROM rawassignee;")
    # ``total`` is only a progress-bar estimate of the table size.
    for mention_uuid, location in tqdm(db_cursor, 'process', total=6789244):
        entity = entity_of[mention_uuid]
        locations_by_entity[entity].append(location)
        entity_by_location[location] = entity
    return locations_by_entity, entity_by_location
Ejemplo n.º 11
0
def create_tables(config):
    """Create the temporary inventor disambiguation tables.

    ``tmp_inventor_disambiguation_granted2`` is created through the granted
    connection and ``tmp_inventor_disambiguation_pregranted2`` through the
    pregranted connection; both have ``uuid`` and ``disambiguated_id`` columns.

    Args:
        config: Passed to the ``pvdb`` connection factories.
    """
    granted_cnx = pvdb.granted_table(config)
    pregranted_cnx = pvdb.pregranted_table(config)
    jobs = (
        (granted_cnx,
         "CREATE TABLE tmp_inventor_disambiguation_granted2 (uuid VARCHAR(255), disambiguated_id VARCHAR(255))"),
        (pregranted_cnx,
         "CREATE TABLE tmp_inventor_disambiguation_pregranted2 (uuid VARCHAR(255), disambiguated_id VARCHAR(255))"),
    )
    cursors = []
    for connection, ddl in jobs:
        cur = connection.cursor()
        cur.execute(ddl)
        cursors.append(cur)
    # Cursors are closed only after both statements have run.
    for cur in cursors:
        cur.close()
Ejemplo n.º 12
0
def create_uuid_map(config):
    """Build lookup tables from assignee mention keys to row identifiers.

    Args:
        config: Passed to the ``pvdb`` connection factories.

    Returns:
        A pair of dicts ``(granted_uuids, pgranted_uuids)``. Granted keys are
        ``"<patent_id>-<sequence>"`` mapping to the row ``uuid``; pregranted
        keys are ``"pg-<document_number>-<sequence - 1>"`` mapping to the row
        ``id``.
    """
    granted_cnx = pvdb.granted_table(config)
    pregranted_cnx = pvdb.pregranted_table(config)

    granted_cur = granted_cnx.cursor()
    granted_cur.execute("SELECT uuid, patent_id, sequence FROM rawassignee;")
    granted_lookup = {
        '%s-%s' % (patent, position): row_uuid
        for row_uuid, patent, position in tqdm(granted_cur, 'granted uuids')
    }

    pregranted_cur = pregranted_cnx.cursor()
    # The query itself subtracts one from the pregranted sequence.
    pregranted_cur.execute("SELECT id, document_number, sequence-1 as sequence FROM rawassignee;")
    pregranted_lookup = {
        'pg-%s-%s' % (document, position): row_id
        for row_id, document, position in tqdm(pregranted_cur, 'pregranted uuids')
    }
    return granted_lookup, pregranted_lookup
def build_granted(config):
    """Group granted assignee mentions by their first name feature.

    Each ``rawassignee`` row (uuid, patent_id, assignee_id, rawlocation_id,
    type, name_first, name_last, organization, sequence) is converted with
    ``AssigneeMention.from_granted_sql_record`` and filed under the first
    element of its ``name_features()``.

    Args:
        config: Passed to ``pvdb.granted_table``.

    Returns:
        A ``defaultdict(list)`` from feature to the mentions carrying it.
    """
    db_cursor = pvdb.granted_table(config).cursor()
    db_cursor.execute(
        "SELECT uuid , patent_id , assignee_id , rawlocation_id , type , name_first , name_last , organization , sequence FROM rawassignee;")
    mentions_by_feature = collections.defaultdict(list)
    for processed, record in enumerate(db_cursor, 1):
        mention = AssigneeMention.from_granted_sql_record(record)
        primary_feature = mention.name_features()[0]
        mentions_by_feature[primary_feature].append(mention)
        logging.log_every_n(logging.INFO,
                            'Processed %s granted records - %s features',
                            10000, processed, len(mentions_by_feature))
    return mentions_by_feature
def upload(granted_ids, pregranted_ids, config, version_indicator="20201229"):
    """Upload assignee disambiguation results to the granted and pregranted databases.

    Reads the tab-separated file at ``config['ASSIGNEE_UPLOAD']['input']``; the
    first column is a mention key and the second the assignee id assigned to
    it. The key is translated to a database uuid via ``pregranted_ids`` or
    ``granted_ids`` (pregranted is tested first); keys present in neither
    mapping are skipped silently.

    Args:
        granted_ids: Mapping from mention key to granted uuid.
        pregranted_ids: Mapping from mention key to pregranted uuid.
        config: Configuration mapping; also handed to the ``pvdb`` connection
            factories.
        version_indicator: Value written to the ``version_indicator`` column
            of every inserted row. Defaults to the value that was previously
            hard-coded.
    """
    pairs_pregranted = []
    pairs_granted = []
    with open(config['ASSIGNEE_UPLOAD']['input'], 'r') as fin:
        for line in fin:
            splt = line.strip().split('\t')
            if splt[0] in pregranted_ids:
                target, uuid = pairs_pregranted, pregranted_ids[splt[0]]
            elif splt[0] in granted_ids:
                target, uuid = pairs_granted, granted_ids[splt[0]]
            else:
                continue
            if len(splt) < 2:
                # A known key without an assignee column would otherwise
                # raise IndexError and abort the whole upload.
                continue
            target.append((uuid, splt[1]))

    batch_size = 100000

    def _insert_batches(cnx, pairs, description):
        """Insert ``pairs`` in multi-row batches through ``cnx`` and commit."""
        cursor = cnx.cursor()
        starts = range(0, len(pairs), batch_size)
        for start in tqdm(starts, description, total=len(starts)):
            # NOTE(review): values are interpolated straight into the
            # statement; a value containing a double quote will break it.
            sql = ("INSERT INTO temp_assignee_disambiguation_mapping "
                   "(uuid, assignee_id, version_indicator) VALUES "
                   + ', '.join('("%s", "%s", "%s")' % (uuid, assignee_id, version_indicator)
                               for uuid, assignee_id in pairs[start:start + batch_size]))
            cursor.execute(sql)
        cnx.commit()

    # Both connections are opened before any insert so that a connection
    # problem is detected before either database is modified.
    cnx_g = pvdb.granted_table(config)
    cnx_pg = pvdb.pregranted_table(config)
    try:
        try:
            _insert_batches(cnx_g, pairs_granted, 'adding granted')
        finally:
            cnx_g.close()
        _insert_batches(cnx_pg, pairs_pregranted, 'adding pregranted')
    finally:
        cnx_pg.close()
def create_uuid_map(config):
    """Collect the ``rawlocation`` ids present in each database.

    Args:
        config: Passed to the ``pvdb`` connection factories.

    Returns:
        A pair of sets ``(granted_uuids, pgranted_uuids)`` holding the ``id``
        column of ``rawlocation`` read through the granted and the pregranted
        connection respectively.
    """
    granted_cnx = pvdb.granted_table(config)
    pregranted_cnx = pvdb.pregranted_table(config)
    id_query = "SELECT id FROM rawlocation;"

    granted_cur = granted_cnx.cursor()
    granted_cur.execute(id_query)
    # Each row is a one-element sequence; keep just the id.
    granted_set = {row[0] for row in tqdm(granted_cur, 'granted uuids')}

    pregranted_cur = pregranted_cnx.cursor()
    pregranted_cur.execute(id_query)
    pregranted_set = {row[0] for row in tqdm(pregranted_cur, 'pregranted uuids')}
    return granted_set, pregranted_set
def build_granted(config):
    """Load patent titles keyed by patent id.

    Args:
        config: Passed to ``pvdb.granted_table``.

    Returns:
        A dict from the string form of ``patent.id`` to ``patent.title``;
        empty when ``pvdb.granted_table`` returns ``None``.
    """
    titles = {}
    connection = pvdb.granted_table(config)
    if connection is None:
        return titles
    db_cursor = connection.cursor()
    db_cursor.execute("SELECT id,title FROM patent;")
    processed = 0
    for processed, record in enumerate(db_cursor, 1):
        titles['%s' % record[0]] = record[1]
        logging.log_every_n(logging.INFO,
                            'Processed %s grant records - %s features', 10000,
                            processed, len(titles))
    # Final summary, emitted regardless of the periodic logging above.
    logging.log(logging.INFO, 'Processed %s grant records - %s features',
                processed, len(titles))
    return titles
def build_granted(config):
    """Group granted inventor uuids into canopies.

    Each ``rawinventor`` row is wrapped in an ``InventorMention`` (missing
    first/last names become empty strings) and filed under the key returned
    by ``first_letter_last_name`` for that mention.

    Args:
        config: Passed to ``pvdb.granted_table``.

    Returns:
        A ``defaultdict(list)`` from canopy key to uuids; empty when
        ``pvdb.granted_table`` returns ``None``.
    """
    uuids_by_canopy = collections.defaultdict(list)

    connection = pvdb.granted_table(config)
    # No connection is returned when no granted table has been specified.
    if connection is None:
        return uuids_by_canopy
    db_cursor = connection.cursor()
    db_cursor.execute("SELECT uuid, name_first, name_last FROM rawinventor;")
    processed = 0
    for processed, (mention_uuid, first, last) in enumerate(db_cursor, 1):
        mention = InventorMention(mention_uuid, '0', '', first or '',
                                  last or '', '', '', '')
        canopy_key = first_letter_last_name(mention)
        uuids_by_canopy[canopy_key].append(mention_uuid)
        logging.log_every_n(logging.INFO,
                            'Processed %s granted records - %s canopies',
                            10000, processed, len(uuids_by_canopy))
    # Final summary, emitted regardless of the periodic logging above.
    logging.log(logging.INFO, 'Processed %s granted records - %s canopies',
                processed, len(uuids_by_canopy))
    return uuids_by_canopy
Ejemplo n.º 18
0
def upload(config):
    """Upload inventor disambiguation results to the granted and pregranted databases.

    The ids known to each database are taken from the canopies of
    ``load_mysql.Loader.from_config(config)``. The tab-separated file at
    ``config['INVENTOR_UPLOAD']['input']`` (mention id, disambiguated id) is
    read once; each row goes to the pregranted or the granted batch according
    to its id (pregranted is tested first), and rows with an unknown id are
    skipped. After the inserts are committed, ``uuid`` is made the primary key
    of each target table.

    Args:
        config: Configuration mapping; also handed to the loader and the
            ``pvdb`` connection factories.
    """
    loader = load_mysql.Loader.from_config(config)
    pregranted_ids = {
        uuid for canopy in loader.pregranted_canopies.values() for uuid in canopy}
    granted_ids = {
        uuid for canopy in loader.granted_canopies.values() for uuid in canopy}

    pairs_pregranted = []
    pairs_granted = []
    # Single pass: the file used to be read twice, the first time only to
    # fill a dict that was never used.
    with open(config['INVENTOR_UPLOAD']['input'], 'r') as fin:
        for line in fin:
            splt = line.strip().split('\t')
            if len(splt) != 2:
                print('error %s' % str(splt))
                if len(splt) < 2:
                    # No disambiguated id to upload; indexing splt[1] would
                    # raise IndexError for a known id.
                    continue
            if splt[0] in pregranted_ids:
                pairs_pregranted.append((splt[0], splt[1]))
            elif splt[0] in granted_ids:
                pairs_granted.append((splt[0], splt[1]))

    batch_size = 100000

    def _load_table(cnx, table, pairs, description):
        """Insert ``pairs`` into ``table`` in batches, commit, then add the primary key."""
        cursor = cnx.cursor()
        starts = range(0, len(pairs), batch_size)
        for start in tqdm(starts, description, total=len(starts)):
            # NOTE(review): values are interpolated straight into the
            # statement; a value containing a double quote will break it.
            sql = ("INSERT INTO " + table + " (uuid, disambiguated_id) VALUES "
                   + ', '.join('("%s", "%s")' % pair
                               for pair in pairs[start:start + batch_size]))
            cursor.execute(sql)
        cnx.commit()
        # The key is added after the bulk load rather than before it.
        cursor.execute('alter table ' + table + ' add primary key (uuid)')

    # Both connections are opened before any insert so that a connection
    # problem is detected before either database is modified.
    cnx_g = pvdb.granted_table(config)
    cnx_pg = pvdb.pregranted_table(config)
    try:
        try:
            _load_table(cnx_g, 'tmp_inventor_disambiguation_granted2',
                        pairs_granted, 'adding granted')
        finally:
            cnx_g.close()
        _load_table(cnx_pg, 'tmp_inventor_disambiguation_pregranted2',
                    pairs_pregranted, 'adding pregranted')
    finally:
        cnx_pg.close()