Ejemplo n.º 1
0
def from_citeseerx(username, password, host, database, table):
    """
    Reads person names from the given table that are in the same format as names from ExPORTER,
    parses them, and writes the parts back out to the tables.

    Only rows whose GivenName and FamilyName are both NULL and that are linked to a
    CITESEERX_CLUSTER person attribute are processed.

    :param username: MySQL user name.
    :param password: MySQL password.
    :param host: MySQL server host.
    :param database: database to connect to.
    :param table: name of the person-name table to read from and update.
        NOTE: a table name cannot be bound as a query parameter, so it is
        string-interpolated into the SQL; `table` must come from a trusted source.
    """

    # Two connections: a streaming read cursor would be invalidated if the
    # interleaved UPDATEs were issued on the same connection.
    read_cnx = mysql.connector.connect(user=username, password=password, database=database,
                                       host=host)
    write_cnx = mysql.connector.connect(user=username, password=password, database=database,
                                        host=host)
    try:
        read_cursor = read_cnx.cursor()
        write_cursor = write_cnx.cursor()

        query_string = "select PersonNameId, FullName from {0} pn inner join PersonAttribute pa"\
            " on pa.PersonId=pn.PersonId and pa.RelationshipCode='CITESEERX_CLUSTER'"\
            " where GivenName is null and FamilyName is null;".format(table)
        read_cursor.execute(query_string)

        num_rows_read = 0
        print(datetime.datetime.now(), num_rows_read)

        for (PersonNameId, FullName) in read_cursor:
            name_components = name_parser.parse_name(name_parser.NameFormat.CITESEERX, FullName)
            # Only write back when the parser extracted at least one component.
            # NOTE(review): NickName participates in this test but is never
            # written by the UPDATE below — confirm that is intentional.
            if (name_components.Prefix is not None) or (name_components.GivenName is not None)\
                    or (name_components.OtherName is not None) or (name_components.FamilyName is not None)\
                    or (name_components.Suffix is not None) or (name_components.NickName is not None):
                query_string = "UPDATE {0} SET Prefix=%s, GivenName=%s, OtherName=%s, FamilyName=%s, Suffix=%s"\
                    " WHERE PersonNameId=%s".format(table)
                write_cursor.execute(query_string, (name_components.Prefix, name_components.GivenName,
                                                    name_components.OtherName, name_components.FamilyName,
                                                    name_components.Suffix, PersonNameId))
            num_rows_read += 1
            # Commit and log progress every 10,000 rows.
            if num_rows_read % 10000 == 0:
                print(datetime.datetime.now(), num_rows_read)
                write_cnx.commit()

        # Commit the final partial batch.
        write_cnx.commit()
        write_cursor.close()
        read_cursor.close()
    finally:
        # Release connections even if parsing or a query raised mid-loop
        # (the original leaked both connections on any exception).
        write_cnx.close()
        read_cnx.close()
    return
Ejemplo n.º 2
0
def other_with_comma(username, password, host, database, table):
    """
    Reads person names from the given table that are from an unknown source and that have one
    comma in them, as these may be of the same format as ones from ExPORTER; that is, they are
    of the format [familyname], [givenname] [othername].

    Only rows whose GivenName and FamilyName are both NULL and whose FullName contains a
    comma are processed.

    :param username: MySQL user name.
    :param password: MySQL password.
    :param host: MySQL server host.
    :param database: database to connect to.
    :param table: name of the person-name table to read from and update.
        NOTE: a table name cannot be bound as a query parameter, so it is
        string-interpolated into the SQL; `table` must come from a trusted source.
    """

    # Two connections: a streaming read cursor would be invalidated if the
    # interleaved UPDATEs were issued on the same connection.
    read_cnx = mysql.connector.connect(user=username, password=password, database=database, host=host)
    write_cnx = mysql.connector.connect(user=username, password=password, database=database, host=host)
    try:
        read_cursor = read_cnx.cursor()
        write_cursor = write_cnx.cursor()

        query_string = "select PersonNameId, FullName from {0} where GivenName is null and FamilyName is null" \
                       " and locate(',',FullName)>0".format(table)
        read_cursor.execute(query_string)

        num_rows_read = 0
        print(datetime.datetime.now(), num_rows_read)

        for (PersonNameId, FullName) in read_cursor:
            name_components = name_parser.parse_name(name_parser.NameFormat.EXPORTER, FullName)
            # Only write back when the parser extracted at least one component.
            # NOTE(review): NickName participates in this test but is never
            # written by the UPDATE below — confirm that is intentional.
            if (name_components.Prefix is not None) or (name_components.GivenName is not None)\
                    or (name_components.OtherName is not None) or (name_components.FamilyName is not None)\
                    or (name_components.Suffix is not None) or (name_components.NickName is not None):
                query_string = "UPDATE {0} SET Prefix=%s, GivenName=%s, OtherName=%s, FamilyName=%s, Suffix=%s"\
                    " WHERE PersonNameId=%s".format(table)
                write_cursor.execute(query_string, (name_components.Prefix, name_components.GivenName,
                                                    name_components.OtherName, name_components.FamilyName,
                                                    name_components.Suffix, PersonNameId))
            num_rows_read += 1
            # Commit and log progress every 10,000 rows.
            if num_rows_read % 10000 == 0:
                print(datetime.datetime.now(), num_rows_read)
                write_cnx.commit()

        # Commit the final partial batch.
        write_cnx.commit()
        write_cursor.close()
        read_cursor.close()
    finally:
        # Release connections even if parsing or a query raised mid-loop
        # (the original leaked both connections on any exception).
        write_cnx.close()
        read_cnx.close()
    return
Ejemplo n.º 3
0
def integrate(disambig_input_file, disambig_output_file):
    """
    We have two files: the input to the disambiguator:
        uuid, first name, middle name, last name, patent, mainclass, subclass, city, state, country, rawassignee, disambiguated assignee
    And the output of the disambiguator:
        uuid, unique inventor id

    The files will line up line by line, so we can easily get the collection of raw
    records that map to a single disambiguated record (D_REC).  For each of the raw records
    for a given disambiguated id (D_ID), we want to vote the most frequent values for
    each of the columns, and use those to populate the D_REC.


    just have to populate the fields of the disambiguated inventor object:
        inventor id, first name, last name, nationality (?)
    """
    # NOTE(review): this function is Python 2 code (print statements,
    # dict.iterkeys/iteritems) and uses the long-removed DataFrame.sort().
    # Load both tab-separated files; column 0 is the shared uuid join key.
    disambig_input = pd.read_csv(disambig_input_file,
                                 header=None,
                                 delimiter='\t',
                                 encoding='utf-8')
    disambig_output = pd.read_csv(disambig_output_file,
                                  header=None,
                                  delimiter='\t',
                                  encoding='utf-8')
    # Coerce the join key to str on both sides so the merge matches even when
    # one file's uuids were parsed as numbers.
    disambig_input[0] = disambig_input[0].apply(str)
    disambig_output[0] = disambig_output[0].apply(str)
    print 'finished loading csvs'
    merged = pd.merge(disambig_input, disambig_output, on=0)
    # If there are any prior_inventor_ids at all, then the merge will have 15 columns. Otherwise, if there are not any
    # prior_inventor_ids, then the merge will not have a column for it and there will be only 14 columns.
    if disambig_input.shape[1] > 14:
        merged.columns = [
            'rawinventor_uuid', 'isgrant', 'granted', 'name_first',
            'name_middle', 'name_last', 'patent_id', 'mainclass', 'subclass',
            'city', 'state', 'country', 'assignee', 'rawassignee',
            'prev_inventorid', 'current_inventorid'
        ]
    else:
        merged.columns = [
            'rawinventor_uuid', 'isgrant', 'granted', 'name_first',
            'name_middle', 'name_last', 'patent_id', 'mainclass', 'subclass',
            'city', 'state', 'country', 'assignee', 'rawassignee',
            'current_inventorid'
        ]
    print 'finished merging'
    # NOTE(review): this `apps` assignment is dead — it is overwritten below
    # after inventor_attributes is built.
    apps = merged[merged['isgrant'] == 0]

    # Keep only the columns the voting step needs.
    inventor_attributes = merged[[
        'isgrant', 'rawinventor_uuid', 'current_inventorid', 'name_first',
        'name_middle', 'name_last', 'patent_id'
    ]]  # rawinventor uuid, inventor id, first name, middle name, last name, patent_id
    # Drop rows with no raw uuid; blank-fill missing name parts so string
    # joins below don't hit NaN.
    # NOTE(review): these assignments write into a slice of `merged` —
    # pandas chained-assignment behavior; works here but emits warnings.
    inventor_attributes = inventor_attributes.dropna(
        subset=['rawinventor_uuid'], how='all')
    inventor_attributes['name_first'] = inventor_attributes[
        'name_first'].fillna('')
    inventor_attributes['name_middle'] = inventor_attributes[
        'name_middle'].fillna('')
    inventor_attributes['name_last'] = inventor_attributes['name_last'].fillna(
        '')

    # Split into granted patents vs. applications; isgrant is no longer needed.
    grants = inventor_attributes[inventor_attributes['isgrant'] == 1]
    apps = inventor_attributes[inventor_attributes['isgrant'] == 0]
    del grants['isgrant']
    del apps['isgrant']

    ####### DO GRANTS #######
    # Group raw rows by their disambiguated inventor id.
    rawinventors = defaultdict(list)
    inventor_inserts = []
    rawinventor_updates = []
    patentinventor_inserts = []
    for row in grants.iterrows():
        uuid = row[1]['current_inventorid']
        rawinventors[uuid].append(row[1])
        patentinventor_inserts.append({
            'inventor_id': uuid,
            'patent_id': row[1]['patent_id']
        })
    print 'finished associating ids'
    i = 0
    # Vote: for each disambiguated inventor, take the most frequent full name
    # among its raw records, re-parse it, and build the canonical insert row.
    for inventor_id in rawinventors.iterkeys():
        i += 1
        freq = defaultdict(Counter)
        param = {}
        rawuuids = []
        names = []
        for raw in rawinventors[inventor_id]:
            rawuuids.append(raw[0])
            name = ' '.join(x for x in (raw['name_first'], raw['name_middle'],
                                        raw['name_last']) if x)
            freq['name'][name] += 1
            for k, v in raw.iteritems():
                freq[k][v] += 1
        param['id'] = inventor_id
        name = freq['name'].most_common(1)[0][0]
        parsedNames = name_parser.parse_name(name_parser.NameFormat.CITESEERX,
                                             name)
        # first = prefix + given + other; last = family + suffix.
        name_first = ' '.join(
            filter(None, (parsedNames.Prefix, parsedNames.GivenName,
                          parsedNames.OtherName)))
        name_last = ' '.join(
            filter(None, (parsedNames.FamilyName, parsedNames.Suffix)))
        param['name_first'] = name_first
        param['name_last'] = name_last
        param['nationality'] = ''
        assert set(
            param.keys()) == {'id', 'name_first', 'name_last', 'nationality'}
        inventor_inserts.append(param)
        # Point every raw record at its canonical inventor id.
        for rawuuid in rawuuids:
            rawinventor_updates.append({'pk': rawuuid, 'update': param['id']})
        if i % 100000 == 0:
            print i, datetime.now(), rawuuids[0]
    print 'finished voting'
    # Replace the grant-side inventor tables wholesale, then bulk-load.
    session_generator = alchemy.session_generator(dbtype='grant')
    session = session_generator()
    if alchemy.is_mysql():
        session.execute('truncate inventor; truncate patent_inventor;')
    else:
        session.execute('delete from inventor; delete from patent_inventor;')

    from lib.tasks import bulk_commit_inserts, bulk_commit_updates
    bulk_commit_inserts(inventor_inserts, Inventor.__table__, is_mysql(),
                        20000, 'grant')
    bulk_commit_inserts(patentinventor_inserts, patentinventor, is_mysql(),
                        20000, 'grant')
    bulk_commit_updates('inventor_id', rawinventor_updates,
                        RawInventor.__table__, is_mysql(), 20000, 'grant')

    ###### DO APPLICATIONS ######

    # Same grouping/voting pipeline as grants, but over application rows and
    # the application-side schema.
    rawinventors = defaultdict(list)
    inventor_inserts = []
    rawinventor_updates = []
    applicationinventor_inserts = []
    for row in apps.iterrows():
        uuid = row[1]['current_inventorid']
        rawinventors[uuid].append(row[1])
        applicationinventor_inserts.append({
            'inventor_id':
            uuid,
            'application_id':
            row[1]['patent_id']
        })
    print 'finished associating ids'
    i = 0
    for inventor_id in rawinventors.iterkeys():
        i += 1
        freq = defaultdict(Counter)
        param = {}
        rawuuids = []
        names = []
        for raw in rawinventors[inventor_id]:
            rawuuids.append(raw[0])
            name = ' '.join(x for x in (raw['name_first'], raw['name_middle'],
                                        raw['name_last']) if x)
            freq['name'][name] += 1
            for k, v in raw.iteritems():
                freq[k][v] += 1
        param['id'] = inventor_id
        name = freq['name'].most_common(1)[0][0]
        parsedNames = name_parser.parse_name(name_parser.NameFormat.CITESEERX,
                                             name)
        name_first = ' '.join(
            filter(None, (parsedNames.Prefix, parsedNames.GivenName,
                          parsedNames.OtherName)))
        name_last = ' '.join(
            filter(None, (parsedNames.FamilyName, parsedNames.Suffix)))
        param['name_first'] = name_first
        param['name_last'] = name_last
        param['nationality'] = ''
        assert set(
            param.keys()) == {'id', 'name_first', 'name_last', 'nationality'}
        inventor_inserts.append(param)
        for rawuuid in rawuuids:
            rawinventor_updates.append({'pk': rawuuid, 'update': param['id']})
        if i % 100000 == 0:
            print i, datetime.now(), rawuuids[0]
    print 'finished voting'
    session_generator = alchemy.session_generator(dbtype='application')
    session = session_generator()
    if alchemy.is_mysql():
        session.execute('truncate inventor; truncate application_inventor;')
    else:
        session.execute(
            'delete from inventor; delete from application_inventor;')

    from lib.tasks import bulk_commit_inserts, bulk_commit_updates
    bulk_commit_inserts(inventor_inserts, App_Inventor.__table__, is_mysql(),
                        20000, 'application')
    bulk_commit_inserts(applicationinventor_inserts, applicationinventor,
                        is_mysql(), 20000, 'application')
    bulk_commit_updates('inventor_id',
                        rawinventor_updates, App_RawInventor.__table__,
                        is_mysql(), 20000, 'application')

    # Rebuild grant-side location<->assignee and location<->inventor link
    # tables from the raw tables.
    session_generator = alchemy.session_generator(dbtype='grant')
    session = session_generator()
    doctype = 'grant'
    session.execute('truncate location_assignee;')
    res = session.execute(
        'select distinct location_id, assignee_id from patent \
        left join rawassignee on rawassignee.patent_id = patent.id \
        left join rawlocation on rawlocation.id = rawassignee.rawlocation_id \
        where assignee_id != "" and location_id != "";')
    assigneelocation = pd.DataFrame.from_records(res.fetchall())
    assigneelocation.columns = ['location_id', 'assignee_id']
    # NOTE(review): DataFrame.sort() was removed in pandas 0.20; this code
    # requires an old pandas (sort_values in modern versions).
    assigneelocation = assigneelocation.sort('assignee_id')
    print assigneelocation.info()
    locationassignee_inserts = [
        row[1].to_dict() for row in assigneelocation.iterrows()
    ]
    bulk_commit_inserts(locationassignee_inserts,
                        alchemy.schema.locationassignee, alchemy.is_mysql(),
                        20000, 'grant')

    session.execute('truncate location_inventor;')
    res = session.execute(
        'select distinct location_id, inventor_id from patent \
        left join rawinventor on rawinventor.patent_id = patent.id \
        left join rawlocation on rawlocation.id = rawinventor.rawlocation_id \
        where inventor_id != "" and location_id != "";')
    inventorlocation = pd.DataFrame.from_records(res.fetchall())
    inventorlocation.columns = ['location_id', 'inventor_id']
    inventorlocation = inventorlocation.sort('inventor_id')
    print inventorlocation.info()
    locationinventor_inserts = [
        row[1].to_dict() for row in inventorlocation.iterrows()
    ]
    bulk_commit_inserts(locationinventor_inserts,
                        alchemy.schema.locationinventor, alchemy.is_mysql(),
                        20000, 'grant')

    # Same link-table rebuild for the application-side database.
    doctype = 'application'
    session_generator = alchemy.session_generator(dbtype='application')
    session = session_generator()
    session.execute('truncate location_assignee;')
    res = session.execute(
        'select distinct location_id, assignee_id from application \
        left join rawassignee on rawassignee.application_id = application.id \
        left join rawlocation on rawlocation.id = rawassignee.rawlocation_id \
        where assignee_id != "" and location_id != "";')
    assigneelocation = pd.DataFrame.from_records(res.fetchall())
    assigneelocation.columns = ['location_id', 'assignee_id']
    assigneelocation = assigneelocation.sort('assignee_id')
    print assigneelocation.info()
    locationassignee_inserts = [
        row[1].to_dict() for row in assigneelocation.iterrows()
    ]
    bulk_commit_inserts(locationassignee_inserts,
                        alchemy.schema.app_locationassignee,
                        alchemy.is_mysql(), 20000, 'application')

    session.execute('truncate location_inventor;')
    res = session.execute(
        'select distinct location_id, inventor_id from application \
        left join rawinventor on rawinventor.application_id = application.id \
        left join rawlocation on rawlocation.id = rawinventor.rawlocation_id \
        where inventor_id != "" and location_id != "";')
    inventorlocation = pd.DataFrame.from_records(res.fetchall())
    inventorlocation.columns = ['location_id', 'inventor_id']
    inventorlocation = inventorlocation.sort('inventor_id')
    print inventorlocation.info()
    locationinventor_inserts = [
        row[1].to_dict() for row in inventorlocation.iterrows()
    ]
    bulk_commit_inserts(locationinventor_inserts,
                        alchemy.schema.app_locationinventor,
                        alchemy.is_mysql(), 20000, 'application')
0
def integrate(disambig_input_file, disambig_output_file):
    """
    We have two files: the input to the disambiguator:
        uuid, first name, middle name, last name, patent, mainclass, subclass, city, state, country, rawassignee, disambiguated assignee
    And the output of the disambiguator:
        uuid, unique inventor id

    The files will line up line by line, so we can easily get the collection of raw
    records that map to a single disambiguated record (D_REC).  For each of the raw records
    for a given disambiguated id (D_ID), we want to vote the most frequent values for
    each of the columns, and use those to populate the D_REC.


    just have to populate the fields of the disambiguated inventor object:
        inventor id, first name, last name, nationality (?)
    """
    # NOTE(review): Python 2 code (print statements, iterkeys/iteritems) and
    # uses the long-removed DataFrame.sort().
    # Load both tab-separated files; column 0 is the shared uuid join key.
    disambig_input = pd.read_csv(disambig_input_file,header=None,delimiter='\t',encoding='utf-8')
    disambig_output = pd.read_csv(disambig_output_file,header=None,delimiter='\t',encoding='utf-8')
    # Coerce the join key to str on both sides so the merge matches even when
    # one file's uuids were parsed as numbers.
    disambig_input[0] = disambig_input[0].apply(str)
    disambig_output[0] = disambig_output[0].apply(str)
    print 'finished loading csvs'
    merged = pd.merge(disambig_input, disambig_output, on=0)
    # If there are any prior_inventor_ids at all, then the merge will have 15 columns. Otherwise, if there are not any
    # prior_inventor_ids, then the merge will not have a column for it and there will be only 14 columns.
    if disambig_input.shape[1] > 14:
        merged.columns = ['rawinventor_uuid','isgrant','granted','name_first','name_middle','name_last','patent_id','mainclass','subclass','city','state','country','assignee','rawassignee','prev_inventorid','current_inventorid']
    else:
        merged.columns = ['rawinventor_uuid','isgrant','granted','name_first','name_middle','name_last','patent_id','mainclass','subclass','city','state','country','assignee','rawassignee','current_inventorid']
    print 'finished merging'
    # NOTE(review): this `apps` assignment is dead — it is overwritten below
    # after inventor_attributes is built.
    apps = merged[merged['isgrant'] == 0]


    # Keep only the columns the voting step needs; drop rows with no raw uuid
    # and blank-fill missing name parts so string joins below don't hit NaN.
    # NOTE(review): the fillna assignments write into a slice of `merged`
    # (pandas chained assignment) — works here but emits warnings.
    inventor_attributes = merged[['isgrant','rawinventor_uuid','current_inventorid','name_first','name_middle','name_last','patent_id']] # rawinventor uuid, inventor id, first name, middle name, last name, patent_id
    inventor_attributes = inventor_attributes.dropna(subset=['rawinventor_uuid'],how='all')
    inventor_attributes['name_first'] = inventor_attributes['name_first'].fillna('')
    inventor_attributes['name_middle'] = inventor_attributes['name_middle'].fillna('')
    inventor_attributes['name_last'] = inventor_attributes['name_last'].fillna('')

    # Split into granted patents vs. applications; isgrant is no longer needed.
    grants = inventor_attributes[inventor_attributes['isgrant'] == 1]
    apps = inventor_attributes[inventor_attributes['isgrant'] == 0]
    del grants['isgrant']
    del apps['isgrant']

    ####### DO GRANTS #######
    # Group raw rows by their disambiguated inventor id.
    rawinventors = defaultdict(list)
    inventor_inserts = []
    rawinventor_updates = []
    patentinventor_inserts = []
    for row in grants.iterrows():
        uuid = row[1]['current_inventorid']
        rawinventors[uuid].append(row[1])
        patentinventor_inserts.append({'inventor_id': uuid, 'patent_id': row[1]['patent_id']})
    print 'finished associating ids'
    i = 0
    # Vote: for each disambiguated inventor, take the most frequent full name
    # among its raw records, re-parse it, and build the canonical insert row.
    for inventor_id in rawinventors.iterkeys():
        i += 1
        freq = defaultdict(Counter)
        param = {}
        rawuuids = []
        names = []
        for raw in rawinventors[inventor_id]:
            rawuuids.append(raw[0])
            name = ' '.join(x for x in (raw['name_first'], raw['name_middle'], raw['name_last']) if x)
            freq['name'][name] += 1
            for k,v in raw.iteritems():
                freq[k][v] += 1
        param['id'] = inventor_id
        name = freq['name'].most_common(1)[0][0]
        parsedNames = name_parser.parse_name(name_parser.NameFormat.CITESEERX, name)
        # first = prefix + given + other; last = family + suffix.
        name_first = ' '.join(filter(None, (parsedNames.Prefix, parsedNames.GivenName, parsedNames.OtherName)))
        name_last = ' '.join(filter(None, (parsedNames.FamilyName, parsedNames.Suffix)))
        param['name_first'] = name_first
        param['name_last'] = name_last
        param['nationality'] = ''
        assert set(param.keys()) == {'id','name_first','name_last','nationality'}
        inventor_inserts.append(param)
        # Point every raw record at its canonical inventor id.
        for rawuuid in rawuuids:
            rawinventor_updates.append({'pk': rawuuid, 'update': param['id']})
        if i % 100000 == 0:
            print i, datetime.now(), rawuuids[0]
    print 'finished voting'
    # Replace the grant-side inventor tables wholesale, then bulk-load.
    session_generator = alchemy.session_generator(dbtype='grant')
    session = session_generator()
    if alchemy.is_mysql():
        session.execute('truncate inventor; truncate patent_inventor;')
    else:
        session.execute('delete from inventor; delete from patent_inventor;')

    from lib.tasks import bulk_commit_inserts, bulk_commit_updates
    bulk_commit_inserts(inventor_inserts, Inventor.__table__, is_mysql(), 20000,'grant')
    bulk_commit_inserts(patentinventor_inserts, patentinventor, is_mysql(), 20000,'grant')
    bulk_commit_updates('inventor_id', rawinventor_updates, RawInventor.__table__, is_mysql(), 20000,'grant')


    ###### DO APPLICATIONS ######

    # Same grouping/voting pipeline as grants, but over application rows and
    # the application-side schema.
    rawinventors = defaultdict(list)
    inventor_inserts = []
    rawinventor_updates = []
    applicationinventor_inserts = []
    for row in apps.iterrows():
        uuid = row[1]['current_inventorid']
        rawinventors[uuid].append(row[1])
        applicationinventor_inserts.append({'inventor_id': uuid, 'application_id': row[1]['patent_id']})
    print 'finished associating ids'
    i = 0
    for inventor_id in rawinventors.iterkeys():
        i += 1
        freq = defaultdict(Counter)
        param = {}
        rawuuids = []
        names = []
        for raw in rawinventors[inventor_id]:
            rawuuids.append(raw[0])
            name = ' '.join(x for x in (raw['name_first'], raw['name_middle'], raw['name_last']) if x)
            freq['name'][name] += 1
            for k,v in raw.iteritems():
                freq[k][v] += 1
        param['id'] = inventor_id
        name = freq['name'].most_common(1)[0][0]
        parsedNames = name_parser.parse_name(name_parser.NameFormat.CITESEERX, name)
        name_first = ' '.join(filter(None, (parsedNames.Prefix, parsedNames.GivenName, parsedNames.OtherName)))
        name_last = ' '.join(filter(None, (parsedNames.FamilyName, parsedNames.Suffix)))
        param['name_first'] = name_first
        param['name_last'] = name_last
        param['nationality'] = ''
        assert set(param.keys()) == {'id','name_first','name_last','nationality'}
        inventor_inserts.append(param)
        for rawuuid in rawuuids:
            rawinventor_updates.append({'pk': rawuuid, 'update': param['id']})
        if i % 100000 == 0:
            print i, datetime.now(), rawuuids[0]
    print 'finished voting'
    session_generator = alchemy.session_generator(dbtype='application')
    session = session_generator()
    if alchemy.is_mysql():
        session.execute('truncate inventor; truncate application_inventor;')
    else:
        session.execute('delete from inventor; delete from application_inventor;')

    from lib.tasks import bulk_commit_inserts, bulk_commit_updates
    bulk_commit_inserts(inventor_inserts, App_Inventor.__table__, is_mysql(), 20000,'application')
    bulk_commit_inserts(applicationinventor_inserts, applicationinventor, is_mysql(), 20000,'application')
    bulk_commit_updates('inventor_id', rawinventor_updates, App_RawInventor.__table__, is_mysql(), 20000,'application')


    # Rebuild grant-side location<->assignee and location<->inventor link
    # tables from the raw tables.
    session_generator = alchemy.session_generator(dbtype='grant')
    session = session_generator()
    doctype = 'grant'
    session.execute('truncate location_assignee;')
    res = session.execute('select distinct location_id, assignee_id from patent \
        left join rawassignee on rawassignee.patent_id = patent.id \
        left join rawlocation on rawlocation.id = rawassignee.rawlocation_id \
        where assignee_id != "" and location_id != "";')
    assigneelocation = pd.DataFrame.from_records(res.fetchall())
    assigneelocation.columns = ['location_id','assignee_id']
    # NOTE(review): DataFrame.sort() was removed in pandas 0.20; this code
    # requires an old pandas (sort_values in modern versions).
    assigneelocation = assigneelocation.sort('assignee_id')
    print assigneelocation.info()
    locationassignee_inserts = [row[1].to_dict() for row in assigneelocation.iterrows()]
    bulk_commit_inserts(locationassignee_inserts, alchemy.schema.locationassignee, alchemy.is_mysql(), 20000, 'grant')

    session.execute('truncate location_inventor;')
    res = session.execute('select distinct location_id, inventor_id from patent \
        left join rawinventor on rawinventor.patent_id = patent.id \
        left join rawlocation on rawlocation.id = rawinventor.rawlocation_id \
        where inventor_id != "" and location_id != "";')
    inventorlocation = pd.DataFrame.from_records(res.fetchall())
    inventorlocation.columns = ['location_id','inventor_id']
    inventorlocation = inventorlocation.sort('inventor_id')
    print inventorlocation.info()
    locationinventor_inserts = [row[1].to_dict() for row in inventorlocation.iterrows()]
    bulk_commit_inserts(locationinventor_inserts, alchemy.schema.locationinventor, alchemy.is_mysql(), 20000, 'grant')

    # Same link-table rebuild for the application-side database.
    doctype = 'application'
    session_generator = alchemy.session_generator(dbtype='application')
    session = session_generator()
    session.execute('truncate location_assignee;')
    res = session.execute('select distinct location_id, assignee_id from application \
        left join rawassignee on rawassignee.application_id = application.id \
        left join rawlocation on rawlocation.id = rawassignee.rawlocation_id \
        where assignee_id != "" and location_id != "";')
    assigneelocation = pd.DataFrame.from_records(res.fetchall())
    assigneelocation.columns = ['location_id','assignee_id']
    assigneelocation = assigneelocation.sort('assignee_id')
    print assigneelocation.info()
    locationassignee_inserts = [row[1].to_dict() for row in assigneelocation.iterrows()]
    bulk_commit_inserts(locationassignee_inserts, alchemy.schema.app_locationassignee, alchemy.is_mysql(), 20000, 'application')

    session.execute('truncate location_inventor;')
    res = session.execute('select distinct location_id, inventor_id from application \
        left join rawinventor on rawinventor.application_id = application.id \
        left join rawlocation on rawlocation.id = rawinventor.rawlocation_id \
        where inventor_id != "" and location_id != "";')
    inventorlocation = pd.DataFrame.from_records(res.fetchall())
    inventorlocation.columns = ['location_id','inventor_id']
    inventorlocation = inventorlocation.sort('inventor_id')
    print inventorlocation.info()
    locationinventor_inserts = [row[1].to_dict() for row in inventorlocation.iterrows()]
    bulk_commit_inserts(locationinventor_inserts, alchemy.schema.app_locationinventor, alchemy.is_mysql(), 20000, 'application')
Ejemplo n.º 5
0
                         port=args.port)
read_cursor = read_cnx.cursor()
write_cnx = mySQL.connect(user=args.user, passwd=password, db=args.database, host=args.host,
                          port=args.port)
write_cursor = write_cnx.cursor()

# Parse out the program officer name in the Award table
query_string = "select AwardPKID, ProgramOfficer from NSF_Award a"\
    " where ProgramOfficer is not null and ProgramOfficer <> 'name not available';"
read_cursor.execute(query_string)

num_rows_read = 0
print(datetime.datetime.now(), num_rows_read)

for (AwardPKId, FullName) in read_cursor:
    name_components = name_parser.parse_name(name_parser.NameFormat.CITESEERX, FullName)
    if (name_components.Prefix is not None) or (name_components.GivenName is not None)\
            or (name_components.OtherName is not None) or (name_components.FamilyName is not None)\
            or (name_components.Suffix is not None) or (name_components.NickName is not None):
        query_string = "UPDATE NSF_Award SET UM_ProgramOfficer_Prefix=%s, UM_ProgramOfficer_GivenName=%s," \
                       " UM_ProgramOfficer_OtherName=%s, UM_ProgramOfficer_FamilyName=%s, UM_ProgramOfficer_Suffix=%s" \
                       " WHERE AwardPKId=%s;"
        write_cursor.execute(query_string, (name_components.Prefix, name_components.GivenName,
                                            name_components.OtherName, name_components.FamilyName,
                                            name_components.Suffix, AwardPKId))
    num_rows_read += 1
    if divmod(num_rows_read, 10000)[1] == 0:
        print(datetime.datetime.now(), num_rows_read)
        write_cnx.commit()

write_cnx.commit()
Ejemplo n.º 6
0
write_cnx = mySQL.connect(user=args.user, passwd=password, db=args.database, host=args.host,
                          port=args.port)
write_cursor = write_cnx.cursor()



# Parse out the program officer name in the Award table
query_string = "select AwardId, PDPIName, ProgramOfficerName from rg_award a"\
    " where PDPIName is not null or ProgramOfficerName is not null;"
read_cursor.execute(query_string)

num_rows_read = 0
print(datetime.datetime.now(), num_rows_read)

for (AwardId, PDPIFullName, POFullname) in read_cursor:
    pdpi_name_components = name_parser.parse_name(name_parser.NameFormat.CITESEERX, PDPIFullName)
    po_name_components = name_parser.parse_name(name_parser.NameFormat.CITESEERX, POFullname)
    if (pdpi_name_components.Prefix is not None) or (pdpi_name_components.GivenName is not None)\
            or (pdpi_name_components.OtherName is not None) or (pdpi_name_components.FamilyName is not None)\
            or (pdpi_name_components.Suffix is not None) or (pdpi_name_components.NickName is not None)\
            or (po_name_components.Prefix is not None) or (po_name_components.GivenName is not None)\
            or (po_name_components.OtherName is not None) or (po_name_components.FamilyName is not None)\
            or (po_name_components.Suffix is not None) or (po_name_components.NickName is not None):
        query_string = "UPDATE rg_award SET UM_PDPIName_Prefix=%s, UM_PDPIName_GivenName=%s," \
                       " UM_PDPIName_OtherName=%s, UM_PDPIName_FamilyName=%s, UM_PDPIName_Suffix=%s,"\
                       " UM_ProgramOfficerName_Prefix=%s, UM_ProgramOfficerName_GivenName=%s, "\
                       " UM_ProgramOfficerName_OtherName=%s, UM_ProgramOfficerName_FamilyName=%s, "\
                       " UM_ProgramOfficerName_Suffix=%s"\
                       " WHERE AwardId=%s;"
        write_cursor.execute(query_string, (pdpi_name_components.Prefix, pdpi_name_components.GivenName,
                                            pdpi_name_components.OtherName, pdpi_name_components.FamilyName,
Ejemplo n.º 7
0
def main(year, doctype):
    """Export one row per (patent, inventor) pair to disambiguator.csv.

    Streams patents (or applications) from the database, parses each raw
    inventor's name, resolves a best-effort location for each inventor, and
    appends one normalized CSV row per inventor to 'disambiguator.csv'.

    :param year: if truthy, restrict to records whose date falls in this year.
    :param doctype: 'grant' or 'application'; selects schema and class fields.
    """
    # NOTE(review): Python 2 code (print statements); relies on module-level
    # alchemy, name_parser, ROW, normalize_utf8, get_cleanid, codecs, datetime.
    # get patents as iterator to save memory
    # use subqueryload to get better performance by using less queries on the backend:
    # --> http://docs.sqlalchemy.org/en/latest/orm/tutorial.html#eager-loading
    session = alchemy.fetch_session(dbtype=doctype)
    schema = alchemy.schema.Patent
    if doctype == 'application':
        schema = alchemy.schema.App_Application
        if year:
            patents = (p for p in session.query(schema).filter(extract('year', schema.date) == year).options(subqueryload('rawinventors'), subqueryload('rawassignees'), subqueryload('classes')).yield_per(1))
        else:
            patents = (p for p in session.query(schema).options(subqueryload('rawinventors'), subqueryload('rawassignees'), subqueryload('classes')).yield_per(1))
    else:
        # Grants use 'current_classes' instead of 'classes'.
        if year:
            patents = (p for p in session.query(schema).filter(extract('year', schema.date) == year).options(subqueryload('rawinventors'), subqueryload('rawassignees'), subqueryload('current_classes')).yield_per(1))
        else:
            patents = (p for p in session.query(schema).options(subqueryload('rawinventors'), subqueryload('rawassignees'), subqueryload('current_classes')).yield_per(1))
    i = 0
    for patent in patents:
        i += 1
        if i % 100000 == 0:
          print i, datetime.now()
        # Broad try: any malformed patent (e.g. no rawinventors) is logged
        # and skipped rather than aborting the whole export.
        try:
          # create common dict for this patent
          # Primary inventor's location, used as a fallback for co-inventors
          # whose own rawlocation is missing or unresolved.
          primrawloc = patent.rawinventors[0].rawlocation
          if primrawloc:
            primloc = patent.rawinventors[0].rawlocation.location
          else:
            primloc = primrawloc
          if doctype == 'application':
            mainclass = patent.classes[0].mainclass_id if patent.classes else ''
            subclass = patent.classes[0].subclass_id if patent.classes else ''
          else:
            mainclass = patent.current_classes[0].mainclass_id if patent.current_classes else ''
            subclass = patent.current_classes[0].subclass_id if patent.current_classes else ''
          row = {'number': patent.id,
                 'mainclass': mainclass,
                 'subclass': subclass,
                 'ignore': 0,
                 }
          if doctype == 'grant':
            row['isgrant'] = 1
          elif doctype == 'application':
            row['isgrant'] = 0
            # Granted applications are flagged so downstream can skip them.
            if patent.granted == True:
              row['ignore'] = 1
          # Keep only the first tab-separated field of the assignee id.
          # NOTE(review): 'assignee' and 'rawassignee' are computed identically
          # here — confirm whether 'assignee' was meant to use a different source.
          row['assignee'] = get_cleanid(patent.rawassignees[0]) if patent.rawassignees else ''
          row['assignee'] = row['assignee'].split('\t')[0]
          row['rawassignee'] = get_cleanid(patent.rawassignees[0]) if patent.rawassignees else ''
          row['rawassignee'] = row['rawassignee'].split('\t')[0]
          # generate a row for each of the inventors on a patent
          for ri in patent.rawinventors:
              # Skip inventors with a blank first name.
              if not len(ri.name_first.strip()):
                  continue
              namedict = {'uuid': ri.uuid}
              # Re-parse the concatenated name to split it into canonical parts:
              # first = prefix + given; middle = other; last = family + suffix.
              parsedNames = name_parser.parse_name(name_parser.NameFormat.CITESEERX, ri.name_first + ' ' + ri.name_last)
              namedict['name_first'] = ' '.join(filter(None, (parsedNames.Prefix, parsedNames.GivenName)))
              namedict['name_middle'] = parsedNames.OtherName if parsedNames.OtherName is not None else ''
              namedict['name_last'] = ' '.join(filter(None, (parsedNames.FamilyName, parsedNames.Suffix)))

              # Location resolution: this inventor's resolved location, else
              # the primary inventor's; then fall back to raw/primary fields
              # for any component that is missing or contains '??'.
              rawloc = ri.rawlocation
              if rawloc:
                if rawloc.location:
                  loc = rawloc.location
                else:
                  loc = primloc
              else:
                loc = primloc
              namedict['state'] = loc.state if loc else ''# if loc else rawloc.state if rawloc else primloc.state if primloc else ''
              namedict['country'] = loc.country if loc else ''# if loc else rawloc.country if rawloc else primloc.country if primloc else ''
              namedict['city'] = loc.city if loc else ''# if loc else rawloc.city if rawloc else primloc.city if primloc else ''
              if '??' in namedict['state'] or len(namedict['state']) == 0:
                namedict['state'] = rawloc.state if rawloc else primloc.state if primloc else ''
              if '??' in namedict['country'] or len(namedict['country']) == 0:
                namedict['country'] = rawloc.country if rawloc else primloc.country if primloc else ''
              if '??' in namedict['city'] or len(namedict['city']) == 0:
                namedict['city'] = rawloc.city if rawloc else primloc.city if primloc else ''
              tmprow = row.copy()
              tmprow.update(namedict)
              newrow = normalize_utf8(ROW(tmprow))
              # Append one row per inventor; reopening per row is slow but
              # keeps the file consistent if the process dies mid-run.
              with codecs.open('disambiguator.csv', 'a', encoding='utf-8') as csv:
                  csv.write(newrow)
        except Exception as e:
          print e
          continue