def create_lawyer_table(session):
    """
    Given a list of lawyers and the redis key-value disambiguation,
    populates the lawyer table in the database
    """
    print 'Disambiguating lawyers...'
    if alchemy.is_mysql():
        session.execute('set foreign_key_checks = 0;')
        session.commit()
    i = 0
    for lawyer in blocks.iterkeys():
        ra_ids = (id_map[ra] for ra in blocks[lawyer])
        for block in ra_ids:
            i += 1
            rawlawyers = [lawyer_dict[ra_id] for ra_id in block]
            # commit (and log progress) every 20000 blocks
            if i % 20000 == 0:
                print i, datetime.now()
                lawyer_match(rawlawyers, session, commit=True)
            else:
                lawyer_match(rawlawyers, session, commit=False)
    # queue the bulk inserts/updates as celery tasks, then block on .get()
    # until all three have finished before committing the session
    t1 = bulk_commit_inserts.delay(lawyer_insert_statements, Lawyer.__table__, alchemy.is_mysql(), 20000)
    t2 = bulk_commit_inserts.delay(patentlawyer_insert_statements, patentlawyer, alchemy.is_mysql(), 20000)
    t3 = bulk_commit_updates.delay('lawyer_id', update_statements, RawLawyer.__table__, alchemy.is_mysql(), 20000)
    t1.get()
    t2.get()
    t3.get()
    session.commit()
    print i, datetime.now()
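bulk_commit_inserts is defined elsewhere in the repo; as a minimal sketch of the batching pattern its call sites imply (signature taken from the calls above, internals assumed), it chunks the accumulated insert dicts and executes them as executemany-style core inserts:

# Minimal sketch, NOT the repo's implementation: batch a list of insert
# dicts into one table, committing every `commit_frequency` rows.
def bulk_commit_inserts_sketch(insert_statements, table, is_mysql, commit_frequency=20000):
    session = alchemy.fetch_session()  # assumed: repo's session factory, default db
    if is_mysql:
        session.execute('set foreign_key_checks = 0;')
    for start in xrange(0, len(insert_statements), commit_frequency):
        chunk = insert_statements[start:start + commit_frequency]
        # one executemany round trip per chunk
        session.execute(table.insert(), chunk)
        session.commit()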
def run_disambiguation(doctype='grant'):
    # get all lawyers in database
    global blocks
    global lawyer_insert_statements
    global patentlawyer_insert_statements
    global update_statements
    session = alchemy.fetch_session(dbtype=doctype)
    if doctype == 'grant':
        lawyers = deque(session.query(RawLawyer))
    if doctype == 'application':
        lawyers = deque(session.query(App_RawLawyer))
    lawyer_alpha_blocks = clean_lawyers(lawyers)
    lawyer_insert_statements = []
    patentlawyer_insert_statements = []
    update_statements = []
    for letter in alphabet:
        print letter, datetime.now()
        # reset the blocks and statement lists for each letter
        blocks = defaultdict(list)
        lawyer_insert_statements = []
        patentlawyer_insert_statements = []
        update_statements = []
        letterblock = [x for x in lawyer_alpha_blocks if x.lower().startswith(letter)]
        create_jw_blocks(letterblock)
        create_lawyer_table(session)
        print len(lawyer_insert_statements)
        print len(update_statements)
        bulk_commit_inserts(lawyer_insert_statements, Lawyer.__table__, alchemy.is_mysql(), 20000, 'grant')
        bulk_commit_inserts(patentlawyer_insert_statements, patentlawyer, alchemy.is_mysql(), 20000)
        bulk_commit_updates('lawyer_id', update_statements, RawLawyer.__table__, alchemy.is_mysql(), 20000)
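create_jw_blocks is not shown in this excerpt; a minimal sketch of the stack-based string blocking it suggests, using difflib.SequenceMatcher from the standard library as a stand-in for the Jaro-Winkler similarity the repo uses (the real helper and its threshold are assumptions here):

from difflib import SequenceMatcher
from collections import defaultdict

def create_jw_blocks_sketch(letterblock, threshold=0.9):
    # Treat the names as a stack: pop a seed, sweep the remainder, and pull
    # every name similar enough to the seed into its block (greedy single-link).
    # The real function populates the module-level `blocks` defaultdict instead.
    jw_blocks = defaultdict(list)
    stack = list(letterblock)
    while stack:
        seed = stack.pop()
        jw_blocks[seed].append(seed)
        keep = []
        for name in stack:
            if SequenceMatcher(None, seed, name).ratio() >= threshold:
                jw_blocks[seed].append(name)
            else:
                keep.append(name)
        stack = keep
    return jw_blocks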
def create_assignee_table(session):
    """
    Given a list of assignees and the redis key-value disambiguation,
    populates the Assignee table in the database
    """
    print 'Disambiguating assignees...'
    if alchemy.is_mysql():
        session.execute('set foreign_key_checks = 0;')
        session.commit()
    i = 0
    for assignee in blocks.iterkeys():
        ra_ids = (id_map[ra] for ra in blocks[assignee])
        for block in ra_ids:
            i += 1
            rawassignees = [assignee_dict[ra_id] for ra_id in block]
            # commit (and log progress) every 20000 blocks
            if i % 20000 == 0:
                print i, datetime.now()
                assignee_match(rawassignees, session, commit=True)
            else:
                assignee_match(rawassignees, session, commit=False)
    celery_commit_inserts(assignee_insert_statements, Assignee.__table__, alchemy.is_mysql(), 20000)
    celery_commit_inserts(patentassignee_insert_statements, patentassignee, alchemy.is_mysql(), 20000)
    celery_commit_updates('assignee_id', update_statements, RawAssignee.__table__, alchemy.is_mysql(), 20000)
    session.commit()
    print i, datetime.now()
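The celery_commit_inserts calls here, and the .delay()/.get() usage in create_lawyer_table above, imply the bulk helpers are registered as Celery tasks. A hedged sketch of that wrapping, reusing the batching sketch above (the app name, broker URL, and task body are all assumptions; the arguments must also be picklable for the broker):

from celery import Celery

celery_app = Celery('commits', broker='redis://localhost:6379/0')  # assumed broker

@celery_app.task
def celery_commit_inserts_sketch(insert_statements, table, is_mysql, commit_frequency):
    # Same batching as the synchronous helper, but runnable on a worker.
    # Callers fire it with .delay(...) and block on .get() when every queued
    # batch must be flushed before the session commits.
    bulk_commit_inserts_sketch(insert_statements, table, is_mysql, commit_frequency)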
def match_grouped_locations(identified_grouped_locations_enum, t, alchemy_session):
    if alchemy.is_mysql():
        alchemy_session.execute("set foreign_key_checks = 0; truncate location;")
        alchemy_session.commit()
    for i, item in identified_grouped_locations_enum:
        # grouped_locations_list = a list of every grouped location with the same grouping_id
        #   (note that a grouped_location is a dict, as described above)
        # grouping_id = the grouping_id of all items in the list
        grouping_id, grouped_locations_list = item
        # We need to get only the RawLocation objects back from the grouped_location dict.
        # match_group is the list of RawLocation objects which we call match on.
        match_group = []
        first_pass = True
        for grouped_location in grouped_locations_list:
            match_group.append(grouped_location["raw_location"])
            if first_pass:
                first_matching_location = grouped_location["matching_location"]
                first_pass = False  # only capture the first entry's match
        """
        default is a dict containing the default values of the parameters
        (id, city, region, country, latitude, longitude) for all locations
        that are part of the same group. Here we set the defaults to be the
        values for the first entry in the grouped_locations_list. In theory,
        all entries in the grouped_locations_list should have the same lat/long.
        """
        default = {"id": grouping_id,
                   "city": first_matching_location.city,
                   "state": first_matching_location.region,
                   "country": first_matching_location.country.upper(),
                   "latitude": first_matching_location.latitude,
                   "longitude": first_matching_location.longitude}
        # No need to run match() if no matching location was found.
        if grouping_id != "nolocationfound":
            run_geo_match(grouping_id, default, match_group, i, t, alchemy_session)
    if alchemy.is_mysql():
        alchemy_session.execute('truncate location; truncate location_assignee; truncate location_inventor;')
    else:
        alchemy_session.execute('delete from location;')
        alchemy_session.commit()
        alchemy_session.execute('delete from location_assignee;')
        alchemy_session.commit()
        alchemy_session.execute('delete from location_inventor;')
        alchemy_session.commit()
    if doctype == 'grant':
        bulk_commit_inserts(location_insert_statements, alchemy.schema.Location.__table__, alchemy.is_mysql(), commit_freq, 'grant')
        bulk_commit_updates('location_id', update_statements, alchemy.schema.RawLocation.__table__, alchemy.is_mysql(), commit_freq, 'grant')
    elif doctype == 'application':
        bulk_commit_inserts(location_insert_statements, alchemy.schema.App_Location.__table__, alchemy.is_mysql(), commit_freq, 'application')
        bulk_commit_updates('location_id', update_statements, alchemy.schema.App_RawLocation.__table__, alchemy.is_mysql(), commit_freq, 'application')
    alchemy_session.commit()
    session_generator = alchemy.session_generator(dbtype=doctype)
    session = session_generator()
    session.commit()
    print 'Committed!!!'
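For concreteness, here is what the `default` record built inside the loop looks like for a toy group whose first matched location is Austin, TX (all values invented for illustration):

# Hypothetical example of the `default` dict derived from a group's
# first matching_location:
default_example = {
    'id': 'austin|tx|us',    # the block's grouping_id
    'city': 'Austin',
    'state': 'TX',           # taken from matching_location.region
    'country': 'US',         # upper-cased above
    'latitude': 30.2672,
    'longitude': -97.7431,
}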
def run_disambiguation():
    """
    Runs disambiguation algorithm on grant and application assignees
    from the database indicated by lib/alchemy/config
    """
    # retrieve database connections and pull in all assignees from
    # both grant and application databases
    grtsesh = grantsessiongen()
    appsesh = appsessiongen()
    print "fetching raw assignees", datetime.now()
    rawassignees = list(grtsesh.query(RawAssignee))
    rawassignees.extend(list(appsesh.query(App_RawAssignee)))
    # clear the destination tables
    if alchemy.is_mysql():
        grtsesh.execute("truncate assignee; truncate patent_assignee;")
        appsesh.execute("truncate assignee; truncate application_assignee;")
    else:
        grtsesh.execute("delete from assignee; delete from patent_assignee;")
        # mirror the truncate branch: the application database links assignees
        # through application_assignee, not patent_assignee
        appsesh.execute("delete from assignee; delete from application_assignee;")
    print "cleaning ids", datetime.now()
    # uses the get_cleanid method to remove undesirable characters and
    # normalize to case and group by first letter
    for ra in rawassignees:
        uuid_to_object[ra.uuid] = ra
        cleanid = get_cleanid(ra)
        uuid_to_cleanid[ra.uuid] = cleanid
        if not cleanid:
            continue
        firstletter = cleanid[0]
        uuids_by_cleanidletter[firstletter].append(ra.uuid)
    print "disambiguating blocks", datetime.now()
    # disambiguates each of the letter blocks using
    # the list of assignees as a stack and only performing
    # jaro-winkler comparisons on the first item of each block
    allrecords = []
    for letter in alphabet:
        print "disambiguating", "({0})".format(letter), datetime.now()
        lettergroup = disambiguate_letter(letter)
        print "got", len(lettergroup), "records"
        print "creating disambiguated records", "({0})".format(letter), datetime.now()
        allrecords.extend(lettergroup.values())
    # create the attributes for the disambiguated assignee record from the
    # raw records placed into a block in the disambiguation phase
    res = map(create_disambiguated_record_for_block, allrecords)
    mid = itertools.izip(*res)
    grant_assignee_inserts = list(itertools.chain.from_iterable(mid.next()))
    app_assignee_inserts = list(itertools.chain.from_iterable(mid.next()))
    patentassignee_inserts = list(itertools.chain.from_iterable(mid.next()))
    applicationassignee_inserts = list(itertools.chain.from_iterable(mid.next()))
    grant_rawassignee_updates = list(itertools.chain.from_iterable(mid.next()))
    app_rawassignee_updates = list(itertools.chain.from_iterable(mid.next()))
    # write out the insert counts for each table into a text file
    with open("mid.txt", "wb") as f:
        f.write(str(len(grant_assignee_inserts)) + "\n")
        f.write(str(len(app_assignee_inserts)) + "\n")
        f.write(str(len(patentassignee_inserts)) + "\n")
        f.write(str(len(applicationassignee_inserts)) + "\n")
        f.write(str(len(grant_rawassignee_updates)) + "\n")
        f.write(str(len(app_rawassignee_updates)) + "\n")
    # insert disambiguated assignee records
    bulk_commit_inserts(grant_assignee_inserts, Assignee.__table__, alchemy.is_mysql(), 20000, "grant")
    bulk_commit_inserts(app_assignee_inserts, App_Assignee.__table__, alchemy.is_mysql(), 20000, "application")
    # insert patent/assignee link records
    bulk_commit_inserts(patentassignee_inserts, patentassignee, alchemy.is_mysql(), 20000, "grant")
    bulk_commit_inserts(applicationassignee_inserts, applicationassignee, alchemy.is_mysql(), 20000, "application")
    # update rawassignees with their disambiguated record
    bulk_commit_updates("assignee_id", grant_rawassignee_updates, RawAssignee.__table__, alchemy.is_mysql(), 20000, "grant")
    bulk_commit_updates("assignee_id", app_rawassignee_updates, App_RawAssignee.__table__, alchemy.is_mysql(), 20000, "application")
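The itertools.izip(*res) step above transposes a list of 6-tuples into six parallel streams, one per destination table; a tiny self-contained illustration of that idiom (Python 2, matching the code above):

import itertools

# Each create_disambiguated_record_for_block call returns a 6-tuple of lists.
res = [(['g1'], ['a1'], ['pa1'], ['aa1'], ['gu1'], ['au1']),
       (['g2'], ['a2'], ['pa2'], ['aa2'], ['gu2'], ['au2'])]
mid = itertools.izip(*res)  # transpose: one column of results per .next()
grant_inserts = list(itertools.chain.from_iterable(mid.next()))
print grant_inserts  # ['g1', 'g2']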
def match_grouped_locations(identified_grouped_locations_enum, t, alchemy_session):
    if alchemy.is_mysql():
        alchemy_session.execute("set foreign_key_checks = 0; truncate location;")
        alchemy_session.commit()
    for i, item in identified_grouped_locations_enum:
        # grouped_locations_list = a list of every grouped location with the same grouping_id
        #   (note that a grouped_location is a dict, as described above)
        # grouping_id = the grouping_id of all items in the list
        grouping_id, grouped_locations_list = item
        # We need to get only the RawLocation objects back from the grouped_location dict.
        # match_group is the list of RawLocation objects which we call match on.
        match_group = []
        first_pass = True
        for grouped_location in grouped_locations_list:
            match_group.append(grouped_location["raw_location"])
            if first_pass:
                first_matching_location = grouped_location["matching_location"]
                first_pass = False  # only capture the first entry's match
        """
        default is a dict containing the default values of the parameters
        (id, city, region, country, latitude, longitude) for all locations
        that are part of the same group. Here we set the defaults to be the
        values for the first entry in the grouped_locations_list. In theory,
        all entries in the grouped_locations_list should have the same lat/long.
        """
        default = {"id": grouping_id,
                   "city": first_matching_location.city,
                   "state": first_matching_location.region,
                   "country": first_matching_location.country,
                   "latitude": first_matching_location.latitude,
                   "longitude": first_matching_location.longitude}
        # No need to run match() if no matching location was found.
        if grouping_id != "nolocationfound":
            run_geo_match(grouping_id, default, match_group, i, t, alchemy_session)
    alchemy_session.execute('truncate location; truncate assignee_location; truncate inventor_location;')
    celery_commit_inserts(location_insert_statements, alchemy.schema.Location.__table__, alchemy.is_mysql(), commit_freq)
    celery_commit_updates('location_id', update_statements, alchemy.schema.RawLocation.__table__, alchemy.is_mysql(), commit_freq)
    alchemy_session.commit()
    session_generator = alchemy.session_generator()
    session = session_generator()
    # link each assignee to its disambiguated location via the raw tables
    res = session.execute('select location.id, assignee.id from assignee \
        left join rawassignee on rawassignee.assignee_id = assignee.id \
        right join rawlocation on rawlocation.id = rawassignee.rawlocation_id \
        right join location on location.id = rawlocation.location_id;')
    assigneelocation = pd.DataFrame.from_records(res.fetchall())
    assigneelocation = assigneelocation[assigneelocation[0].notnull()]
    assigneelocation = assigneelocation[assigneelocation[1].notnull()]
    assigneelocation.columns = ['location_id', 'assignee_id']
    locationassignee_inserts = [row[1].to_dict() for row in assigneelocation.iterrows()]
    celery_commit_inserts(locationassignee_inserts, alchemy.schema.locationassignee, alchemy.is_mysql(), 20000)
    # same linkage for inventors
    res = session.execute('select location.id, inventor.id from inventor \
        left join rawinventor on rawinventor.inventor_id = inventor.id \
        right join rawlocation on rawlocation.id = rawinventor.rawlocation_id \
        right join location on location.id = rawlocation.location_id;')
    inventorlocation = pd.DataFrame.from_records(res.fetchall())
    inventorlocation = inventorlocation[inventorlocation[0].notnull()]
    inventorlocation = inventorlocation[inventorlocation[1].notnull()]
    inventorlocation.columns = ['location_id', 'inventor_id']
    locationinventor_inserts = [row[1].to_dict() for row in inventorlocation.iterrows()]
    celery_commit_inserts(locationinventor_inserts, alchemy.schema.locationinventor, alchemy.is_mysql(), 20000)
    session.commit()
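The DataFrame plumbing above (drop NULL join results, name the two columns, emit one insert dict per row) in isolation, on toy rows:

import pandas as pd

rows = [('loc1', 'asg1'), ('loc2', None), (None, 'asg3')]
df = pd.DataFrame.from_records(rows)
df = df[df[0].notnull()]   # drop rows with no location
df = df[df[1].notnull()]   # drop rows with no assignee
df.columns = ['location_id', 'assignee_id']
inserts = [row[1].to_dict() for row in df.iterrows()]
print inserts  # [{'location_id': 'loc1', 'assignee_id': 'asg1'}]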
    # update statements for rawassignee tables
    for ra in ra_objs:
        if isgrant(ra):
            grant_rawassignee_updates.append({'pk': ra.uuid, 'update': param['id']})
        else:
            app_rawassignee_updates.append({'pk': ra.uuid, 'update': param['id']})
    return grant_assignee_inserts, app_assignee_inserts, patentassignee_inserts, applicationassignee_inserts, grant_rawassignee_updates, app_rawassignee_updates


grtsesh = grantsessiongen()
#appsesh = appsessiongen()
print 'fetching raw assignees', datetime.now()
rawassignees = list(grtsesh.query(RawAssignee))
print len(rawassignees)
#rawassignees.extend(list(appsesh.query(App_RawAssignee)))

# clear the destination tables
if alchemy.is_mysql():
    grtsesh.execute('truncate assignee; truncate patent_assignee;')
    #appsesh.execute('truncate assignee; truncate application_assignee;')
else:
    grtsesh.execute('delete from assignee; delete from patent_assignee;')
    # the application session is disabled above, so skip it here as well
    # (calling appsesh would raise a NameError in this grant-only variant)
    #appsesh.execute('delete from assignee; delete from application_assignee;')

print 'cleaning ids', datetime.now()
counter = 0
for ra in rawassignees:
    counter += 1
    if counter % 1000000 == 0:
        print counter
    uuid_to_object[ra.uuid] = ra
    cleanid = get_cleanid(ra)
    uuid_to_cleanid[ra.uuid] = cleanid
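get_cleanid itself isn't shown in this excerpt; a hedged sketch of the normalization the surrounding comments describe ("remove undesirable characters and normalize case"), with the attribute fallbacks and exact character set assumed:

import re

def get_cleanid_sketch(rawassignee):
    # Assumed normalization: prefer the organization name, fall back to
    # "first last", lower-case, and strip everything but letters/digits/spaces.
    name = rawassignee.organization or \
        ('%s %s' % (rawassignee.name_first or '', rawassignee.name_last or ''))
    name = name.strip().lower()
    return re.sub(r'[^a-z0-9 ]', '', name)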