def run_disambiguation(doctype='grant'):
    # get all lawyers in database
    global blocks
    global lawyer_insert_statements
    global patentlawyer_insert_statements
    global update_statements
    session = alchemy.fetch_session(dbtype=doctype)
    if doctype == 'grant':
        lawyers = deque(session.query(RawLawyer))
    if doctype == 'application':
        lawyers = deque(session.query(App_RawLawyer))
    lawyer_alpha_blocks = clean_lawyers(lawyers)
    lawyer_insert_statements = []
    patentlawyer_insert_statements = []
    update_statements = []
    for letter in alphabet:
        print letter, datetime.now()
        # reset the per-letter state before blocking and disambiguating
        blocks = defaultdict(list)
        lawyer_insert_statements = []
        patentlawyer_insert_statements = []
        update_statements = []
        letterblock = [x for x in lawyer_alpha_blocks if x.lower().startswith(letter)]
        create_jw_blocks(letterblock)
        create_lawyer_table(session)
        print len(lawyer_insert_statements)
        print len(update_statements)
        # commit in batches of 20000; pass doctype through rather than the
        # original hardcoded 'grant', so application runs write to the right database
        bulk_commit_inserts(lawyer_insert_statements, Lawyer.__table__, alchemy.is_mysql(), 20000, doctype)
        bulk_commit_inserts(patentlawyer_insert_statements, patentlawyer, alchemy.is_mysql(), 20000)
        bulk_commit_updates('lawyer_id', update_statements, RawLawyer.__table__, alchemy.is_mysql(), 20000)
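# bulk_commit_inserts and bulk_commit_updates are helpers defined elsewhere in
# this repo; the sketch below is a hypothetical reconstruction of the batching
# pattern the calls above imply (chunks of 20000 rows, committed per chunk),
# not the actual implementation.
def bulk_commit_inserts_sketch(insert_statements, table, is_mysql, commit_frequency=20000, dbtype='grant'):
    # is_mysql is accepted for signature parity; dialect-specific tweaks omitted
    session = alchemy.fetch_session(dbtype=dbtype)
    for start in range(0, len(insert_statements), commit_frequency):
        batch = insert_statements[start:start + commit_frequency]
        # executemany-style insert: one statement, many parameter dicts,
        # committed per batch so a failure does not roll back the whole run
        session.execute(table.insert(), batch)
        session.commit()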
def run_disambiguation(doctype='grant'):
    # get all assignees in database
    global blocks
    global assignee_insert_statements
    global patentassignee_insert_statements
    global update_statements
    session = alchemy.fetch_session(dbtype=doctype)
    if doctype == 'grant':
        assignees = deque(session.query(RawAssignee))
    if doctype == 'application':
        assignees = deque(session.query(App_RawAssignee))
    assignee_alpha_blocks = clean_assignees(assignees)
    # start from a clean slate before re-disambiguating
    session.execute('truncate assignee; truncate patent_assignee;')
    session.commit()
    for letter in alphabet:
        print letter, datetime.now()
        blocks = defaultdict(list)
        assignee_insert_statements = []
        patentassignee_insert_statements = []
        update_statements = []
        letterblock = [x for x in assignee_alpha_blocks if x.lower().startswith(letter)]
        create_jw_blocks(letterblock)
        create_assignee_table(session)
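# create_jw_blocks is defined elsewhere; a minimal sketch of the blocking idea
# it presumably implements, assuming names are grouped when their Jaro-Winkler
# similarity to a block's first member clears a threshold. jellyfish provides
# jaro_winkler in the versions this Python 2 codebase targets.
from collections import defaultdict
import jellyfish

JW_THRESHOLD = 0.9  # assumed cutoff, not taken from this repo

def create_jw_blocks_sketch(names):
    block_keys = []
    sketch_blocks = defaultdict(list)
    for name in names:
        for key in block_keys:
            if jellyfish.jaro_winkler(key, name) >= JW_THRESHOLD:
                sketch_blocks[key].append(name)
                break
        else:  # no existing block is close enough; start a new one
            block_keys.append(name)
            sketch_blocks[name].append(name)
    return sketch_blocks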
if __name__ == '__main__':
    if len(sys.argv) < 2:
        print "Need doctype"
        sys.exit(1)  # exit nonzero on missing argument
    elif len(sys.argv) < 3:
        doctype = sys.argv[1]
        print('Running ' + doctype)
        run_disambiguation(doctype)
    else:
        doctype = sys.argv[1]
        letter = sys.argv[2]
        session = alchemy.fetch_session(dbtype=doctype)
        print('Running ' + letter + ' ' + doctype)
        run_letter(letter, session, doctype)
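# run_letter is not shown in this section; a hypothetical sketch of what the
# single-letter entry point dispatched above likely does, mirroring one
# iteration of the loop in run_disambiguation.
def run_letter_sketch(letter, session, doctype='grant'):
    global blocks
    blocks = defaultdict(list)
    if doctype == 'grant':
        lawyers = deque(session.query(RawLawyer))
    else:
        lawyers = deque(session.query(App_RawLawyer))
    letterblock = [x for x in clean_lawyers(lawyers) if x.lower().startswith(letter)]
    create_jw_blocks(letterblock)
    create_lawyer_table(session)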
def main(limit=None, offset=0, minimum_match_value=0.8, doctype='grant'):
    alchemy_session = alchemy.fetch_session(dbtype=doctype)
    t = datetime.datetime.now()
    print "geocoding started", doctype, t
    #Construct a list of all addresses which Google was capable of identifying
    #Making this now allows it to be referenced quickly later
    valid_input_addresses = construct_valid_input_addresses()
    #Get all of the raw locations in alchemy.db that were parsed from XML
    if doctype == 'grant':
        raw_parsed_locations = alchemy_session.query(alchemy.schema.RawLocation).limit(limit).offset(offset)
    elif doctype == 'application':
        raw_parsed_locations = alchemy_session.query(alchemy.schema.App_RawLocation).limit(limit).offset(offset)
    raw_parsed_locations_count = raw_parsed_locations.count()
    #If there are no locations, there is no point in continuing
    if raw_parsed_locations_count == 0:
        return False
    print 'Constructed list of all parsed locations containing', raw_parsed_locations_count, 'items'
    """
    grouped_locations will contain a list of dicts. Each dict will contain three values:
    raw_location = Location object containing the original location found in the XML
    matching_location = RawGoogle object containing the disambiguated location
    grouping_id = ID constructed from the city, region, and country of the matching_location
    """
    identified_grouped_locations = []
    unidentified_grouped_locations = []
    for instance in raw_parsed_locations:
        #Convert the location into a string that matches the Google format
        parsed_raw_location = geoalchemy_util.concatenate_location(instance.city, instance.state, instance.country)
        cleaned_location = geoalchemy_util.clean_raw_location(parsed_raw_location)
        #If the cleaned location has a match in the raw_google database,
        #use that to classify it
        if input_address_exists(valid_input_addresses, cleaned_location):
            matching_location = geo_data_session.query(RawGoogle).filter(
                RawGoogle.input_address == cleaned_location).first()
            if matching_location:
                grouping_id = u"{0}|{1}".format(matching_location.latitude, matching_location.longitude)
                identified_grouped_locations.append({"raw_location": instance,
                                                     "matching_location": matching_location,
                                                     "grouping_id": grouping_id})
            else:
                print 'Cleaned location not matched', cleaned_location
                country = geoalchemy_util.get_country_from_cleaned(cleaned_location)
                unidentified_grouped_locations.append({"raw_location": instance,
                                                       "cleaned_location": cleaned_location,
                                                       "country": country})
        else:
            """
            If there is no match in the raw_google database, we leave the location alone
            TODO: analyze the location's edit distance to make minor adjustments to it
            such that it can be matched. Particularly good if we can combine the
            all_cities database with the list of valid input_address values in the
            raw_google database.
            """
            #Sort the locations by their country
            country = geoalchemy_util.get_country_from_cleaned(cleaned_location)
            unidentified_grouped_locations.append({"raw_location": instance,
                                                   "cleaned_location": cleaned_location,
                                                   "country": country})
        if (len(identified_grouped_locations) + len(unidentified_grouped_locations)) % 10000 == 0:
            print "Processed", len(identified_grouped_locations) + len(unidentified_grouped_locations), datetime.datetime.now()
    print "locations grouped", datetime.datetime.now() - t
    print 'count of identified locations:', len(identified_grouped_locations)
    t = datetime.datetime.now()
    alchemy_session.close()
    #We now have two lists of locations. First, consider the unmatched locations.
keyfunc = lambda x:x["country"] #Sort the list by the country unidentified_grouped_locations.sort(key=keyfunc) #Create an iterator that will access everything in the list with the same #country unidentified_grouped_locations_enum = enumerate(itertools.groupby(unidentified_grouped_locations, keyfunc)) #Identify the correct location for each entry by comparing to all_cities identify_missing_locations(unidentified_grouped_locations_enum, identified_grouped_locations, minimum_match_value, t) print 'new count of identified locations:', len(identified_grouped_locations) #We now have a list of all locations in the file, along with their #matching locations and the id used to group them #Perform a quickfix to correct state names geoalchemy_util.fix_state_abbreviations(identified_grouped_locations) #Sort the list by the grouping_id keyfunc = lambda x: x['grouping_id'] identified_grouped_locations.sort(key=keyfunc) #Create an iterator that will access everything in the list with the same #grouping_id identified_grouped_locations_enum = enumerate(itertools.groupby(identified_grouped_locations, keyfunc)) print "identified_grouped_locations sorted", datetime.datetime.now() - t t = datetime.datetime.now() alchemy_session = alchemy.fetch_session(dbtype=doctype) #Match the locations match_grouped_locations(identified_grouped_locations_enum, t, alchemy_session) print "Matches made!", datetime.datetime.now() - t if doctype == 'grant': unique_group_count = alchemy_session.query(expression.func.count(sqlalchemy.distinct(alchemy.schema.Location.id))).all() elif doctype == 'application': unique_group_count = alchemy_session.query(expression.func.count(sqlalchemy.distinct(alchemy.schema.App_Location.id))).all() print "%s groups formed from %s locations" % (unique_group_count, raw_parsed_locations_count) alchemy_session.close()
import sqlalchemy
import sqlalchemy.orm as orm
import sqlalchemy.ext.declarative as declarative
import sqlalchemy.sql.expression as expression
import geoalchemy_util
import itertools
import os
import sys
import datetime
import re
import alchemy

#The config file alchemy uses to store information
alchemy_config = alchemy.get_config()
#Used to query the database used for input and output
alchemy_session = alchemy.fetch_session()
#The path to the database which holds geolocation data
geo_data_dbpath = os.path.join(
    alchemy_config.get("location").get('path'),
    alchemy_config.get("location").get('database'))
geo_data_engine = sqlalchemy.create_engine('sqlite:///%s' % geo_data_dbpath)
geo_data_session_class = orm.sessionmaker(bind=geo_data_engine)
#Used to query the database that holds the data from Google,
#as well as a MaxMind database containing every city in the world
geo_data_session = geo_data_session_class()
base = declarative.declarative_base()

#Stores an address disambiguated by the Google API
class RawGoogle(base):
    __tablename__ = 'raw_google'
    id = sqlalchemy.Column("rowid", sqlalchemy.Integer, primary_key=True)
    input_address = sqlalchemy.Column(sqlalchemy.String)
    # Assumed columns: latitude and longitude are referenced when building
    # grouping_id in main(), so the table presumably stores them as well
    latitude = sqlalchemy.Column(sqlalchemy.Float)
    longitude = sqlalchemy.Column(sqlalchemy.Float)
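# AllCities is referenced in find_difficult_locations_from_file below but its
# mapping is not shown in this section; a hypothetical sketch consistent with
# that query (city and country columns over the MaxMind world-cities table;
# the table and column names are assumptions).
class AllCitiesSketch(base):
    __tablename__ = 'all_cities'
    id = sqlalchemy.Column("rowid", sqlalchemy.Integer, primary_key=True)
    city = sqlalchemy.Column(sqlalchemy.String)
    country = sqlalchemy.Column(sqlalchemy.String)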
        print 'Error: list of valid input addresses not constructed'
        return False

def find_difficult_locations_from_file(inputfilename, outputfilename):
    inputfile = open(inputfilename, 'r')
    outputfile = open(outputfilename, 'w+')
    t = datetime.datetime.now()
    all_japan_cities_query = geo_data_session.query(AllCities.city).filter(
        AllCities.country == 'JP').group_by(AllCities.city).all()
    all_japan_cities = []
    for row in all_japan_cities_query:
        all_japan_cities.append(row.city)
    print 'list of all_japan_cities created', datetime.datetime.now() - t
    for line in inputfile:
        line = line.decode('utf8')
        line = geoalchemy_util.remove_eol_pattern.sub('', line)
        if line.endswith(', JP') or line.endswith(', JA'):
            city = line.split(',')[0].strip()
            most_similar_city = geoalchemy_util.get_closest_match_leven(city, all_japan_cities, 0.8)
            if most_similar_city != '':
                outputfile.write('{0}|{1}\n'.format(city.encode('utf8'), most_similar_city.encode('utf8')))
    print datetime.datetime.now() - t

if __name__ == '__main__':
    doctype = 'grant'
    schema = alchemy.schema.RawLocation
    if len(sys.argv) > 1:
        doctype = sys.argv[1]
        schema = schema if doctype == 'grant' else alchemy.schema.App_RawLocation
    numlocs = alchemy.fetch_session(dbtype=doctype).query(schema).count()
    main(doctype=doctype)
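# geoalchemy_util.get_closest_match_leven is not shown in this section; below
# is a stand-in sketch using the standard library's difflib, which exposes a
# comparable best-match-above-cutoff helper. The real helper presumably uses
# Levenshtein distance, so treat this as an approximation of its behavior.
import difflib

def get_closest_match_sketch(city, candidates, cutoff=0.8):
    # Return the candidate most similar to `city`, or '' if nothing clears
    # the cutoff, matching the empty-string convention used above.
    matches = difflib.get_close_matches(city, candidates, n=1, cutoff=cutoff)
    return matches[0] if matches else ''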