Example #1
#Imports needed by this snippet; project helpers (clean_lawyers,
#create_jw_blocks, create_lawyer_table, bulk_commit_inserts,
#bulk_commit_updates) and the schema classes come from the surrounding
#project and are not shown here. alphabet is assumed to be the lowercase
#ASCII letters.
from collections import defaultdict, deque
from datetime import datetime
from string import ascii_lowercase as alphabet

import alchemy


def run_disambiguation(doctype='grant'):
    # get all lawyers in database
    global blocks
    global lawyer_insert_statements
    global patentlawyer_insert_statements
    global update_statements
    session = alchemy.fetch_session(dbtype=doctype)
    if doctype == 'grant':
        lawyers = deque(session.query(RawLawyer))
    if doctype == 'application':
        lawyers = deque(session.query(App_RawLawyer))
    lawyer_alpha_blocks = clean_lawyers(lawyers)
    lawyer_insert_statements = []
    patentlawyer_insert_statements = []
    update_statements = []
    for letter in alphabet:
        print letter, datetime.now()
        blocks = defaultdict(list)
        lawyer_insert_statements = []
        patentlawyer_insert_statements = []
        update_statements = []
        letterblock = [
            x for x in lawyer_alpha_blocks if x.lower().startswith(letter)
        ]
        create_jw_blocks(letterblock)
        create_lawyer_table(session)
    print len(lawyer_insert_statements)
    print len(update_statements)
    bulk_commit_inserts(lawyer_insert_statements, Lawyer.__table__,
                        alchemy.is_mysql(), 20000, 'grant')
    bulk_commit_inserts(patentlawyer_insert_statements, patentlawyer,
                        alchemy.is_mysql(), 20000)
    bulk_commit_updates('lawyer_id', update_statements, RawLawyer.__table__,
                        alchemy.is_mysql(), 20000)
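
#bulk_commit_inserts and bulk_commit_updates are project helpers that are not
#shown in this example. A minimal, self-contained sketch of the batching idea
#the calls above rely on (the helper name and the chunk size mirror the call
#sites; the body is an assumption, not the project's implementation):
import sqlalchemy

def bulk_commit_inserts_sketch(insert_statements, table, engine, chunk_size=20000):
    #Insert the accumulated row dictionaries in fixed-size chunks so one huge
    #transaction is avoided; each chunk is executed as a single executemany.
    for start in range(0, len(insert_statements), chunk_size):
        chunk = insert_statements[start:start + chunk_size]
        if chunk:
            engine.execute(table.insert(), chunk)

#Example usage against a throwaway in-memory table:
#  metadata = sqlalchemy.MetaData()
#  lawyer = sqlalchemy.Table('lawyer', metadata,
#                            sqlalchemy.Column('id', sqlalchemy.String, primary_key=True))
#  engine = sqlalchemy.create_engine('sqlite://')
#  metadata.create_all(engine)
#  bulk_commit_inserts_sketch([{'id': 'abc'}], lawyer, engine, 2)
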
Example #2
def run_disambiguation(doctype='grant'):
    # get all assignees in database
    global blocks
    global assignee_insert_statements
    global patentassignee_insert_statements
    global update_statements
    session = alchemy.fetch_session(dbtype=doctype)
    if doctype == 'grant':
        assignees = deque(session.query(RawAssignee))
    if doctype == 'application':
        assignees = deque(session.query(App_RawAssignee))
    assignee_alpha_blocks = clean_assignees(assignees)
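    #Clear the disambiguated assignee tables so they can be rebuilt below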
    session.execute('truncate assignee; truncate patent_assignee;')
    session.commit()
    for letter in alphabet:
        print letter, datetime.now()
        blocks = defaultdict(list)
        assignee_insert_statements = []
        patentassignee_insert_statements = []
        update_statements = []
        letterblock = [
            x for x in assignee_alpha_blocks if x.lower().startswith(letter)
        ]
        create_jw_blocks(letterblock)
        create_assignee_table(session)
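
#create_jw_blocks is a project helper that is not shown in these examples. A
#minimal sketch of one way such similarity blocking can work: each name joins
#an existing block whose key it closely matches, or starts a new block. The
#real helper uses Jaro-Winkler similarity; difflib.SequenceMatcher stands in
#here only because it is in the standard library, and the 0.9 threshold is an
#assumption for illustration.
import difflib
from collections import defaultdict

def create_similarity_blocks_sketch(names, threshold=0.9):
    blocks = defaultdict(list)
    for name in names:
        for key in blocks:
            if difflib.SequenceMatcher(None, name, key).ratio() >= threshold:
                blocks[key].append(name)
                break
        else:
            #no existing block is similar enough; start a new one
            blocks[name].append(name)
    return blocks
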
Example #3
def run_disambiguation(doctype='grant'):
    # get all lawyers in database
    global blocks
    global lawyer_insert_statements
    global patentlawyer_insert_statements
    global update_statements
    session = alchemy.fetch_session(dbtype=doctype)
    if doctype == 'grant':
        lawyers = deque(session.query(RawLawyer))
    if doctype == 'application':
        lawyers = deque(session.query(App_RawLawyer))
    lawyer_alpha_blocks = clean_lawyers(lawyers)
    for letter in alphabet:
        print letter, datetime.now()
        blocks = defaultdict(list)
        lawyer_insert_statements = []
        patentlawyer_insert_statements = []
        update_statements = []
        letterblock = [x for x in lawyer_alpha_blocks if x.lower().startswith(letter)]
        create_jw_blocks(letterblock)
        create_lawyer_table(session)
Example #4
def run_disambiguation(doctype='grant'):
    # get all lawyers in database
    global blocks
    global lawyer_insert_statements
    global patentlawyer_insert_statements
    global update_statements
    session = alchemy.fetch_session(dbtype=doctype)
    if doctype == 'grant':
        lawyers = deque(session.query(RawLawyer))
    if doctype == 'application':
        lawyers = deque(session.query(App_RawLawyer))
    lawyer_alpha_blocks = clean_lawyers(lawyers)
    for letter in alphabet:
        print letter, datetime.now()
        blocks = defaultdict(list)
        lawyer_insert_statements = []
        patentlawyer_insert_statements = []
        update_statements = []
        letterblock = [
            x for x in lawyer_alpha_blocks if x.lower().startswith(letter)
        ]
        create_jw_blocks(letterblock)
        create_lawyer_table(session)


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print "Need doctype"
        sys.exit(0)
    elif len(sys.argv) < 3:
        doctype = sys.argv[1]
        print('Running ' + doctype)
        run_disambiguation(doctype)
    else:
        doctype = sys.argv[1]
        letter = sys.argv[2]
        session = alchemy.fetch_session(dbtype=doctype)
        print('Running ' + letter + ' ' + doctype)
        run_letter(letter, session, doctype)
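
#Typical invocation, assuming this example lives in a script such as
#lawyer_disambiguation.py (the filename is hypothetical):
#  python lawyer_disambiguation.py grant      # run every letter block
#  python lawyer_disambiguation.py grant a    # run a single letter block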
Example #5
import sqlalchemy
import sqlalchemy.orm as orm
import sqlalchemy.ext.declarative as declarative
import sqlalchemy.sql.expression as expression
import geoalchemy_util
import itertools
import os
import datetime
import re

import alchemy
#The config file alchemy uses to store information
alchemy_config = alchemy.get_config()
#Used to query the database used for input and output
alchemy_session = alchemy.fetch_session()
#The path to the database which holds geolocation data
geo_data_dbpath = os.path.join(
    alchemy_config.get("location").get('path'),
    alchemy_config.get("location").get('database'))
geo_data_engine = sqlalchemy.create_engine('sqlite:///%s' % geo_data_dbpath)
geo_data_session_class = orm.sessionmaker(bind=geo_data_engine)
#Used to query the database that holds the data from Google,
#as well as a MaxMind database containing every city in the world
geo_data_session = geo_data_session_class()
base = declarative.declarative_base()


#Stores an address disambiguated by the Google API
class RawGoogle(base):
    __tablename__ = 'raw_google'
    id = sqlalchemy.Column("rowid", sqlalchemy.Integer, primary_key=True)
    #Columns referenced later in this example; the column types are assumed
    input_address = sqlalchemy.Column(sqlalchemy.String)
    latitude = sqlalchemy.Column(sqlalchemy.Float)
    longitude = sqlalchemy.Column(sqlalchemy.Float)


def main(limit=None, offset=0, minimum_match_value=0.8, doctype='grant'):
    alchemy_session = alchemy.fetch_session(dbtype=doctype)
    t = datetime.datetime.now()
    print "geocoding started", doctype, t
    #Construct a list of all addresses which Google was capable of identifying
    #Making this now allows it to be referenced quickly later
    valid_input_addresses = construct_valid_input_addresses()
    #Get all of the raw locations in alchemy.db that were parsed from XML
    if doctype == 'grant':
        raw_parsed_locations = alchemy_session.query(alchemy.schema.RawLocation).limit(limit).offset(offset)
    elif doctype == 'application':
        raw_parsed_locations = alchemy_session.query(alchemy.schema.App_RawLocation).limit(limit).offset(offset)
    raw_parsed_locations_count = raw_parsed_locations.count()

    #If there are no locations, there is no point in continuing
    if raw_parsed_locations_count == 0:
        return False
    print 'Constructed list of all parsed locations containing', raw_parsed_locations_count, 'items'
    """
    grouped_loations will contain a list of dicts. Each dict will contain three values:
    raw_location = Location object containing the original location found in the XML
    matching_location = RawGoogle object containing the disambiguated location
    grouping_id = ID constructed from the city, region, and country of the matching_location
    """
    identified_grouped_locations = []
    unidentified_grouped_locations = []
    for instance in raw_parsed_locations:
        #Convert the location into a string that matches the Google format
        parsed_raw_location = geoalchemy_util.concatenate_location(instance.city, instance.state, instance.country)
        cleaned_location = geoalchemy_util.clean_raw_location(parsed_raw_location)
        #If the cleaned location has a match in the raw_google database,
        #we use that to classify it
        if input_address_exists(valid_input_addresses, cleaned_location):
            matching_location = geo_data_session.query(RawGoogle).filter(
                                     RawGoogle.input_address==cleaned_location).first()
            if matching_location:
                grouping_id = u"{0}|{1}".format(matching_location.latitude, matching_location.longitude)
                identified_grouped_locations.append({"raw_location": instance,
                                      "matching_location": matching_location,
                                      "grouping_id": grouping_id})
            else:
                print 'Cleaned location not matched', cleaned_location
                country = geoalchemy_util.get_country_from_cleaned(cleaned_location)
                unidentified_grouped_locations.append({"raw_location": instance,
                                                       "cleaned_location": cleaned_location,
                                                       "country": country})

        else:
            """
            If there is no match in the raw_google database, we leave the location alone
            TODO: analyze the location's edit distance to make minor adjustments to it
            such that it can be matched. Particularly good if we can combine the
            all_cities database with the list of valid input_address values in the
            raw_google database.
            """
            #Record each location's country so the list can be sorted by it below
            country = geoalchemy_util.get_country_from_cleaned(cleaned_location)
            unidentified_grouped_locations.append({"raw_location": instance,
                                                   "cleaned_location": cleaned_location,
                                                   "country": country})
        if ((len(identified_grouped_locations)+len(unidentified_grouped_locations))%10000 == 0):
            print "Processed", len(identified_grouped_locations)+len(unidentified_grouped_locations), datetime.datetime.now()
    print "locations grouped", datetime.datetime.now() - t
    print 'count of identified locations:', len(identified_grouped_locations)
    t = datetime.datetime.now()
    alchemy_session.close()


    #We now have two lists of locations. First, consider the unmatched locations.
    keyfunc = lambda x: x["country"]
    #Sort the list by the country
    unidentified_grouped_locations.sort(key=keyfunc)
    #Create an iterator that will access everything in the list with the same
    #country
    unidentified_grouped_locations_enum = enumerate(itertools.groupby(unidentified_grouped_locations, keyfunc))
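    #Note: itertools.groupby only groups *adjacent* items that share a key, so
    #the sort by the same keyfunc just above is what makes this grouping valid.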
    #Identify the correct location for each entry by comparing to all_cities
    identify_missing_locations(unidentified_grouped_locations_enum,
                               identified_grouped_locations,
                               minimum_match_value, t)
    print 'new count of identified locations:', len(identified_grouped_locations)

    #We now have a list of all locations in the file, along with their
    #matching locations and the id used to group them
    #Perform a quickfix to correct state names
    geoalchemy_util.fix_state_abbreviations(identified_grouped_locations)

    #Sort the list by the grouping_id
    keyfunc = lambda x: x['grouping_id']
    identified_grouped_locations.sort(key=keyfunc)
    #Create an iterator that will access everything in the list with the same
    #grouping_id
    identified_grouped_locations_enum = enumerate(itertools.groupby(identified_grouped_locations, keyfunc))
    print "identified_grouped_locations sorted", datetime.datetime.now() - t
    t = datetime.datetime.now()

    alchemy_session = alchemy.fetch_session(dbtype=doctype)

    #Match the locations
    match_grouped_locations(identified_grouped_locations_enum, t, alchemy_session)

    print "Matches made!", datetime.datetime.now() - t
    if doctype == 'grant':
        unique_group_count = alchemy_session.query(expression.func.count(sqlalchemy.distinct(alchemy.schema.Location.id))).scalar()
    elif doctype == 'application':
        unique_group_count = alchemy_session.query(expression.func.count(sqlalchemy.distinct(alchemy.schema.App_Location.id))).scalar()

    print "%s groups formed from %s locations" % (unique_group_count, raw_parsed_locations_count)
    alchemy_session.close()
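
#construct_valid_input_addresses and input_address_exists are project helpers
#that are not shown in this example. A minimal sketch of the idea the comments
#above describe, assuming the helper loads every known input_address into a
#set so each per-location check is an O(1) membership test instead of a query:
def construct_valid_input_addresses_sketch():
    return set(row.input_address for row in
               geo_data_session.query(RawGoogle.input_address))

def input_address_exists_sketch(valid_input_addresses, address):
    return address in valid_input_addresses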
Example #8
import sqlalchemy
import sqlalchemy.orm as orm
import sqlalchemy.ext.declarative as declarative
import sqlalchemy.sql.expression as expression
import geoalchemy_util
import itertools
import os
import datetime
import re

import alchemy
#The config file alchemy uses to store information
alchemy_config = alchemy.get_config()
#Used to query the database used for input and output
alchemy_session = alchemy.fetch_session()
#The path to the database which holds geolocation data
geo_data_dbpath = os.path.join(
    alchemy_config.get("location").get('path'),
    alchemy_config.get("location").get('database'))
geo_data_engine = sqlalchemy.create_engine('sqlite:///%s' % geo_data_dbpath)
geo_data_session_class = orm.sessionmaker(bind=geo_data_engine)
#Used to query the database that holds the data from Google,
#as well as a MaxMind database containing every city in the world
geo_data_session = geo_data_session_class()
base = declarative.declarative_base()

#Stores an address disambiguated by the Google API
class RawGoogle(base):
    __tablename__ = 'raw_google'
    id = sqlalchemy.Column("rowid", sqlalchemy.Integer, primary_key=True)
    input_address = sqlalchemy.Column(sqlalchemy.String)
Example #9
        print 'Error: list of valid input addresses not constructed'
        return False

def find_difficult_locations_from_file(inputfilename, outputfilename):
    inputfile = open(inputfilename, 'r')
    outputfile = open(outputfilename, 'w+')
    t = datetime.datetime.now()
    all_japan_cities_query = geo_data_session.query(AllCities.city).filter(AllCities.country=='JP').group_by(AllCities.city).all()
    all_japan_cities = []
    for row in all_japan_cities_query:
        all_japan_cities.append(row.city)
    print 'list of all_japan_cities created', datetime.datetime.now()-t
    for line in inputfile:
        line = line.decode('utf8')
        line = geoalchemy_util.remove_eol_pattern.sub('', line)
        if line.endswith(', JP') or line.endswith(', JA'):
            city = line.split(',')[0].strip()
            most_similar_city = geoalchemy_util.get_closest_match_leven(city, all_japan_cities, 0.8)
            if most_similar_city != '':
                outputfile.write('{0}|{1}\n'.format(city.encode('utf8'), most_similar_city.encode('utf8')))
    print datetime.datetime.now()-t
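
#get_closest_match_leven lives in geoalchemy_util and is not shown here. A
#minimal sketch of the behavior the call above relies on: return the candidate
#most similar to the query string, or '' when nothing clears the threshold.
#difflib stands in for the Levenshtein-based ratio the real helper's name
#suggests.
import difflib

def get_closest_match_sketch(query, candidates, threshold):
    best, best_score = '', threshold
    for candidate in candidates:
        score = difflib.SequenceMatcher(None, query, candidate).ratio()
        if score >= best_score:
            best, best_score = candidate, score
    return best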

if __name__ == '__main__':
    doctype = 'grant'
    schema = alchemy.schema.RawLocation
    if len(sys.argv) > 1:
        doctype = sys.argv[1]
        schema = schema if doctype == 'grant' else alchemy.schema.App_RawLocation
    numlocs = alchemy.fetch_session(dbtype=doctype).query(schema).count()
    main(doctype=doctype)