Example #1
def create_lawyer_table(session):
    """
    Given a list of lawyers and the redis key-value disambiguation,
    populates the lawyer table in the database
    """
    print 'Disambiguating lawyers...'
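    # On MySQL, foreign key checks are disabled so the bulk inserts/updates
    # below can run without tripping constraints while the tables are rebuilt.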
    if alchemy.is_mysql():
        session.execute('set foreign_key_checks = 0;')
        session.commit()
    i = 0
    for lawyer in blocks.iterkeys():
        ra_ids = (id_map[ra] for ra in blocks[lawyer])
        for block in ra_ids:
            i += 1
            rawlawyers = [lawyer_dict[ra_id] for ra_id in block]
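            # Presumably lawyer_match accumulates insert/update statements;
            # forcing a commit every 20,000 blocks keeps the pending batch bounded.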
            if i % 20000 == 0:
                print i, datetime.now()
                lawyer_match(rawlawyers, session, commit=True)
            else:
                lawyer_match(rawlawyers, session, commit=False)
    t1 = bulk_commit_inserts(lawyer_insert_statements, Lawyer.__table__,
                             alchemy.is_mysql(), 20000, 'grant')
    t2 = bulk_commit_inserts(patentlawyer_insert_statements, patentlawyer,
                             alchemy.is_mysql(), 20000)
    t3 = bulk_commit_updates('lawyer_id', update_statements,
                             RawLawyer.__table__, alchemy.is_mysql(), 20000)
    # t1.get()
    # t2.get()
    # t3.get()
    # session.commit()
    print i, datetime.now()
Example #2
def run_disambiguation(doctype='grant'):
    # get all lawyers in database
    global blocks
    global lawyer_insert_statements
    global patentlawyer_insert_statements
    global update_statements
    session = alchemy.fetch_session(dbtype=doctype)
    if doctype == 'grant':
        lawyers = deque(session.query(RawLawyer))
    if doctype == 'application':
        lawyers = deque(session.query(App_RawLawyer))
    lawyer_alpha_blocks = clean_lawyers(lawyers)
    lawyer_insert_statements = []
    patentlawyer_insert_statements = []
    update_statements = []
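    # Process one first-letter block at a time: reset the module-level blocks
    # and statement lists, build Jaro-Winkler blocks for names starting with
    # that letter, then disambiguate and bulk-commit them in create_lawyer_table.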
    for letter in alphabet:
        print letter, datetime.now()
        blocks = defaultdict(list)
        lawyer_insert_statements = []
        patentlawyer_insert_statements = []
        update_statements = []
        letterblock = [
            x for x in lawyer_alpha_blocks if x.lower().startswith(letter)
        ]
        create_jw_blocks(letterblock)
        create_lawyer_table(session)
    print len(lawyer_insert_statements)
    print len(update_statements)
    bulk_commit_inserts(lawyer_insert_statements, Lawyer.__table__,
                        alchemy.is_mysql(), 20000, 'grant')
    bulk_commit_inserts(patentlawyer_insert_statements, patentlawyer,
                        alchemy.is_mysql(), 20000)
    bulk_commit_updates('lawyer_id', update_statements, RawLawyer.__table__,
                        alchemy.is_mysql(), 20000)
def create_lawyer_table(session):
    """
    Given a list of lawyers and the redis key-value disambiguation,
    populates the lawyer table in the database
    """
    print 'Disambiguating lawyers...'
    if alchemy.is_mysql():
        session.execute('set foreign_key_checks = 0;')
        session.commit()
    i = 0
    for lawyer in blocks.iterkeys():
        ra_ids = (id_map[ra] for ra in blocks[lawyer])
        for block in ra_ids:
            i += 1
            rawlawyers = [lawyer_dict[ra_id] for ra_id in block]
            if i % 20000 == 0:
                print i, datetime.now()
                lawyer_match(rawlawyers, session, commit=True)
            else:
                lawyer_match(rawlawyers, session, commit=False)
    t1 = bulk_commit_inserts(lawyer_insert_statements, Lawyer.__table__, alchemy.is_mysql(), 20000, 'grant')
    t2 = bulk_commit_inserts(patentlawyer_insert_statements, patentlawyer, alchemy.is_mysql(), 20000)
    t3 = bulk_commit_updates('lawyer_id', update_statements, RawLawyer.__table__, alchemy.is_mysql(), 20000)
    # t1.get()
    # t2.get()
    # t3.get()
    # session.commit()
    print i, datetime.now()
def match_grouped_locations(identified_grouped_locations_enum, t, alchemy_session):
    if alchemy.is_mysql():
        alchemy_session.execute("set foreign_key_checks = 0; truncate location;")
        alchemy_session.commit()
    for i, item in identified_grouped_locations_enum:
        #grouped_locations_list = a list of every grouped location with the same grouping_id
        # Note that a grouped_location is a dict, as described above
        #grouping_id = the grouping_id of all items in the list
        grouping_id, grouped_locations_list = item
        #We need to get only the RawLocation objects back from the grouped_location dict
        #match_group is the list of RawLocation objects which we call match on
        match_group = []
        first_pass = True
        for grouped_location in grouped_locations_list:
            match_group.append(grouped_location["raw_location"])
            if first_pass:
                first_matching_location = grouped_location["matching_location"]
                first_pass = False
        """
        default is a dict containing the default values of the parameters
        (id, city, region, country, latitude, longitude)
        for all locations that are part of the same group.
        Here we set the defaults to be the values for the first entry in the grouped_locations_list
        In theory, all entries in the grouped_locations_list should have the same lat/long.
        """
        default = {"id": grouping_id, "city":first_matching_location.city,
                   "state":first_matching_location.region,
                   "country": first_matching_location.country.upper(),
                   "latitude":first_matching_location.latitude,
                   "longitude":first_matching_location.longitude}
        #No need to run match() if no matching location was found.
        if grouping_id != "nolocationfound":
            run_geo_match(grouping_id, default, match_group, i, t, alchemy_session)

    if alchemy.is_mysql():
        alchemy_session.execute('truncate location; truncate location_assignee; truncate location_inventor;')
    else:
        alchemy_session.execute('delete from location;')
        alchemy_session.commit()
        alchemy_session.execute('delete from location_assignee;')
        alchemy_session.commit()
        alchemy_session.execute('delete from location_inventor;')
        alchemy_session.commit()
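    # doctype, location_insert_statements, update_statements and commit_freq are
    # not defined in this function; they are presumably module-level globals in
    # the original module.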
    if doctype == 'grant':
        bulk_commit_inserts(location_insert_statements, alchemy.schema.Location.__table__, alchemy.is_mysql(), commit_freq, 'grant')
        bulk_commit_updates('location_id', update_statements, alchemy.schema.RawLocation.__table__, alchemy.is_mysql(), commit_freq, 'grant')
    elif doctype == 'application':
        bulk_commit_inserts(location_insert_statements, alchemy.schema.App_Location.__table__, alchemy.is_mysql(), commit_freq, 'application')
        bulk_commit_updates('location_id', update_statements, alchemy.schema.App_RawLocation.__table__, alchemy.is_mysql(), commit_freq, 'application')
    alchemy_session.commit()
    session_generator = alchemy.session_generator(dbtype=doctype)
    session = session_generator()

    session.commit()
    print 'Committed!!!'
def run_disambiguation():
    """
    Runs disambiguation algorithm on grant and application assignees from
    the database indicated by lib/alchemy/config
    """
    # retrieve database connections and pull in all assignees from
    # both grant and application databases
    grtsesh = grantsessiongen()
    appsesh = appsessiongen()
    print "fetching raw assignees", datetime.now()
    rawassignees = list(grtsesh.query(RawAssignee))
    rawassignees.extend(list(appsesh.query(App_RawAssignee)))
    # clear the destination tables
    if alchemy.is_mysql():
        grtsesh.execute("truncate assignee; truncate patent_assignee;")
        appsesh.execute("truncate assignee; truncate application_assignee;")
    else:
        grtsesh.execute("delete from assignee; delete from patent_assignee;")
        appsesh.execute("delete from assignee; delete from patent_assignee;")
    print "cleaning ids", datetime.now()
    # uses the get_cleanid method to remove undesirable characters and
    # normalize to case and group by first letter
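    # uuid_to_object, uuid_to_cleanid and uuids_by_cleanidletter are presumably
    # module-level dicts/defaultdicts shared with the helpers used below.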
    for ra in rawassignees:
        uuid_to_object[ra.uuid] = ra
        cleanid = get_cleanid(ra)
        uuid_to_cleanid[ra.uuid] = cleanid
        if not cleanid:
            continue
        firstletter = cleanid[0]
        uuids_by_cleanidletter[firstletter].append(ra.uuid)

    print "disambiguating blocks", datetime.now()
    # disambiguates each of the letter blocks using
    # the list of assignees as a stack and only performing
    # jaro-winkler comparisons against the first item of each block
    # (a sketch of this blocking step appears after this function)
    allrecords = []
    for letter in alphabet:
        print "disambiguating", "({0})".format(letter), datetime.now()
        lettergroup = disambiguate_letter(letter)
        print "got", len(lettergroup), "records"
        print "creating disambiguated records", "({0})".format(letter), datetime.now()
        allrecords.extend(lettergroup.values())
    # create the attributes for the disambiguated assignee record from the
    # raw records placed into a block in the disambiguation phase
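    # create_disambiguated_record_for_block returns a 6-tuple of per-table lists;
    # izip(*res) transposes the results so each successive .next() call yields
    # all of the lists destined for one table.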
    res = map(create_disambiguated_record_for_block, allrecords)
    mid = itertools.izip(*res)
    grant_assignee_inserts = list(itertools.chain.from_iterable(mid.next()))
    app_assignee_inserts = list(itertools.chain.from_iterable(mid.next()))
    patentassignee_inserts = list(itertools.chain.from_iterable(mid.next()))
    applicationassignee_inserts = list(itertools.chain.from_iterable(mid.next()))
    grant_rawassignee_updates = list(itertools.chain.from_iterable(mid.next()))
    app_rawassignee_updates = list(itertools.chain.from_iterable(mid.next()))

    # write out the insert counts for each table into a text file
    with open("mid.txt", "wb") as f:
        f.write(str(len(grant_assignee_inserts)) + "\n")
        f.write(str(len(app_assignee_inserts)) + "\n")
        f.write(str(len(patentassignee_inserts)) + "\n")
        f.write(str(len(applicationassignee_inserts)) + "\n")
        f.write(str(len(grant_rawassignee_updates)) + "\n")
        f.write(str(len(app_rawassignee_updates)) + "\n")
    # insert disambiguated assignee records
    bulk_commit_inserts(grant_assignee_inserts, Assignee.__table__, alchemy.is_mysql(), 20000, "grant")
    bulk_commit_inserts(app_assignee_inserts, App_Assignee.__table__, alchemy.is_mysql(), 20000, "application")
    # insert patent/assignee link records
    bulk_commit_inserts(patentassignee_inserts, patentassignee, alchemy.is_mysql(), 20000, "grant")
    bulk_commit_inserts(applicationassignee_inserts, applicationassignee, alchemy.is_mysql(), 20000, "application")
    # update rawassignees with their disambiguated record
    bulk_commit_updates(
        "assignee_id", grant_rawassignee_updates, RawAssignee.__table__, alchemy.is_mysql(), 20000, "grant"
    )
    bulk_commit_updates(
        "assignee_id", app_rawassignee_updates, App_RawAssignee.__table__, alchemy.is_mysql(), 20000, "application"
    )
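
The disambiguate_letter helper used above is not included on this page. As a rough illustration only, a minimal sketch of the stack-based blocking described in the comments (compare each remaining name against the block's seed with Jaro-Winkler) might look like the following; the jellyfish library, the 0.9 threshold, and the parameter names are assumptions rather than the repository's actual code:

from jellyfish import jaro_winkler  # renamed jaro_winkler_similarity in newer jellyfish releases

def disambiguate_letter_sketch(letter, uuids_by_letter, cleanid_by_uuid, threshold=0.9):
    """Group uuids whose cleaned names are Jaro-Winkler-similar to the seed of each block."""
    stack = list(uuids_by_letter[letter])
    lettergroup = {}
    while stack:
        seed = stack.pop()                      # treat the list of assignees as a stack
        seed_name = cleanid_by_uuid[seed]
        block, remaining = [seed], []
        for uuid in stack:
            # compare every remaining record against the block's seed only
            if jaro_winkler(seed_name, cleanid_by_uuid[uuid]) >= threshold:
                block.append(uuid)
            else:
                remaining.append(uuid)
        stack = remaining
        lettergroup[seed_name] = block
    return lettergroup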
Example #6
def run_disambiguation():
    """
    Runs disambiguation algorithm on grant and application assignees from
    the database indicated by lib/alchemy/config
    """
    # retrieve database connections and pull in all assignees from
    # both grant and application databases
    grtsesh = grantsessiongen()
    appsesh = appsessiongen()
    print 'fetching raw assignees', datetime.now()
    rawassignees = list(grtsesh.query(RawAssignee))
    rawassignees.extend(list(appsesh.query(App_RawAssignee)))
    # clear the destination tables
    if alchemy.is_mysql():
        grtsesh.execute('truncate assignee; truncate patent_assignee;')
        appsesh.execute('truncate assignee; truncate application_assignee;')
    else:
        grtsesh.execute('delete from assignee; delete from patent_assignee;')
        appsesh.execute('delete from assignee; delete from application_assignee;')
    print 'cleaning ids', datetime.now()
    # uses the get_cleanid method to remove undesirable characters and
    # normalize to case and group by first letter
    for ra in rawassignees:
        uuid_to_object[ra.uuid] = ra
        cleanid = get_cleanid(ra)
        uuid_to_cleanid[ra.uuid] = cleanid
        if not cleanid:
            continue
        firstletter = cleanid[0]
        uuids_by_cleanidletter[firstletter].append(ra.uuid)

    print 'disambiguating blocks', datetime.now()
    # disambiguates each of the letter blocks using
    # the list of assignees as a stack and only performing
    # jaro-winkler comparisons on the first item of each block
    allrecords = []
    for letter in alphabet:
        print 'disambiguating', '({0})'.format(letter), datetime.now()
        lettergroup = disambiguate_letter(letter)
        print 'got', len(lettergroup), 'records'
        print 'creating disambiguated records', '({0})'.format(
            letter), datetime.now()
        allrecords.extend(lettergroup.values())
    # create the attributes for the disambiguated assignee record from the
    # raw records placed into a block in the disambiguation phase
    res = map(create_disambiguated_record_for_block, allrecords)
    mid = itertools.izip(*res)
    grant_assignee_inserts = list(itertools.chain.from_iterable(mid.next()))
    app_assignee_inserts = list(itertools.chain.from_iterable(mid.next()))
    patentassignee_inserts = list(itertools.chain.from_iterable(mid.next()))
    applicationassignee_inserts = list(
        itertools.chain.from_iterable(mid.next()))
    grant_rawassignee_updates = list(itertools.chain.from_iterable(mid.next()))
    app_rawassignee_updates = list(itertools.chain.from_iterable(mid.next()))

    # write out the insert counts for each table into a text file
    with open('mid.txt', 'wb') as f:
        f.write(str(len(grant_assignee_inserts)) + '\n')
        f.write(str(len(app_assignee_inserts)) + '\n')
        f.write(str(len(patentassignee_inserts)) + '\n')
        f.write(str(len(applicationassignee_inserts)) + '\n')
        f.write(str(len(grant_rawassignee_updates)) + '\n')
        f.write(str(len(app_rawassignee_updates)) + '\n')
    # insert disambiguated assignee records
    bulk_commit_inserts(grant_assignee_inserts, Assignee.__table__,
                        alchemy.is_mysql(), 20000, 'grant')
    bulk_commit_inserts(app_assignee_inserts, App_Assignee.__table__,
                        alchemy.is_mysql(), 20000, 'application')
    # insert patent/assignee link records
    bulk_commit_inserts(patentassignee_inserts, patentassignee,
                        alchemy.is_mysql(), 20000, 'grant')
    bulk_commit_inserts(applicationassignee_inserts, applicationassignee,
                        alchemy.is_mysql(), 20000, 'application')
    # update rawassignees with their disambiguated record
    bulk_commit_updates('assignee_id',
                        grant_rawassignee_updates, RawAssignee.__table__,
                        alchemy.is_mysql(), 20000, 'grant')
    bulk_commit_updates('assignee_id',
                        app_rawassignee_updates, App_RawAssignee.__table__,
                        alchemy.is_mysql(), 20000, 'application')
def map_disamb(allrecords):
    # The original snippet starts mid-function; the header, counter setup and
    # loop are reconstructed from the call to map_disamb(allrecords) below.
    res = []
    counter = 0
    for i in allrecords:
        counter += 1
        if counter % 10000 == 0:
            print counter
        res.append(create_disambiguated_record_for_block(i))
    print datetime.now()
    return res
def post_process(res):
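    # res is a list of 6-tuples from create_disambiguated_record_for_block;
    # izip(*res) transposes it so each .next() yields one table's worth of
    # inserts/updates, which are appended to the module-level accumulator lists.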
    print "Itertools"
    print datetime.now()
    mid = itertools.izip(*res)
    print "making lists"
    print datetime.now()
    grant_assignee_inserts.extend(list(itertools.chain.from_iterable(mid.next())))
    app_assignee_inserts.extend(list(itertools.chain.from_iterable(mid.next())))
    patentassignee_inserts.extend(list(itertools.chain.from_iterable(mid.next())))
    applicationassignee_inserts.extend(list(itertools.chain.from_iterable(mid.next())))
    grant_rawassignee_updates.extend(list(itertools.chain.from_iterable(mid.next())))
    app_rawassignee_updates.extend(list(itertools.chain.from_iterable(mid.next())))


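# Driver loop: disambiguate one first-letter block at a time, accumulating
# inserts and updates in the module-level lists, then bulk-commit them below.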
for letter in alphabet:
    allrecords = run_letter(letter)
    print "Done disambiguating"
    processed = map_disamb(allrecords)
    post_process(processed)

bulk_commit_inserts(grant_assignee_inserts, Assignee.__table__, alchemy.is_mysql(), 10000, 'grant')
bulk_commit_inserts(patentassignee_inserts, patentassignee, alchemy.is_mysql(), 10000, 'grant')
print "May stick here after a while, use Scripts/Temporary/assignee_patch.py to fix"
bulk_commit_updates('assignee_id', grant_rawassignee_updates, RawAssignee.__table__, alchemy.is_mysql(), 10000, 'grant')
print "Done!"