def create_assignee_table(session):
    """
    Run assignee matching over every disambiguation block, then flush the
    accumulated insert/update statements.

    session: database session handed to assignee_match and used for the
             commits issued here.

    Reads the module-level `blocks`, `id_map` and `assignee_dict` mappings:
    each key of `blocks` maps to a sequence of entries, each entry is looked
    up in `id_map` to get a group of raw assignee ids, and each id is
    resolved through `assignee_dict` before the group is passed to
    assignee_match. Every 20000th group is matched with commit=True and a
    progress line is printed.

    On MySQL, foreign key checks are switched off first; this function does
    not switch them back on.
    """
    print('Disambiguating assignees...')
    if alchemy.is_mysql():
        session.execute('set foreign_key_checks = 0;')
        session.commit()

    processed = 0
    for assignee in blocks.iterkeys():
        for ra in blocks[assignee]:
            # Look the group up lazily, right before it is processed.
            group = id_map[ra]
            processed += 1
            rawassignees = [assignee_dict[ra_id] for ra_id in group]
            at_checkpoint = processed % 20000 == 0
            if at_checkpoint:
                # Progress report: running count plus a timestamp.
                print('%s %s' % (processed, datetime.now()))
            assignee_match(rawassignees, session, commit=at_checkpoint)

    # Flush everything collected in the module-level statement lists,
    # in batches of 20000.
    celery_commit_inserts(
        assignee_insert_statements, Assignee.__table__,
        alchemy.is_mysql(), 20000)
    celery_commit_inserts(
        patentassignee_insert_statements, patentassignee,
        alchemy.is_mysql(), 20000)
    celery_commit_updates(
        'assignee_id', update_statements, RawAssignee.__table__,
        alchemy.is_mysql(), 20000)
    session.commit()
    print('%s %s' % (processed, datetime.now()))
# Example no. 2
# 0
def _location_link_rows(rows, entity_column):
    """
    Turn (location_id, entity_id) result rows into insert dicts.

    rows: sequence of 2-item records; the location id comes first and the
          id of the linked entity second.
    entity_column: dict key to use for the second item of every record.

    Returns a list of {'location_id': ..., entity_column: ...} dicts, with
    every record that has a null in either position dropped. Empty input
    gives an empty list.
    """
    if not rows:
        # from_records() on an empty result yields a frame without any
        # columns, so the positional lookups below would raise KeyError.
        return []
    frame = pd.DataFrame.from_records(rows)
    frame = frame[frame[0].notnull()]
    frame = frame[frame[1].notnull()]
    frame.columns = ['location_id', entity_column]
    return [row[1].to_dict() for row in frame.iterrows()]


def match_grouped_locations(identified_grouped_locations_enum, t, alchemy_session):
    """
    Run geo matching for every location group, write the resulting
    locations, then rebuild the location<->assignee and
    location<->inventor link tables.

    identified_grouped_locations_enum: iterable of
        (i, (grouping_id, grouped_locations_list)) pairs. Every grouped
        location is a mapping with a "raw_location" and a
        "matching_location" entry; the matching location must expose
        city, region, country, latitude and longitude attributes.
    t: passed through to run_geo_match unchanged.
    alchemy_session: session used for the truncates, for run_geo_match and
        for the commit of the location inserts/updates.

    Groups whose grouping_id is "nolocationfound", and groups with no
    entries at all, are skipped.
    """
    if alchemy.is_mysql():
        alchemy_session.execute("set foreign_key_checks = 0; truncate location;")
        alchemy_session.commit()
    for i, item in identified_grouped_locations_enum:
        # grouping_id is shared by every grouped location in the list.
        grouping_id, grouped_locations_list = item
        # match_group collects only the raw_location entries; that is the
        # list handed to run_geo_match.
        match_group = []
        first_matching_location = None
        seen_first = False
        for grouped_location in grouped_locations_list:
            match_group.append(grouped_location["raw_location"])
            if not seen_first:
                # Remember only the FIRST entry's matching location. The
                # flag has to be cleared here, otherwise every iteration
                # overwrites it and the last entry wins.
                first_matching_location = grouped_location["matching_location"]
                seen_first = True
        # No need to run the match if no matching location was found, or
        # if the group turned out to be empty.
        if grouping_id == "nolocationfound" or not seen_first:
            continue
        # default holds the default parameter values
        # (id, city, state, country, latitude, longitude) for all locations
        # in the same group, taken from the first entry of the group.
        # In theory, all entries of a group should have the same lat/long.
        default = {"id": grouping_id, "city": first_matching_location.city,
                   "state": first_matching_location.region,
                   "country": first_matching_location.country,
                   "latitude": first_matching_location.latitude,
                   "longitude": first_matching_location.longitude}
        run_geo_match(grouping_id, default, match_group, i, t, alchemy_session)
    # NOTE(review): these truncates run on every backend, not only MySQL,
    # and happen before the collected location statements are written.
    alchemy_session.execute('truncate location; truncate assignee_location; truncate inventor_location;')
    celery_commit_inserts(location_insert_statements, alchemy.schema.Location.__table__, alchemy.is_mysql(), commit_freq)
    celery_commit_updates('location_id', update_statements, alchemy.schema.RawLocation.__table__, alchemy.is_mysql(), commit_freq)
    alchemy_session.commit()

    # The link tables are rebuilt through a fresh session.
    session_generator = alchemy.session_generator()
    session = session_generator()
    res = session.execute('select location.id, assignee.id from assignee \
                           left join rawassignee on rawassignee.assignee_id = assignee.id \
                           right join rawlocation on rawlocation.id = rawassignee.rawlocation_id \
                           right join location on location.id = rawlocation.location_id;')
    locationassignee_inserts = _location_link_rows(res.fetchall(), 'assignee_id')
    celery_commit_inserts(locationassignee_inserts, alchemy.schema.locationassignee, alchemy.is_mysql(), 20000)

    res = session.execute('select location.id, inventor.id from inventor \
                           left join rawinventor on rawinventor.inventor_id = inventor.id \
                           right join rawlocation on rawlocation.id = rawinventor.rawlocation_id \
                           right join location on location.id = rawlocation.location_id;')
    locationinventor_inserts = _location_link_rows(res.fetchall(), 'inventor_id')
    celery_commit_inserts(locationinventor_inserts, alchemy.schema.locationinventor, alchemy.is_mysql(), 20000)

    session.commit()