Code example #1
File: tasks.py Project: phestoem/patentprocessor
def celery_commit_updates(update_key, update_statements, table, is_mysql, commit_frequency=1000):
    """
    Executes bulk updates for a given table. This is typically much faster than going through
    the SQLAlchemy ORM. In order to be flexible, the update statements must be set up in a specific
    way. You can only update one column at a time. The dictionaries in the list `update_statements`
    must have two keys: `pk`, which is the primary_key for the record to be updated, and `update`
    which is the new value for the column you want to change. The column you want to change
    is specified as a string by the argument `update_key`.

    If is_mysql is True, then the update will be performed by inserting the record updates
    into the table temporary_update and then executing an UPDATE/JOIN. If is_mysql is False,
    then SQLite is assumed, and traditional updates are used (lib.alchemy.match.commit_updates).

    A session is generated using the scoped_session factory through SQLAlchemy, and then
    the actual task is dispatched.

    Args:
    update_key -- the name of the column we want to update
    update_statements -- list of dictionaries of updates. See above description
    table -- SQLAlchemy table object. If you have a table reference, you can use TableName.__table__
    is_mysql -- adjusts syntax based on whether we are committing to MySQL or SQLite. You can use alchemy.is_mysql() to get this
    commit_frequency -- tune this for speed. Runs "session.commit" every `commit_frequency` items
    """
    session = session_generator()
    if not is_mysql:
        commit_updates(session, update_key, update_statements, table, commit_frequency)
        return
    commit_inserts(session, update_statements, temporary_update, is_mysql, 10000)
    # now update using the join
    primary_key = table.primary_key.columns.values()[0]
    update_key = table.columns[update_key]
    session.execute("UPDATE {0} join temporary_update ON temporary_update.pk = {1} SET {2} = temporary_update.update;".format(table.name, primary_key.name, update_key.name ))
    session.commit()
    session.execute("truncate temporary_update;")
    session.commit()
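
Usage sketch (values hypothetical): the update_statements dicts carry exactly the two keys described in the docstring, `pk` and `update`, and the target column is named separately via update_key. The schema objects mirror the calls in code example #9 below.

# Hypothetical usage sketch for celery_commit_updates. Each dict has
# exactly two keys: `pk` (primary key of the row to change) and
# `update` (the new value for the target column).
update_statements = [
    {"pk": "rawloc-001", "update": "loc-42"},
    {"pk": "rawloc-002", "update": "loc-42"},
    {"pk": "rawloc-003", "update": "loc-77"},
]
celery_commit_updates("location_id", update_statements,
                      alchemy.schema.RawLocation.__table__,
                      alchemy.is_mysql(), commit_frequency=1000)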
Code example #2
File: tasks.py Project: aviyallapalli/PatentsView-DB
def bulk_commit_inserts(insert_statements,
                        table,
                        is_mysql,
                        commit_frequency=1000,
                        dbtype='grant'):
    """
    Executes bulk inserts for a given table. This is typically much faster than going through
    the SQLAlchemy ORM. The insert_statement list of dictionaries may fall victim to SQLAlchemy
    complaining that certain columns are null, if you did not specify a value for every single
    column for a table.

    A session is generated using the scoped_session factory through SQLAlchemy, and then
    the actual lib.alchemy.match.commit_inserts task is dispatched.

    Args:
    insert_statements -- list of dictionaries where each dictionary contains key-value pairs of the object
    table -- SQLAlchemy table object. If you have a table reference, you can use TableName.__table__
    is_mysql -- adjusts syntax based on whether we are committing to MySQL or SQLite. You can use alchemy.is_mysql() to get this
    commit_frequency -- tune this for speed. Runs "session.commit" every `commit_frequency` items
    dbtype -- which base schema to use. Either 'grant' or 'application'
    """
    session = session_generator(dbtype=dbtype)
    commit_inserts(session, insert_statements, table, is_mysql,
                   commit_frequency)
    session.commit()
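
Usage sketch (rows hypothetical): per the docstring's caveat, each dict should supply a value for every column of the target table, or SQLAlchemy may complain about nulls. The Location columns here mirror the `default` dict in code example #4 below.

# Hypothetical usage sketch for bulk_commit_inserts: one dict per row,
# with a value for every column to avoid SQLAlchemy null complaints.
insert_statements = [
    {"id": "loc-42", "city": "Portland", "state": "OR",
     "country": "US", "latitude": 45.52, "longitude": -122.68},
    {"id": "loc-77", "city": "Austin", "state": "TX",
     "country": "US", "latitude": 30.27, "longitude": -97.74},
]
bulk_commit_inserts(insert_statements,
                    alchemy.schema.Location.__table__,
                    alchemy.is_mysql(), commit_frequency=1000,
                    dbtype='grant')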
Code example #3
File: tasks.py Project: aviyallapalli/PatentsView-DB
def bulk_commit_updates(update_key,
                        update_statements,
                        table,
                        is_mysql,
                        commit_frequency=1000,
                        dbtype='grant'):
    """
    Executes bulk updates for a given table. This is typically much faster than going through
    the SQLAlchemy ORM. In order to be flexible, the update statements must be set up in a specific
    way. You can only update one column at a time. The dictionaries in the list `update_statements`
    must have two keys: `pk`, which is the primary_key for the record to be updated, and `update`
    which is the new value for the column you want to change. The column you want to change
    is specified as a string by the argument `update_key`.

    If is_mysql is True, then the update will be performed by inserting the record updates
    into the table temporary_update and then executing an UPDATE/JOIN. If is_mysql is False,
    then SQLite is assumed, and traditional updates are used (lib.alchemy.match.commit_updates).

    A session is generated using the scoped_session factory through SQLAlchemy, and then
    the actual task is dispatched.

    Args:
    update_key -- the name of the column we want to update
    update_statements -- list of dictionaries of updates. See above description
    table -- SQLAlchemy table object. If you have a table reference, you can use TableName.__table__
    is_mysql -- adjusts syntax based on whether we are committing to MySQL or SQLite. You can use alchemy.is_mysql() to get this
    commit_frequency -- tune this for speed. Runs "session.commit" every `commit_frequency` items
    dbtype -- which base schema to use. Either 'grant' or 'application'
    """
    session = session_generator(dbtype=dbtype)
    if not is_mysql:
        commit_updates(session, update_key, update_statements, table,
                       commit_frequency)
        return
    session.rollback()
    if is_mysql:
        session.execute('truncate temporary_update;')
    else:
        session.execute('delete from temporary_update;')
    if dbtype == 'grant':
        commit_inserts(session, update_statements, temporary_update, is_mysql,
                       10000)
    else:
        commit_inserts(session, update_statements, app_temporary_update,
                       is_mysql, 10000)
    # now update using the join
    primary_key = table.primary_key.columns.values()[0]
    update_key = table.columns[update_key]
    session.execute(
        "UPDATE {0} join temporary_update ON temporary_update.pk = {1} SET {2} = temporary_update.update;"
        .format(table.name, primary_key.name, update_key.name))
    session.commit()
    if is_mysql:
        session.execute('truncate temporary_update;')
    else:
        session.execute('delete from temporary_update;')
    session.commit()
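
To make the UPDATE/JOIN step concrete, here is what the formatted statement evaluates to for a hypothetical table named rawlocation with primary key id and update column location_id:

# For table.name == "rawlocation", primary_key.name == "id", and
# update_key.name == "location_id" (hypothetical), the format() call
# above yields:
#
#   UPDATE rawlocation join temporary_update ON temporary_update.pk = id
#   SET location_id = temporary_update.update;
#
# i.e. every row whose primary key appears in temporary_update.pk has
# its location_id overwritten with the staged temporary_update.update.
sql = ("UPDATE {0} join temporary_update ON temporary_update.pk = {1} "
       "SET {2} = temporary_update.update;").format(
          "rawlocation", "id", "location_id")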
Code example #4
def match_grouped_locations(identified_grouped_locations_enum, t, alchemy_session):
    if alchemy.is_mysql():
        alchemy_session.execute("set foreign_key_checks = 0; truncate location;")
        alchemy_session.commit()
    for i, item in identified_grouped_locations_enum:
        #grouped_locations_list = a list of every grouped location with the same grouping_id
        # Note that a grouped_location is a dict, as described above
        #grouping_id = the grouping_id of all items in the list
        grouping_id, grouped_locations_list = item
        #We need to get only the RawLocation objects back from the grouped_location dict
        #match_group is the list of RawLocation objects which we call match on
        match_group = []
        first_pass = True
        for grouped_location in grouped_locations_list:
            match_group.append(grouped_location["raw_location"])
            if first_pass:
                first_matching_location = grouped_location["matching_location"]
                first_pass = False
        """
        default is a dict containing the default values of the parameters
        (id, city, region, country, latitude, longitude)
        for all locations that are part of the same group.
        Here we set the defaults to be the values for the first entry in the grouped_locations_list
        In theory, all entries in the grouped_locations_list should have the same lat/long.
        """
        default = {"id": grouping_id, "city":first_matching_location.city,
                   "state":first_matching_location.region,
                   "country": first_matching_location.country.upper(),
                   "latitude":first_matching_location.latitude,
                   "longitude":first_matching_location.longitude}
        #No need to run match() if no matching location was found.
        if grouping_id != "nolocationfound":
            run_geo_match(grouping_id, default, match_group, i, t, alchemy_session)

    if alchemy.is_mysql():
        alchemy_session.execute('truncate location; truncate location_assignee; truncate location_inventor;')
    else:
        alchemy_session.execute('delete from location;')
        alchemy_session.commit()
        alchemy_session.execute('delete from location_assignee;')
        alchemy_session.commit()
        alchemy_session.execute('delete from location_inventor;')
        alchemy_session.commit()
    if doctype == 'grant':
        bulk_commit_inserts(location_insert_statements, alchemy.schema.Location.__table__, alchemy.is_mysql(), commit_freq, 'grant')
        bulk_commit_updates('location_id', update_statements, alchemy.schema.RawLocation.__table__, alchemy.is_mysql(), commit_freq, 'grant')
    elif doctype == 'application':
        bulk_commit_inserts(location_insert_statements, alchemy.schema.App_Location.__table__, alchemy.is_mysql(), commit_freq, 'application')
        bulk_commit_updates('location_id', update_statements, alchemy.schema.App_RawLocation.__table__, alchemy.is_mysql(), commit_freq, 'application')
    alchemy_session.commit()
    session_generator = alchemy.session_generator(dbtype=doctype)
    session = session_generator()

    session.commit()
    print 'Committed!!!'
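
The snippet does not show how identified_grouped_locations_enum is built, but its shape can be inferred from how the loop consumes it. A minimal sketch with stand-in objects (all names and values hypothetical):

from collections import namedtuple

# Stand-ins for the RawLocation ORM object and the geocoded match.
RawLoc = namedtuple("RawLoc", ["id"])
Geo = namedtuple("Geo", ["city", "region", "country",
                         "latitude", "longitude"])

portland = Geo("Portland", "OR", "us", 45.52, -122.68)
nowhere = Geo("", "", "", 0.0, 0.0)

identified_grouped_locations = [
    ("us|portland|or", [
        {"raw_location": RawLoc("r1"), "matching_location": portland},
        {"raw_location": RawLoc("r2"), "matching_location": portland},
    ]),
    # A "nolocationfound" group still builds a default dict but is
    # skipped when dispatching run_geo_match.
    ("nolocationfound", [
        {"raw_location": RawLoc("r3"), "matching_location": nowhere},
    ]),
]
# match_grouped_locations(enumerate(identified_grouped_locations),
#                         t, alchemy_session)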
Code example #5
File: tasks.py Project: phestoem/patentprocessor
def celery_commit_inserts(insert_statements, table, is_mysql, commit_frequency=1000):
    """
    Executes bulk inserts for a given table. This is typically much faster than going through
    the SQLAlchemy ORM. The insert_statement list of dictionaries may fall victim to SQLAlchemy
    complaining that certain columns are null, if you did not specify a value for every single
    column for a table.

    A session is generated using the scoped_session factory through SQLAlchemy, and then
    the actual lib.alchemy.match.commit_inserts task is dispatched.

    Args:
    insert_statements -- list of dictionaries where each dictionary contains key-value pairs of the object
    table -- SQLAlchemy table object. If you have a table reference, you can use TableName.__table__
    is_mysql -- adjusts syntax based on whether we are committing to MySQL or SQLite. You can use alchemy.is_mysql() to get this
    commit_frequency -- tune this for speed. Runs "session.commit" every `commit_frequency` items
    """
    session = session_generator()
    commit_inserts(session, insert_statements, table, is_mysql, commit_frequency)
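
The file name tasks.py, the celery_ prefix, and the docstrings' mention of dispatching a task suggest these functions are registered as Celery tasks, though the snippets do not show the decorator. Under that assumption, a caller could queue the work instead of running it inline:

# Hypothetical dispatch sketch, assuming celery_commit_inserts is
# registered as a Celery task (not confirmed by the snippet itself).
# insert_statements, table, and is_mysql come from the calling context.
async_result = celery_commit_inserts.delay(
    insert_statements, table, is_mysql, 1000)
async_result.get()  # block until the worker finishes, if desired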
Code example #6
import itertools
import sys
import json
import re
from collections import defaultdict

config = get_config()

THRESHOLD = config.get("assignee").get("threshold")

uuid_to_object = {}
uuid_to_cleanid = {}
letter_to_cleanid = {}
uuids_by_cleanidletter = defaultdict(list)

grant_uuids = set()
app_uuids = set()
grantsessiongen = alchemy.session_generator(dbtype="grant")
appsessiongen = alchemy.session_generator(dbtype="application")

nodigits = re.compile(r"[a-z ]")
stoplist = ["the", "of", "and", "a", "an", "at"]
substitutions = json.load(open("nber_substitutions.json"))


def isgrant(obj):
    """
    returns True if obj is from the Grant table, False if from the App table
    """
    return hasattr(obj, "patent")


def get_cleanid(obj):
Code example #7
import itertools
import sys
import json
import re
from collections import defaultdict

config = get_config()

THRESHOLD = config.get("assignee").get("threshold")

uuid_to_object = {}
uuid_to_cleanid = {}
letter_to_cleanid = {}
uuids_by_cleanidletter = defaultdict(list)

grant_uuids = set()
app_uuids = set()
grantsessiongen = alchemy.session_generator(dbtype='grant')
appsessiongen = alchemy.session_generator(dbtype='application')

nodigits = re.compile(r'[a-z ]')
stoplist = ['the', 'of', 'and', 'a', 'an', 'at']
substitutions = json.load(open('nber_substitutions.json'))

def isgrant(obj):
    """
    returns True if obj is from the Grant table, False if from the App table
    """
    return hasattr(obj, 'patent')

def get_cleanid(obj):
    """
    Returns a cleaned string version of the object representation:
Code example #8
import itertools
import sys
import json
import re
from collections import defaultdict

config = get_config()

THRESHOLD = config.get("assignee").get("threshold")

uuid_to_object = {}
uuid_to_cleanid = {}
letter_to_cleanid = {}
uuids_by_cleanidletter = defaultdict(list)

grant_uuids = set()
app_uuids = set()
grantsessiongen = alchemy.session_generator(dbtype='grant')
appsessiongen = alchemy.session_generator(dbtype='application')

nodigits = re.compile(r'[a-z ]')
stoplist = ['the', 'of', 'and', 'a', 'an', 'at']
substitutions = json.load(open('nber_substitutions.json'))


def isgrant(obj):
    """
    returns True if obj is from the Grant table, False if from the App table
    """
    return hasattr(obj, 'patent_id')


def get_cleanid(obj):
Code example #9
def match_grouped_locations(identified_grouped_locations_enum, t, alchemy_session):
    if alchemy.is_mysql():
        alchemy_session.execute("set foreign_key_checks = 0; truncate location;")
        alchemy_session.commit()
    for i, item in identified_grouped_locations_enum:
        #grouped_locations_list = a list of every grouped location with the same grouping_id
        # Note that a grouped_location is a dict, as described above
        #grouping_id = the grouping_id of all items in the list
        grouping_id, grouped_locations_list = item
        #We need to get only the RawLocation objects back from the grouped_location dict
        #match_group is the list of RawLocation objects which we call match on
        match_group = []
        first_pass = True
        for grouped_location in grouped_locations_list:
            match_group.append(grouped_location["raw_location"])
            if first_pass:
                first_matching_location = grouped_location["matching_location"]
                first_pass = False
        """
        default is a dict containing the default values of the parameters
        (id, city, region, country, latitude, longitude)
        for all locations that are part of the same group.
        Here we set the defaults to be the values for the first entry in the grouped_locations_list
        In theory, all entries in the grouped_locations_list should have the same lat/long.
        """
        default = {"id": grouping_id, "city":first_matching_location.city,
                   "state":first_matching_location.region,
                   "country":first_matching_location.country,
                   "latitude":first_matching_location.latitude,
                   "longitude":first_matching_location.longitude}
        #No need to run match() if no matching location was found.
        if grouping_id != "nolocationfound":
            run_geo_match(grouping_id, default, match_group, i, t, alchemy_session)
    alchemy_session.execute('truncate location; truncate assignee_location; truncate inventor_location;')
    celery_commit_inserts(location_insert_statements, alchemy.schema.Location.__table__, alchemy.is_mysql(), commit_freq)
    celery_commit_updates('location_id', update_statements, alchemy.schema.RawLocation.__table__, alchemy.is_mysql(), commit_freq)
    alchemy_session.commit()
    session_generator = alchemy.session_generator()
    session = session_generator()
    res = session.execute('select location.id, assignee.id from assignee \
                           left join rawassignee on rawassignee.assignee_id = assignee.id \
                           right join rawlocation on rawlocation.id = rawassignee.rawlocation_id \
                           right join location on location.id = rawlocation.location_id;')
    assigneelocation = pd.DataFrame.from_records(res.fetchall())
    assigneelocation = assigneelocation[assigneelocation[0].notnull()]
    assigneelocation = assigneelocation[assigneelocation[1].notnull()]
    assigneelocation.columns = ['location_id','assignee_id']
    locationassignee_inserts = [row[1].to_dict() for row in assigneelocation.iterrows()]
    celery_commit_inserts(locationassignee_inserts, alchemy.schema.locationassignee, alchemy.is_mysql(), 20000)

    res = session.execute('select location.id, inventor.id from inventor \
                           left join rawinventor on rawinventor.inventor_id = inventor.id \
                           right join rawlocation on rawlocation.id = rawinventor.rawlocation_id \
                           right join location on location.id = rawlocation.location_id;')
    inventorlocation = pd.DataFrame.from_records(res.fetchall())
    inventorlocation = inventorlocation[inventorlocation[0].notnull()]
    inventorlocation = inventorlocation[inventorlocation[1].notnull()]
    inventorlocation.columns = ['location_id','inventor_id']
    locationinventor_inserts = [row[1].to_dict() for row in inventorlocation.iterrows()]
    celery_commit_inserts(locationinventor_inserts, alchemy.schema.locationinventor, alchemy.is_mysql(), 20000)

    session.commit()
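
The fetchall-to-DataFrame-to-insert-dicts pipeline appears twice above; here is the same pattern isolated on toy rows (pandas only, no database; values hypothetical):

import pandas as pd

# Toy stand-in for res.fetchall(): tuples of (location_id, assignee_id),
# some with nulls produced by the outer joins.
records = [("loc-1", "asg-1"), ("loc-2", None), (None, "asg-3")]
df = pd.DataFrame.from_records(records)
df = df[df[0].notnull()]      # drop rows missing the location side
df = df[df[1].notnull()]      # drop rows missing the assignee side
df.columns = ['location_id', 'assignee_id']
inserts = [row[1].to_dict() for row in df.iterrows()]
# inserts == [{'location_id': 'loc-1', 'assignee_id': 'asg-1'}]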