Example #1
def create_assignee(update_config):
    engine = create_engine(get_connection_string(update_config, "NEW_DB"))
    assignee_name_with_count = pd.read_sql_query(
        "SELECT assignee_id, name_first, name_last, type, count(1) name_count from rawassignee where assignee_id "
        "is not null and organization is null group by assignee_id, name_first, name_last, type;",
        con=engine)
    assignee_name_data = assignee_name_with_count.sort_values(
        "name_count",
        ascending=False).groupby("assignee_id").head(1).reset_index(drop=True)
    assignee_name_data = assignee_name_data.drop(
        "name_count", axis=1).assign(organization=None)
    assignee_name_data.rename({
        "assignee_id": "id"
    }, axis=1).to_sql(name='assignee',
                      con=engine,
                      if_exists='append',
                      index=False)

    assignee_organization_with_count = pd.read_sql_query(
        "SELECT assignee_id, organization, type, count(1) org_count from rawassignee where assignee_id is not "
        "null and organization is not null group by assignee_id, organization, type;",
        con=engine)
    assignee_org_data = assignee_organization_with_count.sort_values(
        "org_count",
        ascending=False).groupby("assignee_id").head(1).reset_index(drop=True)
    assignee_org_data = assignee_org_data.drop("org_count",
                                               axis=1).assign(name_first=None,
                                                              name_last=None)
    assignee_org_data.rename({
        "assignee_id": "id"
    }, axis=1).to_sql(name='assignee',
                      con=engine,
                      if_exists='append',
                      index=False)
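All of these examples call a get_connection_string(config, section) helper to build the SQLAlchemy URL from the config. A minimal sketch of what such a helper might look like, assuming the DATABASE section carries the USERNAME, PASSWORD, HOST and schema-name keys used above (an illustration only, not the project's actual implementation):

def get_connection_string(config, database):
    # Hypothetical sketch: build a MySQL URL from the config, mirroring the
    # commented-out engine setup shown in Examples #3 and #7.
    db = config['DATABASE']
    return 'mysql+mysqldb://{0}:{1}@{2}/{3}?charset=utf8mb4'.format(
        db['USERNAME'], db['PASSWORD'], db['HOST'], db[database])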
Example #2
def process_and_upload_wipo(config):
    myengine = create_engine(get_connection_string(config, "NEW_DB"))
    wipo_output = '{}/{}'.format(config['FOLDERS']['WORKING_FOLDER'],
                                 'wipo_output')
    if not os.path.exists(wipo_output):
        os.mkdir(wipo_output)
    persistent_files = config['FOLDERS']['PERSISTENT_FILES']
    ipc_tech_file = '{}/ipc_technology.csv'.format(persistent_files)
    ipc_tech_field_map = get_ipc_tech_code_field_map(ipc_tech_file)

    concordance_file = '{}/{}/{}'.format(config['FOLDERS']['WORKING_FOLDER'],
                                         'cpc_input', 'ipc_concordance.txt')

    cpc_ipc_concordance_map = get_ipc_cpc_ipc_concordance_map(concordance_file)

    limit = 10000
    offset = 0
    batch_counter = 0
    base_query_template = "SELECT id from patent order by id limit {limit} offset {offset}"
    cpc_query_template = "SELECT c.patent_id, c.subgroup_id from cpc_current c join ({base_query}) p on p.id = " \
                         "c.patent_id"
    while True:
        start = time.time()
        batch_counter += 1
        base_query = base_query_template.format(limit=limit, offset=offset)
        cpc_join_query = cpc_query_template.format(base_query=base_query)
        cpc_current_data = pd.read_sql_query(con=myengine, sql=cpc_join_query)
        if cpc_current_data.shape[0] < 1:
            break
        wipo_chunk_processor(cpc_current_data, ipc_tech_field_map,
                             cpc_ipc_concordance_map, config)
        offset = offset + limit
        end = time.time()
        print("Chunk Time:" + str(round(end - start)))
    consolidate_wipo(config)
Example #3
def session_generator(dbtype='grant'):
    """
    Read from ../project/Development/config.ini file and load appropriate database

    @dbtype: string indicating if we are fetching the session for
             the grant database or the application database
    session_generator will return an object that can be called
    to retrieve more sessions, e.g.
    sg = session_generator(dbtype='grant')
    session1 = sg()
    session2 = sg()
    etc.
    These sessions will be protected with the ping refresher above
    """
    config = get_config()
    # echo = config.get('GLOBAL').get('echo')
    echo = True
    read_database = "NEW_DB"
    cstr = get_connection_string(config, read_database)
    engine = create_engine(cstr)

    #     engine = create_engine('mysql+mysqldb://{0}:{1}@{2}/{3}?charset=utf8mb4'.format(
    #         config.get('DATABASE').get('user'),
    #         config.get('DATABASE').get('password'),
    #         config.get('DATABASE').get('host'),
    #         config.get('DATABASE').get(read_database), echo=echo), pool_size=3, pool_recycle=3600, echo_pool=True)

    schema.GrantBase.metadata.create_all(engine)

    Session = sessionmaker(bind=engine, _enable_transaction_accounting=False)
    return scoped_session(Session)
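As the docstring notes, the returned scoped_session acts as a session factory. A usage sketch (hypothetical, assuming the SQLAlchemy 1.x string-execution style used elsewhere in these examples):

# Each call to the generator returns a thread-local Session;
# remove() releases it when the work is done.
sg = session_generator(dbtype='grant')
session = sg()
try:
    session.execute("SELECT 1")
    session.commit()
finally:
    sg.remove()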
Example #4
def rename_old_db(update_config):
    old_database = update_config["DATABASE"]["OLD_DB"]
    new_database = update_config["DATABASE"]["NEW_DB"]
    connection_string = get_connection_string(update_config, "OLD_DB")
    engine = create_engine(connection_string)
    tables = [
        t[0]
        for t in engine.execute('show tables from {}'.format(old_database))
        if not t[0].startswith('temp')
    ]
    tables_to_truncate = [
        'inventor_disambiguation_mapping', 'assignee_disambiguation_mapping',
        'assignee', 'cpc_current', 'cpc_group', 'cpc_subgroup',
        'cpc_subsection', 'inventor', 'location', 'location_assignee',
        'location_inventor', 'patent_assignee', 'patent_inventor',
        'patent_lawyer'
    ]

    for table in tables:
        con = engine.connect()
        con.execute('alter table {0}.{2} rename {1}.{2}'.format(
            old_database, new_database, table))
        if table in tables_to_truncate:
            con.execute('truncate table {}.{}'.format(new_database, table))
        con.close()
Example #5
 def test_column_encoding(self):
     new_database = self.config['DATABASE']["TEMP_UPLOAD_DB"]
     connection_string = get_connection_string(self.config, database='TEMP_UPLOAD_DB')
     engine = create_engine(connection_string)
     collation_query = '''
                     SELECT TABLE_NAME, COLUMN_NAME, character_set_name,
                         collation_name
                     FROM   information_schema.columns
                     WHERE  table_schema='{new_db}'
                     AND    data_type IN ('varchar',
                                         'longtext',
                                         'mediumtext',
                                         'text',
                                         'enum',
                                         'char',
                                         'set',
                                         'int')
     '''.format(
         new_db=new_database)
     collation_cursor = engine.execute(collation_query)
     for column_collation_name in collation_cursor:
         if column_collation_name[2] != 'utf8mb4':
             raise AssertionError(
                 "Table character set should be utf8mb4 instead found {cset} for table {tbl}. column name {col}".format(
                     cset=column_collation_name[2], tbl=column_collation_name[0], col=column_collation_name[1]))
         if column_collation_name[3] != 'utf8mb4_unicode_ci':
             raise AssertionError(
                 "Table  collation should be utf8mb4_unicode_ci instead found {collation} for table {tbl}".format(
                     collation=column_collation_name[3], tbl=column_collation_name[0], col=column_collation_name[1]))
Example #6
    def test_inventor_table_generator(self, config):
        from updater.post_processing.post_process_inventor import create_inventor
        create_inventor(config)
        from sqlalchemy import create_engine
        import pandas as pd
        engine = create_engine(get_connection_string(config, "NEW_DB"))
        count_query = "SELECT count(1)  as `rows` from inventor"
        distinct_id_query = "SELECT count(distinct inventor_id) inventors from rawinventor"
        column_query = """
                SELECT COLUMN_NAME
                from information_schema.COLUMNS
                where TABLE_SCHEMA='{schema}'
                        and TABLE_NAME='inventor'""".format(
            schema=config["DATABASE"]["NEW_DB"])

        column_data = pd.read_sql_query(column_query, con=engine)
        expected_columns = ['id', 'name_first', 'name_last']
        actual_column_names = column_data.COLUMN_NAME.tolist()
        assert len(actual_column_names) == len(expected_columns)
        assert set(actual_column_names) == set(expected_columns)

        inv_count_data = pd.read_sql_query(count_query, con=engine)
        rinv_distinct_count_data = pd.read_sql_query(distinct_id_query,
                                                     con=engine)
        inv_count = inv_count_data.rows.tolist()[0]
        rinv_distinct_counts = rinv_distinct_count_data.inventors.tolist()[0]
        assert inv_count == rinv_distinct_counts

        duplicate_query = "SELECT count(1) dups from (SELECT count(1)   from inventor group by id having count(1)>1)x"
        dup_count = pd.read_sql_query(duplicate_query, con=engine)
        assert (dup_count.dups.tolist()[0] == 0)
        engine.execute("TRUNCATE TABLE inventor")
Example #7
def fetch_session(dbtype='grant'):
    """
    Read from ../project/Development/config.ini file and load appropriate database

    @dbtype: string indicating if we are fetching the session for
             the grant database or the application database
    """
    config = get_config()
    echo = True
    if dbtype == 'grant':  # this is here so we can port to work for applications also
        read_database = "NEW_DB"
        cstr = get_connection_string(config, read_database)
        engine = create_engine(cstr)

    #     engine = create_engine('mysql+mysqldb://{0}:{1}@{2}/{3}?charset=utf8'.format(
    #         config['DATABASE']['USERNAME'],
    #         config['DATABASE']['PASSWORD'],
    #         config['DATABASE']['HOST'],
    #         config['DATABASE'][read_database], echo=echo))

    schema.GrantBase.metadata.create_all(engine)

    Session = sessionmaker(bind=engine, _enable_transaction_accounting=False)
    session = Session()
    return session
Example #8
def prepare_tables(config):
    cstr = get_connection_string(config, 'NEW_DB')
    engine = create_engine(cstr + "&local_infile=1")
    timestamp = str(int(time.time()))
    with engine.connect() as connection:
        connection.execute(
            "CREATE TABLE rawlawyer_copy_backup_{} LIKE rawlawyer;".format(
                timestamp))
        connection.execute(
            "INSERT INTO rawlawyer_copy_backup_{} SELECT * FROM rawlawyer".
            format(timestamp))

    with engine.connect() as connection:
        connection.execute(
            "ALTER TABLE rawlawyer ADD COLUMN alpha_lawyer_id varchar(128) AFTER organization;"
        )

    with engine.connect() as connection:
        connection.execute(
            "UPDATE rawlawyer rc SET rc.alpha_lawyer_id  = rc.organization WHERE rc.organization IS NOT NULL;"
        )
        connection.execute(
            "UPDATE rawlawyer rc SET rc.alpha_lawyer_id  = concat(rc.name_first, '|', rc.name_last) WHERE "
            "rc.name_first IS NOT NULL AND rc.name_last IS NOT NULL;")
        connection.execute(
            "UPDATE rawlawyer rc SET rc.alpha_lawyer_id  = '' WHERE rc.alpha_lawyer_id IS NULL;"
        )
Example #9
def upload_disambig_results(update_config):
    engine = create_engine(get_connection_string(update_config, "NEW_DB"))
    disambig_output_file = "{wkfolder}/disambig_output/{disamb_file}".format(
        wkfolder=update_config['FOLDERS']['WORKING_FOLDER'],
        disamb_file="inventor_disambiguation.tsv")
    disambig_output = pd.read_csv(disambig_output_file,
                                  sep="\t",
                                  chunksize=300000,
                                  header=None,
                                  quoting=csv.QUOTE_NONE,
                                  names=[
                                      'unknown_1', 'uuid', 'inventor_id',
                                      'name_first', 'name_middle', 'name_last',
                                      'name_suffix'
                                  ])
    count = 0
    for disambig_chunk in disambig_output:
        engine.connect()
        start = time.time()
        count += disambig_chunk.shape[0]
        disambig_chunk[["uuid", "inventor_id"
                        ]].to_sql(name='inventor_disambiguation_mapping',
                                  con=engine,
                                  if_exists='append',
                                  index=False,
                                  method='multi')
        end = time.time()
        print("It took {duration} seconds to get to {cnt}".format(
            duration=round(end - start, 3), cnt=count))
        engine.dispose()
Example #10
def upload_disambig_results(update_config):
    engine = create_engine(get_connection_string(update_config, "NEW_DB"))
    disambig_output_file = "{wkfolder}/disambig_output/{disamb_file}".format(
        wkfolder=update_config['FOLDERS']['WORKING_FOLDER'],
        disamb_file="assignee_disambiguation.tsv")
    disambig_output = pd.read_csv(
        disambig_output_file,
        sep="\t",
        chunksize=300000,
        header=None,
        quoting=csv.QUOTE_NONE,
        names=['uuid', 'organization_id', 'person_id', 'organization', 'name'])
    count = 0
    engine.execute("TRUNCATE TABLE assignee_disambiguation_mapping")
    for disambig_chunk in disambig_output:
        disambig_chunk = disambig_chunk.assign(
            assignee_id=disambig_chunk.organization_id)
        disambig_chunk.assignee_id.fillna(disambig_chunk.person_id,
                                          inplace=True)
        disambig_chunk.drop(["organization_id", "person_id"],
                            axis=1,
                            inplace=True)
        count += disambig_chunk.shape[0]
        engine.connect()
        start = time.time()
        disambig_chunk[["uuid", "assignee_id"
                        ]].to_sql(name='assignee_disambiguation_mapping',
                                  con=engine,
                                  if_exists='append',
                                  index=False,
                                  method='multi')
        end = time.time()
        print("It took {duration} seconds to get to {cnt}".format(
            duration=round(end - start, 3), cnt=count))
        engine.dispose()
Example #11
    def test_inventor_precache(self, config):
        from updater.post_processing.post_process_inventor import precache_inventors
        from sqlalchemy import create_engine
        import pandas as pd

        count_query = """
        SELECT count(1) as `rows` from disambiguated_inventor_ids;
        """
        distinct_count_query = """
        SELECT count(distinct inventor_id) distinct_inventor_ids from rawinventor;
        """
        duplicate_query = """
        SELECT count(1) as duplicates
            from (SELECT inventor_id from disambiguated_inventor_ids group by inventor_id having count(1) > 1) x
        """

        engine = create_engine(get_connection_string(config, "NEW_DB"))
        pre_count = pd.read_sql_query(count_query, engine)
        assert (pre_count.rows.tolist()[0] == 0)
        precache_inventors(config)
        post_count = pd.read_sql_query(count_query, engine)
        distinct_count = pd.read_sql_query(distinct_count_query, engine)
        assert (post_count.rows.tolist()[0] ==
                distinct_count.distinct_inventor_ids.tolist()[0])
        duplicate_count = pd.read_sql_query(duplicate_query, engine)
        assert (duplicate_count.duplicates.tolist()[0] == 0)
Example #12
 def test_inventor_generator(self, config, limit, offset, expected):
     from updater.post_processing.post_process_inventor import generate_disambiguated_inventors
     from sqlalchemy import create_engine
     engine = create_engine(get_connection_string(config, "NEW_DB"))
     inventors = generate_disambiguated_inventors(engine, limit, offset)
     assert (len(inventors.inventor_id.unique()) == expected)
     # disambiguated id, name first, name last, patent date
     assert (inventors.shape[1] == 4)
Example #13
def upload_cpc_classes(config):
    cstr = get_connection_string(config, "NEW_DB")
    db_con = create_engine(cstr)
    cpc_folder = '{}/{}'.format(config['FOLDERS']['WORKING_FOLDER'],
                                'cpc_output')

    upload_cpc_small_tables(db_con, config['DATABASE']['NEW_DB'], cpc_folder)
    upload_cpc_subgroup(db_con, config['DATABASE']['NEW_DB'], cpc_folder)
Example #14
def clean_data():
    yield
    config = get_config()
    config["DATABASE"]["NEW_DB"] = 'sarvo_test_db'
    from sqlalchemy import create_engine
    engine = create_engine(get_connection_string(config, "NEW_DB"))
    engine.execute("TRUNCATE TABLE inventor;")
    engine.execute("TRUNCATE TABLE disambiguated_inventor_ids;")
Example #15
def extract_wipo_data(cpc_chunk, cpc_ipc_concordance, ipc_tech_map, config):
    # Obtain IPC Concordance for each patent based on cpc subgroup ID
    cpc_current_with_concordance = cpc_chunk.merge(right=cpc_ipc_concordance,
                                                   how='left',
                                                   left_on='subgroup_id',
                                                   right_on='cpc_code').drop(
                                                       "cpc_code", axis=1)
    # If concordance does not exist, use subgroup id as IPC code
    cpc_current_with_concordance.ipc_code.fillna(
        cpc_current_with_concordance.subgroup_id, inplace=True)
    # Create lookup fields for IPC Wipo technology field id
    cpc_current_with_concordance = cpc_current_with_concordance.assign(
        section=cpc_current_with_concordance.ipc_code.str.slice(0, 4))
    cpc_current_with_concordance = cpc_current_with_concordance.assign(
        group=cpc_current_with_concordance.ipc_code.str.split("/",
                                                              expand=True)[0])
    # Lookup IPC Tech field ID (WIPO field id)
    # First lookup using "section" column
    cpc_current_with_wito_merge_1 = cpc_current_with_concordance.merge(
        right=ipc_tech_map, how='left', left_on='section',
        right_on='IPC_Code').drop('IPC_Code',
                                  axis=1).rename({"Field_number": "field_id"},
                                                 axis=1)
    # For failed lookups use "group" field
    secondary_lookup = cpc_current_with_wito_merge_1.merge(right=ipc_tech_map,
                                                           how='left',
                                                           left_on='group',
                                                           right_on='IPC_Code')
    # Merge the secondary lookup with main dataset (Merge by index)
    wipo_data_with_merge = cpc_current_with_wito_merge_1.join(
        secondary_lookup[["Field_number"]])
    wipo_data_with_merge.field_id.fillna(wipo_data_with_merge.Field_number,
                                         inplace=True)
    # Clean UP
    wipo_data = wipo_data_with_merge.dropna(subset=['field_id'], axis=0).drop(
        ["subgroup_id", "ipc_code", "section", "group", "Field_number"],
        axis=1)
    # Counter for Each Field ID for each patent
    wipo_count = wipo_data.groupby(["patent_id",
                                    "field_id"]).size().to_frame('wipo_count')
    wipo_count = wipo_count.reset_index()
    # Retain Top 3 most frequent Wipo field IDs
    wipo_filtered_data = wipo_count.groupby("patent_id").apply(
        lambda _df: _df.nlargest(3, 'wipo_count', keep='all')).reset_index(
            drop=True)
    # Assign Sequence
    wipo_filtered_data_sequenced = wipo_filtered_data.drop(
        ["wipo_count"], axis=1).assign(
            sequence=wipo_filtered_data.groupby(['patent_id']).cumcount())
    cstr = get_connection_string(config, "TEMP_UPLOAD_DB")
    print(cstr)
    engine = create_engine(cstr)
    with engine.begin() as conn:
        wipo_filtered_data_sequenced.to_sql('wipo',
                                            conn,
                                            if_exists='append',
                                            index=False,
                                            method="multi")
Example #16
def create_new_database(config):
    new_database = config["DATABASE"]["NEW_DB"]
    connection_string = get_connection_string(config, "OLD_DB")
    engine = create_engine(connection_string)
    connection = engine.connect()
    connection.execute(
        'create schema {}  default character set=utf8mb4 default collate=utf8mb4_unicode_ci'
        .format(new_database))
    connection.close()
Example #17
def prepare_cpc_table(config, drop_indexes):
    """
    Prepare the CPC Current table by dropping Indexes
    :param config: Config file containing various runtime parameters
    :param drop_indexes: List of Drop Index Statements
    """
    engine = create_engine(get_connection_string(config, "NEW_DB"))
    for drop_statement in drop_indexes:
        engine.execute(drop_statement[0])
Example #18
 def __init__(self, config):
     self.config = config
     self.new_db = config['DATABASE']['NEW_DB']
     self.old_db = config['DATABASE']['OLD_DB']
     self.old_id_col = 'disamb_inventor_id_{}'.format(self.old_db[-8:])
     self.new_id_col = 'disamb_inventor_id_{}'.format(self.new_db[-8:])
     self.cstr = get_connection_string(config, database='NEW_DB')
     self.id_to_gender = {}
     pass
Example #19
def precache_assignee(config):
    assignee_cache_query = """
    INSERT INTO disambiguated_assignee_ids (assignee_id)
    SELECT distinct assignee_id from rawassignee
    UNION
    SELECT distinct assignee_id from {pgpubs_database}.rawassignee;
    """.format(pgpubs_database=config['DATABASE']['PGPUBS_DATABASE'])
    engine = create_engine(get_connection_string(config, "NEW_DB"))
    engine.execute(assignee_cache_query)
Example #20
 def __init__(self, config):
     self.config = config
     self.qa_connection_string = get_connection_string(self.config, 'QA_DATABASE')
     self.connection = pymysql.connect(host=self.config['DATABASE']['HOST'],
                                       user=self.config['DATABASE']['USERNAME'],
                                       password=self.config['DATABASE']['PASSWORD'],
                                       db=self.config['DATABASE']['NEW_DB'],
                                       charset='utf8mb4', cursorclass=pymysql.cursors.SSCursor, defer_connect=True)
     self.qa_data = {"DataMonitor_patentwithdrawncount": []}
Example #21
def generate_timestamp_uploads(update_config):
    working_folder = update_config['FOLDERS']['WORKING_FOLDER']
    connection_string = get_connection_string(update_config, "TEMP_UPLOAD_DB")
    parsed_data_folder = "{working_folder}/{parsed_folder}".format(
        working_folder=working_folder, parsed_folder="parsed_data")
    for timestamp_folder in os.listdir(parsed_data_folder):
        timestamp_folder_full_path = "{source_root}/{folder_name}/".format(
            source_root=parsed_data_folder, folder_name=timestamp_folder)
        upload_from_timestamp_folder(timestamp_folder_full_path,
                                     connection_string)
Example #22
    def __init__(self, config, database_section, start_date, end_date):
        """
        Do not instantiate. Overridden class constructor
        :param config: Configparser object containing update parameters
        :param database_section: Section in config that indicates database to use. NEW_DB or TEMP_UPLOAD_DB
        :param start_date: Database Update start date
        :param end_date: Database Update end date
        """
        # Tables that do not directly link to patent table
        self.patent_exclusion_list = [
            'mainclass', 'mainclass_current', 'subclass', 'subclass_current',
            'patent', 'rawlocation'
        ]
        # Update start and end date
        self.start_date = start_date
        self.end_date = end_date
        # Indicator for Upload/Patents database
        self.database_section = database_section

        self.qa_connection_string = get_connection_string(
            config, 'QA_DATABASE')
        self.connection = pymysql.connect(
            host=config['DATABASE']['HOST'],
            user=config['DATABASE']['USERNAME'],
            password=config['DATABASE']['PASSWORD'],
            db=config['DATABASE'][database_section],
            charset='utf8mb4',
            cursorclass=pymysql.cursors.SSCursor,
            defer_connect=True)
        # self.database_connection_string = get_connection_string(config, database_section)
        self.config = config
        # Place Holder for saving QA counts - keys map to table names in patent_QA
        self.qa_data = {
            "DataMonitor_count": [],
            'DataMonitor_nullcount': [],
            'DataMonitor_patentyearlycount': [],
            'DataMonitor_categorycount': [],
            'DataMonitor_floatingpatentcount': [],
            'DataMonitor_maxtextlength': [],
            'DataMonitor_prefixedentitycount': []
        }

        database_type, version = self.config["DATABASE"][
            self.database_section].split("_")
        self.version = version
        self.database_type = database_type

        # Following variables are overridden by inherited classes
        # Dict of tables involved on QC checks
        self.table_config = {}
        # Prefix indicates database where patent table is available
        # Current database if prefix is None
        # Used for Text databases
        self.patent_db_prefix = None
Example #23
def consolidate_cpc_data(config, add_indexes):
    """
    Finalize CPC Current table by removing patents not in patent database and re-adding indexes
    :param config: Config file containing various runtime parameters
    :param add_indexes: List of Add Index statements
    """
    engine = create_engine(get_connection_string(config, "NEW_DB"))
    delete_query = "DELETE cpc FROM cpc_current cpc LEFT JOIN patent p on p.id = cpc.patent_id WHERE p.id is null"
    engine.execute(delete_query)
    for add_statement in add_indexes:
        engine.execute(add_statement[0])
Example #24
def start_lawyer_disambiguation(config):
    prepare_tables(config)
    clean_rawlawyer(config)
    load_clean_rawlawyer(config)
    disambiguator = LawyerDisambiguator(config)
    disambiguator.run_disambiguation()
    cstr = get_connection_string(config, 'NEW_DB')
    engine = create_engine(cstr + "&local_infile=1")

    engine.execute("ALTER TABLE rawlawyer DROP alpha_lawyer_id")
    engine.dispose()
Example #25
def post_process_location(config):
    engine = create_engine(get_connection_string(config, "NEW_DB"))
    disambiguated_folder = "{}/disambig_output".format(config['FOLDERS']['WORKING_FOLDER'])
    print('here!', flush=True)
    id_lat_long_lookup, lat_long_cannonical_name = make_lookup(disambiguated_folder)
    print('made lookup', flush=True)
    fips_lookups = create_fips_lookups(config['FOLDERS']['PERSISTENT_FILES'])
    upload_location(engine, lat_long_cannonical_name, disambiguated_folder, fips_lookups)
    print('done locupload ', flush=True)
    process_rawlocation(engine, lat_long_cannonical_name, id_lat_long_lookup, disambiguated_folder)
    print('done process', flush=True)
    upload_rawloc(engine, disambiguated_folder, config['DATABASE']['NEW_DB'])
Example #26
def update_rawinventor(update_config):
    engine = create_engine(get_connection_string(update_config, "NEW_DB"))
    update_statement = "UPDATE rawinventor ri left join inventor_disambiguation_mapping idm on idm.uuid = ri.uuid " \
                       "set ri.inventor_id = idm.inventor_id"
    engine.execute(update_statement)
Example #27
def update_rawassignee(update_config):
    engine = create_engine(get_connection_string(update_config, "NEW_DB"))
    update_statement = "UPDATE rawassignee ra left join assignee_disambiguation_mapping adm on adm.uuid = ra.uuid " \
                       "set ra.assignee_id = adm.assignee_id"
    engine.execute(update_statement)
Example #28
 def test_table_encoding(self):
     new_database = self.config['DATABASE']["TEMP_UPLOAD_DB"]
     connection_string = get_connection_string(self.config, database='TEMP_UPLOAD_DB')
     engine = create_engine(connection_string)
     collation_query = "SELECT  TABLE_NAME, TABLE_COLLATION from information_schema.tables where TABLE_SCHEMA='{new_db}'".format(
         new_db=new_database)
     collation_cursor = engine.execute(collation_query)
     for table_collation_row in collation_cursor:
         if table_collation_row[1] != 'utf8mb4_unicode_ci':
             raise AssertionError(
                 "Table  collation should be utf8mb4_unicode_ci instead found {collation} for table {tbl}".format(
                     cset=table_collation_row[1], tbl=table_collation_row[0]))
Example #29
def create_persistent_wide_entity(config, entity):
    db_con = create_engine(
        get_connection_string(config, 'NEW_DB') + '&local_infile=1')
    disambig_folder = "{}/disambig_output/".format(
        config['FOLDERS']['WORKING_FOLDER'])

    old_db = config['DATABASE']['OLD_DB']
    new_db = config['DATABASE']['NEW_DB']
    new_db_timestamp = new_db.replace('patent_', '')

    # set of values that change depending on entity
    persistent_long_table = 'persistent_{0}_disambig_long'.format(entity)
    raw_table = 'raw{0}'.format(entity)
    id_col = '{0}_id'.format(entity)

    outfile_name_wide = 'persistent_{}_wide.tsv'.format(entity)
    outfile_fp_wide = disambig_folder + outfile_name_wide

    persistent_disambig_table = 'persistent_{0}_disambig'.format(entity)

    # get disambig cols from old db's persistent_inventor_disambig
    disambig_cols = get_wide_entity_disambig_cols(db_con, old_db,
                                                  persistent_disambig_table)

    # Add new column for this data update:
    raw_cols = [
        'current_{0}_id'.format(raw_table), 'old_{0}_id'.format(raw_table)
    ]
    header_wide = [raw_cols[0], raw_cols[1]] + disambig_cols + [
        'disamb_{0}_id_'.format(entity) + new_db_timestamp
    ]
    print(header_wide)
    header_df = pd.DataFrame(columns=header_wide)
    header_df.to_csv(outfile_fp_wide, index=False, header=True, sep='\t')
    # get total rows in raw entity table
    result = db_con.execute('select count(*) from {0}.{1}'.format(
        new_db, raw_table))
    total_rows = [r[0] for r in result][0]
    write_wide_outfile(db_con, new_db, old_db, entity, persistent_long_table,
                       raw_table, id_col, total_rows, outfile_fp_wide,
                       header_df)

    ####### 3. create table in database
    create_wide_table_database(db_con, entity, persistent_disambig_table,
                               outfile_fp_wide)

    ######### 4. load data
    db_con.execute(
        "LOAD DATA LOCAL INFILE '{0}' INTO TABLE {1}.{2} FIELDS TERMINATED BY '\t' NULL DEFINED BY '' IGNORE 1 lines;"
        .format(outfile_fp_wide, new_db, persistent_disambig_table))

    return True
Example #30
    def test_database_encoding(self):
        new_database = self.config['DATABASE']["TEMP_UPLOAD_DB"]
        connection_string = get_connection_string(self.config, database='TEMP_UPLOAD_DB')
        engine = create_engine(connection_string)

        collation_query = "SELECT DEFAULT_CHARACTER_SET_NAME, DEFAULT_COLLATION_NAME from information_schema.schemata where SCHEMA_NAME='{new_db}'".format(
            new_db=new_database)
        collation_cursor = engine.execute(collation_query)
        cset, collation = collation_cursor.fetchall()[0]
        if cset != 'utf8mb4':
            raise AssertionError("Database character set should be utf8mb4 instead found {cset}".format(cset=cset))
        if collation != 'utf8mb4_unicode_ci':
            raise AssertionError(
                "Database collationshould be utf8mb4_unicode_ci instead found {collation}".format(cset=collation))