def test_reserved_words(self, engine, connection):
    """Presto uses double quotes, not backticks"""
    fake_table = Table(
        'select',
        MetaData(bind=engine),
        Column('current_timestamp', STRINGTYPE),
    )
    query = str(fake_table.select(fake_table.c.current_timestamp == 'a'))
    # Reserved identifiers must be double-quoted ...
    for expected in ('"select"', '"current_timestamp"'):
        self.assertIn(expected, query)
    # ... and never backtick-quoted (MySQL/Hive style).
    for rejected in ('`select`', '`current_timestamp`'):
        self.assertNotIn(rejected, query)
def test_to_sql(self, engine, conn):
    # TODO pyathena.error.OperationalError: SYNTAX_ERROR: line 1:305:
    # Column 'foobar' cannot be resolved.
    # def _format_bytes(formatter, escaper, val):
    #     return val.decode()
    table_name = "to_sql_{0}".format(str(uuid.uuid4()).replace("-", ""))
    # Explicit column order, applied via the DataFrame constructor.
    ordered_columns = [
        "col_int",
        "col_bigint",
        "col_float",
        "col_double",
        "col_string",
        "col_boolean",
        "col_timestamp",
        "col_date",
        # "col_binary",
    ]
    data = {
        "col_int": np.int32([1]),
        "col_bigint": np.int64([12345]),
        "col_float": np.float32([1.0]),
        "col_double": np.float64([1.2345]),
        "col_string": ["a"],
        "col_boolean": np.bool_([True]),
        "col_timestamp": [datetime(2020, 1, 1, 0, 0, 0)],
        "col_date": [date(2020, 12, 31)],
        # "col_binary": "foobar".encode(),
    }
    df = pd.DataFrame(data, columns=ordered_columns)
    df.to_sql(
        table_name,
        engine,
        schema=SCHEMA,
        index=False,
        if_exists="replace",
        method="multi",
    )
    table = Table(table_name, MetaData(bind=engine), autoload=True)
    expected_row = (
        1,
        12345,
        1.0,
        1.2345,
        "a",
        True,
        datetime(2020, 1, 1, 0, 0, 0),
        date(2020, 12, 31),
        # "foobar".encode(),
    )
    self.assertEqual(table.select().execute().fetchall(), [expected_row])
def test_to_sql(self, engine, conn):
    # TODO Add binary column (After dropping support for Python 2.7)
    table_name = "to_sql_{0}".format(str(uuid.uuid4()).replace("-", ""))
    frame = pd.DataFrame(
        {
            "col_int": np.int32([1]),
            "col_bigint": np.int64([12345]),
            "col_float": np.float32([1.0]),
            "col_double": np.float64([1.2345]),
            "col_string": ["a"],
            "col_boolean": np.bool_([True]),
            "col_timestamp": [datetime(2020, 1, 1, 0, 0, 0)],
            "col_date": [date(2020, 12, 31)],
        }
    )
    # Explicitly specify column order
    column_order = [
        "col_int",
        "col_bigint",
        "col_float",
        "col_double",
        "col_string",
        "col_boolean",
        "col_timestamp",
        "col_date",
    ]
    frame = frame[column_order]
    frame.to_sql(
        table_name,
        engine,
        schema=SCHEMA,
        index=False,
        if_exists="replace",
        method="multi",
    )
    reflected = Table(table_name, MetaData(bind=engine), autoload=True)
    expected = [
        (
            1,
            12345,
            1.0,
            1.2345,
            "a",
            True,
            datetime(2020, 1, 1, 0, 0, 0),
            date(2020, 12, 31),
        )
    ]
    self.assertEqual(reflected.select().execute().fetchall(), expected)
def test_to_sql(self, engine, conn):
    # Round-trip a single-column frame through to_sql and reflection.
    table_name = 'to_sql_{0}'.format(str(uuid.uuid4()).replace('-', ''))
    df = pd.DataFrame({'a': [1, 2, 3, 4, 5]})
    df.to_sql(
        table_name,
        engine,
        schema=SCHEMA,
        index=False,
        if_exists='replace',
        method='multi',
    )
    table = Table(table_name, MetaData(bind=engine), autoload=True)
    fetched = table.select().execute().fetchall()
    # Row order is not guaranteed, so compare after sorting.
    expected = [(value,) for value in (1, 2, 3, 4, 5)]
    self.assertEqual(sorted(fetched), expected)
def test_reflect_select(self, engine, conn):
    one_row_complex = Table("one_row_complex", MetaData(bind=engine), autoload=True)
    self.assertEqual(len(one_row_complex.c), 15)
    self.assertIsInstance(one_row_complex.c.col_string, Column)
    rows = one_row_complex.select().execute().fetchall()
    self.assertEqual(len(rows), 1)
    expected_values = [
        True,
        127,
        32767,
        2147483647,
        9223372036854775807,
        0.5,
        0.25,
        "a string",
        datetime(2017, 1, 1, 0, 0, 0),
        date(2017, 1, 2),
        b"123",
        "[1, 2]",
        "{1=2, 3=4}",
        "{a=1, b=2}",
        Decimal("0.1"),
    ]
    self.assertEqual(list(rows[0]), expected_values)
    # Reflected SQLAlchemy type per column, checked table-driven.
    type_expectations = [
        ("col_boolean", BOOLEAN),
        ("col_tinyint", INTEGER),
        ("col_smallint", INTEGER),
        ("col_int", INTEGER),
        ("col_bigint", BIGINT),
        ("col_float", FLOAT),
        ("col_double", FLOAT),
        ("col_string", type(STRINGTYPE)),
        ("col_timestamp", TIMESTAMP),
        ("col_date", DATE),
        ("col_binary", BINARY),
        ("col_array", type(STRINGTYPE)),
        ("col_map", type(STRINGTYPE)),
        ("col_struct", type(STRINGTYPE)),
        ("col_decimal", DECIMAL),
    ]
    for column_name, expected_type in type_expectations:
        self.assertIsInstance(one_row_complex.c[column_name].type, expected_type)
def save_from_text_to_database(engine: Engine, df: pd.DataFrame):
    """Persist scraped court-decision rows into the relational schema.

    Saving these fields
        Column('language', String),
        Column('chamber', String),
        Column('date', Date),
        Column('file_name', String),
        Column('file_number', String),
        Column('file_number_additional', String),
        Column('html_url', String),
        Column('html_raw', String),
        Column('pdf_url', String),
        Column('pdf_raw', String),

    Existing `file`/`decision` rows for the incoming file names are deleted
    first, then re-inserted, since no upsert command is available.
    All lookups use bound parameters instead of f-string interpolation so
    that quotes in scraped values cannot break or inject into the SQL.
    """

    def save_to_db(df: pd.DataFrame, table: str):
        # If the returned df is not a DataFrame but a Series, convert it into
        # a one-row DataFrame (transpose corrects the orientation). Not needed
        # for most courts, but an edge case requires it.
        if not isinstance(df, pd.DataFrame):
            df = df.to_frame()
            df = df.T
        df.to_sql(table, engine, if_exists="append", index=False)

    def add_ids_to_df_for_decision(series: pd.DataFrame) -> pd.DataFrame:
        # Resolve the foreign keys needed for the `decision` row.
        series['file_id'] = pd.read_sql(
            text("SELECT file_id FROM file WHERE file_name = :file_name"),
            engine.connect(),
            params={"file_name": series['file_name']},
        )["file_id"][0]
        series['language_id'] = -1
        chamber_id = pd.read_sql(
            text("SELECT chamber_id FROM chamber WHERE chamber_string = :chamber"),
            engine.connect(),
            params={"chamber": series['chamber']},
        )['chamber_id']
        if len(chamber_id) == 0:
            print(f"The chamber {series['chamber']} was not found in the database. "
                  f"Add it with the respective court and spider")
            raise ValueError(f"Chamber not found: {series['chamber']}")
        series['chamber_id'] = chamber_id[0]
        # Deterministic decision id derived from the file name (uuid5 on the
        # nil-UUID namespace), so reruns produce the same id.
        series['decision_id'] = uuid.uuid5(uuid.UUID(int=0), series['file_name'])
        # TODO: Add topic recognition, similar to the title of the court decision
        series['topic'] = ''
        return series

    def save_the_file_numbers(series: pd.DataFrame) -> pd.DataFrame:
        """
        Saves the file_number for each of the decision ids

        :param series:
        :return:
        """
        series['decision_id'] = pd.read_sql(
            text("SELECT decision_id FROM decision WHERE file_id = :file_id"),
            engine.connect(),
            params={"file_id": series['file_id']},
        )["decision_id"][0]
        with engine.connect() as conn:
            t = Table('file_number', MetaData(), autoload_with=engine)
            # Delete and reinsert as no upsert command is available
            stmt = t.delete().where(delete_stmt_decisions_with_df(series))
            conn.execute(stmt)
        series['text'] = series['file_number'].strip()  # .map(lambda x: x.strip())
        save_to_db(series[['decision_id', 'text']], 'file_number')
        if ('file_number_additional' in series
                and series['file_number_additional'] is not None
                and len(series['file_number_additional']) > 0):
            series['text'] = series['file_number_additional'].strip()  # .map(lambda x: x.strip())
            save_to_db(series[['decision_id', 'text']], 'file_number')
        return series

    if df.empty:
        return

    # Delete old decision and file entries
    with engine.connect() as conn:
        t_fil = Table('file', MetaData(), autoload_with=engine)
        t_dec = Table('decision', MetaData(), autoload_with=engine)
        file_names = [str(item) for item in df['file_name'].tolist()]
        # Bound IN clauses instead of hand-quoted comma-joined literals.
        stmt = t_fil.select().where(t_fil.c.file_name.in_(file_names))
        file_ids = [item['file_id'] for item in conn.execute(stmt).all()]
        if len(file_ids) > 0:
            conn.execute(t_dec.delete().where(t_dec.c.file_id.in_(file_ids)))
            conn.execute(t_fil.delete().where(t_fil.c.file_id.in_(file_ids)))

    save_to_db(df[['file_name', 'html_url', 'pdf_url', 'html_raw', 'pdf_raw']], 'file')
    df = df.apply(add_ids_to_df_for_decision, 1)
    # Convert pandas NaT values (Non-Type for Datetime) to None using np
    # as np recognizes these types; the driver then stores NULL.
    df = df.replace({np.NaN: None})
    df['date'] = df['date'].replace(r'^\s*$', None, regex=True)
    df['date'] = df['date'].astype('datetime64[ns]')
    save_to_db(df[['language_id', 'chamber_id', 'file_id', 'date', 'topic']], 'decision')
    df.apply(save_the_file_numbers, 1)
# -*- encoding: utf-8 -*-
'''
Created on 2014-11-5

@author: [email protected]

Dumps every row of the `itgfz_member` table, reflecting its schema from
the live MySQL database.
'''
from sqlalchemy.sql.schema import MetaData, Table

# NOTE(review): credentials and host are hard-coded here; move them into
# configuration/environment variables before shipping.
metadata = MetaData('mysql://%s:%s@%s/%s?charset=utf8' %
                    ('root', 'root', '172.16.109.105:3306', 'itgfz2014'))

if __name__ == '__main__':
    # Reflect the table definition from the database at runtime.
    mem_tab = Table('itgfz_member', metadata, autoload=True)
    stat = mem_tab.select()
    # print() function form works under both Python 2 and Python 3
    # (the original py2 print statements are syntax errors on py3).
    print(stat)
    r = stat.execute()
    print(list(r.fetchall()))