def pro_cache_data_daily(self, df, index, cached_end):
    dtypedict = {
        'ts_code': NVARCHAR(length=10),
        'trade_date': NVARCHAR(length=8),
        'open': Float(),
        'high': Float(),
        'low': Float(),
        'close': Float(),
        'pre_close': Float(),
        'change': Float(),
        'pct_chg': Float(),
        'vol': Float(),
        'amount': Float()
    }
    if cached_end is None:
        print('######## cache new trade daily ############')
        new = df
    else:
        new = df[df.trade_date > cached_end]
    if index:
        new.to_sql(self.tables['index_trade_daily'], con=self.conn,
                   if_exists='append', index=False, dtype=dtypedict)
    else:
        new.to_sql(self.tables['stock_trade_daily'], con=self.conn,
                   if_exists='append', index=False, dtype=dtypedict)
def futureGame2mysql(game):
    game = pd.DataFrame(game).T
    game['awayTeam'] = "nan"
    game['homeTeam'] = "nan"
    game['away_win_rate'] = 0.0
    game['home_win_rate'] = 0.0
    gameTeam = game.iloc[0]['gameTeam']
    gameTeam = gameTeam.split("vs")
    game['awayTeam'] = gameTeam[0]
    game['homeTeam'] = gameTeam[1]
    game.drop(columns=['gameOver', 'gameTeam'], inplace=True)  # 'columns=' already implies axis=1
    game['gameId'] = game['gameId'].astype(int)
    dtype_dict = {
        game.columns[0]: Integer(),
        game.columns[1]: NVARCHAR(length=32),
        game.columns[2]: NVARCHAR(length=3),
        game.columns[3]: NVARCHAR(length=3),
        game.columns[4]: Float(precision=6, asdecimal=True),
        game.columns[5]: Float(precision=6, asdecimal=True),
    }
    try:
        game.to_sql(name='future_game', con=con, if_exists='append',
                    index=False, dtype=dtype_dict)
    except Exception:
        # Avoid a bare except so KeyboardInterrupt/SystemExit still propagate.
        print('[futureGame2mysql] game ' + str(game['gameId']) + ' already exists, skip.')
class Tls206Person(Base):
    __tablename__ = 'tls206_person'

    person_id = Column(INT, primary_key=True, default=0)
    person_name = Column(NVARCHAR(500), default='')
    person_address = Column(NVARCHAR(1000), default='')
    person_ctry_code = Column(CHAR(2), default='')
    doc_std_name_id = Column(INT, default=0)
    doc_std_name = Column(NVARCHAR(500), default='')
    psn_id = Column(INT, default=0)
    psn_name = Column(NVARCHAR(500), default='')
    psn_level = Column(SMALLINT, default=0)
    psn_sector = Column(VARCHAR(50), default='')
def recap2mysql(recap):
    # dtype maps column names to SQLAlchemy types: {"col_name": type}
    dtype_dict = {
        recap.columns[0]: NVARCHAR(length=8),
        recap.columns[1]: NVARCHAR(length=64),
        recap.columns[2]: Text(),
        recap.columns[3]: NVARCHAR(length=32),
        recap.columns[4]: NVARCHAR(length=256)
    }
    try:
        recap.to_sql(name='recap', con=con, if_exists='append',
                     index=False, dtype=dtype_dict)
    except Exception:
        print('[recap2mysql] Duplicate insert, skip.')
class Tls215CitnCateg(Base):
    __tablename__ = 'tls215_citn_categ'

    pat_publn_id = Column(INT, primary_key=True, default=0)
    citn_replenished = Column(INT, primary_key=True, default=0)
    citn_id = Column(SMALLINT, primary_key=True, default=0)
    citn_categ = Column(NVARCHAR(10), primary_key=True, default='')
    relevant_claim = Column(SMALLINT, primary_key=True, default=0)
def insert_db(data, table_name, write_index, primary_keys, is_truncate=False):
    # Insert into the default database.
    engine_mysql = engine()
    # Use the inspector (see http://docs.sqlalchemy.org/en/latest/core/reflection.html)
    # to check whether the table already has a primary key.
    insp = inspect(engine_mysql)
    col_name_list = data.columns.tolist()
    # If the index is written too, prepend its level names so they also get
    # the NVARCHAR dtype below.
    if write_index:
        col_name_list = list(data.index.names) + col_name_list
    con = engine_mysql.connect()
    if is_truncate:
        delete(table_name, is_truncate=is_truncate)
    data.to_sql(
        name=table_name,
        con=con,
        if_exists='append',
        dtype={col_name: NVARCHAR(length=255) for col_name in col_name_list},
        index=write_index)
    # Add a primary key if the table does not have one yet.
    # get_pk_constraint returns a dict, so comparing it to [] is always False;
    # check its 'constrained_columns' list instead.
    if not insp.get_pk_constraint(table_name)['constrained_columns']:
        try:
            con.execute('ALTER TABLE `%s` ADD PRIMARY KEY (%s);' % (table_name, primary_keys))
        except Exception as e:
            print("################## ADD PRIMARY KEY ERROR :", e)
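# A minimal, hedged sketch of the primary-key check insert_db relies on (the
# table name 'demo' and the in-memory SQLite URL are illustrative assumptions):
# inspect(engine).get_pk_constraint() returns a dict whose 'constrained_columns'
# list is empty when the table has no primary key.
from sqlalchemy import create_engine, inspect, text

engine = create_engine('sqlite:///:memory:')
with engine.begin() as conn:
    conn.execute(text('CREATE TABLE demo (code VARCHAR(16), name VARCHAR(16))'))
pk = inspect(engine).get_pk_constraint('demo')
print(pk['constrained_columns'])  # [] -> safe to issue ALTER TABLE ... ADD PRIMARY KEY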
def df2db_update(delete_condition, block_basic_df, block_member_df):
    dtypedict = {
        'data_source': NVARCHAR(length=16),
        'block_category': NVARCHAR(length=16),
        'block_type': NVARCHAR(length=16),
        'block_name': NVARCHAR(length=16),
        'block_code': NVARCHAR(length=16),
        'ts_code': NVARCHAR(length=16),
        'member_count': Integer,
        'create_time': DateTime
    }
    # Delete the matching rows first; before calling this function, the related
    # data must already have been removed from the block_member table.
    mysql.exec(delete_table_common.format(f"block_member where {delete_condition}"))
    # chunksize: for bulk inserts, pandas automatically splits the data into
    # chunks of this size and inserts them batch by batch.
    pd.io.sql.to_sql(block_member_df, 'block_member', con=engine, if_exists='append',
                     index=False, dtype=dtypedict, chunksize=10000)
    # Delete the matching block_basic rows, then append the new ones.
    mysql.exec(delete_table_common.format(f"block_basic where {delete_condition}"))
    pd.io.sql.to_sql(block_basic_df, 'block_basic', con=engine, if_exists='append',
                     index=False, dtype=dtypedict, chunksize=10000)
def import_data():
    try:
        # Create mssql engine connection
        engine = create_engine('mssql+pyodbc://' + db_host + '/' + db_name +
                               '?driver=SQL+Server+Native+Client+11.0',
                               fast_executemany=True)
        # Read and filter data
        data = pd.read_excel(download_path, skiprows=4)
        data = data[data['Status'].isin(
            ['In Progress', 'New', 'Pending Acknowledgement'])]
        # Import data to mssql server
        data.to_sql(
            db_table_name,
            engine,
            if_exists='replace',
            index=False,
            dtype={col_name: NVARCHAR(length=255) for col_name in data})
        # Create a function attribute
        import_data.count = len(data)
        # Log info
        logging.info(f"Imported {len(data)} lines of data")
    except Exception:
        # Log error
        logging.error("Exception occurred", exc_info=True)
def team2mysql():
    teams = pd.read_csv('./data/teams/teams.csv')
    dtype_dict = {}
    for i, j in zip(teams.columns, teams.dtypes):
        if "object" in str(j):
            dtype_dict.update({i: NVARCHAR(length=32)})
        if "float" in str(j):
            dtype_dict.update({i: Float(precision=2, asdecimal=True)})
        if "int" in str(j):
            dtype_dict.update({i: Integer()})
    # These columns need more room than the default 32 characters.
    dtype_dict.update({'logoUrl': NVARCHAR(length=64),
                       'website': NVARCHAR(length=64),
                       'description': Text()})
    print(dtype_dict)
    # dtype maps column names to SQLAlchemy types: {"col_name": type}
    teams.to_sql(name='team', con=con, if_exists='replace', index=False, dtype=dtype_dict)
def _write_into_db(self):
    '''Write into the database.'''
    self.lock.acquire()  # keep other processes from writing to the database at the same time
    # ---------------------
    # 1.
    self.additions.to_sql('news_macro_source', self.engine,
                          index=False, if_exists='replace',
                          dtype={'title': VARCHAR(256),
                                 'news_time': VARCHAR(64),
                                 'href': VARCHAR(256),
                                 'content': NVARCHAR(),
                                 'update_datetime': DATETIME,
                                 'news_source': VARCHAR(32),
                                 'unique_flag': VARCHAR(256),
                                 'if_header': INT})
    try:
        if not self.if_header:
            self.session.execute(SQL_MERGE_MACRO_NEWS)
            self.session.commit()
        else:
            self.session.execute(SQL_MERGE_MACRO_NEWS_FOR_HEADER)
            self.session.commit()
            self.session.execute(SQL_MERGE_MACRO_NEWS_FOR_ALL)
            self.session.commit()
    except pymssql.OperationalError:
        print('Error of %s: Writing into database failed' % self.spider_name)
    # ---------------------
    self.lock.release()  # release the lock
def test_spider(self, if_write=False):
    self.header_generator = header_generator
    self.last_flag = None
    self.additions = pd.DataFrame()
    status_code = self._get_titles_response()
    print('TITLE RESPONSE STATUS: %s' % (str(status_code)))
    # ----------- parse the title pages and determine the incremental content in self.additions -----------
    # additions.columns = [title, href, news_time, content, news_source]
    # make sure additions has all of the columns above; fill missing content with None
    self._parse_titles_response()
    # ------------------------------------------------------------------
    for idx, row in self.additions.iterrows():
        href = row['href']
        status_code = self._get_content_response(href)
        print('CONTENT RESPONSE STATUS: %s' % (str(status_code)))
        self._parse_content_response(idx)
    if if_write:
        self.engine_obj = DatabaseEngine('xiaoyi')
        self.engine = self.engine_obj.get_engine()
        self.session = self.engine_obj.get_session()
        self.additions.to_sql('news_macro_source', self.engine,
                              index=False, if_exists='replace',
                              dtype={'title': VARCHAR(256),
                                     'news_time': VARCHAR(64),
                                     'href': VARCHAR(256),
                                     'content': NVARCHAR(),
                                     'update_datetime': DATETIME,
                                     'news_source': VARCHAR(32)})
        self.session.execute(SQL_MERGE_MACRO_NEWS)
        self.session.commit()
    self.last_flag = None
def insert_db(data, table_name, write_index, primary_keys):
    engine_mysql = engine()
    # Use the inspector (see http://docs.sqlalchemy.org/en/latest/core/reflection.html)
    # to check whether the table already has a primary key.
    insp = inspect(engine_mysql)
    col_name_list = data.columns.tolist()
    # If the index is written too, prepend its name so it also gets the
    # NVARCHAR dtype below.
    if write_index:
        col_name_list.insert(0, data.index.name)
    print(col_name_list)
    data.to_sql(
        name=table_name,
        con=engine_mysql,
        schema=MYSQL_DB,
        if_exists='append',
        dtype={col_name: NVARCHAR(length=255) for col_name in col_name_list},
        index=write_index)
    # Add a primary key if the table does not have one yet.
    # Inspector.get_primary_keys was removed from SQLAlchemy; use
    # get_pk_constraint, whose 'constrained_columns' list is empty when
    # there is no primary key.
    if not insp.get_pk_constraint(table_name)['constrained_columns']:
        with engine_mysql.connect() as con:
            try:
                con.execute('ALTER TABLE `%s` ADD PRIMARY KEY (%s);' % (table_name, primary_keys))
            except Exception as e:
                print("################## ADD PRIMARY KEY ERROR :", e)
class Organization(Base):
    """Represent an Organization in Crunchbase"""
    __tablename__ = 'tbl_Organization'

    id = Column(Integer, primary_key=True)
    name = Column(NVARCHAR(255), nullable=False)
    url = Column(VARCHAR(255), nullable=False, unique=True)
    ipo_date = Column(Date)
    stock_code = Column(VARCHAR(10))
    headquarters = Column(VARCHAR(255))
    description = Column(TEXT)
    categories = Column(TEXT)
    website = Column(VARCHAR(255))
    facebook = Column(VARCHAR(255))
    twitter = Column(VARCHAR(255))
    linkedin = Column(VARCHAR(255))
    found_date = Column(Date)
    aliases = Column(VARCHAR(255))

    acquisitions = relationship('Acquisition', back_populates='organization')
    founders = relationship('Founder', back_populates='organization')
    employees = relationship('Employee', back_populates='organization')
    competitors = relationship('Competitor', back_populates='focal_company')
    partners = relationship('Partner', back_populates='focal_company')
    board_members = relationship('BoardMember', back_populates='company')

    def __repr__(self):
        return self.name
def stock_basic_mysql_pandas(self, list_status="L", if_exists='append'):
    cur_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    df = self.get_stock_basic(list_status)
    col_name = df.columns.tolist()
    # col_name.insert(1, 'id')  # insert an id column at the front
    # col_name.insert(col_name.index('ts_code'), 'id')  # insert id before ts_code; index('ts_code')+1 would insert after it
    df = df.reindex(columns=col_name)  # reindex returns a new frame, so assign the result
    # df['id'] = range(1, len(df) + 1)  # do not assign ids like this; fetch the max id from the database and increment it
    df['list_status'] = [list_status] * len(df)
    df['create_time'] = [cur_time] * len(df)
    df['update_time'] = [cur_time] * len(df)
    dtypedict = {
        'id': BigInteger, 'ts_code': NVARCHAR(length=16), 'symbol': NVARCHAR(length=16),
        'name': NVARCHAR(length=16), 'area': NVARCHAR(length=16), 'industry': NVARCHAR(length=16),
        'fullname': NVARCHAR(length=64), 'enname': NVARCHAR(length=128), 'market': NVARCHAR(length=16),
        'exchange': NVARCHAR(length=16), 'curr_type': NVARCHAR(length=16), 'list_status': NVARCHAR(length=16),
        'list_date': NVARCHAR(length=16), 'delist_date': NVARCHAR(length=16), 'is_hs': NVARCHAR(length=16),
        'create_time': DateTime, 'update_time': DateTime
    }
    # dtypedict = self.mapping_df_types(df)  # cannot set VARCHAR lengths flexibly, so it is not general enough
    # df.to_sql('stock_basic_pd', self.engine, if_exists='append', index=False, dtype=dtypedict)  # append to the existing table
    # chunksize: for bulk inserts, pandas automatically splits the data into chunks of this size and inserts them batch by batch
    pd.io.sql.to_sql(df, 'stock_basic_pd', con=self.engine, if_exists=if_exists,
                     index=False, dtype=dtypedict, chunksize=10000)
def DialectSpecificText() -> String:  # pylint: disable=C0103
    """
    Args:
        N/A
    Returns:
        Text-like SQLAlchemy column dialect abstraction.  Types returned are:
            1) PostgreSQL -> Text
            2) MSSQL -> NVARCHAR(None)
            3) MySQL -> NVARCHAR(None)
            4) Default -> String
    Preconditions:
        N/A
    """
    return String()\
        .with_variant(Text, 'postgresql')\
        .with_variant(NVARCHAR(None), 'mssql')\
        .with_variant(NVARCHAR(None), 'mysql')
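# A hedged usage sketch of DialectSpecificText: TypeEngine.compile() renders
# the variant against a given dialect, which is a quick way to confirm the
# mapping the docstring describes. The printed DDL strings are expectations,
# not guarantees across SQLAlchemy versions.
from sqlalchemy.dialects import mssql, postgresql

text_type = DialectSpecificText()
print(text_type.compile(dialect=postgresql.dialect()))  # expected: TEXT
print(text_type.compile(dialect=mssql.dialect()))       # expected: NVARCHAR(max)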
def update_data(code):
    dtypedict = {'date': NVARCHAR(length=255), 'code': NVARCHAR(length=255)}
    df = ts.get_k_data(code=code, start=day_100_ago, end=day_today)
    df.to_sql(name='history', con=mysql, if_exists='append', dtype=dtypedict)
    global index_flag
    if not index_flag:
        with mysql.connect() as con:
            con.execute(
                'ALTER TABLE `history` ADD INDEX `index_code`(`code`);')
            con.execute(
                'ALTER TABLE `history` ADD INDEX `index_date`(`date`);')
        index_flag = True
    logging.info('update_data:%r', code)
class FarmAnimals(DeclarativeBase):
    __tablename__ = 'farm_animals'

    farm_id = Column(Integer, default=0)
    type_id = Column(Integer, default=0)  # animal type
    animal_id = Column(Integer, autoincrement=True, primary_key=True)
    animal_name = Column(NVARCHAR(100))
    animal_color = Column(NVARCHAR(100))
    animal_birthday = Column(String(20))
    animal_varieties = Column(Integer, default=0)  # breed
    sell_status = Column(Integer, default=0)  # 0: unsold, 1: sold
    animal_healthy = Column(Integer, default=0)  # 0: healthy, 1: dead, 2: sick
    animal_photo = Column(String(100))
    animal_price = Column(Integer, default=0)
    animal_gender = Column(Integer, default=0)  # 0: male, 1: female
    createtime = Column(DateTime, default=datetime.now)
    updatetime = Column(TIMESTAMP, nullable=False)
def mapping_df_types(df):
    dtypedict = {}
    for i, j in zip(df.columns, df.dtypes):
        if "object" in str(j):
            dtypedict.update({i: NVARCHAR(length=255)})
        if "float" in str(j):
            dtypedict.update({i: Float(precision=4, asdecimal=True)})
        if "int" in str(j):
            dtypedict.update({i: Integer()})
    return dtypedict
def map_types(df):
    dtypedict = {}
    for i, j in zip(df.columns, df.dtypes):
        if "object" in str(j):
            dtypedict.update({i: NVARCHAR(length=255)})
        if "float" in str(j):
            dtypedict.update({i: FLOAT(precision=2, asdecimal=True)})
        if "int" in str(j):
            dtypedict.update({i: INTEGER()})
    return dtypedict
def mapping_df_types(df):
    dtypedict = {}
    for i, j in zip(df.columns, df.dtypes):
        if "object" in str(j):
            dtypedict.update({i: NVARCHAR(None)})
        if "float" in str(j):
            dtypedict.update({i: Float()})
        if "int" in str(j):
            dtypedict.update({i: Integer()})
    return dtypedict
def map_types(df):
    dtypedict = {}
    for i, j in zip(df.columns, df.dtypes):
        if 'object' in str(j):
            dtypedict.update({i: NVARCHAR(length=255)})
        if 'float' in str(j):
            dtypedict.update({i: Float(precision=2, asdecimal=True)})
        if 'int' in str(j):
            dtypedict.update({i: Integer})
    return dtypedict
def to_sql_dtype(self, df):
    from sqlalchemy.types import NVARCHAR, Float, Integer
    dtypedict = {}
    for i, j in zip(df.columns, df.dtypes):
        if "object" in str(j):
            dtypedict.update({i: NVARCHAR(length=255)})
        if "float" in str(j):
            dtypedict.update({i: Float(precision=2, asdecimal=True)})
        if "int" in str(j):
            dtypedict.update({i: Integer()})
    return dtypedict
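# The helpers above all share one pattern: infer a SQLAlchemy type per pandas
# dtype, then hand the dict to to_sql. A hedged end-to-end sketch (the engine
# URL, table name, and sample frame are assumptions; mapping_df_types refers
# to the first definition above):
import pandas as pd
from sqlalchemy import create_engine

df = pd.DataFrame({'code': ['000001.SZ'], 'close': [10.5], 'vol': [1000]})
engine = create_engine('sqlite:///demo.db')  # illustrative URL
df.to_sql('quotes', con=engine, if_exists='append', index=False,
          dtype=mapping_df_types(df))  # object -> NVARCHAR(255), float -> Float, int -> Integer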
def write_df_table(cls, dataframe, table):
    """Overwrite the given database table with the DataFrame contents."""
    dtypedict = {
        'id': Integer(),
        'pid_id': Integer(),
        'title': NVARCHAR(length=255),
        'sentiment': Integer(),
        'count': Integer(),
        'mean': Float()
    }
    dataframe.to_sql(name=table, con=cls.engine, if_exists="replace",
                     index=False, dtype=dtypedict)
    print('Database write succeeded')
def mapping_df_types(df):
    dtypeDict = {}
    for i, j in zip(df.columns, df.dtypes):
        if "object" in str(j):
            if i in ['stock_id', 'code']:
                dtypeDict.update({i: NVARCHAR(length=12)})
            elif i in ['date']:
                dtypeDict.update({i: DATETIME})
            elif 'date' in i:
                dtypeDict.update({i: DATE})
            else:
                dtypeDict.update({i: NVARCHAR(length=100)})
        if "float" in str(j):
            dtypeDict.update({i: Float(precision=2, asdecimal=True)})
        if "int" in str(j):
            dtypeDict.update({i: BIGINT})
    dtypeDict['stock_id'] = NVARCHAR(length=12)
    dtypeDict['code'] = NVARCHAR(length=12)
    dtypeDict['date'] = DATETIME
    dtypeDict['trade_date'] = DATE
    return dtypeDict
def __init__(self, db_schema=None, is_kart_branding=False):
    # Don't call super since we are redefining self.kart_state and self.kart_track.
    self.sqlalchemy_metadata = MetaData()
    self.db_schema = db_schema
    self.is_kart_branding = is_kart_branding
    self.kart_state = Table(
        self.kart_table_name(STATE),
        self.sqlalchemy_metadata,
        Column("table_name", NVARCHAR(400), nullable=False, primary_key=True),
        Column("key", NVARCHAR(400), nullable=False, primary_key=True),
        Column("value", Text, nullable=False),
        schema=self.db_schema,
    )
    self.kart_track = Table(
        self.kart_table_name(TRACK),
        self.sqlalchemy_metadata,
        Column("table_name", NVARCHAR(400), nullable=False, primary_key=True),
        Column("pk", NVARCHAR(400), nullable=True, primary_key=True),
        schema=self.db_schema,
    )
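# A hedged sketch of how these table definitions might be materialized,
# assuming the __init__ above belongs to a class named KartTables (the class
# name and constructor usage are assumptions, not part of the snippet):
from sqlalchemy import create_engine

engine = create_engine('sqlite:///:memory:')  # illustrative
tables = KartTables(db_schema=None)
tables.sqlalchemy_metadata.create_all(engine)  # emits CREATE TABLE for both tracking tables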
def wordcloud_pd_handle(self):
    # One block of rows per (source, searchword): count occurrences of each
    # monitor word. DataFrame.append was removed in pandas 2.0, so collect the
    # per-pair frames and concatenate once.
    frames = []
    for source, words in self.worddict.items():
        for searchword, wordstr in words.items():
            counts = pd.Series(wordstr.split(",")).value_counts()
            frames.append(pd.DataFrame({
                "source": source,
                "searchword": searchword,
                "monitorword": counts.index,
                "wordcount": counts.values,
            }))
    res = pd.concat(frames, ignore_index=True)
    dtype = {
        "source": NVARCHAR(255),
        "searchword": NVARCHAR(255),
        "monitorword": NVARCHAR(255),
        "wordcount": INT,
    }
    res.to_sql("wordcloud", con=self.engine_word, if_exists="replace",
               index=False, dtype=dtype)
def csv2mysql(df, table_name, length):
    # dtype maps column names to SQLAlchemy types: {"col_name": type}
    dtype_dict = {}
    for i, j in zip(df.columns, df.dtypes):
        if "object" in str(j):
            dtype_dict.update({i: NVARCHAR(length=length)})
        if "float" in str(j):
            dtype_dict.update({i: Float(precision=2, asdecimal=True)})
        if "int" in str(j):
            dtype_dict.update({i: Integer()})
    try:
        df.to_sql(name=table_name, con=con, if_exists='append',
                  index=False, dtype=dtype_dict)
    except Exception:
        print('[csv2mysql] Duplicate insert, skip.')
class Tls214NplPubln(Base):
    __tablename__ = 'tls214_npl_publn'

    npl_publn_id = Column(INT, primary_key=True, default=0)
    npl_type = Column(CHAR(1), default='')
    npl_biblio = Column(NVARCHAR(100000), default='')
    npl_author = Column(NVARCHAR(1000), default='')
    npl_title1 = Column(NVARCHAR(1000), default='')
    npl_title2 = Column(NVARCHAR(1000), default='')
    npl_editor = Column(NVARCHAR(500), default='')
    npl_volume = Column(NVARCHAR(50), default='')
    npl_issue = Column(NVARCHAR(50), default='')
    npl_publn_date = Column(VARCHAR(8), default='')
    npl_publn_end_date = Column(VARCHAR(8), default='')
    npl_publisher = Column(NVARCHAR(500), default='')
    npl_page_first = Column(VARCHAR(200), default='')
    npl_page_last = Column(VARCHAR(200), default='')
    npl_abstract_nr = Column(VARCHAR(50), default='')
    npl_doi = Column(VARCHAR(500), default='')
    npl_isbn = Column(VARCHAR(30), default='')
    npl_issn = Column(VARCHAR(30), default='')
    online_availability = Column(VARCHAR(500), default='')
    online_classification = Column(VARCHAR(35), default='')
    online_search_date = Column(VARCHAR(8), default='')
def test_defaults(self):
    assert self._comp(DATE()) == 'DATE'
    assert self._comp(DATETIME()) == 'TIMESTAMP(6)'
    assert self._comp(TIMESTAMP()) == 'TIMESTAMP(6)'
    assert self._comp(TIME()) == 'TIME(6)'
    assert self._comp(CHAR()) == 'CHAR(1)'
    assert self._comp(VARCHAR()) == 'LONG VARCHAR'
    assert self._comp(NCHAR()) == 'CHAR CHAR SET UNICODE'
    assert self._comp(NVARCHAR()) == 'LONG VARCHAR CHAR SET UNICODE'
    assert self._comp(CLOB()) == 'CLOB'
    assert self._comp(TEXT()) == 'CLOB'
    assert self._comp(DECIMAL()) == 'DECIMAL(5, 0)'
    assert self._comp(NUMERIC()) == 'NUMERIC(5, 0)'
    assert self._comp(INTEGER()) == 'INTEGER'
    assert self._comp(FLOAT()) == 'FLOAT'
    assert self._comp(REAL()) == 'REAL'
    assert self._comp(SMALLINT()) == 'SMALLINT'
    assert self._comp(BIGINT()) == 'BIGINT'
    assert self._comp(BOOLEAN()) == 'BYTEINT'
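# The _comp helper is not shown above; a plausible sketch (an assumption, not
# the project's actual code) compiles each generic type against the dialect
# under test, which here looks like a Teradata dialect given the expected
# TIMESTAMP(6)/BYTEINT renderings:
def _comp(self, type_):
    # Render the type's DDL string for the dialect being tested.
    return type_.compile(dialect=self.dialect)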
def init_stock_data(self, stock_code):
    """Used when downloading stock data for the first time."""
    result = self.get_trade_data(stock_code, self.today)
    result.columns = ['trade_date', 'stock_code', 'stock_name', 'close_price',
                      'highest_price', 'lowest_price', 'open_price',
                      'prev_close_price', 'change_rate', 'amplitude',
                      'volume', 'turnover']
    result = self.data_cleaning(result)
    columetype = {
        'trade_date': Date,
        'stock_name': NVARCHAR(length=10),
        'close_price': DECIMAL(7, 3),
        'highest_price': DECIMAL(7, 3),
        'lowest_price': DECIMAL(7, 3),
        'open_price': DECIMAL(7, 3),
        'prev_close_price': DECIMAL(7, 3),
        'change_rate': DECIMAL(7, 3),
        'amplitude': DECIMAL(7, 4),
        'volume': Integer(),
        'turnover': DECIMAL(20, 2)
    }
    result.to_sql(
        name=stock_code,
        con=self.mysql.engine,
        if_exists='append',
        index=False,
        dtype=columetype)
    query = self.mysql.session.query(
        formStockManager.stock_code,
        formStockManager.update_date
    ).filter_by(stock_code=stock_code)
    # A Query object is always truthy; check for an actual row before updating.
    if query.first():
        query.update({"update_date": self.Today})
    self.mysql.session.commit()