def insert(self,d):
    """Insert the record dict *d* into the `joke` table of the `joke` DB."""
    conn = MySQL()
    conn.selectDb('joke')
    # Table name matches the database name for this store.
    conn.insert('joke', d)
    conn.commit()
def deleteT(db, date): tables1 = gethiveT(db) #传库名 tables2 = getCT(db) #传库名 conn = MySQL(config.washmeta) tables1 = getDBs(tables1) #传库名 len2 = len(tables2) flag = True if len2 == 0: for table1 in tables1: table1['Create_Dt'] = date table1['Data_Tbl_UUID'] = uuid.uuid1() logging.debug('table1:' + table1['Data_Tbl_Phys_Nm']) conn.insert("data_tbl", table1) for table2 in tables2: for table1 in tables1: if table2.get('Data_Tbl_Phys_Nm') == table1.get( 'Data_Tbl_Phys_Nm'): flag = False break if flag: logging.debug('table1:' + table1['Data_Tbl_Phys_Nm']) try: conn.execute( "update data_tbl set Del_Dt='{}' where Data_Tbl_Phys_Nm='{}'" .format(date, table2.get('Data_Tbl_Phys_Nm'))) except Exception: logging.error('删除表元数据失败,数据为:' + str(table2)) print traceback.format_exc() sys.exit(1) flag = True del conn
def insertCByT(db, tb):
    """Copy the column metadata of one Hive table from the Hive metastore
    into the wash-meta table ``data_fld``.

    db: Hive database name.
    tb: dict describing the table; 'Data_Tbl_Phys_Nm' and 'Create_Dt'
        are read here.
    """
    conn1 = MySQL(config.hivemeta)  # source: Hive metastore
    conn2 = MySQL(config.washmeta)  # destination: wash metadata store
    # NOTE(review): the query is assembled with str.format -- acceptable only
    # while db/table names come from the metastore itself; confirm no
    # untrusted input can reach these parameters.
    hivesql = """select '{}' AS Data_Tblid, t1.COLUMN_NAME as Fld_Phys_Nm, t1.COMMENT as Fld_Cn_Nm, t1.TYPE_NAME as Fld_Data_Type, t1.INTEGER_IDX as Fld_Ord from columns_v2 t1 left join sds t2 on t1.cd_id = t2.cd_id left join tbls t3 on t2.sd_id = t3.sd_id left join dbs t4 on t3.db_id=t4.db_id where t4.name='{}' and t3.tbl_name='{}'"""
    cs1 = conn1.execute(
        hivesql.format(tb.get('Data_Tbl_Phys_Nm'), db,
                       tb.get('Data_Tbl_Phys_Nm')))
    # presumably converts raw rows into dicts keyed by the selected
    # column aliases -- TODO confirm against getTableID's definition
    cs1 = getTableID(cs1)
    for c in cs1:
        # Stamp each column row with the parent table's creation date.
        c['Create_Dt'] = tb['Create_Dt']
        try:
            logging.debug("插入新增表字段:" + str(c['Fld_Phys_Nm']))
            conn2.insert('data_fld', c)
        except Exception as e:
            # A failed column insert is logged but does not abort the loop.
            logging.error("插入新增表字段失败:" + str(c['Fld_Phys_Nm']))
            print traceback.format_exc()
    del conn1
    del conn2
def insert(self,d):
    """Insert the record dict *d* into `similar_images` in the `images` DB."""
    conn = MySQL()
    conn.selectDb('images')
    target_table = 'similar_images'
    conn.insert(target_table, d)
    conn.commit()
def insert_data(data):
    """Insert a crawled-URL record into urls_crawled, skipping duplicates.

    data: dict with at least the key 'url'; the whole dict is inserted
          when no existing row has the same url.

    Returns 200 when the insert fails (legacy sentinel kept for callers);
    returns None otherwise.
    """
    n = MySQL()
    # NOTE(review): the url is interpolated directly into the SQL string --
    # SQL-injection risk; switch to a parameterized query if the MySQL
    # wrapper supports one.
    sql = "select id from urls_crawled where url='" + data['url'] + "';"
    result = n.query(sql)
    if result == 0:
        try:
            n.insert('urls_crawled', data)
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # are no longer swallowed; failure behavior is unchanged.
            return 200
        n.commit()
def insertNewP(db, date): partitions1 = gethiveP(db) #传库名 partitions2 = getCP(db) #传库名 conn = MySQL(config.washmeta) partitions1 = getTableID(partitions1) len2 = len(partitions2) if len2 == 0: for p1 in partitions1: logging.debug('partition:' + str(p1['Data_Tblid']) + ':' + p1['Dp_Path']) try: conn.insert("dp", p1) except Exception: logging.error('第一次插入分区数据失败,数据为:' + str(p1)) print traceback.format_exc() sys.exit(1) else: for p1 in partitions1: flag = True for p2 in partitions2: if p1.get('Data_Tblid') == p2.get('Data_Tblid') and p1.get( 'Dp_Path') == p2.get('Dp_Path'): flag = False break if flag: logging.debug('插入分区partition:' + str(p1['Data_Tblid']) + ':' + p1['Dp_Path']) try: conn.insert("dp", p1) except Exception: logging.error('插入分区数据失败,数据为:' + str(p1)) print traceback.format_exc() sys.exit(1) for p2 in partitions2: flag = True for p1 in partitions1: if p2.get('Data_Tblid') == p1.get('Data_Tblid') and p2.get( 'Dp_Path') == p1.get('Dp_Path'): flag = False break if flag: logging.debug('删除分区partition:' + str(p2['Data_Tblid']) + ':' + p2['Dp_Path']) try: conn.execute( "delete from dp where Data_Tblid='{}' and Dp_Path='{}'" .format(p2.get('Data_Tblid'), p2.get('Dp_Path'))) except Exception: logging.error('删除分区数据失败,数据为:' + str(p1)) print traceback.format_exc() sys.exit(1) del conn
def html_parser(self, html_source):
    """Parse a user profile page, print every extracted field, persist the
    record to MySQL, and queue further URLs found on the page.

    html_source: raw HTML string of the profile page.
    """
    tree = html.fromstring(html_source)
    parser = HtmlParser()
    # Extract each profile field via the dedicated parser helpers.
    username = parser.get_username(tree)
    brief_info = parser.get_brief_info(tree)
    industry = parser.get_industry(tree)
    education = parser.get_education(tree)
    major = parser.get_major(tree)
    answer_count = parser.get_answer_count(tree)
    article_count = parser.get_article_count(tree)
    ask_question_count = parser.get_ask_question_count(tree)
    collection_count = parser.get_collection_count(tree)
    follower_count = parser.get_follower_count(tree)
    followed_count = parser.get_followed_count(tree)
    follow_live_count = parser.get_follow_live_count(tree)
    follow_topic_count = parser.get_follow_topic_count(tree)
    follow_column_count = parser.get_follow_column_count(tree)
    follow_question_count = parser.get_follow_question_count(tree)
    follow_collection_count = parser.get_follow_collection_count(tree)
    now = datetime.datetime.now()
    current_time = now.strftime("%Y-%m-%d %H:%M:%S")
    print "*" * 60
    print "用户名:%s\n" % username
    print "个人简介:%s\n" % brief_info
    print "所处行业:%s\n" % industry
    print "毕业学校:%s\n" % education
    print "主修专业:%s\n" % major
    print "回答数:%s\n" % answer_count
    print "文章数:%s\n" % article_count
    print "提问数:%s\n" % ask_question_count
    print "收藏数:%s\n" % collection_count
    print "被关注数:%s\n" % follower_count
    print "关注数:%s\n" % followed_count
    print "关注直播数:%s\n" % follow_live_count
    print "关注话题数:%s\n" % follow_topic_count
    print "关注专栏数:%s\n" % follow_column_count
    print "关注问题数:%s\n" % follow_question_count
    print "关注收藏夹数:%s\n" % follow_collection_count
    print "当前时间:%s\n" % current_time
    print "*" * 60
    # Save data to mysql.
    db = MySQL(DATABASE_CONFIG)
    # NOTE(review): this INSERT is built by string concatenation -- any
    # single quote in a scraped field breaks the statement, and it is
    # SQL-injectable; prefer a parameterized query if the MySQL wrapper
    # supports one.
    sql = "INSERT INTO t_user(username, brief_info, industry, education, major, answer_count, article_count, ask_question_count, collection_count, follower_count, followed_count, follow_live_count, follow_topic_count, follow_column_count, follow_question_count, follow_collection_count, gmt_create) values('" + username + "','" + brief_info + "','" + industry + "','" + education + "', '" + major + "', '" + answer_count + "', '" + article_count + "', '" + ask_question_count + "', '" + collection_count + "', '" + follower_count + "', '" + followed_count + "', '" + follow_live_count + "', '" + follow_topic_count + "', '" + follow_column_count + "', '" + follow_question_count + "', '" + follow_collection_count + "', '" + current_time + "')"
    db.insert(sql)
    # Extract urls
    self.extract_urls(tree)
    return
def updateC(db, date): columns1 = gethiveC(db) #传库名 columns2 = getCC(db) #传库名 conn = MySQL(config.washmeta) columns1 = getTableID(columns1) len2 = len(columns2) flag = True if len2 == 0: for c1 in columns1: c1['Create_Dt'] = date logging.debug('column:' + str(c1['Data_Tblid']) + ':' + c1['Fld_Phys_Nm']) try: conn.insert("data_fld", c1) except Exception: logging.error('第一次插入字段数据失败,失败数据为:' + str(c1)) print traceback.format_exc() sys.exit(1) else: for c1 in columns1: flag = True for c2 in columns2: if c1.get('Data_Tblid') == c2.get('Data_Tblid') and c1.get('Fld_Phys_Nm') == c2.get('Fld_Phys_Nm') and c1.get('Fld_Cn_Nm') == c2.get('Fld_Cn_Nm') \ and c1.get('Fld_Data_Type') == c2.get('Fld_Data_Type') and c1.get('Fld_Ord') == c2.get('Fld_Ord'): flag = False break if flag: c1['Upd_Dt'] = date logging.debug('column:' + str(c1['Data_Tblid']) + ':' + c1['Fld_Phys_Nm']) try: conn.execute( "update data_fld set Fld_Cn_Nm='{}',Fld_Data_Type='{}',Fld_Ord='{}',Upd_Dt='{}' where Data_Tblid='{}' and Fld_Phys_Nm='{}'" .format(c1['Fld_Cn_Nm'], c1['Fld_Data_Type'], c1['Fld_Ord'], c1['Upd_Dt'], c1['Data_Tblid'], c1['Fld_Phys_Nm'])) except Exception: logging.error('插入新增字段数据失败,数据为:' + str(c1)) print traceback.format_exc() sys.exit(1) del conn
def insertNewT(db, date): tables1 = gethiveT(db) #传库名 tables2 = getCT(db) #传库名 conn = MySQL(config.washmeta) tables1 = getDBs(tables1) #传库名 len2 = len(tables2) if len2 == 0: for table1 in tables1: ud = uuid.uuid1() table1['Create_Dt'] = date table1['Data_Tbl_UUID'] = str(ud) logging.debug('table1:' + table1['Data_Tbl_Phys_Nm']) try: conn.insert("data_tbl", table1) except Exception: logging.error('第一次插入表数据失败,插入数据是:' + str(table1)) print traceback.format_exc() sys.exit(1) else: for table1 in tables1: flag = True for table2 in tables2: if (table1.get('Data_Tbl_Phys_Nm') == table2.get( 'Data_Tbl_Phys_Nm')): flag = False break if flag: table1['Create_Dt'] = date table1['Data_Tbl_UUID'] = str(uuid.uuid1()) logging.debug('table1:' + table1['Data_Tbl_Phys_Nm']) try: # conn.insert("data_tbl", table1) logging.debug("插入表字段") insertCByT(db, table1) except Exception as e: logging.error('插入新增表失败,插入数据是:' + str(table1)) print e print traceback.format_exc() sys.exit(1) del conn
def insertNewC(db, date): columns1 = gethiveC(db) #传库名 columns2 = getCC(db) #传库名 conn = MySQL(config.washmeta) columns1 = getTableID(columns1) #传库名 len2 = len(columns2) flag = True if len2 == 0: for c1 in columns1: c1['Create_Dt'] = date logging.debug('column:' + str(c1['Data_Tblid']) + ':' + c1['Fld_Phys_Nm']) try: conn.insert("data_fld", c1) except Exception: logging.error('第一次插入字段数据失败,失败数据为:' + str(c1)) print traceback.format_exc() sys.exit(1) else: for c1 in columns1: flag = True for c2 in columns2: if c1.get('Data_Tblid') == c2.get('Data_Tblid') and c1.get( 'Fld_Phys_Nm') == c2.get('Fld_Phys_Nm'): flag = False break if flag: c1['Create_Dt'] = date logging.debug('column:' + str(c1['Data_Tblid']) + ':' + c1['Fld_Phys_Nm']) try: conn.insert('data_fld', c1) except Exception: logging.error('插入新增字段数据失败,数据为:' + str(c1)) print traceback.format_exc() sys.exit(1) del conn
def updateT(db, date): tables1 = gethiveT(db) #传库名 tables2 = getCT(db) #传库名 conn = MySQL(config.washmeta) tables1 = getDBs(tables1) #传库名 len2 = len(tables2) if len2 == 0: for table1 in tables1: table1['Create_Dt'] = date table1['Data_Tbl_UUID'] = uuid.uuid1() logging.debug('table1:' + table1['Data_Tbl_Phys_Nm']) try: conn.insert("data_tbl", table1) except Exception: logging.error('第一次插入表数据失败,插入数据是:' + str(table1)) print traceback.format_exc() sys.exit(1) for table1 in tables1: for table2 in tables2: if table1.get('Data_Tbl_Phys_Nm') == table2.get( 'Data_Tbl_Phys_Nm'): # print 'hive 表:',table1.get('Data_Tbl_Phys_Nm') # print 'clean 表:',table2.get('Data_Tbl_Phys_Nm') # if (not compareP(table1, table2) or not compareC(table1, table2)): if (compareP(db, table1, table2) or compareC(db, table1, table2)): logging.debug('对比表table1:' + table1['Data_Tbl_Phys_Nm']) try: conn.execute( "update data_tbl set Upd_Dt='{}' where Data_Tbl_Phys_Nm='{}'" .format(date, table1.get('Data_Tbl_Phys_Nm'))) except Exception: logging.error('更新表元数据失败,数据为:' + str(table1)) print traceback.format_exc() sys.exit(1) del conn
'charset':'utf8'}
# NOTE(review): the dbconfig dict literal closed above starts on an earlier
# line not visible here.
db = MySQL(dbconfig)
#sql1 = "select tried_num from import_status_record where cid = '%s' and curr_sql_file like '%s';"%(cLists[0],'%'+sqlDate+'%')
#execNum=db.query(sql1)
#sql = "select count(*) from available_CIDs;"
sql2 = "select * from available_CIDs;"
# Number of CIDs to process; consumed by the worker pool below.
cidsNum = db.query(sql2)
if __name__ == "__main__":
    try:
        #saveout = sys.stdout
        #saveerr = sys.stderr
        #f = open("log/import_record"+time.strftime("%m%d_%H%M%S"),"w")
        #sys.stdout = f
        #sys.stderr = f
        #os.system('./stat.sh ' + "config")
        start = time.strftime("%Y-%m-%d %H:%M:%S")
        # Run the import with 4 worker threads and block until done.
        work_manager = WorkManager(cidsNum,4)
        work_manager.wait_allcomplete()
        end = time.strftime("%Y-%m-%d %H:%M:%S")
        print "threads start times: ",start
        print "threads end times: ",end
    finally:
        print "###" * 30
        print "Statistics the import results and write results into mysql"
        # NOTE(review): execNum is only assigned in the commented-out query
        # above and sqlData is not defined anywhere visible -- both names
        # will raise NameError here; confirm the intended variables
        # (sqlDate?) before running.
        os.system('./stat.sh ' + "dataList " + str(execNum+1))
        sql="load data infile '%s' into table import_status_record FIELDS TERMINATED BY ',';"%(sqlData)
        db.insert(sql)
        #f.close()
class Spider(object):
    """Sogou-WeChat article crawler: searches for ``self.keyword``, walks
    the result pages (switching to a proxy from a local proxy pool when
    the site answers 302), parses each article page, and stores the
    result through ``self.mysql``.
    """

    def __init__(self):
        # Search keyword; hard-coded for this run.
        self.keyword = 'python'
        # self.cookie_url = 'http://weixin.sogou.com/weixin?type=2&ie=utf8&s_from=input&_sug_=y&_sug_type_=&query={0}'.format(self.keyword)
        # self.test_url = 'http://weixin.sogou.com/weixin?type=2&ie=utf8&query={}&tsn=1&ft=&et=&interation=&wxid=&usip='.format(self.keyword)
        # NOTE(review): the Cookie below is a captured session -- it will
        # expire; refresh before reuse.
        self.headers = {
            'Cookie': 'ABTEST=8|1537491697|v1; IPLOC=CN3301; SUID=8A00CD733E18960A000000005BA442F1; JSESSIONID=aaaELbWTQ1L2wb49VYFvw; SUID=8A00CD733118960A000000005BA44328; weixinIndexVisited=1; SUV=00CA1DF373CD008A5BA44329E5072984; sct=1; ppinf=5|1537491766|1538701366|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZToyOllHfGNydDoxMDoxNTM3NDkxNzY2fHJlZm5pY2s6MjpZR3x1c2VyaWQ6NDQ6bzl0Mmx1UG1pVEc5UzdYai1uNS00dmZESjlaSUB3ZWl4aW4uc29odS5jb218; pprdig=SZP50z_ocFRwyaEzaFydV-HYv-7zERPayFcU4AKiczu0biMhxplP0vHK_c9YDQaC7wSpf6k1pi_KgkugvqfiXFx57nAVREJnCoD2sI6PPqu_RkhU8p_t8K_u0nBORzPL4t56QANrWGeOqqABFIR8--kajPxzjyOrns2gB7Mx1Gk; sgid=18-37096587-AVukQzbMnFYdwGh1fBNzkwE; PHPSESSID=n0hgnp1cq5hb3hde6ag44fo2d0; SUIR=64ED239EEEE89BB9C39C6C78EE77A936; ppmdig=1537498561000000839ca7e11ec83beadd0ca06d0eb18a07; SNUID=5AD01CA2D0D4A689DE848F35D1313467',
            'Host': 'weixin.sogou.com',
            'Upgrade-Insecure-Requests': '1',
            # 'Referer': self.cookie_url,
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36'
        }
        self.base_url = 'http://weixin.sogou.com/weixin?'
        self.proxy_pool_url = 'http://127.0.0.1:5555/random'
        self.mysql = MySQL()
        # Maximum retry attempts per URL in get_html.
        self.max_count = 5
        self.session = requests.Session()

    def get_proxy(self):
        """Fetch one proxy address from the local pool; None on failure."""
        try:
            response = requests.get(self.proxy_pool_url)
            if response.status_code == 200:
                return response.text
            return None
        except ConnectionError:
            return None

    def get_html(self, url, count=1):
        """Fetch *url* through the proxy-ip pool, retrying up to
        self.max_count times; returns the page text or None."""
        print('Crawling', url)
        print('Trying Count', count)
        # NOTE(review): relies on a module-level `proxy` variable defined
        # outside this class -- confirm it is initialised before first call.
        global proxy
        if count >= self.max_count:
            print('Tried Too Many Counts')
            return None
        try:
            if proxy:
                proxies = {
                    'http': 'http://' + proxy,
                    'https': 'https://' + proxy
                }
                response = self.session.get(url, allow_redirects=False, headers=self.headers, proxies=proxies)
                return response.text
            else:
                response = self.session.get(url, allow_redirects=False, headers=self.headers)
            if response.status_code == 200:
                return response.text
            if response.status_code == 302:
                # Need Proxy
                print('302')
                proxy = self.get_proxy()
                if proxy:
                    print('Using Proxy', proxy)
                    # NOTE(review): this retry restarts with count=1, so a
                    # chain of 302s is not bounded by max_count -- verify.
                    return self.get_html(url)
                else:
                    print('Get Proxy Failed')
                    return None
        except ConnectionError as e:
            print('Error Occurred', e.args)
            proxy = self.get_proxy()
            count += 1
            return self.get_html(url, count)

    # def get_html(url):
    #     # use the local machine's own ip
    #     try:
    #         response = requests.get(url, allow_redirects=False, headers=headers)
    #         if response.status_code == 200:
    #             return response.text
    #         if response.status_code == 302:
    #             return None
    #     except ConnectionError:
    #         return get_html(url)

    def get_index(self, keyword, page):
        """Fetch one search-result page for *keyword*."""
        data = {'query': keyword, 'type': 2, 'page': page}
        queries = urlencode(data)
        url = self.base_url + queries
        html = self.get_html(url)
        return html

    def parse_index(self, html):
        """Yield article URLs from a search-result page, then sleep 5-10s
        to throttle requests."""
        doc = pq(html)
        items = doc('.news-box .news-list li .txt-box h3 a').items()
        for item in items:
            yield item.attr('href')
        time.sleep(random.randint(5, 10))

    def get_detail(self, url):
        """Fetch an article page directly (no proxy); None on failure."""
        try:
            response = requests.get(url)
            if response.status_code == 200:
                return response.text
            return None
        except ConnectionError:
            return None

    def parse_detail(self, html):
        """Extract article fields from an article page.

        Returns a dict; the 'date' key is omitted when the publish_time
        pattern is not found in the page source.
        """
        doc = pq(html)
        url_object_id = get_md5(html)
        title = doc('.rich_media_title').text()
        content = doc('.rich_media_content').text()
        # date = doc('#publish_time').text()
        nickname = doc('#js_name').text()
        wechat = doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
        try:
            # Publish time lives in an inline JS variable, not the DOM.
            match_date = re.search('var.*?publish_time\s=\s(.*)"', html)
            date = match_date.group(1)[1:11]
            return {
                'url_object_id': url_object_id,
                'title': title,
                'content': content,
                'date': date,
                'nickname': nickname,
                'wechat': wechat
            }
        except AttributeError:
            # re.search returned None: no publish_time found.
            return {
                'url_object_id': url_object_id,
                'title': title,
                'content': content,
                'nickname': nickname,
                'wechat': wechat
            }

    def main(self):
        """Crawl result pages 86-100 and persist every parsed article."""
        for page in range(86, 101):
            self.session.headers.update(self.headers)
            html = self.get_index(self.keyword, page)
            if html:
                article_urls = self.parse_index(html)
                for article_url in article_urls:
                    article_html = self.get_detail(article_url)
                    if article_html:
                        article_data = self.parse_detail(article_html)
                        print(article_data)
                        self.mysql.insert('python_articles', article_data)
class Tables:
    """Create or drop tables and delete/insert data.

    SQL statements are loaded from sqls.xml, where each statement lives in
    an <item id="..."> element under the createSql/dropSql/cleanSql/
    insertSql sections.
    """
    def __init__(self):
        self._logger = Logger(__file__)
        try:
            fsock = open("sqls.xml", "r")
        except IOError:
            self._logger.error("The file don't exist, Please double check!")
        self.sqls = BeautifulSoup(fsock.read())
        dbconfig = {'host':'127.0.0.1',
                    'port': 3306,
                    'user':'******',
                    'passwd':'123456',
                    'db':'scenic',
                    'charset':'utf8'}
        self.db = MySQL(dbconfig)

    def initDB(self):
        """Create all tables listed under the createSql section."""
        createSqls = self.sqls.find(id="createSql")
        for item in createSqls.select("item"):
            sql = item.string
            self._logger.info("create the table "+item.attrs["id"])
            self.db.execute(sql)
            # must reopen the cursor, or it will raise exception with error code 1024. What a f*****g error
            self.db.reopenCursor()

    def createTable(self,name):
        """Create the single table whose statement id is *name*."""
        create = self.sqls.find(id="createSql").find(id=name).string
        if create:
            self._logger.info(" create table "+name)
            self.db.execute(create)
        else:
            self._logger.error("error occured when create table "+name)

    def dropAll(self):
        """Drop all the tables listed under the dropSql section."""
        dropSqls= self.sqls.find(id="dropSql")
        for item in dropSqls.select("item"):
            sql = item.string
            self._logger.info("drop the table "+item.attrs["id"])
            self.db.execute(sql)

    def dropTable(self,name):
        """Drop the single table whose statement id is *name*."""
        # BUG FIX: look the statement up by id (mirroring createTable) and
        # execute the found statement; the original searched by tag name
        # and executed an undefined variable `sql` (NameError).
        drop = self.sqls.find(id="dropSql").find(id=name)
        if drop:
            self._logger.info("drop the table "+name)
            self.db.execute(drop.string)
        else:
            self._logger.warn("Don't have the table "+name)

    def cleanAll(self):
        """Delete data from all the tables, but do not drop the tables."""
        cleanSqls= self.sqls.find(id="cleanSql")
        for item in cleanSqls.select("item"):
            sql = item.string
            self._logger.info("clean the table "+item.attrs["id"])
            self.db.execute(sql)

    def cleanTable(self,name):
        """Clean the data of the specified table (not implemented)."""
        pass

    def insertTable(self,name,params):
        """Insert values into the specified table.

        # Parameters:
            name: the name of the table
            params: the value insert into the tables. It can be tuple for
                inserting a row, or can be a list to insert several rows
        # Return:
        """
        insert = self.sqls.find(id="insertSql").find(id=name).string
        if insert:
            self._logger.info(" insert into table "+name)
            self.db.insert(insert,params)
        else:
            self._logger.error("did not find the table "+name+" when insert")

    def insertData(self,data):
        """Insert one Scenic record (and its images) - the interface for
        outer callers.

        # Parameters:
            data: a Scenic instance to persist
        # Return:
            False when *data* is not a Scenic instance
        """
        if isinstance(data,Scenic):
            data.encode()
            types = self.joint(data.types)
            seasons = self.joint(data.fits)
            sceneryParams = (data.id,data.name,data.province,data.city,data.area,data.level,data.quality,data.description,data.website,data.symbol,data.opentime,data.closetime,data.price,data.suggest,seasons,types,data.longitude,data.latitude,data.precise,data.confidence)
            imageParams = []
            for item in data.images:
                imageParams.append( (data.id,str(uuid.uuid1()),item,data.name,data.name) )
            self.insertTable("scenery",sceneryParams)
            # insert into database when only there are pictures,or it will occur error
            if imageParams:
                self.insertTable("sceneryImages",imageParams)
        else:
            self._logger.error("the parameter is not the instance of Scenic")
            return False

    def joint(self,data,split=","):
        """Join a list with *split* (default ","); "" for non-lists."""
        # str.join replaces the original manual accumulation loop;
        # behavior for empty lists and non-list inputs is unchanged.
        if isinstance(data,list):
            return split.join(data)
        return ""

    def initTables(self):
        """Seed the basic lookup tables (sceneryType and season)."""
        basic = SearchParams()
        # insert basic data into sceneryType table
        params = []
        for item in basic.scenicType.keys():
            params.append((basic.scenicType[item],item,item))
        self.insertTable("sceneryType",params)
        # insert basic data into season table
        params = []
        for item in basic.scenicFit.keys():
            params.append((basic.scenicFit[item],item))
        self.insertTable("season",params)
class Tables:
    """Create or drop tables and delete/insert data.

    SQL statements are loaded from sqls.xml, where each statement lives in
    an <item id="..."> element under the createSql/dropSql/cleanSql/
    insertSql sections.
    """

    def __init__(self):
        self._logger = Logger(__file__)
        try:
            fsock = open("sqls.xml", "r")
        except IOError:
            self._logger.error("The file don't exist, Please double check!")
        self.sqls = BeautifulSoup(fsock.read())
        dbconfig = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': '******',
            'passwd': '123456',
            'db': 'scenic',
            'charset': 'utf8'
        }
        self.db = MySQL(dbconfig)

    def initDB(self):
        """Create all tables listed under the createSql section."""
        createSqls = self.sqls.find(id="createSql")
        for item in createSqls.select("item"):
            sql = item.string
            self._logger.info("create the table " + item.attrs["id"])
            self.db.execute(sql)
            # must reopen the cursor, or it will raise exception with error code 1024. What a f*****g error
            self.db.reopenCursor()

    def createTable(self, name):
        """Create the single table whose statement id is *name*."""
        create = self.sqls.find(id="createSql").find(id=name).string
        if create:
            self._logger.info(" create table " + name)
            self.db.execute(create)
        else:
            self._logger.error("error occured when create table " + name)

    def dropAll(self):
        """Drop all the tables listed under the dropSql section."""
        dropSqls = self.sqls.find(id="dropSql")
        for item in dropSqls.select("item"):
            sql = item.string
            self._logger.info("drop the table " + item.attrs["id"])
            self.db.execute(sql)

    def dropTable(self, name):
        """Drop the single table whose statement id is *name*."""
        # BUG FIX: look the statement up by id (mirroring createTable) and
        # execute the found statement; the original searched by tag name
        # and executed an undefined variable `sql` (NameError).
        drop = self.sqls.find(id="dropSql").find(id=name)
        if drop:
            self._logger.info("drop the table " + name)
            self.db.execute(drop.string)
        else:
            self._logger.warn("Don't have the table " + name)

    def cleanAll(self):
        """Delete data from all the tables, but do not drop the tables."""
        cleanSqls = self.sqls.find(id="cleanSql")
        for item in cleanSqls.select("item"):
            sql = item.string
            self._logger.info("clean the table " + item.attrs["id"])
            self.db.execute(sql)

    def cleanTable(self, name):
        """Clean the data of the specified table (not implemented)."""
        pass

    def insertTable(self, name, params):
        """Insert values into the specified table.

        # Parameters:
            name: the name of the table
            params: the value insert into the tables. It can be tuple for
                inserting a row, or can be a list to insert several rows
        # Return:
        """
        insert = self.sqls.find(id="insertSql").find(id=name).string
        if insert:
            self._logger.info(" insert into table " + name)
            self.db.insert(insert, params)
        else:
            self._logger.error("did not find the table " + name + " when insert")