def get_hotel_id():
    """Collect dianping.com hotel ids for a fixed list of cities and store them.

    For each city slug the hotel listing's first page is fetched to find the
    number of result pages, every page ``<listing>/p<N>`` is then fetched, and
    all ``data-shop-url`` ids found are inserted into ``dzdp_hotel_id``
    together with the city slug, one multi-row INSERT and one commit per city.

    Relies on the module-level ``requests``, ``cursor`` and ``connect``
    objects.  Returns nothing.
    """
    shi = [
        'chaozhou', 'shenzhen', 'guangzhou', 'dongguan', 'foshan',
        'zhongshan', 'zhuhai', 'shantou', 'qingyuan', 'heyuan', 'zhaoqing',
        'yunfu', 'shaoguan', 'meizhou', 'jiangmen', 'maoming', 'yangjiang',
        'zhanjiang', 'huizhou', 'shanwei', 'jieyang'
    ]
    yema_pattern = re.compile('data-ga-page="([0-9]+)"')
    hotel_id_pattern = re.compile('data-shop-url="([0-9]+)"')

    for s in shi:
        url_shouye = "https://www.dianping.com/" + s + "/hotel/"
        html = requests.get(url_shouye)

        # The second-to-last pager entry is used as the page count (the last
        # one is presumably the "next page" link -- confirm against the
        # markup).  With fewer than two pager entries, assume a single page
        # instead of raising IndexError.
        page_numbers = yema_pattern.findall(html.text)
        yema = int(page_numbers[-2]) if len(page_numbers) >= 2 else 1

        # De-duplicate across all pages of the city, keeping first-seen order,
        # so the same hotel is never inserted twice in one statement.
        seen = set()
        rows = []
        for i in range(1, yema + 1):
            print(i)
            html_id = requests.get(url_shouye + "p" + str(i))
            for shop_id in hotel_id_pattern.findall(html_id.text):
                if shop_id not in seen:
                    seen.add(shop_id)
                    # Safe to inline: shop_id is digits only (see the regex)
                    # and s comes from the hard-coded list above.
                    rows.append("('" + shop_id + "','" + s + "')")

        if not rows:
            # Nothing scraped for this city; an INSERT without any value
            # tuples would be malformed SQL, so skip it.
            continue

        sql = ("insert into dzdp_hotel_id(dzdp_hotel_id,shi) VALUES "
               + ",".join(rows))
        cursor.execute(sql)
        connect.commit()
        print('成功插入', cursor.rowcount, '条数据')
def export(id):
    """Export every submission stored for form ``id`` as an Excel download.

    Each row of table ``con`` with ``fid = id`` holds a JSON array of objects
    in its ``con`` column; the ``name`` fields of the first row become the
    header line and the ``value`` fields of every row become one sheet line.
    The workbook is written to ``demo.xls`` and sent back as an attachment
    named ``1.xls``.

    Args:
        id: Form id to export.

    Returns:
        The download response, or the plain string "没有数据" when the form
        has no submissions.
    """
    # Bind the id as a query parameter instead of concatenating it into the
    # statement, so a crafted value cannot change the SQL.
    cursor.execute("select con from con where fid=%s", (id,))
    result = cursor.fetchall()
    if not result:
        return "没有数据"

    xl = xlwt.Workbook()
    sheet = xl.add_sheet("abc", cell_overwrite_ok=True)

    # Header row: field names taken from the first submission.
    titles = json.loads(result[0]["con"])
    for col, title in enumerate(titles):
        sheet.write(0, col, title["name"])

    # Data rows: one line per submission, JSON parsed once per row.
    for row_index, record in enumerate(result, start=1):
        cells = json.loads(record["con"])
        for col, cell in enumerate(cells):
            sheet.write(row_index, col, cell["value"])

    xl.save("demo.xls")
    res = make_response(
        send_from_directory(".", "demo.xls", as_attachment=True))
    res.headers["content-disposition"] = "attachment;filename=1.xls"
    return res
def extract_value(self, cursor, tablename=None, name=None, value=None):
    """Select columns from ``tablename`` for the row(s) whose name matches.

    Args:
        cursor: DB-API cursor used to run the query.
        tablename: Table to read from; a message is printed and ``None``
            returned when omitted.
        name: SQL expression compared against the ``name`` column.  It is
            inserted into the statement verbatim, so a string literal must
            already carry its quotes (e.g. ``"'alice'"``).  A message is
            printed and ``None`` returned when omitted.
        value: Iterable of column names to select; ``None`` selects ``*``.

    Returns:
        ``cursor.fetchall()`` for the query, or ``None`` when ``name`` or
        ``tablename`` is missing.

    NOTE(review): the statement is assembled by string concatenation, so
    every argument must come from trusted code, never from user input.
    """
    if name is None:
        print("Please choose a KOL")
        return None
    if tablename is None:
        print("Please choose a table")
        return None

    columns = "*" if value is None else ",".join(value)
    # Join with spaces so the keywords never fuse with the identifiers
    # (the old concatenation produced "...colFROM tableWHERE name=...").
    query = " ".join(["SELECT", columns, "FROM", tablename, "WHERE name=" + name])
    cursor.execute(query)
    return cursor.fetchall()
def editcon():
    """Update one row of ``forms`` from the request's query-string values.

    Reads ``title``, ``html``, ``source`` and ``id`` from ``request.args``,
    writes the first three into the row whose id matches the fourth, commits,
    and answers with the literal string ``"ok"``.
    """
    args = request.args
    # Collected in the same order as the placeholders in the statement.
    params = tuple(
        args.get(key) for key in ("title", "html", "source", "id"))
    cursor.execute(
        "update forms set title=%s,html=%s,source=%s where id=%s", params)
    db.commit()
    return "ok"
def formadd():
    """Insert a new row into ``forms`` from the request's query-string values.

    Reads ``title``, ``source`` and ``html`` from ``request.args``, stores
    them as one new row, commits, and answers with the literal string
    ``"ok"``.
    """
    params = request.args
    new_row = (
        params.get("title"),
        params.get("source"),
        params.get("html"),
    )
    cursor.execute(
        "insert into forms (title,source,html) values (%s,%s,%s)", new_row)
    db.commit()
    return "ok"
def write_profile(self, cursor, tablename, field, COLUMN, values, path=None, file_name=None):
    """Recreate ``tablename`` and insert one row of values into it.

    The JSON file at ``path/file_name`` is opened and parsed first; then the
    table is dropped if it exists, created again and a single INSERT is run.
    Nothing is committed here.

    Args:
        cursor: DB-API cursor used to run the statements.
        tablename: Name of the table to drop and recreate.
        field: Column definition appended directly after the table name in
            ``CREATE TABLE`` (so it must include its own parentheses).
        COLUMN: Column list appended directly after the table name in
            ``INSERT INTO`` (including its own parentheses).
        values: Text placed inside ``VALUES(...)``.
        path: Directory containing the JSON file.
        file_name: Name of the JSON file.

    Returns:
        None.  When ``path`` or ``file_name`` is missing a message is printed
        and the database is left untouched.

    NOTE(review): all statements are assembled by string concatenation, so
    every argument must come from trusted code, never from user input.
    """
    if path is None or file_name is None:
        print("Please choose a file and its location")
        return None

    # The parsed content is not used by the statements below; the load is
    # kept so a missing or invalid file still aborts before the table is
    # dropped.
    with open('/'.join([path, file_name]), 'r') as json_file:
        json.load(json_file)

    insert_target = ''.join([tablename, COLUMN])
    values_clause = ''.join(['VALUES(', values, ')'])
    cursor.execute(''.join(["DROP TABLE IF EXISTS ", tablename]))
    cursor.execute(''.join(["CREATE TABLE ", tablename, field]))
    cursor.execute(''.join(["INSERT INTO ", insert_target, ' ', values_clause]))
    return None
def edit(id):
    """Render ``edit.html`` for the form with the given id.

    Args:
        id: Id of the row in ``forms`` to load.

    Returns:
        The rendered template; ``data`` is the fetched ``(id, html)`` row, or
        whatever ``cursor.fetchone()`` yields when no row matches.
    """
    # Bind the id as a query parameter instead of concatenating it into the
    # statement, so a crafted value cannot change the SQL.
    cursor.execute("select id,html from forms where id=%s", (id,))
    result = cursor.fetchone()
    return render_template("edit.html", data=result)
def conadd():
    """Insert a new row into ``con`` from the request's query-string values.

    Reads ``con`` and ``fid`` from ``request.args``, stores them as one new
    row, commits, and answers with the literal string ``"ok"``.
    """
    query = request.args
    values = (query.get("con"), query.get("fid"))
    cursor.execute("insert into con (con,fid) values (%s,%s)", values)
    db.commit()
    return "ok"
def show(id):
    """Render ``show.html`` for the form with the given id.

    Args:
        id: Id of the row in ``forms`` to load.

    Returns:
        The rendered template; ``data`` is the fetched ``(id, source)`` row,
        or whatever ``cursor.fetchone()`` yields when no row matches.
    """
    # Bind the id as a query parameter instead of concatenating it into the
    # statement, so a crafted value cannot change the SQL.
    cursor.execute("select id,source from forms where id=%s", (id,))
    result = cursor.fetchone()
    return render_template("show.html", data=result)
def list():
    """Render ``list.html`` with the id and title of every row in ``forms``."""
    cursor.execute("select id,title from forms")
    return render_template("list.html", data=cursor.fetchall())
def get_hotel_comment(hotel_id, done_list):
    """Scrape details and reviews for each hotel and insert them into the DB.

    For every ``(id, s)`` pair in ``hotel_id`` whose id is not in
    ``done_list``, a Firefox WebDriver session loads the dianping.com shop
    page; the hotel's name, breadcrumb level, address and coordinates are
    parsed out, and the review pages are then walked one by one.  The rows
    are written to ``dzdp_hotel_details`` and ``dzdp_hotel_comments`` through
    the module-level ``cursor`` / ``connect`` objects.

    Args:
        hotel_id: Iterable of ``(id, s)`` pairs.  ``id`` is concatenated into
            the shop URL, so it must be a string; ``s`` is stored next to it
            in the details row (presumably the city slug -- confirm against
            the caller).
        done_list: Container of hotel ids that must be skipped.

    NOTE(review): the nesting below was reconstructed from statement order
    (the block arrived without usable indentation) -- confirm it against the
    upstream file.
    NOTE(review): both INSERT statements are built with ``%`` string
    formatting from scraped text; a quote in the hotel name or address would
    break the statement.  Consider bound parameters.
    """
    starttime = datetime.datetime.now()
    profile = get_profile()
    # Number of 5-minute windows already started; drives the profile refresh.
    time_mark = 1
    for id, s in hotel_id:
        if id not in done_list:
            sql_details = "insert into dzdp_hotel_details VALUES "
            sql_comments = "insert into dzdp_hotel_comments VALUES"
            url = "http://www.dianping.com/shop/" + id
            nowtime = datetime.datetime.now()
            # Elapsed minutes since the start of the run.
            # NOTE(review): timedelta.seconds drops the days component, so
            # this wraps after 24 h; total_seconds() may be what is intended.
            timeminus = ((nowtime - starttime).seconds) / 60
            # Fetch a fresh profile once each further 5-minute window passes.
            if (timeminus > 5 * time_mark):
                profile = get_profile()
                time_mark = time_mark + 1
            # A new browser session is started for every hotel.
            # NOTE(review): the driver is only closed on the two `break`
            # paths below; any exception in between leaves it running.
            driver = webdriver.Firefox(profile)
            driver.set_page_load_timeout(6)
            try:
                driver.get(url)
            except Exception:
                # If the load raises (e.g. the page-load timeout), stop
                # loading and work with whatever the page currently holds.
                driver.execute_script("window.stop()")
            soup = BeautifulSoup(driver.page_source, 'lxml')
            # Level is the third breadcrumb link, or the second one when the
            # third lookup fails.
            try:
                level = soup.find('div', {
                    'class': 'crumb'
                }).find_all('a')[2].text
            except Exception:
                level = soup.find('div', {
                    'class': 'crumb'
                }).find_all('a')[1].text
            hotel_name = soup.find('div', {
                'class': 'hotel-title clearfix'
            }).find('h1').text
            addr = soup.find('span', {'class': 'hotel-address'}).text
            # Coordinates are embedded in the page source as "lat"/"lng".
            # NOTE(review): the '.' between the digit groups is unescaped and
            # therefore matches any character.
            jw_patt = re.compile(
                '"lat":([0-9]+).([0-9]+),"lng":([0-9]+).([0-9]+),')
            jw = jw_patt.findall(driver.page_source)[0]
            weidu = jw[0] + '.' + jw[1]   # latitude  ("lat" groups)
            jingdu = jw[2] + '.' + jw[3]  # longitude ("lng" groups)
            # Note the value order: longitude is written before latitude.
            sql_details = sql_details + "('%s','%s','%s','%s','%s','%s','%s')" % (
                id, s, hotel_name, level, addr, jingdu, weidu)
            # Walk the review pages, starting at page 1.
            i = 1
            while True:
                url_comment = url + '/review_more_latest?pageno=' + str(i)
                try:
                    driver.get(url_comment)
                except Exception:
                    driver.execute_script("window.stop()")
                comment_soup = BeautifulSoup(driver.page_source, 'lxml')
                if i == 1:
                    # Total page count: text of the second-to-last link in
                    # the second 'Pages' div; '1' when that lookup fails.
                    try:
                        pagenum = comment_soup.find_all(
                            'div', {'class': 'Pages'})[1].find_all('a')[-2].text
                        print(pagenum)
                    except Exception:
                        pagenum = '1'
                try:
                    comment_list = comment_soup.find('div',
                                                     {'class': 'comment-list'})
                    contents = comment_list.find_all('div',
                                                     {'class': 'content'})
                except Exception:
                    # No comment list could be read; check whether the page
                    # reports a review count of zero.
                    # NOTE(review): zero_comment[0] raises IndexError when the
                    # pattern is absent, and a non-zero count falls through
                    # to the loop below with `contents` unbound or left over
                    # from the previous page -- confirm the intended handling.
                    dianping_pattr = re.compile(
                        '网友点评</a><em class="col-exp">\(([0-9]+)\)</em>')
                    zero_comment = dianping_pattr.findall(driver.page_source)
                    if zero_comment[0] == '0':
                        # Store the details plus one placeholder comment row
                        # dated today, then move on to the next hotel.
                        cursor.execute(sql_details)
                        sql_comments = sql_comments + "('%s','%s','%s','%s')" % (
                            id, '',
                            time.strftime('%Y-%m-%d',
                                          time.localtime(time.time())),
                            'no comment')
                        cursor.execute(sql_comments)
                        connect.commit()
                        print('insert no comment')
                        driver.close()
                        break
                pattern = re.compile('[\r\n]+')
                for content in contents:
                    # Review text with spaces, line breaks, single quotes and
                    # backslashes removed before it is inlined into the SQL.
                    content_text = content.find('div', {
                        'class': 'J_brief-cont'
                    }).text.replace(' ', '')
                    comment_clean = pattern.sub('', content_text).replace(
                        '\'', '').replace('\\', '')
                    # The comment id is the part after the first underscore
                    # of the misc-info div's id attribute.
                    comment_id = content.find(
                        'div', {'class': 'misc-info'})['id'].split('_')[1]
                    day = content.find('span', {
                        'class': 'time'
                    }).text.split(' ')[0]
                    # Dates shorter than 7 characters get a fixed year prefix.
                    # NOTE(review): the year 2017 is hard-coded.
                    if len(day) < 7:
                        day = '2017-' + day
                    sql_comments = sql_comments + "('%s','%s','%s','%s')," % (
                        id, comment_id, day, comment_clean)
                if str(i) == pagenum:
                    # Last page reached: drop the trailing comma, insert all
                    # comments and the details row, commit, close the browser.
                    cursor.execute(sql_comments[:-1])
                    cursor.execute(sql_details)
                    connect.commit()
                    print(sql_details)
                    driver.close()
                    break
                i = i + 1