def _save_baike_batch(rows, count):
    """Write *rows* to ``./localdatasets/baike/<count>.txt`` as CSV, without the index."""
    save_name = "./localdatasets/baike/{}.txt".format(count)
    print(save_name)
    pd.DataFrame(rows).to_csv(save_name, index=None)


def baikespyder():
    """Scrape paragraph text for every URL in the Baike URL list, saving in batches.

    URLs are read from ``./localdatasets/baike/urls.txt``. Entries whose last
    dot-separated component is ``htm`` are skipped. For every other URL the
    page is fetched with ``urlhelper``, each ``<div class="para">`` element's
    text is passed through ``stringpro`` and collected.

    After every 500 processed URLs the collected texts are written to
    ``./localdatasets/baike/<processed-count>.txt`` and the buffer is reset.
    Whatever is still buffered when the URL list is exhausted is written as
    well, so the tail of the crawl is not lost.
    """
    urls = pd.read_csv("./localdatasets/baike/urls.txt").values
    number = 0
    res = []
    for url in tqdm(urls):
        if url[0].split(".")[-1] == 'htm':
            continue
        html = urlhelper(url[0])
        if html is None:
            # A failed fetch yields no markup; handing None to the parser
            # would at best produce an empty document and at worst raise
            # outside the try block below, aborting the whole crawl.
            continue
        soup = BeautifulSoup(html, "lxml")
        try:
            for para in soup.findAll('div', attrs={"class": 'para'}):
                # One bad paragraph must not discard the rest of the page.
                try:
                    res.append(stringpro(para.text))
                except Exception as e:
                    log.warning("555: {}".format(e))
        except Exception as e:
            log.warning("58888:{}".format(e))
        number += 1
        if number % 500 == 0:
            _save_baike_batch(res, number)
            res = []
    if res:
        # Flush the final partial batch (fewer than 500 URLs since last save).
        _save_baike_batch(res, number)
def session_commit(*args, **kwargs):
    """Call ``func`` with the given arguments, rolling the session back on error.

    Returns whatever ``func`` returns. If ``func`` raises any ``Exception``,
    the error is logged, ``db_session.rollback()`` is called and ``None`` is
    returned instead of propagating the exception.

    NOTE(review): ``func`` is not defined in this block; presumably this is
    the inner wrapper of a decorator whose header is outside this view --
    confirm.
    """
    try:
        outcome = func(*args, **kwargs)
    except Exception as exc:
        log.error('db operation error, detail {}'.format(exc))
        log.warning('transaction rollbacks')
        db_session.rollback()
        return None
    return outcome
def search_add_change_status(name):
    """Return every row of ``table_name`` whose ``namemd5`` column equals *name*.

    Args:
        name: Value to compare against the ``namemd5`` column.

    Returns:
        The list produced by ``fetchall()`` (possibly empty), or ``None`` if
        the query raised.
    """
    # Callers written against the old string-spliced query may have wrapped
    # the value in quotes themselves. Stripping one matching pair keeps them
    # working now that the value is bound as a parameter.
    # NOTE(review): assumes legitimate values never start and end with the
    # same quote character (true for hex digests) -- confirm.
    if (isinstance(name, str) and len(name) >= 2
            and name[0] == name[-1] and name[0] in "'\""):
        name = name[1:-1]

    # The table name cannot be a bound parameter, so it is still formatted
    # in; the lookup value is bound so it is quoted/escaped by the driver
    # instead of being pasted into the SQL text.
    sql = "select * from {} where namemd5=:namemd5".format(table_name)
    try:
        return db_session.execute(sql, {"namemd5": name}).fetchall()
    except Exception as e:
        log.warning("! xiaohua select error: {}".format(e))
        return None
    finally:
        # Rows are already fetched, so the session can be released on every
        # path, not only after a failure.
        db_session.close()
def getone():
    """Fetch one row of ``table_name`` whose ``status`` is not 1.

    Returns:
        A ``dict`` built from the row's ``items()``, or ``None`` when no such
        row exists or the query raised (the error is logged).
    """
    sql = "select * from {} where status != 1 limit 1".format(table_name)
    try:
        rows = db_session.execute(sql).fetchall()
        if rows:
            # The dict is fully materialised before ``finally`` releases the
            # session, so the caller never touches a closed result.
            return dict(rows[0].items())
    except Exception as e:
        log.warning("! xiaohua select error: {}".format(e))
    finally:
        # Release the session on every path; previously the success path
        # returned without closing it.
        db_session.close()
    return None
def urlhelper(url):
    """Download *url* and return its body decoded as UTF-8.

    The request is sent with a desktop-Chrome ``User-Agent`` plus ``Accept``
    and ``Accept-Language`` headers.

    Args:
        url: Address to fetch.

    Returns:
        The decoded response body, or ``None`` if the request failed with
        ``URLError`` (the error is logged). Other exceptions, such as a
        ``UnicodeDecodeError`` for a non-UTF-8 body, propagate to the caller.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
        "Accept": "*/*",
        "Accept-Language": "zh-CN,zh;q=0.8",
    }
    try:
        req = urllib.request.Request(url, headers=headers)
        # The context manager closes the response (and its socket) even if
        # reading or decoding fails.
        with urllib.request.urlopen(req) as response:
            return response.read().decode('utf-8')
    except error.URLError as e:
        log.warning("{}".format(e))
        return None
def update(idstr, inputs_dict):
    """Update the ``XiaoHua`` row whose ``id`` equals *idstr*.

    Args:
        idstr: Primary-key value used in the ``XiaoHua.id == idstr`` filter.
        inputs_dict: Mapping providing ``name``, ``status`` and
            ``createtime``; ``namemd5`` is derived via ``curlmd5(name)``.

    Returns:
        The dict of column values that was applied, or ``None`` if anything
        raised (missing key, query or commit failure); the error is logged.
    """
    try:
        update_dict = {
            "name": inputs_dict['name'],
            "namemd5": curlmd5(inputs_dict['name']),
            "status": inputs_dict['status'],
            "createtime": inputs_dict['createtime'],
        }
        db_session.query(XiaoHua).filter(XiaoHua.id == idstr).update(update_dict)
        db_session.commit()
    except Exception as e:
        log.warning(e)
        # Undo the failed transaction so the shared session stays usable for
        # the next operation.
        db_session.rollback()
        return None
    finally:
        # Previously the session was closed only on success.
        db_session.close()
    return update_dict
def add(inputs_dict):
    """Create a ``XiaoHua`` record from *inputs_dict* and commit it.

    Reads ``name``, ``status`` and ``createtime`` from *inputs_dict* and
    derives ``namemd5`` with ``curlmd5(name)``. The session is closed before
    returning.

    Returns:
        The newly added ``XiaoHua`` object, or ``None`` if any step raised
        (the exception is logged as a warning).
    """
    created = None
    try:
        fields = dict(
            name=inputs_dict['name'],
            namemd5=curlmd5(inputs_dict['name']),
            status=inputs_dict['status'],
            createtime=inputs_dict['createtime'],
        )
        candidate = XiaoHua(fields)
        db_session.add(candidate)
        db_session.commit()
        # Only hand the object back once the commit has gone through.
        created = candidate
    except Exception as exc:
        log.warning(exc)
    db_session.close()
    return created
def add(inputs_dict):
    """Create a ``CaiLianShe`` record from *inputs_dict* and commit it.

    Reads ``news``, ``createtime`` and ``comment`` from *inputs_dict*. The
    session is closed before returning.

    Returns:
        The newly added ``CaiLianShe`` object, or ``None`` if any step raised
        (the exception is logged as a warning).
    """
    created = None
    try:
        fields = dict(
            news=inputs_dict['news'],
            createtime=inputs_dict['createtime'],
            comment=inputs_dict['comment'],
        )
        candidate = CaiLianShe(fields)
        db_session.add(candidate)
        db_session.commit()
        # Only hand the object back once the commit has gone through.
        created = candidate
    except Exception as exc:
        log.warning(exc)
    db_session.close()
    return created
def update(idstr, inputs_dict):
    """Update the ``CaiLianShe`` row whose ``id`` equals *idstr*.

    Args:
        idstr: Primary-key value used in the ``CaiLianShe.id == idstr`` filter.
        inputs_dict: Mapping providing ``news``, ``status``, ``createtime``,
            ``comment`` and the value for the ``time`` column (read from
            ``'time'``, falling back to ``'name'``).

    Returns:
        The dict of column values that was applied, or ``None`` if anything
        raised (missing key, query or commit failure); the error is logged.
    """
    try:
        # NOTE(review): the ``time`` column used to be filled from
        # inputs_dict['name'], which looks like a copy-paste slip. Prefer a
        # real 'time' entry, but keep 'name' as a fallback so existing
        # callers keep working -- confirm which key callers actually send.
        if 'time' in inputs_dict:
            time_value = inputs_dict['time']
        else:
            time_value = inputs_dict['name']
        update_dict = {
            "news": inputs_dict['news'],
            "time": time_value,
            "status": inputs_dict['status'],
            "createtime": inputs_dict['createtime'],
            "comment": inputs_dict['comment'],
        }
        db_session.query(CaiLianShe).filter(
            CaiLianShe.id == idstr).update(update_dict)
        db_session.commit()
    except Exception as e:
        log.warning(e)
        # Undo the failed transaction so the shared session stays usable for
        # the next operation.
        db_session.rollback()
        return None
    finally:
        # Previously the session was closed only on success.
        db_session.close()
    return update_dict