Example #1
0
def _save_baike_batch(paragraphs, count):
    """Write *paragraphs* to ``./localdatasets/baike/<count>.txt`` as CSV."""
    save_name = "./localdatasets/baike/{}.txt".format(count)
    print(save_name)
    pd.DataFrame(paragraphs).to_csv(save_name, index=None)


def baikespyder():
    """Crawl the pages listed in ``urls.txt`` and dump their paragraphs.

    The first column of ``./localdatasets/baike/urls.txt`` is read as the
    list of URLs. URLs whose last dot-separated part is ``htm`` are skipped.
    For every other page, the text of each ``<div class="para">`` element is
    passed through ``stringpro`` and collected. After every 500 processed
    pages the collected paragraphs are written to
    ``./localdatasets/baike/<pages processed>.txt`` and the buffer is reset;
    whatever is left once the URL list is exhausted is written the same way.

    Pages that cannot be downloaded are skipped and do not count towards
    the number of processed pages.
    """
    urls = pd.read_csv("./localdatasets/baike/urls.txt").values
    number = 0
    res = []
    for url in tqdm(urls):
        link = url[0]
        if link.split(".")[-1] == 'htm':
            continue
        html = urlhelper(link)
        if html is None:
            # No markup came back (failed download); BeautifulSoup cannot
            # parse None, so skip this page instead of aborting the crawl.
            log.warning("skip {}: no html returned".format(link))
            continue
        soup = BeautifulSoup(html, "lxml")
        try:
            paragraphs = soup.findAll('div', attrs={"class": 'para'})
        except Exception as e:
            log.warning("58888:{}".format(e))
            paragraphs = []
        for paragraph in paragraphs:
            # One bad paragraph must not discard the rest of the page.
            try:
                res.append(stringpro(paragraph.text))
            except Exception as e:
                log.warning("555: {}".format(e))

        number += 1
        if number % 500 == 0:
            _save_baike_batch(res, number)
            res = []

    # Flush the final partial batch; without this, everything collected
    # after the last multiple of 500 pages would be lost.
    if res:
        _save_baike_batch(res, number)
Example #2
0
 def session_commit(*args, **kwargs):
     """Call ``func`` with the given arguments, rolling back on failure.

     Returns whatever ``func`` returns. If ``func`` raises, the error is
     logged, ``db_session`` is rolled back and ``None`` is returned; the
     exception is not re-raised.
     """
     try:
         outcome = func(*args, **kwargs)
     except Exception as exc:
         log.error('db operation error, detail {}'.format(exc))
         log.warning('transaction rollbacks')
         db_session.rollback()
         return None
     return outcome
Example #3
0
def search_add_change_status(name):
    """Fetch every row of ``table_name`` whose ``namemd5`` equals *name*.

    Args:
        name: Value compared against the ``namemd5`` column. It is sent as
            a bound parameter, so pass the raw value without SQL quoting.

    Returns:
        The list produced by ``fetchall()`` (possibly empty), or ``None``
        if the query raised. The session is closed in either case.
    """
    # An identifier cannot be a bind parameter, so the table name is still
    # interpolated; the value is bound, which avoids both quoting errors
    # for string values and SQL injection.
    sql = "select * from {} where namemd5=:namemd5".format(table_name)
    try:
        return db_session.execute(sql, {"namemd5": name}).fetchall()
    except Exception as e:
        log.warning("! xiaohua select error: {}".format(e))
        return None
    finally:
        # Runs on the success path too, so the session is always released.
        db_session.close()
Example #4
0
def getone():
    """Return one row of ``table_name`` whose ``status`` is not 1.

    Returns:
        A ``dict`` built from the first matching row's items, or ``None``
        when no row matches or the query raised. The session is closed in
        every case.
    """
    sql = "select * from {} where status != 1 limit 1".format(table_name)
    try:
        rows = db_session.execute(sql).fetchall()
        if rows:
            # The dict is fully built before ``finally`` closes the session.
            return dict(rows[0].items())
    except Exception as e:
        log.warning("! xiaohua select error: {}".format(e))
    finally:
        # Also runs on the early return above, so a successful lookup no
        # longer leaves the session open.
        db_session.close()
    return None
Example #5
0
def urlhelper(url):
    """Download *url* with browser-like headers and return the body as text.

    Args:
        url: Address to fetch.

    Returns:
        The response body decoded as UTF-8, or ``None`` when the request
        fails with ``URLError`` (``HTTPError`` is a subclass, so HTTP error
        statuses are covered too). The failure is logged as a warning.

    Raises:
        UnicodeDecodeError: If the body is not valid UTF-8; only
            ``URLError`` is handled here.
    """
    try:
        req = urllib.request.Request(url)
        req.add_header("User-Agent",
                       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36")
        req.add_header("Accept", "*/*")
        req.add_header("Accept-Language", "zh-CN,zh;q=0.8")
        # The context manager closes the response (and its socket) even if
        # reading or decoding fails, instead of leaking it.
        with urllib.request.urlopen(req) as response:
            return response.read().decode('utf-8')
    except error.URLError as e:
        log.warning("{}".format(e))
        return None
Example #6
0
def update(idstr, inputs_dict):
    """Overwrite columns of the ``XiaoHua`` row(s) whose id equals *idstr*.

    Args:
        idstr: Value compared against ``XiaoHua.id``.
        inputs_dict: Mapping that must provide ``'name'``, ``'status'`` and
            ``'createtime'``; ``namemd5`` is derived from ``'name'`` with
            ``curlmd5``.

    Returns:
        The dict of column values sent to the UPDATE, or ``None`` if
        anything raised (a missing key included). The number of rows
        actually matched is not checked.
    """
    try:
        update_dict = {"name": inputs_dict['name'],
                       "namemd5": curlmd5(inputs_dict['name']),
                       "status": inputs_dict['status'],
                       "createtime": inputs_dict['createtime']}
        db_session.query(XiaoHua).filter(XiaoHua.id == idstr).update(update_dict)
        db_session.commit()
    except Exception as e:
        log.warning(e)
        return None
    finally:
        # Close on the failure path as well: this releases the connection
        # and discards a failed, uncommitted transaction so the shared
        # session is usable by the next call.
        db_session.close()
    return update_dict
Example #7
0
def add(inputs_dict):
    """Insert a ``XiaoHua`` row built from *inputs_dict*.

    Reads ``'name'``, ``'status'`` and ``'createtime'`` from *inputs_dict*
    and derives ``namemd5`` from the name via ``curlmd5``.

    Returns:
        The ``XiaoHua`` object that was added and committed, or ``None`` if
        any step raised (the error is logged). The session is closed before
        returning in both cases.
    """
    try:
        raw_name = inputs_dict['name']
        fields = dict(
            name=raw_name,
            namemd5=curlmd5(raw_name),
            status=inputs_dict['status'],
            createtime=inputs_dict['createtime'],
        )
        candidate = XiaoHua(fields)
        db_session.add(candidate)
        db_session.commit()
    except Exception as exc:
        log.warning(exc)
        record = None
    else:
        record = candidate
    db_session.close()
    return record
Example #8
0
def add(inputs_dict):
    """Insert a ``CaiLianShe`` row built from *inputs_dict*.

    Reads ``'news'``, ``'createtime'`` and ``'comment'`` from *inputs_dict*.

    Returns:
        The ``CaiLianShe`` object that was added and committed, or ``None``
        if any step raised (the error is logged). The session is closed
        before returning in both cases.
    """
    try:
        # Keys are read in this order, so a missing key fails the same way.
        fields = {
            column: inputs_dict[column]
            for column in ("news", "createtime", "comment")
        }
        candidate = CaiLianShe(fields)
        db_session.add(candidate)
        db_session.commit()
    except Exception as exc:
        log.warning(exc)
        record = None
    else:
        record = candidate
    db_session.close()
    return record
Example #9
0
def update(idstr, inputs_dict):
    """Overwrite columns of the ``CaiLianShe`` row(s) whose id equals *idstr*.

    Args:
        idstr: Value compared against ``CaiLianShe.id``.
        inputs_dict: Mapping that must provide ``'news'``, ``'status'``,
            ``'createtime'`` and ``'comment'``, plus the value for the
            ``time`` column under ``'time'`` (or, for older callers, under
            ``'name'``).

    Returns:
        The dict of column values sent to the UPDATE, or ``None`` if
        anything raised (a missing key included). The number of rows
        actually matched is not checked.
    """
    try:
        # NOTE(review): the "time" column used to be read from the 'name'
        # key, which looks like a copy/paste slip. Prefer 'time' and keep
        # 'name' as a fallback so existing callers keep working; confirm
        # against callers.
        if 'time' in inputs_dict:
            time_value = inputs_dict['time']
        else:
            time_value = inputs_dict['name']
        update_dict = {
            "news": inputs_dict['news'],
            "time": time_value,
            "status": inputs_dict['status'],
            "createtime": inputs_dict['createtime'],
            "comment": inputs_dict['comment'],
        }
        db_session.query(CaiLianShe).filter(
            CaiLianShe.id == idstr).update(update_dict)
        db_session.commit()
    except Exception as e:
        log.warning(e)
        return None
    finally:
        # Close on the failure path as well: this releases the connection
        # and discards a failed, uncommitted transaction so the shared
        # session is usable by the next call.
        db_session.close()
    return update_dict