Beispiel #1
0
def data_soup(url, header=None, cert=None):
    """
    Fetch a page through data_request and parse its body with BeautifulSoup.

    :param url: address handed to data_request
    :param header: request headers handed to data_request
    :param cert: client certificate handed to data_request
    :return: a tuple (response, soup). (None, None) when data_request
             produced a falsy result; (response, None) when reading or
             parsing the body raised.
    """
    print('data_soup')
    response = data_request(url, header, cert)
    try:
        # The truthiness check stays inside the try so that any failure
        # while evaluating it is reported like a parsing failure.
        if not response:
            return None, None
        parsed = BeautifulSoup(response.content, 'html5lib')
    except Exception as err:
        print(fun.exception_format(fun.get_current_function_name(), err))
        return response, None
    return response, parsed
Beispiel #2
0
def connect_bing():
    """
    Open a MongoDB client and return the ``bing`` collection.

    The client is built from the module-level ``host`` and ``port``; the
    collection is looked up in the ``fish_check`` database.

    :return: the ``fish_check.bing`` collection, or None when creating the
             client raised (the formatted error is printed)
    """
    print('connect_bing')
    try:
        mongo = pymongo.MongoClient(host, port)
    except Exception as err:
        print(fun.exception_format(fun.get_current_function_name(), err))
        return None
    # Attribute lookups stay outside the try, so only client creation
    # is covered by the handler above.
    return mongo.fish_check.bing
Beispiel #3
0
def insert_data(collection, database):
    """
    Insert one document or a batch of documents into a collection.

    :param collection: object exposing ``insert_one`` / ``insert_many``
                       (presumably a PyMongo collection - confirm with caller)
    :param database: the data to store; a dict is inserted with
                     ``insert_one``, a non-empty list with ``insert_many``.
                     An empty list and any other type are ignored.
    :return: None. Errors raised by the insert are printed, not re-raised.
    """
    print('insert_data')
    try:
        if isinstance(database, list):
            # insert_many rejects an empty batch with an error; having
            # nothing to insert is not a failure, so skip the call.
            if database:
                collection.insert_many(database)
        elif isinstance(database, dict):
            collection.insert_one(database)
    except Exception as e:
        error_text = fun.exception_format(fun.get_current_function_name(), e)
        print(error_text)
Beispiel #4
0
def is_url_exist(collection, url):
    """
    Tell whether a URL is already stored in the collection.

    :param collection: object exposing ``find_one``
                       (presumably a PyMongo collection - confirm with caller)
    :param url: value matched against the ``url`` field
    :return: True when a matching document exists, False when none does,
             None when the lookup raised (the formatted error is printed)
    """
    print('search_url')
    try:
        # find_one runs the query immediately, so a database failure is
        # raised here, inside the try. A find() cursor is lazy and would
        # only fail later when counted; Cursor.count() is also gone in
        # PyMongo 4.
        match = collection.find_one({'url': url})
    except Exception as e:
        error_text = fun.exception_format(fun.get_current_function_name(), e)
        print(error_text)
        return None
    return match is not None
Beispiel #5
0
def data_request(url, header=None, cert=None):
    """
    Fetch a page with ``requests.get`` and return the unprocessed response.

    Connect timeouts, read timeouts, connection errors, SSL errors and
    non-OK status codes are retried; once the retry budget is used up the
    function gives up. Any other requests exception aborts immediately.

    :param url: address to fetch
    :param header: passed to ``requests.get`` as ``headers``
    :param cert: passed to ``requests.get`` as ``cert``
    :return: the response when its status code is ``requests.codes.ok``,
             otherwise None
    """
    print('data_request')
    # Initial value for ``verify``, as decided by the project helper.
    flag = fun.ssl_judge(url)
    print(flag)
    count = 0
    while True:
        try:
            page = requests.get(url,
                                headers=header,
                                timeout=10,
                                verify=flag,
                                cert=cert)
        except requests.exceptions.ConnectTimeout:
            # Must precede ConnectionError and Timeout: it subclasses both.
            print('ConnectTimeout')
        except requests.exceptions.SSLError:
            print('SSLError')
            if flag:
                # NOTE(security): falling back to verify=False disables
                # certificate validation. Kept because the original code
                # does this deliberately; review if that is still wanted.
                flag = False
                continue
            # Verification is already off; count this attempt so a
            # persistent SSL failure cannot loop forever.
        except requests.exceptions.ConnectionError:
            print('ConnectionError')
            if flag:
                # Retry once without verification before the usual budget.
                flag = False
                count += 1
                continue
        except requests.exceptions.ReadTimeout:
            print('ReadTimeout')
        except requests.exceptions.Timeout:
            # Any timeout that is not one of the two retried above.
            print('Timeout')
            return None
        except requests.exceptions.TooManyRedirects:
            print('TooManyRedirects')
            return None
        except requests.exceptions.HTTPError:
            print('HTTPError')
            return None
        except requests.exceptions.RequestException as e:
            # RequestException is the base class of the requests errors;
            # ``requests.exceptions`` itself is a module and cannot be
            # used in an except clause.
            error_text = fun.exception_format(fun.get_current_function_name(),
                                              e)
            print(error_text)
            return None
        else:
            if page.status_code == requests.codes.ok:
                return page  # get page content
            error_text = "Page Code %s " % page.status_code
            print(error_text)
        # Shared retry bookkeeping for every retryable outcome above.
        if count > 1:
            return None
        count += 1