Example #1
    def get_page_urls(self):
        '''
        Parse the URL of every result page from the initial page's HTML.
        :return:
        '''
        # Parse out the list of page-number <li> elements
        pages_ul = self.soup.find('ul', class_='npage fr')
        pages_lis = pages_ul.find_all('li')
        pages_lis = [li for li in pages_lis if re.search(r'\d+', li.text)]

        # The <li> element for the last page
        last_page_li = pages_lis[-1]
        # Parse the page URL and the page count out of the last page's <li>.
        # URL format: /full/0/765_0_180000_0_0_0_0_10_1, where the '10' is the page count
        last_page_url = last_page_li.find('a')['href']
        last_page_url = urljoin.url_path_join(
            HtmlConstant.ZHILIAN_XIAOYUAN_HOST, last_page_url)
        page_num = int(last_page_li.text)
        logging.debug(
            'get_page_urls method, last_page_url = %s, page_num = %s',
            last_page_url, page_num)

        # Build a URL format string (assumption: the last '_'-separated
        # segment of the URL is the page index, so swap it for a placeholder)
        splited_url = last_page_url.split('_')
        page_formatter_url = '_'.join(splited_url[:-1] + ['{}'])
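The method is truncated at this point. Assuming the format string built above takes the page index as its only placeholder, a plausible continuation (hypothetical, not from the original source) would substitute every index up to page_num and return the resulting URLs:

        # Hypothetical continuation of the truncated method: build one URL per
        # page by substituting each page index into the format string.
        page_urls = [page_formatter_url.format(i) for i in range(1, page_num + 1)]
        return page_urls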
    def new(self):
        # type: () -> str
        """Get new project ID

        Returns:
            str: ID of the newly created project
        """
        # todo put/post?
        return requests.get(url_path_join(self.get_address(), "new")).text
    def update(self, ds_id, data):
        # type: (str, dict) -> None
        """ Update project details

        Args:
            ds_id (str): dataset id
            data (dict): new dataset properties
        """
        return requests.post(url_path_join(self.get_address(), "update", ds_id),
                             json=data)
    def project_details(self, ds_id):
        # type: (str) -> dict
        """ Get project details

        Args:
            ds_id (str): dataset id

        Returns:
            dict: Dataset details
        """
        return requests.get(url_path_join(self.get_address(),
                                          "detail",
                                          ds_id)).json()
    def use(self, ds_id, usage={}):
        # type: (str, dict) -> dict
        """Get dataset info with use note

        Args:
            ds_id (str): dataset id
            usage (dict): usage details

        Returns:
            dict: dataset details
        """
        return requests.post(url_path_join(self.get_address(), "use", ds_id),
                             json=usage).json()
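The four methods above (new, update, project_details, use) all build their endpoint with url_path_join against a get_address() helper, so they read as pieces of one thin REST client. A hypothetical round trip, assuming a client class named DatasetClient exposes these methods and takes the server address in its constructor:

# Hypothetical usage; DatasetClient and the server address are illustrative only.
client = DatasetClient("http://localhost:8080")
ds_id = client.new()                              # ask the server for a fresh dataset id
client.update(ds_id, {"name": "demo"})            # push new properties
details = client.project_details(ds_id)           # read them back as a dict
info = client.use(ds_id, {"purpose": "testing"})  # record a usage note and fetch details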
Example #6
def link_crawler(seed_url, link_regex):
    """
    :param seed_url: URL of the start (seed) page
    :param link_regex: regular expression used to decide which links to follow
    :return: None
    """
    crawl_queue = [seed_url]
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url, user_agent='Baiduspider', proxy=None, retries=2)
        for link in get_links(html):
            #print('seen:', seen)
            if re.match(link_regex, link, re.I):
                link = urljoin.url_path_join(seed_url, link)
                #print('link1:', link)
                if link not in seen:
                    crawl_queue.append(link)
                    seen.add(link)
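A hedged usage example, assuming the download() and get_links() helpers from the later snippets are in scope; the site and regular expression are illustrative only:

# Crawl only index and view pages reachable from the seed URL (hypothetical values).
link_crawler('http://example.webscraping.com', '/(index|view)')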
Example #7
def crawl(_dst_url: str):
    global __count
    history_list.append(_dst_url)
    __count = __count + 1
    filename = util.get_hash(_dst_url)
    cache_html_path = cache_page_path + filename + ".html"
    if file.exist(cache_html_path):
        html = file.read(cache_html_path)
        print("缓存读取----->" + filename)
    else:
        print("请求网络----->" + filename)
        html = spider.get_html(_dst_url)
        file.write(cache_html_path, html)
    links = spider.get_links(html)
    img_list = spider.get_img(html)
    # print(img_list)

    __image_list.extend(img_list)

    json_str = json.dumps(img_list, ensure_ascii=False)

    file.write(cache_link_path + filename + ".json", json_str)

    global_list.extend(links)

    # Deduplicate, drop URL fragments, and keep only links under the crawl
    # root; scheme-less relative links are resolved against the base URL.
    global_set = set(global_list)
    new_set = set()
    for x in global_set:
        v = str(x).split("#")[0]
        if _root in v:
            new_set.add(v)
        else:
            if str(v).find('http') == -1:
                new_set.add(urljoin.url_path_join(_base_url, v))

    return new_set
Example #8
def htmlParse(seed_url, url, st=None, result=None):
    logger = logging.getLogger(__name__)
    if st is None:
        st = set()
    if result is None:
        result = {}
    current_page_children = set()
    logger.error("Making network call for url: " + url)
    content = network.makeGetRequest(url)

    if content is None:
        return None
    logger.error("Request done successfully")
    soup = BeautifulSoup(content, parser.SOUP_DECODER)
    for tag in soup.find_all(href=True):
        if is_internal_link(tag[parser.LINK_ATTR]):
            if urlparse(tag[parser.LINK_ATTR]).netloc == '':
                # Use a distinct name here so the page's own `url` (the key
                # used for `result` below) is not overwritten inside the loop.
                child_url = urljoin.url_path_join(
                    "https://docs.python.org/", tag[parser.LINK_ATTR])
                st.add(child_url)
                current_page_children.add(child_url)
            else:
                st.add(tag[parser.LINK_ATTR])
                current_page_children.add(tag[parser.LINK_ATTR])
    result[url] = current_page_children
    return st
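htmlParse() accumulates every internal link into st and maps the fetched page to its children in result, so it can be driven breadth-first to build a small site map. A minimal sketch under those assumptions (the root URL is the one hard-coded in the snippet):

# Hypothetical driver built on top of htmlParse() above.
def build_sitemap(root="https://docs.python.org/", max_pages=20):
    seen, sitemap = set(), {}
    queue = [root]
    while queue and len(sitemap) < max_pages:
        page = queue.pop(0)
        if page in sitemap:
            continue
        links = htmlParse(root, page, st=seen, result=sitemap)
        if links:
            # enqueue pages that have not been mapped yet
            queue.extend(link for link in links if link not in sitemap)
    return sitemap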
Example #9
def test_url_path_join_nested():
    assert url_path_join("http://example.com/", "/path/path") == \
           "http://example.com/path/path"
Example #10
def test_url_path_join_multiple():
    assert url_path_join("http://example.com/", "/path/", "a", "b") == \
           "http://example.com/path/a/b"
Example #11
def test_url_path_join_trailing_slash():
    assert url_path_join("http://example.com/", "/path/",
                         trailing_slash=True) == \
           "http://example.com/path/"
Example #12
def test_url_path_join_single():
    assert url_path_join("http://example.com/") == \
           "http://example.com"
Example #13
#import urllib3
import requests
import re
import urljoin
import urllib
from urllib import robotparser
from urllib import request
from urllib import error
from urllib import parse

# w = whois.whois("baidu.com")
# print(w)

url = 'http://webscraping.com'
rp = robotparser.RobotFileParser()
rp.set_url(urljoin.url_path_join(url, 'robots.txt'))
rp.read()


def download(url, user_agent='wyy', proxy=None, retries=2):
    print('Downloading:', url)
    headers = {'User-agent': user_agent}

    if not rp.can_fetch(user_agent, url):
        print('Blocked by robots.txt:', url)
        return None

    req = request.Request(url, headers=headers)
    opener = request.build_opener()
    if proxy:
        proxy_params = {parse.urlparse(url).scheme: proxy}
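The snippet is cut off inside the proxy branch. Under the usual urllib retry pattern this function appears to follow (assumed here, not taken from the original), the remainder of download() would register the proxy handler, fetch the page, and retry on 5xx server errors:

        # Assumed completion of the truncated function (not in the original):
        opener.add_handler(request.ProxyHandler(proxy_params))
    try:
        html = opener.open(req).read().decode('utf-8')
    except error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            # retry transient server-side errors a limited number of times
            return download(url, user_agent, proxy, retries - 1)
    return html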
Example #14
    def reload(self):
        # type: () -> None
        """ Force server to reload DB
        """
        requests.post(url_path_join(self.get_address(), "reload"))
Example #15
    def scan(self):
        # type: () -> None
        """ Force local fs rescan
        """
        return requests.get(url_path_join(self.get_address(), "scan"))
    def parse_detail_page(self, response=None, url=None):
        try:
            response.encoding = self.encoding
            unicode_html_body = response.text
            # dict_content = json.loads(unicode_html_body)
            # data = dict_content["data"]
            data = etree.HTML(unicode_html_body)
            # data =BeautifulSoup(unicode_html_body,"lxml")
            # data = htmlparser.Parser (unicode_html_body)

        except Exception as e:
            print("parse_detail_page(): %s" % e)
            return None
        from_tag_url = response.url
        logger.info("list page url%s" % from_tag_url)
        #
        li_content = data.xpath('''//li[@class="l-twos"]''')
        # XPath positions are 1-based; loop over every <li class="l-twos"> found above
        for i in range(1, len(li_content) + 1):
            detail_url = data.xpath('''(//li[@class="l-twos"])[{}]//div[@class="l-tw-title"]/@onclick'''.format(i))[0].replace("window.location.href='","").replace("'","")
            detail_url = urljoin.url_path_join("http://job.qust.edu.cn",detail_url)
            if self.getdumps(detail_url):
                continue
            date = data.xpath('''(//li[@class="l-twos"])[{}]//div[@class="l-tw-xiangq"]/p[@class="l-tw-xiangqa"]/text()[2]'''.format(i))[0].replace("时间:","")
            location = data.xpath('''(//li[@class="l-twos"])[{}]//div[@class="l-tw-xiangq2"]/p[@class="l-tw-xiangqa"]/text()[2]'''.format(i))[0].replace("地点:","")
            uid = str(uuid.uuid3(uuid.NAMESPACE_DNS, detail_url)) + str(uuid.uuid5(uuid.NAMESPACE_DNS, detail_url))[0]
            ctime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            title = data.xpath('''(//li[@class="l-twos"])[{}]//div[@class="l-tw-title"]/@title'''.format(i))[0]
            siteName = self.siteName
            post = {
                "uuid": uid,  # md5
                "detailUrl": detail_url,  # url
                "title": title,  # 标题
                "location": location,  # 地点
                "siteName": self.siteName,
                "ctime": ctime,
                "date": date,

            }

            dic = self.handle_post(post)
            sql = '''insert into list_info (uuid,detailUrl,title,location,siteName,ctime,date)
                                      VALUE (%s,%s,%s,%s,%s,%s,%s);'''
            try:
                # make sure the DB connection is still alive before executing the SQL
                self.db.ping()

            except Exception as e:
                logger.info(e)
                self.db.rollback()  # roll back on error
                self.db = MySQLdb.connect(host="127.0.0.1", user="******", passwd="root", db='qust',
                                          charset='utf8')

            try:
                self.cursor.execute(sql, (
                    uid, detail_url, title, location, siteName, ctime, date))
                self.db.commit()  # 把修改的数据提交到数据库
                logger.info("入库完成")
            except Exception as e:
                logger.info(e)
                self.db.rollback()  # roll back on error

        # self.parse_restore(unicode_html_body)
        return