Ejemplo n.º 1
0
def get_resources_and_page():
    """Crawl every unprocessed Rootport and store the magnet links it lists.

    For each ``Rootport`` row with ``status=0`` the page at ``rp.link`` is
    fetched. If the page has a ``div.pagination`` block, the ``href`` of its
    second-to-last anchor is read as the page count, saved on the row and
    passed to ``get_sub_page_resources``. Every link whose ``href`` matches
    "magnet" is offered to ``BF_RESOURCES``; a ``Resources`` row is created
    only when ``BF_RESOURCES.add`` returns ``False``. Links whose ``href``
    matches "information" are handed to ``get_keyworld``.

    The row is marked as processed (``status = True``) whether or not the
    fetch succeeded.
    """
    # Invariant across iterations, so build them once.
    headers = {
        'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
        'Referer': 'http://www.zhihu.com/articles'
    }
    magnet_href = re.compile("magnet")
    information_href = re.compile("information")

    # NOTE(review): ``link`` is used below as a plain URL string; confirm that
    # select_related("link") is valid for this model.
    for rp in Rootport.objects.filter(status=0).select_related("link"):
        content = None
        try:
            request = urllib2.Request(url=rp.link, headers=headers)
            response = urllib2.urlopen(request)
            try:
                content = response.read()
            finally:
                # Release the connection even when read() fails.
                response.close()
        except Exception:
            # Best-effort crawl: an unreachable page is skipped, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            content = None

        if content:
            soup = BeautifulSoup(content)

            pagination = soup.find_all("div", class_="pagination")
            if pagination:
                anchors = pagination[0].find_all("a")
                try:
                    count = int(anchors[-2].get('href'))
                except (IndexError, TypeError, ValueError):
                    # Fewer than two anchors, or an href that is missing or
                    # not a plain integer: skip pagination instead of
                    # aborting the whole crawl.
                    count = None
                if count is not None:
                    rp.page_num = count
                    # Persist the page count before the sub-page crawl runs.
                    rp.save()
                    get_sub_page_resources(link=rp.link, num=count)

            for anchor in soup.find_all(href=magnet_href):
                link = anchor.get('href')
                title = anchor.get('title')
                # Only a False return from add() leads to a new row.
                if BF_RESOURCES.add(link) is False:
                    Resources.objects.create(title=title, link=link)

            get_keyworld(soup.find_all(href=information_href))

        rp.status = True
        rp.save()
Ejemplo n.º 2
0
def get_sub_page_resources(link=None, num=None):
    """Crawl the numbered sub-pages of ``link`` and store their magnet links.

    Pages are requested at ``link`` with the page index appended, for indexes
    ``1`` through ``num`` inclusive. On each page, every link whose ``href``
    matches "magnet" is offered to ``BF_RESOURCES``; a ``Resources`` row is
    created only when ``BF_RESOURCES.add`` returns ``False``. Links whose
    ``href`` matches "information" are handed to ``get_keyworld``.

    Args:
        link: URL prefix the page index is appended to.
        num: Index of the last page to fetch.

    Returns:
        None. Returns immediately when ``link`` or ``num`` is missing or
        empty, so calling with the declared defaults no longer raises.
    """
    if not link or not num:
        return

    # Invariant across iterations, so build them once.
    headers = {
        'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
        'Referer': 'http://www.zhihu.com/articles'
    }
    magnet_href = re.compile("magnet")
    information_href = re.compile("information")

    for i in range(1, num + 1):
        page_url = "{link}{i}".format(link=link, i=i)
        try:
            request = urllib2.Request(url=page_url, headers=headers)
            response = urllib2.urlopen(request)
            try:
                content = response.read()
            finally:
                # Release the connection even when read() fails.
                response.close()
        except Exception:
            # Best-effort crawl: skip pages that cannot be fetched, but let
            # KeyboardInterrupt/SystemExit propagate.
            continue

        if not content:
            continue

        # Parse the raw body. Percent-encoding it first (urllib.quote) would
        # escape the angle brackets and leave no tags to find.
        soup = BeautifulSoup(content)

        for anchor in soup.find_all(href=magnet_href):
            sublink = anchor.get('href')
            title = anchor.get('title')
            # Only a False return from add() leads to a new row.
            if BF_RESOURCES.add(sublink) is False:
                Resources.objects.create(title=title, link=sublink)

        get_keyworld(soup.find_all(href=information_href))
Ejemplo n.º 3
0
def get_resources_and_page():
    """Process all Rootport rows with ``status=0`` and record their magnet links.

    Per row: fetch ``rp.link``; if a ``div.pagination`` block exists, take the
    ``href`` of its second-to-last anchor as the page count, save it on the
    row and call ``get_sub_page_resources`` with it; create a ``Resources``
    row for each "magnet" link for which ``BF_RESOURCES.add`` returns
    ``False``; pass the "information" links to ``get_keyworld``. The row's
    ``status`` is set to ``True`` and saved even when the fetch fails.
    """
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
               'Referer': 'http://www.zhihu.com/articles'}
    magnet_pattern = re.compile("magnet")
    information_pattern = re.compile("information")

    # NOTE(review): ``link`` is consumed as a URL string below; verify that
    # select_related("link") is valid for this model.
    for rp in Rootport.objects.filter(status=0).select_related("link"):
        try:
            request = urllib2.Request(url=rp.link, headers=headers)
            response = urllib2.urlopen(request)
            try:
                content = response.read()
            finally:
                # Always close the response so the connection is not leaked.
                response.close()
        except Exception:
            # A failed download is tolerated (best effort); interpreter-exit
            # signals such as KeyboardInterrupt are no longer caught here.
            pass
        else:
            if content:
                soup = BeautifulSoup(content)
                page_num_div = soup.find_all("div", class_="pagination")
                if page_num_div:
                    # The matched tag can be searched directly; no need to
                    # stringify and re-parse it.
                    page_nums = page_num_div[0].find_all("a")
                    try:
                        count = int(page_nums[-2].get('href'))
                    except (IndexError, TypeError, ValueError):
                        # Unexpected pagination markup: skip the sub-page
                        # crawl rather than crash the whole run.
                        pass
                    else:
                        rp.page_num = count
                        rp.save()
                        get_sub_page_resources(link=rp.link, num=count)
                for result in soup.find_all(href=magnet_pattern):
                    link = result.get('href')
                    title = result.get('title')
                    # A row is created only when add() returns False.
                    if BF_RESOURCES.add(link) is False:
                        Resources.objects.create(title=title, link=link)
                keyworld_pages = soup.find_all(href=information_pattern)
                get_keyworld(keyworld_pages)
        rp.status = True
        rp.save()
Ejemplo n.º 4
0
def get_sub_page_resources(link=None, num=None):
    """Fetch sub-pages ``1..num`` of ``link`` and record their magnet links.

    Each page URL is ``link`` followed by the page index. For every page that
    downloads with a non-empty body, a ``Resources`` row is created for each
    "magnet" link for which ``BF_RESOURCES.add`` returns ``False``, and the
    "information" links are passed to ``get_keyworld``.

    Args:
        link: URL prefix; the page index is appended to it.
        num: Highest page index to request (inclusive).

    Returns:
        None. A missing or empty ``link``/``num`` makes the call a no-op
        instead of raising on the ``None`` defaults.
    """
    if not link or not num:
        return

    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
               'Referer': 'http://www.zhihu.com/articles'}
    magnet_pattern = re.compile("magnet")
    information_pattern = re.compile("information")

    for i in range(1, num + 1):
        try:
            request = urllib2.Request(
                url="{link}{i}".format(link=link, i=i), headers=headers)
            response = urllib2.urlopen(request)
            try:
                content = response.read()
            finally:
                # Always close the response so the connection is not leaked.
                response.close()
        except Exception:
            # Best effort: a page that fails to download is skipped.
            # KeyboardInterrupt/SystemExit are deliberately not caught.
            pass
        else:
            if content:
                # Feed the raw HTML to the parser. Running it through
                # urllib.quote first escapes '<' and '>' and hides every tag.
                soup = BeautifulSoup(content)
                for result in soup.find_all(href=magnet_pattern):
                    sublink = result.get('href')
                    title = result.get('title')
                    # A row is created only when add() returns False.
                    if BF_RESOURCES.add(sublink) is False:
                        Resources.objects.create(title=title, link=sublink)
                keyworld_pages = soup.find_all(href=information_pattern)
                get_keyworld(keyworld_pages)