Code Example #1
def crawling(url):
  try:
    # Fetch the target page and decode the response body.
    response = request(url)
    content = checks.page_encoding(response, action="decode")
    # Keep only the markup inside the outermost <html> element.
    match = re.search(r"(?si)<html[^>]*>(.+)</html>", content)
    if match:
      content = "<html>%s</html>" % match.group(1)
    soup = BeautifulSoup(content)
    tags = soup('a')
    if not tags:
      # Fall back to regex extraction of href/src attributes and window.open() calls.
      tags = []
      tags += re.finditer(r'(?i)\s(href|src)=["\'](?P<href>[^>"\']+)', content)
      tags += re.finditer(r'(?i)window\.open\(["\'](?P<href>[^)"\']+)["\']', content)
    for tag in tags:
      # Distinguish BeautifulSoup tags (which expose .get()) from regex matches (which expose .group()).
      href = tag.get("href") if hasattr(tag, settings.HTTPMETHOD.GET) else tag.group("href")
      if href:
        href = _urllib.parse.urljoin(url, href)
        # Keep only same-host links, skipping cache-busting URLs, scripts,
        # stylesheets and excluded extensions, provided the link responds.
        if _urllib.parse.urlparse(url).netloc in href:
          if not re.search(r"\?(v=)?\d+\Z", href) and not \
          re.search(r"(?i)\.(js|css)(\?|\Z)", href) and \
          href.split('.')[-1].lower() not in settings.CRAWL_EXCLUDE_EXTENSIONS:
            if request(href):
              HREF_LIST.append(href)
    if len(HREF_LIST) != 0:
      return list(set(HREF_LIST))
    else:
      if not settings.VERBOSITY_LEVEL >= 2:
        print(settings.SPACE)
      warn_msg = "No usable links found."
      print(settings.print_warning_msg(warn_msg))
      raise SystemExit()
  except (UnicodeEncodeError, ValueError) as e:  # for non-HTML files and non-valid links
    pass
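Example #1 calls a module-level request() helper that is not shown above; the request() functions in examples #2, #4 and #5 below are presumably variants of that helper taken from different snapshots of the commix crawler.py.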
Code Example #2
File: crawler.py Project: xiaofengtongxue/commix
def request(url):
  # Check if POST data is defined.
  if menu.options.data:
    request = urllib2.Request(url, menu.options.data)
  else:
    request = urllib2.Request(url)
  # Apply the configured HTTP headers, then fetch and parse the response.
  headers.do_check(request)
  response = urllib2.urlopen(request)
  soup = BeautifulSoup(response)
  return soup
Code Example #3
def crawling(url):

    # Check if POST data is defined.
    if menu.options.data:
        request = urllib2.Request(url, menu.options.data)
    else:
        request = urllib2.Request(url)
    headers.do_check(request)
    response = urllib2.urlopen(request)
    html_data = response.read()
    soup = BeautifulSoup(html_data)

    # Collect every link that resolves to the same host as the target URL.
    href_list = []
    for tag in soup.findAll('a', href=True):
        tag['href'] = urlparse.urljoin(url, tag['href'])
        o = urlparse.urlparse(url)
        if o.netloc in tag['href']:
            href_list.append(tag['href'])
    return href_list
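Example #3 is nearly self-contained apart from commix's menu and headers helpers. For comparison, a minimal standalone sketch of the same same-host link collection in Python 3 (hypothetical usage; assumes the beautifulsoup4 package is installed) could look like this:

from urllib.parse import urljoin, urlparse
from urllib.request import urlopen

from bs4 import BeautifulSoup

def same_domain_links(url):
    # Fetch the page and keep only links that resolve to the target host.
    html_data = urlopen(url).read()
    soup = BeautifulSoup(html_data, "html.parser")
    netloc = urlparse(url).netloc
    links = []
    for tag in soup.find_all("a", href=True):
        href = urljoin(url, tag["href"])
        if urlparse(href).netloc == netloc:
            links.append(href)
    return links

# Example call (hypothetical host):
# print(same_domain_links("http://127.0.0.1/"))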
Code Example #4
File: crawler.py Project: zxc135781/commix
def request(url):
  # Check if POST data is defined (encoded to bytes for Python 3's urllib).
  if menu.options.data:
    request = _urllib.request.Request(url, menu.options.data.encode(settings.UNICODE_ENCODING))
  else:
    request = _urllib.request.Request(url)
  try:
    headers.do_check(request)
    response = _urllib.request.urlopen(request)
    soup = BeautifulSoup(response)
    return soup
  except _urllib.error.URLError as e:
    # Connection errors are silently swallowed; the caller receives None.
    pass
Code Example #5
def request(url):
  # Check if POST data is defined.
  if menu.options.data:
    request = urllib2.Request(url, menu.options.data)
  else:
    request = urllib2.Request(url)
  try:
    headers.do_check(request)
    response = urllib2.urlopen(request)
    soup = BeautifulSoup(response)
    return soup
  except urllib2.URLError as e:
    # Report the connection failure and abort.
    err_msg = "Unable to connect to the target URL "
    err_msg += "(" + str(e.args[0]).split("] ")[1] + ")."
    print settings.print_critical_msg(err_msg)
    raise SystemExit
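Note that example #4 encodes the POST data before building the request, while the urllib2-based examples pass the string directly: in Python 3, urllib.request.Request expects data to be bytes. A minimal sketch of the Python 3 form (hypothetical target URL and payload):

import urllib.request

# Hypothetical target and payload; the string is encoded because
# urllib.request.Request requires `data` to be bytes in Python 3.
payload = "username=admin&password=test".encode("utf-8")
req = urllib.request.Request("http://127.0.0.1/login.php", data=payload)
with urllib.request.urlopen(req) as response:
    html = response.read().decode("utf-8", errors="replace")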