Example No. 1
def get_all_links(domain, path, maxSize):
    #response = requests.get(domain+path, headers={'User-Agent': 'Mozilla/5.0'})
    driver = webdriver.PhantomJS()
    driver.get(domain + path)
    soup = BeautifulSoup(driver.page_source, "html.parser")
    links = []
    rp = RobotsCache(10000)
    for div in soup.findAll('div'):
        for link in div.findAll('a', href=True):
            #print(link.get('href'))
            if (rrobots(domain, link.get('href'), rp)):
                regex = re.compile(
                    r'^(?:http|ftp)s?://'  # http:// or https://
                    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  #domain...
                    r'localhost|'  #localhost...
                    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
                    r'(?::\d+)?'  # optional port
                    r'(?:/?|[/?]\S+)$',
                    re.IGNORECASE)
                if re.match(regex, domain + link.get('href')) is not None:
                    if (len(link.get('href')) > 0):
                        if ((link.get('href')[0] >= 'a'
                             and link.get('href')[0] <= 'z')
                                or (link.get('href')[0] >= '1'
                                    and link.get('href')[0] <= '9')):
                            links.append('/' + link.get('href'))
                        else:
                            links.append(link.get('href'))
    return links
Example No. 2
    def __init__(self, url, config={}, proxies={},
                 auth=None, ua=DEFAULT_HODOR_UA,
                 pagination_max_limit=DEFAULT_HODOR_MAX_PAGES,
                 crawl_delay=DEFAULT_CRAWL_DELAY,
                 ssl_verify=False,
                 trim_values=True,
                 robots=True,
                 reppy_capacity=100):

        self.content = None
        self.url = url
        self.domain = self._get_domain()
        self.proxies = proxies
        self.auth = auth
        self.ua = ua
        self.trim_values = trim_values
        self.ssl_verify = ssl_verify
        self.config = {}
        self.extra_config = {}

        self.robots = RobotsCache(capacity=reppy_capacity) if robots else None

        self._pages = []
        self._page_count = 0
        self._pagination_max_limit = pagination_max_limit
        self.crawl_delay = self._crawl_delay(crawl_delay)

        for k, v in config.items():
            if k.startswith("_"):
                self.extra_config[k.lstrip("_")] = v
            else:
                self.config[k] = v
Example No. 3
    def download_pages_in_queue(self, queue):
        current_page_url = queue.get()

        robot = RobotsCache()
        if (robot.allowed(current_page_url, "*")):

            print current_page_url
            if len(current_page_url) < 10: return
            current_page_html = download_page_by_url(current_page_url)
            bs = BeautifulSoup(current_page_html, "html.parser")

            links = bs.find_all('a', href=True)
            post_links = [link['href'] for link in links]

            for post_link in post_links:
                if len(post_link) < 10: continue
                if str(post_link).find('http') != 0:
                    post_link = str(self.start_url) + str(post_link)
                queue.put(post_link)
            self.sites_num = self.sites_num + 1

            page = Pages(url=current_page_url,
                         parsed_text=get_text_from_html(current_page_html),
                         is_indexed=False)
            page.save()
        else:
            print "Page can't be indexed because of the rules in ROBOTS.TXT"
Example No. 4
    def __init__(self,
                 file,
                 ua,
                 check=True,
                 output="output.csv"
                 ):  #setting output to false disables file output
        if check:  #only setup robot checker if robot checking is enabled
            self.ua = ua  #user agent
            self.robo = RobotsCache(capacity=100)

        #check var disables or enables robots.txt checking
        #recommended to keep default True value
        self.check = check
        self.req = requests  #request obj for parsing url

        self.output = output  #where to output file

        self.data = []  #init array of grabbed sites

        self.configarr = []  #empty array of all configs

        if type(file) is list:
            self.configarr = file
        else:
            self.configarr.append(file)
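Example No. 5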
    def get_robot_agent(root_domain: str, protocol="http") -> Rules:
        if root_domain.startswith("http"):
            root_domain = LinkChecker.get_root_domain(root_domain)[4]
        versions = ["http://", "https://", "http://www.", "https://www."]
        suffix = "/robots.txt"
        current = ""
        found = False
        for version in versions:
            temp_link = version + root_domain + suffix
            try:
                status_code, content_type = LinkChecker.get_response(temp_link)
                if status_code == ResponseCode.LinkOK:
                    current = temp_link
                    found = True
                    break
                else:
                    raise ConnectionError
            except:
                pass
        if found:
            try:
                robots = RobotsCache()
                req = robots.session.get(current)
                ttl = max(robots.min_ttl,
                          Utility.get_ttl(req.headers, robots.default_ttl))
                # And now parse the thing and return it
                return parser.Rules(current, req.status_code, req.content,
                                    time.time() + ttl)

                # rules = robots.fetch(current)
                # return rules
            except:
                return None
        else:
            return None
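Example No. 6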
    def __FetchRobotFileInfo__(self, url, robotDictForDomains, timeStamp):
        domainName = self.__GetComSubdomainOfUrl__(url)
        robotUrl = ""

        if robotDictForDomains.has_key(domainName) == False:
            robotUrl = self.__GetRobotUrlForUrl__(domainName)
            cache = RobotsCache()
            try:
                timeStamp[domainName] = datetime.datetime.now()
                robotFileObj = cache.fetch(robotUrl)
                doesUrlExistOnline = self.__DoesUrlExistOnline__(robotUrl)
            except:
                doesUrlExistOnline = False
                robotDictForDomains[domainName] = (doesUrlExistOnline, object)

            if doesUrlExistOnline == True:
                robotDictForDomains[domainName] = (doesUrlExistOnline,
                                                   robotFileObj)
            else:
                robotDictForDomains[domainName] = (doesUrlExistOnline, object)

        doesUrlExistOnline = robotDictForDomains[domainName][0]
        robotFileObj = robotDictForDomains[domainName][1]
        # print "heyyy",robotUrl, doesUrlExistOnline, robotFileObj, robotDictForDomains
        return doesUrlExistOnline, robotFileObj, robotDictForDomains, timeStamp, domainName
Example No. 7
 def testRobot3(self):
     robots = RobotsCache()
     rules = robots.fetch("http://www.realwire.com/")
     crawl_delay = rules.delay("idiot")
     print("delay is:", crawl_delay)
     for i in range(1, 1000):
         print(rules.allowed("http://api.google.com/search/",
                             agent="idiot"))
Example No. 8
 def check_for_robot_access(self, page):
     self.f.write('--- checking for robots %s\n' % page)
     robots = RobotsCache()
     try:
         if robots.allowed(page + 'robots.txt', 'my-agent'):
             print 'robots allowed'
             self.f.write('robots allowed. \n')
             return True
     except ServerError, r:
         print 'error ', r
         return False
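Example No. 9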
 def robot_rules(_url_scheme, _url_netloc):  # return a robot rules objects
     _domain = urlunparse((_url_scheme, _url_netloc, '', '', '', ''))
     robots = RobotsCache()
     try:
         rules = robots.fetch(_domain, timeout=5)
     except Exception as exc:
         print('Failed to fetch robots.txt {},{}'.format(
             _url_scheme, _url_netloc))
         print(exc)
         return None
     return rules
Example No. 10
 def get_text_by_base_url(self):
     robots = RobotsCache(capacity=100)
     if not robots.allowed(self.base_url, "python-requests"):
         return ["Crawling this site is not allowed by robots.txt"]
     text_list = []
     for slug in self.__get_links_by_url_depth():
         sleep(0.5)
         text_list.append(
             remove_emoji(
                 remove_url(self.__get_text_by_url(self.base_url +
                                                   slug))).strip())
     return text_list
Example No. 11
def confirm_robots_txt(target_url, max_capacity):
    '''Confirm that the target URL is allowed to be crawled.

    :type target_url: str
    :param target_url: URL the agent wants to crawl
    :type max_capacity: int
    :param max_capacity: capacity limit for the robots.txt cache
    :rtype: bool
    :return: whether it is possible to scrape
    '''
    robots = RobotsCache(max_capacity)
    return robots.allowed(target_url, 'python program')
Example No. 12
	def robot_rules(_url_scheme, _url_netloc):  # return a robot rules objects
		#_parsed_url = urlparse(_url)
		_domain = urlunparse((_url_scheme, _url_netloc, '', '', '', ''))
		robots = RobotsCache()
		try:
			#print('DOMAIN: {}'.format(_domain))
			rules = robots.fetch(_domain)
		except Exception as exc:
			print('Failed to fetch robots.txt')
			print(_url_scheme, _url_netloc)
			print(exc)
			return None
		return rules
Example No. 13
    def __init__(self, robots_url=None):
        if robots_url:
            robots = RobotsCache()
            self._rules = robots.fetch(robots_url)
            self.is_use_robots = True
        else:
            self.is_use_robots = False

        self._url_norm = UrlNorm()
        self.counter = 0
        self.urls = dict()
        self.connections = defaultdict(set)
        self._lock = RLock()
Example No. 14
    def __init__(self, file, ua, check=True, output="output.csv"):
        if check:  #only setup robot checker if robot checking is enabled
            self.ua = ua  #user agent
            self.robo = RobotsCache(capacity=0)

        #check disables or enables robots.txt checking
        #recommended to keep default True value
        self.check = check
        self.req = requests

        if os.path.exists(file):
            with open(file) as f:
                self.config = json.load(f)  #opens and parses json file
Example No. 15
	def setup_method(self, _):
		"""Configure the app."""
		self.url = "http://aetfiws.ovh"
		self.code1 = test_data.CODE1
		self.code2 = test_data.CODE2
		self.code3 = test_data.CODE3
		self.parser = parsers.ExtractData()
		self.parser_encoding = parsers.ExtractEncoding()
		self.STOPWORDS = {'fr':('mot', 'pour', 'de')}
		self.BADWORDS = {'fr': ('pipe', 'xxx')}
		self.is_title = True
		self.title = 'letter'
		self.headers = {'status': '200 OK', 'content-type': 'text/html; charset=utf-8', 'vary': 'X-PJAX, Accept-Encoding'}
		self.reqrobots = RobotsCache(capacity=100)
Example No. 16
 def allowed(self, url):
     surl = urlparse(url)
     rurl = surl.scheme + '://' + surl.hostname + '/robots.txt'
     if rurl in self.__robot:
         if not self.__robot[rurl].expired:
             return self.__robot[rurl].allowed(url, UA)
     try:
         r = RobotsCache().fetch(rurl)
     except:
         return False
     else:
         self.__robot[rurl] = r
         # add a rule object
         return self.__robot[rurl].allowed(url, UA)
Example No. 17
    def __init__(self, crawler):
        if not crawler.settings.getbool('ROBOTSTXT_OBEY'):
            raise NotConfigured

        self.completeblacklist = crawler.settings.get('ROBOTSTXT_BLACKLIST',
                                                      ())
        self.blacklist = []
        self.generalblacklist = crawler.settings.get('GENERAL_BLACKLIST', ())
        self.hasblacklist = False
        self.whitelist = crawler.settings.get('ROBOTSTXT_WHITELIST', ())
        self.crawler = crawler
        self._useragent = crawler.settings.get('USER_AGENT')
        self._parsers = {}
        self._spider_netlocs = set()
        self.robots = RobotsCache()

        self.stoprepetitionsrearg = re.compile(ur'.*?\&(.*?\&)\1{1,}.*')
        self.stoprepetitionsreslash = re.compile(ur'.*?\/(.*?\/)\1{1,}.*')
Example No. 18
    def __init__(self, base_url, forum_codes, archive_location, user_agent,
                 worker_count):
        archiver_logger.info('Archiver initialized.')
        self.base_url = base_url
        self.archive_base_url = urljoin(self.base_url,
                                        ScraperConfig.ARCHIVE_SUBURL)
        self.forum_codes = forum_codes
        self.archive_location = archive_location
        self.user_agent = user_agent
        self.robot_parser = RobotsCache()
        self.scraper_timer = None
        self.shutdown_event = threading.Event()
        self.delay_time = 1

        self.workers = []
        self.worker_count = worker_count

        self.pages_need_visiting = Queue()
        self.pages_need_analysis_counter = RachetingCounter()
        self.pages_visited_lock = threading.Lock()
        self.pages_visited = []
        self.page_re_filters = []
Example No. 19
dynamodb = boto3.resource('dynamodb')
table = dynamodb.Table("crawl-logs")
crawl_seeds_table = dynamodb.Table("crawl-seeds")


class DecimalEncoder(json.JSONEncoder):
    def default(self, o):
        if isinstance(o, decimal.Decimal):
            if o % 1 > 0:
                return float(o)
            else:
                return int(o)
        return super(DecimalEncoder, self).default(o)


robots = RobotsCache()
user_agent = 'OpenHouseProject.co crawler'
sleep_time = .9
bucket = 'oh-crawl'

expiration_rules = {
    'default': datetime.datetime.now() + datetime.timedelta(days=1),
    'starts_with': {
        'http://www.everyhome.com/Home-For-Sale/':
        datetime.datetime(2099, 1, 1),
        'http://www.everyhome.com/Homes-For-Sale-By-Listing-Date/Listed-on-':
        datetime.datetime(2099, 1, 1)
    }
}
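
A hedged sketch of how the module-level objects above (robots, user_agent, sleep_time) could gate a single fetch; the fetch_page helper and the requests call are illustrative assumptions, not part of the original module:

import time
import requests

def fetch_page(url):
    # Hypothetical helper: consult robots.txt for the configured agent, then throttle.
    if not robots.allowed(url, user_agent):
        return None
    time.sleep(sleep_time)
    return requests.get(url, headers={'User-Agent': user_agent}).text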

Example No. 20
 def __init__(self):
     self.reqrobots = RobotsCache()
     self.parser_encoding = parsers.ExtractEncoding()
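Example No. 21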
        try: fetch url; exception: broken urls

STOPPING CONDITION => after N (e.g. 10) URLs, stop adding URLs into the url dictionary; empty the whole dictionary, put it into the queue and finish that queue
'''

'''
data
'''
import re
import datetime as dt
from bs4 import BeautifulSoup
import urllib2
from urlparse import urlparse
from urlparse import urljoin
from reppy.cache import RobotsCache
robots = RobotsCache()         ## creating object for cache robots.txt


'''
url data
'''

class Url_class:
    def __init__(self, url):
        self.url = url
        self.anchor = []
        self.anchor_win = []
        self.title = ""
        self.urldata = ""
    def add_anchor(self, anchortext, ancwintext):
        self.anchor.append(anchortext)
Example No. 22
    http://qiita.com/rusarusa/items/d7f014ba80d6fe7a3e07
・Download images from the web in bulk with Python
    http://www.dyesac.com/pythonでweb上の画像をまとめてダウンロード/
・Image crawler
    http://qiita.com/komakomako/items/dd380f980e56e70fa321

Targets:
・https://reverb.com/jp/marketplace/electric-guitars
・https://www.yahoo.co.jp
"""

# (1) Decide which URL to crawl
target_url = "https://www.yahoo.co.jp"

# (2) Create the instance used to read robots.txt
robots = RobotsCache(100)

# (3) If robots.txt grants permission to crawl, proceed to the next steps
if robots.allowed(target_url, 'python program'):
    # (4) Create a PhantomJS instance so that even JavaScript-generated content can be crawled
    driver = webdriver.PhantomJS()
    # (5) Pass the target URL to the instance's GET request method to obtain the DOM
    driver.get(target_url)
    # <selenium.webdriver.phantomjs.webdriver.WebDriver (session="b140b9a0-74d3-11e7-b434-8b9f5b309f17")>
    # type(driver)
    # <class 'selenium.webdriver.phantomjs.webdriver.WebDriver'>

    # (6) Encode the DOM obtained above as UTF-8 and keep the target page's content as bytes
    html = driver.page_source.encode('utf-8')
    # type(html)
    # <class 'bytes'>
Example No. 23
#! /usr/bin/env python

from __future__ import print_function

from contextlib import contextmanager
import time

from reppy.cache import RobotsCache
from reppy.parser import Rules

content = '''
User-agent: '*'
Allow: /
'''

cache = RobotsCache()
cache.add(Rules('http://example.com/', 200, content, float('inf')))


@contextmanager
def timer(count):
    '''Time this block.'''
    start = time.time()
    try:
        yield count
    finally:
        duration = time.time() - start
        print('Total: %s' % duration)
        print('  Avg: %s' % (duration / count))
        print(' Rate: %s' % (count / duration))
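
The benchmark loop itself is not shown above; here is a minimal sketch of how the timer and the pre-seeded cache might be combined (the iteration count, URL path and agent string are assumptions, not taken from the original script):

# Hypothetical benchmark loop: time repeated allowed() lookups against the
# cache seeded above; count, path and agent are arbitrary illustrative values.
iterations = 100000
with timer(iterations) as count:
    for _ in range(count):
        cache.allowed('http://example.com/page', 'my-agent')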
Example No. 24
import sqlite3
import urllib
import time
from bs4 import BeautifulSoup
from reppy.cache import RobotsCache
from reppy.robots import Robots

#################################################
default_crawl_delay = 5

# caching robots.txt files for fast access
robots_cache = RobotsCache(capacity=200)

# db commit rate
commit_rate = 1
current_r = 0

#################################################

db_location = 'content.db'
conn = sqlite3.connect(db_location)
cur = conn.cursor()

#################################################
#################################################
# populate url_frontier

url_frontier = set()

cur.execute("SELECT `url_link` FROM `crawled_urls` WHERE `is_scraped` = 0") 
Example No. 25
from reppy.cache import RobotsCache

agent = 'spoderman'
sandcrawler = RobotsCache(timeout=2)

def is_allowed(url):
    try:
        return sandcrawler.allowed(url, agent)
    except:
        return False

def crawl_delay(url):
    try:
        delay = sandcrawler.delay(url, agent)
        print('Crawl delay for', url, delay)
        return delay if delay else 1
    except:
        return 1
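
A short usage sketch tying the two helpers together; the requests call and the example URL are assumptions rather than part of the original module:

# Hypothetical fetch guarded by robots.txt permission and the advertised crawl delay.
import time
import requests

page_url = 'http://example.com/page'
if is_allowed(page_url):
    time.sleep(crawl_delay(page_url))
    html = requests.get(page_url, headers={'User-Agent': agent}).text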

Example No. 26
def robots_parse():
    robots = RobotsCache()
    print robots.allowed("http://www.uky.edu/hr/employment", "my-agent")
Example No. 27
 def setUp(self):
     self.robots = RobotsCache()
Example No. 28
class EZWS:
    robo = RobotsCache(capacity=100, cache_policy=ReraiseExceptionPolicy(0))
    data: List[str] = []
    """
	SELF:

	config json config file
	ua     user agent
	robo   robotcache obj
	soup   current html page soup obj
	raw    raw html from req.get()
	check  check for robot files, keep true
	output name of output csv file
	"""
    def __init__(self,
                 file: Union[str, Dict],
                 ua: str = "",
                 check: bool = True,
                 output: str = "output.csv") -> None:
        self.ua = ua

        self.check = check

        #setting output to false disables file output
        self.output = output

        self.configarr = _listify(file)

    def allowed(self, url: str) -> bool:
        if not self.check:
            return True

        try:
            if self.robo.allowed(url, self.ua):
                return True
            print(url, "is not allowed")

        except ConnectionException:
            print(url, "seems to be down")

        return False

    def download(self, url: str) -> Optional[Any]:
        if not self.allowed(url):
            return None

        self.raw = requests.get(url).content

        return BeautifulSoup(self.raw, "html.parser")

    def xpath(self, html: str, xp: str) -> List[Any]:
        return cast(List[Any], lxmlhtml.fromstring(html).xpath(xp))

    def select(self, html: Any, json: Dict) -> List[str]:
        xpath = json.get("xpath", "")
        css = json.get("css", "")

        if xpath:
            found = self.xpath(html.getText(), xpath)

            return [found[0]] if self.config["header"] else found

        #assume css was passed
        found = html.select(css)
        if self.config["header"]:
            found = [found[0]]

        completed = []
        for item in found:
            output = []

            contents = _listify(json["contents"])

            for content in contents:
                if content and item.has_attr(content):
                    output.append(item[content])

                else:
                    output.append(item.text)

            completed += output

        return completed

    def clear(self) -> None:
        self.data = []

    def load(self, index: int) -> None:
        config = self.configarr[index]

        if isinstance(config, Dict):
            self.config = config

        else:
            if os.path.exists(config):
                with open(config) as f:
                    self.config = json.load(f)

        return None

    def grab(self, index: Optional[int] = None) -> None:
        if index is None:
            #using grab() with no params will grab all configs passed
            for i in range(len(self.configarr)):
                self.grab(i)

            return None

        self.load(index)
        if self.output:
            sc = simplecsv(self.output, mode="w+")
            if self.config["header"]:
                sc.writerow(self.config["header"])

        for json in self.config["links"]:
            for link in chain(
                    *[explode(link) for link in _listify(json["urls"])]):
                if not self.allowed(link):
                    return None

                soup = self.download(link)
                if not soup:
                    print("could not download file")
                    return None

                for divs in soup.select(json["container"]):
                    data = []
                    for grab in json["grab"]:
                        data += self.select(divs, grab)

                    self.data += data
                    if self.output:
                        sc.writerow(data)

        if self.output:
            sc.close()
Example No. 29
 def _set_robot_rule(self):
     """
     Set the robots.txt rules
     """
     self.rules = RobotsCache().fetch(self.url)
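Example No. 30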
scrape_path = "http://qiita.com/hmatsu47/items/"
# URL substrings excluded from crawling
exclude_str_list = [
    "feed", "rss", "archive", "about", "revision", "like", "follow",
    "contribution", "comment", "reference", ".md"
]
# URLs already crawled
scrape_url_list = []
# Extracted body text
summary_ap_text = []
# Maximum number of pages to crawl
crawl_limit = 100
# Maximum number of pages to extract body text from
item_limit = 50
# For robots.txt checks
robots_cache = RobotsCache(capacity=crawl_limit)
# Watson credentials
apikey = '[API key]'
url = '[API URL]'


# Check that the URL does not contain any excluded strings
def is_crawlable_url(url):
    for es in exclude_str_list:
        if url.find(es) != -1:
            break
    else:
        robots_flag = robots_cache.allowed(domain, "*")
        return (robots_flag)
    return False