from reppy.robots import Robots


def test_cocrawler_reppy():
    r1 = Robots.parse('http://example.com/robots.txt', '''
User-Agent: foo
Allow: /

# comment
Disallow: /
Disallow: /disallowed
''')
    r2 = Robots.parse('http://example.com/robots.txt', '''
User-Agent: foo
Allow: /

Disallow: /
Disallow: /disallowed
''')
    r3 = Robots.parse('', '''
User-Agent: foo
Allow: /

Disallow: /
Disallow: /disallowed
''')

    # despite the blank line or comment, 'foo' is disallowed from /disallowed
    assert r1.allowed('/', 'foo') is True
    assert r1.allowed('/disallowed', 'foo') is False
    assert r2.allowed('/', 'foo') is True
    assert r2.allowed('/disallowed', 'foo') is False
    assert r3.allowed('/', 'foo') is True
    assert r3.allowed('/disallowed', 'foo') is False

    # a blank line does not reset the user-agent to *, so 'bar' has no rules
    assert r1.allowed('/', 'bar') is True
    assert r1.allowed('/disallowed', 'bar') is True
    assert r2.allowed('/', 'bar') is True
    assert r2.allowed('/disallowed', 'bar') is True
    assert r3.allowed('/', 'bar') is True
    assert r3.allowed('/disallowed', 'bar') is True

    # no substring weirdness, so 'foobar' does not match the 'foo' rules
    assert r1.allowed('/', 'foobar') is True
    assert r1.allowed('/disallowed', 'foobar') is True
    assert r2.allowed('/', 'foobar') is True
    assert r2.allowed('/disallowed', 'foobar') is True
    assert r3.allowed('/', 'foobar') is True
    assert r3.allowed('/disallowed', 'foobar') is True

def test_cocrawler_reppy_xfail():
    r4 = Robots.parse('', '''
User-agent: *
Disallow: //
''')

    # ibm.com, I'm looking at you
    assert r4.allowed('/foo', '') is True
    assert r4.allowed('/', '') is True

import urllib.parse

from reppy.robots import Robots


def getRobotParser(loader, startUrl):
    robotUrl = urllib.parse.urljoin(startUrl, "/robots.txt")
    page = loader.get(robotUrl, allow_redirects=True)

    if page is None:
        print("Could not read ROBOTS.TXT at: " + robotUrl)
        return None
    #end if

    rp = Robots.parse(robotUrl, page)
    print("Found ROBOTS.TXT at: " + robotUrl)
    return rp

def getRobotParser(startUrl):
    robotUrl = urllib.parse.urljoin(startUrl, "/robots.txt")
    page, _, _ = getPage(robotUrl)

    if page is None:
        print("Could not read ROBOTS.TXT at: " + robotUrl)
        return None
    #end if

    rp = Robots.parse(robotUrl, page)
    print("Found ROBOTS.TXT at: " + robotUrl)
    # NOTE: rp is discarded; as written, this function always returns None.
    # return rp
    return None

def __init__(self, robotstxt_body, spider):
    from reppy.robots import Robots
    self.spider = spider
    self.rp = Robots.parse('', robotstxt_body)

def parse(self, content, name):
    '''Parse the robots.txt in content and return the agent of the provided name.'''
    return Robots.parse('http://example.com', content).agent(name)

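# A minimal usage sketch (not taken from any of the sources above) of reppy's
# agent-based API, which the parse() helper above wraps. The robots.txt body
# and the 'my-bot' agent name are made up for illustration.
from reppy.robots import Robots

agent = Robots.parse('http://example.com/robots.txt', '''
User-agent: my-bot
Disallow: /private
''').agent('my-bot')

assert agent.allowed('/public/page.html')       # no rule matches, so allowed
assert not agent.allowed('/private/page.html')  # matches Disallow: /private
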
def benchmark_reppy_parser(website):
    from reppy.robots import Robots
    rp = Robots.parse('', website['robotstxt'])
    for link in website['links']:
        rp.allowed(link, 'googlebot')

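# Example call (hypothetical data): the 'robotstxt' and 'links' keys are
# inferred from the function body above, and the values are made up.
benchmark_reppy_parser({
    'robotstxt': 'User-agent: *\nDisallow: /private\n',
    'links': ['/index.html', '/private/admin', '/about'],
})
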
def parse_robots_txt(self, link_list):
    host, port = self.config.cache_server
    robotsURL = ''
    robots = None
    links = []

    for link_url in link_list:
        parsed_link = parse.urlparse(link_url)
        link_base = '{0.scheme}://{0.netloc}/'.format(parsed_link)

        if robots == None or link_base not in robotsURL:
            if 'today.uci.edu' in link_base:
                robots = Robots.parse('https://today.uci.edu/department/information_computer_sciences/robots.txt', '''
User-agent: *
Disallow: /*/calendar/*?*types*
Disallow: /*/browse*?*types*
Disallow: /*/calendar/200*
Disallow: /*/calendar/2015*
Disallow: /*/calendar/2016*
Disallow: /*/calendar/2017*
Disallow: /*/calendar/2018*
Disallow: /*/calendar/2019*
Disallow: /*/calendar/202*
Disallow: /*/calendar/week
Disallow: /*/search
Disallow: /*?utm
Allow: /
Allow: /*/search/events.ics
Allow: /*/search/events.xml
Allow: /*/calendar/ics
Allow: /*/calendar/xml
''')
            else:
                robotsURL = link_base + 'robots.txt'
                time.sleep(0.5)
                # get the robots.txt file
                try:
                    robots = Robots.fetch(f"http://{host}:{port}/",
                                          params=[("q", f"{robotsURL}"), ("u", f"{self.config.user_agent}")],
                                          timeout=20)
                except Exception as e:
                    print(e)
                    robots = None

                # WARNING: UNCOMMENTING BYPASSES CACHE
                # if the robots is empty, get the robots.txt from actual server
                # robots_str = str(robots)
                # robots_str = robots_str.split(': ')[1].split('}')[0]
                # if robots_str == '[]':
                #     robots = Robots.fetch(robotsURL, timeout=20)
                #     print(robots)

        if robots == None:
            links.append(link_url)
            continue

        if parsed_link.params == '':
            if parsed_link.query == '':
                query_only = '{0.path}/'.format(parsed_link)
            else:
                query_only = '{0.path}/?{0.query}'.format(parsed_link)
        else:
            if parsed_link.query == '':
                query_only = '{0.path}/{0.params}/'.format(parsed_link)
            else:
                query_only = '{0.path}/{0.params}/?{0.query}'.format(parsed_link)

        if robots.allowed(query_only, self.config.user_agent):
            links.append(link_url)

    return links

Allow: /serv
Allow: /~mak
Disallow: /
'''


@contextmanager
def timer(name, count):
    '''Time this block.'''
    start = time.time()
    try:
        yield count
    finally:
        duration = time.time() - start
        print(name)
        print('=' * 10)
        print('Total: %s' % duration)
        print('  Avg: %s' % (duration / count))
        print(' Rate: %s' % (count / duration))
        print('')


with timer('Parse', 100000) as count:
    for _ in xrange(count):
        Robots.parse('http://example.com/robots.txt', content)

parsed = Robots.parse('http://example.com/robots.txt', content)
with timer('Evaluate', 100000) as count:
    for _ in xrange(count):
        parsed.allowed('/org/example.html', 'other-bot')