    def filter(self, ua):
        """Remove all of the urls in URLS that UA is not allowed to crawl,
           and fill in the .crawl_delay and .robots_url properties."""

        rules = None
        for url in sorted(self.urls):
            robots_url = Robots.robots_url(url)
            if self.robots_url != robots_url:
                if self.robots_url is None:
                    try:
                        rules = Robots.fetch(robots_url, headers={
                            'User-Agent': ua
                        }).agent(ua)
                    except Exception as e:
                        sys.stderr.write(
                            "warning: failed to fetch and parse {}: {}\n"
                            .format(robots_url, e))
                        rules = DummyAgent()

                    self.robots_url = robots_url
                    self.crawl_delay = rules.delay or 1

                else:
                    raise ValueError(
                        "robots.txt for {} is {}, not {}"
                        .format(url, robots_url, self.robots_url))

            if not rules.allowed(url):
                self.urls.remove(url)


def filter_urls(urls, ua):
    """Partition URLS (an iterable) into sites, and then filter out all of
    the urls in each site that UA is not allowed to crawl.  Returns a list
    of Site objects."""

    sites = defaultdict(Site)
    for url in urls:
        url = canon_url_syntax(url)
        robots_url = Robots.robots_url(url)
        sites[robots_url].add(url)

    for site in sites.values():
        site.filter(ua)
    return sorted(sites.values(), key=lambda s: s.robots_url)
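
For reference, a minimal self-contained sketch of the reppy calls these two functions build on (the URL and user agent below are placeholders; Site, canon_url_syntax and DummyAgent belong to the surrounding module and are not shown here):

from reppy.robots import Robots

url = 'https://example.com/some/page'                # placeholder URL
ua = 'my-crawler'                                    # placeholder user agent
robots_url = Robots.robots_url(url)                  # e.g. 'https://example.com/robots.txt'
rules = Robots.fetch(robots_url, headers={'User-Agent': ua}).agent(ua)
print(rules.allowed(url))                            # may this agent crawl the URL?
print(rules.delay or 1)                              # per-agent crawl delay, defaulting to 1s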
Example #3
def parse_webpages(webpages):
    for page in webpages:
        # obtain the robots.txt url
        r = Robots.robots_url(page)
        robots = Robots.fetch(r)
        if (robots.allowed(page, '*')):
            # sitemaps is a list of all the sitemaps for a website
            sitemaps = robots.sitemaps
            sitemaps_list = list(sitemaps)
            html = requests.get(page)  # html of the webpage
            soup = bs4.BeautifulSoup(html.text, "html.parser")
            outlinks = soup.find_all("a")  # all the outlinks
            links = [str(i.get('href')) for i in outlinks]
            outlinks = [str(i) for i in outlinks]
            docs = []  # the documents on the page

            for file in links:
                directory = page.rsplit("/", 1)[0]
                link = directory + '/' + file

                # can be expanded to other file types with a comma
                if file.endswith(('txt', 'md')):
                    if file.startswith(('http://', 'www.')):
                        text = bs4.BeautifulSoup(
                            requests.get(file).text, "html.parser")
                        ext = file.rsplit(".", 1)[-1]
                        text = [file, ext, text]
                        # text = {'link': link, 'ext': ext, 'text': text}
                        docs.append(text)
                    else:
                        text = bs4.BeautifulSoup(
                            requests.get(link).text, "html.parser")
                        ext = link.rsplit(".", 1)[-1]
                        text = [link, ext, text]
                        # text = {'link': link, 'ext': ext, 'text': text}
                        docs.append(text)
                elif file.endswith(('pdf')):  # special case if PDF
                    x = file
                    try:
                        if file.startswith(('http://', 'www.')):
                            pdf = file.rsplit("/", 1)[-1]
                            response = urlopen(file)
                        else:
                            pdf = file.rsplit("/", 1)[-1]
                            # must first check if pdf is found
                            response = urlopen(link)

                    except urllib.error.HTTPError as e:
                        # if 404 error, put 404 as text
                        text = [link, "pdf", "404"]
                        # text = {'link': link, 'ext': 'pdf', 'text': "404"}
                        docs.append(text)

                    else:
                        # otherwise must save the pdf to run pypdf2
                        file = open(pdf, 'wb')
                        file.write(response.read())
                        file.close()
                        if x.startswith('http://'):
                            link = x
                        txt = ""
                        file = open(pdf, 'rb')
                        parser = PDFParser(file)
                        document = PDFDocument(parser)
                        rsrcmgr = PDFResourceManager()
                        laparams = LAParams()
                        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                        interpreter = PDFPageInterpreter(rsrcmgr, device)
                        for p in PDFPage.create_pages(document):
                            # The interpreter processes each page stored in the PDFDocument
                            interpreter.process_page(p)
                            # The device renders the layout produced by the interpreter
                            layout = device.get_result()
                            # Out of the many LT objects within layout, we are interested in LTTextBox and LTTextLine
                            for lt_obj in layout:
                                if isinstance(lt_obj, LTTextBox) or isinstance(
                                        lt_obj, LTTextLine):
                                    txt += lt_obj.get_text()

                        # close the pdf file
                        file.close()
                        name = [link, "pdf", txt]
                        # name = {'link': link, 'ext': 'pdf', 'text': txt}
                        os.remove(pdf)  # remove the saved file when done
                        docs.append(name)

            docs = [[str(i) for i in lis] for lis in docs]
            timestamp = datetime.datetime.now().isoformat()
            output = {
                'url': page,
                'timestamp': timestamp,
                'outlinks': outlinks,
                'html': html.text,
                'docs': docs,
                'sitemaps': sitemaps_list
            }

            with Crawling_L_REST.app.app_context():
                Crawling_L_REST.add_webpage(output)

            return output
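
Stripped of the scraping and PDF handling, the robots.txt logic in this example reduces to a few reppy calls; a minimal sketch (the page URL is a placeholder):

from reppy.robots import Robots

page = 'https://example.com/index.html'       # placeholder
robots = Robots.fetch(Robots.robots_url(page))
if robots.allowed(page, '*'):
    sitemaps_list = list(robots.sitemaps)     # sitemap URLs declared in robots.txt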
Example #4
    def robots(self, path):
        self.ro = Robots.fetch(self.url + path)
        return self.ro
Example #5
    def __init__(self, seed_url, user_agent):
        self.seed_url = seed_url
        self.user_agent = user_agent
        self.robots_url = Robots.robots_url(seed_url)
        self.robots = Robots.fetch(self.robots_url)
        self.accepted_header_content_type = "text/html"
Example #6
 def __init__(self, robotstxt_body, spider):
     from reppy.robots import Robots
     self.spider = spider
     self.rp = Robots.parse('', robotstxt_body)
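
Robots.parse builds the same rules object from an in-memory robots.txt body instead of fetching it over HTTP; a minimal sketch (the body and spider name below are invented):

from reppy.robots import Robots

body = 'User-agent: *\nDisallow: /private/\n'
rp = Robots.parse('', body)                        # empty base URL, as in the snippet above
print(rp.allowed('/index.html', 'my-spider'))      # True
print(rp.allowed('/private/page', 'my-spider'))    # False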
Example #7
from threading import Thread, Lock
from requests_html import HTMLSession
from time import sleep
from reppy.robots import Robots
from time import strftime, gmtime, time

domain = input('Enter domain: ')
domain_link = f'https://{domain}'

new_urls = {domain_link}
scaned_urls = set()
locker = Lock()

robots = Robots.fetch(domain_link)
agent = 'Googlebot'

timestamp = strftime("__%d_%b_%H_%M_%S", gmtime())
file = open(f'{domain}_{timestamp}.txt', 'a', encoding='utf-8')


def worker():
    with HTMLSession() as session:
        while True:
            if len(new_urls) == 0:
                sleep(10)
                if len(new_urls) == 0:
                    break
            try:
                url = new_urls.pop()
                response = session.get(url, timeout=1)
                url_links = response.html.absolute_links
Example #8
def reppy_robot(url):
    robot_url = urljoin(get_domain_name(url), "robots.txt")
    rp = Robots.fetch(robot_url)
    #print(rp.allowed(href, '*'))
    yield rp.allowed(url, '*')
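
Since reppy_robot is a generator, a caller pulls its single yielded value with next(); a usage sketch (get_domain_name comes from the original module and is not shown here):

url = 'https://example.com/some/page'      # placeholder
is_allowed = next(reppy_robot(url))        # True if the wildcard agent may crawl it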
Example #9
Allow: /serv
Allow: /~mak
Disallow: /
'''


@contextmanager
def timer(name, count):
    '''Time this block.'''
    start = time.time()
    try:
        yield count
    finally:
        duration = time.time() - start
        print(name)
        print('=' * 10)
        print('Total: %s' % duration)
        print('  Avg: %s' % (duration / count))
        print(' Rate: %s' % (count / duration))
        print('')


with timer('Parse', 100000) as count:
    for _ in range(count):
        Robots.parse('http://example.com/robots.txt', content)

parsed = Robots.parse('http://example.com/robots.txt', content)
with timer('Evaluate', 100000) as count:
    for _ in range(count):
        parsed.allowed('/org/example.html', 'other-bot')
Example #10

# In[28]:

options = webdriver.ChromeOptions()
options.add_argument("headless")
driver = webdriver.Chrome(
    "C:\\Users\\shree\\Downloads\\Softwares\\chromedriver_win32\\chromedriver.exe",
    options=options)
q = queue.Queue()
urlsAlreadyVisited = set()
seed = "https://www.foodrepublic.com/recipes"
q.put(seed)
print(urldefrag(seed)[0])
urlsAlreadyVisited.add(urldefrag(seed)[0])
robots = Robots.fetch('http://www.foodrepublic.com/robots.txt')
agent = robots.agent('User-agent')

# In[1]:

while True:
    url = q.get()
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    #     saveContentInHeirarchy(str(soup.extract()),url)
    links = soup.find_all('a')
    for link in links:
        u = link.get('href')
        if not is_absolute(u):
            u = urljoin(url, u)
        if "foodrepublic.com" in u and "@foodrepublic.com" not in u:
Example #11
import random
from time import sleep
from threading import Lock
from queue import Queue
from requests_html import HTMLSession
from concurrent.futures import ThreadPoolExecutor
from reppy.robots import Robots

# This utility uses `requests` to fetch the content

DOMAIN = 'goodreads.com'

scaned_urls = set()
locker = Lock()

robots = Robots.fetch(f'https://www.{DOMAIN}/robots.txt')


def worker(queue):
	session = HTMLSession()
	while True:
		if queue.qsize() == 0:
			sleep(30)
			if queue.qsize() == 0:
				break
		try:
			url = queue.get()
			print('Send request to', url)
			resp = session.get(url)
			title = resp.html.xpath('//title/text()')[0].strip()
			print(title)
Example #13
    def __extract_info(self, url):

        self.__print_debug('crawling page', url)

        parsed_url = urlparse(url)
        if parsed_url.netloc == self.__initial_domain_name:
            if not self.__rp.allowed(url, self.__user_agent):
                self.__print_debug('disallowed by user agent')
                return None
        else:
            current_robot = Robots.fetch(Robots.robots_url(url))
            if not current_robot.allowed(url, self.__user_agent):
                self.__print_debug('disallowed by user agent')
                return None

        content, is_html, language = self.__crawl_page(url)
        if content is None:
            return None

        path = urlparse(url).path.replace('/', '_')
        if path is None or path == '':
            path = '__index__'

        if self.__storage:
            self.__set_up_folders(parsed_url.netloc)
            fsource = open(
                self.__PATH_SOURCE + parsed_url.netloc + '/' + path + '.html',
                'wb')
            fsource.write(content)
            fsource.close()

        if not is_html:
            self.__pages.append({
                'content': content,
                'language': language,
                'url': url,
                'html': content
            })
            return content
        soup = BeautifulSoup(content, 'html.parser')

        for link in soup.find_all('a'):
            href = link.get('href')
            if href is None or '#' in href:
                continue
            if href.startswith('http'):
                self.__add_url(href)
                continue

            if href.startswith('mailto'):
                continue

            new_url = str(urljoin(url, href))
            self.__add_url(new_url)
        texts = soup.findAll(text=True)
        visible_texts = filter(self.__tag_visible, texts)

        visible_texts = ' '.join(t.strip() for t in visible_texts
                                 if t.strip() != '')

        if self.__storage:
            fout = open(
                self.__PATH_INFO + parsed_url.netloc + '/' + path + '.json',
                'w')
            fout.write(
                json.dumps({
                    'url': url,
                    'domain_name': parsed_url.netloc,
                    'html': content.decode('utf-8'),
                    'language': language,
                    'content': visible_texts,
                    'meta': self.__meta,
                }))
            fout.close()

        self.__pages.append({
            'content': visible_texts,
            'language': language,
            'url': url,
            'html': content
        })
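
This method fetches robots.txt again for every off-domain URL it sees; a common refinement is to cache one parsed Robots object per host. A small sketch of that idea (not part of the original class):

from urllib.parse import urlparse
from reppy.robots import Robots

_robots_cache = {}

def allowed_cached(url, user_agent):
    # Fetch and parse robots.txt once per host, then reuse it.
    host = urlparse(url).netloc
    if host not in _robots_cache:
        _robots_cache[host] = Robots.fetch(Robots.robots_url(url))
    return _robots_cache[host].allowed(url, user_agent)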
Example #14
domain = input('Enter the domain to crawl: ')
first_link = f'http://{domain}/'

prepared_response = session.get(first_link, proxies={})
first_link = prepared_response.url
domain = first_link.split('/')[2]

robots_link = f'https://{domain}/robots.txt'

crawled_links = set()

links_to_crawl = set()
links_to_crawl.add(first_link)

robots = Robots.fetch(robots_link)

file_results = open('checking_results.txt', 'w', encoding='utf-8')

while True:

    if len(links_to_crawl) == 0:
        break
    url = links_to_crawl.pop()

    try:
        proxies = get_random_proxy()

        t1 = time()
        response = session.get(url, proxies=proxies, timeout=8)
        t2 = time()
Example #15
    def is_robot_valid(self, url):
        robot_url = urljoin(get_domain_name(url), "robots.txt")
        rp = Robots.fetch(robot_url)
        yield rp.allowed(url, self.hdr['User-Agent'])
Example #16
    def parse(self, content, name):
        '''Parse the robots.txt in content and return the agent of the provided name.'''
        return Robots.parse('http://example.com', content).agent(name)
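
A usage sketch for this helper (the robots.txt body and agent name are invented, and parser stands for an instance of the surrounding class):

agent = parser.parse('User-agent: my-bot\nDisallow: /private/\n', 'my-bot')
print(agent.allowed('/index.html'))       # True
print(agent.allowed('/private/page'))     # False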
Example #17
def crawl_page(url):
    if not check_url(url):
        return

    try:
        content = requests.get(url).text
    except requests.exceptions.ConnectionError:
        return

    soup = BeautifulSoup(content, "html.parser")

    print("Start URL: ", url)
    site = get_site_information(url)
    rows = dbm.get_all_rows("sites")
    for i in range(100):
        try:
            if is_existing(site.link, rows):
                dbm.update_column(site)
                break
            else:
                dbm.insert_into_sites(site)
                break
        except:
            pass

    links = soup.find_all('a')

    index = 0
    for link in links:
        links[index] = link.get('href')
        index += 1

    links = list(filter(None, links))  # remove empty strings

    links = list(set(links))  # removing duplicates
    filtered_links = list()
    for link in links:
        if url_pattern.match(link):
            filtered_links.append(link)
            links.remove(link)
        elif sub_url_pattern.match(link):
            if str(link).startswith("/") and url.endswith("/"):
                filtered_links.append(get_url(get_domain(url[:-1])) + link)
            else:
                filtered_links.append(get_url(get_domain(url)) + link)

            links.remove(link)

    index = 0
    for link in filtered_links:
        if "?" in link:  # Found get parameter
            filtered_links[index] = filtered_links[index][:filtered_links[
                index].find("?")]  # Remove get parameters

        if "#" in link:
            filtered_links[index] = filtered_links[
                index][:filtered_links[index].find("#")]

        if "%" in link:
            filtered_links[index] = filtered_links[
                index][:filtered_links[index].find("%")]

        index += 1

    filtered_links.append(url)

    print("Links found: ", len(filtered_links))
    for link in filtered_links:
        if not check_url(link):
            filtered_links.remove(link)

    sites = list()

    for link in filtered_links:
        if link in get_url(get_domain(link)):
            pass

        elif link not in url:
            domain = get_domain(link)
            robots_url = get_url_to_robots(get_url(domain))
            try:
                robots = Robots.fetch(robots_url)
                if not robots.allowed(link, user_agent):
                    if link.endswith("/"):
                        if robots.allowed(link[:-1], user_agent):
                            pass
                        else:
                            filtered_links.remove(link)
                    else:
                        filtered_links.remove(link)
                else:
                    print("link: ", link)
            except:
                filtered_links.remove(link)

    for link in filtered_links:
        if get_url(get_domain(link)) + str("/sitemap") in link:
            print(1)
            continue

        sites.append(get_site_information(link))
        filtered_links.remove(link)

    # Adding all sites to the db in a separate thread
    threading.Thread(target=insert_into_db, args=(sites, )).start()
    print("Links found: ", len(sites))

    if len(sites) > 0:
        with stopit.ThreadingTimeout(100) as to_ctx_mgr:
            assert to_ctx_mgr.state == to_ctx_mgr.EXECUTING
            choice = random.choice(sites).link
            if not choice == url:
                crawl_page(choice)
Example #18
from requests_html import HTMLSession
from reppy.robots import Robots

print('Start working')

domain = input('Enter domain name: ')
home_url = f'http://{domain}/'
robots_url = f'http://{domain}/robots.txt'

robots = Robots.fetch(robots_url)

links_to_scan = set()
links_to_scan.add(home_url)

scaned_links = set()

session = HTMLSession()

result_file = open('results.csv', 'w')
result_file.write(f'Is Duplicate\tURL\tTitle\tDescription\tH1\tCanonical\n')

all_titles = set()


def make(s):
    try:
        s = s[0].strip()
    except IndexError:
        s = ''
    return s
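
make() just guards against empty XPath results; usage would look something like this (hypothetical: resp would be a requests_html response from the crawl loop that is cut off above):

title = make(resp.html.xpath('//title/text()'))    # '' if the page has no <title>
h1 = make(resp.html.xpath('//h1/text()'))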
Example #19
def main():
    downloader = None

    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--insecure", help="use HTTP instead of HTTPS", action="store_true")
    parser.add_argument("-e", "--export", help="export immediately without downloading (Only useful if you already downloaded something to the .pickle file)", action="store_true")
    parser.add_argument('-E', '--Exchange', help='Only export ticker symbols from this exchange (the filtering is done during the export phase)')
    parser.add_argument('type', nargs='?', default='generic', help='The type to download, this can be: '+" ".join(list(options.keys())))
    parser.add_argument("-s", "--sleep", help="The time to sleep in seconds between requests", type=float, default=0)
    parser.add_argument("-p", "--pandantic", help="Stop and warn the user if some rare assertion fails", action="store_true")

    args = parser.parse_args()

    protocol = 'http' if args.insecure else 'https'
    if args.insecure:
        print("Using insecure connection")

    if args.export:
        print("Exporting pickle file")

    tickerType = args.type = args.type.lower()

    print("Checking if we can resume a old download session")
    try:
        downloader = loadDownloader(tickerType)
        print("Downloader found on disk, resuming")
    except:
        print("No old downloader found on disk")
        print("Starting a new session")
        if tickerType not in options:
            print("Error: " + tickerType + " is not a valid type option. See --help")
            exit(1)
        else:
            downloader = options[tickerType]

    robotsUrl = protocol + '://finance.yahoo.com/robots.txt'
    robots = Robots.fetch(robotsUrl)
    try:
        if not args.export:
            
            if(not robots.allowed(protocol + '://finance.yahoo.com/_finance_doubledown/api/resource/searchassist', user_agent)):
                print('Execution of script halted due to ' + robotsUrl)
                return 1
            
            if not downloader.isDone():
                print("Downloading " + downloader.type)
                print("")
                downloadEverything(downloader, tickerType, args.insecure, args.sleep, args.pandantic)
                print ("Saving downloader to disk...")
                saveDownloader(downloader, tickerType)
                print ("Downloader successfully saved.")
                print ("")
            else:
                print("The downloader has already finished downloading everything")
                print("")

    except Exception as ex:
        print("A exception occurred while downloading. Suspending downloader to disk")
        saveDownloader(downloader, tickerType)
        print("Successfully saved download state")
        print("Try removing {type}.pickle file if this error persists")
        print("Issues can be reported on https://github.com/Benny-/Yahoo-ticker-symbol-downloader/issues")
        print("")
        raise
    except KeyboardInterrupt as ex:
        print("Suspending downloader to disk as .pickle file")
        saveDownloader(downloader, tickerType)

    if downloader.isDone() or args.export:
        print("Exporting "+downloader.type+" symbols")

        data = tablib.Dataset()
        data.headers = downloader.getRowHeader()

        for symbol in downloader.getCollectedSymbols():
            if(args.Exchange == None):
                data.append(symbol.getRow())
            elif (symbol.exchange == args.Exchange):
                data.append(symbol.getRow())

        with io.open(downloader.type + '.csv', 'w', encoding='utf-8') as f:
            f.write(text.join(u',', data.headers) + '\n')
            writer = csv.writer(f)
            for i in range(0, len(data)):
                row = [text(y) if not y is None else u"" for y in data[i]]
                writer.writerow(row)

        try:
            with open(downloader.type + '.xlsx', 'wb') as f:
                f.write(data.xlsx)
        except:
            print("Could not export .xlsx due to a internal error")

        try:
            with open(downloader.type + '.json', 'wb') as f:
                f.write(data.json.encode('UTF-8'))
        except:
            print("Could not export .json due to a internal error")

        try:
            with open(downloader.type + '.yaml', 'wb') as f:
                f.write(data.yaml.encode('UTF-8'))
        except:
            print("Could not export .yaml due to a internal error")
Example #20
from reppy.robots import Robots

#grab robots url
url = Robots.robots_url('https://science.rpi.edu/computer-science')

if 'http' in url:
    #print(url)
    robots = Robots.fetch(url)
    #print(robots)
    print(robots.allowed('https://science.rpi.edu/computer-science/', 'agent'))
    print(robots.allowed('https://science.rpi.edu/admin/', 'agent'))
def check_allow(urls):
    robots = Robots.fetch(f'{start_url}/robots.txt')
    return {url for url in urls if robots.allowed(url, 'Googlebot') == True}
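
A usage sketch (start_url is a module-level variable in the original script; the value below is a placeholder):

start_url = 'https://example.com'
candidates = {f'{start_url}/page.html', f'{start_url}/admin/login'}
print(check_allow(candidates))    # the subset of candidates that Googlebot may crawl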
Example #23
    def parse_robots_txt(self, link_list):
        host, port = self.config.cache_server
        robotsURL = ''
        robots = None
        links = []
        for link_url in link_list:
            parsed_link = parse.urlparse(link_url)
            link_base = '{0.scheme}://{0.netloc}/'.format(parsed_link)
            if robots == None or link_base not in robotsURL:
                if 'today.uci.edu' in link_base:
                    robots = Robots.parse('https://today.uci.edu/department/information_computer_sciences/robots.txt', '''
                    User-agent: *
                    Disallow: /*/calendar/*?*types*
                    Disallow: /*/browse*?*types*
                    Disallow: /*/calendar/200*
                    Disallow: /*/calendar/2015*
                    Disallow: /*/calendar/2016*
                    Disallow: /*/calendar/2017*
                    Disallow: /*/calendar/2018*
                    Disallow: /*/calendar/2019*
                    Disallow: /*/calendar/202*
                    Disallow: /*/calendar/week
                    
                    Disallow: /*/search
                    Disallow: /*?utm
                    
                    Allow: /
                    Allow: /*/search/events.ics
                    Allow: /*/search/events.xml
                    Allow: /*/calendar/ics
                    Allow: /*/calendar/xml
                    ''')
                else:
                    robotsURL = link_base + 'robots.txt'
                    time.sleep(0.5)
                    # get the robots.txt file
                    try:
                        robots = Robots.fetch(f"http://{host}:{port}/", params=[("q", f"{robotsURL}"), ("u", f"{self.config.user_agent}")], timeout=20)
                    except Exception as e:
                        print(e)
                        robots = None

                    # WARNING: UNCOMMENTING BYPASSES CACHE

                    # if the robots is empty, get the robots.txt from actual server
                    # robots_str = str(robots)
                    # robots_str = robots_str.split(': ')[1].split('}')[0]
                    # if robots_str == '[]':
                    #     robots = Robots.fetch(robotsURL, timeout=20)
                    #     print(robots)
            if robots == None:
                links.append(link_url)
                continue
            if parsed_link.params == '':
                if parsed_link.query == '':
                    query_only = '{0.path}/'.format(parsed_link)
                else:
                    query_only = '{0.path}/?{0.query}'.format(parsed_link)
            else:
                if parsed_link.query == '':
                    query_only = '{0.path}/{0.params}/'.format(parsed_link)
                else:
                    query_only = '{0.path}/{0.params}/?{0.query}'.format(parsed_link)
            if robots.allowed(query_only, self.config.user_agent):
                links.append(link_url)
        return links
Example #24
from reppy.robots import Robots

url = "http://www.amazon.com"
robots = Robots.fetch(url + "/robots.txt")
paths = [
    '/', '/gp/dmusic/', '/gp/dmusic/promotions/PrimeMusic/',
    '/gp/registry/wishlist/'
]

for path in paths:
    print("{0}: {1}".format(robots.allowed(path, '*'), url + path))
Example #25
from reppy.robots import Robots

#%%
robots = Robots.fetch('https://allabout.co.jp/robots.txt')
agent = robots.agent('*')

# Whether access to this URL is allowed
agent.allowed('https://allabout.co.jp/r_finance/')
#agent.allowed('https://allabout.co.jp/ranking/daily/')

#%% Get the crawl delay
robots = Robots.fetch('https://allabout.co.jp/robots.txt')
agent = robots.agent('bingbot')
agent.delay
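
agent.delay returns the Crawl-delay for the chosen user agent (None if robots.txt does not set one); a polite fetch might use it like this, falling back to 1 second as the filter() example above does:

import time
import requests

delay = agent.delay or 1                     # assumed 1-second fallback
url = 'https://allabout.co.jp/r_finance/'
if agent.allowed(url):
    requests.get(url)
    time.sleep(delay)                        # wait before the next request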