Example #1
    def proxy_address(self, flow: http.HTTPFlow) -> typing.Tuple[str, int]:
        # Check if the URL is known to the CDX server
        playback = False

        # Use the canonicalised URL
        r_url = str(urlcanon.whatwg(urlcanon.parse_url(flow.request.url)))

        # Query the CDX service for this URL:
        ctx.log.info("checking %s..." % r_url)
        r = self.s.get('http://cdxserver:8080/fc',
                       params={
                           'url': r_url,
                           'sort': 'reverse',
                           'limit': 10
                       })

        # Loop through response CDX lines:
        for cdxline in r.iter_lines(decode_unicode=True):
            cdx = cdxline.split(" ")
            # Compare canonicalised URLs (in case an intermediary e.g. adds a default :80 port)
            cdx_url = str(urlcanon.whatwg(urlcanon.parse_url(cdx[2])))
            if r_url == cdx_url:
                ctx.log.info("MATCH")
                playback = True
                break
            else:
                ctx.log.info("NO MATCH '%s' '%s'" % (r_url, cdx_url))

        # Either playback or record, depending on the outcome:
        if playback:
            ctx.log.info("PYWB")
            return ("pywb", 8080)
        else:
            ctx.log.info("WARCPROX")
            return ("warcprox", 8000)
Example #2
def test_parser_idempotence():
    path = os.path.join(os.path.dirname(__file__), '..', '..', 'testdata',
                        'idempotence.json')
    with open(path, 'rb') as f:
        inputs = load_json_bytes(f.read())
    for s in inputs:
        assert urlcanon.parse_url(s).__bytes__() == s
Example #3
def new_site(frontier, site):
    site.id = str(uuid.uuid4())
    logging.info("new site {}".format(site))
    # insert the Page into the database before the Site, to avoid the situation
    # where a brozzler worker immediately claims the site, finds no pages
    # to crawl, and decides the site is finished
    try:
        url = urlcanon.parse_url(site.seed)
        hashtag = (url.hash_sign + url.fragment).decode("utf-8")
        urlcanon.canon.remove_fragment(url)
        page = brozzler.Page(
            frontier.rr, {
                "url": str(url),
                "site_id": site.get("id"),
                "job_id": site.get("job_id"),
                "hops_from_seed": 0,
                "priority": 1000,
                "needs_robots_check": True
            })
        if hashtag:
            page.hashtags = [
                hashtag,
            ]
        page.save()
        logging.info("queued page %s", page)
    finally:
        # finally block because we want to insert the Site no matter what
        site.save()
Example #4
    def from_url(cls, url):
        """Returns broken-down SURT from a URL.

        Arguments:
        url -- The URL to SURTify.

        Returns:
        A SURT broken down into its parts.
        """
        return cls(parse_url(url).surt().decode('utf-8'))
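
For context, a SURT reorders a URL's host labels (www.example.com becomes com,example,www) so URLs from the same site sort and prefix-match together. A small illustrative sketch, limited to the surt() keyword arguments that also appear in the tests further down:

import urlcanon

url = urlcanon.parse_url("http://www.example.com/some/path?q=1")
print(url.surt())                      # full SURT, returned as bytes
print(url.surt(with_scheme=False))     # SURT without the scheme prefix
print(url.surt(trailing_comma=False))  # SURT without the trailing comma after the host labels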
Example #5
def new_seed_page(frontier, site):
    url = urlcanon.parse_url(site.seed)
    hashtag = (url.hash_sign + url.fragment).decode("utf-8")
    urlcanon.canon.remove_fragment(url)
    page = brozzler.Page(frontier.rr, {
        "url": str(url), "site_id": site.get("id"),
        "job_id": site.get("job_id"), "hops_from_seed": 0,
        "priority": 1000, "needs_robots_check": True})
    if hashtag:
        page.hashtags = [hashtag,]
    return page
Example #6
def new_seed_page(frontier, site):
    url = urlcanon.parse_url(site.seed)
    hashtag = (url.hash_sign + url.fragment).decode("utf-8")
    urlcanon.canon.remove_fragment(url)
    page = brozzler.Page(frontier.rr, {
        "url": str(url), "site_id": site.get("id"),
        "job_id": site.get("job_id"), "hops_from_seed": 0,
        "priority": 1000, "needs_robots_check": True})
    if hashtag:
        page.hashtags = [hashtag,]
    return page
Example #7
def url_matches_domain_exactly(url, domain):
    '''
    Returns true if
     - domain is an ip address and url.host is the same ip address
     - domain is a domain and url.host is the same domain

    Does not do any normalization/canonicalization. Probably a good idea to
    call `host_matches_domain(
            canonicalize(url), urlcanon.normalize_host(domain))`.
    '''
    if not isinstance(url, urlcanon.ParsedUrl):
        url = urlcanon.parse_url(url)
    return host_matches_domain_exactly(url.host, domain)
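
A hedged usage sketch following the docstring's advice (the domains are illustrative): canonicalise the URL and normalise the domain with urlcanon.normalize_host before doing the exact comparison.

import urlcanon

url = urlcanon.whatwg(urlcanon.parse_url("https://www.example.com:443/page"))
print(url_matches_domain_exactly(url, urlcanon.normalize_host(b"www.example.com")))  # True
print(url_matches_domain_exactly(url, b"example.com"))  # False: exact match only, subdomains do not count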
Example #8
    def applies(self, url, parent_url=None):
        '''
        Returns true if `url` matches `match_rule`.

        All conditions must match for a url to be considered a match.

        The caller should normally canonicalize `url` and `parent_url` before
        passing them to this method.

        Args:
            url (urlcanon.ParsedUrl or bytes or str): already canonicalized url
            parent_url (urlcanon.ParsedUrl or bytes or str, optional): parent
                url, should be supplied if the rule has a `parent_url_regex`
        Returns:
            bool: True if the rule matches, False otherwise
        '''
        if not isinstance(url, urlcanon.ParsedUrl):
            url = urlcanon.parse_url(url)

        if self.domain:
            domain_test_fn = (url_matches_domain if not self.exact else
                              url_matches_domain_exactly)
            if not domain_test_fn(url, self.domain):
                return False
        if self.surt:
            surt = url.surt()
            if not (surt == self.surt
                    if self.exact else surt.startswith(self.surt)):
                return False
        if self.ssurt:
            surt = url.ssurt()
            if not (surt == self.ssurt
                    if self.exact else surt.startswith(self.ssurt)):
                return False
        if self.substring and not url.__bytes__().find(self.substring) >= 0:
            return False
        if self.regex:
            if not self.regex.match(url.__bytes__()):
                return False
        if self.parent_url_regex:
            if not parent_url:
                return False
            if isinstance(parent_url, urlcanon.ParsedUrl):
                parent_url = parent_url.__bytes__()
            elif isinstance(parent_url, unicode):
                parent_url = parent_url.encode('utf-8')
            if not self.parent_url_regex.match(parent_url):
                return False

        return True
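
Per the docstring, the caller canonicalises url (and parent_url, if the rule has a parent_url_regex) before calling applies(). A minimal sketch of that preparation with hypothetical URLs; the enclosing rule class is not shown above, so the actual call is left as a comment:

import urlcanon

url = urlcanon.whatwg(urlcanon.parse_url("http://EXAMPLE.com:80/a/b"))
parent = urlcanon.whatwg(urlcanon.parse_url("http://example.com/"))
# rule.applies(url, parent_url=parent)  # `rule` would be an instance of the
#                                       # (unshown) class this method belongs to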
Example #9
def test_w3c_test_data(input, href, test):
    url = urlcanon.parse_url(input)
    urlcanon.whatwg(url)
    assert test['protocol'].encode('utf-8') == (url.scheme +
                                                url.colon_after_scheme)
    assert test['username'].encode('utf-8') == url.username
    assert test['password'].encode('utf-8') == url.password
    assert test['host'].encode('utf-8') == url.host_port
    assert test['hostname'].encode('utf-8') == url.host
    assert test['pathname'].encode('utf-8') == url.path
    assert test['search'].encode('utf-8') == (url.query and
                                              (url.question_mark + url.query)
                                              or b'')
    assert test['hash'].encode('utf-8') == (url.fragment and
                                            (url.hash_sign + url.fragment)
                                            or b'')
    assert test['href'] == unicode(url)
Example #10
 def url_to_canon(self, url):
     parsed_url = urlcanon.parse_url(url)
     urlcanon.whatwg(parsed_url)
     parsed_url = str(parsed_url)
     if parsed_url.lower().endswith("index.html"):
         parsed_url = parsed_url[:parsed_url.index("index.html")]
     neki2 = parsed_url.rsplit('/', 1)[1]
     if '#' in neki2:
         parsed_url = parsed_url[:parsed_url.index("#")]
     if neki2 != '' and '.' not in neki2 and not neki2.endswith('/'):
         parsed_url += '/'
     parsed_url = urllib.parse.unquote(parsed_url)
     if parsed_url.count(':') == 1:
         ena, dva = parsed_url.split(':')
         if ' ' in dva:
             parsed_url = ena + ':' + urllib.parse.quote(dva)
     parsed_url = url_normalize.url_normalize(parsed_url)
     return parsed_url
Example #11
def normalizeUrl(url, link_value):
    parsed_url_str = None
    for exc in PATH_EXCLUSIONS:
        if exc in url.path:
            return None

    if url.query or url.fragment or url.scheme in ("mailto", "tel", "data", "javascript"):
        return None

    link_value = eliminateFromURL(link_value, EXTRAS)
    parsed_url = urlcanon.parse_url(link_value)
    urlcanon.whatwg(parsed_url)
    parsed_url_str = str(parsed_url)
    parsed_url_str = parsed_url_str.replace('//', '/')
    if parsed_url_str:
        if parsed_url_str[0] == '.':
            parsed_url_str = parsed_url_str[1:]
        if parsed_url_str[-1] == '/':
            parsed_url_str = parsed_url_str[:-1]
    return parsed_url_str
Example #12
def add_articles():
    Article.objects.delete(date__gte=(datetime.datetime.now() -
                                      datetime.timedelta(days=2)))
    idk = FeedModel.objects.all()

    for bar in idk:
        print(bar.url)
        foo = feedparser.parse(bar.url)
        for post in foo.entries:
            time.sleep(10)
            parsed_url = urlcanon.parse_url(post.link)
            og = OpenGraph(url=post.link)
            try:
                category = model.predict([post.title])
                Article.objects.add_article(post.title, post.description,
                                            parsed_url, og.image, bar.title,
                                            category)
                logger.info("Article Added")
            except:
                logger.info("Did Not Work")
                continue
Example #13
            def prune_outlinks(dirty_links, block_list=None):
                '''
                Filter for valid schemes, remove URL fragments, and drop any other designated URLs from the list.
                '''
                links = set()
                dirty_links = set(dirty_links)

                self.logger.info('Pruning links...')
                for link in dirty_links:
                    link = urlcanon.parse_url(link)

                    if link.scheme in (b'http', b'https', b'ftp'):
                        urlcanon.canon.remove_fragment(link)
                        link = str(link).strip()
                        links.add(link)

                self.logger.info('Pruning complete.')

                # Remove blocked links only after fragments have been stripped, to prevent duplication.
                if block_list:
                    self.logger.info('Removing Links: %s', ', '.join(block_list))
                    links = links.difference(block_list)

                return links
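
The core of the pruning above (scheme allow-list plus fragment removal) can be exercised standalone with the same urlcanon calls; the links below are hypothetical:

import urlcanon

dirty_links = [
    "https://example.com/a#section",  # fragment should be dropped
    "https://example.com/a",          # then de-duplicates with the one above
    "javascript:void(0)",             # disallowed scheme, filtered out
]
links = set()
for link in dirty_links:
    parsed = urlcanon.parse_url(link)
    if parsed.scheme in (b'http', b'https', b'ftp'):
        urlcanon.canon.remove_fragment(parsed)
        links.add(str(parsed).strip())
print(links)  # a single entry for https://example.com/a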
Example #14
            def prune_outlinks(dirty_links, block_list=None):
                '''
                Filter for valid schemes, remove URL fragments, and drop any other designated URLs from the list.
                '''
                links = set()
                dirty_links = set(dirty_links)

                self.logger.info('Pruning links...')
                for link in dirty_links:
                    link = urlcanon.parse_url(link)

                    if link.scheme in (b'http', b'https', b'ftp'):
                        urlcanon.canon.remove_fragment(link)
                        link = str(link).strip()
                        links.add(link)

                self.logger.info('Pruning complete.')

                # Remove blocked links only after fragments have been stripped, to prevent duplication.
                if block_list:
                    self.logger.info('Removing Links: %s', ', '.join(block_list))
                    links = links.difference(block_list)

                return links
Example #15
def test_parsing(input, parsed_fields):
    parsed_url = urlcanon.parse_url(input)
    assert parsed_url.leading_junk == parsed_fields[b'leading_junk']
    assert parsed_url.scheme == parsed_fields[b'scheme']
    assert parsed_url.colon_after_scheme == parsed_fields[
        b'colon_after_scheme']
    assert parsed_url.slashes == parsed_fields[b'slashes']
    assert parsed_url.username == parsed_fields[b'username']
    assert parsed_url.colon_before_password == parsed_fields[
        b'colon_before_password']
    assert parsed_url.password == parsed_fields[b'password']
    assert parsed_url.at_sign == parsed_fields[b'at_sign']
    assert parsed_url.ip6 == parsed_fields[b'ip6']
    assert parsed_url.ip4 == parsed_fields[b'ip4']
    assert parsed_url.host == parsed_fields[b'host']
    assert parsed_url.colon_before_port == parsed_fields[b'colon_before_port']
    assert parsed_url.port == parsed_fields[b'port']
    assert parsed_url.path == parsed_fields[b'path']
    assert parsed_url.question_mark == parsed_fields[b'question_mark']
    assert parsed_url.query == parsed_fields[b'query']
    assert parsed_url.hash_sign == parsed_fields[b'hash_sign']
    assert parsed_url.fragment == parsed_fields[b'fragment']
    assert parsed_url.trailing_junk == parsed_fields[b'trailing_junk']
Example #16
def parse_record(path, node_id, edge_id, process_record, max_identifier_length,
                 dt14):
    with open(path, "rb") as infile:
        # loop on every record in WAT
        for record in ArchiveIterator(infile):
            record_array = []

            if record.rec_type != 'metadata':
                continue

            warc_target_uri = urlcanon.parse_url(
                record.rec_headers.get_header('WARC-Target-URI'))
            urlcanon.whatwg(warc_target_uri)  # canonicalization

            # select only members whose WARC-Target-URI begins with "https?://"
            if not re.search("^https?://", str(warc_target_uri)) or len(
                    str(warc_target_uri)) > max_identifier_length:
                continue

            dt = record.rec_headers.get_header('WARC-Date')

            if dt14:
                dt = dp.parse(dt).strftime('%Y%m%d%H%M%S')

            # construct node with timestamp (VersionNode)
            version_node = {
                "an": {
                    node_id: {
                        "identifier":
                        str(warc_target_uri.ssurt(), encoding='utf-8'),
                        "timestamp":
                        dt,
                        "TYPE":
                        "VersionNode"
                    }
                }
            }

            record_array.append(json.dumps(version_node))
            record_array.append('\r\n')

            source_id = node_id
            node_id += 1

            content = json.loads(record.raw_stream.read().decode('utf-8'))

            try:
                links = content["Envelope"]["Payload-Metadata"][
                    "HTTP-Response-Metadata"]["HTML-Metadata"]["Links"]
            except:
                links = ''

            # loop on links if not empty and get all urls
            if links != '':
                for link in links:
                    # this is for empty outlink elements, maybe a bug in webarchive-commons used to generate WAT
                    try:
                        # convert relative outlink to absolute one
                        url = urljoin(str(warc_target_uri), link["url"])
                        urlcanon.whatwg(url)  # canonicalization

                        # match only urls that begin with "https?://"
                        if not re.search("^https?://", url) or len(
                                str(url)) > max_identifier_length:
                            continue

                        # construct node and edge
                        node = {
                            "an": {
                                node_id: {
                                    "identifier":
                                    str(urlcanon.parse_url(url).ssurt(),
                                        encoding="utf-8"),
                                    "TYPE":
                                    "Node"
                                }
                            }
                        }

                        edge = {
                            "ae": {
                                edge_id: {
                                    "directed": "true",
                                    "source": str(source_id),
                                    "target": str(node_id)
                                }
                            }
                        }

                        record_array.append(json.dumps(node))
                        record_array.append('\r\n')
                        record_array.append(json.dumps(edge))
                        record_array.append('\r\n')

                        node_id += 1
                        edge_id += 1
                    except:
                        continue

            same_batch = process_record(record_array, node_id, edge_id)

            if not same_batch:
                node_id = edge_id = 1
Example #17
 def from_seeds(seed_list: List[str]) -> "Scope":
     new_list: Set[bytes] = set()
     for url in seed_list:
         surt = parse_url(url).surt(with_scheme=False)
         new_list.add(surt[0 : surt.index(surt_end) + 1])
     return Scope(new_list)
Example #18
 def in_scope(self, url: str) -> bool:
     usurt = parse_url(url).surt(with_scheme=False)
     for surt in self.surts:
         if usurt.startswith(surt):
             return True
     return False
Example #19
def test_semantic_precise(uncanonicalized, canonicalized):
    url = urlcanon.parse_url(uncanonicalized)
    urlcanon.semantic_precise(url)
    assert url.__bytes__() == canonicalized
Example #20
def test_aggressive(uncanonicalized, canonicalized):
    url = urlcanon.parse_url(uncanonicalized)
    # if uncanonicalized == b'  https://www.google.com/  ':
    #     import pdb; pdb.set_trace()
    urlcanon.aggressive(url)
    assert url.__bytes__() == canonicalized
Example #21
def test_google_canonicalizer(uncanonicalized, canonicalized):
    url = urlcanon.parse_url(uncanonicalized)
    urlcanon.google(url)
    assert url.__bytes__() == canonicalized
Example #22
def test_surt_without_trailing_comma(url, surt):
    assert urlcanon.parse_url(url).surt(trailing_comma=False) == surt
Example #23
def test_surt_without_scheme(url, surt):
    assert urlcanon.parse_url(url).surt(with_scheme=False) == surt
Example #24
def test_surt(url, surt):
    assert urlcanon.parse_url(url).surt() == surt
Example #25
def test_supplemental_whatwg(uncanonicalized, canonicalized):
    url = urlcanon.parse_url(uncanonicalized)
    urlcanon.whatwg(url)
    assert url.__bytes__() == canonicalized
Example #26
def get_canonized_url(url):
    return urlcanon.parse_url(url)
Example #27
    def run(self):
        while not frontier.empty():
            # get next url from frontier
            url = frontier.get()

            # parse url to get base url and domain name
            split_url = urlsplit(url)
            base = "{0.netloc}".format(split_url)

            domain = base.replace("www.", "") if "www." in base else base
            base_url = "{0.scheme}://{0.netloc}/".format(split_url)

            # first check if can access page
            canAccess = self.checkIPAccessTime(domain)
            if canAccess != None:
                if not canAccess:
                    # return url to frontier and move on to the next url
                    frontier.put(url)
                    continue
            else:
                continue

            # check if site already saved
            robotLock.acquire()
            site = self.findSiteByDomain(domain)
            if site:
                robotLock.release()
                siteID = site[0]
                robot_content = site[2]
            else:
                # retrieve robots.txt content
                try:
                    r = requests.get(parse.urljoin(base_url, 'robots.txt'))
                    robot_content = None

                    # if it exists, save it
                    if r.status_code == requests.codes.ok:
                        robot_content = r.text
                except(requests.exceptions.MissingSchema, requests.exceptions.ConnectionError, requests.exceptions.InvalidURL, requests.exceptions.InvalidSchema):
                    robot_content = None

                # wait some time
                time.sleep(MINOR_TIMEOUT)

                # get sitemap.xml
                try:
                    s = requests.get(parse.urljoin(base_url, 'sitemap.xml'))
                    sitemap_content = None

                    # if it exists save it
                    if s.status_code == requests.codes.ok:
                        sitemap_content = s.text
                except(requests.exceptions.MissingSchema, requests.exceptions.ConnectionError, requests.exceptions.InvalidURL, requests.exceptions.InvalidSchema):
                    sitemap_content = None

                # wait some time
                time.sleep(MINOR_TIMEOUT)

                # save site
                siteID = self.insertSite(domain, robot_content, sitemap_content)
                robotLock.release()

            # create robot file parser object
            robot = robotexclusionrulesparser.RobotExclusionRulesParser()
            if robot_content:
                robot.parse(robot_content)

            # check if current url is allowed by robots.txt
            duplicatesLock.acquire()
            if not robot.is_allowed(USER_AGENT, url):
                pageID = self.findPageByUrl(url)
                self.deleteLinkByID(pageID)
                self.deletePageByUrl(url)
                duplicatesLock.release()
                continue

            duplicatesLock.release()

            # download content from url
            try:
                self.webDriver.get(url)
                time.sleep(TIMEOUT)
            except TimeoutException:
                # save timeout (no response object is available at this point,
                # so look up the page and store a null status code)
                pageID = self.findPageByUrl(url)
                if pageID:
                    # page already saved
                    self.updatePage(pageID, siteID, PAGE_TIMEOUT, None, None, datetime.now())
                else:
                    # save new page
                    pageID = self.insertPage(siteID, PAGE_TIMEOUT, url, None, None, datetime.now())

                # continue to next url in frontier
                del self.webDriver.requests
                print(f"Worker {self.threadID}: {url} done...")
                continue

            # retrieve request that loaded page
            req = None
            for request in self.webDriver.requests:
                if request.response and request.response.status_code >= 300 and request.response.status_code <= 399:
                    continue

                if request.response and request.path == url:
                    req = request
                    break

                if request.response and request.response.status_code == requests.codes.ok:
                    req = request
                    break

            if req == None:
                for request in self.webDriver.requests:
                    if request.response:
                        if request.response.status_code == 403 or request.response.status_code == 503:
                            req = request
                            break

                if not req:
                    req = self.webDriver.last_request

            # check page type and save page info
            pageID = self.findPageByUrl(url)
            if req and req.response:
                content_type = req.response.headers.get('Content-Type')
                if content_type:
                    if "text/html" in content_type:
                        # HTML page

                        # check for canonical link
                        try:
                            canonicalLink = self.webDriver.find_element_by_xpath("//link[@rel='canonical']")
                            if canonicalLink:
                                link = canonicalLink.get_attribute('href')

                                if link != url:
                                    # is duplicate
                                    duplicatesLock.acquire()

                                    # check if original page already saved
                                    originalPageID = self.findPageByUrl(link)
                                    if originalPageID:
                                        duplicatesLock.release()

                                        if pageID:
                                            # page already saved
                                            self.updatePage(pageID, None, DUPLICATE, None, None, datetime.now())
                                        else:
                                            # save new page and remember id
                                            pageID = self.insertPage(None, DUPLICATE, None, None, None, datetime.now())

                                        # add link to original page
                                        self.insertLink(pageID, originalPageID)

                                        # continue to next url in frontier
                                        del self.webDriver.requests
                                        print(f"Worker {self.threadID}: {url} done...")
                                        continue
                                    else:
                                        # create blank page
                                        originalPageID = self.insertPage(None, FRONTIER, link, None, None, None)
                                        duplicatesLock.release()

                                        if pageID:
                                            # page already saved
                                            self.updatePage(pageID, None, DUPLICATE, None, None, datetime.now())
                                        else:
                                            # save new page and remember id
                                            pageID = self.insertPage(None, DUPLICATE, None, None, None, datetime.now())

                                        # add link to original page
                                        self.insertLink(pageID, originalPageID)

                                        # add url to frontier
                                        frontier.put(link)

                                        # continue to next url in frontier
                                        del self.webDriver.requests
                                        print(f"Worker {self.threadID}: {url} done...")
                                        continue
                        except(NoSuchElementException, StaleElementReferenceException):
                            pass

                        # check for duplicate content
                        originalPageID = self.findPageByContent(self.webDriver.page_source)
                        if originalPageID:
                            # is duplicate
                            if pageID:
                                # page already saved
                                self.updatePage(pageID, None, DUPLICATE, None, None, datetime.now())
                            else:
                                # save new page and remember id
                                pageID = self.insertPage(None, DUPLICATE, None, None, None, datetime.now())

                            # add link to original page
                            self.insertLink(pageID, originalPageID)

                            # continue to next url in frontier
                            del self.webDriver.requests
                            print(f"Worker {self.threadID}: {url} done...")
                            continue

                        # not duplicate
                        if pageID:
                            # page already saved
                            self.updatePage(pageID, siteID, FRONTIER_HTML, self.webDriver.page_source, req.response.status_code, datetime.now())
                        else:
                            # save new page and remember id
                            pageID = self.insertPage(siteID, FRONTIER_HTML, url, self.webDriver.page_source, req.response.status_code, datetime.now())

                        # let through only pages that loaded successfully
                        if req.response.status_code != requests.codes.ok:
                            del self.webDriver.requests
                            print(f"Worker {self.threadID}: {url} done...")
                            continue
                    elif "text/plain" in content_type:
                        # TXT content
                        if pageID:
                            # page already saved
                            self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                        else:
                            # save new page
                            pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())

                        # insert page data
                        self.insertPageData(pageID, TXT)

                        # continue to next url in frontier
                        del self.webDriver.requests
                        print(f"Worker {self.threadID}: {url} done...")
                        continue
                    elif "application/pdf" in content_type:
                        # PDF content
                        if pageID:
                            # page already saved
                            self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                        else:
                            # save new page
                            pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())

                        # insert page data
                        self.insertPageData(pageID, PDF)

                        # continue to next url in frontier
                        del self.webDriver.requests
                        print(f"Worker {self.threadID}: {url} done...")
                        continue
                    elif "application/msword" in content_type:
                        # DOC content
                        if pageID:
                            # page already saved
                            self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                        else:
                            # save new page
                            pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())

                        # insert page data
                        self.insertPageData(pageID, DOC)

                        # continue to next url in frontier
                        del self.webDriver.requests
                        print(f"Worker {self.threadID}: {url} done...")
                        continue
                    elif "application/vnd.openxmlformats-officedocument.wordprocessingml.document" in content_type:
                        # DOCX content
                        if pageID:
                            # page already saved
                            self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                        else:
                            # save new page
                            pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())

                        # insert page data
                        self.insertPageData(pageID, DOCX)

                        # continue to next url in frontier
                        del self.webDriver.requests
                        print(f"Worker {self.threadID}: {url} done...")
                        continue
                    elif "application/vnd.ms-powerpoint" in content_type:
                        # PPT content
                        if pageID:
                            # page already saved
                            self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                        else:
                            # save new page
                            pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())

                        # insert page data
                        self.insertPageData(pageID, PPT)

                        # continue to next url in frontier
                        del self.webDriver.requests
                        print(f"Worker {self.threadID}: {url} done...")
                        continue
                    elif "application/vnd.openxmlformats-officedocument.presentationml.presentation" in content_type:
                        # PPTX content
                        if pageID:
                            # page already saved
                            self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                        else:
                            # save new page
                            pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())

                        # insert page data
                        self.insertPageData(pageID, PPTX)

                        # continue to next url in frontier
                        del self.webDriver.requests
                        print(f"Worker {self.threadID}: {url} done...")
                        continue
                    elif "image" in content_type:
                        # IMAGE content
                        if pageID:
                            # page already saved
                            self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                        else:
                            # save new page
                            pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())

                        # parse file name
                        filename = urlparse(url)

                        # insert image data
                        self.insertImage(pageID, os.path.basename(filename.path), content_type, datetime.now())

                        # continue to next url in frontier
                        del self.webDriver.requests
                        print(f"Worker {self.threadID}: {url} done...")
                        continue
                    elif "text/css" in content_type:
                        # CSS content
                        if pageID:
                            # page already saved
                            self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                        else:
                            # save new page
                            pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())

                        # insert page data
                        self.insertPageData(pageID, CSS)

                        # continue to next url in frontier
                        del self.webDriver.requests
                        print(f"Worker {self.threadID}: {url} done...")
                        continue
                    elif "text/csv" in content_type:
                        # CSV content
                        if pageID:
                            # page already saved
                            self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                        else:
                            # save new page
                            pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())

                        # insert page data
                        self.insertPageData(pageID, CSV)

                        # continue to next url in frontier
                        del self.webDriver.requests
                        print(f"Worker {self.threadID}: {url} done...")
                        continue
                    elif "application/zip" in content_type:
                        # ZIP content
                        if pageID:
                            # page already saved
                            self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                        else:
                            # save new page
                            pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())

                        # insert page data
                        self.insertPageData(pageID, ZIP)

                        # continue to next url in frontier
                        del self.webDriver.requests
                        print(f"Worker {self.threadID}: {url} done...")
                        continue
                    else:
                        # unknown BINARY content
                        if pageID:
                            # page already saved
                            self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                        else:
                            # save new page
                            pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())

                        # insert page data
                        self.insertPageData(pageID, UNKNOWN)

                        # continue to next url in frontier
                        del self.webDriver.requests
                        print(f"Worker {self.threadID}: {url} done...")
                        continue
                else:
                    # no content header -> mark page as UNDEFINED
                    if pageID:
                        # page already saved
                        self.updatePage(pageID, siteID, UNDEFINED, None, req.response.status_code, datetime.now())
                    else:
                        # save new page
                        pageID = self.insertPage(siteID, UNDEFINED, url, None, req.response.status_code, datetime.now())

                    # continue to next url in frontier
                    del self.webDriver.requests
                    print(f"Worker {self.threadID}: {url} done...")
                    continue
            else:
                # some kind of error happened
                if pageID:
                    # page already saved
                    self.updatePage(pageID, siteID, NO_RESPONSE, None, None, datetime.now())
                else:
                    # save new page
                    pageID = self.insertPage(siteID, NO_RESPONSE, url, None, None, datetime.now())

                # continue to next url in frontier
                del self.webDriver.requests
                print(f"Worker {self.threadID}: {url} done...")
                continue

            # only if page is of HTML type
            # extract links

            # href
            elements = self.webDriver.find_elements_by_xpath("//*[@href]")
            for element in elements:
                try:
                    link = element.get_attribute('href')

                    # check if url allowed by robots.txt and if is from .gov.si
                    if self.isGov(link) and robot.is_allowed(USER_AGENT, link):
                        # canonicalize url
                        link = str(urlcanon.whatwg(urlcanon.parse_url(link)))

                        # add url to frontier
                        self.addUrlToFrontier(pageID, link)
                except(NoSuchElementException, StaleElementReferenceException):
                    continue

            # onclick
            elements = self.webDriver.find_elements_by_xpath("//*[@onclick]")
            for element in elements:
                try:
                    line = element.get_attribute('onclick')
                    if line:
                        link = ""
                        if "location.href='" in line:
                            rightLine = line.split("location.href='")[1]
                            link = rightLine.split("'")[0]
                        elif "document.location='" in line:
                            rightLine = line.split("document.location='")[1]
                            link = rightLine.split("'")[0]

                        if link != "":
                            # check if url allowed by robots.txt and if is from .gov.si
                            if self.isGov(link) and robot.is_allowed(USER_AGENT, link):
                                # canonicalize url
                                link = str(urlcanon.whatwg(urlcanon.parse_url(link)))

                                # add url to frontier
                                self.addUrlToFrontier(pageID, link)
                except(NoSuchElementException, StaleElementReferenceException):
                    continue

            # extract images
            elements = self.webDriver.find_elements_by_tag_name('img')
            for element in elements:
                try:
                    link = element.get_attribute('src')

                    # check if url allowed by robots.txt, if is from .gov.si and if src attribute has URL
                    if self.isGov(link) and robot.is_allowed(USER_AGENT, link) and re.match(self.urlValidator, link):
                        link = str(urlcanon.whatwg(urlcanon.parse_url(link)))

                        self.addUrlToFrontier(pageID, link)
                except(NoSuchElementException, StaleElementReferenceException):
                    continue

            del self.webDriver.requests
            print(f"Worker {self.threadID}: {url} done...")

        self.conn.close()
        self.webDriver.quit()
        print(f"Worker {self.threadID}: finished crawling.")
Example #28
    def gather_links(self):

        # Define Browser Options

        soup = BeautifulSoup(self.current_page_html, "lxml")

        # Extract links to profiles from TWDS Authors
        links = set()
        images = set()
        for link in soup.find_all("a"):
            current_url_relative = link.get('href')

            current_url = urllib.parse.urljoin(self.site_currently_crawling[1],
                                               current_url_relative)

            current_parsed_url_urlcanon = urlcanon.parse_url(current_url)
            urlcanon.whatwg(current_parsed_url_urlcanon)

            current_parsed_url = urllib.parse.urlparse(current_url)

            if (current_parsed_url.scheme != "http"
                    and current_parsed_url.scheme != "https"):
                continue

            #print("uglyurl: ", current_url, "CANON: ", current_parsed_url_urlcanon, "current_parsed_url: ", current_parsed_url)

            # print("DOMAIN", self.site_currently_crawling[1])
            # print("     URL------->", current_url, current_parsed_url.geturl())

            links.add(current_parsed_url)

        onclicks = soup.find_all(attrs={'onclick': True})

        if len(onclicks) > 0:
            for onclick in onclicks:
                try:
                    # work on the onclick attribute value (a string), not the tag object
                    onclick_value = onclick.get('onclick', '')
                    x = onclick_value.find("location=")
                    if x < 0:
                        continue
                    # split on the quote character that follows "location="
                    onclick_split = onclick_value.split(onclick_value[x + 9])
                    for index, string in enumerate(onclick_split):
                        if "location=" in string:
                            loc = onclick_split[index + 1]
                            print("onclick location found:", loc)
                            current_url = urllib.parse.urljoin(
                                self.site_currently_crawling[1], loc)
                            current_parsed_url_urlcanon = urlcanon.parse_url(
                                current_url)
                            urlcanon.whatwg(current_parsed_url_urlcanon)
                            current_parsed_url = urllib.parse.urlparse(
                                current_url)
                            links.add(current_parsed_url)
                            break
                except Exception:
                    continue

        for image in soup.find_all("img"):
            current_url_relative = image.get('src')

            current_url = urllib.parse.urljoin(self.site_currently_crawling[1],
                                               current_url_relative)

            current_parsed_url = urllib.parse.urlparse(current_url)

            images.add(current_parsed_url)

        # print(images)

        for image in images:
            fullurl = urllib.parse.urljoin(self.site_currently_crawling[1],
                                           image.geturl())
            fullurl = urllib.parse.urlparse(fullurl)

            try:
                res = requests.get(fullurl.geturl())
            except Exception:
                continue

            content_type = res.headers['content-type']
            content = res.content
            url = image.geturl()
            path = urllib.parse.urlparse(url).path
            filename = os.path.basename(path)

            db.insert_image(self.page_currently_crawling[0], filename,
                            content_type, content, int(time.time()))

        return list(links)
Example #29
 def canonicalize(self, url):
     if not isinstance(url, urlcanon.ParsedUrl):
         url = urlcanon.parse_url(url)
     for step in self.steps:
         step(url)
     return url
Example #30
def canon(s: str) -> str:
    parsed = urlcanon.parse_url(s)
    return str(urlcanon.whatwg(parsed))
Example #31
def clean_url(s: str) -> str:
    s = s.strip()
    parsed = urlcanon.parse_url(s)
    if not parsed.port and parsed.colon_before_port:
        parsed.colon_before_port = b""
    return str(urlcanon.whatwg(parsed))
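
An illustrative call (the input URL is hypothetical): the dangling colon with no port is dropped before WHATWG canonicalisation, which also lowercases the host.

print(clean_url("  https://Example.COM:/path  "))  # expected: https://example.com/path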
Example #32
    def main(self):
        # The page contains HTML, let's scrape it --------------------------------------------------
        firefox_options = FirefoxOptions()

        # Adding a specific user agent
        firefox_options.add_argument("user-agent=fri-ieps-kslk")
        firefox_options.add_argument("--headless")

        print(f"[PageHandler] Retrieving web page URL '{self.page_url}'")
        self.driver = webdriver.Firefox(
            options=firefox_options,
            executable_path=Config.WEB_DRIVER_LOCATION_GECKO)
        self.driver.set_page_load_timeout(10)

        self.driver.get(self.page_url)

        # Timeout needed for Web page to render (read more about it)
        time.sleep(Config.RENDERING_TIMEOUT)

        self.html_content = self.driver.page_source

        # Checking for duplicates ------------------------------------------------------------------
        self.hashed_content = hashlib.md5(
            self.html_content.encode("utf-8")).hexdigest()

        is_duplicate = self.session.query(Page).filter(
            Page.content_hash == self.hashed_content).first()
        if is_duplicate:
            self.page_db.page_type_code = "DUPLICATE"
            self.page_db.http_status_code = self.status_code
            self.page_db.site_id = self.site_id
            self.page_db.url = self.page_url
            self.page_db.accessed_time = getTimestamp()
            self.page_db.content_hash = self.hashed_content
            self.session.commit()
            self.session.close()
            self.driver.quit()
            return

        # The page is valid html and its not a duplicate, now we extract all the links on the page ---
        links = []

        # First, we extract the links with tag name "a"
        elems = self.driver.find_elements_by_tag_name("a")
        for elem in elems:
            href = elem.get_attribute('href')
            if href is None:
                continue
            if href.startswith("/"):
                links.append(self.base_url + href)
            elif href is not None and ("http" in href or "https" in href):
                links.append(href)

        # We also extract links from the onclick sections
        onclicks = self.driver.find_elements_by_xpath("//*[@onclick]")
        for el in onclicks:
            temp = el.get_attribute("onclick")
            if "location.href=" in temp:
                temp = temp.replace("location.href=", "")\
                    .replace("\'", "")\
                    .replace("\"", "")
                links.append(temp)

        # Remove the links that point outside of gov.si
        links_truncated = []
        for el in links:
            if "gov.si/" in el:
                links_truncated.append(el)

        links = links_truncated

        # Put the links in the canonical form
        links_canonical = []
        for el in links:
            parsed_link = urlcanon.parse_url(el)
            urlcanon.whatwg(parsed_link)
            links_canonical.append(str(parsed_link))

        links = links_canonical

        # Save the links to the DB -----------------------------------------------------------------
        for link in links:
            # Check if link is already in the DB
            is_duplicate = self.session.query(Page).filter(
                Page.url == link).first()
            if is_duplicate is None:
                extracted_domain_name = get_domain_name_from_url(link)

                page = Page()
                page.site_id = self.get_site_id_for_page(extracted_domain_name)

                # Pages with status == None have yet to be visited
                page.status = None
                page.page_type_code = "FRONTIER"
                page.url = link
                self.session.add(page)
                self.session.commit()

                # Also add a Link to the DB
                link_ = Link()
                link_.from_page = self.page_id
                link_.to_page = self.session.query(Page).filter(
                    Page.url == link).first().id
                self.session.add(link_)
                self.session.commit()
            #else:
            #    print(f"Page {link} is already in the DB")

        # Finding and storing the images on the page --------------------------------------------------
        imgs = self.driver.find_elements_by_tag_name("img")
        for elem in imgs:
            src = elem.get_attribute("src")
            url = ""
            if src is None:
                continue
            if src.startswith("/"):
                url = self.base_url + src
            elif src is not None and ("http" in src or "https" in src):
                url = src
            if url != "" and len(url) <= 255:
                # Save the image
                image = Image()
                image.page_id = self.page_id
                image.filename = url
                image.content_type = "BINARY"
                image.accessed_time = getTimestamp()
                self.session.add(image)
                self.session.commit()

        # With all the data scraped, we can save the page to the DB -------------------------------------
        self.page_db.html_content = self.html_content
        self.page_db.accessed_time = getTimestamp()
        self.page_db.content_hash = self.hashed_content
        self.page_db.http_status_code = self.status_code
        self.page_db.site_id = self.site_id
        self.page_db.page_type_code = "HTML"
        self.page_db.url = self.page_url
        self.session.commit()

        # Lets be responsible and close the session and the driver
        self.session.close()
        self.driver.quit()