Example #1
from re import match
from urllib import quote, quote_plus
from urlparse import urlparse, urlunparse, parse_qs

# norm_tuple comes from the urlnorm package (used as urlnorm.norm_tuple in the
# later examples); MalformedURLException is assumed to be defined elsewhere.
from urlnorm import norm_tuple


def url_fix(url, charset='utf-8'):
    '''Normalize a URL: rewrite known video URLs (YouTube, Vimeo, Facebook)
    to a canonical form and percent-encode the path and query of anything else.'''
    if isinstance(url, unicode):
        url = url.encode(charset, 'ignore')

    scheme, netloc, path, params, query, fragment = urlparse(url)

    if not netloc:
        raise MalformedURLException(url)

    if netloc.endswith('youtube.com'):
        params = parse_qs(query + fragment)
        try:
            # Of the form http://www.youtube.com/v/<video_id>(?|&)foo=bar
            matched = match('^/(v|embed)/([^?&]+)', path)
            if matched:
                video_id = matched.group(2)
            else:
                video_id = params['v'][0]
            scheme = 'http'
            netloc = 'www.youtube.com'
            path = '/watch'
            query = 'v=%s' % video_id
            fragment = params = ''
        except KeyError:
            raise MalformedURLException(url)

    elif netloc.endswith('vimeo.com'):
        try:
            scheme = 'http'
            netloc = 'www.vimeo.com'
            path = path[path.rindex('/') + 1:]
            query = params = fragment = ''
        except ValueError:
            raise MalformedURLException(url)

    elif netloc in ['facebook.com', 'www.facebook.com']:
        params = parse_qs(query + fragment)
        try:
            video_id = params['video_id'][0]
            scheme = 'https'
            netloc = 'graph.facebook.com'
            path = '/%s' % video_id
            query = params = fragment = ''
        except KeyError:
            vid_match = match('/v/([0-9]+)$', path)
            if not vid_match:
                raise MalformedURLException(url)
            scheme = 'http'
            netloc = 'www.facebook.com'
            query = params = fragment = ''

    else:
        path = quote(path, '/%')
        query = quote_plus(query, ':&=')

    return urlunparse(norm_tuple(scheme, netloc, path, params, query, fragment))
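
A minimal usage sketch for the function above; the import path is hypothetical, and the exact output can vary with how urlnorm's norm_tuple normalizes the resulting tuple:

# Hypothetical import path; url_fix and MalformedURLException come from the snippet above.
from video_url_utils import url_fix, MalformedURLException

try:
    # YouTube embed URLs collapse to the canonical /watch form,
    # roughly http://www.youtube.com/watch?v=abc123
    print(url_fix('http://www.youtube.com/embed/abc123?rel=0'))
    # Vimeo URLs are reduced to the bare clip id on www.vimeo.com,
    # roughly http://www.vimeo.com/76979871
    print(url_fix('http://vimeo.com/channels/staffpicks/76979871'))
except MalformedURLException:
    # raised for URLs with no netloc, or video URLs missing the expected id
    pass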
Example #2
    def download(self):
        '''
        modified from https://github.com/codelucas/newspaper/blob/master/newspaper/network.py
        '''
        if(self.is_downloaded):
            return True

        FAIL_ENCODING = 'ISO-8859-1'
        useragent = self.newspaper_article.config.browser_user_agent
        timeout = self.newspaper_article.config.request_timeout

        try:
            html = None
            # TODO: restore the full get_request_kwargs functionality from the newspaper impl;
            # meanwhile, pass the configured user agent and timeout rather than hard-coded values
            response = requests.get(url=self.url,
                                    headers={'User-Agent': useragent},
                                    timeout=timeout)
            if(response.status_code >= 400):
                logging.warn(u"encountered status code {0} while getting {1}".format(response.status_code, self.url))
                return False

            if(not re.search(r"(text/html|application/xhtml\+xml) *(; .*)?", response.headers["content-type"])):
                logging.debug(u"not an HTML content type: {0}".format(response.headers["content-type"]))
                return False

            try:
                parsed_url = urlparse(response.url)
                parsed_as_list = list(parsed_url)
                parsed_as_list[5] = ''
                self.canonical_url = urlunparse(urlnorm.norm_tuple(*parsed_as_list))
            except Exception as e:
                logging.info(u"skipping malformed url {0}. Error: {1}".format(response.url, str(e)))
                return False

            if response.encoding != FAIL_ENCODING:
                html = response.text
            else:
                html = response.content
            if not html:
                return False

            converted = UnicodeDammit(html, is_html=True)
            if not converted.unicode_markup:
                logging.warn("Failed to detect encoding of downloaded article, tried: " + ", ".join(converted.tried_encodings))
                return False
            self.html = converted.unicode_markup
            self.is_downloaded = True
        except Exception as e:
            logging.warn('%s on %s' % (e, self.url))
            return False
        return True
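
A sketch of how this download method might be driven; ExplorerArticle and its url-taking constructor are inferred from the crawler examples below, and process() is a placeholder consumer:

article = ExplorerArticle('http://example.com/some-story')  # constructor inferred from the crawler examples
if article.download():
    # on success the decoded markup and the normalized URL are available
    process(article.html, article.canonical_url)            # process() is hypothetical
else:
    logging.info(u"giving up on {0}".format(article.url))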
Example #3
        def next(self):
            '''
            (Crawler) -> newspaper.Article
            returns the next article in the sequence
            '''
            #standard non-recursive tree iteration
            while(True):
                if(len(self.visit_queue) <= 0):
                    raise StopIteration
                current_url = self.visit_queue.pop()

                if(self._should_skip()):
                    logging.info(u"skipping {0} randomly".format(current_url))
                    continue

                logging.info(u"visiting {0}".format(current_url))
                #use newspaper to download and parse the article
                article = ExplorerArticle(current_url)
                article.download()

                # get urls from the article
                for link in article.get_links():
                    url = urljoin(current_url, link.href, False)
                    if self.url_in_filter(url, self.filters):
                        logging.info("Matches with filter, skipping the {0}".format(url))
                        continue
                    try:
                        parsed_url = urlparse(url)
                        parsed_as_list = list(parsed_url)
                        if parsed_url.scheme != u"http" and parsed_url.scheme != u"https":
                            logging.info(u"skipping url with invalid scheme: {0}".format(url))
                            continue
                        parsed_as_list[5] = ''
                        url = urlunparse(urlnorm.norm_tuple(*parsed_as_list))
                    except Exception as e:
                        logging.info(u"skipping malformed url {0}. Error: {1}".format(url, str(e)))
                        continue
                    if not parsed_url.netloc.endswith(self.domain):
                        continue
                    if url in self.visited_urls:
                        continue
                    self.visit_queue.appendleft(url)
                    self.visited_urls.add(url)
                    logging.info(u"added {0} to the visit queue".format(url))

                self.pages_visited += 1
                return article
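
Because next() is defined here, the enclosing crawler presumably also provides __iter__ and can be consumed with a plain for-loop; a sketch with an assumed constructor signature:

crawler = Crawler('http://example.com/', 'example.com')   # hypothetical arguments
for article in crawler:                                   # the loop drives next() until StopIteration
    # canonical_url is only set when the underlying download() succeeded
    print(getattr(article, 'canonical_url', None))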
Example #4
    def next(self):
        '''
        (Crawler) -> newspaper.Article
        returns the next article in the sequence
        '''

        #standard non-recursive tree iteration
        with open('../ignore_filter/' + self.site.name + '_ignore_file.txt',
                  'a') as ignore_filter_file:
            try:
                current_level = 0
                while (True):
                    if (self.limit > 0 and self.visited_count > self.limit):
                        raise StopIteration('Limit reached: {:d}'.format(
                            self.limit))
                    # if(self.pages_visited > self.probabilistic_n):
                    #     raise StopIteration
                    # self.cursor.execute("SELECT * FROM " + self.tovisit_table + " ORDER BY id LIMIT 1")
                    # row = self.cursor.fetchone()
                    # if(row):
                    #     row_id = row[0]
                    #     current_url = row[1]
                    #     self.cursor.execute("DELETE FROM " + self.tovisit_table + " WHERE id=%s", (row_id,))
                    # else:
                    #     raise StopIteration

                    # if(self._should_skip()):
                    #     logging.info(u"skipping {0} randomly".format(current_url))
                    #     continue
                    try:
                        if (self.site.is_shallow):
                            current = self.to_visit.get_nowait()
                            current_url = current[0]
                            current_level = current[1]
                            logging.info(u"Shallow on level {0} {1}".format(
                                current_level, current_url))
                        else:
                            current_url = self.to_visit.get_nowait()
                    except Empty:
                        self.site.is_shallow = True  # On line 26 the site gets set TO DELETE
                        self.to_visit.put((self.site.url, str(0)))
                        self.ignore_filter = ScalableBloomFilter(
                            initial_capacity=10000000, error_rate=0.00001)
                        ignore_filter_file.close()
                        os.remove('../ignore_filter/' + self.site.name +
                                  '_ignore_file.txt')
                        logging.info("stopped iteration")
                        logging.info(u"{0}".format(self.site.url))
                        raise ZeroDivisionError

                    logging.info(u"visiting {0}".format(current_url))
                    self.visited_count += 1
                    #use newspaper to download and parse the article
                    article = ExplorerArticle(current_url)
                    article.download()
                    if (self.site.is_shallow):
                        if (int(current_level) > self.level):
                            continue
                    # get urls from the article
                    for link in article.get_links():
                        url = urljoin(current_url, link.href, False)
                        if self.url_in_filter(url, self.filters):
                            logging.info(
                                u"skipping url \"{0}\" because it matches filter"
                                .format(url))
                            continue
                        try:
                            parsed_url = urlparse(url)
                            parsed_as_list = list(parsed_url)

                            if (parsed_url.scheme != u"http"
                                    and parsed_url.scheme != u"https"):
                                logging.info(
                                    u"skipping url with invalid scheme: {0}".
                                    format(url))
                                continue
                            parsed_as_list[5] = ''
                            url = urlunparse(
                                urlnorm.norm_tuple(*parsed_as_list))
                        except Exception as e:
                            logging.info(
                                u"skipping malformed url {0}. Error: {1}".
                                format(url, str(e)))
                            continue
                        if (not parsed_url.netloc.endswith(self.domain)):
                            continue
                        # If the url has already been added to the ignore list, skip it
                        if (url in self.ignore_filter):
                            continue
                        # Ignores the subscribe links for many domains
                        if (u"subscribe" in url or "subscribe" in url and
                                not (u"-subscribe" in url or "-subscribe"
                                     or u"subscribe-" in url or "subscribe-")):
                            continue

                        # Append the url to to_visit queue
                        if (self.site.is_shallow):
                            self.to_visit.put(
                                (url, str(int(current_level) + 1)))
                            logging.info(
                                u"added {0} to to_visit at level {1}".format(
                                    url, str(int(current_level) + 1)))

                            # Append the url to visited to remove duplicates
                            self.ignore_filter.add(url)
                            ignore_filter_file.write(url.encode('utf8') + "\n")
                        else:
                            self.to_visit.put(url)
                            logging.info(
                                u"added {0} to the to_visit".format(url))

                            # Append the url to visited to remove duplicates
                            self.ignore_filter.add(url)
                            ignore_filter_file.write(url.encode('utf8') + "\n")

                    # Update the Queue
                    self.to_visit.task_done()

                    return article

            except StopIteration as e:
                raise e
            except ValueError:
                raise
            except Exception as e:
                raise e
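
The subscribe-link check above (with the condition corrected) amounts to a small predicate; a standalone sketch of the intended rule, with a hypothetical helper name:

def is_plain_subscribe_link(url):
    # Skip URLs that mention "subscribe" unless it appears inside a hyphenated
    # slug such as "newsletter-subscribe" or "subscribe-centre".
    return u"subscribe" in url and not (u"-subscribe" in url or u"subscribe-" in url)

# is_plain_subscribe_link(u"http://example.com/subscribe")            -> True  (skipped)
# is_plain_subscribe_link(u"http://example.com/newsletter-subscribe") -> False (kept)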
Example #5
    def download(self):
        '''
        modified from https://github.com/codelucas/newspaper/blob/master/newspaper/network.py
        '''
        if (self.is_downloaded):
            return True

        FAIL_ENCODING = 'ISO-8859-1'
        useragent = self.newspaper_article.config.browser_user_agent
        timeout = self.newspaper_article.config.request_timeout

        try:
            html = None
            with eventlet.Timeout(15):
                response = requests.get(
                    url=self.url, timeout=15
                )  #TODO: add back get_request_kwargs functionality present in newspaper impl
            if (response.status_code >= 400):
                logging.warn(
                    u"encountered status code {0} while getting {1}".format(
                        response.status_code, self.url))
                return False

            if (not re.search(r"(text/html|application/xhtml\+xml) *(; .*)?",
                              response.headers["content-type"])):
                logging.debug(u"not an HTML content type: {0}".format(
                    response.headers["content-type"]))
                return False

            try:
                parsed_url = urlparse(response.url)
                parsed_as_list = list(parsed_url)
                parsed_as_list[5] = ''
                self.canonical_url = urlunparse(
                    urlnorm.norm_tuple(*parsed_as_list))
            except Exception as e:
                logging.info(u"skipping malformed url {0}. Error: {1}".format(
                    response.url, str(e)))
                return False

            if response.encoding != FAIL_ENCODING:
                html = response.text
            else:
                html = response.content
            if not html:
                return False

            converted = UnicodeDammit(html, is_html=True)
            if not converted.unicode_markup:
                logging.warn(
                    "Failed to detect encoding of downloaded article, tried: "
                    + ", ".join(converted.tried_encodings))
                return False
            self.html = converted.unicode_markup
            self.is_downloaded = True
        except eventlet.Timeout:
            # handle the green-thread timeout before the generic handler so it
            # is never swallowed by `except Exception`
            logging.warn('Timeout on %s' % (self.url))
            return False
        except Exception as e:
            logging.warn('%s on %s' % (e, self.url))
            return False
        return True
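
The 15-second cap around requests.get relies on eventlet's green-thread timeout; a minimal sketch of that pattern in isolation (whether the project calls eventlet.monkey_patch() is an assumption, but without it the timeout cannot interrupt blocking socket calls):

import eventlet
eventlet.monkey_patch()          # assumed: patches socket so requests cooperates with green timeouts
import requests


def fetch_with_hard_cap(url, seconds=15):
    try:
        with eventlet.Timeout(seconds):                    # raises eventlet.Timeout after `seconds`
            return requests.get(url, timeout=seconds)      # requests' own timeout as a second line of defence
    except eventlet.Timeout:
        return None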
Example #6
    def next(self):
        '''
        (Crawler) -> newspaper.Article
        returns the next article in the sequence
        '''

        #standard non-recursive tree iteration
        with open('../ignore_filter/' + self.site.name + '_ignore_file.txt', 'a') as ignore_filter_file:
            try:
                current_level = 0
                while(True):
                    if (self.limit > 0 and self.visited_count > self.limit):
                        raise StopIteration('Limit reached: {:d}'.format(self.limit))
                    # if(self.pages_visited > self.probabilistic_n):
                    #     raise StopIteration
                    # self.cursor.execute("SELECT * FROM " + self.tovisit_table + " ORDER BY id LIMIT 1")
                    # row = self.cursor.fetchone()
                    # if(row):
                    #     row_id = row[0]
                    #     current_url = row[1]
                    #     self.cursor.execute("DELETE FROM " + self.tovisit_table + " WHERE id=%s", (row_id,))
                    # else:
                    #     raise StopIteration

                    # if(self._should_skip()):
                    #     logging.info(u"skipping {0} randomly".format(current_url))
                    #     continue
                    try:
                        if (self.site.is_shallow):
                            current = self.to_visit.get_nowait()
                            current_url = current[0]
                            current_level = current[1]
                            logging.info(u"Shallow on level {0} {1}".format(current_level, current_url))
                        else:
                            current_url = self.to_visit.get_nowait()
                    except Empty:
                        self.site.is_shallow = True # On line 26 the site gets set TO DELETE
                        self.to_visit.put((self.site.url, str(0)))
                        self.ignore_filter = ScalableBloomFilter(
                            initial_capacity=10000000, error_rate=0.00001)
                        ignore_filter_file.close()
                        os.remove('../ignore_filter/' + self.site.name + '_ignore_file.txt')
                        logging.info("stopped iteration")
                        logging.info(u"{0}".format(self.site.url))
                        raise ZeroDivisionError


                    logging.info(u"visiting {0}".format(current_url))
                    self.visited_count += 1
                    #use newspaper to download and parse the article
                    article = ExplorerArticle(current_url)
                    article.download()
                    if (self.site.is_shallow):
                        if (int(current_level) > self.level):
                            continue
                    # get urls from the article
                    for link in article.get_links():
                        url = urljoin(current_url, link.href, False)
                        if self.url_in_filter(url, self.filters):
                            logging.info(u"skipping url \"{0}\" because it matches filter".format(url))
                            continue
                        try:
                            parsed_url = urlparse(url)
                            parsed_as_list = list(parsed_url)

                            if(parsed_url.scheme != u"http" and parsed_url.scheme != u"https"):
                                logging.info(u"skipping url with invalid scheme: {0}".format(url))
                                continue
                            parsed_as_list[5] = ''
                            url = urlunparse(urlnorm.norm_tuple(*parsed_as_list))
                        except Exception as e:
                            logging.info(u"skipping malformed url {0}. Error: {1}".format(url, str(e)))
                            continue
                        if(not parsed_url.netloc.endswith(self.domain)):
                            continue
                        # If the url has already been added to the ignore list, skip it
                        if (url in self.ignore_filter):
                            continue
                        # Ignores the subscribe links for many domains
                        if (u"subscribe" in url or "subscribe" in url and not(u"-subscribe" in url or "-subscribe" or u"subscribe-" in url or "subscribe-")):
                        	continue

                        # Append the url to to_visit queue
                        if (self.site.is_shallow):
                            self.to_visit.put((url, str(int(current_level) + 1)))
                            logging.info(u"added {0} to the to_visit as well as the level {1}".format(url, str(int(current_level) + 1)))

                            # Append the url to visited to remove duplicates
                            self.ignore_filter.add(url)
                            ignore_filter_file.write(url.encode('utf8') + "\n")
                        else:
                            self.to_visit.put(url)
                            logging.info(u"added {0} to the to_visit".format(url))

                            # Append the url to visited to remove duplicates
                            self.ignore_filter.add(url)
                            ignore_filter_file.write(url.encode('utf8') + "\n")

                    # Update the Queue
                    self.to_visit.task_done()


                    return article


            except StopIteration as e:
                raise e
            except ValueError:
                raise
            except Exception as e:
                raise e
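
The ignore filter in the two crawler variants above is a scalable Bloom filter persisted one URL per line; a minimal sketch of that dedup pattern (the pybloom_live import is an assumption, the original pybloom exposes the same class):

from pybloom_live import ScalableBloomFilter   # assumption: the project may import it from pybloom instead

seen = ScalableBloomFilter(initial_capacity=10000000, error_rate=0.00001)

with open('example_ignore_file.txt', 'a+') as f:
    # rebuild the filter from previously persisted URLs
    f.seek(0)
    for line in f:
        seen.add(line.strip())

    for url in ['http://example.com/a', 'http://example.com/b', 'http://example.com/a']:
        if url in seen:            # membership test; false positives are possible but rare
            continue
        seen.add(url)
        f.write(url + '\n')        # persist so the filter can be rebuilt after a restart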