Example #1
    def getImage(self, imageUrl, referrer, fidx):

        content, handle = self.wg.getpage(imageUrl,
                                          returnMultiple=True,
                                          addlHeaders={'Referer': referrer})
        if not content or not handle:
            raise ValueError("Failed to retreive image from page '%s'!" %
                             referrer)

        fileN = urllib.parse.unquote(
            urllib.parse.urlparse(handle.geturl())[2].split("/")[-1])
        fileN = bs4.UnicodeDammit(fileN).unicode_markup

        mtype = magic.from_buffer(content, mime=True)
        fext = mimetypes.guess_extension(mtype)

        # Assume jpeg if we can't figure it out, because it's probably safe.
        if not fext:
            fext = ".jpg"

        filename = "{counter} - {orig} {ext}".format(
            orig=fileN,
            counter=str(fidx).zfill(4),
            ext=fext,
        )

        self.log.info("retreived image '%s' with a size of %0.3f K", filename,
                      len(content) / 1000.0)
        return filename, content
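
A minimal standalone sketch of the same pattern: take the final URL and the raw bytes, derive a readable name from the last path segment, and let libmagic pick the extension when the URL has none. The helper name is made up for illustration; it assumes the python-magic package and keeps the same jpeg fallback and zero-padded counter as the example above.

import mimetypes
import urllib.parse

import magic


def name_for_download(final_url, content, index):
    # Last path segment of the final URL, percent-decoded, as the readable part.
    base = urllib.parse.unquote(urllib.parse.urlsplit(final_url).path.split("/")[-1])

    # Sniff the mime type from the raw bytes and map it to an extension.
    mime = magic.from_buffer(content, mime=True)
    ext = mimetypes.guess_extension(mime) or ".jpg"  # same jpeg fallback as above

    return "{counter} - {orig}{ext}".format(counter=str(index).zfill(4), orig=base, ext=ext)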
Example #2
    def smartGetItem(self, itemUrl: str, *args, **kwargs):

        lowerspliturl = urllib.parse.urlsplit(itemUrl.lower())
        for processor in PREEMPTIVE_PROCESSORS:
            if processor.preemptive_wants_url(lowerspliturl=lowerspliturl):
                self.log.info(
                    "Preemptive fetch handler %s wants to modify content",
                    processor)
                return processor.premptive_handle(url=itemUrl, wg=self.wg)

        content, fileN, mType = self.wg.getItem(itemUrl=itemUrl,
                                                *args,
                                                **kwargs)

        # Decode text-type items
        if mType.startswith('text'):
            if isinstance(content, bytes):
                content = bs4.UnicodeDammit(content).unicode_markup

        processed = False
        for processor in PROCESSORS:
            if processor.wants_url(lowerspliturl=lowerspliturl,
                                   mimetype=mType):
                processed = True
                content = processor.preprocess(url=itemUrl,
                                               lowerspliturl=lowerspliturl,
                                               mimeType=mType,
                                               content=content,
                                               wg=self.wg)

        if processed:
            self.log.info("All preprocessors completed!")
        return content, fileN, mType
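
The processor hook-up above is a small duck-typed dispatch: anything exposing wants_url() and preprocess() can sit in the PROCESSORS list and rewrite the content in turn. A toy sketch of that shape, with a made-up ExampleTagger processor (the real processors also receive the project's wg fetcher, which is omitted here):

import urllib.parse


class ExampleTagger:
    # Hypothetical processor: claims HTML pages on example.com and tags the title.
    def wants_url(self, lowerspliturl, mimetype):
        return mimetype == "text/html" and lowerspliturl.netloc.endswith("example.com")

    def preprocess(self, url, lowerspliturl, mimeType, content):
        return content.replace("<title>", "<title>[archived] ", 1)


PROCESSORS = [ExampleTagger()]


def run_processors(url, mimetype, content):
    lowerspliturl = urllib.parse.urlsplit(url.lower())
    for processor in PROCESSORS:
        if processor.wants_url(lowerspliturl=lowerspliturl, mimetype=mimetype):
            content = processor.preprocess(url=url,
                                           lowerspliturl=lowerspliturl,
                                           mimeType=mimetype,
                                           content=content)
    return content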
Example #3
    def getItem(self, itemUrl: str):
        content, handle = self.getpage(itemUrl, returnMultiple=True)

        if not content or not handle:
            raise urllib.error.URLError(
                "Failed to retreive file from page '%s'!" % itemUrl)

        handle_info = handle.info()

        if handle_info['Content-Disposition'] and 'filename=' in handle_info[
                'Content-Disposition'].lower():
            fileN = handle_info['Content-Disposition'].split("=", 1)[-1]
        else:
            fileN = urllib.parse.unquote(
                urllib.parse.urlparse(handle.geturl())[2].split("/")[-1])
            fileN = bs4.UnicodeDammit(fileN).unicode_markup
        mType = handle_info['Content-Type']

        # If there is an encoding in the content-type (or any other info), strip it out.
        # We don't care about the encoding, since WebFunctions will already have handled that,
        # and returned a decoded unicode object.
        if mType and ";" in mType:
            mType = mType.split(";")[0].strip()

        # minus.com urlencodes the mime type in its Content-Type header
        # (e.g. 'image%2Fjpeg'). Fix that.
        if mType and '%2F' in mType:
            mType = mType.replace('%2F', '/')

        self.log.info(
            "Retreived file of type '%s', name of '%s' with a size of %0.3f K",
            mType, fileN,
            len(content) / 1000.0)
        return content, fileN, mType
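
The header handling above (filename from Content-Disposition, fallback to the URL path, Content-Type stripped of its parameters) can also be done with the standard library's header parser. A small sketch under the assumption that the response headers are available as a plain dict; the helper name is made up:

import urllib.parse
from email.message import Message


def name_and_type(final_url, headers):
    msg = Message()
    for key, value in headers.items():
        msg[key] = value

    # Prefer the filename advertised in Content-Disposition, if any.
    filename = msg.get_filename()
    if not filename:
        filename = urllib.parse.unquote(
            urllib.parse.urlsplit(final_url).path.split("/")[-1])

    # get_content_type() already drops ";charset=..." and other parameters.
    return filename, msg.get_content_type()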
Example #4
def getItemChromium(wg, log, itemUrl):
	log.info("Fetching page for URL: '%s' with Chromium" % itemUrl)

	with wg._chrome_context('http://www.google.com', extra_tid=None) as cr:

		wg._syncIntoChromium(cr)

		response = cr.blocking_navigate_and_get_source(itemUrl, timeout=wg.navigate_timeout_secs)

		raw_url = cr.get_current_url()
		fileN = urllib.parse.unquote(urllib.parse.urlparse(raw_url)[2].split("/")[-1])
		fileN = bs4.UnicodeDammit(fileN).unicode_markup

		wg._syncOutOfChromium(cr)

	# Probably a bad assumption
	if response['binary']:
		mType = "application/x-binary"
	else:
		mType = "text/html"

	# Use the new interface that returns the actual type
	if 'mimetype' in response:
		mType = response['mimetype']

	# So, wg._cr.page_source appears to be the *compressed* page source as-rendered. Because reasons.
	content = response['content']

	return content, fileN, mType
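
The mimetype fallback in this example can be pulled out into a helper. This is only a sketch: it assumes the same response dict shape ('binary', 'content', optional 'mimetype') and uses python-magic to sniff binary payloads instead of hard-coding application/x-binary.

import magic


def mimetype_for_response(response):
    # Newer interface: the fetch result may already carry the real type.
    if 'mimetype' in response:
        return response['mimetype']
    # Otherwise sniff binary payloads, and keep text/html as the default for text.
    if response.get('binary'):
        return magic.from_buffer(response['content'], mime=True)
    return "text/html"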
Example #5
	def getImage(self, imageUrl, referrer):

		content, handle = self.wg.getpage(imageUrl, returnMultiple=True, addlHeaders={'Referer': referrer})
		if not content or not handle:
			raise ValueError("Failed to retreive image from page '%s'!" % referrer)

		fileN = urllib.parse.unquote(urllib.parse.urlparse(handle.geturl())[2].split("/")[-1])
		fileN = bs4.UnicodeDammit(fileN).unicode_markup
		self.log.info("retreived image '%s' with a size of %0.3f K", fileN, len(content)/1000.0)

		if not "." in fileN:
			info = handle.info()
			if 'Content-Type' in info:
				tp = info['Content-Type']
				if ";" in tp:
					tp = tp.split(";")[0]
				ext = guess_extension(tp)
				if ext is None:
					ext = "unknown_ftype"
				print(info['Content-Type'], ext)
				fileN += "." + ext
			else:
				fileN += ".jpg"

		# Let magic figure out the file type for us (it's probably smarter than kissmanga, anyways.)
		guessed = magic.from_buffer(content, mime=True)
		ext = guess_extension(guessed)
		# Avoid doubling an extension that was already appended above.
		if ext and not fileN.endswith(ext):
			fileN = fileN + ext

		return fileN, content
Example #6
    def getItemChromium(self, itemUrl):
        self.log.info("Fetching page for URL: '%s' with Chromium" % itemUrl)

        with ChromeController.ChromeContext(self._cr_binary) as cr:

            self._syncIntoChromium(cr)

            response = cr.blocking_navigate_and_get_source(itemUrl, timeout=10)

            raw_url = cr.get_current_url()
            fileN = urllib.parse.unquote(
                urllib.parse.urlparse(raw_url)[2].split("/")[-1])
            fileN = bs4.UnicodeDammit(fileN).unicode_markup

            self._syncOutOfChromium(cr)

        # Probably a bad assumption
        if response['binary']:
            mType = "application/x-binary"
        else:
            mType = "text/html"

        # So, self._cr.page_source appears to be the *compressed* page source as-rendered. Because reasons.
        content = response['content']
        return content, fileN, mType
Example #7
    def getItem(self, itemUrl, addlHeaders=None):

        content, handle = self.wg.getpage(
            itemUrl,
            returnMultiple=True,
            addlHeaders={'Referer': self.refererUrl})
        if not content or not handle:
            raise ValueError("Failed to retreive file from page '%s'!" %
                             itemUrl)

        info = handle.info()
        if 'Content-Disposition' not in info:
            info['Content-Disposition'] = ''

        fileN = jsLiteralParse.parseContentDispositon(
            info['Content-Disposition'], itemUrl)
        fileN = bs4.UnicodeDammit(fileN).unicode_markup

        mType = handle.info()['Content-Type']

        # If there is an encoding in the content-type (or any other info), strip it out.
        # We don't care about the encoding, since WebRequest will already have handled that,
        # and returned a decoded unicode object.

        if mType and ";" in mType:
            mType = mType.split(";")[0].strip()

        self.log.info(
            "Retreived file of type '%s', name of '%s' with a size of %0.3f K",
            mType, fileN,
            len(content) / 1000.0)
        return content, fileN, mType
Example #8
    def getItemSeleniumChromium(self, itemUrl):
        self.log.info("Fetching page for URL: '%s' with SeleniumChromium" %
                      itemUrl)

        if not self.selenium_chromium_driver:
            self._initSeleniumChromiumWebDriver()
        self._syncIntoSeleniumChromiumWebDriver()

        with SeleniumCommon.load_delay_context_manager(
                self.selenium_chromium_driver):
            self.selenium_chromium_driver.get(itemUrl)
        time.sleep(3)

        fileN = urllib.parse.unquote(
            urllib.parse.urlparse(
                self.selenium_chromium_driver.current_url)[2].split("/")[-1])
        fileN = bs4.UnicodeDammit(fileN).unicode_markup

        self._syncOutOfSeleniumChromiumWebDriver()

        # Probably a bad assumption
        mType = "text/html"

        # So, self.selenium_chromium_driver.page_source appears to be the *compressed* page source as-rendered. Because reasons.
        source = self.selenium_chromium_driver.execute_script(
            "return document.getElementsByTagName('html')[0].innerHTML")

        source = "<html>" + source + "</html>"
        return source, fileN, mType
Example #9
	def getItem(self, itemUrl):

		try:
			content, handle = self.getpage(itemUrl, returnMultiple=True)
		except:
			print("Failure?")
			if self.rules['cloudflare']:
				if not self.stepThroughCloudFlare(itemUrl, titleNotContains='Just a moment...'):
					raise Exceptions.FetchFailureError("Could not step through cloudflare!")
				# Cloudflare cookie set, retrieve again
				content, handle = self.getpage(itemUrl, returnMultiple=True)
			else:
				raise

		if not content or not handle:
			raise urllib.error.URLError("Failed to retreive file from page '%s'!" % itemUrl)

		fileN = urllib.parse.unquote(urllib.parse.urlparse(handle.geturl())[2].split("/")[-1])
		fileN = bs4.UnicodeDammit(fileN).unicode_markup
		mType = handle.info()['Content-Type']

		# If there is an encoding in the content-type (or any other info), strip it out.
		# We don't care about the encoding, since WebRequest will already have handled that,
		# and returned a decoded unicode object.
		if mType and ";" in mType:
			mType = mType.split(";")[0].strip()

		# minus.com urlencodes the mime type in its Content-Type header
		# (e.g. 'image%2Fjpeg'). Fix that.
		if mType and '%2F' in mType:
			mType = mType.replace('%2F', '/')

		self.log.info("Retreived file of type '%s', name of '%s' with a size of %0.3f K", mType, fileN, len(content)/1000.0)
		return content, fileN, mType
Example #10
 def to_unicode_or_bust(self, obj, encoding='utf-8'):
     # Python 2 helper: coerce str/unicode to unicode, falling back to UnicodeDammit.
     try:
         if isinstance(obj, basestring):
             if not isinstance(obj, unicode):
                 obj = unicode(obj, encoding)
         return obj
     except:
         return bs4.UnicodeDammit(obj, is_html=False).unicode_markup
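
On Python 3 the same helper collapses to a bytes check, since str is already unicode. A minimal sketch, assuming UnicodeDammit is still wanted as the fallback when a plain decode fails:

import bs4


def to_unicode(obj, encoding='utf-8'):
    if isinstance(obj, bytes):
        try:
            return obj.decode(encoding)
        except UnicodeDecodeError:
            return bs4.UnicodeDammit(obj, is_html=False).unicode_markup
    return obj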
Example #11
    def __plain_local_fetch(self, itemUrl):
        error = None
        try:
            itemUrl = itemUrl.strip()
            itemUrl = itemUrl.replace(" ", "%20")
            content, handle = self.wg_proxy().getpage(itemUrl,
                                                      returnMultiple=True)
        except WebRequest.FetchFailureError:
            self.log.error("Failed to fetch page!")
            for line in traceback.format_exc().split("\n"):
                self.log.error(line)

            error = traceback.format_exc()
            content, handle = None, None
        except:
            print("Failure?")
            if self.rules['cloudflare']:
                if not self.wg_proxy().stepThroughCloudFlare(
                        itemUrl, titleNotContains='Just a moment...'):
                    raise ValueError("Could not step through cloudflare!")
                # Cloudflare cookie set, retrieve again
                content, handle = self.wg_proxy().getpage(itemUrl,
                                                          returnMultiple=True)
            else:
                raise

        if not content or not handle:
            if error:
                raise DownloadException(
                    "Failed to retreive file from page '%s'!\n\nFetch traceback:\n%s\n\nEnd fetch traceback."
                    % (itemUrl, error))
            raise DownloadException("Failed to retreive file from page '%s'!" %
                                    itemUrl)

        fileN = urllib.parse.unquote(
            urllib.parse.urlparse(handle.geturl())[2].split("/")[-1])
        fileN = bs4.UnicodeDammit(fileN).unicode_markup
        mType = handle.info()['Content-Type']

        # If there is an encoding in the content-type (or any other info), strip it out.
        # We don't care about the encoding, since WebRequest will already have handled that,
        # and returned a decoded unicode object.
        if mType and ";" in mType:
            mType = mType.split(";")[0].strip()

        # minus.com urlencodes the mime type in its Content-Type header
        # (e.g. 'image%2Fjpeg'). Fix that.
        if mType and '%2F' in mType:
            mType = mType.replace('%2F', '/')

        self.wg_proxy().cj.save()

        self.log.info(
            "Retreived file of type '%s', name of '%s' with a size of %0.3f K",
            mType, fileN,
            len(content) / 1000.0)
        return content, fileN, mType
Example #12
    def decode_html(html_string, encoding=None, default_encoding='iso-8859-1'):
        """Decodes a html string into a unicode string.
        If explicit encoding is defined then
        it would use it otherwise it will decoding it using
        beautiful soups UnicodeDammit feature,
        otherwise it will use w3lib to decode the html.

        Returns a two tuple with (<encoding>, <decoded unicode string>)

        :rtype: (str, str)
        :returns: (used-encoding, unicode-markup)
        """

        tried = [encoding, default_encoding]

        try:
            logger.info("Trying UnicodeDammit Codec for decoding html.")
            try:
                import bs4
            except ImportError:
                raise ImportError(
                    "bs4 module is not installed. "
                    "Install it using pip: $ pip install bs4"
                )
            converted = bs4.UnicodeDammit(html_string, [encoding], is_html=True)

            if not converted.unicode_markup:
                tried += converted.tried_encodings
                logger.critical(
                    "UnicodeDammit decoder failed to decode html! "
                    "Encodings tried: [%s]. "
                    "Trying fallback..." % ', '.join(str(t) for t in tried)
                )
                # UnicodeDecodeError needs its five constructor arguments.
                raise UnicodeDecodeError(
                    encoding or default_encoding,
                    html_string if isinstance(html_string, bytes) else html_string.encode(),
                    0, len(html_string),
                    "UnicodeDammit failed to decode html")

            return converted.original_encoding, converted.unicode_markup

        except (UnicodeDecodeError, ImportError):
            # This method will always produce a decoded string, though the
            # result could be corrupt. If you get corrupt html output, you
            # have to provide the encoding manually.
            try:
                import w3lib
                from w3lib.encoding import html_to_unicode
            except ImportError:
                raise ImportError(
                    "w3lib module is not installed. "
                    "Install it using pip: $ pip install w3lib"
                )

            enc, unc = w3lib.encoding.html_to_unicode(
                None, html_body_str=html_string,
                default_encoding=default_encoding
            )
            return enc, unc
Example #13
	def preprocess_content(self, url, lowerspliturl, mimetype, contentstr):
		if isinstance(contentstr, bytes):
			contentstr = bs4.UnicodeDammit(contentstr).unicode_markup

		if search_regex.search(contentstr):
			contentstr = self.fix_shortened(contentstr)

		return contentstr
Example #14
	def getImage(self, imageUrl, referrer):

		content, handle = self.wg.getpage(imageUrl, returnMultiple=True, addlHeaders={'Referer': referrer})
		if not content or not handle:
			raise ValueError("Failed to retreive image from page '%s'!" % referrer)

		fileN = urllib.parse.unquote(urllib.parse.urlparse(handle.geturl())[2].split("/")[-1])
		fileN = bs4.UnicodeDammit(fileN).unicode_markup
		self.log.info("retreived image '%s' with a size of %0.3f K", fileN, len(content)/1000.0)
		return fileN, content
Example #15
    def preprocess_content(self, url, lowerspliturl, mimetype, contentstr):
        if isinstance(contentstr, bytes):
            contentstr = bs4.UnicodeDammit(contentstr).unicode_markup

        if '<title>reddit.com: over 18?</title>' in contentstr:
            self.log.info("Adult clickwrap page. Stepping through")
            url = urllib.parse.urlunsplit(lowerspliturl)
            contentstr = self._acceptAdult(contentstr, url)
            self.log.info("Retrieved clickwrapped content successfully")

        return contentstr
Example #16
	def __decodeTextContent(self, pgctnt, cType):

		if cType:
			if (";" in cType) and ("=" in cType):
				# the server is reporting an encoding. Now we use it to decode the content
				# Some weirdos put two charsets in their headers:
				# `text/html;Charset=UTF-8;charset=UTF-8`
				# Split, and take the first two entries.
				docType, charset = cType.split(";")[:2]
				charset = charset.split("=")[-1]

				# Only decode content marked as text (yeah, google is serving zip files
				# with the content-disposition charset header specifying "UTF-8") or
				# specifically allowed other content types I know are really text.
				decode = ['application/atom+xml', 'application/xml', "application/json", 'text']
				if any([item in docType for item in decode]):
					try:
						pgctnt = str(pgctnt, charset)
					except UnicodeDecodeError:
						self.log.error("Encoding Error! Stripping invalid chars.")
						pgctnt = pgctnt.decode('utf-8', errors='ignore')

			else:
				# The server is not reporting an encoding in the headers.
				# Use content-aware mechanisms for determining the content encoding.


				if "text/html" in cType or \
					'text/javascript' in cType or    \
					'text/css' in cType or    \
					'application/xml' in cType or    \
					'application/atom+xml' in cType:				# If this is a html/text page, we want to decode it using the local encoding

					pgctnt = self.__decodeHtml(pgctnt, cType)

				elif "text/plain" in cType or "text/xml" in cType:
					pgctnt = bs4.UnicodeDammit(pgctnt).unicode_markup

				# Assume JSON is utf-8. Probably a bad idea?
				elif "application/json" in cType:
					pgctnt = pgctnt.decode('utf-8')

				elif "text" in cType:
					self.log.critical("Unknown content type!")
					self.log.critical(cType)

		else:
			self.log.critical("No content disposition header!")
			self.log.critical("Cannot guess content type!")

		return pgctnt
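
The charset splitting done above by hand can also go through the standard library's header parser, which copes with quoting and with parameters in any order. A minimal sketch, assuming the raw Content-Type header string is available:

from email.message import Message


def split_content_type(ctype):
    # e.g. 'text/html;Charset=UTF-8;charset=UTF-8' -> ('text/html', 'utf-8')
    msg = Message()
    msg['Content-Type'] = ctype
    return msg.get_content_type(), msg.get_content_charset()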
Example #17
    def cr_fetch(self, itemUrl):
        wg = self.wg_proxy()
        self.log.info("Synchronous rendered chromium fetch!")
        content = None
        with wg.chromiumContext(itemUrl) as cr:
            try:
                content = cr.blocking_navigate_and_get_source(itemUrl)
                if content:
                    if content['binary'] is False:
                        # If the content isn't binary, retrieve the rendered version.
                        content = cr.get_rendered_page_source()
                    else:
                        self.log.error("Binary content!")

            except Exception:
                pass

            if not content:
                for x in range(99):
                    try:
                        content = cr.get_rendered_page_source()
                        # Got the source; stop retrying.
                        break
                    except Exception as e:
                        self.log.error(
                            "Failure extracting source (%s)! Retrying %s..." %
                            (e, x))
                        if x > 3:
                            raise

            if itemUrl.endswith("/feed/"):
                mType = "application/rss+xml"
            else:
                mType = 'text/html'

            raw_url = cr.get_current_url()
            fileN = urllib.parse.unquote(
                urllib.parse.urlparse(raw_url)[2].split("/")[-1])
            fileN = bs4.UnicodeDammit(fileN).unicode_markup

            title, cur_url = cr.get_page_url_title()

        if "debug" in sys.argv:
            self.log.info("Title: %s", title)
            self.log.info("Mime: %s", mType)
            self.log.info("Fname: %s", fileN)
            self.log.info("Content: ")
            self.log.info("%s", content)

        return content, fileN, mType
Example #18
    def preprocess_content(self, url, lowerspliturl, mimetype, contentstr):
        if isinstance(contentstr, bytes):
            contentstr = bs4.UnicodeDammit(contentstr).unicode_markup

        url_segs = url.split("/")
        sid = None
        try:
            sid = int(url_segs[4])
        except ValueError:
            pass

        if len(url_segs) >= 5 and url_segs[3] == 'series' and sid:
            self.log.info("Chapter list page. Inserting ToC")
            contentstr = self._render_sh_toc(url, sid, contentstr)

        return contentstr
Example #19
    def extract(self):
        try:
            arch, fName = self.wg.getFileAndName(
                self.url, addlHeaders={'Referer': self.refererUrl})
        except IndexError:
            print("ERROR: Failure retrieving page!")
            return None, []

        baseName = fName.split(".")[0]

        if not isinstance(arch, bytes):
            if 'You need permission' in arch or 'Sign in to continue to Docs' in arch:
                self.log.critical("Retrieving zip archive failed?")
                self.log.critical("Retrieved content type: '%s'", type(arch))
                raise TypeError("Cannot access document? Is it protected?")
            else:
                with open("tmp_page.html", "w") as fp:
                    fp.write(arch)
                raise ValueError("Doc not valid?")

        zp = io.BytesIO(arch)
        zfp = zipfile.ZipFile(zp)

        resources = []
        baseFile = None

        for item in zfp.infolist():
            if not "/" in item.filename and not baseFile:
                contents = zfp.open(item).read()
                contents = bs4.UnicodeDammit(contents).unicode_markup

                baseFile = (item.filename, contents)

            elif baseName in item.filename and baseName:
                raise ValueError("Multiple base file items?")

            else:
                resources.append(
                    (item.filename, mimetypes.guess_type(item.filename)[0],
                     zfp.open(item).read()))

        if not baseFile:
            raise ValueError("No base file found!")

        return baseFile, resources
Example #20
    def preprocessContent(self, url, mimetype, contentstr):
        if mimetype != 'text/html':
            return contentstr

        if isinstance(contentstr, bytes):
            contentstr = bs4.UnicodeDammit(contentstr).unicode_markup

        soup = WebRequest.as_soup(contentstr)

        for bogus in soup.find_all(
                "a",
                href='https://www.asianhobbyist.com/android-mobile-app-live/'):
            bogus.decompose()

        # There should be some content. If the page is completely empty of text, it was probably an error.
        assert len(soup.get_text(strip=True)) > 50

        return soup.prettify()
Example #21
 def make_soup(self, content, url=None, from_encoding=None):
     try:
         # Try and unicode it first.
         converted = bs4.UnicodeDammit(content)
         if converted.unicode_markup:
             content = converted.unicode_markup
         else:
             self.log.warning('Could not convert to unicode: %s', url)
         soup = bs4.BeautifulSoup(content, 'lxml', from_encoding=from_encoding)
         setattr(soup, '_url', url)  # Fudge for debug
     except Exception as error:
         self.submit_parse_error("Parse exception", exception=str(error))
         wrapped = ScraperParseException(
             "Content parse error",
             error=error,
             scraper_name=self.__class__.__name__)
         wrapped.logged = True
         raise
     return soup
Example #22
	def getItemPhantomJS(self, itemUrl):
		self.log.info("Fetching page for URL: '%s' with PhantomJS" % itemUrl)

		with self.pjs_context():

			with load_delay_context_manager(self.pjs_driver):
				self.pjs_driver.get(itemUrl)
			time.sleep(3)

			fileN = urllib.parse.unquote(urllib.parse.urlparse(self.pjs_driver.current_url)[2].split("/")[-1])
			fileN = bs4.UnicodeDammit(fileN).unicode_markup


			# Probably a bad assumption
			mType = "text/html"

			# So, self.pjs_driver.page_source appears to be the *compressed* page source as-rendered. Because reasons.
			source = self.pjs_driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")

		assert source != '<head></head><body></body>'

		source = "<html>"+source+"</html>"
		return source, fileN, mType
Example #23
def trydecodingbytes(bs):
    try:
        import bs4
        dammit = bs4.UnicodeDammit(bs)
        unicode_markup, original_encoding, tried_encodings = dammit.unicode_markup, dammit.original_encoding, dammit.tried_encodings
    except ImportError:
        unicode_markup, original_encoding, tried_encodings = None, None, None
    if unicode_markup:
        return unicode_markup, original_encoding
    try:
        import chardet
        encoding = chardet.detect(bs)['encoding']
    except ImportError:
        encoding = None
    if not encoding and tried_encodings and tried_encodings[
            0] and tried_encodings[0][0]:
        encoding = tried_encodings[0][0]
    if encoding:
        try:
            s = bs.decode(encoding=encoding)
        except Exception:
            s = str(bs)
            encoding = None
    else:
        # No detector produced an encoding; fall back to repr-style decoding.
        s = str(bs)
    return s, encoding
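
A quick usage check for the helper above; the reported encoding depends on which of bs4 and chardet are importable in the environment:

text, enc = trydecodingbytes("héllo wörld".encode("latin-1"))
print(repr(text), enc)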
Example #24
    def premptive_handle_content(self, url):
        '''
        Vue.js renders this page entirely client-side, so the fetched HTML is
        just a template; wait for the rendered content instead.
        '''

        wrapper_step_through_timeout = 60
        loading_str = "{{"

        with self.wg._chrome_context(url, extra_tid=False) as cr:
            self.wg._syncIntoChromium(cr)
            try:

                response = cr.blocking_navigate_and_get_source(url)

                raw_url = cr.get_current_url()
                fileN = urllib.parse.unquote(
                    urllib.parse.urlparse(raw_url)[2].split("/")[-1])
                fileN = bs4.UnicodeDammit(fileN).unicode_markup

                # Short circuit for the binary content case.
                if response['binary']:
                    return response['content'], fileN, magic.from_buffer(
                        response['content'], mime=True)

                self.log.info("Waiting for content to render...")

                for _ in range(wrapper_step_through_timeout):
                    body = cr.get_rendered_page_source()
                    if loading_str not in body:
                        self.log.info("Content appears to have rendered!")
                        return self.de_garbage_html(body), fileN, "text/html"
                    time.sleep(1)

            finally:
                self.wg._syncOutOfChromium(cr)

        raise WebRequest.GarbageSiteWrapper("Could not render JS content!")
Example #25
    def getItem(self, itemUrl):

        content, handle = self.wg.getpage(itemUrl, returnMultiple=True)
        if not content or not handle:
            raise ValueError("Failed to retreive file from page '%s'!" %
                             itemUrl)

        fileN = urllib.parse.unquote(
            urllib.parse.urlparse(handle.geturl())[2].split("/")[-1])
        fileN = bs4.UnicodeDammit(fileN).unicode_markup
        mType = handle.info()['Content-Type']

        # If there is an encoding in the content-type (or any other info), strip it out.
        # We don't care about the encoding, since WebFunctions will already have handled that,
        # and returned a decoded unicode object.

        if mType and ";" in mType:
            mType = mType.split(";")[0].strip()

        self.log.info(
            "Retreived file of type '%s', name of '%s' with a size of %0.3f K",
            mType, fileN,
            len(content) / 1000.0)
        return content, fileN, mType
Example #26
import os
import re

import urllib3
import bs4


# The program runs as one big block: first it crawls the newspaper site into memory (as a dict),
# then creates the corresponding directories of the form (script directory)/year/month/...,
# saves the raw, unannotated files into them,
# and for each file runs mystem (the path to the executable must be set) twice - once with xml output and once with txt.
# A csv can be produced by calling makecsv() - after the whole script has run at least once and the dict is still in memory.




http = urllib3.PoolManager()

gazeta = http.request('GET', 'http://www.zeml-trub.ru/index.php')
pagedata = bs4.BeautifulSoup(bs4.UnicodeDammit(gazeta.data, is_html = True).unicode_markup, 'lxml')

timedict = {}
datelist = []
#rgxp = "[0-9]+\-[0-9]+\-[0-9]+"
#rgxp = '2'
counter = 0

# Determine the number of article pages to crawl

Nstranic = re.findall('[0-9]+ ', pagedata.find(string = 'дальше').parent.parent.get_text())[-1].strip()

for stranica in range(1, int(Nstranic)):
    for i in pagedata.find_all('div'):
        
        try:
Example #27
    def generate_diffs(
        self,
        paper_src_dir: str,
        dpi: int = settings.DEFAULT_INFERENCE_DPI
    ) -> (Optional[List[str]], Optional[List[str]]):
        """
        Given the directory of a latex source file, create a modified copy of the source that includes colored boxes
        surrounding each figure and table.
        """
        paper_tex = glob.glob(paper_src_dir + '/' + '*.tex')
        if len(paper_tex) > 1:
            logging.warning('Multiple .tex files found')
            return None
        elif len(paper_tex) < 1:
            logging.warning('No .tex files found')
            return None
        texfile = paper_tex[0]
        chunk_dir, paper_id = os.path.split(paper_src_dir)
        chunk_id = os.path.basename(chunk_dir)

        # Modify latex source
        with open(texfile, 'rb') as f:
            # Some files may cause a UnicodeDecodeError if read directly as text
            # so use bs4 to fix them up
            text = bs4.UnicodeDammit(f.read()).unicode_markup
        paper_modified_src_dir = self.ARXIV_MODIFIED_SRC_DIR + chunk_id + '/' + paper_id
        if not os.path.isdir(paper_modified_src_dir):
            os.makedirs(paper_modified_src_dir)
        color_filename = paper_modified_src_dir + '/color.tex'
        black_filename = paper_modified_src_dir + '/black.tex'
        text = self.make_12_pt(text)
        _IMPORT_STR = IMPORT_STR
        if self.augment_typewriter_font:
            _IMPORT_STR = _IMPORT_STR + TYPE_WRITER_FONT
        if self.augment_line_spacing_1_5:
            _IMPORT_STR = _IMPORT_STR + LINE_SPREAD_1_5
        with open(color_filename, 'w') as f:
            COLOR_STR = (_IMPORT_STR %
                         ('red', 'yellow', 'green', 'blue')) + BEGIN_DOC
            print(text.replace(BEGIN_DOC, COLOR_STR), file=f)
        with open(black_filename, 'w') as f:
            BLACK_STR = (_IMPORT_STR %
                         ('white', 'white', 'black', 'black')) + BEGIN_DOC
            print(text.replace(BEGIN_DOC, BLACK_STR), file=f)

        result_dir = self.ARXIV_DIFF_DIR + chunk_id + '/' + paper_id + '/'
        if not os.path.isdir(result_dir):
            os.makedirs(result_dir)
        try:
            # on some PDFs, call_pdflatex doesn't raise an exception even
            # after the timeout, and instead hangs indefinitely (> 24
            # hours).
            color_pdf = figure_utils.call_pdflatex(
                src_tex=color_filename,
                src_dir=paper_src_dir,
                dest_dir=result_dir,
                timeout=self.PDFLATEX_TIMEOUT)
            black_pdf = figure_utils.call_pdflatex(
                src_tex=black_filename,
                src_dir=paper_src_dir,
                dest_dir=result_dir,
                timeout=self.PDFLATEX_TIMEOUT)
        except figure_utils.LatexException as e:
            logging.warning('Pdflatex failure: %s' % e.stdout)
            return None
        color_ims = pdf_renderer.render(color_pdf,
                                        dpi=dpi,
                                        max_pages=self.MAX_PAGES)
        black_ims = pdf_renderer.render(black_pdf,
                                        dpi=dpi,
                                        max_pages=self.MAX_PAGES)
        diff_names = []
        for (color_page, black_page) in zip(color_ims, black_ims):
            assert os.path.isfile(color_page) and os.path.isfile(black_page)
            color_page_im = imageio.imread(color_page)
            black_page_im = imageio.imread(black_page)
            assert color_page_im.shape == black_page_im.shape
            diff_page = figure_utils.im_diff(color_page_im, black_page_im)
            diff_name = result_dir + 'diff-' + os.path.basename(black_page)
            imageio.imwrite(uri=diff_name, im=diff_page)
            diff_names.append(diff_name)
        return diff_names, black_ims
Example #28
def create_data_entry(datarow):
    new_entry = datautils.DataStruct.DATAFRAMETEMPLATE
    datarow[0] = datarow[0].lower()
    new_entry["Url"] = [datarow[0]]
    new_entry["Source"] = ["logic-immo.be"]
    new_entry["Zip"] = [int(datarow[0].split('/')[6].split("-")[-1])]
    localities = datautils.DataStruct.get_locality(new_entry["Zip"].values[0])
    local = "None"

    if len(localities) == 1:
        local = localities[0].lower()
    else:
        for locality in localities:
            if datarow[1].lower().find(locality.lower()) != -1:
                local = locality.lower()
                break

    new_entry["Locality"] = local
    new_entry["Type of sale"] = ["vente"]
    for element in datarow:
        sale_match = re.search("(vente publique|vente|viager)",
                               element.lower())
        if sale_match:
            print((sale_match, element[max(0,
                                           sale_match.start() -
                                           30):min(len(element),
                                                   sale_match.end() + 30)]))
            new_entry["Type of sale"] = sale_match.groups()[0]

    for element in datarow:
        furnished_all_match = re.search(
            "(sans|niet|pas)? ?(meublé| meuble|gemeubileerde) ?([a-zA-Z]*)",
            element.lower())
        if furnished_all_match:
            print((furnished_all_match,
                   element[max(0,
                               furnished_all_match.start() -
                               30):min(len(element),
                                       furnished_all_match.end() + 30)]))
            not_list = ("sans", "niet", "pas")
            for group in furnished_all_match.groups():
                if group in not_list:
                    new_entry["Furnished"] = [0]
                    continue
            if "évier" in furnished_all_match.groups():
                continue
            new_entry["Furnished"] = [1]

    if datarow[0].find("appartements") != -1:
        new_entry["Type of property"] = ["appartment"]
    elif datarow[0].find("maisons") != -1:
        new_entry["Type of property"] = ["maison"]
    else:
        new_entry["Type of property"] = ["none"]

    new_entry["Subtype of property"] = [
        datarow[1].split(" à vendre à")[0].split(" ")[-1].lower()
    ]
    if not new_entry["Subtype of property"].values[0]:
        text = datarow[0]
        subtype = re.search('[0-9]{4}/([a-zA-Z]+)-[0-9]*', text)
        if subtype:
            new_entry["Subtype of property"] = subtype.groups()[0]
    datarow[1] = " ".join(bs4.UnicodeDammit(datarow[1]).unicode_markup.split())
    match_price = re.search("([0-9]{0,3}) ?([0-9]{0,3}) ?([0-9]{1,3}) ?0?€",
                            datarow[1])
    price = -1
    if match_price:
        # Start from 0 once a price was actually matched; -1 stays the "not found" sentinel.
        price = 0
        for i, group in enumerate(match_price.groups()[::-1]):
            if group != "":
                price += (1000**i) * int(group)
    new_entry["Price"] = [price]
    new_entry["Fully equipped kitchen"] = [0]
    for element in datarow:
        kitchen_all_match = re.search(__patterns["cuisine"], element.lower())
        if kitchen_all_match:
            for group in kitchen_all_match.groups():
                if group:
                    kitchen_bad_match = re.search(__patterns["cuisine2"],
                                                  group)
                    if kitchen_bad_match:
                        new_entry["Fully equipped kitchen"] = [0]
                        break
                    else:
                        new_entry["Fully equipped kitchen"] = [1]
        if new_entry["Fully equipped kitchen"].values[0] == 0:
            kitchen_match = re.match(__patterns["cuisine3"], element.lower())
            if kitchen_match:
                new_entry["Fully equipped kitchen"] = [True]
            else:
                kitchen_match = re.match(__patterns["cuisine4"],
                                         element.lower())
                if kitchen_match:
                    new_entry["Fully equipped kitchen"] = [True]

    new_entry["Furnished"] = [0]
    for element in datarow:
        furnished_all_match = re.search(
            "(sans|niet|pas)? ?(meublé| meuble|gemeubileerde) ?([a-zA-Z]*)",
            element.lower())
        if furnished_all_match:
            print((furnished_all_match,
                   element[max(0,
                               furnished_all_match.start() -
                               30):min(len(element),
                                       furnished_all_match.end() + 30)]))
            not_list = ("sans", "niet", "pas")
            for group in furnished_all_match.groups():
                if group in not_list:
                    new_entry["Furnished"] = [0]
                    continue
            if "évier" in furnished_all_match.groups():
                continue
            new_entry["Furnished"] = [1]
    new_entry["Swimming pool"] = [0]
    for element in datarow:
        swimmingpool = re.match(r"piscine|zwembad", element.lower())
        if swimmingpool:
            print(element)
            new_entry["Swimming pool"] = [1]
            break

    new_entry["Area"] = [-1]
    pr = list()
    try:
        new_entry["Area"] = [int(datarow[1].split(".")[1].split(" m²")[0])]

    except:
        for element in datarow:
            m = re.search("surface ([0-9]+) m²", element.lower())
            pr.append((element, "Surface ([0-9]+) m²"))
            if m:
                new_entry["Area"] = [int(m.groups()[0])]
                break
        if new_entry["Area"].values[0] == -1:
            for element in datarow:
                pr.append((element, "([0-9]+) m²[a-zA-Z ]*hab"))
                m = re.search("([0-9]+) m²[a-zA-Z ]*hab", element.lower())
                if m:
                    area = -1
                    for group in m.groups():
                        area = int(group) if int(group) > area else area
                    new_entry["Area"] = [area]
                    break

        if new_entry["Area"].values[0] == -1:
            for element in datarow:
                pr.append((element, "surface du living ([0-9]+) m²"))
                m = re.search("surface du living ([0-9]+) m²", element.lower())
                if m:
                    new_entry["Area"] = [int(m.groups()[0])]
                    break

        if new_entry["Area"].values[0] == -1:
            for element in datarow:
                pr.append((element, "superficie totale [a-zA-Z ]*([0-9]+) m²"))
                m = re.search("superficie totale [a-zA-Z ]*([0-9]+) m²",
                              element.lower())
                if m:
                    new_entry["Area"] = [int(m.groups()[0])]
                    break

    new_entry["State of the building"] = ["none"]
    for element in datarow:
        state_match = re.search("(a|à|batiment|architecture|construction)? ?" + \
                                "(neuf|contemporain|moderne|vétuste|neuve|neuw|rénnover|" + \
                                "rénover|rénnové|rénové)", element.lower())
        if state_match:
            print((state_match, element[max(0,
                                            state_match.start() -
                                            30):min(len(element),
                                                    state_match.end() + 30)]))
            for group in state_match.groups():
                if not group:
                    continue
                new_entries = ("neuf", "neuw", "nieuw", "neuve")
                for entry in new_entries:
                    if group.find(entry) != -1:
                        new_entry["State of the building"] = ["new"]
                        break

                if new_entry["State of the building"].values[0] == "none":
                    renovated_entries = ("contemporain", "moderne", "rénnové",
                                         "rénové")
                    for entry in renovated_entries:
                        if group.find(entry) != -1:
                            new_entry["State of the building"] = ["good"]
                            break
                    if new_entry["State of the building"].values[0] != "none":
                        break

                if new_entry["State of the building"].values[0] == "none":
                    renovate_entries = ("rénnover", "rénover", "vétuste")
                    for entry in renovate_entries:
                        if group.find(entry) != -1:
                            new_entry["State of the building"] = [
                                "to renovate"
                            ]
                            break
                    if new_entry["State of the building"].values[0] != "none":
                        break

    new_entry["Open fire"] = [0]
    for element in datarow:
        state_match = re.search("(feu ouvert|open haard)", element.lower())
        if state_match:
            print((state_match, element[max(0,
                                            state_match.start() -
                                            30):min(len(element),
                                                    state_match.end() + 30)]))
            new_entry["Open fire"] = [1]

    new_entry["Terrace"] = [0]
    new_entry["Terrace Area"] = [-1]
    for element in datarow[::-1]:
        state_match = re.search("(terasse|terras) ?.* ([0-9]+) ?m²",
                                element.lower())
        if state_match:
            new_entry["Terrace"] = [1]
            new_entry["Terrace Area"] = int(state_match.groups()[-1])
        if new_entry["Terrace"].values[0]:
            break

    new_entry["Garden"] = [0]
    new_entry["Garden Area"] = [-1]
    for element in datarow[::-1]:
        state_match = re.search(
            "(jardin|tuin) ?[a-zA-Z ]* ([0-9.]+) ?(m²|are)", element.lower())
        if state_match:
            print((state_match, element[max(0,
                                            state_match.start() -
                                            30):min(len(element),
                                                    state_match.end() + 30)]))
            new_entry["Garden"] = [1]
            area = float(state_match.groups()[-2])
            if state_match.groups()[-1] == 'are':
                area = area * 100
            new_entry["Garden Area"] = int(area)
        if new_entry["Garden"].values[0]:
            break

    new_entry["Number of rooms"] = [-1]
    for element in datarow[::-1]:
        state_match = re.search("([0-9]+) ?(chambre|kamer|slaapkamer)",
                                element.lower())
        if state_match:
            print((state_match, element[max(0,
                                            state_match.start() -
                                            30):min(len(element),
                                                    state_match.end() + 30)]))
            new_entry["Number of rooms"] = int(state_match.groups()[0])
        if new_entry["Number of rooms"].values[0] > 0:
            break

    new_entry["Surface of the land"] = [-1]
    for element in datarow[::-1]:
        state_match = re.search(
            "(surface)? ?(du)? ?terrain ?(de)? ?([0-9.]+) ?(m²|are)",
            element.lower())
        if state_match:
            print((state_match, element[max(0,
                                            state_match.start() -
                                            30):min(len(element),
                                                    state_match.end() + 30)]))
            area = float(state_match.groups()[-2])
            if state_match.groups()[-1] == 'are':
                area = area * 100
            new_entry["Surface of the land"] = int(area)
        if new_entry["Surface of the land"].values[0] > -1:
            break

    if new_entry["Surface of the land"].values[0] == -1:
        new_entry["Surface of the land"] = new_entry["Garden Area"]
    new_entry["Surface area of the plot of land"] = new_entry[
        "Surface of the land"]

    new_entry["Number of facades"] = [-1]
    for element in datarow[::-1]:
        state_match = re.search("([0-9]+) ?(façade|facade)", element.lower())
        if state_match:
            print((state_match, element[max(0,
                                            state_match.start() -
                                            30):min(len(element),
                                                    state_match.end() + 30)]))
            new_entry["Number of facades"] = int(state_match.groups()[0])
        if new_entry["Number of facades"].values[0] > -1:
            break

    if new_entry["Number of facades"].values[0]:
        facades = -1
        if new_entry["Subtype of property"].values[0] == "penthouse":
            facades = 4
        new_entry["Number of facades"] = facades

    return new_entry
Example #29
def decode_html(html_string):
    converted = bs4.UnicodeDammit(html_string, is_html=True)
    if not converted.unicode_markup:
        raise UnicodeDecodeError("Failed to detect encoding, tried [%s]",
                                 ', '.join(converted.tried_encodings))
    return converted.unicode_markup
Example #30
	def preprocessContent(self, url, mimetype, contentstr):
		if mimetype != 'text/html':
			return contentstr

		if isinstance(contentstr, bytes):
			contentstr = bs4.UnicodeDammit(contentstr).unicode_markup

		soup = WebRequest.as_soup(contentstr)
		next_chp_links = soup.find_all("a", class_='nextkey')
		prev_chp_links = soup.find_all("a", class_='prevkey')

		for tag in next_chp_links:
			tag.string = "Next chapter"
		for tag in prev_chp_links:
			tag.string = "Previous chapter"

		for bogus in soup.find_all("div", class_='x-modal-content'):
			bogus.decompose()
		for bogus in soup.find_all("div", class_='wpdiscuz_unauth'):
			bogus.decompose()
		for bogus in soup.find_all("div", class_='wpd-default'):
			bogus.decompose()
		for bogus in soup.find_all("div", class_='imagepost'):
			bogus.decompose()
		for bogus in soup.find_all("div", class_='donation'):
			bogus.decompose()
		for bogus in soup.find_all("form", class_='x-search'):
			bogus.decompose()
		for bogus in soup.find_all("ul", class_='x-menu'):
			bogus.decompose()
		for bogus in soup.find_all("div", class_='comments-area'):
			bogus.decompose()
		for bogus in soup.find_all("div", class_='respond'):
			bogus.decompose()
		for bogus in soup.find_all("div", class_='x-bar-space-v'):
			bogus.decompose()
		for bogus in soup.find_all("div", class_='e23-20'):
			bogus.decompose()
		for bogus in soup.find_all("button"):
			bogus.decompose()
		for bogus in soup.find_all("a", id='wpdUserContentInfoAnchor'):
			bogus.decompose()
		for bogus in soup.find_all("div", id='wpdUserContentInfo'):
			bogus.decompose()

		appends = []
		for item in soup.find_all('div', class_='togglepost'):
			# print("found append")
			appends.append(item.extract())

		tgtdiv = soup.find("article", class_='post')

		if tgtdiv:
			tgtdiv = tgtdiv.parent.parent
			tgtdiv.append(soup.new_tag('hr'))
			for append in appends:
				# print("Appending:", append)
				tgtdiv.append(append)

		# There should only ever be one of these.
		for mature_div in soup.find_all("div", class_='include_content_rating'):
			for item in mature_div.find_all('div', class_='list-group-item'):
				item.decompose()

		return soup.prettify()