Example #1
def get_link_content(link):
    try:
        response = requests.get(link)
        if response.status_code == 404:
            logging.warn(u"404 {}".format(link))
            return None
        if response.status_code != 200:
            raise Exception(u"Unable to fetch release content: {0}".format(link))
    except requests.exceptions.InvalidURL as e:
        logging.warn(u"Invalid link {0}: {1}".format(link, unicode(e)))
        return None

    content_type = response.headers.get('content-type')
    if not content_type:
        logging.warn(u"Response did not contain a Content-Type header: {0}".format(link))
        return None

    (mime_type, mime_subtype, mt_params) = parse_mime_type(content_type)
    if mime_type != 'text' or mime_subtype not in ('html', 'xhtml'):
        logging.warn(u"Skipping non-HTML link: {0}".format(link))
        return None

    if len(response.content) == 0:
        logging.warn(u"Server returned an empty body: {0}".format(link))
        return None

    (title, body) = readability_extract(response.content)
    return kill_control_characters(body)
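
A minimal sketch (editorial, not part of Example #1) of the Content-Type gate used above: parse_mime_type splits the header into a (type, subtype, params) tuple, so only text/html or text/xhtml responses get past the check. The helper name is hypothetical.

import mimeparse

def looks_like_html(content_type):
    # Hypothetical helper: parse_mime_type('text/html; charset=utf-8')
    # returns ('text', 'html', {'charset': 'utf-8'}).
    mime_type, mime_subtype, _params = mimeparse.parse_mime_type(content_type)
    return mime_type == 'text' and mime_subtype in ('html', 'xhtml')

print(looks_like_html('text/html; charset=utf-8'))  # True
print(looks_like_html('application/json'))          # False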
Example #2
    def _http_get_json(self, url):
        """
        Make an HTTP GET request to the specified URL, check that it returned a
        JSON response, and return the data parsed from that response.

        Parameters
        ----------
        url
            The URL to GET.

        Returns
        -------
        Dictionary of data parsed from a JSON HTTP response.

        Exceptions
        ----------
        * PythonKCMeetupsBadJson
        * PythonKCMeetupsBadResponse
        * PythonKCMeetupsMeetupDown
        * PythonKCMeetupsNotJson
        * PythonKCMeetupsRateLimitExceeded

        """
        response = self._http_get(url)

        content_type = response.headers['content-type']
        parsed_mimetype = mimeparse.parse_mime_type(content_type)
        if parsed_mimetype[1] not in ('json', 'javascript'):
            raise PythonKCMeetupsNotJson(content_type)

        try:
            return json.loads(response.content)
        except ValueError as e:
            raise PythonKCMeetupsBadJson(e)
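
For reference, a small hedged sketch of the subtype check in _http_get_json: any */json or */javascript Content-Type passes, everything else would raise PythonKCMeetupsNotJson.

import mimeparse

for header in ('application/json',
               'text/javascript; charset=UTF-8',
               'text/html'):
    subtype = mimeparse.parse_mime_type(header)[1]
    print(header, '->', subtype in ('json', 'javascript'))
# application/json -> True
# text/javascript; charset=UTF-8 -> True
# text/html -> False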
Example #3
    def get_best_handler(cls, mimetype):
        """Return the handler and score that that best fit the mimetype.

        Args:
            mimetype (tuple):
                A parsed mimetype to find the best handler for. This is a
                3-tuple of the type, subtype, and parameters as returned by
                :py:func:`mimeparse.parse_mime_type`.

        Returns:
            tuple:
            A tuple of ``(best_score, mimetype_handler)``. If no handler
            was found, this will be ``(0, None)``.
        """
        best_score, best_fit = (0, None)

        for mimetype_handler in _registered_mimetype_handlers:
            for mt in mimetype_handler.supported_mimetypes:
                try:
                    score = score_match(mimeparse.parse_mime_type(mt),
                                        mimetype)

                    if score > best_score:
                        best_score, best_fit = (score, mimetype_handler)
                except ValueError:
                    continue

        return (best_score, best_fit)
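
score_match and _registered_mimetype_handlers are internals of the project this example comes from and are not shown here. As a rough stand-in, python-mimeparse ships its own scoring helpers (quality and best_match) that rank a concrete mimetype against supported ranges; a hedged sketch:

import mimeparse

# quality() returns the q-value of the best matching range (0 if nothing matches).
print(mimeparse.quality('text/html', 'text/*;q=0.3, text/html;q=0.7'))  # 0.7

# best_match() picks the supported type that best satisfies an Accept header.
print(mimeparse.best_match(['application/xbel+xml', 'text/xml'],
                           'text/*;q=0.5, */*;q=0.1'))  # text/xml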
Example #4
    def serialize(self, obj, accept=None, **opts):
        '''serialize(obj) -> content, content_type

        Serialize an object to text.
        '''
        accept = accept or self.default_content_type
        content_type, format = self.get_format(accept)
        method = getattr(self, 'to_%s' % format)

        # ugly hack: pull the part of the Accept header that matches the
        # negotiated content type so its parameters can be read below
        try:
            accept = [
                part
                for part in accept.split(',')
                if part.startswith(content_type)
            ][0]
        except IndexError:
            # '*/*' case
            accept = content_type

        params = mimeparse.parse_mime_type(accept)[2]
        for key, value in params.items():
            opts.setdefault(key, value)

        return self.SerializedContainer(method(obj, **opts), content_type)
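
The only thing serialize() needs from the matched Accept part, beyond the bare content type, is its parameter dictionary; parse_mime_type returns that as the third tuple element. A brief illustration:

import mimeparse

params = mimeparse.parse_mime_type('application/json; indent=4; charset=utf-8')[2]
print(params)  # {'indent': '4', 'charset': 'utf-8'}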
Example #5
    def require_representation(self, req):
        """Require raw representation dictionary from falcon request object.

        This does not perform any field parsing or validation but only uses
        allowed content-encoding handler to decode content body.

        Note:
            Currently only JSON is allowed as content type.

        Args:
            req (falcon.Request): request object

        Returns:
            dict: raw dictionary of representation supplied in request body

        """
        try:
            type_, subtype, _ = parse_mime_type(req.content_type)
            content_type = '/'.join((type_, subtype))
        except:
            raise falcon.HTTPUnsupportedMediaType(
                description="Invalid Content-Type header: {}".format(
                    req.content_type
                )
            )

        if content_type == 'application/json':
            body = req.stream.read()
            return json.loads(body.decode('utf-8'))
        else:
            raise falcon.HTTPUnsupportedMediaType(
                description="only JSON supported, got: {}".format(content_type)
            )
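
A short sketch of the type_/subtype join used in require_representation, assuming python-mimeparse; any parameters such as charset are deliberately dropped before the comparison against 'application/json'.

from mimeparse import parse_mime_type

type_, subtype, _ = parse_mime_type('application/json; charset=UTF-8')
print('/'.join((type_, subtype)))  # application/json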
Example #6
    def get_best_handler(cls, mimetype):
        """Return the handler and score that that best fit the mimetype.

        Args:
            mimetype (unicode):
                The mimetype to find the best handler for.

        Returns:
            tuple:
            A tuple of ``(best_score, mimetype_handler)``. If no handler
            was found, this will be ``(0, None)``.
        """
        best_score, best_fit = (0, None)

        for mimetype_handler in _registered_mimetype_handlers:
            for mt in mimetype_handler.supported_mimetypes:
                try:
                    score = score_match(mimeparse.parse_mime_type(mt),
                                        mimetype)

                    if score > best_score:
                        best_score, best_fit = (score, mimetype_handler)
                except ValueError:
                    continue

        return (best_score, best_fit)
Example #7
    def for_type(cls, attachment):
        """Returns the handler that is the best fit for provided mimetype."""
        if attachment.mimetype:
            try:
                mimetype = mimeparse.parse_mime_type(attachment.mimetype)
            except:
                logging.error('Unable to parse MIME type "%s" for %s',
                              attachment.mimetype, attachment)
                return None

            # Override the mimetype if mimeparse is known to misinterpret this
            # type of file as 'octet-stream'
            extension = os.path.splitext(attachment.filename)[1]

            if extension in MIMETYPE_EXTENSIONS:
                mimetype = MIMETYPE_EXTENSIONS[extension]

            score, handler = cls.get_best_handler(mimetype)

            if handler:
                try:
                    return handler(attachment.get_review_request(), attachment)
                except ObjectDoesNotExist as e:
                    logging.error('Unable to load review UI for %s: %s',
                                  attachment, e)
                except Exception as e:
                    logging.error('Error instantiating '
                                  'FileAttachmentReviewUI %r: %s',
                                  handler, e)

        return None
Example #8
    def get_best_handler(cls, mimetype):
        """Return the Review UI and score that that best fit the mimetype.

        Args:
            mimetype (unicode):
                The mimetype to find a Review UI for.

        Returns:
            tuple:
            A tuple of ``(best_score, review_ui)``, or ``(0, None)`` if one
            could not be found.
        """
        best_score = 0
        best_fit = None

        for review_ui in _file_attachment_review_uis:
            for mt in review_ui.supported_mimetypes:
                try:
                    score = score_match(mimeparse.parse_mime_type(mt),
                                        mimetype)

                    if score > best_score:
                        best_score = score
                        best_fit = review_ui
                except ValueError:
                    continue

        return best_score, best_fit
Example #9
    def for_type(cls, attachment):
        """Return the handler that is the best fit for provided mimetype."""
        if not attachment.mimetype:
            return None

        try:
            mimetype = mimeparse.parse_mime_type(attachment.mimetype)
        except:
            logging.warning('Unable to parse MIME type "%s" for %s',
                            attachment.mimetype, attachment)
            mimetype = ('application', 'octet-stream', {})

        # Override the mimetype if mimeparse is known to misinterpret this
        # type of file as `octet-stream`
        extension = os.path.splitext(attachment.filename)[1]

        if extension in MIMETYPE_EXTENSIONS:
            mimetype = MIMETYPE_EXTENSIONS[extension]

        score, handler = cls.get_best_handler(mimetype)

        if handler:
            try:
                return handler(attachment, mimetype)
            except Exception as e:
                logging.error('Unable to load Mimetype Handler for %s: %s',
                              attachment, e)

        return MimetypeHandler(attachment, mimetype)
Example #10
 def _test_parse_mime_type(self, args, expected):
     if expected is None:
         self.assertRaises(mimeparse.MimeTypeParseException, mimeparse.parse_mime_type, args)
     else:
         expected = tuple(expected)
         result = mimeparse.parse_mime_type(args)
         message = "Expected: '%s' but got %s" % (expected, result)
         self.assertEqual(expected, result, message)
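
The helper above mirrors python-mimeparse's own test fixtures. A hedged sketch of the two cases it covers, assuming the library raises MimeTypeParseException for a value with no subtype:

import mimeparse

print(mimeparse.parse_mime_type('application/xml; q=1'))
# ('application', 'xml', {'q': '1'})

try:
    mimeparse.parse_mime_type('application')  # no '/subtype' part
except mimeparse.MimeTypeParseException as exc:
    print('raised:', exc)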
Example #11
  def __init__(self, http, postproc, uri,
               method='GET',
               body=None,
               headers=None,
               methodId=None,
               resumable=None):
    """Constructor for an HttpRequest.

    Args:
      http: httplib2.Http, the transport object to use to make a request
      postproc: callable, called on the HTTP response and content to transform
                it into a data object before returning, or raising an exception
                on an error.
      uri: string, the absolute URI to send the request to
      method: string, the HTTP method to use
      body: string, the request body of the HTTP request,
      headers: dict, the HTTP request headers
      methodId: string, a unique identifier for the API method being called.
      resumable: MediaUpload, None if this is not a resumable request.
    """
    self.uri = uri
    self.method = method
    self.body = body
    self.headers = headers or {}
    self.methodId = methodId
    self.http = http
    self.postproc = postproc
    self.resumable = resumable

    # Pull the multipart boundary out of the content-type header.
    major, minor, params = mimeparse.parse_mime_type(
        headers.get('content-type', 'application/json'))

    # Terminating multipart boundary get a trailing '--' appended.
    self.multipart_boundary = params.get('boundary', '').strip('"') + '--'

    # If this was a multipart resumable, the size of the non-media part.
    self.multipart_size = 0

    # The resumable URI to send chunks to.
    self.resumable_uri = None

    # The bytes that have been uploaded.
    self.resumable_progress = 0

    self.total_size = 0

    if resumable is not None:
      if self.body is not None:
        self.multipart_size = len(self.body)
      else:
        self.multipart_size = 0
      self.total_size = (
          self.resumable.size() +
          self.multipart_size +
          len(self.multipart_boundary))
Example #12
    def for_type(cls, attachment):
        """Returns the handler that is the best fit for provided mimetype."""
        mimetype = mimeparse.parse_mime_type(attachment.mimetype)
        score, handler = cls.get_best_handler(mimetype)

        if handler:
            try:
                return handler(attachment.get_review_request(), attachment)
            except Exception, e:
                logging.error('Unable to load review UI for %s: %s',
                              attachment, e, exc_info=1)
Example #13
    def _http_get_json(self, url):
        response = self._http_get(url)

        content_type = response.headers['content-type']
        parsed_mimetype = mimeparse.parse_mime_type(content_type)
        if parsed_mimetype[1] not in ('json', 'javascript'):
            raise MeetupsNotJson(content_type)

        try:
            return json.loads(response.content)
        except ValueError as e:
            raise MeetupsBadJson(e)
Example #14
    def import_submission(self, submission: praw.objects.Submission) -> dict:
        """ Import a submission from flickr. Uses their oEmbed API.

        flickr.com was nice enough to provide us with an oEmbed API.
        Apparently these guys also support video, so we should also make sure
        to not try to parse that.

        This function will define the following values in its return data:
        - author: simply "a flickr.com user"
        - source: The url of the submission
        - importer_display/header
        - import_urls

        :param submission: A reddit submission to parse.
        """
        try:
            if not self.regex.match(urlsplit(submission.url).netloc):
                return None
            url = submission.url
            data = {'author': 'a flickr.com user',
                    'source': url,
                    'importer_display':
                        {'header': 'Imported flickr.com image:\n\n'}}
            r = requests.head(url, headers=self.headers)
            if r.status_code == 301:
                return None

            mime_text = r.headers.get('Content-Type')
            mime = mimeparse.parse_mime_type(mime_text)
            # If we're already given an image...
            if mime[0] == 'image':
                # Use the already given URL
                image_url = submission.url
            else:
                # Otherwise, find the image in the html
                 self.log.info("Getting submission.url: " + url)
                 html = urllib.request.urlopen(url).read().decode('utf-8')
                 image_urls = re.findall(r'farm[\d]\.[a-z0-9/.\\/_]*', html)
                 if image_urls:
                     image_url = 'http://' + image_urls[-1].replace('\\', '')
                     self.log.info("Got image url %s", image_url)
                 else:
                     self.log.error('Could not find any flickr URL %s', submission.url)
                     return None
                 
            assert image_url
            data['import_urls'] = [image_url]
            return data
        except Exception:
            self.log.error('Could not import flickr URL %s (%s)',
                           submission.url, traceback.format_exc())
            return None
Example #15
def parse_content_type(contenttype):
    mime_type = mimeparse.parse_mime_type(contenttype)
    
    if "charset" in mime_type[2]:
        # Remove charset from mime_type, if we have it
        encoding = mime_type[2].pop("charset")
    else:
        encoding = None
    
    if encoding == 'x-ctext':
        encoding = 'latin1'
    
    return mime_type, encoding
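
Hedged usage of parse_content_type above: the charset parameter is popped out of the parsed tuple, and the legacy x-ctext value is mapped to latin1.

mime_type, encoding = parse_content_type('text/plain; charset=x-ctext')
print(mime_type)  # ('text', 'plain', {})
print(encoding)   # latin1

mime_type, encoding = parse_content_type('application/octet-stream')
print(encoding)   # None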
Example #16
 def status(
     self,
     test_id=None,
     test_status=None,
     test_tags=None,
     runnable=True,
     file_name=None,
     file_bytes=None,
     eof=False,
     mime_type=None,
     route_code=None,
     timestamp=None,
 ):
     super(Starts, self).status(
         test_id,
         test_status,
         test_tags=test_tags,
         runnable=runnable,
         file_name=file_name,
         file_bytes=file_bytes,
         eof=eof,
         mime_type=mime_type,
         route_code=route_code,
         timestamp=timestamp,
     )
     if not test_id:
         if not file_bytes:
             return
         if not mime_type or mime_type == "test/plain;charset=utf8":
             mime_type = "text/plain; charset=utf-8"
         primary, sub, parameters = mimeparse.parse_mime_type(mime_type)
         content_type = testtools.content_type.ContentType(primary, sub, parameters)
         content = testtools.content.Content(content_type, lambda: [file_bytes])
         text = content.as_text()
         if text and text[-1] not in "\r\n":
             self._neednewline = True
         self._output.write(text)
     elif test_status == "inprogress" and test_id not in self._emitted:
         if self._neednewline:
             self._neednewline = False
             self._output.write("\n")
         worker = ""
         for tag in test_tags or ():
             if tag.startswith("worker-"):
                 worker = "(" + tag[7:] + ") "
         if timestamp:
             timestr = timestamp.isoformat()
         else:
             timestr = ""
             self._output.write("%s: %s%s [start]\n" % (timestr, worker, test_id))
         self._emitted.add(test_id)
Example #17
    def import_submission(self, submission: praw.objects.Submission) -> dict:
        """Import a submission from drawcrowd. Uses raw HTML scraping.

        As it turns out, drawcrowd likes to provide different data
        (all in <meta> tags) to non-web-browser requests.
        Since it provides enough information anyways, we don't bother getting
        around it and just parse that.

        This function will define the following values in its return data:
        - author: The author of the post
        - source: The url of the submission
        - importer_display/header
        - import_urls

        :param submission: A reddit submission to parse.
        """
        try:
            url = html.unescape(submission.url)
            if not self.regex.match(urlsplit(url).netloc):
                return None
            data = {'source': url}
            r = requests.head(url, headers=self.headers)
            if r.status_code == 301:  # Moved Permanently
                return None
            mime_text = r.headers.get('Content-Type')
            mime = mimeparse.parse_mime_type(mime_text)
            if mime[0] == 'image':
                data['author'] = 'An unknown drawcrowd user'
                image_url = url
            else:
                # Note: Drawcrowd provides different content to non-web-browsers.
                r = requests.get(url, headers=self.headers)
                bs = bs4.BeautifulSoup(r.content.decode('utf-8'))
                matched = bs.find(property='og:image')
                if not matched:
                    self.log.warning('Could not locate drawcrowd image to scrape.')
                    return None
                image_url = matched['content']
                matched = bs.find(property='og:title')
                if matched:
                    data['author'] = matched['content']
                else:
                    data['author'] = 'an unknown drawcrowd author'
                data['importer_display'] = {'header': 'Mirrored image from {}:\n\n'.format(data['author'])}
            assert image_url
            data['import_urls'] = [image_url]
            return data
        except Exception:
            self.log.error('Could not import drawcrowd URL %s (%s)',
                           submission.url, traceback.format_exc())
            return None
Example #18
    def import_submission(self, submission: praw.objects.Submission) -> dict:
        """ Import a submission from gyazo. Uses their oEmbed API.

        gyazo.com was nice enough to provide us with an oEmbed API.
        Apparently these guys also support video, so we should also make sure
        to not try to parse that.

        This function will define the following values in its return data:
        - author: simply "a gyazo.com user"
        - source: The url of the submission
        - importer_display/header
        - import_urls

        :param submission: A reddit submission to parse.
        """
        try:
            if not self.regex.match(urlsplit(submission.url).netloc):
                return None
            data = {'author': 'a gyazo.com user',
                    'source': submission.url,
                    'importer_display':
                        {'header': 'Imported gyazo.com image:\n\n'}}
            r = requests.head(submission.url, headers=self.headers)
            if r.status_code == 301:
                return None

            mime_text = r.headers.get('Content-Type')
            mime = mimeparse.parse_mime_type(mime_text)
            # If we're already given an image...
            if mime[0] == 'image':
                # Use the already given URL
                image_url = submission.url
            else:
                # Otherwise, use the gyazo oEmbed API.
                response = requests.get(
                    'https://api.gyazo.com/api/oembed/',
                    {'url': submission.url},
                    headers=self.headers).json()
                if response.get('type') == 'photo':
                    image_url = response.get('url')
                else:
                    # This is something that is not a photo. Do not scrape.
                    return None

            assert image_url
            data['import_urls'] = [image_url]
            return data
        except Exception:
            self.log.error('Could not import gyazo URL %s (%s)',
                           submission.url, traceback.format_exc())
            return None
Example #19
    def encoding(self):
        if self._encoding is not None:
            # Encoding has been set manually.
            return self._encoding

        # Get the `Content-Type` header, if available.
        content_type = self.headers.get('Content-Type')
        if content_type:
            # Parse out the primary type and parameters from the media type.
            ptype, _, params = mimeparse.parse_mime_type(content_type)

            # Return the specified charset or the default depending on the
            # primary type.
            default = 'utf-8' if ptype == 'application' else 'iso-8859-1'
            return params.get('charset', default)
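
A sketch of the charset fallback rule in encoding(): application/* types default to utf-8, everything else to iso-8859-1, and an explicit charset parameter always wins.

import mimeparse

for header in ('application/json',
               'text/html',
               'text/html; charset=utf-16'):
    ptype, _, params = mimeparse.parse_mime_type(header)
    default = 'utf-8' if ptype == 'application' else 'iso-8859-1'
    print(header, '->', params.get('charset', default))
# application/json -> utf-8
# text/html -> iso-8859-1
# text/html; charset=utf-16 -> utf-16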
Example #20
    def get_best_handler(cls, mimetype):
        """Returns the handler and score that that best fit the mimetype."""
        best_score, best_fit = (0, None)

        for mimetype_handler in _registered_mimetype_handlers:
            for mt in mimetype_handler.supported_mimetypes:
                try:
                    score = score_match(mimeparse.parse_mime_type(mt), mimetype)

                    if score > best_score:
                        best_score, best_fit = (score, mimetype_handler)
                except ValueError:
                    continue

        return (best_score, best_fit)
Example #21
def build_content_type(format, encoding='utf-8', api=None):
    """
    Adds the vnd.api.<api_name> attribute to the content type
    (if using AcceptHeaderRouter) and appends the character encoding.
    """
    if api and api._accept_header_routing:
        type, subtype, vars = mimeparse.parse_mime_type(format)
        subtype = '%s+%s' % (api.subtype, subtype)
        attributes = ''
        for k, v in vars.iteritems():
            attributes += '; %s=%s' % (k, v)
        format = '%s/%s%s' % (type, subtype, attributes)
    if 'charset' in format:
        return format
    
    return "%s; charset=%s" % (format, encoding)
Example #22
    def import_submission(self, submission: praw.objects.Submission) -> dict:
        """Import a submission from tinypic. Uses raw HTML scraping.

        Because this downloads the page and tries to scrape the HTML,
        we are at significant risk of the image ID on the DOM changing.
        Therefore, this plugin is liable to break.

        This function will define the following values in its return data:
        - author: simply "an anonymous Tinypic user"
        - source: The url of the submission
        - importer_display/header
        - import_urls

        :param submission: A reddit submission to parse.
        """
        try:
            # It seems PRAW doesn't unescape the reddit URL.
            # Tinyurl is the only importer so far that depends on URL parameters.
            url = html.unescape(submission.url)
            if not self.regex.match(urlsplit(url).netloc):
                return None
            data = {'author': 'an anonymous Tinypic user',
                    'source': url,
                    'importer_display':
                        {'header': '~~Liberated~~Mirrored tinypic image:\n\n'}}
            r = requests.head(url, headers=self.headers)
            if r.status_code == 301:  # Moved Permanently
                return None
            mime_text = r.headers.get('Content-Type')
            mime = mimeparse.parse_mime_type(mime_text)
            if mime[0] == 'image':
                image_url = url
            else:
                r = requests.get(url, headers=self.headers)
                bs = bs4.BeautifulSoup(r.content.decode('utf-8'))
                matched = bs.select('div#imgFrame img')
                if not matched:
                    self.log.warning('Could not locate Tinypic image to scrape.')
                    return None
                image_url = matched[0]['src']
            assert image_url
            data['import_urls'] = [image_url]
            return data
        except Exception:
            self.log.error('Could not import tinypic URL %s (%s)',
                           submission.url, traceback.format_exc())
            return None
Example #23
    def get_best_handler(cls, mimetype):
        """Returns the handler and score that that best fit the mimetype."""
        best_score = 0
        best_fit = None

        for review_ui in _file_attachment_review_uis:
            for mt in review_ui.supported_mimetypes:
                try:
                    score = score_match(mimeparse.parse_mime_type(mt), mimetype)

                    if score > best_score:
                        best_score = score
                        best_fit = review_ui
                except ValueError:
                    continue

        return best_score, best_fit
Example #24
    def encoding(self):
        """
        The name of the encoding used to decode the stream’s bytes
        into strings, and to encode strings into bytes.

        Reads the charset value from the `Content-Type` header, if available;
        else, returns nothing.
        """
        # Get the `Content-Type` header, if available.
        content_type = self.headers.get('Content-Type')
        if content_type:
            # Parse out the primary type and parameters from the media type.
            ptype, _, params = mimeparse.parse_mime_type(content_type)

            # Return the specified charset or the default depending on the
            # primary type.
            default = 'utf-8' if ptype == 'application' else 'iso-8859-1'
            return params.get('charset', default)
Example #25
    def import_submission(self, submission: praw.objects.Submission) -> dict:
        """Import a submission from Derpibooru.

        This function will define the following values in its return data:
        - author: simply "an anonymous user on Derpibooru"
        - source: The url of the submission
        - importer_display/header
        - import_urls

        After we define that, we need to get the image. Since Derpibooru has an API,
        we use that to try to get the image if the image is a non-CDN URL. If it is
        a CDN, we take the image directly and upload *that* to Imgur.

        image_url is the variable of the image to upload.

        :param submission: A reddit submission to parse.
        """
        try:
            url = html.unescape(submission.url)
            if not self.regex.match(urlsplit(url).netloc):
                return None
            r = requests.head(url, headers=self.headers)
            mime_text = r.headers.get('Content-Type')
            mime = mimeparse.parse_mime_type(mime_text)
            # if mime[0] == 'image':
            self.log.debug('Initiating Derpibooru plugin')
            jsonUrl = 'http://derpiboo.ru/oembed.json?url=' + url  # The API endpoint
            callapi = requests.get(jsonUrl)  # Fetch the API's JSON file.
            json = callapi.json()
            img = 'http:' + (json['thumbnail_url'])
            author = (json['author_name'])
            provider_url = (json['provider_url'])
            data = {'author': author,
                    'source': img,
                    'importer_display':
                        {'header': 'Mirrored [image](' + provider_url + ') by Derpibooru artist \
                        [' + author + '](https://derpiboo.ru/tags/artist-colon-' + author + '):\n\n'}}
            image_url = img
            data['import_urls'] = [image_url]
            return data
        except Exception:
            self.log.error('Could not import Derpibooru URL %s (%s)',
                           submission.url, traceback.format_exc())
            return None
Example #26
  def __init__(self, http, postproc, uri,
               method='GET',
               body=None,
               headers=None,
               methodId=None,
               resumable=None):
    """Constructor for an HttpRequest.

    Args:
      http: httplib2.Http, the transport object to use to make a request
      postproc: callable, called on the HTTP response and content to transform
                it into a data object before returning, or raising an exception
                on an error.
      uri: string, the absolute URI to send the request to
      method: string, the HTTP method to use
      body: string, the request body of the HTTP request,
      headers: dict, the HTTP request headers
      methodId: string, a unique identifier for the API method being called.
      resumable: MediaUpload, None if this is not a resumable request.
    """
    self.uri = uri
    self.method = method
    self.body = body
    self.headers = headers or {}
    self.methodId = methodId
    self.http = http
    self.postproc = postproc
    self.resumable = resumable
    self.response_callbacks = []
    self._in_error_state = False

    # Pull the multipart boundary out of the content-type header.
    major, minor, params = mimeparse.parse_mime_type(
        headers.get('content-type', 'application/json'))

    # The size of the non-media part of the request.
    self.body_size = len(self.body or '')

    # The resumable URI to send chunks to.
    self.resumable_uri = None

    # The bytes that have been uploaded.
    self.resumable_progress = 0
Example #27
    def for_type(cls, attachment):
        """Returns the handler that is the best fit for provided mimetype."""
        mimetype = mimeparse.parse_mime_type(attachment.mimetype)

        # Override the mimetype if mimeparse is known to misinterpret this
        # type of file as `octet-stream`
        extension = os.path.splitext(attachment.filename)[1]

        if extension in MIMETYPE_EXTENSIONS:
            mimetype = MIMETYPE_EXTENSIONS[extension]

        score, handler = cls.get_best_handler(mimetype)

        if handler:
            try:
                return handler(attachment, mimetype)
            except Exception, e:
                logging.error('Unable to load Mimetype Handler for %s: %s',
                              attachment, e, exc_info=1)
Example #28
def analyze_resource_file(path, extension=None):
    def isnt_msdoc_text(content_type):
        extensions = list(
            filter(lambda x: x[1] == content_type,
                   settings.SUPPORTED_CONTENT_TYPES))[0][2]
        return len({'doc', 'docx'} & set(extensions)) == 0

    logger.debug(f"analyze_resource_file({path}, {extension})")
    m = magic.Magic(mime=True, mime_encoding=True)
    result = m.from_file(path)
    family, content_type, options = parse_mime_type(result)
    logger.debug(f"  parsed mimetype: {family}/{content_type});{options}")
    file_info = magic.from_file(path)
    logger.debug(f"  file info: {file_info}")
    encoding = options.get('charset', 'unknown')
    logger.debug(f"  encoding: {encoding}")
    extension = file_format_from_content_type(content_type,
                                              family=family,
                                              extension=extension)
    logger.debug(f"  extension: {extension}")
    if family == 'text' and content_type == 'plain':
        if encoding.startswith('unknown'):
            encoding = guess_file_encoding(path)
            logger.debug(f" encoding (guess-plain): {encoding}")
        extension = guess_text_file_format(path, encoding)
        logger.debug(f"  extension (guess-plain): {extension}")

    if extension in ('doc', 'docx', 'xls', 'xlsx', 'ods',
                     'odt') or content_type == 'msword':
        if encoding.startswith('unknown'):
            encoding = guess_file_encoding(path)
            logger.debug(f"  encoding (guess-spreadsheet): {encoding}")
        spreadsheet_format = guess_spreadsheet_file_format(path, encoding)
        if any((extension in ('xls', 'xlsx', 'ods'),
                isnt_msdoc_text(content_type), spreadsheet_format)):
            extension = spreadsheet_format
            logger.debug(f"  extension (guess-spreadsheet): {extension}")

    logger.debug(
        f'  finally: extension = {extension}, file_info = {file_info}, encoding = {encoding}'
    )
    return extension, file_info, encoding
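
A hedged sketch of the libmagic-to-mimeparse handoff that analyze_resource_file relies on, assuming python-magic is installed; the file path is illustrative only.

import magic
from mimeparse import parse_mime_type

m = magic.Magic(mime=True, mime_encoding=True)
result = m.from_file('/etc/hosts')  # e.g. 'text/plain; charset=us-ascii'
family, content_type, options = parse_mime_type(result)
print(family, content_type, options.get('charset', 'unknown'))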
Example #29
    def import_submission(self, submission: praw.objects.Submission) -> dict:
        """Import a submission from gifs.com.

        Because this downloads the page and tries to scrape the HTML,
        we are at significant risk of the image ID on the DOM changing.
        Therefore, this plugin is liable to break.

        This function will define the following values in its return data:
        - author: simply "an anonymous user on gifs.com"
        - source: The url of the submission
        - importer_display/header
        - import_urls

        :param submission: A reddit submission to parse.
        """
        try:
            url = html.unescape(submission.url)
            if not self.regex.match(urlsplit(url).netloc):
                return None
            data = {
                'author': 'a gifscom user',
                'source': url,
                'importer_display': {
                    'header': 'Mirrored gifscom image:\n\n'
                }
            }
            r = requests.head(url, headers=self.headers)
            mime_text = r.headers.get('Content-Type')
            mime = mimeparse.parse_mime_type(mime_text)
            if mime[0] == 'image':
                image_url = url
            else:
                self.log.warning(
                    'gifs.com URL posted that is not an image: %s',
                    submission.url)
                return None
            data['import_urls'] = [image_url]
            return data
        except Exception:
            self.log.error('Could not import gifs.com URL %s (%s)',
                           submission.url, traceback.format_exc())
            return None
Example #30
    def supportsSingleExports(self, bug_ids):
        """Return True if the Trac instance provides CSV exports for single
        tickets, False otherwise.

        :bug_ids: A list of bug IDs that we can use for discovery purposes.
        """
        html_ticket_url = '%s/%s' % (
            self.baseurl, self.ticket_url.replace('?format=csv', ''))

        for bug_id in bug_ids:
            try:
                # We try to retrieve the ticket in HTML form, since that
                # will tell us whether or not it is actually a valid ticket.
                ticket_id = int(bug_id)
                self._getPage(html_ticket_url % ticket_id)
            except BugTrackerConnectError as e:
                if isinstance(e.error, requests.HTTPError):
                    # We can consider the ticket to be invalid.
                    pass
                else:
                    raise
            except ValueError:
                # The ticket_id couldn't be identified and it's of no use to
                # us anyway.
                pass
            else:
                # If we didn't get an error we can try to get the ticket in
                # CSV form. If this fails then we can consider single ticket
                # exports to be unsupported.
                try:
                    response = self._getPage(
                        "%s/%s" % (self.baseurl, self.ticket_url % ticket_id))
                    subtype = parse_mime_type(
                        response.headers.get('Content-Type', ''))[1]
                    return subtype == 'csv'
                except BugTrackerConnectError:
                    return False
        else:
            # If we reach this point then we likely haven't had any valid
            # tickets or something else is wrong. Either way, we can only
            # assume that CSV exports of single tickets aren't supported.
            return False
Example #31
    def get_best_handler(cls, mimetype):
        """Returns the handler and score that that best fit the mimetype."""
        best_score, best_fit = (0, cls)

        for mt in cls.supported_mimetypes:
            try:
                score = score_match(mimeparse.parse_mime_type(mt), mimetype)

                if score > best_score:
                    best_score, best_fit = (score, cls)
            except ValueError:
                continue

        for handler in cls.__subclasses__():
            score, best_handler = handler.get_best_handler(mimetype)

            if score > best_score:
                best_score, best_fit = (score, best_handler)

        return (best_score, best_fit)
Example #32
    def get_best_handler(cls, mimetype):
        """Returns the handler and score that that best fit the mimetype."""
        best_score, best_fit = (0, cls)

        for mt in cls.supported_mimetypes:
            try:
                score = score_match(mimeparse.parse_mime_type(mt), mimetype)

                if score > best_score:
                    best_score, best_fit = (score, cls)
            except ValueError:
                continue

        for handler in cls.__subclasses__():
            score, best_handler = handler.get_best_handler(mimetype)

            if score > best_score:
                best_score, best_fit = (score, best_handler)

        return (best_score, best_fit)
Example #33
    def for_type(cls, attachment):
        """Return the Review UI that is the best fit for a file attachment.

        Args:
            attachment (reviewboard.attachments.models.FileAttachments):
                The file attachment to locate a Review UI for.

        Returns:
            FileAttachmentReviewUI:
            The Review UI for the attachment, or ``None`` if a suitable one
            could not be found.
        """
        if attachment.mimetype:
            try:
                mimetype = mimeparse.parse_mime_type(attachment.mimetype)
            except:
                logging.error('Unable to parse MIME type "%s" for %s',
                              attachment.mimetype, attachment)
                return None

            # Override the mimetype if mimeparse is known to misinterpret this
            # type of file as 'octet-stream'
            extension = os.path.splitext(attachment.filename)[1]

            if extension in MIMETYPE_EXTENSIONS:
                mimetype = MIMETYPE_EXTENSIONS[extension]

            score, handler = cls.get_best_handler(mimetype)

            if handler:
                try:
                    return handler(attachment.get_review_request(), attachment)
                except ObjectDoesNotExist as e:
                    logging.error('Unable to load review UI for %s: %s',
                                  attachment, e)
                except Exception as e:
                    logging.error('Error instantiating '
                                  'FileAttachmentReviewUI %r: %s',
                                  handler, e)

        return None
Example #34
def mime_object_maker(url, mimetype, session=None):
    """
    Return a data object suitable for the given mimetype.
    This will either return an astropy FITS object, a pyvo DALResults object,
    a PIL object for conventional images, or a string for text content.

    Parameters
    ----------
    url : str
        the object download url
    mimetype : str
        the content mimetype
    session : object
        optional session to use for network requests
    """
    session = use_session(session)
    mimetype = mimeparse.parse_mime_type(mimetype)

    if mimetype[0] == 'text':
        return session.get(url).text

    if mimetype[1] == 'fits' or mimetype[1] == 'x-fits':
        response = session.get(url)
        return HDUList.fromstring(response.content)

    if mimetype[0] == 'image':
        from PIL import Image
        from io import BytesIO
        response = session.get(url)
        bio = BytesIO(response.content)
        return Image.open(bio)

    if mimetype[1] == 'x-votable' or mimetype[1] == 'x-votable+xml':
        # As soon as there are some kind of recursive data structures,
        # things start to get messy
        if mimetype[2].get('content', None) == 'datalink':
            from .adhoc import DatalinkResults
            return DatalinkResults.from_result_url(url)
        else:
            from .query import DALResults
            return DALResults.from_result_url(url)
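
A hedged usage sketch of mime_object_maker; the URL is a placeholder, and an 'application/fits' content type routes to the HDUList branch above.

# Placeholder URL; any endpoint returning FITS bytes would do.
hdulist = mime_object_maker('https://example.org/cutout.fits',
                            'application/fits',
                            session=None)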
Example #35
    def for_type(cls, attachment):
        """Return the Review UI that is the best fit for a file attachment.

        Args:
            attachment (reviewboard.attachments.models.FileAttachments):
                The file attachment to locate a Review UI for.

        Returns:
            FileAttachmentReviewUI:
            The Review UI for the attachment, or ``None`` if a suitable one
            could not be found.
        """
        if attachment.mimetype:
            try:
                mimetype = mimeparse.parse_mime_type(attachment.mimetype)
            except:
                logging.error('Unable to parse MIME type "%s" for %s',
                              attachment.mimetype, attachment)
                return None

            # Override the mimetype if mimeparse is known to misinterpret this
            # type of file as 'octet-stream'
            extension = os.path.splitext(attachment.filename)[1]

            if extension in MIMETYPE_EXTENSIONS:
                mimetype = MIMETYPE_EXTENSIONS[extension]

            score, handler = cls.get_best_handler(mimetype)

            if handler:
                try:
                    return handler(attachment.get_review_request(), attachment)
                except ObjectDoesNotExist as e:
                    logging.error('Unable to load review UI for %s: %s',
                                  attachment, e)
                except Exception as e:
                    logging.error(
                        'Error instantiating '
                        'FileAttachmentReviewUI %r: %s', handler, e)

        return None
Example #36
    def for_type(cls, attachment):
        """Returns the handler that is the best fit for provided mimetype."""
        mimetype = mimeparse.parse_mime_type(attachment.mimetype)

        # Override the mimetype if mimeparse is known to misinterpret this
        # type of file as `octet-stream`
        extension = os.path.splitext(attachment.filename)[1]

        if extension in MIMETYPE_EXTENSIONS:
            mimetype = MIMETYPE_EXTENSIONS[extension]

        score, handler = cls.get_best_handler(mimetype)

        if handler:
            try:
                return handler(attachment, mimetype)
            except Exception, e:
                logging.error('Unable to load Mimetype Handler for %s: %s',
                              attachment,
                              e,
                              exc_info=1)
Example #37
    def create_text(self, komtext):
        misc_info = kom.CookedMiscInfo()

        if komtext.recipient_list is not None:
            for rec in komtext.recipient_list:
                if rec is not None:
                    misc_info.recipient_list.append(rec)

        if komtext.comment_to_list is not None:
            for ct in komtext.comment_to_list:
                if ct is not None:
                    misc_info.comment_to_list.append(ct)

        print misc_info.to_string()

        mime_type = mimeparse.parse_mime_type(komtext.content_type)
        # Because a text consists of both a subject and body, and you
        # can have a text subject in combination with an image, a
        # charset is needed to specify the encoding of the subject.
        mime_type[2]['charset'] = 'utf-8'
        content_type = mime_type_tuple_to_str(mime_type)

        # TODO: how would this work with images?
        fulltext = str()
        fulltext += komtext.subject.encode('utf-8') + "\n"
        if (mime_type[0] == 'text'):
            fulltext += komtext.body.encode('utf-8')
        else:
            fulltext += komtext.body

        aux_items = []
        aux_items.append(
            kom.AuxItem(kom.AI_CREATING_SOFTWARE,
                        data="%s %s" %
                        (self.client_name, self.client_version)))
        aux_items.append(kom.AuxItem(kom.AI_CONTENT_TYPE, data=content_type))

        text_no = kom.ReqCreateText(self.conn, fulltext, misc_info,
                                    aux_items).response()
        return text_no
Example #38
    def for_type(cls, attachment):
        """Return the handler that is the best fit for provided mimetype.

        Args:
            attachment (reviewboard.attachments.models.FileAttachment):
                The file attachment to find the best handler for.

        Returns:
            MimetypeHandler:
            The best mimetype handler for the attachment, or ``None`` if
            one could not be found.
        """
        if not attachment.mimetype:
            return None

        try:
            mimetype = mimeparse.parse_mime_type(attachment.mimetype)
        except:
            logging.warning('Unable to parse MIME type "%s" for %s',
                            attachment.mimetype, attachment)
            mimetype = ('application', 'octet-stream', {})

        # Override the mimetype if mimeparse is known to misinterpret this
        # type of file as `octet-stream`
        extension = os.path.splitext(attachment.filename)[1]

        if extension in MIMETYPE_EXTENSIONS:
            mimetype = MIMETYPE_EXTENSIONS[extension]

        score, handler = cls.get_best_handler(mimetype)

        if handler:
            try:
                return handler(attachment, mimetype)
            except Exception as e:
                logging.error('Unable to load Mimetype Handler for %s: %s',
                              attachment, e)

        return MimetypeHandler(attachment, mimetype)
Example #39
    def export_submission(self,
                          import_urls: list,
                          video: bool = False,
                          **import_info) -> dict:
        """Check if something reported as a video is a raw video, then
        post the direct link if it is.

        This function will define the following values in the export data:
        - link_display

        :param import_urls: A set (of one?) of links to videos.
        :param video: Whether the imported data is a video or not.
        :param import_info: Other importing information passed. Ignored.
        :return: None if no export, an export info dictionary otherwise.
        """
        if not video:
            return None
        self.log.debug('Attempting to upload raw video URL.')
        links = []
        for url in import_urls:
            req = requests.head(url, headers=self.headers)
            if not req.ok:
                self.log.debug('URL %s was not valid.', url)
                continue
            try:
                mime_text = req.headers.get('Content-Type')
                mime = mimeparse.parse_mime_type(mime_text)
            except Exception:
                self.log.debug('Error parsing MIME for URL %s', url)
                continue
            if mime[0] != 'video':
                self.log.debug('URL %s is not a video!', url)
                continue
            links.append('[Direct video](%s)  \n' % url)
        if not links:
            self.log.info('No direct video links found!')
            return None
        return {'link_display': ''.join(links)}
Example #40
def mime_object_maker(url, mimetype):
    """
    Return a data object suitable for the given mimetype.
    This will either return an astropy FITS object, a pyvo DALResults object,
    a PIL object for conventional images, or a string for text content.

    Parameters
    ----------
    url : str
        the object download url
    mimetype : str
        the content mimetype
    """
    mimetype = mimeparse.parse_mime_type(mimetype)

    if mimetype[0] == 'text':
        return s.get(url).text

    if mimetype[1] == 'fits' or mimetype[1] == 'x-fits':
        r = s.get(url)
        return HDUList.fromstring(r.content)

    if mimetype[0] == 'image':
        from PIL import Image
        from io import BytesIO
        r = s.get(url)
        b = BytesIO(r.content)
        return Image.open(b)

    if mimetype[1] == 'x-votable' or mimetype[1] == 'x-votable+xml':
        # As soon as there are some kind of recursive data structures,
        # things start to get messy
        if mimetype[2].get('content', None) == 'datalink':
            from .adhoc import DatalinkResults
            return DatalinkResults.from_result_url(url)
        else:
            from .query import DALResults
            return DALResults.from_result_url(url)
Example #41
 def create_text(self, komtext):
     misc_info = kom.CookedMiscInfo()
     
     if komtext.recipient_list is not None:
         for rec in komtext.recipient_list:
             if rec is not None:
                 misc_info.recipient_list.append(rec)
     
     if komtext.comment_to_list is not None:
         for ct in komtext.comment_to_list:
             if ct is not None:
                 misc_info.comment_to_list.append(ct)
     
     print misc_info.to_string()
     
     mime_type = mimeparse.parse_mime_type(komtext.content_type)
     # Because a text consists of both a subject and body, and you
     # can have a text subject in combination with an image, a
     # charset is needed to specify the encoding of the subject.
     mime_type[2]['charset'] = 'utf-8'
     content_type = mime_type_tuple_to_str(mime_type)
     
     # TODO: how would this work with images?
     fulltext = str()
     fulltext += komtext.subject.encode('utf-8') + "\n"
     if (mime_type[0] == 'text'):
         fulltext += komtext.body.encode('utf-8')
     else:
         fulltext += komtext.body
     
     aux_items = []
     aux_items.append(kom.AuxItem(kom.AI_CREATING_SOFTWARE,
                                  data="%s %s" % (self.client_name, self.client_version)))
     aux_items.append(kom.AuxItem(kom.AI_CONTENT_TYPE,
                                  data=content_type))
     
     text_no = kom.ReqCreateText(self.conn, fulltext, misc_info, aux_items).response()
     return text_no
Example #42
def parseType(fmt):
    if (fmt == "null"):
        fmt = "application/x-unknown"
    if (fmt == "text"):
        fmt = "text/plain"
    if fmt.find("/") == -1:
        return (fmt.lower(), '', {})
    # Attempt to parse:
    try:
        (type, subtype, params) = mimeparse.parse_mime_type(fmt)
        return (type.lower(), subtype.lower(), params)
    except:
        print "ERROR: Could not fully parse: " + fmt

    try:
        fmt_matcher = re.compile(r'([a-z0-9\+\.]+)\/([a-z0-9\+\.]+)')
        fmt_match = fmt_matcher.match(fmt.lower())
        (type, subtype) = fmt_match.groups()
        return (type, subtype, {})
    except:
        print "ERROR: Could not partially parse: " + fmt

    return ("application", 'x-malformed-mimetype', {})
Example #43
 def status(self, test_id=None, test_status=None, test_tags=None,
            runnable=True, file_name=None, file_bytes=None, eof=False,
            mime_type=None, route_code=None, timestamp=None):
     super(Starts, self).status(
         test_id, test_status,
         test_tags=test_tags, runnable=runnable, file_name=file_name,
         file_bytes=file_bytes, eof=eof, mime_type=mime_type,
         route_code=route_code, timestamp=timestamp)
     if not test_id:
         if not file_bytes:
             return
         if not mime_type or mime_type == 'test/plain;charset=utf8':
             mime_type = 'text/plain; charset=utf-8'
         primary, sub, parameters = mimeparse.parse_mime_type(mime_type)
         content_type = testtools.content_type.ContentType(
             primary, sub, parameters)
         content = testtools.content.Content(
             content_type, lambda: [file_bytes])
         text = content.as_text()
         if text and text[-1] not in '\r\n':
             self._neednewline = True
         self._output.write(text)
     elif test_status == 'inprogress' and test_id not in self._emitted:
         if self._neednewline:
             self._neednewline = False
             self._output.write('\n')
         worker = ''
         for tag in test_tags or ():
             if tag.startswith('worker-'):
                 worker = '(' + tag[7:] + ') '
         if timestamp:
             timestr = timestamp.isoformat()
         else:
             timestr = ''
         self._output.write('%s: %s%s [start]\n' %
                            (timestr, worker, test_id))
         self._emitted.add(test_id)
Example #44
    def require_representation(self, req):
        """Require raw representation dictionary from falcon request object.

        This does not perform any field parsing or validation but only uses
        allowed content-encoding handler to decode content body.

        Note:
            Currently only JSON is allowed as content type.

        Args:
            req (falcon.Request): request object

        Returns:
            dict: raw dictionary of representation supplied in request body

        """
        try:
            type_, subtype, _ = parse_mime_type(req.content_type)
            content_type = '/'.join((type_, subtype))
        except:
            raise falcon.HTTPUnsupportedMediaType(
                description="Invalid Content-Type header: {}".format(
                    req.content_type))

        if content_type == 'application/json':
            body = req.stream.read()
            try:
                res = json.loads(body.decode('utf-8'))
            except json.decoder.JSONDecodeError:
                raise falcon.HTTPBadRequest(
                    title="Bad Request",
                    description="Body is not a valid json.")
            return res
        else:
            raise falcon.HTTPUnsupportedMediaType(
                description="only JSON supported, got: {}".format(
                    content_type))
Example #45
def download_file(url, allowed_content_types=None):  # noqa: C901
    logger.debug(f"download_file({url}, {allowed_content_types})")
    try:
        URLValidator()(url)
    except ValidationError:
        raise InvalidUrl('Invalid url address: %s' % url)

    filename, format = None, None

    supported_content_types = allowed_content_types or [
        ct[1] for ct in settings.SUPPORTED_CONTENT_TYPES
    ]

    r = requests.get(url,
                     stream=True,
                     allow_redirects=True,
                     verify=False,
                     timeout=180)

    if r.status_code != 200:
        raise InvalidResponseCode('Invalid response code: %s' % r.status_code)

    family, content_type, options = parse_mime_type(
        r.headers.get('Content-Type'))
    logger.debug(f'  Content-Type: {family}/{content_type};{options}')

    if content_type not in ('octet-stream', 'octetstream'
                            ) and content_type not in supported_content_types:
        raise InvalidContentType('Unsupported type: %s' %
                                 r.headers.get('Content-Type'))

    resource_type = _get_resource_type(r)
    logger.debug(f'  resource_type: {resource_type}')

    if resource_type == 'file':
        content_disposition = r.headers.get('Content-Disposition', None)
        logger.debug(f'  content_disposition: {content_disposition}')
        if content_disposition:
            # Get filename from header
            res = re.findall("filename=(.+)", content_disposition)
            filename = res[0][:100] if res else None
            logger.debug(f'  filename: {filename}')
            if filename:
                filename = filename.replace('"', '')
                format = filename.split('.')[-1]
                logger.debug(f'  filename: {filename}, format: {format}')

        if not filename:
            name, format = filename_from_url(url, content_type)
            filename = '.'.join([name, format])
            logger.debug(
                f'  filename: {filename}, format: {format} - from url')

        filename = filename.strip('.')

        if content_type in ('octet-stream', 'octetstream'):
            family, content_type = content_type_from_file_format(format)
            logger.debug(f'  {family}/{content_type} - from file format')

        format = file_format_from_content_type(content_type,
                                               family=family,
                                               extension=format)
        logger.debug(f'  format:{format} - from content type')

        content = BytesIO(r.content)
        return resource_type, {
            'filename': filename,
            'format': format,
            'content': content
        }
    else:
        format = file_format_from_content_type(content_type, family)
        logger.debug(f'  format: {format} - from content type')
        if resource_type == 'api':
            return resource_type, {'format': format}
        else:
            if r.url != url:
                if r.history and r.history[-1].status_code == 301:
                    raise InvalidResponseCode(
                        'Resource location has been moved!')
            return resource_type, {'format': format}
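A hypothetical call of the helper above; the URL is purely illustrative, and the allowed_content_types whitelist is matched against the subtype that parse_mime_type extracts from the response header:
# Hypothetical usage sketch; 'https://example.com/data.csv' is only an
# illustrative URL and 'csv' is compared against the parsed subtype.
resource_type, info = download_file('https://example.com/data.csv',
                                    allowed_content_types=['csv'])
print(resource_type, info.get('filename'), info.get('format'))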
Example #46
0
    def import_submission(self, submission: praw.objects.Submission) -> dict:
        """Import a submission from deviantArt. Ignores flash content.

        Uses a combination of the DA backend and HTML scraping.

        This function will define the following values in its return data:
        - author: The author of the image.
        - source: The submission URL.
        - importer_display/header
        - import_urls


        :param submission: A reddit submission to parse.
        :return: None if no import, an import info dictionary otherwise.
        """
        try:
            if self.regex_direct.match(urlsplit(submission.url).netloc):
                r = requests.head(submission.url, headers=self.headers)
                mime_text = r.headers.get('Content-Type')
                mime = mimeparse.parse_mime_type(mime_text)
                if mime[0] == 'image':
                    self.log.debug('DA link is a direct image')
                    data = {'author': 'An unknown DA author',
                            'source': submission.url,
                            'import_urls': [submission.url],
                            'importer_display':
                                {'header': 'Mirrored deviantArt image '
                                           'by an unknown author:\n\n'}}
                    return data
            if not self.regex.match(urlsplit(submission.url).netloc):
                return None
            query_url = 'http://backend.deviantart.com/oembed?{}'.format(
                urlencode({'format': 'json', 'url': submission.url}))
            self.log.debug('%s is valid DA url.', submission.url)
            self.log.debug('Querying DA API %s', query_url)

            response = json.loads(self.read_url(query_url))

            if response['type'] not in ('link', 'photo'):
                self.log.debug('Response is not link or photo')
                return None
            self.log.debug('Author name: %s', response['author_name'])

            # Using the official DA API
            data = {'author': response['author_name'],
                    'source': submission.url,
                    'importer_display':
                        {'header': 'Mirrored deviantArt image by the author "{}":\n\n'.format(
                            response['author_name'])}}
            if response['type'] == 'link':
                data['import_urls'] = [response['fullsize_url']]
                self.log.debug('Found DA API url %s', data['import_urls'])

            try:
                # Trying to scrape manually
                bs = BeautifulSoup(self.read_url(submission.url))

                # Checking for flash animation, because mirroring a preview
                # for a flash animation is stupid
                is_flash = bool(bs.select('iframe[class~=flashtime]'))
                is_madefire = bool(bs.select('iframe[class~=madefire-player]'))
                if is_flash or is_madefire:
                    self.log.info('DA url is flash, no preview needed.')
                    return None
                # Seems to alternate between the two
                full_view = (bs.select('img[class~=fullview]') or
                             bs.select('img[class~=dev-content-full]'))
                if full_view:
                    full_url = full_view[0]['src']
                    self.log.debug('Found full DA image url: %s', full_url)
                    data['import_urls'] = [full_url]
            except Exception as e:
                self.log.error(traceback.format_exc())

            if 'import_urls' not in data:
                self.log.debug('No url found for DA image.')
                return None

            return data

        except Exception as e:
            self.log.error('Deviantart Error: %s', traceback.format_exc())
            return None
Example #47
0
def test_parse_mime_type(args, expected):
    expected = tuple(expected)
    result = mimeparse.parse_mime_type(args)
    message = "Expected: '%s' but got %s" % (expected, result)
    assert expected == result, message
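The test above clearly expects to be driven by fixture data that is not shown here; a minimal pytest parametrization that would satisfy it might look like this (the two cases are only illustrative):
import pytest
import mimeparse

# Illustrative cases only; the real test data is not part of the example above.
@pytest.mark.parametrize("args, expected", [
    ("text/html;charset=utf-8", ("text", "html", {"charset": "utf-8"})),
    ("application/xml;q=0.5", ("application", "xml", {"q": "0.5"})),
])
def test_parse_mime_type(args, expected):
    result = mimeparse.parse_mime_type(args)
    assert tuple(expected) == result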
Example #48
0
    def export_submission(self,
                          import_urls: list,
                          author: str = 'an Unknown Author',
                          source: str = 'an Unknown Source',
                          video: bool = False,
                          **import_info) -> dict:
        """Upload one or multiple images to Imgur. Cannot support videos.

        Uses the imgurpython library.

        This function will define the following values in the export data:
        - exporter
        - link_display

        :param import_urls: A set of direct links to images to upload.
        :param author: The author to note in the description.
        :param source: The source to note in the description.
        :param video: Whether the imported data is a video or not.
        :param import_info: Other importing information passed. Ignored.
        :return: None if no export, an export info dictionary otherwise.
        """
        if not self.client:
            return None
        # imgur does not support videos.
        if video:
            return None
        description = ('This is a mirror uploaded by /u/%s, '
                       'originally made by %s, located at %s' %
                       (self.username, author, source))
        results = {'exporter': self.__class__.__name__}
        config = {}
        album = {}

        # Should we do an album?
        if len(import_urls) == 0:
            self.log.warning('An import gave no urls.')
            return None
        elif len(import_urls) == 1:
            self.log.debug('A single image will be uploaded.')
            is_album = False
            config['description'] = description
        else:
            self.log.debug('An album will be uploaded.')
            try:
                album = self.client.create_album({'description': description})
            except ImgurClientRateLimitError:
                self.log.error('Ran into imgur rate limit! %s',
                               self.client.credits)
                return None
            except Exception:
                self.log.error('Could not create album! %s',
                               traceback.format_exc())
                return None
            config['album'] = album['deletehash']
            is_album = True

        try:
            # Try to upload each image given.
            images = []
            for import_url in import_urls:
                self.log.debug('Uploading URL "%s" to imgur', import_url)
                image = self.client.upload_from_url(import_url, config)
                self.log.debug('Uploaded image: %s', str(image))
                images.append(image)
            if is_album:
                results['link_display'] = (
                    '[Imgur Album](https://imgur.com/a/%s)  \n' % album['id'])
            else:
                picture_url = images[0]['link'].replace('http', 'https')
                r = requests.head(picture_url)
                mime_text = r.headers.get('Content-Type')
                mime = mimeparse.parse_mime_type(mime_text)
                if mime[1] == 'gif':
                    picture_url = re.sub(r'(\.\w+)?$', '.gifv', picture_url)
                results['link_display'] = '[Imgur](%s)  \n' % picture_url

        except ImgurClientRateLimitError:
            self.log.error('Ran into imgur rate limit! %s',
                           self.client.credits)
            return None
        except Exception:
            self.log.error('Broken exception catch %s', traceback.format_exc())
            if is_album:
                self.log.error('Try to delete album!')
                self.delete_export(album['deletehash'])
        return results
Example #49
0
def _get_doc_type(response):
    return mimeparse.parse_mime_type(response.headers["Content-Type"])[1]
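A quick check of what this helper yields, using a stand-in object rather than a live HTTP response (the class below is not part of the original project):
import mimeparse

# Stand-in response object, not part of the original project.
class _FakeResponse:
    headers = {"Content-Type": "application/json; charset=utf-8"}

# Only the subtype is returned, e.g. 'json' for 'application/json'.
assert _get_doc_type(_FakeResponse()) == "json"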
Example #50
0
    def __search_documents(self, index_name):
        start_time = time.time()

        @after_this_request
        def to_do_after_this_request(response):
            record_log(request, response, logger=self.__http_logger)
            self.__record_metrics(start_time, request, resp)
            return response

        data = {}
        status_code = None

        try:
            query = request.args.get('query', default='', type=str)
            search_field = request.args.get('search_field', default='', type=str)
            page_num = request.args.get('page_num', default=1, type=int)
            page_len = request.args.get('page_len', default=10, type=int)
            weighting = BM25F
            if len(request.data) > 0:
                mime = mimeparse.parse_mime_type(request.headers.get('Content-Type'))
                charset = 'utf-8' if mime[2].get('charset') is None else mime[2].get('charset')
                if mime[1] == 'yaml':
                    weighting = get_multi_weighting(yaml.safe_load(request.data.decode(charset)))
                elif mime[1] == 'json':
                    weighting = get_multi_weighting(json.loads(request.data.decode(charset)))
                else:
                    raise ValueError('unsupported format')

            results_page = self.__indexer.search_documents(index_name, query, search_field, page_num,
                                                           page_len=page_len, weighting=weighting)

            if results_page.pagecount >= page_num or results_page.total <= 0:
                results = {
                    'is_last_page': results_page.is_last_page(),
                    'page_count': results_page.pagecount,
                    'page_len': results_page.pagelen,
                    'page_num': results_page.pagenum,
                    'total': results_page.total,
                    'offset': results_page.offset
                }
                hits = []
                for result in results_page.results[results_page.offset:]:
                    fields = {}
                    for item in result.iteritems():
                        fields[item[0]] = item[1]
                    hit = {
                        'fields': fields,
                        'doc_num': result.docnum,
                        'score': result.score,
                        'rank': result.rank,
                        'pos': result.pos
                    }
                    hits.append(hit)
                results['hits'] = hits

                data['results'] = results
                status_code = HTTPStatus.OK
            else:
                data['error'] = 'page_num must be <= {0}'.format(results_page.pagecount)
                status_code = HTTPStatus.BAD_REQUEST
        except (ConstructorError, JSONDecodeError, ValueError) as ex:
            data['error'] = '{0}'.format(ex.args[0])
            status_code = HTTPStatus.BAD_REQUEST
            self.__logger.error(ex)
        except Exception as ex:
            data['error'] = '{0}'.format(ex.args[0])
            status_code = HTTPStatus.INTERNAL_SERVER_ERROR
            self.__logger.error(ex)
        finally:
            data['time'] = time.time() - start_time
            data['status'] = {'code': status_code.value, 'phrase': status_code.phrase,
                              'description': status_code.description}

        output = request.args.get('output', default='json', type=str).lower()

        # make response
        resp = make_response(data, output)
        resp.status_code = status_code

        return resp
Example #51
0
 def _parse_mime_type(path):
     result = _magic.from_file(path)
     return parse_mime_type(result)
Example #52
0
 def _handler_for(self, mimetype):
     mt = mimeparse.parse_mime_type(mimetype)
     score, handler = MimetypeHandler.get_best_handler(mt)
     return handler
Example #53
0
    def request(self, method, url, query_params=None, headers=None,
                body=None, post_params=None, _preload_content=True,
                _request_timeout=None):
        """Perform requests.

        :param method: http request method
        :param url: http request url
        :param query_params: query parameters in the url
        :param headers: http request headers
        :param body: request json body, for `application/json`
        :param post_params: request post parameters,
                            `application/x-www-form-urlencoded`
                            and `multipart/form-data`
        :param _preload_content: if False, the urllib3.HTTPResponse object will
                                 be returned without reading/decoding response
                                 data. Default is True.
        :param _request_timeout: timeout setting for this request. If one
                                 number provided, it will be total request
                                 timeout. It can also be a pair (tuple) of
                                 (connection, read) timeouts.
        """
        method = method.upper()
        assert method in ['GET', 'HEAD', 'DELETE', 'POST', 'PUT',
                          'PATCH', 'OPTIONS']

        if post_params and body:
            raise ValueError(
                "body parameter cannot be used with post_params parameter."
            )

        post_params = post_params or {}
        headers = headers or {}

        timeout = None
        if _request_timeout:
            if isinstance(_request_timeout, (int, ) if six.PY3 else (int, long)):  # noqa: E501,F821
                timeout = urllib3.Timeout(total=_request_timeout)
            elif (isinstance(_request_timeout, tuple) and
                  len(_request_timeout) == 2):
                timeout = urllib3.Timeout(
                    connect=_request_timeout[0], read=_request_timeout[1])

        if 'Content-Type' not in headers:
            headers['Content-Type'] = 'application/json'

        try:
            # For `POST`, `PUT`, `PATCH`, `OPTIONS`, `DELETE`
            if method in ['POST', 'PUT', 'PATCH', 'OPTIONS', 'DELETE']:
                if query_params:
                    url += '?' + urlencode(query_params)
                if re.search('json', headers['Content-Type'], re.IGNORECASE):
                    request_body = '{}'
                    if body is not None:
                        request_body = json.dumps(body)
                    r = self.pool_manager.request(
                        method, url,
                        body=request_body,
                        preload_content=_preload_content,
                        timeout=timeout,
                        headers=headers)
                elif headers['Content-Type'] == 'application/x-www-form-urlencoded':  # noqa: E501
                    r = self.pool_manager.request(
                        method, url,
                        fields=post_params,
                        encode_multipart=False,
                        preload_content=_preload_content,
                        timeout=timeout,
                        headers=headers)
                elif headers['Content-Type'] == 'multipart/form-data':
                    # must del headers['Content-Type'], or the correct
                    # Content-Type which generated by urllib3 will be
                    # overwritten.
                    del headers['Content-Type']
                    r = self.pool_manager.request(
                        method, url,
                        fields=post_params,
                        encode_multipart=True,
                        preload_content=_preload_content,
                        timeout=timeout,
                        headers=headers)
                # Pass a `string` parameter directly in the body to support
                # other content types than Json when `body` argument is
                # provided in serialized form
                elif isinstance(body, str):
                    request_body = body
                    r = self.pool_manager.request(
                        method, url,
                        body=request_body,
                        preload_content=_preload_content,
                        timeout=timeout,
                        headers=headers)
                else:
                    # Cannot generate the request from given parameters
                    msg = """Cannot prepare a request message for provided
                             arguments. Please check that your arguments match
                             declared content type."""
                    raise ApiException(status=0, reason=msg)
            # For `GET`, `HEAD`
            else:
                r = self.pool_manager.request(method, url,
                                              fields=query_params,
                                              preload_content=_preload_content,
                                              timeout=timeout,
                                              headers=headers)
        except urllib3.exceptions.SSLError as e:
            msg = "{0}\n{1}".format(type(e).__name__, str(e))
            raise ApiException(status=0, reason=msg)

        if _preload_content:
            r = RESTResponse(r)

            # In the python 3, the response.data is bytes.
            # we need to decode it to string.
            if six.PY3:
                if ('Content-Type' not in r.getheaders() or
                        mimeparse.parse_mime_type(
                            r.getheader('Content-Type'))[1].upper() in ['PDF']):
                    # when a file is returned, 'Content-Type' does not exist or is one of ['PDF',]
                    pass
                else:
                    # when a json is returned, 'Content-Type' exists
                    r.data = r.data.decode('utf8')
                    pass
            # log response body
            logger.debug("response body: %s", r.data)

        if not 200 <= r.status <= 299:
            raise ApiException(http_resp=r)

        return r
Example #54
0
 def negotiated_mime_type(self):
     parsed_mime_type = mimeparse.parse_mime_type(
         self._negotiate_content_type())
     return '%s/%s' % (parsed_mime_type[0], parsed_mime_type[1])
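Any parameters in the negotiated value end up in the third tuple element and are dropped when the type and subtype are re-joined; a small standalone illustration:
import mimeparse

# 'charset' and 'q' style parameters are discarded by the join above.
parsed = mimeparse.parse_mime_type('text/html; charset=utf-8')
assert '%s/%s' % (parsed[0], parsed[1]) == 'text/html'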
Example #55
0
 def assert_score(pattern, test, score):
     self.assertAlmostEqual(
         score_match(mimeparse.parse_mime_type(pattern),
                     mimeparse.parse_mime_type(test)), score)
Example #56
0
    def __put(self, key=''):
        start_time = time.time()

        @after_this_request
        def to_do_after_this_request(response):
            record_log(request, response, logger=self.__http_logger)
            self.__record_metrics(start_time, request, response)
            return response

        data = {}
        status_code = None

        try:
            mime = mimeparse.parse_mime_type(
                request.headers.get('Content-Type'))
            charset = 'utf-8' if mime[2].get(
                'charset') is None else mime[2].get('charset')
            if mime[1] == 'yaml':
                value = yaml.safe_load(request.data.decode(charset))
            elif mime[1] == 'json':
                value = json.loads(request.data.decode(charset))
            else:
                # handle as a string
                value = request.data.decode(charset)

            sync = False
            if request.args.get('sync', default='',
                                type=str).lower() in TRUE_STRINGS:
                sync = True

            self.__manager.put(key if key.startswith('/') else '/' + key,
                               value,
                               sync=sync)

            if sync:
                status_code = HTTPStatus.CREATED
            else:
                status_code = HTTPStatus.ACCEPTED
        except (ConstructorError, JSONDecodeError, ValueError) as ex:
            data['error'] = '{0}'.format(ex.args[0])
            status_code = HTTPStatus.BAD_REQUEST
            self.__logger.error(ex)
        except Exception as ex:
            data['error'] = '{0}'.format(ex.args[0])
            status_code = HTTPStatus.INTERNAL_SERVER_ERROR
            self.__logger.error(ex)
        finally:
            data['time'] = time.time() - start_time
            data['status'] = {
                'code': status_code.value,
                'phrase': status_code.phrase,
                'description': status_code.description
            }

        output = request.args.get('output', default='json', type=str).lower()

        # make response
        resp = make_response(data, output)
        resp.status_code = status_code

        return resp
Example #57
0
 def for_type(cls, attachment):
     """Returns the handler that is the best fit for provided mimetype."""
     mimetype = mimeparse.parse_mime_type(attachment.mimetype)
     score, handler = cls.get_best_handler(mimetype)
     return handler(attachment, mimetype)
Example #58
0
def get_file_info(path):
    _magic = magic.Magic(mime=True, mime_encoding=True)
    result = _magic.from_file(path)
    return parse_mime_type(result)
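With mime=True and mime_encoding=True, libmagic reports values such as 'text/plain; charset=us-ascii', which parse_mime_type then splits into a (type, subtype, params) tuple. A usage sketch (the path is only an example, and the exact result depends on the file and the libmagic build):
# Example path and output only; results vary with the file and libmagic build.
mime_type, mime_subtype, params = get_file_info('/etc/hosts')
print(mime_type, mime_subtype, params)  # e.g. text plain {'charset': 'us-ascii'}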