Example #1
0
def get_media(img_url, post_id):
	"""Download the media file that a Reddit post links to.

	Args:
		img_url: Direct URL of the linked media (imgur/reddit image hosts,
			or a gfycat.com page).
		post_id: Reddit post ID, used only for log messages.

	Returns:
		The downloaded file path (whatever save_file returns), or '' when
		the URL points to an unsupported media type.
	"""
	if any(s in img_url for s in ('i.imgur.com', 'i.redd.it', 'i.reddituploads.com')):
		# TODO: support non-direct imgur links (galleries/albums) once the
		# Imgur API integration and a matching regex are in place.
		file_name = os.path.basename(urllib.parse.urlsplit(img_url).path)
		# Derive the extension from the parsed path, not the raw URL, so a
		# query string (common on i.reddituploads.com links) can't leak into it.
		file_extension = os.path.splitext(file_name)[-1].lower()
		# Fix for i.reddituploads.com links that carry no file extension
		if not file_extension:
			file_extension = '.jpg'
			file_name += '.jpg'
			img_url += '.jpg'
		file_path = IMAGE_DIR + '/' + file_name
		print('[BOT] Downloading file at URL ' + img_url + ' to ' + file_path + ', file type identified as ' + file_extension)
		if ('gifv' not in img_url): # Can't process GIFV links until Imgur API integration is working
			img = save_file(img_url, file_path)
			return img
		else:
			print('[BOT] GIFV files are not supported yet')
			return ''
	elif ('gfycat.com' in img_url): # Gfycat
		# Twitter supports uploading videos, but Tweepy hasn't updated to support it yet,
		# so fetch the MP4 rendition via the Gfycat API and save it locally.
		gfycat_name = os.path.basename(urllib.parse.urlsplit(img_url).path)
		client = GfycatClient()
		gfycat_info = client.query_gfy(gfycat_name)
		gfycat_url = gfycat_info['gfyItem']['mp4Url']
		file_path = IMAGE_DIR + '/' + gfycat_name + '.mp4'
		print('[BOT] Downloading Gfycat at URL ' + gfycat_url + ' to ' + file_path)
		gfycat_file = save_file(gfycat_url, file_path)
		return gfycat_file
	else:
		print('[BOT] Post', post_id, 'doesn\'t point to an image/video:', img_url)
		return ''
Example #2
0
def get_media(img_url, post_id):
	"""Download the media file that a Reddit post links to.

	Args:
		img_url: URL of the linked media (imgur page or direct image hosts,
			or a gfycat.com page).
		post_id: Reddit post ID, used only for log messages.

	Returns:
		The downloaded file path (whatever save_file returns), or '' when
		the URL points to an unsupported media type.
	"""
	if any(s in img_url for s in ('i.imgur.com', 'i.redd.it', 'i.reddituploads.com')):
		# Resolve non-direct imgur links (including galleries) to a direct
		# image URL via the imgur helper.
		if ('i.imgur.com' not in img_url) and ('imgur.com' in img_url):
			print('[bot] Attempting to retrieve image URL for', img_url, 'from imgur...')
			regex = r"(https?:\/\/imgur\.com\/?a?\/(.*?)(?:\/.*|$))"
			m = re.search(regex, img_url, flags=0)
			# re.search returns None on no match; guard against AttributeError
			if m:
				print(m.group(0))
			img_url = imgur.get_image(img_url)
		file_name = os.path.basename(urllib.parse.urlsplit(img_url).path)
		file_extension = os.path.splitext(img_url)[-1].lower()
		# Fix for issue with i.reddituploads.com links not having a file extension in the URL
		if not file_extension:
			file_extension += '.jpg'
			file_name += '.jpg'
			img_url += '.jpg'
		file_path = IMAGE_DIR + '/' + file_name
		print('[ OK ] Downloading file at URL ' + img_url + ' to ' + file_path + ', file type identified as ' + file_extension)
		if ('gifv' not in img_url): # Can't process GIFV links until Imgur API integration is working
			img = save_file(img_url, file_path)
			return img
		else:
			print('[WARN] GIFV files are not supported yet')
			return ''
	elif ('gfycat.com' in img_url): # Gfycat
		# Twitter supports uploading videos, but Tweepy hasn't updated to support it yet,
		# so fetch the MP4 rendition via the Gfycat API and save it locally.
		gfycat_name = os.path.basename(urllib.parse.urlsplit(img_url).path)
		client = GfycatClient()
		gfycat_info = client.query_gfy(gfycat_name)
		gfycat_url = gfycat_info['gfyItem']['mp4Url']
		file_path = IMAGE_DIR + '/' + gfycat_name + '.mp4'
		print('[ OK ] Downloading Gfycat at URL ' + gfycat_url + ' to ' + file_path)
		gfycat_file = save_file(gfycat_url, file_path)
		return gfycat_file
	else:
		print('[WARN] Post', post_id, 'doesn\'t point to an image/video:', img_url)
		return ''
Example #3
0
def get_url(submission, mp4_instead_gif=True):
    '''
    Classify a Reddit submission's URL and resolve it to downloadable media.

    Args:
        submission: a praw submission object (uses .url and .is_self).
        mp4_instead_gif: prefer an MP4 rendition over GIF when one exists.

    return TYPE, URL, EXTENSION
    E.x.: return 'img', 'http://example.com/pic.png', 'png'
    For TYPE_ALBUM the second element is a dict of numbered items instead
    of a URL; for TYPE_TEXT both URL and EXTENSION are None.
    '''

    def what_is_inside(url):
        # Probe the Content-Type via a HEAD request; '' when absent.
        header = requests.head(url).headers
        if 'Content-Type' in header:
            return header['Content-Type']
        else:
            return ''

    url = submission.url
    url_content = what_is_inside(url)

    if (CONTENT_JPEG == url_content or CONTENT_PNG == url_content):
        return TYPE_IMG, url, url_content.split('/')[1]

    if CONTENT_GIF in url_content:
        if url.endswith('.gif') and mp4_instead_gif:
            # Let's try to find .mp4 file.
            url_mp4 = url[:-4] + '.mp4'
            if CONTENT_MP4 == what_is_inside(url_mp4):
                return TYPE_GIF, url_mp4, 'mp4'
        return TYPE_GIF, url, 'gif'

    if url.endswith('.gifv'):
        if mp4_instead_gif:
            url_mp4 = url[:-5] + '.mp4'
            if CONTENT_MP4 == what_is_inside(url_mp4):
                return TYPE_GIF, url_mp4, 'mp4'
        # Dropping the trailing 'v' sometimes yields the raw .gif
        if CONTENT_GIF in what_is_inside(url[0:-1]):
            return TYPE_GIF, url[0:-1], 'gif'

    if submission.is_self is True:
        # Self submission with text
        return TYPE_TEXT, None, None

    if urlparse(url).netloc == 'imgur.com':
        # Imgur: authenticate from the local config file.
        # safe_load avoids arbitrary-object deserialization, and the
        # context manager closes the handle (the old open().read() leaked it).
        with open(os.path.join('configs', 'imgur.yml')) as config_file:
            imgur_config = yaml.safe_load(config_file)
        imgur_client = ImgurClient(imgur_config['client_id'], imgur_config['client_secret'])
        path_parts = urlparse(url).path.split('/')
        if path_parts[1] == 'gallery':
            # TODO: gallary handling
            return TYPE_OTHER, url, None
        elif path_parts[1] == 'topic':
            # TODO: topic handling
            return TYPE_OTHER, url, None
        elif path_parts[1] == 'a':
            # An imgur album
            album = imgur_client.get_album(path_parts[2])
            story = dict()
            for num, img in enumerate(album.images):
                number = num + 1
                what = TYPE_IMG
                link = img['link']
                ext = img['type'].split('/')[1]
                if img['animated']:
                    what = TYPE_GIF
                    # img['gifv'][:-1] strips the trailing 'v' to get the .gif
                    link = img['mp4'] if mp4_instead_gif else img['gifv'][:-1]
                    ext = 'mp4' if mp4_instead_gif else 'gif'
                story[number] = {
                    'url': link,
                    'what': what,
                    'ext': ext
                }
            # A single-item album is treated as a plain image/gif post
            if len(story) == 1:
                return story[1]['what'], story[1]['url'], story[1]['ext']
            return TYPE_ALBUM, story, None
        else:
            # Just imgur img
            img = imgur_client.get_image(path_parts[1].split('.')[0])
            if not img.animated:
                return TYPE_IMG, img.link, img.type.split('/')[1]
            else:
                if mp4_instead_gif:
                    return TYPE_GIF, img.mp4, 'mp4'
                else:
                    # return 'gif', img.link, 'gif'
                    return TYPE_GIF, img.gifv[:-1], 'gif'
    elif 'gfycat.com' in urlparse(url).netloc:
        client = GfycatClient()
        rname = re.findall(r'gfycat.com\/(?:detail\/)?(\w*)', url)[0]
        try:
            urls = client.query_gfy(rname)['gfyItem']
            if mp4_instead_gif:
                return TYPE_GIF, urls['mp4Url'], 'mp4'
            else:
                return TYPE_GIF, urls['max5mbGif'], 'gif'
        except KeyError:
            # The API response lacked the expected rendition key
            logging.info('Gfy fail prevented!')
            return TYPE_OTHER, url, None
    else:
        return TYPE_OTHER, url, None
Example #4
0
                except:
                    print("   \_ Unable to get {}".format(entry.url))
                    ignore.add(path)
                    continue

                hasext = os.path.splitext(path)
                if not hasext[1]:
                    ext = os.path.splitext(url_to_get)[1]
                    path += ext

                print("   \_{}".format(url_to_get))

            elif parts.netloc == 'gfycat.com':
                url_path = parts.path.split('/')
                try:
                    obj = gfycat.query_gfy(url_path[-1])
                    url_to_get = obj.get('gfyItem').get('mp4Url')
                    print("   \_{}".format(url_to_get))

                except Exception as ex:
                    print("   \_ Unable to get {} : {}".format(entry.url, ex))
                    ignore.add(path)
                    continue

            try:
                request = urllib.request.Request(url_to_get, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'})
                remote = urllib.request.urlopen(request)
                with open(path, 'bw') as f:
                    f.write(remote.read())

                urllist.add(entry.url)