def get_media(img_url, post_id):
    """Download the image/video a Reddit post links to.

    Parameters:
        img_url: direct media URL; i.imgur.com, i.redd.it,
                 i.reddituploads.com and gfycat.com links are supported.
        post_id: Reddit post ID, used only in log output.

    Returns:
        The local path of the saved file, or '' when the URL is
        unsupported (GIFV links, non-media links).
    """
    if any(s in img_url for s in ('i.imgur.com', 'i.redd.it', 'i.reddituploads.com')):
        file_name = os.path.basename(urllib.parse.urlsplit(img_url).path)
        file_extension = os.path.splitext(img_url)[-1].lower()
        # i.reddituploads.com links carry no file extension in the URL;
        # they serve JPEGs, so patch one on
        if not file_extension:
            file_extension = '.jpg'
            file_name += '.jpg'
            img_url += '.jpg'
        # GIFV can't be fetched without the Imgur API, so bail out before
        # the log claims a download is happening
        if 'gifv' in img_url:
            print('[BOT] GIFV files are not supported yet')
            return ''
        file_path = IMAGE_DIR + '/' + file_name
        print('[BOT] Downloading file at URL ' + img_url + ' to ' + file_path +
              ', file type identified as ' + file_extension)
        return save_file(img_url, file_path)
    elif 'gfycat.com' in img_url:
        # Gfycat hosts video; resolve the MP4 rendition through its API.
        # (Twitter supports video uploads, but Tweepy hasn't caught up yet.)
        gfycat_name = os.path.basename(urllib.parse.urlsplit(img_url).path)
        client = GfycatClient()
        gfycat_url = client.query_gfy(gfycat_name)['gfyItem']['mp4Url']
        file_path = IMAGE_DIR + '/' + gfycat_name + '.mp4'
        print('[BOT] Downloading Gfycat at URL ' + gfycat_url + ' to ' + file_path)
        return save_file(gfycat_url, file_path)
    else:
        print('[BOT] Post', post_id, 'doesn\'t point to an image/video:', img_url)
        return ''
def get_media(img_url, post_id):
    """Download the image/video a Reddit post links to.

    Parameters:
        img_url: media URL; imgur.com (including indirect album/page links),
                 i.redd.it, i.reddituploads.com and gfycat.com are supported.
        post_id: Reddit post ID, used only in log output.

    Returns:
        The local path of the saved file, or '' when the URL is
        unsupported (GIFV links, non-media links).
    """
    # 'imgur.com' also matches 'i.imgur.com'; checking only 'i.imgur.com'
    # here made the indirect-imgur resolution branch below unreachable
    if any(s in img_url for s in ('imgur.com', 'i.redd.it', 'i.reddituploads.com')):
        # Indirect imgur page links (albums/galleries) need to be resolved
        # to a direct image URL first
        if ('i.imgur.com' not in img_url) and ('imgur.com' in img_url):
            print('[bot] Attempting to retrieve image URL for', img_url, 'from imgur...')
            regex = r"(https?:\/\/imgur\.com\/?a?\/(.*?)(?:\/.*|$))"
            m = re.search(regex, img_url, flags=0)
            if m is not None:  # re.search returns None when nothing matches
                print(m.group(0))
            img_url = imgur.get_image(img_url)
        file_name = os.path.basename(urllib.parse.urlsplit(img_url).path)
        file_extension = os.path.splitext(img_url)[-1].lower()
        # i.reddituploads.com links carry no file extension in the URL;
        # they serve JPEGs, so patch one on
        if not file_extension:
            file_extension = '.jpg'
            file_name += '.jpg'
            img_url += '.jpg'
        # GIFV can't be fetched without the Imgur API, so bail out before
        # the log claims a download is happening
        if 'gifv' in img_url:
            print('[WARN] GIFV files are not supported yet')
            return ''
        file_path = IMAGE_DIR + '/' + file_name
        print('[ OK ] Downloading file at URL ' + img_url + ' to ' + file_path +
              ', file type identified as ' + file_extension)
        return save_file(img_url, file_path)
    elif 'gfycat.com' in img_url:
        # Gfycat hosts video; resolve the MP4 rendition through its API.
        # (Twitter supports video uploads, but Tweepy hasn't caught up yet.)
        gfycat_name = os.path.basename(urllib.parse.urlsplit(img_url).path)
        client = GfycatClient()
        gfycat_url = client.query_gfy(gfycat_name)['gfyItem']['mp4Url']
        file_path = IMAGE_DIR + '/' + gfycat_name + '.mp4'
        print('[ OK ] Downloading Gfycat at URL ' + gfycat_url + ' to ' + file_path)
        return save_file(gfycat_url, file_path)
    else:
        print('[WARN] Post', post_id, 'doesn\'t point to an image/video:', img_url)
        return ''
def get_url(submission, mp4_instead_gif=True):
    """Classify a Reddit submission's media and return (TYPE, URL, EXTENSION).

    E.g. ('img', 'http://example.com/pic.png', 'png').  For multi-image
    imgur albums the URL slot instead holds a dict mapping 1-based image
    numbers to {'url', 'what', 'ext'} entries, and TYPE is TYPE_ALBUM.

    Parameters:
        submission: a praw submission object (uses .url and .is_self).
        mp4_instead_gif: prefer an MP4 rendition over the GIF when available.
    """
    def what_is_inside(url):
        # Cheap HEAD request; empty string when the server sends no Content-Type
        header = requests.head(url).headers
        return header.get('Content-Type', '')

    url = submission.url
    url_content = what_is_inside(url)

    if url_content in (CONTENT_JPEG, CONTENT_PNG):
        return TYPE_IMG, url, url_content.split('/')[1]

    if CONTENT_GIF in url_content:
        if url.endswith('.gif') and mp4_instead_gif:
            # Many hosts serve an MP4 sibling of the GIF; use it when it exists
            url_mp4 = url[:-4] + '.mp4'
            if what_is_inside(url_mp4) == CONTENT_MP4:
                return TYPE_GIF, url_mp4, 'mp4'
        return TYPE_GIF, url, 'gif'

    if url.endswith('.gifv'):
        if mp4_instead_gif:
            url_mp4 = url[:-5] + '.mp4'
            if what_is_inside(url_mp4) == CONTENT_MP4:
                return TYPE_GIF, url_mp4, 'mp4'
        # Dropping the trailing 'v' sometimes yields a plain GIF URL
        if CONTENT_GIF in what_is_inside(url[:-1]):
            return TYPE_GIF, url[:-1], 'gif'
        # Otherwise fall through to the host-specific handling below

    if submission.is_self is True:
        # Self post: text only, no media
        return TYPE_TEXT, None, None

    if urlparse(url).netloc == 'imgur.com':
        # safe_load avoids arbitrary-object construction from the config
        # file, and 'with' closes the handle the old code leaked
        with open(os.path.join('configs', 'imgur.yml')) as config_file:
            imgur_config = yaml.safe_load(config_file)
        imgur_client = ImgurClient(imgur_config['client_id'], imgur_config['client_secret'])
        path_parts = urlparse(url).path.split('/')
        if path_parts[1] == 'gallery':
            # TODO: gallery handling
            return TYPE_OTHER, url, None
        if path_parts[1] == 'topic':
            # TODO: topic handling
            return TYPE_OTHER, url, None
        if path_parts[1] == 'a':
            # An imgur album: collect every image, numbered from 1
            album = imgur_client.get_album(path_parts[2])
            story = dict()
            for number, img in enumerate(album.images, start=1):
                what = TYPE_IMG
                link = img['link']
                ext = img['type'].split('/')[1]
                if img['animated']:
                    what = TYPE_GIF
                    # img['gifv'] ends in '.gifv'; trimming the 'v' gives the GIF URL
                    link = img['mp4'] if mp4_instead_gif else img['gifv'][:-1]
                    ext = 'mp4' if mp4_instead_gif else 'gif'
                story[number] = {'url': link, 'what': what, 'ext': ext}
            if len(story) == 1:
                # Single-image album behaves like a plain image post
                return story[1]['what'], story[1]['url'], story[1]['ext']
            return TYPE_ALBUM, story, None
        # A bare imgur image page; strip any extension from the image ID
        img = imgur_client.get_image(path_parts[1].split('.')[0])
        if not img.animated:
            return TYPE_IMG, img.link, img.type.split('/')[1]
        if mp4_instead_gif:
            return TYPE_GIF, img.mp4, 'mp4'
        return TYPE_GIF, img.gifv[:-1], 'gif'

    if 'gfycat.com' in urlparse(url).netloc:
        client = GfycatClient()
        rname = re.findall(r'gfycat.com\/(?:detail\/)?(\w*)', url)[0]
        try:
            urls = client.query_gfy(rname)['gfyItem']
            if mp4_instead_gif:
                return TYPE_GIF, urls['mp4Url'], 'mp4'
            return TYPE_GIF, urls['max5mbGif'], 'gif'
        except KeyError:
            # API response lacked the expected rendition; treat as a plain link
            logging.info('Gfy fail prevented!')
            return TYPE_OTHER, url, None

    return TYPE_OTHER, url, None
except: print(" \_ Unable to get {}".format(entry.url)) ignore.add(path) continue hasext = os.path.splitext(path) if not hasext[1]: ext = os.path.splitext(url_to_get)[1] path += ext print(" \_{}".format(url_to_get)) elif parts.netloc == 'gfycat.com': url_path = parts.path.split('/') try: obj = gfycat.query_gfy(url_path[-1]) url_to_get = obj.get('gfyItem').get('mp4Url') print(" \_{}".format(url_to_get)) except Exception as ex: print(" \_ Unable to get {} : {}".format(entry.url, ex)) ignore.add(path) continue try: request = urllib.request.Request(url_to_get, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'}) remote = urllib.request.urlopen(request) with open(path, 'bw') as f: f.write(remote.read()) urllist.add(entry.url)