def __init__(self, link, name, template_data):
    """Prepare a media download: remember the target link, the output name
    and the template data used to build the save path."""
    # Target and metadata first; these are plain assignments.
    self.link = link
    self.name = name
    self.template_data = template_data

    self.logger = logging.getLogger(__name__)

    # Install a browser-like User-Agent process-wide so that subsequent
    # urllib calls are not rejected by hosts that block the default UA.
    opener = urllib.request.build_opener()
    opener.addheaders = [(
        'User-Agent',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
    )]
    urllib.request.install_opener(opener)

    # Retry settings come from the config file; path template from Save.
    self.load_config()
    self.saveDir = Save()
    self.direct = ""
def main(parser):
    """Entry point: resolve the subreddit source, configure the output
    template and database, then run the feeder for the requested cycles.

    ``parser`` is the parsed CLI namespace (subreddit, base_dir, template,
    db_location, cycles, wait).
    """
    # subreddit may be a single name or a path to a .txt file listing many
    subR = None
    filepath = None
    if parser.subreddit:
        if '.txt' in parser.subreddit:
            filepath = parser.subreddit
        else:
            subR = parser.subreddit

    # output template
    global save
    save = Save(parser.base_dir, parser.template)
    logger.debug('Output template set to {}'.format(save))

    # initialise database
    global db
    db = DBInterface(parser.db_location)

    if parser.subreddit:
        # Passes subreddits to feeder
        for current_cycle in range(parser.cycles):
            if filepath is not None:
                with open(filepath) as f:
                    for line in f:
                        feeder(line.strip(), parser)
            else:
                feeder(subR, parser)
            # BUGFIX: only sleep *between* cycles. The original slept after
            # the final cycle too, stalling shutdown for parser.wait seconds.
            if current_cycle + 1 < parser.cycles:
                logger.info("Waiting {} seconds".format(parser.wait))
                time.sleep(parser.wait)
class Common:
    """Downloads media that can be fetched directly from its URL.

    ``valid_url`` matches links ending in a known media extension, plus
    i.reddituploads.com links (which carry no extension).
    """

    valid_url = r'((.)+\.(?P<ext>jpg|png|gif|jpeg|bmp|tiff|webp|mp4|mov|mpeg|3gp|mp3|flac|ogg))|(https?://i.reddituploads.com/(.)+)'

    def __init__(self, link, name, template_data):
        """Store the target link/name/template data and set up networking."""
        self.logger = logging.getLogger(__name__)
        # Install a browser-like User-Agent process-wide so urlretrieve is
        # not rejected by hosts that block the default urllib UA.
        opener = urllib.request.build_opener()
        opener.addheaders = [(
            'User-Agent',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
        )]
        urllib.request.install_opener(opener)
        self.link = link
        self.name = name
        self.template_data = template_data
        self.load_config()
        self.saveDir = Save()
        self.direct = ""

    def load_config(self):
        """Read retry count and wait time from config, defaulting to 5
        retries / 60s wait on missing or non-integer values."""
        parser = Parser()
        data = parser.config
        try:
            # BUGFIX: convert and *keep* the int values. The original only
            # checked int() and then stored the raw config values, so string
            # settings later broke the `self.retries > current_retry`
            # comparison in save_image. Also catch KeyError/ValueError so a
            # missing or non-numeric setting falls back instead of crashing.
            self.retries = int(data["media_download"]["retries"])
            self.wait_time = int(data["media_download"]["wait_time"])
        except (KeyError, TypeError, ValueError):
            self.logger.warning(
                "Media download retries or wait time is not an integer."
            )
            self.retries = 5
            self.wait_time = 60

    def save(self):
        """Resolve the file extension and output path, then download.

        Returns True on success, False when the download failed.
        """
        if '.gifv' in self.link:
            # imgur .gifv pages wrap an mp4; fetch the mp4 directly
            ext = 'mp4'
            self.link = self.link.replace('gifv', 'mp4')
        elif 'i.reddituploads.com' in self.link:
            # reddituploads links have no extension; they serve jpeg
            ext = 'jpeg'
        else:
            ext = re.search(self.valid_url, self.link).group('ext')
        self.template_data["ext"] = ext
        self.direct = self.saveDir.get_dir(self.template_data)
        self.logger.debug("Saving {} with extension {}".format(self.link, ext))
        if not self.save_image():
            return False
        return True

    def save_image(self, current_retry=1):
        """Download self.link to self.direct, retrying on network errors.

        Returns True on success, False once retries are exhausted.
        """
        try:
            urlretrieve(self.link, self.direct)
        except (URLError, RemoteDisconnected, ConnectionResetError) as e:
            if self.retries > current_retry:
                self.logger.warning("{}, retrying {}".format(
                    str(e), self.link))
                time.sleep(self.wait_time)
                # BUGFIX: propagate the recursive result. The original
                # discarded it and fell through to `return True`, reporting
                # success even when every retry had failed.
                return self.save_image(current_retry + 1)
            self.logger.error("{}, failed {}".format(str(e), self.link))
            return False
        except Exception as e:
            if self.retries > current_retry:
                self.logger.error("{}, retrying {}".format(str(e), self.link))
                time.sleep(self.wait_time)
                return self.save_image(current_retry + 1)
            self.logger.error("{}, failed {}".format(str(e), self.link))
            return False
        return True

    def get_html(self, headers_param=None):
        """Fetch self.link and return it parsed with BeautifulSoup, or None
        when the request fails.

        ``headers_param`` optionally overrides/extends the default headers.
        """
        # BUGFIX: mutable default argument `{}` replaced with None sentinel.
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36',
        }
        if headers_param:
            headers.update(headers_param)
        req = Request(
            self.link,
            data=None,
            headers=headers,
        )
        try:
            page_html = urlopen(req).read()
            page_html = soup(page_html, "lxml")
        except (HTTPError, URLError) as e:
            page_html = None
            self.logger.error('{} - Link {}'.format(str(e), self.link))
        return page_html

    def format_name(self, title):
        """Strip filesystem-hostile characters and truncate long titles."""
        title = re.sub('[?/|\\\}{:<>*"]', '', title)
        if len(title) > 190:
            # NOTE(review): threshold is 190 but truncation is to 120 —
            # presumably to leave headroom for the rest of the save path;
            # confirm intent before "fixing".
            title = title[:120]
        return title
def routeSubmission(submission):
    """Route a praw submission to the downloader that handles its URL.

    Returns True when the content was saved (selftext always counts as
    saved), False when no handler matched or the download failed.
    """
    logger = logging.getLogger(__name__)
    save = Save()
    title = formatName(submission.title)
    link = submission.url
    downloaded = True
    path = {
        'author': str(submission.author),
        'subreddit': str(submission.subreddit),
        'id': str(submission.id),
        'created_utc': str(submission.created_utc),
        'title': title,
        'ext': 'txt'
    }
    # Selftext post: append the body to a text file.
    if submission.is_self:
        with open(save.get_dir(path), 'a+') as f:
            f.write(str(submission.selftext.encode('utf-8')))
    # Link to a jpg, png, gifv, gif, jpeg
    elif re.match(Common.valid_url, link):
        if not Common(link, '{}-{}'.format(str(submission.id), title), path).save():
            downloaded = False
    # Imgur
    elif re.match(Imgur.valid_url, link):
        if not Imgur(link, title, path).save():
            downloaded = False
    # Giphy
    elif re.match(Giphy.valid_url, link):
        if not Giphy(link, title, path).save():
            downloaded = False
    # Tenor
    elif re.match(Tenor.valid_url, link):
        if not Tenor(link, title, path).save():
            downloaded = False
    # Redgifs
    elif re.match(Redgifs.valid_url, link):
        if not Redgifs(link, title, path).save():
            downloaded = False
    # Gfycat
    elif re.match(Gfycat.valid_url, link):
        if not Gfycat(link, title, path).save():
            downloaded = False
    # Reddit gallery
    elif re.match(RedditGallery.valid_url, link):
        if not RedditGallery(link, title, path).save():
            downloaded = False
    # Flickr: recognised but unsupported.
    elif 'flickr.com/' in link:
        downloaded = False
        # BUGFIX: typo "mathces" -> "matches" in the log message.
        logger.info("No matches: No Flickr support {}".format(link))
    # Crosspost to another reddit submission: resolve it and recurse.
    elif re.match(RedditHandler.valid_url, link):
        logger.debug("Fetching crosspost {}".format(link))
        new_submission = RedditHandler(link, title, path).save()
        # BUGFIX: the original unconditionally set downloaded = False before
        # fetching (so a successful crosspost could never report True) and
        # recursed even when the lookup returned a falsy value.
        if not new_submission or not routeSubmission(new_submission):
            downloaded = False
    # youtube_dl supported site
    elif YouTube.yt_supported(link):
        if not YouTube(link, title, path).save():
            downloaded = False
    else:
        logger.info("No matches: {}".format(link))
        downloaded = False
    return downloaded
def main(args):
    """Parse CLI arguments, apply config overrides and run the grab loop.

    ``args`` is the parsed CLI namespace (subreddit, wait, posts, output,
    sort, blacklist, reddit_id, reddit_secret, by_sub).
    """
    # subreddit may be a single name or a path to a .txt file listing many
    subR = None
    filepath = None
    if args.subreddit:
        if '.txt' in args.subreddit:
            filepath = args.subreddit
        else:
            subR = args.subreddit

    # wait time between cycles (seconds)
    if args.wait and args.subreddit:
        try:
            wait = int(args.wait)
        except ValueError:
            logger.error("Please enter an integer in seconds to wait")
            sys.exit()
    else:
        wait = 600

    # number of posts to fetch per subreddit
    if args.posts and args.subreddit:
        try:
            posts = int(args.posts)
        except ValueError:
            # BUGFIX: typo "an inter" -> "an integer" in the error message.
            logger.error("Please enter an integer for the number of posts")
            sys.exit()
    else:
        posts = 50

    # output directory (created on demand)
    if args.output and args.subreddit:
        base_dir = os.path.abspath(args.output)
        if not os.path.exists(base_dir):
            os.makedirs(base_dir)
    else:
        base_dir = os.getcwd()

    # sort order
    sort = 'hot'
    if args.sort and args.sort.lower() in ('hot', 'new', 'top') and args.subreddit:
        sort = args.sort
    elif args.sort:
        logger.error("Please enter hot, new or top for sort")
        sys.exit()

    # blacklist
    if args.blacklist:
        config["reddit"]["blacklist"].append(args.blacklist)

    # reddit api credentials
    if args.reddit_id:
        config["reddit"]["creds"]["client_id"] = args.reddit_id
    if args.reddit_secret:
        config["reddit"]["creds"]["client_secret"] = args.reddit_secret
    # NOTE(review): persist config overrides. The source formatting is
    # ambiguous about whether this write was nested under the
    # reddit_secret branch — confirm against the original file.
    with open('./resources/config.json', 'w') as f:
        json.dump(config, f)

    # by_sub !!!
    global save
    save = Save(base_dir, args.by_sub)

    # initialise database
    global db
    db = DBInterface(config["general"]["database_location"])

    if args.subreddit:
        # Passes subreddits to feeder, forever, sleeping between cycles.
        while True:
            if filepath is not None:
                with open(filepath) as f:
                    for line in f:
                        feeder(line.strip(), posts, base_dir, sort)
            else:
                feeder(subR, posts, base_dir, sort)
            logger.info("Waiting {} seconds".format(wait))
            time.sleep(wait)
from resources.handlers.imgur import Imgur from resources.handlers.common import Common from resources.save import Save from resources.db_interface import DBInterface class color: RED = '\033[91m' BOLD = '\033[1m' END = '\033[0m' with open('./resources/config.json') as f: config = json.load(f) save = Save(os.getcwd(), True) logger = logging.getLogger(__name__) db = None def grabber(subR, base_dir, posts, sort): # Initialise Reddit reddit = praw.Reddit( client_id=config["reddit"]["creds"]["client_id"], client_secret=config["reddit"]["creds"]["client_secret"], user_agent=config["reddit"]["creds"]["user_agent"]) if 'u/' in subR or '/u/' in subR: if '/u/' in subR: subR = subR[3:] elif 'u/' in subR: