def process_actor(name, gender, url, data_path='./data/pr0n_processor/backend/'):
    """Download an actor page's portrait, hash its single face and store it.

    This is a generator: it yields the objects returned by
    ``httpclient.fetch`` and ``openface.hash_face`` and expects whatever
    drives it to send the corresponding results back in (presumably a
    coroutine runner/decorator that is not visible in this block).

    On success two files are written under a directory sharded by the first
    two characters of the unique id, ``<data_path>/<uid[0]>/<uid[1]>/``:
    ``<uid>.jpg`` (the portrait) and ``<uid>.pkl`` (a pickled dict with the
    detection result, face hash, ``name`` and ``gender``).

    Every failure is best-effort: the function simply returns without
    writing a record when the page or image cannot be fetched, no portrait
    image is present, the image cannot be decoded, detection fails, the
    number of detected faces is not exactly one, hashing fails, or the
    files cannot be written.

    Args:
        name: Actor name, stored verbatim in the record.
        gender: Actor gender, stored verbatim in the record.
        url: URL of the actor page to scrape.
        data_path: Root directory of the on-disk store.
    """
    uid = unique_id(url)
    shard_dir = os.path.join(data_path, uid[0], uid[1])
    os.makedirs(shard_dir, exist_ok=True)
    filepath = os.path.join(shard_dir, uid)

    # An existing pickle means this URL was already processed.
    if os.path.exists(filepath + '.pkl'):
        return

    # NOTE(review): process_subreddit refers to the client as `http_client`;
    # confirm that both names really exist at module level.
    try:
        page_data = yield httpclient.fetch(url, request_timeout=10)
    except HTTPError:
        return

    soup = BeautifulSoup(page_data.body, 'html.parser')
    img = soup.find('img', title=re.compile("Portrait of"))
    # find() returns None when nothing matches, and the tag may lack a src;
    # treat both like every other failure here and skip the actor.
    if img is None:
        return
    img_src = img.attrs.get('src')
    if not img_src:
        return

    try:
        image_req = yield httpclient.fetch(img_src, request_timeout=10)
    except HTTPError:
        return

    image_fd = BytesIO(image_req.body)
    try:
        image = Image.open(image_fd)
    except OSError:
        return

    image_np = np.array(image)
    try:
        rects, scores, poses = detector.run(image_np)
    except RuntimeError:
        return

    # Only keep portraits with exactly one detected face.
    if len(scores) != 1:
        return

    try:
        face_hash = yield openface.hash_face(image_np, bb=rects[0])
    except Exception:
        # Deliberately broad (hashing is best-effort), but not a bare
        # `except:` so GeneratorExit/KeyboardInterrupt still propagate.
        return

    data = {
        'url': url,
        'uid': uid,
        'rects': rects[0],
        'pose': poses[0],
        'score': scores[0],
        'face_hash': face_hash,
        'name': name,
        'gender': gender,
    }
    try:
        image.save(filepath + '.jpg')
        with open(filepath + '.pkl', 'wb+') as fd:
            pickle.dump(data, fd, protocol=-1)
        print(name, gender, url, uid)
    except OSError:
        return
def process_subreddit(subreddit, data_path='./data/pr0n_processor/backend/'):
    """Scan a subreddit's listings and store every image with exactly one face.

    This is a generator: it yields the objects returned by
    ``http_client.fetch`` and ``openface.hash_face`` and expects whatever
    drives it to send the corresponding results back in (presumably a
    coroutine runner/decorator that is not visible in this block).

    Submissions are taken from the hot, top-of-all-time, top-of-year and
    top-of-month listings, in that order, wrapped in ``skip_unfound``. For
    each accepted submission two files are written under a directory sharded
    by the first two characters of the unique id,
    ``<data_path>/<uid[0]>/<uid[1]>/``: ``<uid>.jpg`` (the image) and
    ``<uid>.pkl`` (a pickled dict with the detection result, face hash and
    the submission object itself).

    Processing is best-effort: a submission is skipped when its URL cannot
    be normalized/identified, it was already stored, the download fails,
    the payload is not a decodable image, detection fails, the number of
    detected faces is not exactly one, hashing fails, or the files cannot
    be written.

    Args:
        subreddit: Name of the subreddit to scan.
        data_path: Root directory of the on-disk store.
    """
    reddit = praw.Reddit(user_agent='gulperpr0n')
    sub = reddit.get_subreddit(subreddit)
    submissions = skip_unfound(chain(
        sub.get_hot(),
        sub.get_top_from_all(),
        sub.get_top_from_year(),
        sub.get_top_from_month(),
    ))

    for submission in submissions:
        try:
            url = normalize_url(submission.url)
            uid = unique_id(url)
        except ValueError:
            continue

        shard_dir = os.path.join(data_path, uid[0], uid[1])
        os.makedirs(shard_dir, exist_ok=True)
        filepath = os.path.join(shard_dir, uid)

        # An existing pickle means this URL was already processed.
        if os.path.exists(filepath + '.pkl'):
            continue

        try:
            image_req = yield http_client.fetch(url, request_timeout=5)
        except HTTPError:
            continue

        image_fd = BytesIO(image_req.body)
        try:
            image = Image.open(image_fd)
        except OSError:
            continue

        image_np = np.array(image)
        try:
            rects, scores, poses = detector.run(image_np)
        except RuntimeError:
            continue

        # Only keep images with exactly one detected face.
        if len(scores) != 1:
            continue

        try:
            face_hash = yield openface.hash_face(image_np, bb=rects[0])
        except Exception:
            # Deliberately broad (hashing is best-effort), but it must not
            # be a bare `except:`: that would also catch the GeneratorExit
            # raised at this yield when the generator is closed, and the
            # `continue` would then reach another yield, making close()
            # fail with "generator ignored GeneratorExit".
            continue

        print(subreddit, uid, url)
        data = {
            'url': url,
            'uid': uid,
            'rects': rects[0],
            'pose': poses[0],
            'score': scores[0],
            'face_hash': face_hash,
            'reddit_submission': submission,
        }
        try:
            image.save(filepath + '.jpg')
            with open(filepath + '.pkl', 'wb+') as fd:
                # NOTE(review): only OSError is handled here; if pickling
                # the submission object can fail with another exception
                # type, it will propagate -- confirm that is intended.
                pickle.dump(data, fd, protocol=-1)
        except OSError:
            continue