async def process_item(self, item, spider): if spider.debug is True: return item # So we don't re-send scenes that have already been scraped if self.crawler.settings['ENABLE_MONGODB']: if spider.force is not True: result = self.db.scenes.find_one({'url': item['url']}) if result is not None: return payload = { 'title': item['title'], 'description': item['description'], 'date': item['date'], 'image': item['image'], 'image_blob': item['image_blob'], 'url': item['url'], 'performers': item['performers'], 'tags': item['tags'], 'external_id': str(item['id']), 'site': item['site'], 'trailer': item['trailer'], 'parent': item['parent'], 'network': item['network'], 'force_update': self.crawler.settings.getbool('FORCE_UPDATE'), } # Post the scene to the API - requires auth with permissions disp_result = "" if self.crawler.settings['TPDB_API_KEY'] and not spider.settings.get( 'local'): headers = { 'Authorization': 'Bearer %s' % self.crawler.settings['TPDB_API_KEY'], 'Accept': 'application/json', 'Content-Type': 'application/json', 'User-Agent': 'tpdb-scraper/1.0.0' } response = Http.post('https://api.metadataapi.net/scenes', json=payload, headers=headers) if response: if response.ok: disp_result = disp_result + 'Submitted OK' else: disp_result = disp_result + 'Submission Error: Code #%d' % response.status_code else: disp_result = disp_result + 'Submission Error: No Response Code' print(response.content) url_hash = hashlib.sha1(str( item['url']).encode('utf-8')).hexdigest() if self.crawler.settings['MONGODB_ENABLE']: if not response.ok: self.db.errors.replace_one( {'_id': url_hash}, { 'url': item['url'], 'error': 1, 'when': datetime.now().isoformat(), 'response': response.json() }, upsert=True) else: self.db.scenes.replace_one({'_id': url_hash}, dict(item), upsert=True) else: disp_result = 'Local Run, Not Submitted' if spider.settings.get('localdump'): # Toss to local TPDB Instance headers = { 'Authorization': 'Bearer %s' % self.crawler.settings['TPDB_TEST_API_KEY'], 'Accept': 'application/json', 'Content-Type': 'application/json', 'User-Agent': 'tpdb-scraper/1.0.0' } response = Http.post('http://api.tpdb.test/scenes', json=payload, headers=headers) if response: if response.ok: disp_result = disp_result + '\tSubmitted to Local OK' else: disp_result = disp_result + '\tSubmission to Local Error: Code #%d' % response.status_code else: disp_result = disp_result + '\tSubmission to Local Error: No Response Code' print(response.content) # ############################# if spider.settings.getbool('display') and spider.settings.get( 'LOG_LEVEL') == 'INFO': if len(item['title']) >= 50: title_length = 5 else: title_length = 55 - len(item['title']) if len(item['site']) >= 15: site_length = 5 else: site_length = 20 - len(item['site']) if "T" in item['date']: disp_date = re.search(r'(.*)T\d', item['date']).group(1) else: disp_date = item['date'] print( f"Item: {item['title'][0:50]}" + " " * title_length + f"{item['site'][0:15]}" + " " * site_length + f"\t{str(item['id'])[0:15]}\t{disp_date}\t{item['url']}\t{disp_result}" ) if spider.settings.getbool('export'): item2 = item.copy() if not spider.settings.get('showblob'): if 'image_blob' in item2: item2.pop('image_blob', None) self.exporter.export_item(item2) return item
async def process_item(self, item, spider): if self.crawler.settings['ENABLE_MONGODB']: if spider.force is not True: result = self.db.performers.find_one({'url': item['url']}) if result is not None: return if 'fakeboobs' in item and item['fakeboobs']: if item['fakeboobs'].lower() == 'yes': item['fakeboobs'] = True elif item['fakeboobs'].lower() == 'no': item['fakeboobs'] = False else: item['fakeboobs'] = None payload = { 'name': item['name'], 'site': item['network'], 'url': item['url'], 'bio': item['bio'], 'image': item['image'], 'image_blob': item['image_blob'], 'extra': { 'gender': item['gender'], 'birthday': item['birthday'], 'astrology': item['astrology'], 'birthplace': item['birthplace'], 'ethnicity': item['ethnicity'], 'nationality': item['nationality'], 'haircolor': item['haircolor'], # ~ 'eyecolor': item['eyecolor'], 'weight': item['weight'], 'height': item['height'], 'measurements': item['measurements'], 'tattoos': item['tattoos'], 'piercings': item['piercings'], 'cupsize': item['cupsize'], 'fakeboobs': item['fakeboobs'] } } # Post the scene to the API - requires auth with permissions disp_result = "" if self.crawler.settings['TPDB_API_KEY'] and not spider.settings.get( 'local'): headers = { 'Authorization': 'Bearer %s' % self.crawler.settings['TPDB_API_KEY'], 'Accept': 'application/json', 'Content-Type': 'application/json', 'User-Agent': 'tpdb-scraper/1.0.0' } response = Http.post('https://api.metadataapi.net/performer_sites', json=payload, headers=headers, verify=False) if response: if response.ok: disp_result = 'Submitted OK' else: disp_result = 'Submission Error: Code #' + str( response.status_code) else: disp_result = 'Submission Error: No Response Code' print(response.content) if self.crawler.settings['MONGODB_ENABLE']: url_hash = hashlib.sha1(str( item['url']).encode('utf-8')).hexdigest() if not response.ok: self.db.errors.replace_one( {'_id': url_hash}, { 'url': item['url'], 'error': 1, 'when': datetime.now().isoformat(), 'response': response.json() }, upsert=True) else: self.db.performers.replace_one({'_id': url_hash}, dict(item), upsert=True) else: disp_result = 'Local Run, Not Submitted' if spider.settings.get('localdump'): # Toss to local TPDB Instance headers = { 'Authorization': 'Bearer %s' % self.crawler.settings['TPDB_TEST_API_KEY'], 'Accept': 'application/json', 'Content-Type': 'application/json', 'User-Agent': 'tpdb-scraper/1.0.0' } response = Http.post('http://api.tpdb.test/performer_sites', json=payload, headers=headers, verify=False) if response: if response.ok: disp_result = disp_result + '\tSubmitted to Local OK' else: disp_result = disp_result + '\tSubmission to Local Error: Code #%d' % response.status_code else: disp_result = disp_result + '\tSubmission to Local Error: No Response Code' print(response.content) # ############################## if spider.settings.getbool('display') and spider.settings.get( 'LOG_LEVEL') == 'INFO': name_length = 50 - len(payload['name']) if name_length < 1: name_length = 1 print(f"Performer: {payload['name']}" + " " * name_length + f"{payload['site']}\t{payload['url']}\t{disp_result}") if spider.settings.getbool('export'): item2 = payload.copy() if not spider.settings.get('showblob'): if "image_blob" in item2: item2.pop('image_blob', None) self.exporter.export_item(item2) return item