def fetch_new_sceneorg_files(days=1):
    """Import scene.org's recently added files into Directory/File records.

    Pages through the scene.org "latest files" API, following each
    response's ``nextPageURL`` until none is given or a response lacks a
    truthy ``success`` field.  For every listed file, each directory along
    its ``fullPath`` is created or marked as seen, and the File record is
    created or refreshed with the reported size.  If at least one File was
    newly created, ``find_sceneorg_results_files()`` is called afterwards.

    :param days: number of days of history to request from the API.
    """
    url = "https://files.scene.org/api/adhoc/latest-files/?days=%d" % days
    new_file_count = 0

    while True:
        req = urllib2.Request(url, None, {'User-Agent': user_agent})
        page = urllib2.urlopen(req)
        try:
            response = json.loads(page.read())
        finally:
            # Release the connection even if reading or decoding fails.
            page.close()

        if not response.get('success'):
            break

        for item in response['files']:
            # Drop the empty component in front of the leading slash.
            path_components = item['fullPath'].split('/')[1:]
            path = '/'
            # get_or_create() returns an (object, created) tuple; unpack it
            # so that a Directory instance - not the tuple - is used as the
            # parent of top-level directories and the directory of files
            # that sit directly under the root.
            current_dir, _created = Directory.objects.get_or_create(
                path='/', defaults={'last_seen_at': datetime.datetime.now()})

            # Walk down the directory chain, creating missing levels.
            for d in path_components[:-1]:
                parent_dir = current_dir
                path += d + '/'
                try:
                    current_dir = Directory.objects.get(path=path)
                except Directory.DoesNotExist:
                    current_dir = Directory.objects.create(
                        path=path, last_seen_at=datetime.datetime.now(),
                        parent=parent_dir)
                else:
                    current_dir.last_seen_at = datetime.datetime.now()
                    current_dir.is_deleted = False
                    current_dir.save()

            path += path_components[-1]
            try:
                f = File.objects.get(path=path)
            except File.DoesNotExist:
                File.objects.create(
                    path=path, last_seen_at=datetime.datetime.now(),
                    directory=current_dir, size=item['size'])
                new_file_count += 1
            else:
                f.last_seen_at = datetime.datetime.now()
                f.is_deleted = False
                f.size = item['size']
                f.save()

        url = response.get('nextPageURL')
        if not url:
            break
        # Pause for a second before requesting the next page.
        time.sleep(1)

    if new_file_count > 0:
        find_sceneorg_results_files()
def scan_dir_listing():
    """Sync Directory records with the listings yielded by parse_all_dirs().

    For each (path, entries) pair, the Directory for that path is fetched,
    or created if missing, and handed to ``update_dir_records`` together
    with the entries.  The values returned by ``update_dir_records`` are
    summed; if the total is positive, ``find_sceneorg_results_files()`` is
    called.
    """
    def _directory_for(dir_path):
        # Fetch the existing record, or create one stamped with "now".
        try:
            return Directory.objects.get(path=dir_path)
        except Directory.DoesNotExist:
            return Directory.objects.create(
                path=dir_path, last_seen_at=datetime.datetime.now())

    total_added = 0
    for dir_path, listing in parse_all_dirs():
        directory = _directory_for(dir_path)
        total_added += update_dir_records(directory, listing)

    if total_added > 0:
        find_sceneorg_results_files()
def fetch_new_sceneorg_files(days=1):
    """Import scene.org's recently added files into Directory/File records.

    Pages through the scene.org "latest files" API, following each
    response's ``nextPageURL`` until none is given or a response lacks a
    truthy ``success`` field (which is logged as a warning).  For every
    listed file, each directory along its path is created or marked as
    seen, and the File record is created or refreshed with the reported
    size.  If at least one File was newly created,
    ``find_sceneorg_results_files()`` is called afterwards.

    :param days: number of days of history to request from the API.
    """
    url = "https://files.scene.org/api/adhoc/latest-files/?days=%d" % days
    new_file_count = 0

    while True:
        req = urllib.request.Request(
            url, None, {'User-Agent': settings.HTTP_USER_AGENT})
        page = urllib.request.urlopen(req)
        try:
            response = json.loads(page.read())
        finally:
            # Release the connection even if reading or decoding fails.
            page.close()

        if not response.get('success'):
            logger.warning(
                "scene.org API request returned non-success! %r", response)
            break

        logger.info(
            "API request to %s succeeded - %d files returned",
            url, len(response['files']))

        for item in response['files']:
            # The fullPath field in the API consists of a byte string (de
            # facto utf-8) interpreted as windows-1252 and served to us as a
            # Unicode string.  Encode as windows-1252 (to reconstruct the
            # original bytestream as closely as possible), then decode that
            # bytestream as iso-8859-1 to embed it in a unicode string that
            # we can process and ultimately insert into the db.
            full_path = (
                item['fullPath']
                .encode('Windows-1252', 'ignore')
                .decode('iso-8859-1')
            )
            # Drop the empty component in front of the leading slash.
            path_components = full_path.split('/')[1:]
            path = '/'
            # get_or_create() returns (object, created); only the object is
            # needed here.
            current_dir, _created = Directory.objects.get_or_create(
                path='/', defaults={'last_seen_at': datetime.datetime.now()})

            # Walk down the directory chain, creating missing levels.
            for d in path_components[:-1]:
                parent_dir = current_dir
                path += d + '/'
                try:
                    current_dir = Directory.objects.get(path=path)
                except Directory.DoesNotExist:
                    current_dir = Directory.objects.create(
                        path=path, last_seen_at=datetime.datetime.now(),
                        parent=parent_dir)
                else:
                    current_dir.last_seen_at = datetime.datetime.now()
                    current_dir.is_deleted = False
                    current_dir.save()

            path += path_components[-1]
            try:
                f = File.objects.get(path=path)
            except File.DoesNotExist:
                logger.info("New file found: %s", path)
                File.objects.create(
                    path=path, last_seen_at=datetime.datetime.now(),
                    directory=current_dir, size=item['size'])
                new_file_count += 1
            else:
                f.last_seen_at = datetime.datetime.now()
                f.is_deleted = False
                f.size = item['size']
                f.save()

        url = response.get('nextPageURL')
        if url:
            # Pause for a second before requesting the next page.
            time.sleep(1)
        else:
            pointless_call_to_make_coverage_notice_this_line()
            break

    if new_file_count > 0:
        find_sceneorg_results_files()
def handle(self, *args, **kwargs):
    """Run find_sceneorg_results_files() with a callback that prints.

    The positional and keyword arguments are accepted but not used.
    """
    def report_party(party):
        # Presumably invoked for each party whose results file is found;
        # confirm against find_sceneorg_results_files.
        message = "found results.txt for %s" % party
        print(message)

    find_sceneorg_results_files(report_party)
def handle_noargs(self, **options):
    """Run find_sceneorg_results_files() with a callback that prints.

    The keyword options are accepted but not used.
    """
    def callback(party):
        # Parenthesised single-argument form: prints the same text under the
        # Python 2 print statement and the Python 3 print function, whereas
        # the bare statement form is a SyntaxError on Python 3.
        print("found results.txt for %s" % party)

    find_sceneorg_results_files(callback)
def fetch_new_sceneorg_files(days=1):
    """Import scene.org's recently added files into Directory/File records.

    Pages through the scene.org "latest files" API, following each
    response's ``nextPageURL`` until none is given or a response lacks a
    truthy ``success`` field (which is logged as a warning).  For every
    listed file, each directory along its path is created or marked as
    seen, and the File record is created or refreshed with the reported
    size.  If at least one File was newly created,
    ``find_sceneorg_results_files()`` is called afterwards.

    :param days: number of days of history to request from the API.
    """
    url = "https://files.scene.org/api/adhoc/latest-files/?days=%d" % days
    new_file_count = 0

    while True:
        req = urllib2.Request(
            url, None, {'User-Agent': settings.HTTP_USER_AGENT})
        page = urllib2.urlopen(req)
        try:
            response = json.loads(page.read())
        finally:
            # Release the connection even if reading or decoding fails.
            page.close()

        if not response.get('success'):
            logger.warning(
                "scene.org API request returned non-success! %r", response)
            break

        logger.info(
            "API request to %s succeeded - %d files returned",
            url, len(response['files']))

        for item in response['files']:
            # The fullPath field in the API consists of a byte string (de
            # facto utf-8) interpreted as windows-1252 and served to us as a
            # Unicode string.  Encode as windows-1252 (to reconstruct the
            # original bytestream as closely as possible), then decode that
            # bytestream as iso-8859-1 to embed it in a unicode string that
            # we can process and ultimately insert into the db.
            full_path = (
                item['fullPath']
                .encode('Windows-1252', 'ignore')
                .decode('iso-8859-1')
            )
            # Drop the empty component in front of the leading slash.
            path_components = full_path.split('/')[1:]
            path = '/'
            # get_or_create() returns an (object, created) tuple; unpack it
            # so that a Directory instance - not the tuple - is used as the
            # parent of top-level directories and the directory of files
            # that sit directly under the root.
            current_dir, _created = Directory.objects.get_or_create(
                path='/', defaults={'last_seen_at': datetime.datetime.now()})

            # Walk down the directory chain, creating missing levels.
            for d in path_components[:-1]:
                parent_dir = current_dir
                path += d + '/'
                try:
                    current_dir = Directory.objects.get(path=path)
                except Directory.DoesNotExist:
                    current_dir = Directory.objects.create(
                        path=path, last_seen_at=datetime.datetime.now(),
                        parent=parent_dir)
                else:
                    current_dir.last_seen_at = datetime.datetime.now()
                    current_dir.is_deleted = False
                    current_dir.save()

            path += path_components[-1]
            try:
                f = File.objects.get(path=path)
            except File.DoesNotExist:
                logger.info("New file found: %s", path)
                File.objects.create(
                    path=path, last_seen_at=datetime.datetime.now(),
                    directory=current_dir, size=item['size'])
                new_file_count += 1
            else:
                f.last_seen_at = datetime.datetime.now()
                f.is_deleted = False
                f.size = item['size']
                f.save()

        url = response.get('nextPageURL')
        if not url:
            break
        # Pause for a second before requesting the next page.
        time.sleep(1)

    if new_file_count > 0:
        find_sceneorg_results_files()
def fetch_new_sceneorg_files(path, days=1):
    """Fetch a scene.org directory and look for results files if needed.

    Calls ``fetch_sceneorg_dir`` for ``path`` with the ``async`` option
    switched off, and calls ``find_sceneorg_results_files()`` when the
    returned count is positive.

    :param path: passed through to ``fetch_sceneorg_dir`` as ``path``.
    :param days: passed through to ``fetch_sceneorg_dir`` as ``days``.
    """
    # `async` is a reserved word from Python 3.7 onwards, so spelling it as
    # a literal keyword argument is a SyntaxError there.  Dict unpacking
    # passes exactly the same keyword to the callee on every version.
    new_file_count = fetch_sceneorg_dir(
        path=path, days=days, **{'async': False})

    if new_file_count > 0:
        find_sceneorg_results_files()