def encode(parallelize=True, hdf5_path=c.RECORDING_DIR, experiment=None):
    # TODO(post v3): Get a couple separate hdf5 files from different
    #  situations / view modes for eval
    hdf5_to_convert_path = get_hdf5_path_to_convert(hdf5_path)
    train_dataset = get_dataset(hdf5_to_convert_path, train=True)
    eval_dataset = get_dataset(hdf5_to_convert_path, train=False)
    buffer_size = 1000
    utils.assert_disk_space(hdf5_path)
    out_path = None
    if experiment is not None:
        # Derive the output path directly when an experiment name was passed in.
        out_path = os.path.join(hdf5_path, experiment + c.TFRECORD_DIR_SUFFIX)
    while experiment is None:
        experiment = utils.get_valid_filename(
            input('Enter a name for your dataset: '))
        out_path = os.path.join(hdf5_path, experiment + c.TFRECORD_DIR_SUFFIX)
        if os.path.exists(out_path):
            print('The path %s exists, please choose a new name.' % out_path)
            experiment = None
    if out_path is None:
        raise RuntimeError('tfrecord output path not set')
    os.makedirs(out_path)
    save_dataset(train_dataset, buffer_size,
                 filename=os.path.join(out_path, 'deepdrive_train'),
                 parallelize=parallelize, out_path=out_path)
    save_dataset(eval_dataset, buffer_size,
                 filename=os.path.join(out_path, 'deepdrive_eval'),
                 parallelize=parallelize, out_path=out_path)
def main(file_path, output_folder, file_type, download_n_files, max_size=None, n_jobs=1):
    df = pds.read_csv(file_path, sep=";").sample(frac=1)
    # naively filter the df to get only the desired file_type
    df = df[df.format == file_type]
    if download_n_files:
        df = df.iloc[:download_n_files]
    print(f"There are {len(df)} resources of type {file_type}")
    urls = df["url"].values
    resource_ids = df["id"].values
    dataset_ids = df["dataset.id"].values
    new_ids = dataset_ids + "--" + resource_ids
    organizations = df["dataset.organization"].fillna("NA").apply(
        lambda x: unidecode.unidecode(get_valid_filename(x))).values
    assert len(urls) == len(new_ids)
    if n_jobs > 1:
        succes_downloaded = Parallel(n_jobs=n_jobs)(
            delayed(downloader)(url, id, org, output_folder, file_type, max_size)
            for url, id, org in tqdm(list(zip(urls, new_ids, organizations))))
    else:
        succes_downloaded = []
        for url, id, org in tqdm(list(zip(urls, new_ids, organizations))):
            succes_downloaded.append(
                downloader(url, id, org, output_folder, file_type, max_size))
    print(
        f"I successfully downloaded {sum(succes_downloaded)} of {len(succes_downloaded)} files"
    )
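# Illustrative usage sketch -- an assumption, not part of the original script. It shows one
# plausible way to call main() directly; the catalogue path and the argument values below are
# hypothetical, and downloader() / get_valid_filename() are assumed to live in the same module.
if __name__ == '__main__':
    main(
        file_path='resources.csv',    # hypothetical ';'-separated export with url, id, dataset.id columns
        output_folder='downloads',
        file_type='csv',
        download_n_files=100,         # keep only the first 100 rows after shuffling
        max_size=None,
        n_jobs=4,                     # joblib workers; n_jobs=1 falls back to the sequential loop
    )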
def watch(self):
    curr_time = datetime.datetime.now().strftime("%Y-%m-%d %H.%M.%S")
    file_name = (curr_time + " - " + self.streamer + " - " +
                 get_valid_filename(self.stream_title) + ".ts")
    directory = os.getcwd() + os.path.sep + self.streamer + os.path.sep
    if not os.path.exists(directory):
        os.makedirs(directory)
    output_filepath = directory + file_name
    self.streamer_dict.update({'output_filepath': output_filepath})

    streams = streamlink.streams(self.streamer_dict['stream_info']['channel']['url'])
    try:
        stream = streams[self.stream_quality]
    except KeyError:
        temp_quality = self.stream_quality
        if len(streams) > 0:  # False => stream is probably offline
            if self.stream_quality in streams.keys():
                self.stream_quality = StreamQualities.BEST.value
            else:
                # best not in streams? choose best-effort quality
                self.stream_quality = list(streams.keys())[-1]
        else:
            self.cleanup = True

        if not self.cleanup:
            print('Invalid stream quality: ' + '\'' + temp_quality + '\'')
            print('Falling back to default case: ' + self.stream_quality)
            self.streamer_dict['preferred_quality'] = self.stream_quality
            stream = streams[self.stream_quality]
        else:
            stream = None

    if not self.kill and not self.cleanup and stream:
        print(self.streamer + ' is live. Saving stream in ' + self.stream_quality +
              ' quality to ' + output_filepath + '.')
        try:
            with open(output_filepath, "ab") as out_file:  # open for [a]ppending as [b]inary
                fd = stream.open()
                while not self.kill and not self.cleanup:
                    data = fd.read(1024)
                    # If data is empty the stream has ended
                    if not data:
                        fd.close()
                        out_file.close()
                        break
                    out_file.write(data)
        except streamlink.StreamError as err:
            print('StreamError: {0}'.format(err))  # TODO: test when this happens
        except IOError as err:
            # If file validation fails this error gets triggered.
            print('Failed to write data to file: {0}'.format(err))
    self.streamer_dict.update({'kill': self.kill})
    self.streamer_dict.update({'cleanup': self.cleanup})
    return self.streamer_dict
def save_product(product):
    os.makedirs(jumbo_products_path, exist_ok=True)
    try:
        with open(os.path.join(jumbo_products_path,
                               get_valid_filename(product['title']) + '.json'),
                  'w') as f:
            json.dump(product, f)
    except KeyError:
        print('Could not save product with id: {}'.format(product['id']))
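# Illustrative usage sketch -- an assumption, not part of the original scraper. The product dict
# below is made up; jumbo_products_path and get_valid_filename are assumed to be module-level
# names, as used inside save_product() above.
if __name__ == '__main__':
    sample_product = {
        'id': '12345',                        # hypothetical product id
        'title': 'Jumbo Halfvolle Melk 1L',   # the JSON filename is derived from this title
    }
    save_product(sample_product)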
def main():
    colorama.init()
    parser = argparse.ArgumentParser(
        description='Fetch top level comments from a reddit post '
                    '(eg: python3 fetch_comments.py -l '
                    'https://www.reddit.com/r/AskReddit/comments/75goki/whats_a_movie_to_watch_when_you_want_a_good_cry/)')
    parser.add_argument('-l', '--link', type=str, metavar='', required=True,
                        help='Link of the post')
    parser.add_argument('-s', '--sort', type=str, metavar='',
                        choices=['best', 'top', 'new'], default='confidence',
                        help='Optionally specify order of sort [best or top or new]')
    parser.add_argument('-loc', '--location', type=str, metavar='',
                        default=os.getcwd() + '/comments',
                        help='Optionally specify the directory/location to download the comments to')
    args = parser.parse_args()

    # initializing userAgent
    ua = get_userAgent()
    print('Connecting to reddit..')

    url = args.link + '.json?sort='
    if args.sort == 'best':
        args.sort = 'confidence'
    top_level_comments = get_comments(url + args.sort, ua)

    filename = get_valid_filename(top_level_comments[0]) + "_comments.txt"
    if not os.path.exists(args.location):
        os.makedirs(args.location)
    location = os.path.join(args.location, filename)
    with open(location, mode='w', encoding='utf8') as output_filehandle:
        output_filehandle.write('\n'.join(top_level_comments))
        output_filehandle.write('\n')
    erase_previous_line()
    print('Downloaded comments')
def scrape_products():
    for category_url in category_urls:
        print(category_url)
        r = requests.get(category_url)
        category_json = r.json()
        # json_data = None
        # with open('category.json', 'r') as f:
        #     json_data = json.load(f)
        jsonpath_expr = parse('$..navItem.link.href')
        products = [match.value for match in jsonpath_expr.find(category_json)]
        products = [x for x in products if x.startswith('/producten/product')]
        all_products = {}
        for product in products:
            r = requests.get(base_url + product)
            tree = html.fromstring(r.content)
            # json_ld = tree.xpath('string(/html/head/script[7])')
            javascript = tree.xpath('string(/html/body/script/text())')[54:-14]
            javascript = ast.literal_eval(javascript)
            javascript = json.loads(javascript)
            if javascript['product']['state'] == 'ERROR':
                print('ERROR state detected:')
                print(javascript)
                continue
            product_info = javascript['product']['card']
            title = product_info['products'][0]['title']
            try:
                os.makedirs('products/ah')
            except FileExistsError:
                pass
            with open('products/ah/{}.json'.format(get_valid_filename(title)), 'w') as f:
                json.dump(product_info, f)
            all_products[title] = product_info
            print(product_info['products'][0]['title'])
        print(products)
def _plot_precision_recall_curve(df, test_set_id):
    results_df = df[df.test_set_id == test_set_id]
    results_df = _sort_models_by_list(results_df, default_models_sorter)
    plt.figure(figsize=(4, 4))
    count = 0
    color_list = [
        "#640f79", "#66d9cf", "#d032a3", "#8138fc", "#8fda59", 'black'
    ]
    linestyle_list = ['dotted', 'dashdot', 'solid']
    for _, row in results_df.iterrows():
        color = color_list[count]
        linestyle = linestyle_list[count // 2]
        plt.plot(row.precision_recall_curve[1],
                 row.precision_recall_curve[0],
                 label=row.fancy_model_name,
                 linestyle=linestyle,
                 color=color,
                 linewidth=1.2)
        count += 1
    if FLAGS.show_random_guess:
        no_skill = _calculate_no_skill_model(df, test_set_id)
        plt.plot([0, 1], [no_skill, no_skill],
                 linestyle='--',
                 color='gainsboro',
                 linewidth=1.2,
                 label='Random guess')
        legend = plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1))
    else:
        legend = plt.legend(loc='lower left')
    plt.xlim(0.0, 1.00)
    plt.ylim(0.0, 1.05)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    for file_format in FLAGS.chart_format_list:
        chart_file_name = test_set_id + '.' + file_format
        chart_file_name = utils.get_valid_filename(chart_file_name)
        chart_file_name = os.path.join(FLAGS.charts_path, chart_file_name)
        plt.savefig(chart_file_name,
                    dpi=600,
                    bbox_extra_artists=(legend, ),
                    bbox_inches='tight')
        print('Saved chart to %s' % chart_file_name)
def get_pictures_from_subreddit(data, subreddit, location, nsfw):
    for i in range(len(data)):
        if data[i]['data']['over_18']:
            # if nsfw post and you only want sfw
            if nsfw == 'n':
                continue
        else:
            # if sfw post and you only want nsfw
            if nsfw == 'x':
                continue
        current_post = data[i]['data']
        image_url = current_post['url']
        if '.png' in image_url:
            extension = '.png'
        elif '.jpg' in image_url or '.jpeg' in image_url:
            extension = '.jpeg'
        elif 'imgur' in image_url:
            image_url += '.jpeg'
            extension = '.jpeg'
        else:
            continue
        erase_previous_line()
        print('downloading pictures from r/' + subreddit + '.. ' +
              str((i * 100) // len(data)) + '%')
        # allow_redirects=False prevents thumbnails denoting removed images from getting in
        image = requests.get(image_url, allow_redirects=False)
        if image.status_code == 200:
            try:
                with open(location + '/' + get_valid_filename(current_post['title']) + extension,
                          mode='xb') as output_filehandle:
                    output_filehandle.write(image.content)
            except OSError:
                # Skip files that already exist or titles that produce an invalid path.
                pass
def get_audio_filename(self, text):
    return '/tmp/{}_{}.mp3'.format(
        get_valid_filename(text),
        hashlib.md5(text.encode('utf8')).hexdigest())
def watch(self):
    curr_time = datetime.datetime.now().strftime("%Y-%m-%d %H.%M.%S")
    file_name = (curr_time + " - " + self.streamer + " - " +
                 get_valid_filename(self.stream_title) + ".ts")
    directory = self._formatted_download_folder(self.streamer_login) + os.path.sep
    if not os.path.exists(directory):
        os.makedirs(directory)
    output_filepath = directory + file_name
    self.streamer_dict.update({'output_filepath': output_filepath})

    streams = streamlink.streams('https://www.twitch.tv/' + self.streamer_login)
    # Occurs when already recording another stream and a new streamer (that is already live) is added.
    # Not sure why this error is thrown:
    # Traceback (most recent call last):
    #   File "C:\Program Files\Python36\lib\threading.py", line 916, in _bootstrap_inner
    #     self.run()
    #   File "E:\Downloads\automatic-twitch-recorder\venv\lib\site-packages\streamlink\stream\segmented.py", line 59, in run
    #     for segment in self.iter_segments():
    #   File "E:\Downloads\automatic-twitch-recorder\venv\lib\site-packages\streamlink\stream\hls.py", line 307, in iter_segments
    #     self.reload_playlist()
    #   File "E:\Downloads\automatic-twitch-recorder\venv\lib\site-packages\streamlink\stream\hls.py", line 235, in reload_playlist
    #     self.process_sequences(playlist, sequences)
    #   File "E:\Downloads\automatic-twitch-recorder\venv\lib\site-packages\streamlink\plugins\twitch.py", line 210, in process_sequences
    #     return super(TwitchHLSStreamWorker, self).process_sequences(playlist, sequences)
    # TypeError: super(type, obj): obj must be an instance or subtype of type
    try:
        stream = streams[self.stream_quality]
    except KeyError:
        temp_quality = self.stream_quality
        if len(streams) > 0:  # False => stream is probably offline
            if self.stream_quality in streams.keys():
                self.stream_quality = StreamQualities.BEST.value
            else:
                # best not in streams? choose best-effort quality
                self.stream_quality = list(streams.keys())[-1]
        else:
            self.cleanup = True

        if not self.cleanup:
            print('Invalid stream quality: ' + '\'' + temp_quality + '\'')
            print('Falling back to default case: ' + self.stream_quality)
            self.streamer_dict['preferred_quality'] = self.stream_quality
            stream = streams[self.stream_quality]
        else:
            stream = None

    if not self.kill and not self.cleanup and stream:
        print(self.streamer + ' is live. Saving stream in ' + self.stream_quality +
              ' quality to ' + output_filepath + '.')
        try:
            with open(output_filepath, "ab") as out_file:  # open for [a]ppending as [b]inary
                fd = stream.open()
                while not self.kill and not self.cleanup:
                    data = fd.read(1024)
                    # If data is empty the stream has ended
                    if not data:
                        fd.close()
                        out_file.close()
                        break
                    out_file.write(data)
        except streamlink.StreamError as err:
            print('StreamError: {0}'.format(err))  # TODO: test when this happens
        except IOError as err:
            # If file validation fails this error gets triggered.
            print('Failed to write data to file: {0}'.format(err))
    self.streamer_dict.update({'kill': self.kill})
    self.streamer_dict.update({'cleanup': self.cleanup})
    return self.streamer_dict
def perform(self):
    if not self.check_if_needed():
        return

    dest_file_name = base_date.strftime("%Y-%m-%d_%H%M%S_") + utils.get_valid_filename(self.name)
    self.process_vars([
        ['DEST_FILENAME', dest_file_name],
        ['SRC_DIR', self.src_dir],
        ['DEST_DIR', self.dest_dir],
    ])
    dest_path = self.dest_dir + dest_file_name

    if self.do_compress in ['gzip', 'store']:
        ext = {'gzip': 'tgz', 'store': 'tar'}[self.do_compress]
        dest_path_compressed = self.dest_dir + dest_file_name + "." + ext
        if os.path.exists(dest_path_compressed):
            report.log_warn("Destination file exists {0}".format(self.cc(dest_path_compressed)))
            return
        dest_path_compressed_tmp = dest_path_compressed + ".tmp"

        report.log_state("Changing directory to {0}".format(self.cc(self.src_dir)))
        os.chdir(self.src_dir)
        report.log_state("Creating archive {0} to {1}...".format(
            self.cc(self.src_dir), self.cc(dest_path_compressed_tmp)))
        if self.do_compress == 'gzip':
            cmd = "tar --create . | pigz > {0}".format(shlex.quote(dest_path_compressed_tmp))
        else:
            cmd = "tar --create --file {0} .".format(shlex.quote(dest_path_compressed_tmp))
        report.log_command(cmd)
        process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

        processLimit = None
        if self.cpu_limit:
            limit_cmd = "cpulimit --lazy --include-children --pid={0} --limit={1}".format(
                process.pid, self.cpu_limit)
            report.log_command(limit_cmd)
            processLimit = subprocess.Popen(limit_cmd, shell=True)

        r = process.wait()
        if processLimit is not None:
            processLimit.wait()
        tar_output = process.stdout.read()

        if r == 0:
            report.log_state("Moving {0} to {1}".format(
                self.cc(dest_path_compressed_tmp), self.cc(dest_path_compressed)))
            os.rename(dest_path_compressed_tmp, dest_path_compressed)
        else:
            raise BackupException("tar failed", tar_output)
    else:
        if os.path.exists(dest_path):
            report.log_warn("Destination folder exists {0}".format(self.cc(dest_path)))
            return
        dest_path_tmp = dest_path + "_tmp"
        report.log_state("Copying {0} to {1}...".format(self.cc(self.src_dir), self.cc(dest_path_tmp)))
        try:
            if os.path.exists(dest_path_tmp):
                shutil.rmtree(dest_path_tmp)
            shutil.copytree(self.src_dir, dest_path_tmp)
            os.rename(dest_path_tmp, dest_path)
        except Exception as err:
            report.log_warn(err)