def save_pandas_data(file_name, dat, old_data=None, verbose=VERBOSE):
    try:
        data = clean_pandas_data(dat)
        if old_data is not None:
            try:
                # Avoid the last index as it may contain an incomplete week or month
                last_dt = old_data.index[-2]
                idx = data.index.get_loc(last_dt.strftime("%Y-%m-%d"))
                # Splice: keep old rows before the overlap, then append the fresh rows
                updated_data = pd.concat(
                    (old_data.iloc[:-2, :], data.iloc[idx:, :]), axis=0)
                updated_data.reset_index().to_csv(
                    file_name, index=False, compression="infer")  # Update
            except KeyError as err:
                LOG.error(f"Error updating the data: {err}")
        else:
            data.reset_index().to_csv(file_name, index=False,
                                      compression="infer")  # Save
        if verbose > 1:
            symbol = file_name.parent.name
            LOG.info(
                f"Saved {symbol} data:{get_tabs(symbol, prev=12)}[{file_name.stem}] OK"
            )
    except Exception as err:
        # format_tb returns the traceback as strings; print_tb would log 'None'
        LOG.error(
            f"ERROR saving data:\t\t{file_name.parent.name + file_name.stem} "
            f"{err!r} {''.join(traceback.format_tb(err.__traceback__))}")
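# --- Usage sketch (illustrative, not part of the module): the incremental-update
# path. The splice above keeps all old rows except the last two (possibly
# incomplete periods) and re-appends from the fresh download. The per-symbol
# layout (<symbol>/<file>.csv) and the payload shape are assumptions.
def _save_example(folder):
    from pathlib import Path

    file_name = Path(folder) / "IBM" / "weekly.csv"
    fresh = {
        "2024-01-05": {"1. open": 100.0, "4. close": 101.5},
        "2024-01-12": {"1. open": 101.5, "4. close": 102.0},
        "2024-01-19": {"1. open": 102.0, "4. close": 103.2},
    }
    save_pandas_data(file_name, fresh)                  # first write: no old data
    old = read_pandas_data(file_name)                   # read back as a DataFrame
    save_pandas_data(file_name, fresh, old_data=old)    # splice in the fresh rows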
async def update_stock_info(info_file, info, create=True, verbose=VERBOSE):
    try:
        # Clean key names
        clean_info = clean_enumeration(info)
        clean_info.pop('matchScore', None)
        # Read previous info
        if info_file.exists():
            old_info = await read_info_file(info_file, check=False, verbose=verbose)
        else:
            old_info = {}
        await save_stock_info(info_file, clean_info, old_info=old_info, create=create)
        if verbose > 1:
            symbol = info_file.parent.name
            LOG.info(f"Updating {symbol} info:{get_tabs(symbol, prev=15)}OK")
    except Exception as err:
        # format_tb returns the traceback as strings; print_tb would log 'None'
        LOG.error(
            f"ERROR updating info: {info_file}. Msg: {err!r} "
            f"{''.join(traceback.format_tb(err.__traceback__))}")
def get_optimizer(params: Iterable, args: argparse.Namespace) -> Optional[optim.Optimizer]:
    name = args.optim_type.lower()
    if name == 'sgd':
        LOG.info(
            f"SGD Optimizer <lr={args.lr}, momentum={args.momentum}, nesterov=True>"
        )
        return optim.SGD(params=params, lr=args.lr, momentum=args.momentum,
                         nesterov=True)
    elif name == 'adam':
        LOG.info(
            f"Adam Optimizer <lr={args.lr}, betas={args.betas}, eps={args.eps}>"
        )
        return optim.Adam(params=params, lr=args.lr, betas=args.betas, eps=args.eps)
    elif name == 'adamw':
        LOG.info(
            f"AdamW Optimizer <lr={args.lr}, betas={args.betas}, eps={args.eps}, "
            f"weight_decay={args.weight_decay}>"
        )
        return optim.AdamW(params=params, lr=args.lr, betas=args.betas,
                           eps=args.eps, weight_decay=args.weight_decay)
    else:
        LOG.error(f"Unsupported optimizer: [bold red]{name}[/].")
        raise ValueError(f"Unsupported optimizer: {name}.")
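# --- Usage sketch (illustrative): wiring get_optimizer to parsed CLI arguments.
# The Namespace fields mirror exactly what get_optimizer reads above; the tiny
# model is a stand-in, not part of this module.
def _optimizer_example():
    import torch.nn as nn

    model = nn.Linear(10, 2)
    args = argparse.Namespace(optim_type='adamw', lr=1e-3, betas=(0.9, 0.999),
                              eps=1e-8, weight_decay=1e-2)
    return get_optimizer(model.parameters(), args)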
def _get_activation(name: str) -> Optional[nn.Module]:
    if name.lower() == 'relu':
        return nn.ReLU()  # TODO: test other activations
    else:
        LOG.error(f'Unrecognized activation function: [bold red]{name}[/].')
        raise ValueError(f'Unrecognized activation function: {name}.')
def read_pandas_data(file_name):
    if not file_name.exists():
        LOG.error(f"ERROR: data not found for {file_name}")
        return None
    return pd.read_csv(file_name, parse_dates=['date'], index_col='date',
                       date_parser=dateparse)
def clean_pandas_data(dat):
    """Receive a dictionary of data, transform it into a pandas DataFrame,
    and clean the column names."""
    try:
        data = pd.DataFrame.from_dict(dat, orient="index")
        # Apply clean names to columns and index
        column_names = clean_enumeration(data.columns.tolist())
        data.columns = column_names
        data.index.name = 'date'
        data.sort_index(axis=0, inplace=True, ascending=True)  # Sort by date
    except Exception as err:
        LOG.error(f"Error cleaning dataset: {err}")
        data = None
    return data
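# --- Illustration (assumed behavior): clean_enumeration presumably strips the
# "N. "-style prefixes that Alpha Vantage puts on column names, so a payload
# keyed by date string becomes a DataFrame with a 'date' index sorted ascending.
def _clean_example():
    payload = {
        "2024-01-12": {"1. open": 101.5, "4. close": 102.0},
        "2024-01-05": {"1. open": 100.0, "4. close": 101.5},
    }
    df = clean_pandas_data(payload)
    # Expect two rows indexed by date; exact column names depend on clean_enumeration
    return df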
def manage_vantage_errors(response, symbol):
    if "Error Message" in response.keys():
        LOG.error(
            f"ERROR: Not possible to retrieve {symbol}. Msg: {response['Error Message']}"
        )
    elif "Note" in response.keys():
        # startswith avoids the brittle fixed-length slice comparison
        if response["Note"].startswith(
                'Thank you for using Alpha Vantage! Our standard API call frequency '
                'is 5 calls per minute and 500 calls per day.'):
            LOG.info(
                f"Retrieving {symbol}:{get_tabs(symbol, prev=12)}Max frequency reached! Waiting..."
            )
            return "longWait"
    return None
async def read_info_file(info_file, check=True, verbose=VERBOSE):
    if not info_file:
        return {}
    if info_file.exists():
        async with aiofiles.open(info_file, "r") as info:
            data = await info.read()
        if verbose > 1:
            LOG.info(f"Info file read:{get_tabs('', prev=15)}{info_file}")
        return json.loads(data)
    else:
        if check:
            LOG.error(f"ERROR: No info found at {info_file}")
        if verbose > 1:
            LOG.warning(f"Info file: {info_file}\tdoes not exist!")
        return {}
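# --- Usage sketch (illustrative): reading a symbol's cached info JSON from an
# async context. The path layout (<symbol>/info.json) and the 'name' key are
# assumptions.
async def _info_example(folder):
    from pathlib import Path

    info_file = Path(folder) / "IBM" / "info.json"
    info = await read_info_file(info_file, check=False)
    return info.get("name", "unknown")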
async def query_data(symbol, category=None, api="vantage", verbose=VERBOSE, **kwargs):
    if category is None:
        raise ValueError("Please provide a valid category in the parameters")
    # Get semaphore
    semaphore_controller.get_semaphore(api)
    if verbose > 2:
        LOG.info("Successfully acquired the semaphore")
    if api == "vantage":
        url, params = alpha_vantage_query(symbol, category,
                                          key=KEYS_SET["alpha_vantage"], **kwargs)
        LOG.info(
            f"Retrieving {symbol}:{get_tabs(symbol, prev=12)}From '{api}' API")
    else:
        # Fail fast: without a known API, url/params would be undefined below
        LOG.error(f"Not supported api {api}")
        semaphore_controller.release_semaphore(api)
        raise ValueError(f"Not supported api {api}")
    counter = 0
    while counter <= QUERY_RETRY_LIMIT:
        async with aiohttp.ClientSession() as session:
            async with session.get(url, params=params, headers=HEADERS) as resp:
                data = await resp.json()
        if api == "vantage":
            if manage_vantage_errors(data, symbol) == "longWait":
                counter += 1
                await asyncio.sleep(VANTAGE_WAIT)
            else:
                break
    await asyncio.sleep(MIN_SEM_WAIT)
    if verbose > 2:
        LOG.info("Releasing Semaphore")
    # Release semaphore
    semaphore_controller.release_semaphore(api)
    return data
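# --- Usage sketch (illustrative): fetching weekly data for one symbol. The
# category string and the "Weekly Time Series" response key are assumptions:
# they match Alpha Vantage's TIME_SERIES_WEEKLY endpoint, but the exact values
# accepted here depend on alpha_vantage_query.
def _query_example():
    data = asyncio.run(query_data("IBM", category="TIME_SERIES_WEEKLY"))
    return clean_pandas_data(data.get("Weekly Time Series", {}))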
def prepare_MTT_dataset(args):
    CONSOLE.rule("Pre-processing MTT Annotations and Data for Machine Learning")
    # --- get dirs ---
    # Create the output dir if it does not exist
    p_out = Path(args.p_out).absolute()
    while True:
        if p_out.exists():
            res = CONSOLE.input(
                f"Output folder exists ({p_out.as_posix()})! Do you want to remove it first? "
                f"(You can also clean it manually now and hit enter key to retry) [y/n]: ")
            if res.lower() in ['y', 'yes']:
                # Delete the target folder and create a new empty one
                shutil.rmtree(p_out)
                p_out.mkdir()
                LOG.info("Target folder removed, and new empty folder created.")
                break
            elif res.lower() in ['n', 'no']:
                LOG.error(f"Output folder exists! Creating folder failed. Target: {p_out.as_posix()} exists.")
                raise FileExistsError(
                    f"Output folder exists! Creating folder failed. Target: {p_out.as_posix()} exists.")
            else:
                continue
        else:
            p_out.mkdir()
            LOG.info(f"Target folder ({p_out.as_posix()}) created.")
            break
    # train/val/test dirs
    p_out.joinpath('train').mkdir()
    p_out.joinpath('val').mkdir()
    p_out.joinpath('test').mkdir()
    # Check raw data: MTT ships 16 top-level directories named 0-9 and a-f
    p_raw = Path(args.p_raw).absolute()
    assert len(list(p_raw.glob('[0-9a-f]'))) == 16, \
        "MTT Raw data should have 16 directories from 0-9 and a-f."
    # --- parsing and processing annotations ---
    annotations = process_MTT_annotations(p_anno=args.p_anno, p_info=args.p_info,
                                          delimiter=args.delimiter, n_top=args.n_topk)
    # Save processed annotations
    annotations.to_csv(
        Path(args.p_anno).parent.joinpath(f'annotations_top{args.n_topk}.csv').as_posix(),
        index=False)
    # Save top-k labels
    with open(Path(args.p_anno).parent.joinpath('labels.txt').as_posix(), 'w') as f:
        f.write(','.join(annotations.columns.tolist()[:args.n_topk]))
    CONSOLE.rule("Audio Preprocessing")
    LOG.info("MTT annotations processed. Now segmenting audios based on annotations for machine learning...")
    # --- process audio files based on annotations ---
    # Split the annotation rows evenly across workers; the last worker takes the remainder
    avg = annotations.shape[0] // args.n_worker
    processes = [
        Process(target=_process_audio_files,
                args=(i, annotations.iloc[i * avg:(i + 1) * avg], p_out, p_raw,
                      args.n_samples, args.sr, args.n_topk))
        if i != args.n_worker - 1 else
        Process(target=_process_audio_files,
                args=(i, annotations.iloc[i * avg:], p_out, p_raw,
                      args.n_samples, args.sr, args.n_topk))
        for i in range(args.n_worker)
    ]
    LOG.info(f"{args.n_worker} workers created.")
    # Start jobs, then wait for them to finish
    for p in processes:
        p.start()
    for p in processes:
        p.join()
    CONSOLE.rule('MTT Dataset Preparation Done')
    return
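# --- Usage sketch (illustrative): an argparse front end matching the attributes
# prepare_MTT_dataset reads (p_out, p_raw, p_anno, p_info, delimiter, n_topk,
# n_samples, sr, n_worker). All defaults below are assumptions, not values from
# this module.
def _mtt_cli():
    parser = argparse.ArgumentParser(description="Prepare the MTT dataset.")
    parser.add_argument('--p_out', required=True, help="output folder")
    parser.add_argument('--p_raw', required=True, help="raw MTT audio root (dirs 0-9, a-f)")
    parser.add_argument('--p_anno', required=True, help="annotations file path")
    parser.add_argument('--p_info', required=True, help="clip info file path")
    parser.add_argument('--delimiter', default='\t', help="annotation file delimiter")
    parser.add_argument('--n_topk', type=int, default=50, help="keep top-k tags")
    parser.add_argument('--n_samples', type=int, default=59049, help="samples per segment")
    parser.add_argument('--sr', type=int, default=16000, help="target sample rate")
    parser.add_argument('--n_worker', type=int, default=4, help="parallel workers")
    prepare_MTT_dataset(parser.parse_args())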