def main(input_filepath, output_filepath):
    """ Runs data processing scripts to turn raw data from (../raw) into
        cleaned data ready to be analyzed (saved in ../processed).
    """
    logger = logging.getLogger(__name__)
    logger.info('making final data set from raw data')

    input_filepath = Path(input_filepath)
    output_filepath = Path(output_filepath) / "all_incidents.feather"

    # Parse every snapshot file in parallel, leaving one core free.
    all_snapshots = list(input_filepath.glob("**/*.snapshot"))
    all_incidents = pqdm(all_snapshots, page_to_data_series, n_jobs=cpu_count() - 1)
    all_incidents = pd.DataFrame(all_incidents)

    logger.info(
        f"Dropping duplicates (identical snapshot content) from {all_incidents.shape[0]} original rows"
    )
    all_incidents.drop_duplicates(inplace=True)
    logger.info(f"Dropping duplicates leaves {all_incidents.shape[0]} rows")

    logger.info(f"Saving data to: {str(output_filepath)}")
    all_incidents.reset_index().rename(columns={
        'index': 'incident_name'
    }).to_feather(output_filepath)
    logger.info("Finished successfully.")
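# Usage sketch (not part of the original script): the module-level imports `main`
# relies on, plus a guarded invocation. `page_to_data_series` is assumed to be the
# project's own snapshot-parsing helper defined elsewhere, pqdm is assumed to come
# from pqdm.processes, and the paths below are illustrative.
import logging
from multiprocessing import cpu_count
from pathlib import Path

import pandas as pd
from pqdm.processes import pqdm

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    main('data/raw', 'data/processed')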
def create_unique_ins_labels(data, overwrite=False, base_path='.'):
    """
    Create a dictionary with the count of each existing atom-smiles in the train
    dataset and a dataframe with the atom-smiles in each compound.
    eg: SMILES dataframe

    :param data: Pandas data frame with columns ['file_name', 'SMILES']. [Pandas DF]
    :param overwrite: overwrite existing output files under base_path + '/data/'. [bool]
    :param base_path: base path of the environment. [str]
    :return: A dict of counts [dict] and a DataFrame of unique atom-smiles per compound.
    """
    smiles_list = data.SMILES.to_list()

    # Check whether the output files already exist.
    output_counts_path = base_path + '/data/unique_atom_smiles_counts.json'
    output_unique_atoms = base_path + '/data/unique_atoms_per_molecule.csv'
    output_mol_rarity = base_path + '/data/mol_rarity_train.csv'
    if all([os.path.exists(p) for p in [output_counts_path, output_unique_atoms]]):
        if overwrite:
            print(f'{color.BLUE}Output files exist, but overwriting.{color.END}')
        else:
            print(f'{color.BOLD}labels JSON{color.END} already exists, skipping process and reading file.\n',
                  f'{color.BLUE}Counts file read from:{color.END} {output_counts_path}\n',
                  f'{color.BLUE}Unique atoms file read from:{color.END} {output_unique_atoms}\n',
                  f'{color.BLUE}Mol rarity file read from:{color.END} {output_mol_rarity}\n',
                  f'if you want to {color.BOLD}overwrite the previous files{color.END}, '
                  f'call the function with {color.BOLD}overwrite=True{color.END}')
            return json.load(open(output_counts_path, 'r')), \
                pd.read_csv(output_unique_atoms)

    assert type(smiles_list) == list, 'Input smiles data type must be a LIST'
    n_jobs = multiprocessing.cpu_count() - 1

    # Get the unique atom-smiles in each compound and count them for sampling later.
    result = pqdm(smiles_list, _get_unique_atom_smiles_and_rarity, n_jobs=n_jobs,
                  desc='Calculating unique atom-smiles and rarity')
    result, sample_weights = list(map(list, zip(*result)))
    counts = Counter(x for xs in result for x in xs)

    # Save counts.
    with open(output_counts_path, 'w') as fout:
        json.dump(counts, fout)

    # Save sample weights.
    sample_weights = pd.DataFrame.from_dict(sample_weights)
    sample_weights.insert(0, "file_name", data.file_name)
    sample_weights.to_csv(output_mol_rarity, index=False)

    # Save the unique atoms in each molecule to oversample less represented classes later.
    unique_atoms_per_molecule = pd.DataFrame({'SMILES': smiles_list,
                                              'unique_atoms': [set(r) for r in result]})
    unique_atoms_per_molecule.to_csv(output_unique_atoms, index=False)

    print(f'{color.BLUE}Counts file saved at:{color.END} {output_counts_path}\n' +
          f'{color.BLUE}Unique atoms file saved at:{color.END} {output_unique_atoms}')

    return counts, unique_atoms_per_molecule
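# Usage sketch (not from the original project), assuming the package's private helper
# `_get_unique_atom_smiles_and_rarity` and the `color` constants are defined, and that
# a ./data directory exists for the output files. The file names and SMILES are toy values.
train_df = pd.DataFrame({
    'file_name': ['mol_000.png', 'mol_001.png'],
    'SMILES': ['CCO', 'c1ccccc1O'],
})
counts, unique_atoms_per_molecule = create_unique_ins_labels(train_df, overwrite=True, base_path='.')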
def numpy_dataset_stream_noaugment(train_files):
    from pqdm.processes import pqdm
    import multiprocessing

    # Convert every audio file to a feature-vector array in parallel.
    par_file = partial(file_to_vector_array_stream_test_data)
    d = pqdm(train_files, par_file, n_jobs=multiprocessing.cpu_count())

    list_audio = [i for i in d]

    # Stack the per-file arrays into a single 2-D array of feature vectors.
    newlist = numpy.asarray(list_audio)
    newlist = newlist.reshape(newlist.shape[0] * newlist.shape[1], newlist.shape[2])
    return newlist
def one_hot(dataset, column_prefix=None, n_coresJob=1, disableLoadBar=True, columns_use=None):
    """
    Applies the one-hot encoding preprocessing (e.g., [0,0,1], [0,1,0]).
    Note: if you use column_prefix it is not possible to undo the one-hot encoding.
    If column_prefix is 'column', the original column names are used as the prefix;
    otherwise the custom name provided is used.

    :param columns_use: Specific columns to encode (default None encodes every column)
    :param column_prefix: Prefix for the generated column names
    :param n_coresJob: Number of cores to use for multiprocessing (e.g., 1 column per process)
    :param disableLoadBar: Disables the progress bar (default = True)
    :param dataset: dataset to one-hot encode
    :return: A new dataset with the one-hot encoding transformation
    """
    dfFinal = pd.DataFrame()
    columns_Processing = []
    assert isinstance(dataset, pd.DataFrame), "Dataset needs to be of type Pandas"
    if isinstance(dataset, pd.DataFrame):
        if columns_use is not None:
            assert all(flag in dataset.columns for flag in columns_use), \
                "Use columns specific to the dataset; the following provided columns were not found: " \
                + ' '.join([j for j in columns_use])
            if set(columns_use).issubset(dataset.columns):
                for column in columns_use:
                    columns_Processing.append(dataset[column])
        else:
            for column in dataset:
                columns_Processing.append(dataset[column])

        # Encode each selected column in its own process.
        func = partial(__one_hot_single__, column_prefix=column_prefix)
        d = pqdm(columns_Processing, func, n_jobs=n_coresJob, disable=disableLoadBar)

        if columns_use is not None:
            # Re-attach the columns that were not encoded.
            dfFinal = pd.concat([i for i in d], axis=1)
            dfFinal = pd.concat([
                dfFinal,
                dataset[dataset.columns.difference(columns_use, sort=False)]
            ], axis=1, sort=True)
        else:
            dfFinal = pd.concat([i for i in d], axis=1)
    return dfFinal
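# Usage sketch (not from the original library), assuming the private worker
# `__one_hot_single__` is defined in this module. Only 'colour' is encoded;
# 'size' is carried over unchanged because it is outside columns_use.
toy = pd.DataFrame({
    'colour': ['red', 'blue', 'red'],
    'size': ['S', 'M', 'L'],
})
encoded = one_hot(toy, column_prefix='column', n_coresJob=2,
                  disableLoadBar=False, columns_use=['colour'])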
def idf(dataset, n_coresJob=1, disableLoadBar=True, columns_use=None):
    """
    The Inverse Document Frequency (IDF) transform uses f(x) = log(n / f_x),
    where n is the number of rows of the column and f_x is the frequency of value x.

    :param columns_use: List of columns to transform (default None transforms every column)
    :param disableLoadBar: Disables the progress bar (default = True)
    :param n_coresJob: Number of cores to use
    :param dataset: dataset to transform
    :return: Dataset with the IDF transformation
    """
    TransformedData = dataset.copy()
    columns_Processing = []
    assert isinstance(TransformedData, pd.DataFrame), "Dataset needs to be of type Pandas"
    if isinstance(TransformedData, pd.DataFrame):
        if columns_use is not None:
            assert all(flag in TransformedData.columns for flag in columns_use), \
                "Use columns specific to the dataset; the following provided columns were not found: " \
                + ' '.join([j for j in columns_use])
            if set(columns_use).issubset(TransformedData.columns):
                for column in columns_use:
                    columns_Processing.append(TransformedData[column])
        else:
            for column in TransformedData:
                columns_Processing.append(TransformedData[column])

        # Apply the IDF transform to each selected column in its own process.
        d = pqdm(columns_Processing, __idf_single__, n_jobs=n_coresJob, disable=disableLoadBar)

        if columns_use is not None:
            # Re-attach the columns that were not transformed.
            dfFinal = pd.concat([i for i in d], axis=1)
            dfFinal = pd.concat([
                dfFinal,
                TransformedData[TransformedData.columns.difference(columns_use, sort=False)]
            ], axis=1, sort=True)
        else:
            dfFinal = pd.concat([i for i in d], axis=1)
    return dfFinal
def scale_data(df, column=[], n_cores=1, scaleFunc="", customfunc=None):
    """
    Scales the selected columns of the dataset in parallel.

    :param df: dataset to scale
    :param column: list of columns to scale (default [] scales every column)
    :param n_cores: Number of cores to use
    :param scaleFunc: scaler to apply: 'min_max', 'std' or 'custom'
    :param customfunc: callable applied to each column when scaleFunc == 'custom'
    :return: the scaled dataset
    """
    assert isinstance(df, pd.DataFrame), "Dataset needs to be of type Pandas"
    assert scaleFunc in ("min_max", "std", "custom"), \
        "Specify a scaler (e.g., 'min_max', 'std' or 'custom')"
    if scaleFunc == 'custom':
        assert callable(customfunc), \
            "Please provide a callable for the custom scaling function you want to use"
    if column is not None:
        assert all(flag in df.columns for flag in column), \
            "Use columns specific to the dataset; the following provided columns were not found: " \
            + ' '.join([j for j in column])

    valArgs = []
    if len(column) == 0:
        # Scale every column; nothing is carried over unscaled.
        columns = df.columns.values
        diff = []
        for i in columns:
            valArgs.append(df[i])
    else:
        # Scale only the requested columns and carry the remaining ones over unchanged.
        for i in column:
            valArgs.append(df[i])
        diff = df.columns.difference(column, sort=False)

    if scaleFunc == "min_max":
        func = partial(scale_single_min_max)
    elif scaleFunc == "std":
        func = partial(scale_single_std)
    else:
        func = partial(customfunc)

    # Scale each selected column in its own process.
    d = pqdm(valArgs, func, n_jobs=n_cores)
    dfFinal = pd.concat([i for i in d], axis=1)
    Concated = pd.concat([df[diff], dfFinal[dfFinal.columns.values]], axis=1, sort=True)
    return Concated
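# Usage sketch (not from the original library): a hypothetical custom scaler passed
# through `customfunc`; it receives one pandas Series per process and returns the
# scaled Series. The data and the helper below are purely illustrative.
def scale_single_max_abs(series):
    # Hypothetical helper for illustration only: divide by the largest absolute value.
    return series / series.abs().max()

scaled = scale_data(pd.DataFrame({'a': [1.0, 2.0, 4.0], 'b': [10.0, 0.0, 5.0]}),
                    column=['a'], n_cores=2, scaleFunc='custom',
                    customfunc=scale_single_max_abs)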
def pcp(dataset=pd.DataFrame(), perc=0.05, mergeCategory="Others", n_coresJob=1,
        disableLoadBar=True, columns_use=None):
    """
    The Percentage Categorical Pruned (PCP) transform merges all least frequent levels
    (summing up to perc percent) into a single level. It works by first sorting the
    feature levels according to their frequency in the training data. The least
    frequent levels, summing up to a threshold percentage P, are then merged into a
    single category denoted "Others". The frequencies are computed over the whole dataset.

    :param columns_use: Specific columns to apply the transformation to (default None applies it to every column)
    :param disableLoadBar: Disables the progress bar (default = True)
    :param n_coresJob: Number of cores to use for the preprocessing
    :param mergeCategory: Category the pruned levels are merged into (default "Others")
    :param dataset: dataset to transform
    :param perc: threshold percentage P
    :return: the transformed dataset
    """
    TransformedData = dataset.copy()
    assert isinstance(TransformedData, pd.DataFrame), "Dataset needs to be of type Pandas"
    assert 0 <= perc <= 1, "Percentage goes from 0 to 1; it may neither be negative nor above 1"
    if isinstance(TransformedData, pd.DataFrame) and perc <= 1:
        columns_Processing = []
        if columns_use is not None:
            assert all(flag in TransformedData.columns for flag in columns_use), \
                "Use columns specific to the dataset; the following provided columns were not found: " \
                + ' '.join([j for j in columns_use])
            if set(columns_use).issubset(TransformedData.columns):
                for column in columns_use:
                    columns_Processing.append(TransformedData[column])
        else:
            for column in TransformedData:
                columns_Processing.append(TransformedData[column])

        # Prune each selected column in its own process.
        func = partial(__pcp_single__, perc_inner=perc, mergeCategoryinner=mergeCategory)
        d = pqdm(columns_Processing, func, n_jobs=n_coresJob, disable=disableLoadBar)

        if columns_use is not None:
            # Re-attach the columns that were not pruned.
            dfFinal = pd.concat([i for i in d], axis=1)
            dfFinal.columns = columns_use
            dfFinal = pd.concat([
                dfFinal,
                TransformedData[TransformedData.columns.difference(columns_use, sort=False)]
            ], axis=1, sort=True)
        else:
            dfFinal = pd.concat([i for i in d], axis=1)
            dfFinal.columns = TransformedData.columns
    return dfFinal
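# Worked sketch (not from the original library), assuming the private worker
# `__pcp_single__` is defined in this module. With perc=0.3, the docstring above
# suggests that 'green' and 'grey' (each 1/8 of the rows, 25% combined, below the
# 30% threshold) would be merged into "Others", while 'red' and 'blue' are kept.
colours = pd.DataFrame({'colour': ['red', 'red', 'red', 'blue', 'blue', 'blue', 'green', 'grey']})
pruned = pcp(colours, perc=0.3, n_coresJob=2, disableLoadBar=False)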