Example #1
import logging
from multiprocessing import cpu_count
from pathlib import Path

import pandas as pd
from pqdm.processes import pqdm  # process-based variant assumed


def main(input_filepath, output_filepath):
    """ Runs data processing scripts to turn raw data from (../raw) into
        cleaned data ready to be analyzed (saved in ../processed).
    """
    logger = logging.getLogger(__name__)
    logger.info('making final data set from raw data')

    input_filepath = Path(input_filepath)
    output_filepath = Path(output_filepath) / "all_incidents.feather"

    all_snapshots = list(input_filepath.glob("**/*.snapshot"))

    # page_to_data_series is a project-specific helper (not shown here); judging by
    # the code below, it turns one snapshot file into a pandas Series / table row.
    all_incidents = pqdm(all_snapshots,
                         page_to_data_series,
                         n_jobs=cpu_count() - 1)

    all_incidents = pd.DataFrame(all_incidents)

    logger.info(
        f"Dropping duplicates (identical snapshot content) from {all_incidents.shape[0]} original rows"
    )
    all_incidents.drop_duplicates(inplace=True)
    logger.info(f"{all_incidents.shape[0]} rows remain after dropping duplicates")

    logger.info(f"Saving data to: {str(output_filepath)}")
    all_incidents.reset_index().rename(columns={
        'index': 'incident_name'
    }).to_feather(output_filepath)

    logger.info("Finished successfully.")
Example #2
def create_unique_ins_labels(data, overwrite=False, base_path='.'):
    """
    Create a dictionary with the count of each atom-SMILES present in the
    train dataset and a DataFrame with the unique atom-SMILES in each compound.

    :param data: Pandas DataFrame with columns ['file_name', 'SMILES']. [Pandas DF]
    :param overwrite: overwrite existing output files under base_path + '/data/'. [bool]
    :param base_path: base path of the environment. [str]
    :return: A dict of counts [dict] and a DataFrame of unique atom-SMILES per compound.
    """
    smiles_list = data.SMILES.to_list()

    # check if file exists
    output_counts_path = base_path + '/data/unique_atom_smiles_counts.json'
    output_unique_atoms = base_path + '/data/unique_atoms_per_molecule.csv'
    output_mol_rarity = base_path + '/data/mol_rarity_train.csv'

    if all([os.path.exists(p) for p in [output_counts_path, output_unique_atoms]]):
        if overwrite:
            print(f'{color.BLUE}Output files exist, but overwriting.{color.END}')
        else:
            print(f'{color.BOLD}labels JSON {color.END} already exists, skipping process and reading file.\n',
                  f'{color.BLUE}Counts file read from:{color.END} {output_counts_path}\n',
                  f'{color.BLUE}Unique atoms file read from:{color.END} {output_unique_atoms}\n',
                  f'{color.BLUE}Mol rarity file read from:{color.END} {output_mol_rarity}\n',
                  f'if you want to {color.BOLD}overwrite previous files{color.END}, '
                  f'call the function with {color.BOLD}overwrite=True{color.END}')

            return json.load(open(output_counts_path, 'r')), \
                   pd.read_csv(output_unique_atoms)

    assert type(smiles_list) == list, 'Input smiles data type must be a LIST'

    n_jobs = multiprocessing.cpu_count() - 1

    # get unique atom-smiles in each compound and count for sampling later.
    result = pqdm(smiles_list, _get_unique_atom_smiles_and_rarity,
                  n_jobs=n_jobs, desc='Calculating unique atom-smiles and rarity')
    result, sample_weights = list(map(list, zip(*result)))
    counts = Counter(x for xs in result for x in xs)

    # save counts
    with open(output_counts_path, 'w') as fout:
        json.dump(counts, fout)

    # save sample weights
    sample_weights = pd.DataFrame.from_dict(sample_weights)
    sample_weights.insert(0, "file_name", data.file_name)
    sample_weights.to_csv(output_mol_rarity, index=False)

    # save unique atoms in each molecule to oversample less represented classes later
    unique_atoms_per_molecule = pd.DataFrame({'SMILES': smiles_list, 'unique_atoms': [set(r) for r in result]})
    unique_atoms_per_molecule.to_csv(output_unique_atoms, index=False)

    print(f'{color.BLUE}Counts file saved at:{color.END} {output_counts_path}\n' +
          f'{color.BLUE}Unique atoms file saved at:{color.END} {output_unique_atoms}')

    return counts, unique_atoms_per_molecule
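
A hedged usage sketch, assuming the function above (together with its project-specific helpers `color` and `_get_unique_atom_smiles_and_rarity`) is importable from a hypothetical module named `labels`, and that the outputs go under `./data`.

import os

import pandas as pd

from labels import create_unique_ins_labels  # hypothetical module name

# Tiny stand-in for the real training table with ['file_name', 'SMILES'] columns.
train = pd.DataFrame({
    "file_name": ["mol_000.png", "mol_001.png", "mol_002.png"],
    "SMILES": ["CCO", "c1ccccc1O", "CC(=O)O"],
})

if __name__ == "__main__":
    os.makedirs("./data", exist_ok=True)  # the function writes its outputs under base_path + '/data/'
    counts, unique_atoms = create_unique_ins_labels(train, overwrite=True, base_path=".")
    print(counts)               # counts of atom-SMILES occurrences across the train set
    print(unique_atoms.head())  # one row per compound with its set of unique atom-SMILES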
Example #3
def numpy_dataset_stream_noaugment(train_files):
    import multiprocessing
    from functools import partial

    import numpy
    from pqdm.processes import pqdm

    # file_to_vector_array_stream_test_data is a project-specific helper; judging by
    # the reshape below, it returns one 2-D array per input file.
    par_file = partial(file_to_vector_array_stream_test_data)

    # Extract the per-file arrays in parallel, one job per CPU core.
    d = pqdm(train_files, par_file, n_jobs=multiprocessing.cpu_count())

    # Stack the arrays and collapse the first two axes into one.
    newlist = numpy.asarray(list(d))
    newlist = newlist.reshape(newlist.shape[0] * newlist.shape[1], newlist.shape[2])
    return newlist
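
The same pattern in isolation, with a hypothetical `extract_features` stand-in for `file_to_vector_array_stream_test_data` (it just returns a fixed-shape dummy array per file), so the parallel map and the flattening reshape can be run end to end.

import multiprocessing

import numpy
from pqdm.processes import pqdm


def extract_features(path):
    # Hypothetical stand-in: a real implementation would load the audio file at
    # `path` and return a 2-D (frames x features) array.
    return numpy.full((5, 3), fill_value=len(path), dtype=float)


if __name__ == "__main__":
    files = [f"clip_{i}.wav" for i in range(4)]
    arrays = pqdm(files, extract_features, n_jobs=multiprocessing.cpu_count())

    stacked = numpy.asarray(list(arrays))         # shape: (files, frames, features)
    flat = stacked.reshape(-1, stacked.shape[2])  # shape: (files * frames, features)
    print(flat.shape)                             # (20, 3)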
Example #4
def one_hot(dataset,
            column_prefix=None,
            n_coresJob=1,
            disableLoadBar=True,
            columns_use=None):
    """ Application of the one-hot encoding preprocessing (e.g., [0,0,1
                                                                 0,1,0])
        Note: if you use the column_prefixer it is not possible to undo the one_hot encoding preprocessing
        If column_prefix is column then the column names will be used, else it will use the custom name provided
        :param columns_use:
        :param column_prefix:
        :param n_coresJob: Number of cores you need for multiprocessing (e.g., 1 column per process)
        :param disableLoadBar: Chooses if you want load bar or not (default = True)
        :param dataset: dataset to one-hot encode

        :return: A new Dataset with the one-hot encoding transformation
    """
    dfFinal = pd.DataFrame()
    columns_Processing = []
    assert isinstance(dataset,
                      pd.DataFrame), "Dataset needs to be of type Pandas"
    if isinstance(dataset, pd.DataFrame):
        if columns_use is not None:
            assert all(flag in dataset.columns for flag in columns_use), \
                "The provided columns were not all found in the dataset: " + ' '.join(columns_use)
            if set(columns_use).issubset(dataset.columns):

                for column in columns_use:
                    columns_Processing.append(dataset[column])

        else:
            for column in dataset:
                columns_Processing.append(dataset[column])

        func = partial(__one_hot_single__, column_prefix=column_prefix)
        d = pqdm(columns_Processing,
                 func,
                 n_jobs=n_coresJob,
                 disable=disableLoadBar)

        if columns_use is not None:
            dfFinal = pd.concat([i for i in d], axis=1)
            dfFinal = pd.concat([
                dfFinal, dataset[dataset.columns.difference(columns_use,
                                                            sort=False)]
            ],
                                axis=1,
                                sort=True)
        else:
            dfFinal = pd.concat([i for i in d], axis=1)
    return dfFinal
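
A hedged usage sketch, assuming `one_hot` (and its private helper `__one_hot_single__`) lives in a hypothetical module named `preprocessing`.

import pandas as pd

from preprocessing import one_hot  # hypothetical module name

df = pd.DataFrame({
    "colour": ["red", "blue", "red", "green"],
    "size":   ["S", "M", "M", "L"],
    "price":  [10.0, 12.5, 11.0, 9.0],
})

if __name__ == "__main__":
    # Encode only the two categorical columns; "price" is carried over unchanged.
    encoded = one_hot(df,
                      column_prefix="column",  # per the docstring: use the source column names as prefixes
                      n_coresJob=2,
                      disableLoadBar=True,
                      columns_use=["colour", "size"])
    print(encoded.head())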
Example #5
def idf(dataset, n_coresJob=1, disableLoadBar=True, columns_use=None):
    """
    The Inverse Document Frequency (IDF) transformation replaces each value x of a
    column with f(x) = log(n / f_x), where n is the number of rows in the column
    and f_x is the frequency of x.

    :param columns_use: List of columns to transform (default None transforms every column)
    :param disableLoadBar: Chooses whether to disable the load bar (default = True)
    :param n_coresJob: Number of cores to use
    :param dataset: dataset to transform

    :return: Dataset with the IDF transformation
    """

    TransformedData = dataset.copy()
    columns_Processing = []
    assert isinstance(TransformedData,
                      pd.DataFrame), "Dataset needs to be of type Pandas"
    if isinstance(TransformedData, pd.DataFrame):
        if columns_use is not None:
            assert all(flag in TransformedData.columns for flag in columns_use), \
                "The provided columns were not all found in the dataset: " + ' '.join(columns_use)
            if set(columns_use).issubset(TransformedData.columns):

                for column in columns_use:
                    columns_Processing.append(TransformedData[column])

        else:
            for column in TransformedData:
                columns_Processing.append(TransformedData[column])

        d = pqdm(columns_Processing,
                 __idf_single__,
                 n_jobs=n_coresJob,
                 disable=disableLoadBar)
        if columns_use is not None:
            dfFinal = pd.concat([i for i in d], axis=1)
            dfFinal = pd.concat([
                dfFinal, TransformedData[TransformedData.columns.difference(
                    columns_use, sort=False)]
            ],
                                axis=1,
                                sort=True)
        else:
            dfFinal = pd.concat([i for i in d], axis=1)

        return dfFinal
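
A hedged usage sketch in the same style, assuming `idf` (and `__idf_single__`) comes from the same hypothetical `preprocessing` module.

import pandas as pd

from preprocessing import idf  # hypothetical module name

df = pd.DataFrame({
    "city":  ["Porto", "Lisbon", "Porto", "Porto", "Faro"],
    "sales": [100, 250, 90, 120, 60],
})

if __name__ == "__main__":
    # Replace each category in "city" with log(n / f_x); "sales" is kept untouched.
    transformed = idf(df, n_coresJob=1, disableLoadBar=True, columns_use=["city"])
    print(transformed.head())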
Example #6
def scale_data(df, column=[], n_cores=1, scaleFunc="", customfunc=None):
    assert isinstance(df, pd.DataFrame), "Dataset needs to be of type Pandas"
    assert (scaleFunc != "" or scaleFunc == "min_max" or scaleFunc == "std" or scaleFunc == "custom"), "Specify a " \
                                                                                                       "scaler (" \
                                                                                                       "e.g., " \
                                                                                                       "'min_max' or " \
                                                                                                       "'std') or " \
                                                                                                       "'custom' "

    if scaleFunc == 'custom':
        assert (
            callable(customfunc)
        ), "Please provide a function for the custom function you want to use"

    if column is not None:
        assert all(flag in df.columns for flag in column), \
            "The provided columns were not all found in the dataset: " + ' '.join(column)
    valArgs = []
    if len(column) == 0:
        columns = df.columns.values
        diff = columns
        for i in columns:
            valArgs.append(df[i])
    else:
        columns = df.columns.values
        for i in column:
            valArgs.append(df[i])
        diff = columns
    if scaleFunc == "min_max":
        func = partial(scale_single_min_max)
    elif scaleFunc == "std":
        func = partial(scale_single_std)
    else:
        func = partial(customfunc)
    d = pqdm(valArgs, func, n_jobs=n_cores)

    dfFinal = pd.concat([i for i in d], axis=1)

    Concated = pd.concat([df[diff], dfFinal[dfFinal.columns.values]],
                         axis=1,
                         sort=True)

    return Concated
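
A hedged usage sketch, again assuming a hypothetical `preprocessing` module. The custom callable is defined at module level because pqdm runs it in worker processes and therefore needs to pickle it.

import pandas as pd

from preprocessing import scale_data  # hypothetical module name

df = pd.DataFrame({"a": [1.0, 2.0, 3.0, 4.0], "b": [10.0, 20.0, 30.0, 40.0]})


def robust_scale(series):
    # Hypothetical custom scaler: centre on the median and scale by the IQR.
    iqr = series.quantile(0.75) - series.quantile(0.25)
    return (series - series.median()) / iqr


if __name__ == "__main__":
    scaled = scale_data(df, column=["a", "b"], n_cores=1, scaleFunc="min_max")
    scaled_custom = scale_data(df, column=["a"], n_cores=1,
                               scaleFunc="custom", customfunc=robust_scale)
    print(scaled.head())
    print(scaled_custom.head())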
Example #7
def pcp(dataset=pd.DataFrame(),
        perc=0.05,
        mergeCategory="Others",
        n_coresJob=1,
        disableLoadBar=True,
        columns_use=None):
    """
    The Percentage Categorical Pruned (PCP) transformation merges the least frequent levels
    (summing up to perc percent) into a single level. It works by first sorting the feature
    levels according to their frequency in the training data; the least frequent levels
    (summing up to the threshold percentage perc) are then merged into a single category,
    "Others" by default. Note that the frequencies are computed over the whole dataset.

    :param columns_use: Specific columns to apply the transformation to (default None applies to every column)
    :param disableLoadBar: Chooses whether to disable the load bar (default = True)
    :param n_coresJob: Number of cores to use for the preprocessing
    :param mergeCategory: Category into which the pruned levels are merged (default "Others")
    :param dataset: dataset to transform
    :param perc: threshold percentage (between 0 and 1)

    :return: the transformed dataset
    """

    TransformedData = dataset.copy()

    assert isinstance(TransformedData,
                      pd.DataFrame), "Dataset needs to be of type Pandas"
    assert 0 <= perc <= 1, "Percentage goes from 0 to 1, it may neither be negative nor above 1"
    if isinstance(TransformedData, pd.DataFrame) and perc <= 1:
        columns_Processing = []
        if columns_use is not None:
            assert all(flag in TransformedData.columns for flag in columns_use), \
                "The provided columns were not all found in the dataset: " + ' '.join(columns_use)
            if set(columns_use).issubset(TransformedData.columns):

                for column in columns_use:
                    columns_Processing.append(TransformedData[column])

        else:
            for column in TransformedData:
                columns_Processing.append(TransformedData[column])
        func = partial(__pcp_single__,
                       perc_inner=perc,
                       mergeCategoryinner=mergeCategory)

        d = pqdm(columns_Processing,
                 func,
                 n_jobs=n_coresJob,
                 disable=disableLoadBar)

        if columns_use is not None:
            dfFinal = pd.concat([i for i in d], axis=1)
            dfFinal.columns = columns_use
            dfFinal = pd.concat([
                dfFinal, TransformedData[TransformedData.columns.difference(
                    columns_use, sort=False)]
            ],
                                axis=1,
                                sort=True)
        else:
            dfFinal = pd.concat([i for i in d], axis=1)
            dfFinal.columns = TransformedData.columns
        return dfFinal
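
A hedged usage sketch, assuming `pcp` (and `__pcp_single__`) also comes from the hypothetical `preprocessing` module; per the docstring, levels whose combined frequency falls within the least frequent perc of the data are folded into "Others".

import pandas as pd

from preprocessing import pcp  # hypothetical module name

# 100 rows: two dominant levels plus five rare ones (1% each).
df = pd.DataFrame({
    "country": ["PT"] * 50 + ["ES"] * 45 + ["FR", "DE", "IT", "NL", "BE"],
})

if __name__ == "__main__":
    pruned = pcp(df, perc=0.05, mergeCategory="Others",
                 n_coresJob=1, disableLoadBar=True, columns_use=["country"])
    print(pruned["country"].value_counts())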