Example #1
def TrainingModel(DataSet, OutputPathModel, Verbose=True):
    if isinstance(DataSet, _pd.DataFrame):
        df = DataSet
    else:
        df = _pd.read_csv(DataSet)
        df.index = df['Id Unico']
        df.drop(['Id Unico'], inplace=True, axis=1)

    df['Abandono'] = df['Ab'].apply(lambda x: 0 if x < 0 else 1)
    df.drop(['Ab'], inplace=True, axis=1)

    if Verbose:
        print("Number of Rows: {}\nNumber Of Columns: {}\n".format(
            df.shape[0], df.shape[1]))
        print("Number of Students that Not Dropped (0) and Dropped (1)")
        print(_pd.value_counts(df['Abandono'], sort=True))
        count_classes = _pd.value_counts(df['Abandono'], sort=True)
        count_classes.plot(kind='bar', rot=0)
        _plt.xticks(range(2), LABELS)
        _plt.title("Frequency by observation number")
        _plt.xlabel("Abandono")
        _plt.ylabel("Number of Observations")

    y = df['Abandono']
    X = df.drop('Abandono', axis=1)

    seed = 46
    # Split into training and test sets
    X_train, X_test, y_train, y_test = _train_test_split(X,
                                                         y,
                                                         train_size=0.8,
                                                         random_state=seed)

    # SMOTE over-sampling combined with Tomek-link cleaning to rebalance the classes.
    # `ratio` is the deprecated alias of `sampling_strategy`, so only the latter is kept,
    # and `fit_sample` has been renamed `fit_resample` in recent imbalanced-learn releases.
    os_us = _SMOTETomek(sampling_strategy='auto', random_state=seed)
    X_train_res, y_train_res = os_us.fit_resample(X_train, y_train)

    if Verbose:
        print("\nDistribution before resampling {}".format(Counter(y_train)))
        print("Distribution after resampling {}".format(Counter(y_train_res)))

    num_trees = 50
    rfc = _RandomForestClassifier(
        n_estimators=num_trees,
        #class_weight="balanced",
        random_state=seed,
        max_features=4)
    # Fit on the resampled training data; otherwise the SMOTE-Tomek step above is unused.
    rfc.fit(X_train_res, y_train_res)

    if Verbose:
        pred_y = rfc.predict(X_test)
        print()
        _mostrar_resultados(y_test, pred_y)
        print("Saving the model in the following path: {}".format(
            OutputPathModel))

    _pickle.dump(rfc, open(OutputPathModel, 'wb'))

    if Verbose:
        _plt.show()
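A minimal sketch of how this trainer might be invoked, assuming the module-level helpers used above (`_pd`, `_plt`, `_SMOTETomek`, `_RandomForestClassifier`, `_pickle`, `LABELS`, `_mostrar_resultados`) are in place; the CSV path and model path are placeholders, and the file is expected to contain the `Id Unico` and `Ab` columns.

# Hypothetical invocation; 'students.csv' and the output path are placeholders.
model_path = "dropout_rfc.pkl"
TrainingModel("students.csv", model_path, Verbose=True)

# The persisted classifier can later be reloaded for scoring new students.
import pickle
with open(model_path, "rb") as fh:
    rfc = pickle.load(fh)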
Example #2
def test_set1(size=50, plot=False):
    """Generate a toy 2-D dataset of four Gaussian clusters with `size` points each."""
    # Four cluster centres; `std` is the shared 2x2 covariance matrix of each cluster.
    mu = [(1, 100), (2, 100), (3, 100), (4, 100)]
    std = _np.array([[0.05, 0], [0, 50]])

    X = []
    y = []
    for i in range(len(mu)):
        for _ in range(size):
            X.append(_np.random.multivariate_normal(mean=mu[i], cov=std))
            y.append(i)
    X = _np.array(X)
    y = _np.array(y)

    if plot:
        col = ['red', 'blue', 'green', 'orange']
        _plt.figure()
        _plt.subplot(1, 2, 1)
        for i in range(X.shape[0]):
            _plt.plot(X[i, 0], X[i, 1], '.', color=col[y[i]])
        _plt.axis('equal')

        # Same scatter without the equal-axis constraint, for comparison.
        _plt.subplot(1, 2, 2)
        for i in range(X.shape[0]):
            _plt.plot(X[i, 0], X[i, 1], '.', color=col[y[i]])

    X_train, X_test, y_train, y_test = _train_test_split(X, y, test_size=0.1)
    # Note the return order: (X_train, y_train, X_test, y_test), not sklearn's usual order.
    return X_train, y_train, X_test, y_test
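A quick sketch of how this generator might be used, keeping the non-standard return order in mind:

# 50 points per cluster -> 200 samples, split 90/10.
X_train, y_train, X_test, y_test = test_set1(size=50, plot=True)
print(X_train.shape, X_test.shape)  # (180, 2) (20, 2)
_plt.show()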
Example #3
def temporal_train_test_split(
    y: ACCEPTED_Y_TYPES,
    X: Optional[pd.DataFrame] = None,
    test_size: Optional[Union[int, float]] = None,
    train_size: Optional[Union[int, float]] = None,
    fh: Optional[FORECASTING_HORIZON_TYPES] = None,
) -> SPLIT_TYPE:
    """Split arrays or matrices into sequential train and test subsets.

    Creates train/test splits over an endogenous array and optional exogenous
    arrays.

    This is a wrapper of scikit-learn's ``train_test_split`` that
    does not shuffle the data.

    Parameters
    ----------
    y : pd.Series
        Target series
    X : pd.DataFrame, optional (default=None)
        Exogenous data
    test_size : float, int or None, optional (default=None)
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        relative number of test samples. If None, the value is set to the
        complement of the train size. If ``train_size`` is also None, it will
        be set to 0.25.
    train_size : float, int, or None, (default=None)
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the train split. If
        int, represents the relative number of train samples. If None,
        the value is automatically set to the complement of the test size.
    fh : ForecastingHorizon, optional (default=None)
        Forecasting horizon used to define the split; cannot be combined with
        ``test_size`` or ``train_size``.

    Returns
    -------
    splitting : tuple, length=2 * len(arrays)
        List containing train-test split of `y` and `X` if given.

    References
    ----------
    .. [1] Adapted from https://github.com/alkaline-ml/pmdarima/
    """
    if fh is not None:
        if test_size is not None or train_size is not None:
            raise ValueError(
                "If `fh` is given, `test_size` and `train_size` cannot "
                "also be specified."
            )
        return _split_by_fh(y, fh, X=X)
    else:
        series = (y,) if X is None else (y, X)
        return _train_test_split(
            *series,
            shuffle=False,
            stratify=None,
            test_size=test_size,
            train_size=train_size,
        )
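A minimal usage sketch; the monthly series below is only illustrative, since the split is purely positional and never shuffles.

import pandas as pd

# Hypothetical monthly series.
y = pd.Series(range(36), index=pd.period_range("2020-01", periods=36, freq="M"))

# The last 12 observations become the test set; temporal order is preserved.
y_train, y_test = temporal_train_test_split(y, test_size=12)
print(len(y_train), len(y_test))  # 24 12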
Example #4
def train_test_split(X: Sequence, y: Sequence) -> tuple:
    """Custom wrapper around Sklearn's `train_test_split` function.

    By using this wrapper in our code, we make sure that we always use the same
    split parameters: a stratified 80/20 split with the project-wide random seed.
    """
    return _train_test_split(X,
                             y,
                             test_size=0.2,
                             random_state=config.RANDOM_SEED,
                             stratify=y)
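A sketch of a call to the wrapper; `config.RANDOM_SEED` is whatever seed the project's `config` module defines, so the exact rows selected depend on it.

# Toy, perfectly balanced labels so the stratified 80/20 split is well defined.
X = list(range(100))
y = [0] * 50 + [1] * 50
X_train, X_test, y_train, y_test = train_test_split(X, y)
print(len(X_train), len(X_test))  # 80 20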
Example #5
def temporal_train_test_split(y,
                              X=None,
                              test_size=None,
                              train_size=None,
                              fh=None):
    """Split arrays or matrices into sequential train and test subsets
    Creates train/test splits over endogenous arrays an optional exogenous
    arrays. This is a wrapper of scikit-learn's ``train_test_split`` that
    does not shuffle.

    Parameters
    ----------
    y : pd.Series
        Target series
    X : pd.DataFrame, optional (default=None)
        Exogenous data
    test_size : float, int or None, optional (default=None)
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        relative number of test samples. If None, the value is set to the
        complement of the train size. If ``train_size`` is also None, it will
        be set to 0.25.
    train_size : float, int, or None, (default=None)
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the train split. If
        int, represents the relative number of train samples. If None,
        the value is automatically set to the complement of the test size.
    fh : ForecastingHorizon, optional (default=None)
        Forecasting horizon used to define the split; cannot be combined with
        ``test_size`` or ``train_size``.

    Returns
    -------
    splitting : list, length=2 * len(arrays)
        List containing train-test split of inputs.

    References
    ----------
    .. [1] Adapted from https://github.com/alkaline-ml/pmdarima/
    """
    if fh is not None:
        if test_size is not None or train_size is not None:
            raise ValueError(
                "If `fh` is given, `test_size` and `train_size` cannot "
                "also be specified.")
        return _split_y_by_fh(y, fh, X=X)
    else:
        series = (y, ) if X is None else (y, X)
        return _train_test_split(
            *series,
            shuffle=False,
            stratify=None,
            test_size=test_size,
            train_size=train_size,
        )
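This is the same sktime-style helper as Example #3; the sketch below exercises the `fh` branch instead, assuming a `ForecastingHorizon` class like sktime's is available (the import path is an assumption).

from sktime.forecasting.base import ForecastingHorizon  # assumed import path
import pandas as pd

y = pd.Series(range(24))
# Hold out the next three steps after the training cut-off.
fh = ForecastingHorizon([1, 2, 3], is_relative=True)
y_train, y_test = temporal_train_test_split(y, fh=fh)
print(len(y_train), len(y_test))  # 21 3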
Example #6
def stratified_split(df: pd.DataFrame, frac: float,
                     column: str) -> (pd.DataFrame, pd.DataFrame):
    """Stratified split on `column`, holding out a fraction `frac` as the test set.

    Rows whose label occurs only once cannot be stratified, so they are kept in
    the training set.
    """
    label_count = df[column].value_counts().to_dict()
    labels_we_can_use = df[column].apply(lambda x: label_count[x] > 1)
    items_with_count_one = df[~labels_we_can_use].copy()
    items_needing_split = df[labels_we_can_use].copy()
    train, test = _train_test_split(items_needing_split,
                                    test_size=frac,
                                    stratify=items_needing_split[column])
    train = pd.concat([train, items_with_count_one], axis=0,
                      sort=True)  #.reset_index(drop = True)
    return train, test
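A small sketch with a toy frame; the singleton label 'c' cannot be stratified, so that row always lands in the training set.

import pandas as pd

df = pd.DataFrame({
    "text": ["t1", "t2", "t3", "t4", "t5"],
    "label": ["a", "a", "b", "b", "c"],   # 'c' occurs only once
})
train, test = stratified_split(df, frac=0.5, column="label")
print(len(train), len(test))  # 3 2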
Example #7
def miml(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the Multi-Instance Multi-Label Image Dataset
    on the file system. Dataset available at: http://lamda.nju.edu.cn/data_MIMLimage.ashx

    Parameters
    ----------
    args : argparse.Namespace (or any object with an ``output_folder`` attribute)
        Arguments necessary to run this routine. In particular, it is necessary to
        provide ``output_folder`` as a string containing the path where the dataset
        will be downloaded.

    Returns
    -------
        None
    """
    # Download the files
    url = 'http://lamda.nju.edu.cn/files/miml-image-data.rar'
    if not os.path.exists(
            os.path.join(args.output_folder, 'miml-image-data.rar')):
        print('Downloading file!')
        filename = wget.download(url, out=args.output_folder)
    else:
        print('File already downloaded!')
        filename = os.path.join(args.output_folder, 'miml-image-data.rar')

    # Extract the files
    path_to_rar = filename
    path_to_output = os.path.join(args.output_folder, 'tmp_miml')
    rarfile.RarFile(path_to_rar).extractall(path_to_output)
    path_to_rar = os.path.join(path_to_output, 'original.rar')
    rarfile.RarFile(path_to_rar).extractall(path_to_output)
    path_to_rar = os.path.join(path_to_output, 'processed.rar')
    rarfile.RarFile(path_to_rar).extractall(path_to_output)
    print('Extracted files...')

    # Load the mat file
    mat = _loadmat(os.path.join(path_to_output, 'miml data.mat'))
    targets = mat['targets'].T
    classes = [item[0][0] for item in mat['class_name']]
    # Add filename at 0-index to correctly format the CSV headers
    classes.insert(0, 'filename')

    # Get list of all image files in the folder
    images = [
        item
        for item in _get_all_files_in_folders_and_subfolders(path_to_output)
        if item.endswith('jpg')
    ]
    images = sorted(images,
                    key=lambda e: int(os.path.basename(e).split('.')[0]))

    # Make splits
    train_data, test_data, train_labels, test_labels = _train_test_split(
        images, targets, test_size=0.2, random_state=42)
    train_data, val_data, train_labels, val_labels = _train_test_split(
        train_data, train_labels, test_size=0.2, random_state=42)

    # print('Size of splits\ntrain:{}\nval:{}\ntest:{}'.format(len(train_data),
    #                                                     len(val_data),
    #                                                     len(test_data)))

    # Make output folders
    dataset_root = os.path.join(args.output_folder, 'MIML')
    train_folder = os.path.join(dataset_root, 'train')
    val_folder = os.path.join(dataset_root, 'val')
    test_folder = os.path.join(dataset_root, 'test')

    make_folder_if_not_exists(dataset_root)
    make_folder_if_not_exists(train_folder)
    make_folder_if_not_exists(val_folder)
    make_folder_if_not_exists(test_folder)

    def _write_data_to_folder(data, labels, folder, classes):
        dest = os.path.join(folder, 'images')
        make_folder_if_not_exists(dest)
        for image, label in zip(data, labels):
            shutil.copy(image, dest)

        rows = np.column_stack(
            ([os.path.join('images', os.path.basename(item))
              for item in data], labels))
        rows = sorted(rows,
                      key=lambda e: int(e[0].split('/')[1].split('.')[0]))
        output_csv = pd.DataFrame(rows)
        output_csv.to_csv(os.path.join(folder, 'labels.csv'),
                          header=classes,
                          index=False)
        return

    # Write the images to the correct folders
    print('Writing the data to the filesystem')
    _write_data_to_folder(train_data, train_labels, train_folder, classes)
    _write_data_to_folder(val_data, val_labels, val_folder, classes)
    _write_data_to_folder(test_data, test_labels, test_folder, classes)

    os.remove(filename)
    shutil.rmtree(path_to_output)
    print('All done!')
    return
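A sketch of a possible invocation; the output folder is a placeholder, and the routine needs network access plus the `wget` and `rarfile` dependencies used above.

from argparse import Namespace

# Hypothetical call: downloads the MIML archive, extracts it, and writes
# train/val/test folders with labels.csv files under <output_folder>/MIML.
miml(Namespace(output_folder="./datasets"))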
Example #8
def train_val_test_split(X, y, *, split: Any = 0.7):
    """Split datasets into train, validation, and testing sets.

    Given training data X and training labels y, the method will split the data
    into relevant training, validation, and testing sets based on the `split` parameter.

    Usage:

    The method can be called directly with training data.

    >>> X = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
    >>> y = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    >>> X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(X, y, split = [0.6, 0.2, 0.2])

    Parameters:
       - X: The training data, should be lists or arrays.
       - y: The training labels, should be lists or arrays.
       - split: How you want to split the data. Either a float m, which will represent the percentage of
                training data, and the val/test data will have a percentage (1 - m)/2, or a list of three numbers,
                containing the exact float percentages for train/val/test data. Defaults to 70/15/15 split.
    Returns:
       - Six arrays: training data, validation data, test data, train labels, validation labels, test labels.
    """
    # Verify and segment provided split.
    if isinstance(split, float):
        train_split = split
        val_split = test_split = (1 - split) / 2
    elif isinstance(split, (list, tuple)):
        if not len(split) == 3:
            raise ValueError(
                "If you are providing a list of percentages for the train/val/test split, it "
                f"must contain three numbers, got {len(split)}.")
        train_split = split[0]
        val_split = split[1]
        test_split = split[2]
        # Use a tolerance: an exact `== 1` check rejects valid splits such as
        # [0.6, 0.2, 0.2] because of floating-point rounding.
        if abs(train_split + val_split + test_split - 1) > 1e-9:
            raise ValueError(
                "If you are providing a list of percentages for the train/val/test split, it "
                f"should add up to 1, got {train_split + val_split + test_split}"
            )
    else:
        raise TypeError(
            "Split argument should either be a float representing the training percentage, "
            f"or a list containing the train/val/test percentages, got {type(split)}."
        )

    # Convert validation/test split to relative numbers.
    total_test_val_split = val_split + test_split
    val_split = val_split / total_test_val_split

    # First, convert data to train/overflow.
    X_train, X_overflow, y_train, y_overflow = _train_test_split(
        X, y, train_size=train_split)

    # Then, convert overflow to val/test.
    X_val, X_test, y_val, y_test = _train_test_split(X_overflow,
                                                     y_overflow,
                                                     train_size=val_split)

    # Finally, return split training, validation, and test data.
    return X_train, X_val, X_test, y_train, y_val, y_test
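A short sketch of the single-float form, which keeps 70% for training and splits the remainder evenly between validation and test.

X = list(range(100))
y = [i % 2 for i in range(100)]
X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(X, y, split=0.7)
print(len(X_train), len(X_val), len(X_test))  # 70 15 15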
Example #9
def generate_data(extraction_data=DEFAULT_DATA_DIRECTORY,
                  train_output_path=DEFAULT_OUTPUT_TRAIN_PATH,
                  test_output_path=DEFAULT_OUTPUT_TEST_PATH,
                  labels_output_path=DEFAULT_OUTPUT_LABELS_PATH):
    """TODO: Update this

    Generate classification training data from content extraction dataset

    This function is highly dependent on the structure of the content extraction
    dataset and should be used accordingly. If changes are made there, they will
    affect the result of this function. Check the output accordingly.

    Given the "Corrected" extraction data, create a CSV for each line of those
    files, assigning a class -- "title", "ingredient", "instruction", "other" --
    to each line.

    Parameters
    ----------
    extraction_data : string
        The path to the directory containing the "Corrected" extraction training
        data
    train_output_path : string
        The desired path of the training-split CSV
    test_output_path : string
        The desired path of the test-split CSV
    labels_output_path : string
        The desired path of the text file listing the class labels
    """

    original_text = _numpy.array([], dtype='object')
    classified_type = _numpy.array([], dtype='object')

    for filename in _os.listdir(extraction_data):
        filename = _os.path.join(extraction_data, filename)
        # Skip non-text entries before opening anything.
        if not filename.endswith('.txt'):
            continue

        with open(filename) as text_file:
            lines = [line.rstrip() for line in text_file]

            original_text = _numpy.append(original_text, lines[0])
            classified_type = _numpy.append(classified_type, 'title')

            current_class = ''
            ingredients_done = False
            for line in lines[1:]:
                if line == '':
                    current_class = 'other'
                    continue

                original_text = _numpy.append(original_text, line)
                classified_type = _numpy.append(classified_type, current_class)

                if current_class == 'other':
                    if not ingredients_done:
                        current_class = 'ingredient'
                    else:
                        current_class = 'instruction'
                    ingredients_done = True

    data_frame = _pandas.DataFrame(
        columns=['text', 'title', 'ingredient', 'instruction', 'other'])
    data_frame['text'] = original_text
    data_frame['title'] = (classified_type == 'title').astype('int')
    data_frame['ingredient'] = (classified_type == 'ingredient').astype('int')
    data_frame['instruction'] = (
        classified_type == 'instruction').astype('int')
    data_frame['other'] = (classified_type == 'other').astype('int')

    df_train, df_test = _train_test_split(data_frame)
    df_train.to_csv(train_output_path)
    df_test.to_csv(test_output_path)
    with open(labels_output_path, 'x') as labels_file:
        labels_file.write('\n'.join(
            ['title', 'ingredient', 'instruction', 'other']))
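A sketch of a run with explicit paths; the DEFAULT_* constants are module-level settings not shown here, so placeholders are used, and the labels file is opened with mode 'x', meaning it must not already exist.

# Hypothetical paths.
generate_data(extraction_data="data/corrected",
              train_output_path="recipes_train.csv",
              test_output_path="recipes_test.csv",
              labels_output_path="recipe_labels.txt")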
Example #10
def temporal_train_test_split(
    y: ACCEPTED_Y_TYPES,
    X: Optional[pd.DataFrame] = None,
    test_size: Optional[Union[int, float]] = None,
    train_size: Optional[Union[int, float]] = None,
    fh: Optional[FORECASTING_HORIZON_TYPES] = None,
) -> SPLIT_TYPE:
    """Split arrays or matrices into sequential train and test subsets.

    Creates train/test splits over an endogenous array and optional exogenous
    arrays.

    This is a wrapper of scikit-learn's ``train_test_split`` that
    does not shuffle the data.

    Parameters
    ----------
    y : pd.Series
        Target series
    X : pd.DataFrame, optional (default=None)
        Exogenous data
    test_size : float, int or None, optional (default=None)
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        relative number of test samples. If None, the value is set to the
        complement of the train size. If ``train_size`` is also None, it will
        be set to 0.25.
    train_size : float, int, or None, (default=None)
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the train split. If
        int, represents the relative number of train samples. If None,
        the value is automatically set to the complement of the test size.
    fh : ForecastingHorizon, optional (default=None)
        Forecasting horizon used to define the split; cannot be combined with
        ``test_size`` or ``train_size``.

    Returns
    -------
    splitting : tuple, length=2 * len(arrays)
        List containing train-test split of `y` and `X` if given.

    References
    ----------
    .. [1] Adapted from https://github.com/alkaline-ml/pmdarima/
    """
    if fh is not None:
        if test_size is not None or train_size is not None:
            raise ValueError(
                "If `fh` is given, `test_size` and `train_size` cannot "
                "also be specified."
            )
        return _split_by_fh(y, fh, X=X)
    else:
        pd_format = isinstance(y, (pd.Series, pd.DataFrame))
        if pd_format and isinstance(y.index, pd.MultiIndex):
            ys = get_time_index(y)
            # Get index to group across (only indices other than timepoints index)
            yi_name = y.index.names
            yi_grp = yi_name[0:-1]

            # Get split into test and train data for timeindex only
            series = (ys,)
            yret = _train_test_split(
                *series,
                shuffle=False,
                stratify=None,
                test_size=test_size,
                train_size=train_size,
            )

            # Convert into list indices
            ysl = ys.to_list()
            yrl1 = yret[0].to_list()
            yrl2 = yret[1].to_list()
            p1 = [index for (index, item) in enumerate(ysl) if item in yrl1]
            p2 = [index for (index, item) in enumerate(ysl) if item in yrl2]

            # Subset by group based on identified indices
            y_train = y.groupby(yi_grp, as_index=False).nth(p1)
            y_test = y.groupby(yi_grp, as_index=False).nth(p2)
            if X is not None:
                X_train = X.groupby(yi_grp, as_index=False).nth(p1)
                X_test = X.groupby(yi_grp, as_index=False).nth(p2)
                return y_train, y_test, X_train, X_test
            else:
                return y_train, y_test
        else:
            series = (y,) if X is None else (y, X)
            return _train_test_split(
                *series,
                shuffle=False,
                stratify=None,
                test_size=test_size,
                train_size=train_size,
            )
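A sketch of the hierarchical branch, assuming a panel series whose MultiIndex ends with the time level and that the module's `get_time_index` helper returns that shared time index.

import pandas as pd

# Hypothetical panel: two instances, six monthly time points each.
idx = pd.MultiIndex.from_product(
    [["store_A", "store_B"], pd.period_range("2021-01", periods=6, freq="M")],
    names=["store", "month"],
)
y = pd.Series(range(12), index=idx)

# The last two time points of every instance go to the test set.
y_train, y_test = temporal_train_test_split(y, test_size=2)
print(len(y_train), len(y_test))  # 8 4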