Ejemplo n.º 1
0
    def _prepare_tables(self, df_1, df_2, feature_1=None, feature_2=None):
        """
        Restrict both tables to the columns that will be used for matching.

        Parameters
        ----------
        df_1 : pd.DataFrame
            First table to match.
        df_2 : pd.DataFrame
            Second table to match.
        feature_1 : str or list of str, by default None
            Column(s) of df_1 to match on. A single column name is treated
            as a one-element list. None selects every column of df_1.
        feature_2 : str or list of str, by default None
            Column(s) of df_2 to match on. A single column name is treated
            as a one-element list. None selects every column of df_2.

        Returns
        -------
        df_1 : pd.DataFrame
            df_1 with only those columns that will be used for matching.
        df_2 : pd.DataFrame
            df_2 with only those columns that will be used for matching.

        Raises
        ------
        SunpyUserWarning
            If number of features for df_1 is not equal to
            number of features for df_2.
        SunpyUserWarning
            If a key from feature_1 is not present in df_1.
        SunpyUserWarning
            If a key from feature_2 is not present in df_2.
        """
        if feature_1 is None:
            feature_1 = df_1.columns.values
        elif isinstance(feature_1, str):
            # A bare string would otherwise be measured by its character
            # count below and select a Series instead of a DataFrame.
            feature_1 = [feature_1]

        if feature_2 is None:
            feature_2 = df_2.columns.values
        elif isinstance(feature_2, str):
            feature_2 = [feature_2]

        if len(feature_1) != len(feature_2):
            raise SunpyUserWarning(
                "The number of columns to match the rows on must be the same.")

        try:
            df_1 = df_1[feature_1]
        except KeyError as err:
            # Chain the KeyError so the offending column name stays visible.
            raise SunpyUserWarning("The features specified for table 1 do not "
                                   "correspond to any columns in table 1.") from err
        try:
            df_2 = df_2[feature_2]
        except KeyError as err:
            raise SunpyUserWarning("The features specified for table 2 do not "
                                   "correspond to any columns in table 2.") from err

        return df_1, df_2
Ejemplo n.º 2
0
    def get_nearest_observation(self, obsdate: str):
        """
        Returns the observation time and date in the Timesfits that is
        closest to the given observation time and date.

        Parameters
        ----------
        obsdate : str
            The observation time and date. Anything accepted by
            `pandas.Timestamp` (including a Timestamp itself) works.

        Returns
        -------
        closest_observation : str
            Observation time and date in the Timesfits that is
            closest to the given observation time and date.

        Warns
        -----
        SunpyUserWarning
            If the returned date differs from ``str(obsdate)``.

        Examples
        --------
        >>> from pythia.seo import Sunspotter
        >>> sunspotter = Sunspotter()
        >>> obsdate = '2000-01-01 22:47:02'
        >>> sunspotter.get_nearest_observation(obsdate)
        '2000-01-01 12:47:02'
        """
        # Nearest-neighbour lookups need a unique, monotonic index, so the
        # unique dates are sorted before searching.
        unique_dates = self.timesfits.index.unique().sort_values()
        # `Index.get_loc(..., method='nearest')` was removed in pandas 2.0;
        # `get_indexer` is the supported way to do the same lookup.
        position = unique_dates.get_indexer([pd.Timestamp(obsdate)],
                                            method='nearest')[0]
        nearest_date = str(unique_dates[position])
        # casting to str because obsdate can be a pandas.Timestamp
        if nearest_date != str(obsdate):
            warnings.warn(
                SunpyUserWarning(
                    "The given observation date isn't in the Timesfits file.\n"
                    "Using the observation nearest to the given obsdate instead."
                ))
        return nearest_date
Ejemplo n.º 3
0
    def match_cosine(self, df_1, df_2):
        """
        Match rows of two dataframes by cosine similarity.

        Parameters
        ----------
        df_1: `pd.DataFrame`
            First DataFrame to match the rows from.
        df_2: `pd.DataFrame`
            Second DataFrame to match the rows from.

        Returns
        -------
        result: `numpy.ndarray`
            Array of size `(n,)` where n is the number of rows in df_1.
            For each row of df_1, the index of the row of df_2 with the
            highest cosine similarity.
        match_score: `numpy.ndarray`
            Array of size `(n,)` where n is the number of rows in df_1.
            The similarity value of each of those best matches.

        Raises
        ------
        SunpyUserWarning
            If scikit-learn cannot be imported.
        """
        # Imported inside the method, so a missing scikit-learn only
        # surfaces when cosine matching is actually requested.
        try:
            from sklearn.metrics.pairwise import cosine_similarity
        except ImportError:
            raise SunpyUserWarning(
                "Table Matcher requires Scikit Learn to be installed")

        # similarity[i, j] compares row i of df_1 with row j of df_2.
        similarity = cosine_similarity(X=df_1, Y=df_2)
        best_rows = np.argmax(similarity, axis=1)
        best_scores = np.max(similarity, axis=1)
        return best_rows, best_scores
Ejemplo n.º 4
0
    def prepare_data(self):
        """
        Loads and validates the data, then creates the split generators.

        ``self.data`` may be a CSV path (read with `pandas.read_csv`) or a
        DataFrame. When ``self.X_col`` is None, every column that is not a
        target column is used as a feature, in the DataFrame's column
        order. Two splitters are stored on the instance:
        ``self.train_test_splitter`` and ``self.train_val_splitter``.

        Raises
        ------
        TypeError
            If the data argument is invalid.
        ValueError
            If target column is not explicitly specified.
        ValueError
            If train test split is not a fraction between 0 and 1.
        ValueError
            If train val split is not a fraction between 0 and 1.

        # TODO : Add support for K fold cross validation. only 1 split supported as of now.
        """
        if isinstance(self.data, str):
            self.data = pd.read_csv(self.data)
        elif not isinstance(self.data, pd.DataFrame):
            raise TypeError(
                "Explicitely passed data must be a pandas Dataframe")

        if self.y_col is None:
            raise ValueError("target column must be explicitly specified")

        if self.X_col is None:
            warnings.warn(
                SunpyUserWarning(
                    "No Feature Columns specified. "
                    "Assuming all columns except target columns to be feature columns."
                ))
            # A single target name must be treated as one column, not as a
            # collection of characters.
            if isinstance(self.y_col, str):
                target_columns = {self.y_col}
            else:
                target_columns = set(self.y_col)
            # Keep a list (not a set) so the result preserves column order
            # and can be used directly as a DataFrame indexer.
            self.X_col = [
                column for column in self.data.columns
                if column not in target_columns
            ]

        for split_name, fraction in (("test", self.train_test_split),
                                     ("val", self.train_val_split)):
            if not isinstance(fraction, float) or not 0 < fraction < 1:
                raise ValueError(
                    f"train {split_name} split must be a fraction between 0 and 1")

        if self.is_regression is True and self.stratified_shuffle is True:
            warnings.warn(
                "Cannot use Stratified Shuffling with Regression tasks. Defaulting to Random Shuffling."
            )
            self.stratified_shuffle = False

        if self.stratified_shuffle is True:
            splitter = StratifiedShuffleSplit
        else:
            splitter = ShuffleSplit

        self.train_test_splitter = splitter(n_splits=self.num_splits,
                                            test_size=self.train_test_split)
        self.train_val_splitter = splitter(n_splits=self.num_splits,
                                           test_size=self.train_val_split)
Ejemplo n.º 5
0
def pytest_runtest_setup(item):
    """
    pytest hook that runs before each test item.

    Skips test functions carrying the 'remote_data' keyword when
    ``HAVE_REMOTEDATA`` is false, and refuses to start a test while pyplot
    still reports open figures.
    """
    wants_remote_data = (isinstance(item, pytest.Function)
                         and 'remote_data' in item.keywords)
    if wants_remote_data and not HAVE_REMOTEDATA:
        pytest.skip("skipping remotedata tests as pytest-remotedata is not installed")

    # The figure check below only applies when matplotlib is available.
    if not HAVE_MATPLOTLIB:
        return

    # Confirm that the pyplot figure stack is empty before the test
    if plt.get_fignums():
        raise SunpyUserWarning(f"There are stale pyplot figures prior to running {item.name}")
Ejemplo n.º 6
0
    def __init__(self, match_type='cosine'):
        """
        Store the row matching algorithm and reject unknown ones.

        Parameters
        ----------
        match_type : str, optional
            The row matching algorithm, either 'cosine' or 'euclidean',
            by default 'cosine'

        Raises
        ------
        SunpyUserWarning
            If unrecognized match type is passed.
        """
        self.match_type = match_type

        supported_match_types = ('cosine', 'euclidean')
        if self.match_type in supported_match_types:
            return

        raise SunpyUserWarning('Incorrect matching algorithm specified.')
Ejemplo n.º 7
0
    def __init__(self,
                 *,
                 data,
                 X_col,
                 y_col,
                 root_dir='data/all_clear/mdi/MDI/fits/',
                 transform=None,
                 is_fits=True,
                 is_tabular=False):
        """
        Store the dataset configuration and slice out features and labels.

        Parameters
        ----------
        data : pd.DataFrame
            The Dataframe with the FITS data information.
        X_col : list or str
            Data Columns
        y_col : list or str
            Label Column
        root_dir : str, optional
            Path to the FITS files, by default 'data/all_clear/mdi/MDI/fits/'
        transform : torchvision.transforms, optional
            Data transforms, by default None
        is_fits : bool, optional
            Is the input Data in FITS files.
        is_tabular : bool, optional
            Is the input Data in Tabular.

        Raises
        ------
        TypeError
            If y_col is neither a string nor a list.

        Notes
        -----
        When both `is_fits` and `is_tabular` are True a SunpyUserWarning is
        issued; both flags are still stored exactly as passed.
        """
        label_type_ok = isinstance(y_col, (str, list))
        if not label_type_ok:
            raise TypeError(
                "y_col must be a list or string denoting the label column")

        conflicting_flags = is_tabular is True and is_fits is True
        if conflicting_flags:
            conflict_message = (
                "`is_tabular` and `is_fits` flags both cannot be simultaneously True "
                "Using tabular data for analysis")
            warnings.warn(SunpyUserWarning(conflict_message))

        self.data, self.X_col, self.y_col = data, X_col, y_col
        self.root_dir, self.transform = root_dir, transform
        self.is_fits, self.is_tabular = is_fits, is_tabular

        # Feature and label views are taken from the stored DataFrame.
        self.X = self.data[self.X_col]
        self.y = self.data[self.y_col]
Ejemplo n.º 8
0
    def train_dataloader(self):
        """
        Returns the Training Dataloader.

        With weighted sampling enabled, every training row is drawn with a
        weight of ``1 / count(rows with the same label)``. Weights are
        looked up by the label value itself, so labels need not be the
        integers ``0..k-1``.

        Weighted sampling is switched off, with a warning, for regression
        tasks and when more than one label column is configured; the
        plain DataLoader is returned in those cases.

        Returns
        -------
        Dataloader : torch.DataLoader
            The Training Dataloader.
        """
        if self.is_regression is True and self.weighted_sampling is True:
            warnings.warn(
                "Cannot use Weighted Sampling with Regression tasks. Defaulting to Random Shuffling."
            )
            self.weighted_sampling = False

        has_multiple_labels = isinstance(self.y_col, list) and len(self.y_col) > 1
        if has_multiple_labels and self.weighted_sampling is True:
            # The message promises a fallback, so warn and fall back rather
            # than raising and making multi-label training impossible.
            warnings.warn(
                SunpyUserWarning(
                    "Weighted Sampling does not work with multiclass classification."
                    + " Defaulting to random sampling."))
            self.weighted_sampling = False

        if self.weighted_sampling is not True:
            return DataLoader(self.train_dataset, batch_size=self.batch_size)

        y_col = self.y_col[0] if isinstance(self.y_col, list) else self.y_col
        labels = np.asarray(self.train[y_col]).reshape(-1)

        # `inverse` maps each row to the position of its label in the
        # sorted unique labels, which is also its position in `counts`.
        _, inverse, class_counts = np.unique(labels,
                                             return_inverse=True,
                                             return_counts=True)
        sample_weights = torch.as_tensor(
            (1.0 / class_counts)[inverse.reshape(-1)], dtype=torch.double)

        sampler = torch.utils.data.sampler.WeightedRandomSampler(
            sample_weights, len(sample_weights))

        return DataLoader(self.train_dataset,
                          batch_size=self.batch_size,
                          sampler=sampler)
Ejemplo n.º 9
0
    def verify(self, match_score, threshold):
        """
        Warn about matches whose score falls on the wrong side of a threshold.

        For 'euclidean' matching a score above the threshold is flagged;
        for 'cosine' matching a score below the threshold is flagged. One
        SunpyUserWarning is issued per flagged index.

        Parameters
        ----------
        match_score: `numpy.ndarray`
            Array of size `(n,)` where n is the number of rows in df_1.
            Contains match score for corresponding best matches.
        threshold: `float`
            Score limit separating acceptable from suspicious matches.
        """
        is_suspicious = {
            'euclidean': lambda score, limit: bool(score > limit),
            'cosine': lambda score, limit: bool(score < limit),
        }

        for position, score in enumerate(match_score):
            # The rule is looked up per score, so an unknown match type only
            # fails when there is at least one score to check.
            if not is_suspicious[self.match_type](score, threshold):
                continue
            warnings.warn(SunpyUserWarning(f"\nMatch at Index {position} is likely to be incorrect\n"))
Ejemplo n.º 10
0
    def _get_data(self, delimiter: str):
        """
        Load the Timesfits, Properties and (optionally) Classifications CSVs.

        Each of ``self.timesfits``, ``self.properties`` and
        ``self.classifications`` is replaced by the DataFrame read from it.
        Timesfits is indexed by a parsed ``obs_date`` column and Properties
        by ``id_filename`` when those columns are present. Classifications
        is only read when ``self.classifications`` is not None, and
        ``self.classifications_columns`` is then converted to a set.

        Parameters
        ----------
        delimiter : str
            Field delimiter passed to `pandas.read_csv` for every file.

        Raises
        ------
        SunpyUserWarning
            If a file cannot be parsed with the requested columns, if a
            required column is missing from a file, or if
            classifications are requested without a column list.
        """

        def load_table(source, label, columns, read_all=False):
            # Read one CSV and confirm every required column is present.
            read_kwargs = {"delimiter": delimiter}
            if not read_all:
                read_kwargs["usecols"] = columns
            try:
                table = pd.read_csv(source, **read_kwargs)
            except ValueError as err:
                raise SunpyUserWarning(
                    "Sunspotter Object cannot be created."
                    f" Either the {label} columns do not match, or the file is corrupted"
                ) from err

            # Works for any iterable of names; sorted so the message is
            # deterministic.
            missing_columns = set(columns) - set(table.columns)
            if missing_columns:
                raise SunpyUserWarning(
                    "Sunspotter Object cannot be created."
                    f" The {label} CSV is missing the following columns: " +
                    ", ".join(sorted(map(str, missing_columns))))
            return table

        # Reading the Timesfits file
        self.timesfits = load_table(self.timesfits,
                                    "Timesfits",
                                    self.timesfits_columns,
                                    read_all=self.get_all_timesfits_columns)

        if 'obs_date' in self.timesfits.columns:
            self.timesfits["obs_date"] = pd.to_datetime(
                self.timesfits["obs_date"], format=self.datetime_fmt)
            self.timesfits.set_index("obs_date", inplace=True)

        # Reading the Properties file
        self.properties = load_table(self.properties,
                                     "Properties",
                                     self.properties_columns,
                                     read_all=self.get_all_properties_columns)

        if 'id_filename' in self.properties.columns:
            self.properties.set_index("id_filename", inplace=True)

        # Reading the Classification file
        if self.classifications is not None:

            if self.classifications_columns is None:
                raise SunpyUserWarning(
                    "Classifications columns cannot be None"
                    "  when classifications.csv is to be loaded.")

            # Classifications are always read with an explicit column list.
            self.classifications = load_table(self.classifications,
                                              "Classifications",
                                              self.classifications_columns)
            self.classifications_columns = set(self.classifications_columns)
Ejemplo n.º 11
0
    def __init__(
        self,
        score_board: pd.DataFrame,
        *,
        k_value=32,
        default_score=1400,
        max_comparisons=50,
        max_score_change=32,
        min_score_change=16,
        score_memory=10,
        delimiter=';',
        column_map={
            "player 0": "image_id_0",
            "player 1": "image_id_1",
            "score for player 0": "image0_more_complex_image1"
        }):
        """
        Store the rating configuration, validate the score board and build
        the initial ranking.

        Parameters
        ----------
        score_board : pandas.DataFrame
            DataFrame holding the scores of individual matches.
        k_value : int, optional
            Initial K Value to be used for calculating new ratings, by default 32
        default_score : int, optional
            Initial rating, by default 1400
        max_comparisons : int, optional
            Max comparisons for any player, by default 50
        max_score_change : int, optional
            Upper limit on K Value updation, by default 32
        min_score_change : int, optional
            Lower limit on K Value updation, by default 16
        score_memory : int, optional
            Number of previous scores to consider while calculating
            standard deviation and new K value, by default 10
        delimiter : str, optional
            Accepted for interface compatibility; not used by this
            constructor. By default ';'
        column_map : dict, optional
            Dictionary, for mapping the column names of the score_board dataframe
            to variable names used in the ELO ranking system.
            by default {"player 0": "image_id_0",
                                    "player 1": "image_id_1",
                                    "score for player 0": "image0_more_complex_image1"}
            A copy is stored, so the instance never shares the mapping
            with the caller or with the signature default.

        Raises
        ------
        SunpyUserWarning
            If a column named in `column_map` is missing from `score_board`.
        """
        self.score_board = score_board
        self.k_value = k_value
        self.default_score = default_score
        self.score_change = {'min': min_score_change, 'max': max_score_change}
        # Attribute name (including its spelling) is kept as-is because code
        # outside this block may read it.
        self.max_comparisions = max_comparisons
        self.score_memory = score_memory
        # Copy: the default dict in the signature is a single shared object,
        # and storing it directly would let one instance's edits leak into
        # every later instance.
        self.column_map = dict(column_map)

        missing_columns = set(self.column_map.values()) - set(
            self.score_board.columns)
        if missing_columns:
            missing_columns = ", ".join(sorted(map(str, missing_columns)))

            raise SunpyUserWarning(
                "The following columns mentioned in the column map"
                f" are not present in the score board: {missing_columns}")

        self._create_ranking()
Ejemplo n.º 12
0
    def setup(self, stage=None):
        """
        Dataset Generation function.

        Splits ``self.data`` into train/test, then splits the train part
        into train/val, and builds the datasets needed for the requested
        stage. If a splitter yields several splits, only the last one is
        kept.

        Parameters
        ----------
        stage : str, optional
            'fit' builds the train and validation datasets, 'test' builds
            the test dataset, None builds all three. By default None

        Notes
        -----
        ``self.train_size`` greater than 1 is treated as a row count;
        otherwise it is treated as the fraction of training rows to keep.
        """

        def build_dataset(frame, conf, phase):
            # Build a BaseDataset, warning when no configuration dict is given.
            if isinstance(conf, dict):
                return BaseDataset(data=frame,
                                   X_col=self.X_col,
                                   y_col=self.y_col,
                                   **conf)
            warnings.warn(
                SunpyUserWarning(
                    f"No {phase} configurations specified, using default configuration."
                ))
            return BaseDataset(data=frame, X_col=self.X_col, y_col=self.y_col)

        for train_index, test_index in self.train_test_splitter.split(
                X=self.data[self.X_col], y=self.data[self.y_col]):
            self.train = self.data.iloc[train_index]
            self.test = self.data.iloc[test_index]

        for train_index, val_index in self.train_val_splitter.split(
                X=self.train[self.X_col], y=self.train[self.y_col]):
            # Both slices are taken from the pre-split training frame.
            self.train, self.val = self.train.iloc[
                train_index], self.train.iloc[val_index]

        # Assign train/val datasets for use in dataloaders
        if stage == 'fit' or stage is None:

            if self.train_size > 1:
                # Slice by the requested row count. The previous code sliced
                # with the DataFrame itself (`self.train[:self.train]`).
                keep_rows = int(self.train_size)
            else:
                keep_rows = int(len(self.train) * self.train_size)
            self.train = self.train.iloc[:keep_rows]

            self.train_dataset = build_dataset(self.train, self.train_conf,
                                               "training")
            self.val_dataset = build_dataset(self.val, self.val_conf,
                                             "validation")

        # Assign test dataset for use in dataloader(s)
        if stage == 'test' or stage is None:
            self.test_dataset = build_dataset(self.test, self.test_conf,
                                              "testing")
Ejemplo n.º 13
0
    def __init__(self,
                 *,
                 data,
                 X_col,
                 y_col,
                 sequence_length,
                 root_dir=None,
                 transform=None,
                 is_tabular=True):
        """
        Store the sequence dataset configuration and trim the data to a
        whole number of sequences.

        Parameters
        ----------
        data : pd.DataFrame
            The Dataframe with the data information.
        X_col : list or str
            Feature Columns
        y_col : list or str
            Label Column
        sequence_length : int
            Length of the Sequence in the Time Series.
        root_dir : str, optional
            Path to the data files if any.
        transform : torchvision.transforms, optional
            Data transforms, by default None
        is_tabular : bool, optional
            Is the input Data in Tabular.

        Raises
        ------
        TypeError
            If y_col is neither a string nor a list.
        ValueError
            If feature and label columns overlap, if sequence_length is
            not positive, or if sequence_length is larger than the data
            or than half of the data.

        Notes
        -----
        Trailing rows that do not fill a complete sequence are dropped
        from ``self.data`` and reported through a SunpyUserWarning.
        """
        if not isinstance(y_col, (str, list)):
            raise TypeError(
                "y_col must be a string or list denoting the label column(s)")

        # Compare column names, not characters: a bare string is one column.
        feature_names = {X_col} if isinstance(X_col, str) else set(X_col)
        label_names = {y_col} if isinstance(y_col, str) else set(y_col)
        if not feature_names.isdisjoint(label_names):
            raise ValueError(
                "Feature Columns and Label columns must be dijoint")

        if sequence_length < 1:
            # Guards the modulo below against division by zero and against
            # negative lengths producing meaningless slices.
            raise ValueError("Length of sequence must be a positive integer.")

        self.data = data
        self.X_col = X_col
        self.y_col = y_col
        self.sequence_length = sequence_length
        self.root_dir = root_dir
        self.transform = transform
        self.is_tabular = is_tabular

        if len(data) < self.sequence_length:
            raise ValueError(
                "Length of dataset cannot be smaller than sequence length.")
        if self.sequence_length > len(data) // 2:
            raise ValueError(
                "Length of sequence cannot be greater half of length of data.")

        residual_data_indices = len(data) % self.sequence_length
        if residual_data_indices > 0:
            warning_message = "The following indices cannot be loaded as a sequence : "
            leftover_indices = ", ".join(
                str(index) for index in range(
                    len(data) - residual_data_indices, len(data)))
            warnings.warn(SunpyUserWarning(warning_message + leftover_indices))

            # Drop the trailing rows that do not fill a whole sequence.
            self.data = self.data.iloc[:-residual_data_indices]

        self.X = self.data[self.X_col]
        self.y = self.data[self.y_col]