import pandas as pd
from pandas import DataFrame


def main(df_anon: DataFrame, df_orig: DataFrame, footprint: DataFrame):
    df = df_anon.copy()
    df_original_reduced = df_orig.copy()
    df = df.drop(columns=['longitude', 'latitude'])
    df_original_reduced = df_original_reduced.drop(columns=['longitude', 'latitude'])

    # Convert date to year-week (Series.dt.week was removed in recent pandas;
    # dt.isocalendar().week is the replacement)
    df['date'] = (df['date'].dt.year.astype(str) + "-"
                  + df['date'].dt.isocalendar().week.astype(str))
    df_original_reduced['date'] = (
        df_original_reduced['date'].dt.year.astype(str) + "-"
        + df_original_reduced['date'].dt.isocalendar().week.astype(str))

    # Group by user and week number
    df = df.groupby(['id', 'date']).size().reset_index(name='count')
    df_original_reduced = df_original_reduced.groupby(
        ['id', 'date']).size().reset_index(name='count')

    # Join the two dataframes on the number of times an id is found each week
    df = pd.merge(df_original_reduced, df, on=['date', 'count'], how='left')
    df = df.drop(columns=['count'])
    df = df.groupby(['date', 'id_x'])['id_y'].apply(list)
    weeks = df.reset_index()['date'].unique().tolist()
    df = df.reset_index().set_index(['id_x', 'date']).unstack('date')
    df.columns = weeks

    # Compare the two footprints and create the score
    df = (df == footprint)
    df = df.astype(int)  # convert True/False to 1/0
    score = df.to_numpy().sum()
    # count of the non-NaN footprint cells
    values = footprint.fillna(0).astype('bool').to_numpy().sum()
    return score / values
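# Minimal usage sketch for main() above, with tiny synthetic frames. The
# column names are taken from the function body; the footprint layout
# (rows = original ids, columns = 'year-week' strings, cells = lists of
# candidate anonymized ids) is an assumption inferred from the code.
obs = pd.DataFrame({
    'id': ['u1', 'u1', 'u2'],
    'date': pd.to_datetime(['2021-01-04', '2021-01-05', '2021-01-04']),
    'longitude': [0.0, 0.1, 0.2],
    'latitude': [0.0, 0.1, 0.2],
})
anon = obs.assign(id=obs['id'].map({'u1': 'a', 'u2': 'b'}))
footprint = pd.DataFrame({'2021-1': [['a'], ['b']]}, index=['u1', 'u2'])
print(main(anon, obs, footprint))  # -> 1.0, a perfect footprint match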
def transform_dataset_offset(
    dataset: npt.NDArray,
    dataframe: DataFrame,
    dataframe_stopcodons: DataFrame,
    sequence_raw: str,
    start_position: int,
    position_offset: int,
    stopcodons: bool,
) -> DataFrame:
    """
    Generate a dataframe with the sequence position_offset.
    """
    # Add position_offset sequence
    offset_sequence = _offset_sequence(dataset, sequence_raw, start_position,
                                       position_offset)
    df_output = dataframe_stopcodons.copy() if stopcodons is True else dataframe.copy()

    # Copy old sequence
    df_output['Sequence_old'] = df_output['Sequence']

    # Count amino acids
    aa_number = len(set(df_output['Aminoacid']))

    # Generate new position_offset sequence
    df_output['Sequence'] = np.ravel([[aa] * aa_number for aa in offset_sequence])

    # Drop rows with X
    df_output.drop(df_output.index[df_output['Sequence'] == 'X'], inplace=True)
    return df_output
def test_bool_uint(self):
    # np.bool is deprecated; the builtin bool is the equivalent dtype
    s0 = Series([0, 1, True], dtype=bool)
    s1 = Series([0, 1, 100], dtype=np.uint8)
    s2 = Series([0, 1, 255], dtype=np.uint8)
    s3 = Series([0, 1, 2**15 - 100], dtype=np.uint16)
    s4 = Series([0, 1, 2**16 - 1], dtype=np.uint16)
    s5 = Series([0, 1, 2**31 - 100], dtype=np.uint32)
    s6 = Series([0, 1, 2**32 - 1], dtype=np.uint32)
    original = DataFrame({'s0': s0, 's1': s1, 's2': s2, 's3': s3,
                          's4': s4, 's5': s5, 's6': s6})
    original.index.name = 'index'
    expected = original.copy()
    expected_types = (np.int8, np.int8, np.int16, np.int16,
                      np.int32, np.int32, np.float64)
    for c, t in zip(expected.columns, expected_types):
        expected[c] = expected[c].astype(t)
    with tm.ensure_clean() as path:
        original.to_stata(path)
        written_and_read_again = self.read_dta(path)
        written_and_read_again = written_and_read_again.set_index('index')
        tm.assert_frame_equal(written_and_read_again, expected)
def select_snv(df_input: DataFrame) -> DataFrame:
    """
    Select for SNV variants in DSM dataset.

    Parameters
    -----------
    df_input : pandas dataframe containing DSM data

    Returns
    --------
    Modified dataframe ("Variant", "Score") where "SNV?" == True. Returns copy.
    """
    # Use add_snv_boolean function
    df_input = add_snv_boolean(df_input.copy())

    # Select SNV? == True only
    df_input = df_input[df_input["SNV?"] == True].copy()  # pylint: disable=singleton-comparison

    # Select columns of interest
    df_input = df_input[["Position", "Variant", "Score", "Score_NaN"]].copy()

    # Reset index
    df_input.reset_index(drop=True, inplace=True)
    return df_input
def condense_heatmap(df_input: DataFrame, new_order: List[str]) -> DataFrame:
    """
    Converts the np.array with stored enrichment scores into the condensed heatmap.
    """
    df_input = df_input.copy()
    df_input.drop(['Position'], axis=1, inplace=True)

    # Group by sequence and aminoacid, and then pivot table
    df_grouped = df_input.groupby(['Sequence', 'Aminoacid'], sort=False).mean()
    df_pivoted = df_grouped.pivot_table(values='Score',
                                        index='Aminoacid',
                                        columns='Sequence')
    df_pivoted.reset_index(drop=False, inplace=True)

    # Sort in y axis desired order
    df_pivoted['Aminoacid'] = Categorical(df_pivoted['Aminoacid'], new_order)
    df_pivoted = df_pivoted.sort_values(by=['Aminoacid'])

    # Sort in x axis desired order
    x_order = return_common_elements(new_order, list(df_pivoted.columns))

    # Drop amino acid column
    data_dropped = df_pivoted.drop(['Aminoacid'], axis=1)
    return data_dropped[x_order]
def calculate_correlation_by_residue(df_input: DataFrame) -> DataFrame:
    """
    Calculate correlation by position.
    """
    df_output = df_input.copy()
    df_output = df_output.pivot_table(values='Score', index='Position',
                                      columns='Aminoacid')
    return df_output.T.corr()
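# Input-shape sketch for calculate_correlation_by_residue: a long-format
# frame with 'Position', 'Aminoacid' and 'Score' columns (synthetic values).
import pandas as pd

scores = pd.DataFrame({
    'Position': [1, 1, 2, 2, 3, 3],
    'Aminoacid': ['A', 'C', 'A', 'C', 'A', 'C'],
    'Score': [0.1, -0.3, 0.2, -0.1, 0.0, 0.4],
})
corr = calculate_correlation_by_residue(scores)  # 3x3 position-by-position matrix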
def _group_codons_to_aa(self, df_input: DataFrame) -> DataFrame:
    """
    Group different codons that are synonymous. Returns sum of counts.
    """
    df_input = df_input.copy()
    df_input['Aminoacid'] = self.aminoacids
    # Group by amino acid and sum the counts
    return df_input.groupby(as_index=True, by='Aminoacid', sort=False).sum()
def _polishdf(df: DataFrame) -> DataFrame:
    df_mean = df.mean().to_frame()
    df_mean.reset_index(drop=False, inplace=True)
    df_mean.rename(columns={0: 'R2'}, inplace=True)
    df_mean['Combinations'] = list(
        df_mean['index'].apply(lambda x: ''.join(x)))  # pylint: disable=unnecessary-lambda
    df_mean.drop(columns=['index'], inplace=True)
    return df_mean
def _generate_codes(df: DataFrame, cat_cols: List) -> dict:
    tmp = df.copy()
    for col in cat_cols:
        tmp[col] = tmp[col].astype("category").cat.as_ordered()
    # list of categories for each column (always a column for None)
    codes = {col: list(tmp[col].cat.categories) for col in cat_cols}
    return codes
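# Usage sketch for _generate_codes (hypothetical columns):
import pandas as pd

df = pd.DataFrame({'color': ['red', 'blue', 'red'], 'size': ['S', 'L', 'M']})
codes = _generate_codes(df, ['color', 'size'])
# -> {'color': ['blue', 'red'], 'size': ['L', 'M', 'S']} (sorted categories)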
def _calculate_secondary(df_input: DataFrame, secondary: list) -> DataFrame:
    """
    Returns copy.
    """
    df_output: DataFrame = df_input.copy()
    df_output.insert(4, 'Secondary', secondary)
    df_output = df_output.groupby(['Secondary'], as_index=False, sort=False).mean()
    df_output = df_output[df_output['Secondary'].str.startswith(('β', 'α'))]
    return df_output.drop(['Position'], axis=1)
def top_recommended_movies_for_user(userId: int, df: DataFrame, svd: SVD,
                                    links: DataFrame):
    movies = df.copy()
    # Map TMDB ids to MovieLens ids via the links table
    tmdb_to_movielens = links.set_index('tmdbId')['movieId']
    movies['id'] = movies['id'].map(tmdb_to_movielens)
    # Estimate a rating for every movie and rank by it
    movies['est'] = movies['id'].apply(lambda movie_id: svd.predict(userId, movie_id).est)
    movies = movies.sort_values('est', ascending=False)
    return movies
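# Usage sketch for top_recommended_movies_for_user. A real model would be a
# fitted surprise.SVD; the stub below only mimics the predict(uid, iid).est
# interface the function relies on, and links/candidates are hypothetical.
import pandas as pd

class StubSVD:
    def predict(self, uid, iid):
        class Prediction:
            est = 3.5
        return Prediction()

links = pd.DataFrame({'movieId': [1, 2], 'tmdbId': [862, 8844]})
candidates = pd.DataFrame({'id': [862, 8844], 'title': ['Toy Story', 'Jumanji']})
ranked = top_recommended_movies_for_user(1, candidates, StubSVD(), links)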
def _grou_by_secondary(df: DataFrame, secondary: List[str]) -> DataFrame:
    """
    Group each secondary motif and take the mean. Returns a copy.
    """
    df = df.copy()
    df.insert(4, 'Secondary', secondary)
    df = df.groupby(['Secondary', 'Aminoacid'], as_index=False).mean()
    df = df.loc[df['Secondary'].str.startswith(('β', 'α'))]
    return df
def test_dates_invalid_column(self):
    original = DataFrame([datetime(2006, 11, 19, 23, 13, 20)])
    original.index.name = "index"
    with tm.ensure_clean() as path:
        with tm.assert_produces_warning(InvalidColumnName):
            original.to_stata(path, {0: "tc"})
        written_and_read_again = self.read_dta(path)
        modified = original.copy()
        modified.columns = ["_0"]
        tm.assert_frame_equal(written_and_read_again.set_index("index"), modified)
def find_trendline(df_data: DataFrame, y_key: str, high_low: str = "high") -> DataFrame:
    """Attempts to find a trend line based on the y_key column of a given
    stock ticker data frame.

    Parameters
    ----------
    df_data : DataFrame
        The stock ticker data frame with at least date_id, y_key columns.
    y_key : str
        Column name to base the trend line on.
    high_low: str, optional
        Either "high" or "low". High is the default.

    Returns
    -------
    DataFrame
        If a trend is successfully found, an updated pandas data frame with
        a {y_key}_trend column. If no trend was found, the original data frame.
    """
    for iteration in [3, 4, 5, 6, 7]:
        df_temp = df_data.copy()
        while len(df_temp) > iteration:
            reg = linregress(
                x=df_temp["date_id"],
                y=df_temp[y_key],
            )
            if high_low == "high":
                df_temp = df_temp.loc[
                    df_temp[y_key] > reg[0] * df_temp["date_id"] + reg[1]
                ]
            else:
                df_temp = df_temp.loc[
                    df_temp[y_key] < reg[0] * df_temp["date_id"] + reg[1]
                ]
        if len(df_temp) > 1:
            break

    if len(df_temp) == 1:
        return df_data

    reg = linregress(
        x=df_temp["date_id"],
        y=df_temp[y_key],
    )
    df_data[f"{y_key}_trend"] = reg[0] * df_data["date_id"] + reg[1]
    return df_data
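# Usage sketch for find_trendline, assuming linregress is
# scipy.stats.linregress and using a synthetic price series:
import pandas as pd
from scipy.stats import linregress

prices = pd.DataFrame({
    'date_id': range(8),
    'High': [10.0, 11.5, 11.0, 12.5, 12.0, 13.5, 13.0, 14.5],
})
out = find_trendline(prices, 'High', 'high')
print(out.get('High_trend', 'no trend found'))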
def calculate_correlation(df: DataFrame, order_aminoacids: List[str]) -> DataFrame:
    """
    Calculate correlation by variant.
    """
    dataset: DataFrame = df.copy()
    dataset = dataset.pivot_table(values='Score', index='Position',
                                  columns='Aminoacid')
    dataset = dataset.corr()
    dataset = dataset.reindex(index=order_aminoacids)[order_aminoacids]
    return dataset
def test_dates_invalid_column(self):
    original = DataFrame([datetime(2006, 11, 19, 23, 13, 20)])
    original.index.name = 'index'
    with tm.ensure_clean() as path:
        with tm.assert_produces_warning(InvalidColumnName):
            original.to_stata(path, {0: 'tc'})
        written_and_read_again = self.read_dta(path)
        modified = original.copy()
        modified.columns = ['_0']
        tm.assert_frame_equal(written_and_read_again.set_index('index'), modified)
def update_a_df_column(self, df_to_update: DataFrame, df_as_source: DataFrame,
                       unique_col: str, col_to_update: str):
    '''
    Updates a DataFrame column from a source DataFrame, matching records on
    their common unique column.

    parameters:
        df_to_update: dataframe, main df to be updated
        df_as_source: dataframe, source df used to update the main df
        unique_col: str, common column (same name in both frames) used to
            match records; this column must hold unique values
        col_to_update: str, which column's values to update

    returns:
        a copy of the updated DataFrame

    warning:
        the index is reset during the update
    '''
    # copy dfs
    df = df_to_update.copy()
    source = df_as_source.copy()

    # reset index; WARNING: drops the existing index
    df.reset_index(inplace=True, drop=True)
    source.reset_index(inplace=True, drop=True)

    # set unique_col as index
    df.set_index(unique_col, inplace=True)
    source.set_index(unique_col, inplace=True)

    # update on series
    df[col_to_update].update(source[col_to_update])

    # reset index; puts unique_col back as the first column
    df.reset_index(inplace=True, drop=False)
    source.reset_index(inplace=True, drop=False)

    return df
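# Usage sketch for update_a_df_column (`helper` stands in for whatever
# instance of this utility class is at hand; the data is hypothetical):
import pandas as pd

main_df = pd.DataFrame({'sku': ['A1', 'B2'], 'price': [10.0, 20.0]})
new_prices = pd.DataFrame({'sku': ['B2'], 'price': [25.0]})
updated = helper.update_a_df_column(main_df, new_prices, 'sku', 'price')
# updated['price'] -> [10.0, 25.0]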
def group_by_aa(df_input: DataFrame, aminoacids: List[str]) -> DataFrame:
    """
    Group different codons that are synonymous.
    """
    # copy df
    df_output = df_input.copy()

    # Set up amino acid column
    df_output['Aminoacid'] = aminoacids

    # Group by amino acid and take the mean
    df_output = df_output.groupby(as_index=True, by='Aminoacid', sort=False).mean()
    return df_output
def test_dates_invalid_column(self):
    original = DataFrame([datetime(2006, 11, 19, 23, 13, 20)])
    original.index.name = 'index'
    with tm.ensure_clean() as path:
        # assert_produces_warning is a context manager and already checks
        # that the expected warning was raised exactly once
        with tm.assert_produces_warning(InvalidColumnName):
            original.to_stata(path, {0: 'tc'})
        written_and_read_again = self.read_dta(path)
        modified = original.copy()
        modified.columns = ['_0']
        tm.assert_frame_equal(written_and_read_again.set_index('index'), modified)
def add_a_col_from_a_df(self, into_df: DataFrame, from_df: DataFrame,
                        unique_col: str, col_to_add: str):
    """
    Add a column into a dataframe from another dataframe.

    parameters:
        into_df: dataframe, main df, which will be updated with a new column
        from_df: dataframe, source df, which has the column to add to the main df
        unique_col: str, column name that is common to both dataframes
        col_to_add: str, column to be added from the source dataframe

    returns:
        * main dataframe filled with the new column and values,
          where the unique column matches

    warning:
        this method assumes no index
    """
    main = into_df.copy()
    source = from_df.copy()
    return main.merge(source[[unique_col, col_to_add]], on=unique_col, how="left")
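# Usage sketch for add_a_col_from_a_df (same hypothetical `helper` instance):
import pandas as pd

people = pd.DataFrame({'email': ['a@x.com', 'b@x.com'], 'name': ['Ann', 'Bob']})
ages = pd.DataFrame({'email': ['b@x.com', 'a@x.com'], 'age': [30, 25]})
with_age = helper.add_a_col_from_a_df(people, ages, 'email', 'age')
# with_age['age'] -> [25, 30]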
def feature_values_using_filter_and_indexes(move_data: DataFrame,
                                            id_: Union[int, Text],
                                            feature_name: Text,
                                            filter_: List,
                                            idxs: List,
                                            values: Any,
                                            inplace: Optional[bool] = True):
    """
    Create or update move and stop by radius.

    Parameters
    ----------
    move_data : dataframe
        The input trajectories data.
    id_ : str
        Indicates the index to be changed.
    feature_name : str
        The name of the column that the user wants to change values for.
    filter_ : array
        Indicates the rows with the index "id_" of the "feature_name"
        that must be changed.
    idxs : array like of indexes
        Indexes to attribute value
    values : any
        The new values to be set to the selected feature.
    inplace : bool, optional
        If set to True the original dataframe will be altered; otherwise
        the alteration will be made in a copy, that will be returned,
        by default True

    Returns
    -------
    DataFrame
        A copy of the original dataframe or None
    """
    if not inplace:
        move_data = move_data.copy()

    values_feature = move_data.at[id_, feature_name]
    values_feature_filter = values_feature.iloc[filter_]
    values_feature_filter.iloc[idxs] = values
    values_feature.iloc[filter_] = values_feature_filter
    move_data.at[id_, feature_name] = values_feature

    if not inplace:
        return move_data
    else:
        return None
def feature_values_using_filter(
    move_data: DataFrame,
    id_: Union[Text, int],
    feature_name: Text,
    filter_: List,
    values: Any,
    inplace: Optional[bool] = True
) -> Optional[DataFrame]:
    """
    Changes the values of the feature defined by the user.

    Parameters
    ----------
    move_data : DataFrame
        The input trajectories data.
    id_ : str
        Indicates the index to be changed.
    feature_name : str
        The name of the column that the user wants to change values for.
    filter_ : list or array
        Indicates the rows with the index "id_" of the "feature_name"
        that must be changed.
    values : any
        The new values to be set to the selected feature.
    inplace : boolean, optional(True by default)
        If set to True the original dataframe will be altered; otherwise
        the alteration will be made in a copy, that will be returned.

    Returns
    -------
    DataFrame
        A copy of the original dataframe or None
    """
    if not inplace:
        move_data = move_data.copy()

    values_feature = move_data.at[id_, feature_name]
    if filter_.shape == ():
        move_data.at[id_, feature_name] = values
    else:
        values_feature.iloc[filter_] = values
        move_data.at[id_, feature_name] = values_feature

    if not inplace:
        return move_data
    else:
        return None
def test_large_value_conversion(self):
    s0 = Series([1, 99], dtype=np.int8)
    s1 = Series([1, 127], dtype=np.int8)
    s2 = Series([1, 2 ** 15 - 1], dtype=np.int16)
    s3 = Series([1, 2 ** 63 - 1], dtype=np.int64)
    original = DataFrame({"s0": s0, "s1": s1, "s2": s2, "s3": s3})
    original.index.name = "index"
    with tm.ensure_clean() as path:
        with tm.assert_produces_warning(PossiblePrecisionLoss):
            original.to_stata(path)
        written_and_read_again = self.read_dta(path)
        modified = original.copy()
        modified["s1"] = Series(modified["s1"], dtype=np.int16)
        modified["s2"] = Series(modified["s2"], dtype=np.int32)
        modified["s3"] = Series(modified["s3"], dtype=np.float64)
        tm.assert_frame_equal(written_and_read_again.set_index("index"), modified)
def __runPortfolioDesc(self, portfolioDescs: List[APortfolioDescription],
                       portFolioModels: List[DataFrame],
                       evaluatonTools: List[AEvalTool],
                       histories: List[AHistory],
                       trainRatingsDF: DataFrame,
                       testRatingsDF: DataFrame):
    portfolios: List[APortfolio] = []

    portfolioDescI: APortfolioDescription
    historyI: AHistory
    for portfolioDescI, historyI in zip(portfolioDescs, histories):
        print("Training mode: " + str(portfolioDescI.getPortfolioID()))

        # train portfolio model
        portfolioI: APortfolio = portfolioDescI.exportPortfolio(self._batchID, historyI)
        portfolioI.train(historyI, trainRatingsDF.copy(),
                         self._usersDF.copy(), self._itemsDF.copy())
        portfolios.append(portfolioI)

    return self.__iterateOverDataset(portfolios, portfolioDescs, portFolioModels,
                                     evaluatonTools, histories, testRatingsDF)
def color_3d_scatter(df_input: DataFrame, mode: str, lof: float, gof: float) -> DataFrame:
    """
    Color the data points by enrichment scores.

    Parameters
    -----------
    df_input : pandas dataframe
        The input is a dataframe that has columns
        ['Position', 'Aminoacid', 'Score'].

    mode : str
        Specify what enrichment scores to use. If mode = 'mean', it will
        use the mean of each position to classify the residues. If
        mode = 'A', it will use the Alanine substitution profile. Can be
        used for each amino acid. Use the one-letter code and upper case.

    lof : float, default is -1
        Cutoff for determining loss-of-function mutations based on
        mutagenesis data.

    gof : float, default is 1
        Cutoff for determining gain-of-function mutations based on
        mutagenesis data.

    Returns
    --------
    df_grouped : pandas dataframe
        New dataframe with an added ['Color'] column and the ['Score']
        values of the mode you chose.
    """
    # Copy df
    df_grouped: DataFrame = df_input.copy()

    # Select grouping
    if mode.lower() == 'mean':
        df_grouped = df_grouped.groupby(['Position'], as_index=False).mean()
    else:
        df_grouped = df_grouped.loc[df_grouped['Aminoacid'] == mode]

    # Select colors based on Score values
    df_grouped['Color'] = 'green'
    df_grouped.loc[df_grouped['Score'] < lof, 'Color'] = 'blue'
    df_grouped.loc[df_grouped['Score'] > gof, 'Color'] = 'red'

    return df_grouped
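# Usage sketch for color_3d_scatter with a single-residue profile
# (synthetic scores; the cutoffs match the docstring defaults):
import pandas as pd

df_scores = pd.DataFrame({
    'Position': [1, 1, 2, 2],
    'Aminoacid': ['A', 'C', 'A', 'C'],
    'Score': [1.6, 1.2, -1.5, -1.3],
})
colored = color_3d_scatter(df_scores, mode='A', lof=-1.0, gof=1.0)
# colored['Color'] -> ['red', 'blue'] (1.6 > gof, -1.5 < lof)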
def cleanData(data: DataFrame) -> DataFrame:
    # copy and drop rows with NA
    cleanedData: DataFrame = data.copy().dropna()

    # Remove whitespace from the column NAMES
    cleanedData = cleanedData.rename(columns=lambda x: x.strip(), inplace=False)

    # Remove whitespace from the column VALUES, column by column.
    # NOTE: an applymap(lambda x: x.strip() if type(x) == str else x) would
    # also work, but it ruins the dataframe printing capability (the frame
    # no longer shows as nicely as before; it looks like a messy string
    # with \n values).
    for var in cleanedData.columns:
        # .str only works on string-like columns; skip numeric ones
        if cleanedData[var].dtype == object:
            cleanedData[var] = cleanedData[var].str.strip()
    return cleanedData
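# Usage sketch for cleanData (synthetic frame with padded headers and values):
import pandas as pd

raw = pd.DataFrame({' name ': ['  Ann ', ' Bob'], 'score': [1.0, 2.0]})
clean = cleanData(raw)
# clean.columns -> ['name', 'score']; clean['name'] -> ['Ann', 'Bob']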
def test_large_value_conversion(self):
    s0 = Series([1, 99], dtype=np.int8)
    s1 = Series([1, 127], dtype=np.int8)
    s2 = Series([1, 2**15 - 1], dtype=np.int16)
    s3 = Series([1, 2**63 - 1], dtype=np.int64)
    original = DataFrame({'s0': s0, 's1': s1, 's2': s2, 's3': s3})
    original.index.name = 'index'
    with tm.ensure_clean() as path:
        with tm.assert_produces_warning(PossiblePrecisionLoss):
            original.to_stata(path)
        written_and_read_again = self.read_dta(path)
        modified = original.copy()
        modified['s1'] = Series(modified['s1'], dtype=np.int16)
        modified['s2'] = Series(modified['s2'], dtype=np.int32)
        modified['s3'] = Series(modified['s3'], dtype=np.float64)
        tm.assert_frame_equal(written_and_read_again.set_index('index'), modified)
def test_large_value_conversion(self):
    s0 = Series([1, 99], dtype=np.int8)
    s1 = Series([1, 127], dtype=np.int8)
    s2 = Series([1, 2 ** 15 - 1], dtype=np.int16)
    s3 = Series([1, 2 ** 63 - 1], dtype=np.int64)
    original = DataFrame({'s0': s0, 's1': s1, 's2': s2, 's3': s3})
    original.index.name = 'index'
    with tm.ensure_clean() as path:
        # assert_produces_warning is a context manager; it checks that the
        # single expected PossiblePrecisionLoss warning is produced
        with tm.assert_produces_warning(PossiblePrecisionLoss):
            original.to_stata(path)
        written_and_read_again = self.read_dta(path)
        modified = original.copy()
        modified['s1'] = Series(modified['s1'], dtype=np.int16)
        modified['s2'] = Series(modified['s2'], dtype=np.int32)
        modified['s3'] = Series(modified['s3'], dtype=np.float64)
        tm.assert_frame_equal(written_and_read_again.set_index('index'), modified)
def test_bool_uint(self):
    # np.bool is deprecated; the builtin bool is the equivalent dtype
    s0 = Series([0, 1, True], dtype=bool)
    s1 = Series([0, 1, 100], dtype=np.uint8)
    s2 = Series([0, 1, 255], dtype=np.uint8)
    s3 = Series([0, 1, 2 ** 15 - 100], dtype=np.uint16)
    s4 = Series([0, 1, 2 ** 16 - 1], dtype=np.uint16)
    s5 = Series([0, 1, 2 ** 31 - 100], dtype=np.uint32)
    s6 = Series([0, 1, 2 ** 32 - 1], dtype=np.uint32)
    original = DataFrame({"s0": s0, "s1": s1, "s2": s2, "s3": s3,
                          "s4": s4, "s5": s5, "s6": s6})
    original.index.name = "index"
    expected = original.copy()
    expected_types = (np.int8, np.int8, np.int16, np.int16,
                      np.int32, np.int32, np.float64)
    for c, t in zip(expected.columns, expected_types):
        expected[c] = expected[c].astype(t)
    with tm.ensure_clean() as path:
        original.to_stata(path)
        written_and_read_again = self.read_dta(path)
        written_and_read_again = written_and_read_again.set_index("index")
        tm.assert_frame_equal(written_and_read_again, expected)
def __init__(self, ratingsDF: DataFrame):
    if type(ratingsDF) is not DataFrame:
        raise ValueError("Argument ratingsDF isn't type DataFrame.")

    ratingsCopyDF: DataFrame = ratingsDF.copy()
    ratingsCopyDF['index1'] = ratingsCopyDF.index

    userIds: List[int] = list(set(
        rowI[Ratings.COL_USERID] for indexDFI, rowI in ratingsCopyDF.iterrows()))

    # dictionary (key = userID, value = dict of itemID -> Item);
    # each Item stores an (itemID, indexOfDataFrame) pair
    self._dictionaryOfUserIDs: dict = {}

    userIdI: int
    for userIdI in userIds:
        # select the ratings of userIdI
        ratingsUserIDF: DataFrame = ratingsCopyDF.loc[
            ratingsCopyDF[Ratings.COL_USERID] == userIdI]

        userDictI: dict = {}
        lastItemI: Item = None

        rowI: Series
        for i, rowI in ratingsUserIDF.iterrows():
            indexDFI: int = rowI['index1']
            userIdI: int = rowI[Ratings.COL_USERID]
            itemIdI: int = rowI[Ratings.COL_MOVIEID]

            itemI: Item = Item(userIdI, indexDFI, None)
            if lastItemI is not None:
                lastItemI.setNext(itemI)
            lastItemI = itemI

            userDictI[itemIdI] = itemI

        self._dictionaryOfUserIDs[userIdI] = userDictI
def _aa_to_codons_df(df_input: DataFrame, namecolumn: str) -> DataFrame:
    """
    Inputs a dataframe with a column of amino acids and returns all the
    synonymous codons for each amino acid. Uses dict_codon_to_aa() and
    _aa_to_codons().

    Parameters
    -----------
    df_input : pandas dataframe
    namecolumn : str
        Name of the column containing the amino acids.

    Returns
    --------
    Dataframe with a column containing all the codons that code for that
    amino acid. Returns copy.
    """
    # Copy df_input
    df_input = df_input.copy()

    # Calculate each possible codon for every amino acid
    df_input["Codons_" + namecolumn] = df_input.apply(
        lambda x: _aa_to_codons(x[namecolumn]), axis=1)

    return df_input
def melt(
    frame: DataFrame,
    id_vars=None,
    value_vars=None,
    var_name=None,
    value_name="value",
    col_level=None,
) -> DataFrame:
    # TODO: what about the existing index?
    # If multiindex, gather names of columns on all levels for checking
    # presence of `id_vars` and `value_vars`
    if isinstance(frame.columns, ABCMultiIndex):
        cols = [x for c in frame.columns for x in c]
    else:
        cols = list(frame.columns)

    if id_vars is not None:
        if not is_list_like(id_vars):
            id_vars = [id_vars]
        elif isinstance(frame.columns, ABCMultiIndex) and not isinstance(id_vars, list):
            raise ValueError(
                "id_vars must be a list of tuples when columns are a MultiIndex"
            )
        else:
            # Check that `id_vars` are in frame
            id_vars = list(id_vars)
            missing = Index(com.flatten(id_vars)).difference(cols)
            if not missing.empty:
                raise KeyError(
                    "The following 'id_vars' are not present"
                    " in the DataFrame: {missing}"
                    "".format(missing=list(missing))
                )
    else:
        id_vars = []

    if value_vars is not None:
        if not is_list_like(value_vars):
            value_vars = [value_vars]
        elif isinstance(frame.columns, ABCMultiIndex) and not isinstance(
            value_vars, list
        ):
            raise ValueError(
                "value_vars must be a list of tuples when columns are a MultiIndex"
            )
        else:
            value_vars = list(value_vars)
            # Check that `value_vars` are in frame
            missing = Index(com.flatten(value_vars)).difference(cols)
            if not missing.empty:
                raise KeyError(
                    "The following 'value_vars' are not present in"
                    " the DataFrame: {missing}"
                    "".format(missing=list(missing))
                )
        frame = frame.loc[:, id_vars + value_vars]
    else:
        frame = frame.copy()

    if col_level is not None:  # allow list or other?
        # frame is a copy
        frame.columns = frame.columns.get_level_values(col_level)

    if var_name is None:
        if isinstance(frame.columns, ABCMultiIndex):
            if len(frame.columns.names) == len(set(frame.columns.names)):
                var_name = frame.columns.names
            else:
                var_name = [
                    "variable_{i}".format(i=i)
                    for i in range(len(frame.columns.names))
                ]
        else:
            var_name = [
                frame.columns.name if frame.columns.name is not None else "variable"
            ]
    if isinstance(var_name, str):
        var_name = [var_name]

    N, K = frame.shape
    K -= len(id_vars)

    mdata = {}
    for col in id_vars:
        id_data = frame.pop(col)
        if is_extension_array_dtype(id_data):
            id_data = concat([id_data] * K, ignore_index=True)
        else:
            id_data = np.tile(id_data.values, K)
        mdata[col] = id_data

    mcolumns = id_vars + var_name + [value_name]

    mdata[value_name] = frame.values.ravel("F")
    for i, col in enumerate(var_name):
        # asanyarray will keep the columns as an Index
        mdata[col] = np.asanyarray(frame.columns._get_level_values(i)).repeat(N)

    return frame._constructor(mdata, columns=mcolumns)
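# Behavior sketch of melt; this mirrors the documented pandas.melt output:
import pandas as pd

df = pd.DataFrame({'name': ['a', 'b'], 'x': [1, 2], 'y': [3, 4]})
long = pd.melt(df, id_vars='name', value_vars=['x', 'y'])
#   name variable  value
# 0    a        x      1
# 1    b        x      2
# 2    a        y      3
# 3    b        y      4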
def instance_crossover_augmentation(
    data: DataFrame,
    restriction: str = 'destination only',
    label_local: Text = LOCAL_LABEL,
    frac: float = 0.5,
) -> DataFrame:
    """
    Generates new data from unobserved trajectories, with a specific restriction.

    By default, the algorithm uses the same destination constraint as the
    route and inserts the points on the original dataframe.

    Parameters
    ----------
    data : DataFrame
        The input trajectories data
    restriction : str, optional
        Constraint used to generate new data, by default 'destination only'
    label_local : str, optional
        Label of the points sequences, by default LOCAL_LABEL
    frac : float, optional
        Represents the percentage to be exchanged, by default 0.5

    Example
    -------
    >>> from pymove.utils.data_augmentation import instance_crossover_augmentation
    >>>
    >>> df
                 id          local_label
    0     [1, 1, 1]       [85, 673, 394]
    1  [2, 2, 2, 2]  [85, 224, 623, 394]
    2     [3, 3, 3]      [263, 673, 394]
    >>>
    >>> aug_df = instance_crossover_augmentation(df)
    >>> aug_df
                 id          local_label
    0     [1, 1, 1]       [85, 673, 394]
    1  [2, 2, 2, 2]  [85, 224, 623, 394]
    2     [3, 3, 3]      [263, 673, 394]
    3     [1, 2, 2]       [85, 623, 394]
    4  [2, 2, 1, 1]  [85, 224, 673, 394]
    5  [2, 2, 3, 3]  [85, 224, 673, 394]
    6     [3, 2, 2]      [263, 623, 394]
    """
    df = data.copy()

    df[DESTINY] = df[label_local].apply(lambda x: x[-1])
    df[START] = df[label_local].apply(lambda x: x[0])

    frames = {}
    destinations = df[DESTINY].unique()
    for idx, dest in progress_bar(enumerate(destinations), total=len(destinations)):
        filter_ = df[df[DESTINY] == dest]

        if restriction == 'departure and destination':
            starts = filter_[START].unique()
            for st in progress_bar(starts, total=len(starts)):
                filter_ = filter_[filter_[START] == st]
                if filter_.shape[0] >= 2:
                    frames[idx] = _augmentation(filter_.iloc[:, :-2], frac=frac)
        else:
            if filter_.shape[0] >= 2:
                frames[idx] = _augmentation(filter_.iloc[:, :-2], frac=frac)

    # concat in insertion order; frames is keyed by the destination index,
    # and destinations with fewer than 2 trajectories leave gaps in the keys
    return pd.concat(list(frames.values()), axis=0, ignore_index=True)
def transition_graph_augmentation_all_vertex(
    traj_df: DataFrame,
    graph: DiGraph | None = None,
    min_path_size: int = 3,
    max_path_size: int = 6,
    max_sampling_source: int = 10,
    max_sampling_target: int = 10,
    source: dict | None = None,
    target: dict | None = None,
    label_local: Text = LOCAL_LABEL,
    simple_paths: bool = False,
    inplace: bool = True
) -> DataFrame:
    """
    Transition Graph Data Augmentation.

    Performs the data increase from the transition graph.

    Parameters
    ----------
    traj_df : DataFrame
        Trajectory data in sequence format
    graph : DiGraph
        Transition graph constructed from trajectory data
    min_path_size : int, optional
        Minimum number of points for the trajectory, by default 3
    max_path_size : int, optional
        Maximum number of points for the trajectory, by default 6
    max_sampling_source : int, optional
        Maximum number of paths to be returned, considering the observed
        origin, by default 10
    max_sampling_target : int, optional
        Maximum number of paths to be returned, considering the observed
        destination, by default 10
    source : dict, optional
        Degree of entry of each node in the graph, by default None
        Example: {node: degree-of-entry}
    target : dict, optional
        Degree of output of each node in the graph, by default None
        Example: {node: degree-of-output}
    label_local : str, optional
        Name of the column referring to the trajectories, by default LOCAL_LABEL
    simple_paths : boolean, optional
        If True, use the paths with the most used sections;
        otherwise, use paths with less used sections, by default False
    inplace : boolean, optional
        If set to True the original dataframe will be altered to contain
        the result of the augmentation; otherwise a copy will be returned,
        by default True

    Return
    ------
    DataFrame
        Increased data set.

    Example
    -------
    >>> from pymove.utils.data_augmentation import (
    ...     transition_graph_augmentation_all_vertex
    ... )
    >>>
    >>> traj_df.to_dict()
    {'id': [[1, 1, 1], [2, 2, 2, 2]],
     'datetime': [['2017-09-02 22:00:27', '2017-09-02 22:01:36',
                   '2017-09-02 22:03:08'],
                  ['2017-09-02 23:03:46', '2017-09-02 23:07:19',
                   '2017-09-02 23:07:40', '2017-09-02 23:09:10']],
     'local_label': [[85, 673, 394], [263, 224, 623, 394]],
     'lat': [[-3.8347478, -3.8235834, -3.813889],
             [-3.9067654, -3.8857223, -3.8828723, -3.9939834]],
     'lon': [[-38.592189, -38.590389, -38.5904445],
             [-38.5907723, -38.5928892, -38.5929789, -38.70409]]}
    >>>
    >>> transition_graph_augmentation_all_vertex(traj_df)
    [263.0, 224.0, 623.0]
    [224.0, 623.0, 394.0]
    """
    if inplace:
        traj_df_ = traj_df
    else:
        traj_df_ = traj_df.copy()

    if graph is None:
        graph = build_transition_graph_from_df(traj_df_)

    if source is None:
        source = dict(graph.nodes)
        source = {key: value['freq_source'] for key, value in source.items()}

    if target is None:
        target = dict(graph.nodes)
        target = {key: value['freq_source'] for key, value in target.items()}

    targets = sorted(target.items(), key=lambda x: x[1], reverse=True)
    sources = sorted(source.items(), key=lambda x: x[1], reverse=True)

    [[get_all_paths(
        traj_df_, graph, s, t, min_path_size, max_path_size,
        max_sampling_source, max_sampling_target, label_local, simple_paths
    ) for s, _ in sources] for t, _ in targets]

    if not inplace:
        return traj_df_
def severDeal(self):
    # The 35 input fields map, in order, to the 35 four-letter column
    # combinations of 'a'..'g', i.e. itertools.combinations('abcdefg', 4),
    # so the repeated per-field blocks collapse into one loop.
    from itertools import combinations

    datadf = None
    for i, cols in enumerate(combinations('abcdefg', 4), start=1):
        text = getattr(self, f'placele{i}').text()
        if text == '':
            continue
        # one row per whitespace-separated group, one column per character
        text = text.replace('\n', '').replace('\r', '').replace('  ', ' ')
        placeArr = [list(p) for p in text.split(' ')]
        placedf = DataFrame(placeArr, columns=list(cols))
        if datadf is None:
            datadf = placedf.copy()
        datadf = pd.merge(datadf, placedf)
    return datadf
# The project name is given as a command-line argument
if len(sys.argv) != 3:
    print("No argument len")
    sys.exit()
project = sys.argv[1]
# maximum number of items that can be taken out
max_num = int(sys.argv[2])

b = joblib.load(f"scripts/result/{project}_2.pkl")
b = DataFrame(b)

# Take the first-look (one-time) users out of b's requesters
df_first_look = b.copy()
df_first_look.drop_duplicates(subset='requester', inplace=True)
df_first_look = df_first_look[["predict_proba", "useful"]]

total_eval_u = 0
num_total = 0
loop_count = 1000
for i in range(loop_count):
    use_datetime = df_first_look.sample(frac=0.1)
    use_datetime = use_datetime.sort_values("predict_proba", ascending=False)
    num_data_u = len(use_datetime)