def __init__(
    self,
    input_df: pd.DataFrame,
    input_folder: dataiku.Folder = None,
    minimum_score: float = 0.0,
    orientation_correction: bool = True,
    column_prefix: AnyStr = "text_api",
    error_handling: ErrorHandlingEnum = ErrorHandlingEnum.LOG,
    parallel_workers: int = DEFAULT_PARALLEL_WORKERS,
):
    super().__init__(
        input_df=input_df,
        input_folder=input_folder,
        column_prefix=column_prefix,
        error_handling=error_handling,
        parallel_workers=parallel_workers,
    )
    self.minimum_score = float(minimum_score)
    self.orientation_correction = bool(orientation_correction)
    self.orientation_column = generate_unique("orientation_correction", input_df.keys(), column_prefix)
    self.text_column_list = generate_unique("detections_list", input_df.keys(), column_prefix)
    self.text_column_concat = generate_unique("detections_concat", input_df.keys(), column_prefix)
    self._compute_column_description()
def test_read_visibility():
    # A fake visibility is first created and written to file.
    # Then the file is read and the two visibilities (fake and reread) are compared.
    from pandas import DataFrame
    from numpy import sqrt
    from numpy.random import random
    from numpy.testing import assert_allclose, assert_equal
    from tempfile import NamedTemporaryFile
    from purify import read_visibility

    N = 10
    noise = random(N)
    expected = DataFrame({
        'u': random(N),
        'v': random(N),
        'w': [0] * N,
        'noise': (1 + 1j) / sqrt(2) * noise,
        'y': random(N) + 1j * random(N)
    })
    csv = DataFrame({
        'u': expected['u'],
        'v': expected['v'],
        'yreal': expected['y'].real,
        'yimag': expected['y'].imag,
        'noise': noise
    })
    with NamedTemporaryFile(delete=True) as file:
        file.close()
        csv.to_csv(file.name, header=False, columns=['u', 'v', 'yreal', 'yimag', 'noise'])
        actual = read_visibility(file.name)

    # compare what was read back against what was written
    assert_equal(set(actual.keys()), set(expected.keys()))
    for name in expected.keys():
        assert_allclose(actual[name], expected[name],
                        err_msg="Columns %s did not compare" % name)
def __init__(
    self,
    input_df: pd.DataFrame,
    num_objects: int,
    orientation_correction: bool = True,
    input_folder: dataiku.Folder = None,
    column_prefix: AnyStr = "object_api",
    error_handling: ErrorHandlingEnum = ErrorHandlingEnum.LOG,
    parallel_workers: int = DEFAULT_PARALLEL_WORKERS,
):
    super().__init__(
        input_df=input_df,
        input_folder=input_folder,
        column_prefix=column_prefix,
        error_handling=error_handling,
        parallel_workers=parallel_workers,
    )
    self.num_objects = int(num_objects)
    self.orientation_correction = bool(orientation_correction)
    self.orientation_column = generate_unique("orientation_correction", input_df.keys(), column_prefix)
    self.label_list_column = generate_unique("label_list", input_df.keys(), column_prefix)
    self.label_name_columns = [
        generate_unique("label_" + str(n + 1) + "_name", input_df.keys(), column_prefix)
        for n in range(num_objects)
    ]
    self.label_score_columns = [
        generate_unique("label_" + str(n + 1) + "_score", input_df.keys(), column_prefix)
        for n in range(num_objects)
    ]
    self._compute_column_description()
def prepare_opm_reference_data(df_opm: pd.DataFrame, str_key: str, n_real: int) -> np.ndarray:
    """
    Extracts data from selected columns of the Pandas dataframe containing data from the
    reference simulation, rearranges it into a stacked column vector (preserving the
    original order) and repeats it n_real times to form a matrix for comparison with data
    from an ensemble of n_real FlowNet simulations.

    Args:
        df_opm: Pandas dataframe containing data from the reference simulation
        str_key: string used to select columns; column names containing str_key are kept
        n_real: size of the ensemble of FlowNet simulations

    Returns:
        A numpy 2D array [length_data * nb_selected_columns, n_real] containing data from
        the selected columns (i.e., the quantity of interest for the accuracy metric) of the
        reference simulation, stacked into a column vector and replicated into n_real columns
    """
    keys = df_opm.keys()
    keys = sorted(keys[df_opm.keys().str.contains(str_key)])
    data = np.transpose(np.tile(df_opm[keys].values.flatten(), (n_real, 1)))
    return data
def combine_columns(data: pd.DataFrame, p, format_func=lambda a, b: f'{a} ({b})'):
    """
    Combine column pairs such as 'x mean', 'x std' into a single column formatted as 'x: `mean (std)`'.

    :param p: list of tuples with paired keys, e.g. [('mean', 'std')]
    """
    old_keys = []
    for key in data.keys():
        for k1, k2 in p:
            if k1 in key:
                # assume format is either 'x mean' or 'mean x'
                other_key = key.replace(k1, k2)  # TODO use regex
                prefix = ' '.join(k for k in key.split(' ') if k not in [k1, k2])
                n_decimals = 5
                pairs = zip(data[key].round(n_decimals), data[other_key].round(n_decimals))
                formatted = [format_func(a, b) for a, b in pairs]
                data.loc[:, prefix] = pd.Series(formatted, index=data.index)
                old_keys.append(key)
                if f'{prefix} {k2}' in data.keys():
                    old_keys.append(f'{prefix} {k2}')
                elif f'{k2} {prefix}' in data.keys():
                    old_keys.append(f'{k2} {prefix}')
    data.drop(columns=old_keys, inplace=True)
def _prepare_df_for_cleaning(self, df: pd.DataFrame, text_column: AnyStr, language_column: AnyStr, language: AnyStr) -> None:
    """Private method to prepare a Pandas dataframe in-place before feeding it to the `self.clean_df` method

    Tokenizes the content of the text column into a new column containing spaCy documents
    Adds new columns to hold the future outputs of the cleaner method

    Args:
        df: Input pandas DataFrame
        text_column: Name of the column containing text data
        language_column: Name of the column with language codes in ISO 639-1 format
        language: Language code in ISO 639-1 format
            If equal to "language_column" this parameter is ignored in favor of language_column

    """
    self.output_column_descriptions = {}
    for k, v in self.OUTPUT_COLUMN_DESCRIPTIONS.items():
        if k == "cleaned":
            column_name = generate_unique(k, df.keys(), text_column)
            self.output_column_descriptions[column_name] = v
        elif k in self.token_filters and self.keep_filtered_tokens:
            column_name = generate_unique(f"{v.lower()}s", df.keys(), text_column)
            self.output_column_descriptions[column_name] = f"{v}s in the original text"
    self.tokenizer.tokenize_df(df, text_column, language_column, language)
def get_submodel_copasi(submodel_path: str, model_info: pd.DataFrame):
    """
    Loads a Copasi file, given the (relative) path to the folder containing the Copasi model.
    It also extracts the corresponding SBML file from the model list and returns it along
    with the model, in case any postprocessing of the Copasi results is necessary.
    """
    # skip if no Copasi model path is given
    if str(submodel_path) in ('', 'nan', 'NaN'):
        return None, None
    copasi_file = os.path.join(DIR_MODELS, submodel_path)

    # if the amici import did not work, we don't want to consider this model
    if 'amici_path_final' in model_info.keys():
        model_row = model_info.loc[model_info['copasi_path_final'] == submodel_path]
    elif 'amici_path' in model_info.keys():
        model_row = model_info.loc[model_info['copasi_path'] == submodel_path]
    else:
        return None, None
    id = int(model_row.index.values)

    # import the sbml model
    sbml_path = os.path.join(DIR_MODELS, model_row.loc[id, 'regrouped_path'])
    sbml_model = (libsbml.readSBML(sbml_path)).getModel()

    return copasi_file, sbml_model
def get_contingency_table_general(data: pd.DataFrame, bases: Dict[str, int]) -> np.ndarray:
    # ctable[x, y, z, ...]
    ctable = np.ones(tuple([v for v in bases.values()]))
    if any([data[k].empty for k in data.keys()]):
        return ctable

    thresholds = dict()
    for var_name in data.keys():
        thresholds[var_name] = np.quantile(
            data[var_name],
            [i / bases[var_name] for i in range(1, bases[var_name] + 1)])
        if thresholds[var_name][0] == thresholds[var_name][len(thresholds[var_name]) - 1]:
            thresholds[var_name][0] = 0

    for index, row in data.iterrows():
        table_index = list()
        for var_name in data.keys():
            for i, thres in enumerate(thresholds[var_name]):
                if row[var_name] <= thres:
                    table_index.append(i)
                    break
        table_index = tuple(table_index)
        ctable[table_index] = ctable[table_index] + 1
    return ctable
def prepare_flownet_data(df_flownet: pd.DataFrame, str_key: str, n_real: int) -> np.ndarray:
    """
    Extracts data from selected columns of the Pandas dataframe containing data from an
    ensemble of FlowNet simulations and rearranges it into a matrix of stacked column
    vectors, preserving the original order, i.e. one column per realization of the ensemble.

    Args:
        df_flownet: Pandas dataframe containing data from the ensemble of FlowNet simulations
        str_key: string used to select columns; column names containing str_key are kept
        n_real: size of the ensemble of FlowNet simulations

    Returns:
        A numpy 2D array [length_data * nb_selected_columns, n_real] containing data from the
        selected columns (i.e., the quantity of interest for the accuracy metric) for an
        ensemble of FlowNet simulations; each column corresponds to one realization of the ensemble
    """
    keys = df_flownet.keys()
    keys = sorted(keys[df_flownet.keys().str.contains(str_key)])
    data = df_flownet[keys].values.flatten()
    data = np.reshape(data, (data.shape[0] // n_real, n_real), order="F")
    return data
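# Illustrative sketch, not part of the original code: one way the two helpers above could be
# combined to score an ensemble against the reference run. The column key "WOPR" and the RMSE
# reduction are assumptions for the example only; np/pd are assumed imported as in the snippets above.
def ensemble_rmse_sketch(df_opm: pd.DataFrame, df_flownet: pd.DataFrame, n_real: int) -> np.ndarray:
    reference = prepare_opm_reference_data(df_opm, "WOPR", n_real)  # shape: [n_data, n_real]
    ensemble = prepare_flownet_data(df_flownet, "WOPR", n_real)     # shape: [n_data, n_real]
    # root-mean-square error per realization (one value per ensemble member)
    return np.sqrt(np.mean((ensemble - reference) ** 2, axis=0))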
def project(self, dm: pd.DataFrame) -> pd.DataFrame:
    """
    Project supplementary samples onto the Space. The DM must be strictly finite and real.
    There should be a column for each active sample.

    :param dm: a metric distance matrix
    :return: a pandas data frame with coordinates of supplementary samples, one sample per
        row; each column encodes a dimension in the Space; columns are sorted with respect
        to the fraction of variance explained by the corresponding dimensions in
        descending order.
    """
    # make sure `dm` is fine
    if not len(dm):
        raise ValueError("a distance matrix can't be empty")
    if not set(dm.keys()) == set(self.keys):
        raise ValueError('a dm must contain distances from supplementary '
                         'samples to active samples and its columns must '
                         'be named after the active samples')
    if len(dm.select_dtypes(include=[np.number]).columns) != len(dm.keys()):
        raise ValueError('a distance matrix must be strictly numeric')
    # make sure all columns are in correct order
    distances = cast(np.ndarray, dm[self.keys].values.copy())
    if not np.isfinite(distances).all():
        raise ValueError("all values in the distance matrix must be finite")
    n_act = len(self.keys)
    n_sup = distances.shape[0]
    d_sup = distances ** 2
    masses_sup = np.full((n_act, n_sup), (1 / n_act))
    s_sup = -0.5 * self._masses_act @ (d_sup.T - (self._d_act @ masses_sup))
    f_sup = s_sup.T @ self.active.values @ np.diag(self._values ** -1)
    return pd.DataFrame(f_sup, index=list(dm.index))
def dump_data(df: pd.DataFrame, label='', print_types=False):
    """
    Print the contents of a pandas DataFrame on the screen. Used for testing purposes.

    :param df: pandas data frame
    :param label: label to print in output header
    :param print_types: print data types along with values
    """
    assert isinstance(df, pd.DataFrame)
    aux = '-- ' + label + ' ' if label else ''
    delimiter = aux + '-' * 100
    print(delimiter)
    print('+ ', end='')
    for col in df.keys():
        print('[' + str(col) + ']', end=' ')
    print('\n')
    for row in list(df.index.values):
        for col in df.keys():
            value = df.at[row, col]
            if print_types:
                print('[' + str(value) + ' ' + str(type(value)) + ']', end=' ')
            else:
                print('[' + str(value) + ']', end=' ')
        print()
    print()
def create_tree(df: pd.DataFrame) -> dict:
    '''ID3 decision tree creation algorithm. Requires df to be a pd.DataFrame
    whose last column holds the label of the data.'''
    label = df.keys()[-1]

    # Get the attribute with maximum information gain
    node = df.keys()[:-1][np.argmax([
        total_entropy(df) - attribute_entropy(df, key) for key in df.keys()[:-1]
    ])]

    # Get the distinct values of that attribute
    att_values = df[node].unique()

    # Create the tree
    tree = {}
    tree[node] = {}
    for att_value in att_values:
        # Create a table restricted to this attribute value
        subtable = df[df[node] == att_value].reset_index(drop=True)
        # Count how many outcomes there are for this attribute value
        table_values, counts = np.unique(subtable[label], return_counts=True)
        # If there is only one outcome, this branch is a leaf
        if len(counts) == 1:
            tree[node][att_value] = table_values[0]
        # If there is more than one outcome, recurse to create another node
        else:
            tree[node][att_value] = create_tree(subtable)
    return tree
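# Illustrative sketch, not part of the original code: classifying a single sample with the
# nested-dict tree returned by create_tree. `sample` is assumed to be a dict or pandas Series
# mapping attribute names to values; an attribute value unseen during training returns None.
def predict_with_tree(tree: dict, sample):
    node = next(iter(tree))          # attribute tested at this node
    value = sample[node]             # the sample's value for that attribute
    subtree = tree[node].get(value)  # branch for that value, if it was seen during training
    if isinstance(subtree, dict):
        return predict_with_tree(subtree, sample)
    return subtree                   # leaf label, or None for an unseen value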
def top_correlated_features(df: DataFrame, target_feature, n=5):
    """
    Returns the names of features most strongly correlated (correlation close to 1 or -1)
    with a target feature. Correlation is in the Pearson's-r sense.

    :param df: A pandas dataframe.
    :param target_feature: The name of the target feature.
    :param n: Number of top features to return.
    :return: A tuple of
        - top_n_features: Sequence of the top feature names
        - top_n_corr: Sequence of correlation coefficients of above features
        Both the returned sequences should be sorted so that the best (most correlated)
        feature is first.
    """
    # TODO: Calculate correlations with target and sort features by it
    # ====== YOUR CODE: ======
    target = df[target_feature]
    features = df.keys().drop(target_feature)
    correlations = Series([pearsonCorr(df[name], target) for name in features], index=features)
    # sort by absolute correlation so that values close to -1 also rank highly
    top_n = correlations.reindex(correlations.abs().sort_values(ascending=False).index)[:n]
    # ========================
    return top_n.keys(), top_n.values
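# Illustrative sketch, not part of the original code: `pearsonCorr` is referenced above but not
# defined in this snippet. A minimal, dependency-free implementation of Pearson's r could look
# like this (x and y are assumed to be equal-length numeric sequences).
def pearsonCorr(x, y) -> float:
    x = [float(a) for a in x]
    y = [float(b) for b in y]
    mx = sum(x) / len(x)
    my = sum(y) / len(y)
    # r = cov(x, y) / (std(x) * std(y))
    cov = sum((a - mx) * (b - my) for a, b in zip(x, y))
    var_x = sum((a - mx) ** 2 for a in x)
    var_y = sum((b - my) ** 2 for b in y)
    return cov / (var_x * var_y) ** 0.5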
def test_python_to_c_to_python():
    """ Cycle visibility bindings from python to C to python. """
    from pandas import DataFrame
    from numpy import sqrt
    from numpy.random import random
    from numpy.testing import assert_allclose, assert_equal
    from purify.tests.visibility_testing import _bindings_cycle

    N = 10
    noise = random(N)
    expected = DataFrame({
        'u': random(N),
        'v': random(N),
        'w': random(N),
        'noise': (1 + 1j) / sqrt(2) * noise,
        'y': random(N) + 1j * random(N)
    })
    actual = _bindings_cycle(expected)

    # compare the cycled result against the original
    assert_equal(set(actual.keys()), set(expected.keys()))
    for name in expected.keys():
        assert_allclose(actual[name], expected[name],
                        err_msg="Columns %s did not compare" % name)
def print_measurements(mdf: pd.DataFrame):
    """
    This function will print out the results of `measure.measure_subjects`.

    :param mdf: pd.DataFrame returned from `measure.measure_subjects`
    """
    print(','.join(['nifti'] + list(mdf.keys())))
    for i in range(0, len(mdf)):
        print(','.join([mdf.index[i]] + [str(mdf[k][i]) for k in mdf.keys()]))
def __init__(
    self,
    input_df: pd.DataFrame,
    sentiment_scale: AnyStr = "ternary",
    column_prefix: AnyStr = "sentiment_api",
    error_handling: ErrorHandlingEnum = ErrorHandlingEnum.LOG,
):
    super().__init__(input_df, column_prefix, error_handling)
    self.sentiment_scale = sentiment_scale
    self.sentiment_score_column = generate_unique("score", input_df.keys(), self.column_prefix)
    self.sentiment_score_scaled_column = generate_unique("score_scaled", input_df.keys(), column_prefix)
    self.sentiment_magnitude_column = generate_unique("magnitude", input_df.keys(), column_prefix)
    self._compute_column_description()
def populate_sell_trend(self, dataframe: DataFrame, metadata: dict) -> DataFrame:
    conditions = []
    for ma_count in range(self.sell_ma_count.value):
        key = ma_count * self.sell_ma_gap.value
        past_key = (ma_count - 1) * self.sell_ma_gap.value
        if past_key > 1 and key in dataframe.keys() and past_key in dataframe.keys():
            conditions.append(dataframe[key] > dataframe[past_key])
    if conditions:
        dataframe.loc[reduce(lambda x, y: x | y, conditions), "sell"] = 1
    return dataframe
def clean_df(
    self,
    df: pd.DataFrame,
    text_column: AnyStr,
    language_column: AnyStr = "",
    language: AnyStr = "language_column",
) -> pd.DataFrame:
    """Public method to clean a text column in a pandas DataFrame, given language information

    Prepare the dataframe with `self._prepare_df_for_cleaning` to obtain a new column with spaCy documents
    Run `self.clean_document` on all documents with multithreading
    Format the output dataframe

    Args:
        df: Input pandas DataFrame
        text_column: Name of the column containing text data
        language_column: Name of the column with language codes in ISO 639-1 format
        language: Language code in ISO 639-1 format
            If equal to "language_column" this parameter is ignored in favor of language_column

    Returns:
        Input dataframe with new columns at the end:
        - Cleaned text after filter, lemmatization, lowercase and unicode normalization steps
        - One column for each selected `self.token_filters` with a concatenation of filtered tokens

    """
    self._prepare_df_for_cleaning(df, text_column, language_column, language)
    start = perf_counter()
    logging.info(f"Cleaning {len(df.index)} document(s)...")
    output = [{}] * len(df.index)
    doc_iterator = (doc for doc in df[self.tokenizer.tokenized_column])
    with ThreadPoolExecutor(max_workers=self.DEFAULT_NUM_THREADS) as executor:
        output = list(executor.map(lambda x: self.clean_document(x), doc_iterator))
    for k, v in self.OUTPUT_COLUMN_DESCRIPTIONS.items():
        if k == "cleaned":
            column_name = generate_unique(k, df.keys(), text_column)
            df[column_name] = [d.get(k, "") for d in output]
        elif k in self.token_filters and self.keep_filtered_tokens:
            column_name = generate_unique(f"{v.lower()}s", df.keys(), text_column)
            df[column_name] = [d.get(k, "") for d in output]
    logging.info(
        f"Cleaning {len(df.index)} document(s): done in {perf_counter() - start:.2f} seconds"
    )
    del df[self.tokenizer.tokenized_column]
    return df
def get_correlations(df: pd.DataFrame):
    correlation = df.corr()

    # method 1
    seaborn_ax = sns.heatmap(correlation,
                             xticklabels=correlation.columns.values,
                             yticklabels=correlation.columns.values)

    # method 2
    fig = go.Figure()
    fig.add_heatmap(x=df.keys(), y=df.keys(), z=correlation)

    # method 3
    pairwise_correlations = sns.pairplot(df)

    return seaborn_ax, fig, pairwise_correlations
def dump_star(file_name: str, data: pd.DataFrame, version: str) -> None:
    """
    Create a star file.

    Arguments:
    file_name - File name to export
    data - Data to export
    version - Output version string

    Returns:
    None
    """
    header: typing.List[str]
    new_header: typing.List[str]
    old_header: typing.List[str]
    prefix: str

    new_header, old_header, prefix = \
        export_star_header(header_names=data.keys(), version=version)
    header = create_star_header(names=new_header, prefix=prefix)
    util.dump_file(
        file_name=file_name,
        data=data[old_header],
        header=header,
        vertical=True
    )
def count_missing_values(data: pd.DataFrame) -> [FeatureAndValue]:
    out = []
    for f in data.keys():
        x = data[f]
        t = x[pd.isna(x)].size
        out.append(FeatureAndValue(f, t))
    return out
def plot_sparameters(
    df: DataFrame,
    logscale: bool = True,
    keys: Optional[Tuple[str, ...]] = None,
    **sim_settings,
):
    """Plots Sparameters from a pandas DataFrame.

    Args:
        df: Sparameters pandas DataFrame
        logscale: plots 20*log10(S)
        keys: list of keys to plot, plots all by default.

    Keyword Args:
        sim_settings: simulation settings for the write_sparameters_function
    """
    w = df["wavelengths"] * 1e3
    keys = keys or [
        key for key in df.keys() if key.lower().startswith("s") and key.endswith("m")
    ]

    for key in keys:
        if key in df:
            y = df[key]
            y = 20 * np.log10(y) if logscale else y
            plt.plot(w, y, label=key[:-1])
        else:
            raise ValueError(f"{key} not in {df.keys()}")
    plt.legend()
    plt.xlabel("wavelength (nm)")
    plt.ylabel("|S| (dB)") if logscale else plt.ylabel("|S|")
def drop_cols(df: DataFrame, cols: List[str]) -> DataFrame:
    """Remove the list of columns from the dataframe.

    Listed columns that are not available in the dataframe are simply ignored."""
    df = df.copy()
    cols_to_remove = set(cols).intersection(set(df.keys()))
    df = df.drop(cols_to_remove, axis=1)
    return df
def load_resspect_photometry_df(photometry_df: pd.DataFrame) -> pd.DataFrame:
    """
    Returns updated RESSPECT photometry dataframe by dropping unnecessary columns
    ('SNID', 'FLT' and 'SIM_MAGOBS' columns are dropped here)

    Parameters
    ----------
    photometry_df
        RESSPECT photometry dataframe

    Returns
    -------
    photometry_df
        RESSPECT photometry dataframe after dropping unnecessary columns
    """
    photometry_dict = {
        'mjd': photometry_df['MJD'].values,
        'band': photometry_df['band'].values,
        'flux': photometry_df['FLUXCAL'].values,
        'fluxerr': photometry_df['FLUXCALERR'].values
    }
    if 'SNR' in photometry_df.keys():
        photometry_dict['SNR'] = photometry_df['SNR'].values
    else:
        photometry_dict['SNR'] = (photometry_dict['flux'] / photometry_dict['fluxerr'])
    return pd.DataFrame(photometry_dict)
def get_transformed_features_df(
    self, full_feature_names: bool, df_with_features: pd.DataFrame
) -> pd.DataFrame:
    # Apply on demand transformations
    # TODO(adchia): Include only the feature values from the specified input FVs in the ODFV.
    # Copy over un-prefixed features even if not requested since transform may need it
    columns_to_cleanup = []
    if full_feature_names:
        for input in self.inputs.values():
            if type(input) != FeatureView:
                continue
            input_fv = cast(FeatureView, input)
            for feature in input_fv.features:
                full_feature_ref = f"{input_fv.name}__{feature.name}"
                if full_feature_ref in df_with_features.keys():
                    df_with_features[feature.name] = df_with_features[full_feature_ref]
                    columns_to_cleanup.append(feature.name)

    # Compute transformed values and apply to each result row
    df_with_transformed_features = self.udf.__call__(df_with_features)

    # Cleanup extra columns used for transformation
    df_with_features.drop(columns=columns_to_cleanup, inplace=True)
    return df_with_transformed_features
def group_dataframe(df: pd.DataFrame, policy: Dict):
    metric = df.columns[1]
    try:
        metric_column_value = str(df["Metric"][0].unique()[0])
    except AttributeError:
        metric_column_value = str(df["Metric"][0])

    data_grouped = {"Metric": [], metric: [], "": [], "Approach": []}
    for key in df.keys():
        data_grouped[key] = []

    for key, values in policy.items():
        approaches = df["Approach"].unique()
        for approach in approaches:
            mean_metric = None
            for value in values:
                row = df.loc[(df[""] == value) & (df["Approach"] == approach)]
                metric_value = float(row[metric])
                # check against None explicitly so a metric value of 0.0 is not skipped
                if mean_metric is None:
                    mean_metric = metric_value
                else:
                    mean_metric = statistics.mean([mean_metric, metric_value])
            data_grouped["Metric"].append(metric_column_value)
            data_grouped[metric].append(mean_metric)
            data_grouped[""].append(key)
            data_grouped["Approach"].append(approach)

    grouped_df = pd.DataFrame(data_grouped, columns=list(data_grouped.keys()))
    return grouped_df
def get_photometry_with_id_name_and_snid(
        full_photometry: pd.DataFrame, id_names_list: list,
        snid: int) -> Tuple[pd.DataFrame, Union[str, None]]:
    """
    Loads photometry data for the given SNID. The full_photometry DataFrame should contain
    one of the column names passed in id_names_list; otherwise the function returns an empty
    dataframe and None for the SNID column name.

    Parameters
    ----------
    full_photometry
        photometry DataFrame
    id_names_list
        list of available SNID column names
    snid
        SNID

    Returns
    -------
    full_photometry
        full photometry data
    snid_column_name
        SNID column name
    """
    for snid_column_name in id_names_list:
        if snid_column_name in full_photometry.keys():
            snid_indices = full_photometry[snid_column_name] == snid
            return full_photometry[snid_indices], snid_column_name
    return pd.DataFrame(), None
def list_df_to_matrix(D: pd.DataFrame):
    X = []
    for k in D.keys():
        v = np.vstack(D[k].values)
        X.append(v)
    X = np.hstack(X)
    return X, D.index
def get_header(kw: str, data: pd.DataFrame) -> str:
    data = data.head().to_dict()
    formats = [kw[0].upper() + kw[1:], kw.upper(), kw.lower()]
    for f in formats:
        if f in data.keys():
            return f
    raise KeyError
def plot_phonemes(path):
    phoneme_embeddings = dict()
    for line in codecs.open(path, "r"):
        line = line.split(",")
        key = line[0][1:-1]
        emb = line[1:]
        emb[-1] = emb[-1][:-1]
        emb = np.array([float(e) for e in emb])
        phoneme_embeddings[key] = emb

    phoneme_embeddings = DataFrame(phoneme_embeddings, columns=phoneme_embeddings.keys())
    print(phoneme_embeddings.columns)

    m = TSNE()
    phoneme_embeddings_tsne = m.fit_transform(phoneme_embeddings.transpose())
    print(len(phoneme_embeddings_tsne))

    for p, emb in zip(phoneme_embeddings.columns, phoneme_embeddings_tsne):
        c = "black"
        if regex.search("^[aeiou3E][*]?$", p):
            c = "red"
        plt.annotate(p, (emb[0], emb[1]), color=c)
        if regex.search("^.*w~$", p):
            c = "blue"
            plt.annotate(p, (emb[0], emb[1]), color=c)
        if regex.search("^.*y~$", p):
            c = "yellow"
            plt.annotate(p, (emb[0], emb[1]), color=c)
        if regex.search("^.*h~$", p):
            c = "brown"
            plt.annotate(p, (emb[0], emb[1]), color=c)
        if regex.search("^.*\"$", p):
            c = "green"
            plt.annotate(p, (emb[0], emb[1]), color=c)
def __char_to_int():
    names = get_names()
    df = read_csv('../../data/agaricus-lepiota.data', names=names)
    # drop the columns with many missing values / no useful variation
    df.drop('stalk-root', axis=1, inplace=True)
    df.drop('veil-type', axis=1, inplace=True)

    dataSet = []
    for d in df.values:
        data = []
        for cidx in range(len(d)):
            # label column
            if cidx == 0:
                if d[cidx] == 'p':
                    data.append(0)
                else:
                    data.append(1)
            # feature columns
            else:
                data.append(ord(d[cidx]) - ord('a'))
        dataSet.append(data)

    result = DataFrame(dataSet, columns=df.keys())
    with open('../../data/data_preceded.csv', 'w') as f:
        writer = csv.writer(f)
        writer.writerow(result.keys())
        writer.writerows(result.values)
def move_api_columns_to_end(
    df: pd.DataFrame, api_column_names: NamedTuple, error_handling: ErrorHandlingEnum = ErrorHandlingEnum.LOG
) -> pd.DataFrame:
    """
    Move non-human-readable API columns to the end of the dataframe
    """
    api_column_names_dict = api_column_names._asdict()
    if error_handling == ErrorHandlingEnum.FAIL:
        api_column_names_dict.pop("error_message", None)
        api_column_names_dict.pop("error_type", None)
    if not any(["error_raw" in k for k in df.keys()]):
        api_column_names_dict.pop("error_raw", None)
    cols = [c for c in df.keys() if c not in api_column_names_dict.values()]
    new_cols = cols + list(api_column_names_dict.values())
    df = df.reindex(columns=new_cols)
    return df
def dist2D(dist: pd.DataFrame, ranges: pd.DataFrame, nlevels: int = 16,
           nx: int = 2, size: int = 6, colorbar: bool = True,
           name: str = 'dist') -> plt.Figure:
    """
    Plot 2D probability distributions.

    Parameters
    ----------
    dist : Multiindexed dataframe with force field as primary index
        and distributions as created by dist2D().
    ranges : Multiindexed dataframe with force field as primary index
        and edges as created by dist1D().
    nlevels : Number of contour levels to use.
    nx : Number of plots per row.
    size : Relative size of each plot.
    colorbar : If true, will plot a colorbar.
    name : Name of the distribution.

    Returns
    -------
    fig : matplotlib figure.
    """
    # Setup plotting parameters
    nplots = dist.shape[1]
    xsize, ysize = nx, (nplots // nx) + 1
    cmap = plt.get_cmap('viridis')
    fig = plt.figure(figsize=(xsize * size, ysize * size))

    for i, k in enumerate(dist.keys()):
        # Get keys for both CVs
        kx, ky = k.split('.')

        # Prepare plotting grid (np.meshgrid doesn't work)
        X = np.broadcast_to(ranges[kx], dist[k].unstack().shape)
        Y = np.broadcast_to(ranges[ky], dist[k].unstack().shape).T
        Z = dist[k].unstack().values.T

        # Contour levels taking inf into account
        levels = np.linspace(np.amin(Z[~np.isinf(Z)]),
                             np.amax(Z[~np.isinf(Z)]), nlevels)

        ax = fig.add_subplot(ysize, xsize, i + 1)
        cm = ax.contourf(X, Y, Z, cmap=cmap, levels=levels)
        ax.set_xlabel(kx)
        ax.set_ylabel(ky)
        ax.set_title(name)
        if colorbar:
            fig.colorbar(cm)
    return fig
def df2boxplots(sc_df: pd.DataFrame) -> None:
    rows = 5
    # integer division so the subplot grid dimensions are valid integers
    cols = (len(sc_df.keys()) // 5) + 1
    for i, flt in enumerate(sc_df):
        if flt in ['description', 'SCORE:']:
            continue
        ax = plt.subplot(rows, cols, i + 1)
        plt.boxplot(sc_df[flt].tolist())
        plt.title(flt)
    plt.show()
def plot_languages(path):
    phoneme_embeddings = dict()
    for line in codecs.open(path, "r"):
        line = line.split(",")
        key = line[0][1:-1]
        emb = line[1:]
        emb[-1] = emb[-1][:-1]
        emb = np.array([float(e) for e in emb])
        phoneme_embeddings[key] = emb

    phoneme_embeddings = DataFrame(phoneme_embeddings, columns=phoneme_embeddings.keys())
    print(phoneme_embeddings.columns)

    languages = [
        "STANDARD_GERMAN", "BERNESE_GERMAN", "EASTERN_FRISIAN", "NORTH_FRISIAN_AMRUM",
        "ENGLISH", "DUTCH", "YIDDISH_EASTERN", "YIDDISH_WESTERN", "DANISH", "SWEDISH",
        "FAROESE", "NORWEGIAN_RIKSMAL", "GJESTAL_NORWEGIAN", "NORWEGIAN_BOKMAAL",
        "NORWEGIAN_NYNORSK_TOTEN", "SANDNES_NORWEGIAN", "ICELANDIC",
        "POLISH", "KASHUBIAN", "CZECH", "LOWER_SORBIAN", "UPPER_SORBIAN", "SLOVAK",
        "SLOVENIAN", "MACEDONIAN", "BULGARIAN", "UKRAINIAN", "BELARUSIAN", "RUSSIAN",
        "ARABIC_CYPRIOT_SPOKEN", "ARABIC_GULF_SPOKEN", "ARABIC_LIBYAN_SPOKEN",
        "ARABIC_NORTH_LEVANTINE_SPOKEN", "ARABIC_SUDANESE_SPOKEN", "CAIRO_ARABIC",
        "DAMASCUS_ARABIC"]

    m = TSNE()
    phoneme_embeddings_tsne = m.fit_transform(phoneme_embeddings[languages].transpose())
    print(len(phoneme_embeddings_tsne))

    for p, emb in zip(languages, phoneme_embeddings_tsne):
        c = "black"
        if p in ["STANDARD_GERMAN", "BERNESE_GERMAN", "EASTERN_FRISIAN", "FRISIAN_WESTERN",
                 "NORTH_FRISIAN_AMRUM", "ENGLISH", "DUTCH", "YIDDISH_EASTERN", "YIDDISH_WESTERN",
                 "DANISH", "SWEDISH", "FAROESE", "NORWEGIAN_RIKSMAL", "GJESTAL_NORWEGIAN",
                 "NORWEGIAN_BOKMAAL", "NORWEGIAN_NYNORSK_TOTEN", "SANDNES_NORWEGIAN", "ICELANDIC"]:
            c = "red"
        plt.annotate(p, (emb[0], emb[1]), color=c)
        if p in ["POLISH", "KASHUBIAN", "CZECH", "LOWER_SORBIAN", "UPPER_SORBIAN", "SLOVAK",
                 "SLOVENIAN", "MACEDONIAN", "BULGARIAN", "UKRAINIAN", "BELARUSIAN", "RUSSIAN"]:
            c = "blue"
            plt.annotate(p, (emb[0], emb[1]), color=c)
        if p in ["ARABIC_CYPRIOT_SPOKEN", "ARABIC_GULF_SPOKEN", "ARABIC_LIBYAN_SPOKEN",
                 "ARABIC_NORTH_LEVANTINE_SPOKEN", "ARABIC_SUDANESE_SPOKEN", "CAIRO_ARABIC",
                 "DAMASCUS_ARABIC"]:
            c = "green"
            plt.annotate(p, (emb[0], emb[1]), color=c)
def dump(self):
    chip_id_full = '999:chip_id_full'
    data = DataFrame(self.PFTR_nd).transpose()
    keys = data.keys()
    key_hard_bin = ''.join(filter(lambda s: 'HARD_BIN' in s, keys))
    key_site_num = ''.join(filter(lambda s: 'SITE_NUM' in s, keys))
    key_soft_bin = ''.join(filter(lambda s: 'SOFT_BIN' in s, keys))
    key_efuse_burned = ''.join(filter(lambda s: 'efuse_burned' in s, keys))
    key_chip_id_part0 = ''.join(filter(lambda s: 'chip_id_part0' in s, keys))
    key_chip_id_part1 = ''.join(filter(lambda s: 'chip_id_part1' in s, keys))

    for i in data[key_soft_bin].unique():
        self.test_yield[i] = {}
        # basename = os.path.basename(self.Path_name)
        # name_front = basename.find('_') + 1
        # name_end = basename.find('---') - 12
        # name = basename[name_front:name_end]
        # self.test_yield[i][name] = {}
        # for j in data[key_site_num].unique():
        #     self.test_yield[i][name][j] = data[(data[key_site_num] == j) & (data[key_soft_bin] == i)][
        #         key_hard_bin].count()
        # self.test_yield[i][name]['total'] = data[(data[key_soft_bin] == i)][key_hard_bin].count()
        # self.test_yield[i][name]['yield'] = '{0:.2%}'.format(
        #     self.test_yield[i][name]['total'] / float(data[key_hard_bin].count()))
        # df_1 = DataFrame(self.test_yield).transpose()
        # dfs = [pd.DataFrame([x for x in df_1[col]], index=df_1.index) for col in df_1.columns]
        # df2 = pd.concat(dfs, axis=1, keys=df_1.columns)
        # df2.columns.names = ['test', 'info']
        # df2.index.names = ['soft_bin']
        for j in data[key_site_num].unique():
            self.test_yield[i][j] = data[(data[key_site_num] == j) & (data[key_soft_bin] == i)][
                key_hard_bin].count()
        self.test_yield[i]['total'] = data[(data[key_soft_bin] == i)][key_hard_bin].count()
        self.test_yield[i]['yield'] = '{0:.2%}'.format(
            self.test_yield[i]['total'] / float(data[key_hard_bin].count()))
    df_1 = DataFrame(self.test_yield).transpose()

    data[chip_id_full] = data[key_chip_id_part0] + data[key_chip_id_part1] * 10000000
    data_id = data[(data[key_hard_bin] == 1) & (data[key_efuse_burned] == 0)][chip_id_full]
    if data_id[data_id.duplicated() == True].count() > 0:
        # record the problem, then abort with an exception object (a bare string cannot be raised)
        with open(self.Path_name + 'duplicated.txt', 'w') as duplicated_txt:
            duplicated_txt.write(self.Path_name + ' is with duplicated chip id')
        raise ValueError(self.Path_name + ' is with duplicated chip id')

    with ExcelWriter(self.Path_name) as writer:
        DataFrame(self.test_info).to_excel(writer, sheet_name='Related')
        DataFrame(self.PMR_nd).transpose().to_excel(writer, sheet_name='PMR')
        DataFrame(self.PFTR_nd).transpose().to_excel(writer, sheet_name='PTR_FTR')
        DataFrame(
            data[(data[key_hard_bin] == 1) & (data[key_efuse_burned] == 0)]).describe().transpose().combine_first(
            DataFrame(self.spec_summ).transpose()).to_excel(writer, sheet_name='summary_spec')
        df_1.to_excel(writer, sheet_name='yield')
def resample_df(original_df, rs_interval='60Min', rs_how='last', window_size=4):
    # resample (newer pandas removed the `how=` keyword; aggregate on the resampler instead)
    df = original_df.copy()
    rs = original_df.resample(rs_interval).agg(rs_how)
    df = DataFrame(rs)
    df = df[pd.notnull(df).any(axis=1)]  # remove full-NaN rows

    # add windows: lagged copies of each column named 'col-1', 'col-2', ...
    for k in df.keys():
        for ind in range(1, window_size):
            vn = str(k) + '-' + str(ind)
            df[vn] = np.hstack((np.array([np.NaN] * ind), df[k].values))[:-ind]

    # destroy first lines
    df = df[window_size - 1:]  # this -1 is destroyed later
    return df
def sensors_estimation(
    signal_data: pd.DataFrame, sensors_delta_distance: list
) -> [np.array]:
    """
    Estimate the speed measured by each sensor from the peak times of its signal.

    :param signal_data: sensor signals (volts), one column per sensor, indexed by time
    :param sensors_delta_distance: distances between consecutive sensors
    :return: list of per-sensor speed estimates
    """
    # x axis: time
    x = signal_data.index.values

    sensors_peak_time = []
    sensors_delta_time = [None]

    for k in signal_data.keys():
        # y axis: volts
        y = signal_data[k].values
        indexes = peakutils.indexes(y, thres=0.5, min_dist=30)
        sensors_peak_time.append(x[indexes])

    for i in range(1, len(sensors_peak_time)):
        sensors_delta_time.append(sensors_peak_time[i] - sensors_peak_time[i - 1])

    # the information about the first sensor should be equal to the second sensor
    sensors_delta_time[0] = sensors_delta_time[1]

    sensors_delta_speed = []
    for i in range(len(sensors_delta_distance)):
        sensors_delta_speed.append(sensors_delta_distance[i] / sensors_delta_time[i])

    # the information about the first sensor should be equal to the second sensor
    sensors_delta_speed[0] = sensors_delta_speed[1]

    return sensors_delta_speed
def agg_by_state(df):
    '''Aggregate data by US state, summing all relevant metrics'''
    # Define lambda functions for aggregation
    count_user = lambda x: sum(x == 'user')
    count_hash = lambda x: sum(x == 'hash')
    count_none = lambda x: sum(x == 'none')
    count_user_hash = lambda x: (count_user(x) / count_hash(x)) \
        if count_hash(x) > 0 else 0

    # Create an aggregation dictionary
    agg_dict = {'count': len,
                'n_user': count_user,
                'n_hash': count_hash,
                'n_none': count_none,
                'user_hash': count_user_hash}

    # Perform aggregation by state
    grouped = df.groupby(by='state', as_index=False)
    df = grouped['u_o_h'].agg(agg_dict)

    # Load state data (raw string so the backslashes are not treated as escape sequences)
    with open(r'J:\WDPRO\BPM\us_states.csv', 'r') as f:
        states = {}
        for abbrev, name in reader(f):
            states[abbrev] = name
    states = DataFrame(data=list(states.values()), index=list(states.keys()))

    # Restrict results to US states
    df = df[df.state.isin(states.index)]

    # Join the full state name
    df = df.join(states, on='state')
    df.rename(columns={0: 'state_name'}, inplace=True)
    df['state_name'] = [i.lower() for i in df['state_name']]

    # Rank the states
    df['count_rank'] = df['count'].rank(ascending=False)

    # Return DataFrame
    return df
def _assemble_from_unit_mappings(arg, errors):
    """
    assemble the unit specified fields from the arg (DataFrame)
    Return a Series for actual parsing

    Parameters
    ----------
    arg : DataFrame
    errors : {'ignore', 'raise', 'coerce'}, default 'raise'

        - If 'raise', then invalid parsing will raise an exception
        - If 'coerce', then invalid parsing will be set as NaT
        - If 'ignore', then invalid parsing will return the input

    Returns
    -------
    Series
    """
    from pandas import to_timedelta, to_numeric, DataFrame
    arg = DataFrame(arg)
    if not arg.columns.is_unique:
        raise ValueError("cannot assemble with duplicate keys")

    # replace passed unit with _unit_map
    def f(value):
        if value in _unit_map:
            return _unit_map[value]

        # m is case significant
        if value.lower() in _unit_map:
            return _unit_map[value.lower()]

        return value

    unit = {k: f(k) for k in arg.keys()}
    unit_rev = {v: k for k, v in unit.items()}

    # we require at least Ymd
    required = ['year', 'month', 'day']
    req = sorted(list(set(required) - set(unit_rev.keys())))
    if len(req):
        raise ValueError("to assemble mappings requires at least that "
                         "[year, month, day] be specified: [{required}] "
                         "is missing".format(required=','.join(req)))

    # keys we don't recognize
    excess = sorted(list(set(unit_rev.keys()) - set(_unit_map.values())))
    if len(excess):
        raise ValueError("extra keys have been passed "
                         "to the datetime assemblage: "
                         "[{excess}]".format(excess=','.join(excess)))

    def coerce(values):
        # we allow coercion to if errors allows
        values = to_numeric(values, errors=errors)

        # prevent overflow in case of int8 or int16
        if is_integer_dtype(values):
            values = values.astype('int64', copy=False)
        return values

    values = (coerce(arg[unit_rev['year']]) * 10000 +
              coerce(arg[unit_rev['month']]) * 100 +
              coerce(arg[unit_rev['day']]))
    try:
        values = to_datetime(values, format='%Y%m%d', errors=errors)
    except (TypeError, ValueError) as e:
        raise ValueError("cannot assemble the "
                         "datetimes: {error}".format(error=e))

    for u in ['h', 'm', 's', 'ms', 'us', 'ns']:
        value = unit_rev.get(u)
        if value is not None and value in arg:
            try:
                values += to_timedelta(coerce(arg[value]), unit=u, errors=errors)
            except (TypeError, ValueError) as e:
                raise ValueError("cannot assemble the datetimes [{value}]: "
                                 "{error}".format(value=value, error=e))

    return values
def addData(self, other,
            sameIndex=True,
            mergeIndex=False,
            prefix=None,
            suffix=None,
            allowExtrapolate=False,
            interpolationMethod="values"):
    """Add data from another DataFrame or Series
    @param other: data as Pandas-DataFrame or Series
    @param sameIndex: assume both have the same indices. If False the other data
    will be interpolated to the current indices
    @param mergeIndex: make the result indices a mixture of the indices"""
    if not sameIndex and mergeIndex:
        raise PandasWrapperPyFoamException("Can't specify sameIndex=False and mergeIndex=True at the same time")
    if not isinstance(other, self.validOtherTypes):
        raise PandasWrapperPyFoamException("Other data is of type", type(other),
                                           "should be one of", self.validOtherTypes)
    if isinstance(other, DataFrame):
        o = other
    else:
        o = DataFrame(other)
    k = o.keys()
    if not self.__allStrings(k):
        raise PandasWrapperPyFoamException("Added data with non-string columns")
    v = k.copy()
    if prefix:
        v = [prefix + n for n in v]
    if suffix:
        v = [n + suffix for n in v]
    if len(set(v) & set(self.keys())) > 0:
        raise PandasWrapperPyFoamException("Keys of this", self.keys(), "and other", v,
                                           "intersect", set(v) & set(self.keys()))
    keys = dict(zip(k, v))

    interpolate = False
    # only interpolate if necessary
    if len(self.index) != len(o.index) or (self.index != o.index).any():
        if sameIndex and not mergeIndex:
            raise PandasWrapperPyFoamException("Other data has different index. Specify sameIndex=False or mergeIndex=True")
        ni = unique(hstack([self.index, o.index]))
        interpolate = True
        if mergeIndex:
            minOld = min(self.index)
            maxOld = max(self.index)
            result = self.reindex(index=ni, copy=False).interpolate(
                method=interpolationMethod)
            if not allowExtrapolate:
                result[result.index < minOld] = float("NaN")
                result[result.index > maxOld] = float("NaN")
        else:
            # make sure we have values at the current position
            # o=o.reindex_axis(ni,axis='index').interpolate(method=interpolationMethod)
            o = o.reindex(index=ni, columns=o.columns).interpolate(method=interpolationMethod)  # ,takeable=True
            result = self.copy()
    else:
        result = self.copy()

    minOld = min(o.index)
    maxOld = max(o.index)
    for k, v in keys.items():
        result[v] = o[k]
        if interpolate:
            result[v] = result[v].interpolate(method=interpolationMethod)
            if not allowExtrapolate:
                result[v][result.index < minOld] = float("NaN")
                result[v][result.index > maxOld] = float("NaN")

    return PyFoamDataFrame(result)
import codecs
import numpy as np
from pandas import DataFrame
from sklearn.neighbors import KNeighborsClassifier
import regex
import evaluation
import sys

phoneme_embeddings = dict()
for line in codecs.open("phoneme_embeddings_plm.csv", "r"):
    line = line.split(",")
    key = line[0][1:-1]
    emb = line[1:]
    emb[-1] = emb[-1][:-1]
    emb = np.array([float(e) for e in emb])
    phoneme_embeddings[key] = emb

phoneme_embeddings = DataFrame(phoneme_embeddings, columns=phoneme_embeddings.keys())

clf = KNeighborsClassifier(n_neighbors=1, algorithm="brute", metric="euclidean")
clf.fit(phoneme_embeddings.transpose(), phoneme_embeddings.columns)

tags = set()
for test in evaluation.SimilarityTestData:
    for tag in test["tags"]:
        tags.add(tag)
tags = list(tags)

print("EVALUATION")
c = dict()
c_true = dict()
c_true_all = 0
for tag in tags:
    c[tag] = 0
from pandas import DataFrame as DF
import numpy as np
import sys

## this stuff should be moved to a unit test
if "--test" in sys.argv:
    ## generate some random csv datastructure
    import faker, random, tempfile
    FK = faker.Faker()
    ncol = random.randint(1, 5)
    nrow = random.randint(100, 400)
    df = DF(dict([(key, np.random.rand(1, nrow)[0])
                  for key in [FK.username() for i in range(ncol - 1)] + ["RT"]]))
    for k in df.keys():
        if k == "RT":
            continue
        if random.random() > 0.5:
            ## turn the column into a binary value
            df[k] = df[k].round()
    TR_duration = 0.5 * random.randint(2, 6)
    ## append a duration and onset
    df['duration'] = TR_duration
    df['onset'] = df.index * TR_duration
    csv_filepath = tempfile.mktemp(suffix=".csv")
    df.to_csv(csv_filepath)
    ## `pe` and `CSVFile` are assumed to be imported elsewhere in the original module
    csvf = pe.Node(name="csvfile", interface=CSVFile())
    csvf.inputs.csv_filepath = csv_filepath
def create_data_set(
    data_file: h5py.File,
    data: pd.DataFrame,
    sample_rate: int = None,
    date_time: datetime = datetime.now(),
    site_id: str = '000',
    lane_id: str = '00',
    temperature: float = None,
    license_plate: str = None,
    sensor_calibration_factory: list = None,
    distance_between_sensors: list = None,
    sensor_type: str = None,
    sensors_layout: str = None,
    channel_configuration: str = None,
    **kwargs
) -> h5py.Dataset:
    """
    :param data_file:
    :param data:
    :param sample_rate: (e.g. 2000)
    :param date_time: (e.g. 2017-49-04 00:49:36)
    :param site_id: (e.g. 001)
    :param lane_id: (e.g. 01)
    :param temperature: (e.g. 28.5)
    :param license_plate: (e.g. AAA9999)
    :param sensor_calibration_factory: (e.g. [0.98, 0.99, 0.75])
    :param distance_between_sensors: (e.g. [1.0, 1.5, 2.0])
    :param sensor_type: (e.g. quartz, polymer, ceramic, mixed)
    :param sensors_layout: (e.g. |/|\|<|>|=|)
    :param channel_configuration: (this is an optional attribute, it is required just
        when sensor type is mixed, e.g. "{'a0': 'polymer', 'a1': 'ceramic'}")
    :param kwargs:
    :return:
    """
    # use %m (month), not %M (minutes), in the date part of the dataset id
    dset_id = 'run_{}_{}_{}'.format(
        site_id, lane_id, date_time.strftime('%Y%m%d_%H%M%S')
    )

    dset = data_file.create_dataset(
        dset_id, shape=(data.shape[0],),
        dtype=np.dtype([
            (k, float) for k in ['index'] + list(data.keys())
        ])
    )

    dset['index'] = data.index
    for k in data.keys():
        dset[k] = data[k]

    dset.attrs['sample_rate'] = sample_rate
    dset.attrs['date_time'] = date_time.strftime('%Y-%m-%d %H:%M:%S')
    dset.attrs['site_id'] = site_id
    dset.attrs['lane_id'] = lane_id
    dset.attrs['temperature'] = temperature
    dset.attrs['license_plate'] = license_plate
    dset.attrs['sensor_calibration_factory'] = sensor_calibration_factory
    dset.attrs['distance_between_sensors'] = distance_between_sensors
    dset.attrs['sensor_type'] = sensor_type
    dset.attrs['sensors_layout'] = sensors_layout
    dset.attrs['channel_configuration'] = channel_configuration

    if kwargs:
        for k, v in kwargs.items():
            dset.attrs[k] = v

    return dset
table = xls_file.parse('Sheet1')

# ******************************************
# Parsing HTML/JSON data through the API provided by the site (p. 181)
import requests
url = 'http://live.qq.com/json/movie/all/hot2/list_7.json'
resp = requests.get(url)
resp

import json
data = json.loads(resp.text)
data.keys()

# ************************************************
# Interacting with databases
# use the embedded SQLite database (Python's sqlite3) together with pandas
import sqlite3
query = """
CREATE TABLE test
(a VARCHAR(20), b VARCHAR(20),
 c REAL, d INTEGER
);"""
con = sqlite3.connect(':memory:')
con.execute(query)
con.commit()
from pandas import DataFrame
import pandas as pd
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot as plt
import seaborn as sns

dir_cm = ''
dir_unmod = ''
fname = "chin_alpha_gamma.pk"
df_cm = DataFrame(pd.read_pickle(dir_cm + fname))
df_unmod = DataFrame(pd.read_pickle(dir_unmod + fname))

keys = df_cm.keys()
for key in keys:
    fig = plt.figure()
    # two side-by-side histograms; add_subplot needs a valid (nrows, ncols, index) spec
    ax = fig.add_subplot(121)
    ax.hist(df_cm[key], density=True)
    ax.set_xlim([-180., 180.])
    ax.set_ylim([0, 0.1])
    ax = fig.add_subplot(122)
    ax.hist(df_unmod[key], density=True)
    ax.set_ylim([0, 0.1])
    ax.set_xlim([-180., 180.])
    fname = key.replace(":", "_")
    plt.title(fname)
    plt.savefig("./plots/" + fname + ".png", dpi=300)
def _get_cols_with_nans(in_data: DataFrame):
    # yield the name of every column that contains at least one NaN
    for col_name in in_data.keys():
        if in_data[col_name].hasnans:
            yield col_name