def __init__(
     self,
     input_df: pd.DataFrame,
     input_folder: dataiku.Folder = None,
     minimum_score: float = 0.0,
     orientation_correction: bool = True,
     column_prefix: AnyStr = "text_api",
     error_handling: ErrorHandlingEnum = ErrorHandlingEnum.LOG,
     parallel_workers: int = DEFAULT_PARALLEL_WORKERS,
 ):
     super().__init__(
         input_df=input_df,
         input_folder=input_folder,
         column_prefix=column_prefix,
         error_handling=error_handling,
         parallel_workers=parallel_workers,
     )
     self.minimum_score = float(minimum_score)
     self.orientation_correction = bool(orientation_correction)
     self.orientation_column = generate_unique("orientation_correction",
                                               input_df.keys(),
                                               column_prefix)
     self.text_column_list = generate_unique("detections_list",
                                             input_df.keys(), column_prefix)
     self.text_column_concat = generate_unique("detections_concat",
                                               input_df.keys(),
                                               column_prefix)
     self._compute_column_description()
Code example #2
File: test_visibility.py Project: UCL/purify
def test_read_visibility():
    # A fake visibility is first created and written to file
    # Then the file is read and the two visibilities (fake and reread) are compared. 
    from pandas import DataFrame
    from numpy import sqrt
    from numpy.random import random
    from numpy.testing import assert_allclose, assert_equal
    from tempfile import NamedTemporaryFile
    from purify import read_visibility

    N = 10
    noise = random(N)
    expected = DataFrame({
        'u': random(N), 'v': random(N), 'w': [0] * N,
        'noise': (1+1j) / sqrt(2) * noise, 'y': random(N) + 1j * random(N)
    })

    csv = DataFrame({
        'u': expected['u'], 'v': expected['v'],
        'yreal': expected['y'].real, 'yimag': expected['y'].imag,
        'noise': noise
    })

    with NamedTemporaryFile(delete=True) as file:
        file.close()
        csv.to_csv(file.name, header=False, columns=['u', 'v', 'yreal', 'yimag', 'noise'])
        actual = read_visibility(file.name)

        assert_equal(set(actual.keys()), set(expected.keys()))
        for name in expected.keys():
            assert_allclose(actual[name], expected[name],
                            err_msg="Columns %s did not compare" % name)
 def __init__(
     self,
     input_df: pd.DataFrame,
     num_objects: int,
     orientation_correction: bool = True,
     input_folder: dataiku.Folder = None,
     column_prefix: AnyStr = "object_api",
     error_handling: ErrorHandlingEnum = ErrorHandlingEnum.LOG,
     parallel_workers: int = DEFAULT_PARALLEL_WORKERS,
 ):
     super().__init__(
         input_df=input_df,
         input_folder=input_folder,
         column_prefix=column_prefix,
         error_handling=error_handling,
         parallel_workers=parallel_workers,
     )
     self.num_objects = int(num_objects)
     self.orientation_correction = bool(orientation_correction)
     self.orientation_column = generate_unique("orientation_correction",
                                               input_df.keys(),
                                               column_prefix)
     self.label_list_column = generate_unique("label_list", input_df.keys(),
                                              column_prefix)
     self.label_name_columns = [
         generate_unique("label_" + str(n + 1) + "_name", input_df.keys(),
                         column_prefix) for n in range(num_objects)
     ]
     self.label_score_columns = [
         generate_unique("label_" + str(n + 1) + "_score", input_df.keys(),
                         column_prefix) for n in range(num_objects)
     ]
     self._compute_column_description()
Code example #4
def prepare_opm_reference_data(
    df_opm: pd.DataFrame, str_key: str, n_real: int
) -> np.ndarray:
    """
    This function extracts data from selected columns of the Pandas dataframe
    containing data from reference simulation, rearranges it into a stacked
    column vector preserving the original order and repeats it n_real times to form
    a matrix for comparison with data from ensemble of n_real FlowNet simulations

    Args:
        df_opm: is the Pandas dataframe containing data from reference simulation
        str_key: is the string used to select columns; column names containing str_key are selected
        n_real: is the size of ensemble of FlowNet simulations

    Returns:
        A numpy 2D array [length_data * nb_selected_columns, n_real] containing data
        from selected columns (i.e., quantity of interest for accuracy metric) of
        reference simulation stacked in a column-vector and replicated into n_real columns

    """
    keys = df_opm.keys()
    keys = sorted(keys[df_opm.keys().str.contains(str_key)])
    data = np.transpose(np.tile(df_opm[keys].values.flatten(), (n_real, 1)))

    return data
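
A minimal usage sketch for the function above (the column names such as WOPR:A1 and the values are illustrative, not taken from the project):

import pandas as pd

# Toy reference data: the two "WOPR" columns are selected, the other is ignored
df_opm = pd.DataFrame({
    "WOPR:A1": [1.0, 2.0],
    "WOPR:A2": [3.0, 4.0],
    "WBHP:A1": [5.0, 6.0],
})

ref = prepare_opm_reference_data(df_opm, "WOPR", n_real=3)
print(ref.shape)  # (4, 3): 2 rows x 2 selected columns stacked, repeated 3 times
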
Code example #5
def combine_columns(data: pd.DataFrame,
                    p,
                    format_func=lambda a, b: f'{a} ({b})'):
    """ Combine columns such as 'x mean', 'x std' into a format such as
    'x: `mean (std)`'

    :params:
    p = list of tuples with paired keys, e.g. [('mean','std')]
    """
    old_keys = []
    for key in data.keys():
        for k1, k2 in p:
            if k1 in key:
                # assume format is either 'x mean' or 'mean x'
                other_key = key.replace(k1, k2)
                # TODO use regex
                prefix = ' '.join(
                    (k for k in key.split(' ') if k not in [k1, k2]))
                n_decimals = 5
                pairs = zip(data[key].round(n_decimals),
                            data[other_key].round(n_decimals))
                formatted = [format_func(a, b) for a, b in pairs]
                data.loc[:, prefix] = pd.Series(formatted, index=data.index)
                old_keys.append(key)
                if f'{prefix} {k2}' in data.keys():
                    old_keys.append(f'{prefix} {k2}')
                elif f'{k2} {prefix}' in data.keys():
                    old_keys.append(f'{k2} {prefix}')

    data.drop(columns=old_keys, inplace=True)
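
A hypothetical usage sketch of the column collapsing above (data values are made up):

import pandas as pd

df = pd.DataFrame({
    "x mean": [1.23456, 2.34567],
    "x std": [0.1, 0.2],
})

combine_columns(df, p=[("mean", "std")])
print(df.columns.tolist())  # ['x']
print(df["x"].tolist())     # ['1.23456 (0.1)', '2.34567 (0.2)']
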
Code example #6
    def _prepare_df_for_cleaning(self, df: pd.DataFrame, text_column: AnyStr,
                                 language_column: AnyStr,
                                 language: AnyStr) -> None:
        """Private method to prepare a Pandas dataframe in-place before feeding it to the `self.clean_df` method

        Tokenizes the content of the text column into a new column containing spaCy documents
        Adds new columns to hold the future outputs of the cleaner method

        Args:
            df: Input pandas DataFrame
            text_column: Name of the column containing text data
            language_column: Name of the column with language codes in ISO 639-1 format
            language: Language code in ISO 639-1 format
                If equal to "language_column" this parameter is ignored in favor of language_column

        """
        self.output_column_descriptions = {}
        for k, v in self.OUTPUT_COLUMN_DESCRIPTIONS.items():
            if k == "cleaned":
                column_name = generate_unique(k, df.keys(), text_column)
                self.output_column_descriptions[column_name] = v
            elif k in self.token_filters and self.keep_filtered_tokens:
                column_name = generate_unique(f"{v.lower()}s", df.keys(),
                                              text_column)
                self.output_column_descriptions[
                    column_name] = f"{v}s in the original text"
        self.tokenizer.tokenize_df(df, text_column, language_column, language)
Code example #7
def get_submodel_copasi(submodel_path: str,
                        model_info: pd.DataFrame):
    """
    This function loads a Copasi file, if the (relative) path to the folder
    with this Copasi model is provided.
    It extracts the respective SBML file from the model list and returns it
    alongside the model, in case any postprocessing of the Copasi results is
    necessary.
    """

    # build the path to the Copasi model (skip models without a path)
    if str(submodel_path) in ('', 'nan', 'NaN'):
        return None, None
    copasi_file = os.path.join(DIR_MODELS, submodel_path)

    # if the amici import did not work, we don't want to consider this model
    if 'amici_path_final' in model_info.keys():
        model_row = model_info.loc[model_info['copasi_path_final'] == submodel_path]
    elif 'amici_path' in model_info.keys():
        model_row = model_info.loc[model_info['copasi_path'] == submodel_path]
    else:
        return None, None

    id = int(model_row.index.values)

    # import the sbml model
    sbml_path = os.path.join(DIR_MODELS, model_row.loc[id, 'regrouped_path'])
    sbml_model = (libsbml.readSBML(sbml_path)).getModel()

    return copasi_file, sbml_model
Code example #8
File: data_preparation.py Project: pgmeiner/iacm
def get_contingency_table_general(data: pd.DataFrame,
                                  bases: Dict[str, int]) -> np.ndarray:
    # ctable[x,y,z,...]
    ctable = np.ones(tuple([v for v in bases.values()]))
    if any([data[k].empty for k in data.keys()]):
        return ctable
    thresholds = dict()
    for var_name in data.keys():
        thresholds[var_name] = np.quantile(
            data[var_name],
            [i / bases[var_name] for i in range(1, bases[var_name] + 1)])
        if thresholds[var_name][0] == thresholds[var_name][
                len(thresholds[var_name]) - 1]:
            thresholds[var_name][0] = 0

    for index, row in data.iterrows():
        table_index = list()
        for var_name in data.keys():
            for i, thres in enumerate(thresholds[var_name]):
                if row[var_name] <= thres:
                    table_index.append(i)
                    break
        table_index = tuple(table_index)
        ctable[table_index] = ctable[table_index] + 1

    return ctable
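
A usage sketch with made-up data, assuming the function above is in scope:

import numpy as np
import pandas as pd

data = pd.DataFrame({"X": np.random.rand(100), "Y": np.random.rand(100)})
bases = {"X": 2, "Y": 2}  # discretise each variable into 2 quantile bins

ctable = get_contingency_table_general(data, bases)
print(ctable.shape)  # (2, 2)
print(ctable.sum())  # 104.0 -- 100 observations plus the initial np.ones table
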
Code example #9
def prepare_flownet_data(
    df_flownet: pd.DataFrame, str_key: str, n_real: int
) -> np.ndarray:
    """
    This function extracts data from selected columns of the Pandas dataframe
    containing data from an ensemble of FlowNet simulations, rearranges it into
    a matrix of stacked column-vectors preserving the original order, i.e. one column
    per realization of the ensemble

    Args:
        df_flownet: is the Pandas dataframe containing data from ensemble of FlowNet simulations
        str_key: is the string used to select columns; column names containing str_key are selected
        n_real: is the size of ensemble of FlowNet simulations

    Returns:
        A numpy 2D array [length_data * nb_selected_columns, n_real] containing data
        from selected columns (i.e., quantity of interest for accuracy metric) for
        an ensemble of FlowNet simulations in a column-vector. Each column corresponds
        to one realization of the ensemble

    """
    keys = df_flownet.keys()
    keys = sorted(keys[df_flownet.keys().str.contains(str_key)])
    data = df_flownet[keys].values.flatten()
    data = np.reshape(data, (data.shape[0] // n_real, n_real), order="F")

    return data
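
A usage sketch with a toy two-realization ensemble (column name and values are illustrative):

import pandas as pd

# Two realizations stacked row-wise, one selected column
df_flownet = pd.DataFrame({"WOPR:A1": [1.0, 2.0, 3.0, 4.0]})

ens = prepare_flownet_data(df_flownet, "WOPR", n_real=2)
print(ens.shape)  # (2, 2): rows per realization x number of realizations
print(ens[:, 0])  # [1. 2.] -- the first realization, in the original order
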
Code example #10
 def project(self, dm: pd.DataFrame) -> pd.DataFrame:
     """
     Project supplementary samples onto the Space. The DM must be strictly
     finite and real. There should be a column for each active sample.
     :param dm: a metric distance matrix
     :return: a pandas data frame with coordinates of supplementary samples,
     one sample per row; each column encodes a dimension in the Space;
     columns are sorted with respect to the fraction of variance explained
     by the corresponding dimensions in descending order.
     """
     # make sure `dm` is fine
     if not len(dm):
         raise ValueError("a distance matrix can't be empty")
     if not set(dm.keys()) == set(self.keys):
         raise ValueError('a dm must contain distances from supplementary '
                          'samples to active samples and its columns must '
                          'be named after the active samples')
     if len(dm.select_dtypes(include=[np.number]).columns) != len(
             dm.keys()):
         raise ValueError('a distance matrix must be strictly numeric')
     # make sure all columns are in correct order
     distances = cast(np.ndarray, dm[self.keys].values.copy())
     if not np.isfinite(distances).all():
         raise ValueError(
             "all values in the distance matrix must be finite")
     n_act = len(self.keys)
     n_sup = distances.shape[0]
     d_sup = distances**2
     masses_sup = np.full((n_act, n_sup), (1 / n_act))
     s_sup = -0.5 * self._masses_act @ (d_sup.T -
                                        (self._d_act @ masses_sup))
     f_sup = s_sup.T @ self.active.values @ np.diag(self._values**-1)
     return pd.DataFrame(f_sup, index=list(dm.index))
Code example #11
File: ptimecard.py Project: pgigoux/misc
def dump_data(df: pd.DataFrame, label='', print_types=False):
    """
    Print the contents of a pandas DataFrame on the screen.
    Used for testing purposes.
    :param df: pandas data frame
    :param label: label to print in output header
    :param print_types: print data types along with values
    """
    assert (isinstance(df, pd.DataFrame))
    aux = '-- ' + label + ' ' if label else ''
    delimiter = aux + '-' * 100
    print(delimiter)
    print('+ ', end='')
    for col in df.keys():
        print('[' + str(col) + ']', end=' ')
    print('\n')
    for row in list(df.index.values):
        for col in df.keys():
            value = df.at[row, col]
            if print_types:
                print('[' + str(value) + ' ' + str(type(value)) + ']', end=' ')
            else:
                print('[' + str(value) + ']', end=' ')
        print()
    print()
Code example #12
File: ID3.py Project: Squeemos/SqueemTools
def create_tree(df: pd.DataFrame) -> dict:
    '''ID3 decision tree creating algorithm. Requires df to be a pd.DataFrame, and that the label of the data is the last column'''
    label = df.keys()[-1]
    #Get attribute with maximum information gain
    node = df.keys()[:-1][np.argmax([
        total_entropy(df) - attribute_entropy(df, key)
        for key in df.keys()[:-1]
    ])]

    #Get distinct value of that attribute
    att_values = df[node].unique()
    # Create the tree
    tree = {}
    tree[node] = {}

    for att_value in att_values:
        # Create table with specific attribute
        subtable = df[df[node] == att_value].reset_index(drop=True)
        # Get how many outcomes there are for the attribute
        table_values, counts = np.unique(subtable[label], return_counts=True)

        # If there is only one outcome
        if len(counts) == 1:
            tree[node][att_value] = table_values[0]
        # If there is more than one outcome, need to create another node
        else:
            tree[node][att_value] = create_tree(subtable)
    return tree
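
A usage sketch on a made-up dataset; it assumes the total_entropy and attribute_entropy helpers from the same ID3 module behave as standard ID3 entropy functions:

import pandas as pd

# The label ("Play") must be the last column, as the docstring requires
df = pd.DataFrame({
    "Outlook": ["Sunny", "Sunny", "Overcast", "Rain", "Rain"],
    "Windy":   ["No", "Yes", "No", "No", "Yes"],
    "Play":    ["No", "No", "Yes", "Yes", "No"],
})

tree = create_tree(df)
print(tree)  # e.g. {'Outlook': {'Sunny': 'No', 'Overcast': 'Yes', 'Rain': {'Windy': {...}}}}
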
Code example #13
def top_correlated_features(df: DataFrame, target_feature, n=5):
    """
    Returns the names of features most strongly correlated (correlation is
    close to 1 or -1) with a target feature. Correlation is Pearson's-r sense.

    :param df: A pandas dataframe.
    :param target_feature: The name of the target feature.
    :param n: Number of top features to return.
    :return: A tuple of
        - top_n_features: Sequence of the top feature names
        - top_n_corr: Sequence of correlation coefficients of above features
        Both the returned sequences should be sorted so that the best (most
        correlated) feature is first.
    """

    # TODO: Calculate correlations with target and sort features by it

    # ====== YOUR CODE: ======
    target = df[target_feature]

    # candidate features: every column except the target itself
    features = df.keys().drop(target_feature)

    # pearsonCorr is a helper assumed to be defined earlier in this exercise
    correlations = Series([pearsonCorr(df[name], target) for name in features])

    # rank by absolute correlation so strong negative correlations also count
    correlations = correlations.reindex(
        correlations.abs().sort_values(ascending=False).index)

    top_n = correlations[:n]

    # ========================

    return features[top_n.keys()], top_n.values
Code example #14
def test_python_to_c_to_python():
    """Cycle visibility bindings from python to C to python."""
    from pandas import DataFrame
    from numpy import sqrt
    from numpy.random import random
    from numpy.testing import assert_allclose, assert_equal
    from purify.tests.visibility_testing import _bindings_cycle

    N = 10
    noise = random(N)
    expected = DataFrame({
        'u': random(N),
        'v': random(N),
        'w': random(N),
        'noise': (1 + 1j) / sqrt(2) * noise,
        'y': random(N) + 1j * random(N)
    })

    actual = _bindings_cycle(expected)

    assert_equal(set(actual.keys()), set(expected.keys()))
    for name in expected.keys():
        assert_allclose(actual[name],
                        expected[name],
                        err_msg="Columns %s did not compare" % name)
Code example #15
File: measure.py Project: ForrestCKoch/DCDF
def print_measurements(mdf: pd.DataFrame):
    """
    This function will print out the results of `measure.measure_subjects`.

    :param mdf: pd.DataFrame returned from `measure.measure_subjects`
    """
    print(','.join(['nifti']+list(mdf.keys())))
    for i in range(0,len(mdf)):
        print(','.join([mdf.index[i]]+[str(mdf[k][i]) for k in mdf.keys()]))
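
A usage sketch with a made-up measurement table; note that mdf[k][i] relies on positional Series indexing, which recent pandas versions deprecate:

import pandas as pd

mdf = pd.DataFrame(
    {"mean": [0.1, 0.2], "std": [0.01, 0.02]},
    index=["sub-01.nii.gz", "sub-02.nii.gz"],
)
print_measurements(mdf)
# nifti,mean,std
# sub-01.nii.gz,0.1,0.01
# sub-02.nii.gz,0.2,0.02
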
Code example #16
 def __init__(
     self,
     input_df: pd.DataFrame,
     sentiment_scale: AnyStr = "ternary",
     column_prefix: AnyStr = "sentiment_api",
     error_handling: ErrorHandlingEnum = ErrorHandlingEnum.LOG,
 ):
     super().__init__(input_df, column_prefix, error_handling)
     self.sentiment_scale = sentiment_scale
     self.sentiment_score_column = generate_unique("score", input_df.keys(), self.column_prefix)
     self.sentiment_score_scaled_column = generate_unique("score_scaled", input_df.keys(), column_prefix)
     self.sentiment_magnitude_column = generate_unique("magnitude", input_df.keys(), column_prefix)
     self._compute_column_description()
Code example #17
    def populate_sell_trend(self, dataframe: DataFrame,
                            metadata: dict) -> DataFrame:
        conditions = []

        for ma_count in range(self.sell_ma_count.value):
            key = ma_count * self.sell_ma_gap.value
            past_key = (ma_count - 1) * self.sell_ma_gap.value
            if past_key > 1 and key in dataframe.keys(
            ) and past_key in dataframe.keys():
                conditions.append(dataframe[key] > dataframe[past_key])

        if conditions:
            dataframe.loc[reduce(lambda x, y: x | y, conditions), "sell"] = 1
        return dataframe
Code example #18
    def clean_df(
        self,
        df: pd.DataFrame,
        text_column: AnyStr,
        language_column: AnyStr = "",
        language: AnyStr = "language_column",
    ) -> pd.DataFrame:
        """Public method to clean a text column in a pandas DataFrame, given language information

        Prepare the dataframe with `self._prepare_df_for_cleaning` to obtain a new column with spaCy documents
        Run `self.clean_document` on all documents with multithreading
        Format the output dataframe

        Args:
            df: Input pandas DataFrame
            text_column: Name of the column containing text data
            language_column: Name of the column with language codes in ISO 639-1 format
            language: Language code in ISO 639-1 format
                If equal to "language_column" this parameter is ignored in favor of language_column

        Returns:
            Input dataframe with new columns at the end:
                - Cleaned text after filter, lemmatization, lowercase and unicode normalization steps
                - One column for each selected `self.token_filters` with a concatenation of filtered tokens

        """
        self._prepare_df_for_cleaning(df, text_column, language_column,
                                      language)
        start = perf_counter()
        logging.info(f"Cleaning {len(df.index)} document(s)...")
        output = [{}] * len(df.index)
        doc_iterator = (doc for doc in df[self.tokenizer.tokenized_column])
        with ThreadPoolExecutor(
                max_workers=self.DEFAULT_NUM_THREADS) as executor:
            output = list(
                executor.map(lambda x: self.clean_document(x), doc_iterator))
        for k, v in self.OUTPUT_COLUMN_DESCRIPTIONS.items():
            if k == "cleaned":
                column_name = generate_unique(k, df.keys(), text_column)
                df[column_name] = [d.get(k, "") for d in output]
            elif k in self.token_filters and self.keep_filtered_tokens:
                column_name = generate_unique(f"{v.lower()}s", df.keys(),
                                              text_column)
                df[column_name] = [d.get(k, "") for d in output]
        logging.info(
            f"Cleaning {len(df.index)} document(s): done in {perf_counter() - start:.2f} seconds"
        )
        del df[self.tokenizer.tokenized_column]
        return df
Code example #19
def get_correlations(df: pd.DataFrame):
    correlation = df.corr()
    # method 1
    seaborn_ax = sns.heatmap(correlation,
                             xticklabels=correlation.columns.values,
                             yticklabels=correlation.columns.values)

    # method 2
    fig = go.Figure()
    fig.add_heatmap(x=df.keys(), y=df.keys(), z=correlation)

    # method 3
    pairwise_correlations = sns.pairplot(df)

    return seaborn_ax, fig, pairwise_correlations
Code example #20
def dump_star(file_name: str, data: pd.DataFrame, version: str) -> None:
    """
    Create a star file.

    Arguments:
    file_name - File name to export
    data - Data to export
    version - output version string

    Returns:
    None
    """
    header: typing.List[str]
    new_header: typing.List[str]
    old_header: typing.List[str]
    prefix: str

    new_header, old_header, prefix = \
        export_star_header(header_names=data.keys(), version=version)
    header = create_star_header(names=new_header, prefix=prefix)
    util.dump_file(
        file_name=file_name,
        data=data[old_header],
        header=header,
        vertical=True
        )
Code example #21
File: analytics.py Project: msknapp/machine-learning
def count_missing_values(data: pd.DataFrame) -> [FeatureAndValue]:
    out = []
    for f in data.keys():
        x = data[f]
        t = x[pd.isna(x)].size
        out.append(FeatureAndValue(f, t))
    return out
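
A usage sketch with made-up data; FeatureAndValue is assumed to be the simple (feature, value) container defined in the same analytics module:

import numpy as np
import pandas as pd

data = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": [np.nan, np.nan, 1.0]})
print(count_missing_values(data))  # e.g. [FeatureAndValue('a', 1), FeatureAndValue('b', 2)]
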
Code example #22
File: plot.py Project: simbilod/gdsfactory
def plot_sparameters(
    df: DataFrame,
    logscale: bool = True,
    keys: Optional[Tuple[str, ...]] = None,
    **sim_settings,
):
    """Plots Sparameters from a pandas DataFrame.

    Args:
        df: Sparameters pandas DataFrame
        logscale: plots 20*log10(S)
        keys: list of keys to plot, plots all by default.

    Keyword Args:
        sim_settings: simulation settings for the write_sparameters_function

    """

    w = df["wavelengths"] * 1e3
    keys = keys or [
        key for key in df.keys()
        if key.lower().startswith("s") and key.endswith("m")
    ]

    for key in keys:
        if key in df:
            y = df[key]
            y = 20 * np.log10(y) if logscale else y
            plt.plot(w, y, label=key[:-1])
        else:
            raise ValueError(f"{key} not in {df.keys()}")
    plt.legend()
    plt.xlabel("wavelength (nm)")
    plt.ylabel("|S| (dB)") if logscale else plt.ylabel("|S|")
Code example #23
File: utils.py Project: jose-manuel/code_manuscript
def drop_cols(df: DataFrame, cols: List[str]) -> DataFrame:
    """Remove the list of columns from the dataframe.
    Listed columns that are not available in the dataframe are simply ignored."""
    df = df.copy()
    cols_to_remove = set(cols).intersection(set(df.keys()))
    df = df.drop(cols_to_remove, axis=1)
    return df
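
A usage sketch (toy dataframe):

import pandas as pd

df = pd.DataFrame({"a": [1], "b": [2]})
out = drop_cols(df, ["b", "missing_column"])  # the unknown column is silently ignored
print(out.columns.tolist())  # ['a']
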
Code example #24
def load_resspect_photometry_df(photometry_df: pd.DataFrame) -> pd.DataFrame:
    """
    Returns updated RESSPECT photometry dataframe by dropping unnecessary
     columns ('SNID', 'FLT' and 'SIM_MAGOBS' columns are dropped here)

    Parameters
    ----------
    photometry_df
        RESSPECT photometry dataframe

    Returns
    -------
    photometry_df
        RESSPECT photometry dataframe after dropping unnecessary columns
    """
    photometry_dict = {
        'mjd': photometry_df['MJD'].values,
        'band': photometry_df['band'].values,
        'flux': photometry_df['FLUXCAL'].values,
        'fluxerr': photometry_df['FLUXCALERR'].values
    }
    if 'SNR' in photometry_df.keys():
        photometry_dict['SNR'] = photometry_df['SNR'].values
    else:
        photometry_dict['SNR'] = (photometry_dict['flux'] /
                                  photometry_dict['fluxerr'])
    return pd.DataFrame(photometry_dict)
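
A usage sketch with a minimal, made-up RESSPECT-style photometry table (column names follow the function body above):

import pandas as pd

photometry_df = pd.DataFrame({
    "MJD": [59000.1, 59000.2],
    "band": ["g", "r"],
    "FLUXCAL": [10.0, 20.0],
    "FLUXCALERR": [1.0, 2.0],
})

out = load_resspect_photometry_df(photometry_df)
print(out.columns.tolist())  # ['mjd', 'band', 'flux', 'fluxerr', 'SNR']
print(out["SNR"].tolist())   # [10.0, 10.0] -- computed as flux / fluxerr
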
Code example #25
    def get_transformed_features_df(
            self, full_feature_names: bool,
            df_with_features: pd.DataFrame) -> pd.DataFrame:
        # Apply on demand transformations
        # TODO(adchia): Include only the feature values from the specified input FVs in the ODFV.
        # Copy over un-prefixed features even if not requested since transform may need it
        columns_to_cleanup = []
        if full_feature_names:
            for input in self.inputs.values():
                if type(input) != FeatureView:
                    continue
                input_fv = cast(FeatureView, input)
                for feature in input_fv.features:
                    full_feature_ref = f"{input_fv.name}__{feature.name}"
                    if full_feature_ref in df_with_features.keys():
                        df_with_features[
                            feature.name] = df_with_features[full_feature_ref]
                        columns_to_cleanup.append(feature.name)

        # Compute transformed values and apply to each result row
        df_with_transformed_features = self.udf.__call__(df_with_features)

        # Cleanup extra columns used for transformation
        df_with_features.drop(columns=columns_to_cleanup, inplace=True)
        return df_with_transformed_features
Code example #26
def group_dataframe(df: pd.DataFrame, policy: Dict):

    metric = df.columns[1]
    try:
        metric_column_value = str(df["Metric"][0].unique()[0])
    except AttributeError:
        metric_column_value = str(df["Metric"][0])

    data_grouped = {"Metric": [], metric: [], "": [], "Approach": []}
    for key in df.keys():
        data_grouped[key] = []

    for key, values in policy.items():

        approaches = df["Approach"].unique()
        for approach in approaches:
            mean_metric = None
            for value in values:
                row = df.loc[(df[""] == value) & (df["Approach"] == approach)]
                metric_value = float(row[metric])
                if not mean_metric:
                    mean_metric = metric_value
                else:
                    mean_metric = statistics.mean([mean_metric, metric_value])

            data_grouped["Metric"].append(metric_column_value)
            data_grouped[metric].append(mean_metric)
            data_grouped[""].append(key)
            data_grouped["Approach"].append(approach)

    grouped_df = pd.DataFrame(data_grouped, columns=list(data_grouped.keys()))
    return grouped_df
Code example #27
def get_photometry_with_id_name_and_snid(
        full_photometry: pd.DataFrame, id_names_list: list,
        snid: int) -> Tuple[pd.DataFrame, Union[str, None]]:
    """
    This function loads photometry data of the given SNID.
    The full_photometry DataFrame should contain one of the column names passed
    in id_names_list. Otherwise the function returns an empty dataframe and None
    as the SNID column name.

    Parameters
    ----------
    full_photometry
        photometry DataFrame
    id_names_list
        list of available SNID column names
    snid
        SNID

    Returns
    -------
    full_photometry
        full photometry data
    snid_column_name
        SNID column name
    """
    for snid_column_name in id_names_list:
        if snid_column_name in full_photometry.keys():
            snid_indices = full_photometry[snid_column_name] == snid
            return full_photometry[snid_indices], snid_column_name

    return pd.DataFrame(), None
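
A usage sketch with made-up photometry rows:

import pandas as pd

full_photometry = pd.DataFrame({"SNID": [1, 1, 2], "FLUXCAL": [1.0, 2.0, 3.0]})
subset, id_col = get_photometry_with_id_name_and_snid(
    full_photometry, id_names_list=["SNID", "snid", "objid"], snid=1
)
print(id_col)       # 'SNID'
print(len(subset))  # 2 -- the rows belonging to SNID 1
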
Code example #28
def list_df_to_matrix(D: pd.DataFrame):
    X = []
    for k in D.keys():
        v = np.vstack(D[k].values)
        X.append(v)
    X = np.hstack(X)
    return X, D.index
Code example #29
 def get_header(kw: str, data: pd.DataFrame) -> str:
     data = data.head().to_dict()
     formats = [kw[0].upper() + kw[1:], kw.upper(), kw.lower()]
     for f in formats:
         if f in data.keys():
             return f
     raise KeyError
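
A usage sketch (column names are illustrative):

import pandas as pd

data = pd.DataFrame({"Price": [1.0], "volume": [2.0]})
print(get_header("price", data))   # 'Price'  (capitalised variant found first)
print(get_header("Volume", data))  # 'volume' (lower-case variant found)
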
Code example #30
File: PLM_plot.py Project: marlonbetz/ML
def plot_phonemes(path):
    phoneme_embeddings = dict()
    for line in codecs.open(path,"r"):
        line = line.split(",")
        key= line[0][1:-1]
        emb = line[1:]
        emb[-1] = emb[-1][:-1]
        emb = np.array([float(e) for e in emb])
        phoneme_embeddings[key] = emb
    
    phoneme_embeddings = DataFrame(phoneme_embeddings,columns=phoneme_embeddings.keys())
    print(phoneme_embeddings.columns)
    
    m = TSNE()
    phoneme_embeddings_tsne = m.fit_transform(phoneme_embeddings.transpose())
    print(len(phoneme_embeddings_tsne))
    for p,emb in zip(phoneme_embeddings.columns, phoneme_embeddings_tsne):
        c = "black"
        if regex.search("^[aeiou3E][*]?$", p):
            c = "red"
            plt.annotate(p,(emb[0],emb[1]),color=c)
        if regex.search("^.*w~$", p):
            c = "blue"
            plt.annotate(p,(emb[0],emb[1]),color=c)
        if regex.search("^.*y~$", p):
            c = "yellow"
            plt.annotate(p,(emb[0],emb[1]),color=c)
        if regex.search("^.*h~$", p):
            c = "brown"
            plt.annotate(p,(emb[0],emb[1]),color=c)
        if regex.search("^.*\"$", p):
            c = "green"
            plt.annotate(p,(emb[0],emb[1]),color=c)
Code example #31
def __char_to_int():
    names = get_names()
    df = read_csv('../../data/agaricus-lepiota.data', names=names)
    # drop the column with many missing values (plus the constant veil-type column)
    df.drop('stalk-root', axis=1, inplace=True)
    df.drop('veil-type', axis=1, inplace=True)
    dataSet = []
    for d in df._values:
        data = []
        for cidx in range(len(d)):
            # label
            if cidx == 0:
                if d[cidx] == 'p':
                    data.append(0)
                else:
                    data.append(1)
            # data
            else:
                data.append(ord(d[cidx]) - ord('a'))
        dataSet.append(data)
    result = DataFrame(dataSet, columns=df.keys())
    f = open('../../data/data_preceded.csv', 'w')
    writer = csv.writer(f)
    writer.writerow(result.keys())
    writer.writerows(result.values)
Code example #32
def move_api_columns_to_end(
    df: pd.DataFrame, api_column_names: NamedTuple, error_handling: ErrorHandlingEnum = ErrorHandlingEnum.LOG
) -> pd.DataFrame:
    """
    Move non-human-readable API columns to the end of the dataframe
    """
    api_column_names_dict = api_column_names._asdict()
    if error_handling == ErrorHandlingEnum.FAIL:
        api_column_names_dict.pop("error_message", None)
        api_column_names_dict.pop("error_type", None)
    if not any(["error_raw" in k for k in df.keys()]):
        api_column_names_dict.pop("error_raw", None)
    cols = [c for c in df.keys() if c not in api_column_names_dict.values()]
    new_cols = cols + list(api_column_names_dict.values())
    df = df.reindex(columns=new_cols)
    return df
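
A usage sketch; the namedtuple of API column names below is hypothetical, and the default error handling is used so ErrorHandlingEnum does not need to be imported here:

from collections import namedtuple
import pandas as pd

# Hypothetical API column names -- real plugins build this NamedTuple elsewhere
ApiColumns = namedtuple("ApiColumns", ["response", "error_message", "error_type", "error_raw"])
api_cols = ApiColumns("api_response", "api_error_message", "api_error_type", "api_error_raw")

df = pd.DataFrame({
    "api_response": ["{}"], "text": ["hello"],
    "api_error_message": [""], "api_error_type": [""], "api_error_raw": [""],
})
df = move_api_columns_to_end(df, api_cols)
print(df.columns.tolist())  # human-readable 'text' first, API columns moved to the end
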
Code example #33
File: vis.py Project: tlhr/plumology
def dist2D(dist: pd.DataFrame,
           ranges: pd.DataFrame,
           nlevels: int=16,
           nx: int=2,
           size: int=6,
           colorbar: bool=True,
           name: str='dist') -> plt.Figure:
    """
    Plot 2D probability distributions.

    Parameters
    ----------
    dist : Multiindexed dataframe with force field as primary
        index and distributions as created by dist2D().
    ranges : Multiindexed dataframe with force field as primary
        index and edges as created by dist1D().
    nlevels : Number of contour levels to use.
    nx : Number of plots per row.
    size : Relative size of each plot.
    colorbar : If true, will plot a colorbar.
    name : Name of the distribution.

    Returns
    -------
    fig : matplotlib figure.

    """

    # Setup plotting parameters
    nplots = dist.shape[1]
    xsize, ysize = nx, (nplots // nx) + 1
    cmap = plt.get_cmap('viridis')
    fig = plt.figure(figsize=(xsize * size, ysize * size))

    for i, k in enumerate(dist.keys()):

        # Get keys for both CVs
        kx, ky = k.split('.')

        # Prepare plotting grid (np.meshgrid doesn't work)
        X = np.broadcast_to(ranges[kx], dist[k].unstack().shape)
        Y = np.broadcast_to(ranges[ky], dist[k].unstack().shape).T
        Z = dist[k].unstack().values.T

        # Contour levels taking inf into account
        levels = np.linspace(np.amin(Z[~np.isinf(Z)]),
                             np.amax(Z[~np.isinf(Z)]), nlevels)
        ax = fig.add_subplot(ysize, xsize, i + 1)
        cm = ax.contourf(X, Y, Z, cmap=cmap, levels=levels)
        ax.set_xlabel(kx)
        ax.set_ylabel(ky)
        ax.set_title(name)

    if colorbar:
        fig.colorbar(cm)

    return fig
Code example #34
def df2boxplots(sc_df: pd.DataFrame) -> None:
    rows = 5
    cols = (len(sc_df.keys()) // 5) + 1
    for i, flt in enumerate(sc_df):
        if flt in ['description', 'SCORE:']:
            continue
        ax = plt.subplot(rows, cols, i+1)
        plt.boxplot(sc_df[flt].tolist())
        plt.title(flt)
    plt.show()
Code example #35
File: PLM_plot.py Project: marlonbetz/ML
def plot_languages(path):
    phoneme_embeddings = dict()
    for line in codecs.open(path,"r"):
        line = line.split(",")
        key= line[0][1:-1]
        emb = line[1:]
        emb[-1] = emb[-1][:-1]
        emb = np.array([float(e) for e in emb])
        phoneme_embeddings[key] = emb
    
    phoneme_embeddings = DataFrame(phoneme_embeddings,columns=phoneme_embeddings.keys())
    print(phoneme_embeddings.columns)
    
    m = TSNE()
    phoneme_embeddings_tsne = m.fit_transform(phoneme_embeddings[["STANDARD_GERMAN","BERNESE_GERMAN","EASTERN_FRISIAN","NORTH_FRISIAN_AMRUM",                                                           
                 "ENGLISH","DUTCH","YIDDISH_EASTERN","YIDDISH_WESTERN","DANISH","SWEDISH","FAROESE","NORWEGIAN_RIKSMAL",
                 "GJESTAL_NORWEGIAN","NORWEGIAN_BOKMAAL","NORWEGIAN_NYNORSK_TOTEN","SANDNES_NORWEGIAN",
                 "ICELANDIC","POLISH","KASHUBIAN","CZECH",
                 "LOWER_SORBIAN","UPPER_SORBIAN","SLOVAK","SLOVENIAN","MACEDONIAN","BULGARIAN","UKRAINIAN",
                 "BELARUSIAN","RUSSIAN","ARABIC_CYPRIOT_SPOKEN","ARABIC_GULF_SPOKEN","ARABIC_LIBYAN_SPOKEN","ARABIC_NORTH_LEVANTINE_SPOKEN",
                 "ARABIC_SUDANESE_SPOKEN","CAIRO_ARABIC","DAMASCUS_ARABIC"]].transpose())
    print(len(phoneme_embeddings_tsne))
    for p,emb in zip(["STANDARD_GERMAN","BERNESE_GERMAN","EASTERN_FRISIAN","NORTH_FRISIAN_AMRUM",                                                         
                 "ENGLISH","DUTCH","YIDDISH_EASTERN","YIDDISH_WESTERN","DANISH","SWEDISH","FAROESE","NORWEGIAN_RIKSMAL",
                 "GJESTAL_NORWEGIAN","NORWEGIAN_BOKMAAL","NORWEGIAN_NYNORSK_TOTEN","SANDNES_NORWEGIAN",
                 "ICELANDIC","POLISH","KASHUBIAN","CZECH",
                 "LOWER_SORBIAN","UPPER_SORBIAN","SLOVAK","SLOVENIAN","MACEDONIAN","BULGARIAN","UKRAINIAN",
                 "BELARUSIAN","RUSSIAN","ARABIC_CYPRIOT_SPOKEN","ARABIC_GULF_SPOKEN","ARABIC_LIBYAN_SPOKEN","ARABIC_NORTH_LEVANTINE_SPOKEN",
                 "ARABIC_SUDANESE_SPOKEN","CAIRO_ARABIC","DAMASCUS_ARABIC"], phoneme_embeddings_tsne):
        c = "black"
    #     if regex.search("[aeiou3E]\\*?", p):
    #         c = "red"
    #     if regex.search(".*w~", p):
    #         c = "blue"
    #     if regex.search(".*y~", p):
    #         c = "yellow"
    #     if regex.search(".*h~", p):
    #         c = "brown"
    #     if regex.search(".*\"", p):
    #         c = "green"
        if p in ["STANDARD_GERMAN","BERNESE_GERMAN","EASTERN_FRISIAN","FRISIAN_WESTERN","NORTH_FRISIAN_AMRUM",
                 "ENGLISH","DUTCH","YIDDISH_EASTERN","YIDDISH_WESTERN","DANISH","SWEDISH","FAROESE","NORWEGIAN_RIKSMAL",
                 "GJESTAL_NORWEGIAN","NORWEGIAN_BOKMAAL","NORWEGIAN_NYNORSK_TOTEN","SANDNES_NORWEGIAN","ICELANDIC"]:
            c = "red"
            plt.annotate(p,(emb[0],emb[1]),color=c)
    
        if p in ["POLISH","KASHUBIAN","CZECH",
                 "LOWER_SORBIAN","UPPER_SORBIAN","SLOVAK","SLOVENIAN","MACEDONIAN","BULGARIAN","UKRAINIAN",
                 "BELARUSIAN","RUSSIAN"]:
            c = "blue"
            plt.annotate(p,(emb[0],emb[1]),color=c)
        if p in ["ARABIC_CYPRIOT_SPOKEN","ARABIC_GULF_SPOKEN","ARABIC_LIBYAN_SPOKEN","ARABIC_NORTH_LEVANTINE_SPOKEN",
                 "ARABIC_SUDANESE_SPOKEN","CAIRO_ARABIC","DAMASCUS_ARABIC"]:
            c = "green"
            plt.annotate(p,(emb[0],emb[1]),color=c)
Code example #36
File: test_visibility.py Project: UCL/purify
def test_python_to_c_to_python():
    """Cycle visibility bindings from python to C to python."""
    from pandas import DataFrame
    from numpy import sqrt
    from numpy.random import random
    from numpy.testing import assert_allclose, assert_equal
    from purify.tests.visibility_testing import _bindings_cycle

    N = 10
    noise = random(N)
    expected = DataFrame({
        'u': random(N), 'v': random(N), 'w': random(N),
        'noise': (1+1j) / sqrt(2) * noise, 'y': random(N) + 1j * random(N)
    })

    actual = _bindings_cycle(expected)

    assert_equal(set(actual.keys()), set(expected.keys()))
    for name in expected.keys():
        assert_allclose(actual[name], expected[name],
                        err_msg="Columns %s did not compare" % name)
Code example #37
File: test.py Project: jinstrong/stdfparser
    def dump(self):
        chip_id_full = '999:chip_id_full'
        data = DataFrame(self.PFTR_nd).transpose()
        keys = data.keys()
        key_hard_bin = ''.join(filter(lambda s: 'HARD_BIN' in s, keys))
        key_site_num = ''.join(filter(lambda s: 'SITE_NUM' in s, keys))
        key_soft_bin = ''.join(filter(lambda s: 'SOFT_BIN' in s, keys))
        key_efuse_burned = ''.join(filter(lambda s: 'efuse_burned' in s, keys))
        key_chip_id_part0 = ''.join(filter(lambda s: 'chip_id_part0' in s, keys))
        key_chip_id_part1 = ''.join(filter(lambda s: 'chip_id_part1' in s, keys))

        for i in data[key_soft_bin].unique():
            self.test_yield[i] = {}
            # basename = os.path.basename(self.Path_name)
            # name_front = basename.find('_') + 1
            # name_end = basename.find('---') - 12
            # name = basename[name_front:name_end]
            # self.test_yield[i][name] = {}
            # for j in data[key_site_num].unique():
            #     self.test_yield[i][name][j] = data[(data[key_site_num] == j) & (data[key_soft_bin] == i)][
            #         key_hard_bin].count()
            #     self.test_yield[i][name]['total'] = data[(data[key_soft_bin] == i)][key_hard_bin].count()
            #     self.test_yield[i][name]['yield'] = '{0:.2%}'.format(
            #         self.test_yield[i][name]['total'] / float(data[key_hard_bin].count()))
            # df_1 = DataFrame(self.test_yield).transpose()
            # dfs = [pd.DataFrame([x for x in df_1[col]], index=df_1.index) for col in df_1.columns]
            # df2 = pd.concat(dfs, axis=1, keys=df_1.columns)
            # df2.columns.names = ['test', 'info']
            # df2.index.names = ['soft_bin']

            for j in data[key_site_num].unique():
                self.test_yield[i][j] = data[(data[key_site_num] == j) & (data[key_soft_bin] == i)][
                    key_hard_bin].count()
                self.test_yield[i]['total'] = data[(data[key_soft_bin] == i)][key_hard_bin].count()
                self.test_yield[i]['yield'] = '{0:.2%}'.format(
                    self.test_yield[i]['total'] / float(data[key_hard_bin].count()))
        df_1 = DataFrame(self.test_yield).transpose()

        data[chip_id_full] = data[key_chip_id_part0] + data[key_chip_id_part1] * 10000000
        data_id = data[(data[key_hard_bin] == 1) & (data[key_efuse_burned] == 0)][chip_id_full]
        if data_id[data_id.duplicated() == True].count() > 0:
            # write a marker file first, then raise a proper exception
            with open(self.Path_name + 'duplicated.txt', 'w') as duplicated_txt:
                duplicated_txt.write(self.Path_name + ' is with duplicated chip id')
            raise ValueError(self.Path_name + ' is with duplicated chip id')
        with ExcelWriter(self.Path_name) as writer:
            DataFrame(self.test_info).to_excel(writer, sheet_name='Related')
            DataFrame(self.PMR_nd).transpose().to_excel(writer, sheet_name='PMR')
            DataFrame(self.PFTR_nd).transpose().to_excel(writer, sheet_name='PTR_FTR')
            DataFrame(
                data[(data[key_hard_bin] == 1) & (data[key_efuse_burned] == 0)]).describe().transpose().combine_first(
                DataFrame(self.spec_summ).transpose()).to_excel(writer, sheet_name='summary_spec')
            df_1.to_excel(writer, sheet_name='yield')
Code example #38
File: parking.py Project: lrei/carpark_prediction
def resample_df(original_df, rs_interval='60Min', rs_how='last',
                window_size=4):
    # resample
    df = original_df.copy()
    rs = original_df.resample(rs_interval, how=rs_how)
    df = DataFrame(rs)
    df = df[pd.notnull(df).any(axis=1)]  # remove pull NaN rows

    # add windows
    for k in df.keys():
        for ind in range(1, window_size):
            vn = unicode(k) + u'-' + unicode(ind)
            df[vn] = np.hstack((np.array([np.NaN] * ind),
                                df[k].values))[:-ind]

    # destroy first lines
    df = df[window_size - 1:]  # this -1 is destroyed later

    return df
Code example #39
File: speed_by_peak.py Project: OpenWIM/pywim
def sensors_estimation(
    signal_data: pd.DataFrame, sensors_delta_distance: list
) -> [np.array]:
    """

    :param signal_data:
    :param sensors_delta_distance:
    :return:
    """
    # x axis: time
    x = signal_data.index.values

    sensors_peak_time = []
    sensors_delta_time = [None]

    for k in signal_data.keys():
        # y axis: volts
        y = signal_data[k].values

        indexes = peakutils.indexes(y, thres=0.5, min_dist=30)

        sensors_peak_time.append(x[indexes])

    for i in range(1, len(sensors_peak_time)):
        sensors_delta_time.append(
            sensors_peak_time[i] - sensors_peak_time[i - 1]
        )

    # the information about first sensor should be equal to the second sensor
    sensors_delta_time[0] = sensors_delta_time[1]

    sensors_delta_speed = []

    for i in range(len(sensors_delta_distance)):
        sensors_delta_speed.append(
            sensors_delta_distance[i] / sensors_delta_time[i]
        )

    # the information about first sensor should be equal to the second sensor
    sensors_delta_speed[0] = sensors_delta_speed[1]

    return sensors_delta_speed
Code example #40
def agg_by_state(df):
    
    '''Aggregate data by US state, summing all relevant metrics'''
    
    # Define lambda functions for aggregation
    count_user = lambda x: sum(x == 'user')
    count_hash = lambda x: sum(x == 'hash')
    count_none = lambda x: sum(x == 'none')
    count_user_hash = lambda x: (count_user(x) / count_hash(x)) \
                                if count_hash(x) > 0 else 0
    
    # Create an aggregation dictionary
    agg_dict = {'count': len, 'n_user': count_user, 'n_hash': count_hash,
                'n_none': count_none, 'user_hash': count_user_hash}

    # Perform aggregation by state
    grouped = df.groupby(by='state', as_index=False)
    df = grouped['u_o_h'].agg(agg_dict)
    
    # Load state data
    with open(r'J:\WDPRO\BPM\us_states.csv', 'r') as f:
        states = {}
        for abbrev, name in reader(f):
           states[abbrev] = name
    states = DataFrame(data=states.values(), index=states.keys())
    
    # Restrict results to US states
    df = df[df.state.isin(states.index)]
    
    # Join the full state name
    df = df.join(states, on='state')
    df.rename(columns={0: 'state_name'}, inplace=True)
    df['state_name'] = [i.lower() for i in df['state_name']]
    
    # Rank the states
    df['count_rank'] = df['count'].rank(ascending=False)
    
    # Return DataFrame
    return df
Code example #41
File: datetimes.py Project: Xbar/pandas
def _assemble_from_unit_mappings(arg, errors):
    """
    assemble the unit specifed fields from the arg (DataFrame)
    Return a Series for actual parsing

    Parameters
    ----------
    arg : DataFrame
    errors : {'ignore', 'raise', 'coerce'}, default 'raise'

        - If 'raise', then invalid parsing will raise an exception
        - If 'coerce', then invalid parsing will be set as NaT
        - If 'ignore', then invalid parsing will return the input

    Returns
    -------
    Series
    """
    from pandas import to_timedelta, to_numeric, DataFrame
    arg = DataFrame(arg)
    if not arg.columns.is_unique:
        raise ValueError("cannot assemble with duplicate keys")

    # replace passed unit with _unit_map
    def f(value):
        if value in _unit_map:
            return _unit_map[value]

        # m is case significant
        if value.lower() in _unit_map:
            return _unit_map[value.lower()]

        return value

    unit = {k: f(k) for k in arg.keys()}
    unit_rev = {v: k for k, v in unit.items()}

    # we require at least Ymd
    required = ['year', 'month', 'day']
    req = sorted(list(set(required) - set(unit_rev.keys())))
    if len(req):
        raise ValueError("to assemble mappings requires at least that "
                         "[year, month, day] be specified: [{required}] "
                         "is missing".format(required=','.join(req)))

    # keys we don't recognize
    excess = sorted(list(set(unit_rev.keys()) - set(_unit_map.values())))
    if len(excess):
        raise ValueError("extra keys have been passed "
                         "to the datetime assemblage: "
                         "[{excess}]".format(excess=','.join(excess)))

    def coerce(values):
        # we allow coercion to if errors allows
        values = to_numeric(values, errors=errors)

        # prevent overflow in case of int8 or int16
        if is_integer_dtype(values):
            values = values.astype('int64', copy=False)
        return values

    values = (coerce(arg[unit_rev['year']]) * 10000 +
              coerce(arg[unit_rev['month']]) * 100 +
              coerce(arg[unit_rev['day']]))
    try:
        values = to_datetime(values, format='%Y%m%d', errors=errors)
    except (TypeError, ValueError) as e:
        raise ValueError("cannot assemble the "
                         "datetimes: {error}".format(error=e))

    for u in ['h', 'm', 's', 'ms', 'us', 'ns']:
        value = unit_rev.get(u)
        if value is not None and value in arg:
            try:
                values += to_timedelta(coerce(arg[value]),
                                       unit=u,
                                       errors=errors)
            except (TypeError, ValueError) as e:
                raise ValueError("cannot assemble the datetimes [{value}]: "
                                 "{error}".format(value=value, error=e))

    return values
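
This private helper backs the DataFrame form of the public API; a usage sketch of that public entry point:

import pandas as pd

df = pd.DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5], "hour": [1, 2]})
print(pd.to_datetime(df))
# 0   2015-02-04 01:00:00
# 1   2016-03-05 02:00:00
# dtype: datetime64[ns]
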
Code example #42
File: Pandas.py Project: blueCFD/PyFoam
    def addData(self,other,
                sameIndex=True,
                mergeIndex=False,
                prefix=None,
                suffix=None,
                allowExtrapolate=False,
                interpolationMethod="values"):
        """Add data from another DataFrame or Series
        @param other: data as Pandas-DataFrame or Series
        @param sameIndex: assume both have the same indices. If False the other data will be interpolated to the current indices
        @param mergeIndex: make the result indices a mixture of the indices"""
        if not sameIndex and mergeIndex:
            raise PandasWrapperPyFoamException("Can't specify sameIndex=False and mergeIndex=True at the same time")
        if not isinstance(other,self.validOtherTypes):
             raise PandasWrapperPyFoamException("Other data is of type",type(other),
                                                "should be one of",self.validOtherTypes)
        if isinstance(other,DataFrame):
             o=other
        else:
             o=DataFrame(other)

        k=o.keys()
        if not self.__allStrings(k):
            raise PandasWrapperPyFoamException("Added data with non-string columns")
        v=k.copy()
        if prefix:
             v=[prefix+n for n in v]
        if suffix:
             v=[n+suffix for n in v]
        if len(set(v)&set(self.keys()))>0:
             raise PandasWrapperPyFoamException("Keys of this",self.keys(),"and other",v,
                                                "intersect",set(v)&set(self.keys()))
        keys=dict(zip(k,v))
        interpolate=False # only interpolate if necessary
        if len(self.index)!=len(o.index) or (self.index!=o.index).any():
             if sameIndex and not mergeIndex:
                  raise PandasWrapperPyFoamException("Other data has different index. Specify sameIndex=False or mergeIndex=True")
             ni=unique(hstack([self.index,o.index]))
             interpolate=True
             if mergeIndex:
                 minOld=min(self.index)
                 maxOld=max(self.index)

                 result=self.reindex(index=ni,copy=False).interpolate(
                    method=interpolationMethod)

                 if not allowExtrapolate:
                     result[result.index<minOld]=float("NaN")
                     result[result.index>maxOld]=float("NaN")
             else:
                  # make sure we have values at the current position
#                  o=o.reindex_axis(ni,axis='index').interpolate(method=interpolationMethod)
                  o=o.reindex(index=ni,columns=o.columns).interpolate(method=interpolationMethod)
                  # ,takeable=True
                  result=self.copy()
        else:
            result=self.copy()

        minOld=min(o.index)
        maxOld=max(o.index)
        for k,v in keys.items():
            result[v]=o[k]
            if interpolate:
                result[v]=result[v].interpolate(method=interpolationMethod)
                if not allowExtrapolate:
                     result[v][result.index<minOld]=float("NaN")
                     result[v][result.index>maxOld]=float("NaN")

        return PyFoamDataFrame(result)
Code example #43
import codecs 
import numpy as np
from pandas import DataFrame
import regex
import evaluation
import sys
from sklearn.neighbors import KNeighborsClassifier
phoneme_embeddings = dict()
for line in codecs.open("phoneme_embeddings_plm.csv","r"):
    line = line.split(",")
    key= line[0][1:-1]
    emb = line[1:]
    emb[-1] = emb[-1][:-1]
    emb = np.array([float(e) for e in emb])
    phoneme_embeddings[key] = emb

phoneme_embeddings = DataFrame(phoneme_embeddings,columns=phoneme_embeddings.keys())

clf  = KNeighborsClassifier(n_neighbors=1,algorithm="brute",metric="euclidean")
clf.fit(phoneme_embeddings.transpose(),phoneme_embeddings.columns)  

tags = set()
for test in evaluation.SimilarityTestData:
    for tag in test["tags"]:
        tags.add(tag)
tags = list(tags)
print("EVALUATION")
c = dict()
c_true = dict()
c_true_all = 0
for tag in tags:
    c[tag] = 0
Code example #44
File: __init__.py Project: whacked/DataPyper
    from pandas import DataFrame as DF
    import sys

    ## this stuff should be moved to a unit test
    if "--test" in sys.argv:


        ## generate some random csv datastructure
        import faker, random, tempfile
        FK = faker.Faker()
        ncol = random.randint(1, 5)
        nrow = random.randint(100, 400)

        df = DF(dict([(key, np.random.rand(1, nrow)[0]) for key in [FK.username() for i in range(ncol-1)] + ["RT"]]))
        for k in df.keys():
            if k == "RT": continue
            if random.random() > 0.5:
                ## turn the column into a binary value
                df[k] = df[k].round()

        TR_duration = 0.5 * random.randint(2, 6)
        ## append a duration and onset
        df['duration'] = TR_duration
        df['onset'] = df.index * TR_duration

        csv_filepath = tempfile.mktemp(suffix = ".csv")
        df.to_csv(csv_filepath)

        csvf = pe.Node(name = "csvfile", interface = CSVFile())
        csvf.inputs.csv_filepath = csv_filepath
Code example #45
File: storage.py Project: OpenWIM/pywim
def create_data_set(
    data_file: h5py.File,
    data: pd.DataFrame,
    sample_rate: int=None,
    date_time: datetime=datetime.now(),
    site_id: str='000',
    lane_id: str='00',
    temperature: float=None,
    license_plate: str=None,
    sensor_calibration_factory: list=None,
    distance_between_sensors: list=None,
    sensor_type: str=None,
    sensors_layout: str=None,
    channel_configuration: str=None,
    **kwargs
) -> h5py.Dataset:
    """

    :param data_file:
    :param data:
    :param sample_rate: (e.g. 2000)
    :param date_time: (e.g. 2017-49-04 00:49:36)
    :param site_id: (e.g. 001)
    :param lane_id: (e.g. 01)
    :param temperature: (e.g. 28.5)
    :param license_plate: (e.g. AAA9999)
    :param sensor_calibration_factory: (e.g. [0.98, 0.99, 0.75])
    :param distance_between_sensors: (e.g. [1.0, 1.5, 2.0])
    :param sensor_type: (e.g. quartz, polymer, ceramic, mixed)
    :param sensors_layout: (e.g. |/|\|<|>|=|)
    :param channel_configuration: (this is a, optional attribute, it is
        required just when sensor type is mixed,
        e.g. "{'a0': 'polymer', 'a1': 'ceramic'})"
    :param kwargs:
    :return:
    """

    dset_id = 'run_{}_{}_{}'.format(
        site_id, lane_id, date_time.strftime('%Y%M%d_%H%M%S')
    )

    dset = data_file.create_dataset(
        dset_id, shape=(data.shape[0],),
        dtype=np.dtype([
            (k, float) for k in ['index'] + list(data.keys())
        ])
    )

    dset['index'] = data.index

    for k in data.keys():
        dset[k] = data[k]

    dset.attrs['sample_rate'] = sample_rate
    dset.attrs['date_time'] = date_time.strftime('%Y-%M-%d %H:%M:%S')
    dset.attrs['site_id'] = site_id
    dset.attrs['lane_id'] = lane_id
    dset.attrs['temperature'] = temperature
    dset.attrs['license_plate'] = license_plate
    dset.attrs['sensor_calibration_factory'] = sensor_calibration_factory
    dset.attrs['distance_between_sensors'] = distance_between_sensors
    dset.attrs['sensor_type'] = sensor_type
    dset.attrs['sensors_layout'] = sensors_layout
    dset.attrs['channel_configuration'] = channel_configuration

    if kwargs:
        for k, v in kwargs.items():
            dset.attrs[k] = v

    return dset
Code example #46

table = xls_file.parse('Sheet1')

# ******************************************
# Parsing data in HTML/JSON format via the API the site provides (p. 181)

import requests
url='http://live.qq.com/json/movie/all/hot2/list_7.json'

resp=requests.get(url)

resp

import json
data=json.loads(resp.text)

data.keys()

# ************************************************
# Interacting with a database
# using the embedded SQLite database (standard-library sqlite3)

import sqlite3
query = """
CREATE TABLE test
(a VARCHAR(20), b VARCHAR(20),
 c REAL, d INTEGER);"""
con=sqlite3.connect(':memory:')
con.execute(query)
con.commit()

Code example #47
File: plot_dihedral.py Project: josejames00/pytraj
from pandas import DataFrame
import pandas as pd
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot as plt
import seaborn as sns

dir_cm = ''
dir_unmod = ''
fname = "chin_alpha_gamma.pk"
df_cm = DataFrame(pd.read_pickle(dir_cm + fname))
df_unmod = DataFrame(pd.read_pickle(dir_unmod + fname))

keys = df_cm.keys()

for key in keys:
    fig = plt.figure()
    ax = fig.add_subplot(121)
    ax.hist(df_cm[key], normed=1)
    ax.set_xlim([-180., 180.])
    ax.set_ylim([0, 0.1])
    ax = fig.add_subplot(122)
    ax.hist(df_unmod[key], normed=1)
    ax.set_ylim([0, 0.1])
    ax.set_xlim([-180., 180.])
    fname = key.replace(":", "_")
    plt.title(fname)
    plt.savefig("./plots/" + fname + ".png", dpi=300)
Code example #48
File: final.py Project: ANtlord/ml-study
def _get_cols_with_nans(in_data: DataFrame):
    for col_name in in_data.keys():
        if in_data[col_name].hasnans:
            yield col_name
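
A quick usage sketch for the generator above (toy dataframe):

import numpy as np
from pandas import DataFrame

data = DataFrame({"a": [1.0, np.nan], "b": [1.0, 2.0]})
print(list(_get_cols_with_nans(data)))  # ['a']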