def get_attention_metric(file_name: str):
    """Calculate the attention metric for the given file or folder.

    Inputs: file_name is a string. It can be either a file name or the name of a
    folder, in which case all the data inside that folder will be vstacked.

    Outputs: the attention metric.
    """

    path = Path(file_name)

    if os.path.isfile(path):
        data = d.load_pickle_file(path)
    elif os.path.isdir(path):
        data = d.load_all_pickle_files(path, vstack=True)
    else:
        raise FileNotFoundError(f"The path {path} does not exist")

    clean_data = d.clean_data(data,
                              channels=range(8),
                              threshold_fn=fr.max_amplitude)
    print(f"Clean data shape:{clean_data.shape}")

    _bandpower = d.epoch_bandpower(clean_data,
                                   per_epoch=1,
                                   channels=range(8),
                                   relative=False)

    _metrics = d.metric_1(_bandpower)

    return _metrics
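
# A minimal usage sketch: the paths below are hypothetical placeholders, and the
# helper modules `d` and `fr` used above are assumed to be importable in this context.
metric_from_file = get_attention_metric("recordings/session_01.pkl")  # single pickle file
metric_from_folder = get_attention_metric("recordings/")              # folder; contents are vstacked
print(metric_from_file, metric_from_folder)
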
def main():
    query = get_query(start_date='20160801', end_date='20170830')
    print(query)
    private_key = load_private_key()
    df = load_data(query=query,
                   project_id=private_key['project_id'],
                   private_key=private_key)
    df = clean_data(df)
    print(df.info())
    X = df.drop(columns=[
        'fullVisitorId', 'visitId', 'visitStartTime', 'country', 'medium',
        'lifetime_total_revenue'
    ]).values
    y = df['lifetime_total_revenue'].values
    store_training_data(clean_data(df))
    validation_df = model.validation(X, y)
    print(validation_df.head())
    return df
Example #3
def perform_pca(dataframe, num_dimensions):
    """
    Perform PCA analysis on the given dataframe
    :param dataframe: The dataframe of source data, first column is the label column
    :param num_dimensions: the number of dimensions to reduce to
    :return:
    """

    # Clean/Standardize the Data
    print "Standardizing Data"
    std_df = data.clean_data(dataframe)

    # Compute the covariance matrix
    print "Computing Covariance Matrix"
    cov_matrix = std_df.cov()

    # Compute Eigenvectors + Eigenvalues of Covariance Matrix
    print "Finding Eigenvalues and Eigenvectors"
    evalues, evectors = np.linalg.eig(cov_matrix)

    # Since evalues are not guaranteed to be sorted, keep a mapping to the eigenvectors and sort the values
    index = {evalues[i]: i for i in xrange(len(evalues))}

    # sort the evalues highest to lowest
    evalues.sort()
    evalues[:] = evalues[::-1]

    # Pick m<d eigenvectors with highest eigenvalues
    selected_indices = [index[val] for val in evalues[:num_dimensions]]

    # Build projection matrix
    print "Projecting Data to {0} Dimensions".format(num_dimensions)
    projection_matrix = evectors[:, selected_indices]

    # Project original dataframe using projection matrix
    new_data = np.dot(std_df, projection_matrix)

    # Preserve the labels
    labels = dataframe.index

    # Return new dataframe
    projected_df = pd.DataFrame(new_data, index=labels)
    projected_df.columns = [
        "Feature #{0}".format(i) for i in projected_df.columns
    ]
    return projected_df
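
# For comparison, the eigenvalue bookkeeping above can be done with np.argsort.
# A self-contained sketch (the function name is illustrative and not part of the
# original module; std_values is an already-standardized (n_samples, n_features) array):
def _pca_argsort_sketch(std_values, num_dimensions):
    cov_matrix = np.cov(std_values, rowvar=False)
    evalues, evectors = np.linalg.eig(cov_matrix)
    order = np.argsort(evalues)[::-1]  # eigenvalue indices, largest first
    projection_matrix = evectors[:, order[:num_dimensions]]
    return np.dot(std_values, projection_matrix)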
Example #4
        self.pipeline = pipe_cols

    def run(self):
        """set and train the pipeline"""
        # set_pipeline() stores the pipeline on the instance; fit it on the training data
        self.set_pipeline()
        self.pipeline.fit(self.X, self.y)

    def evaluate(self, X_test, y_test):
        """evaluates the pipeline on df_test and return the RMSE"""
        y_pred = self.pipeline.predict(X_test)
        rmse = compute_rmse(y_pred, y_test)
        return rmse


if __name__ == "__main__":
    # get data
    df = get_data(nrows=10_000)
    # clean data
    df = clean_data(df, test=False)
    # set X and y
    X = df.drop(columns='fare_amount')
    y = df['fare_amount']
    # hold out
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    # train
    trainer = Trainer(X_train, y_train)
    trainer.run()
    # evaluate
    trainer.evaluate(X_test, y_test)
    print('ok')
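
# compute_rmse is not shown in this excerpt; a minimal sketch of such a helper,
# assuming the usual root-mean-squared-error definition (the body is an
# assumption, not the original implementation):
import numpy as np

def compute_rmse(y_pred, y_true):
    # root mean squared error between predictions and targets
    return np.sqrt(np.mean((np.asarray(y_pred) - np.asarray(y_true)) ** 2))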
Example #5
def kmeans(dataframe,
           col_1_index,
           col_2_index,
           k,
           output_path,
           columns,
           cluster_colors=['r', 'b']):
    """
    Perform Clustering using K-Means
    :param dataframe:
    :param col_1_index: The index of the first feature to plot
    :param col_2_index: The index of the second feature to plot
    :param k: The k value for k-means
    :param output_path: The path to save the graphs to.
    :param columns: The list of columns to use when computing distance.
    :param cluster_colors: A list of colors to use when displaying clusters in graphs.
    :return:
    """

    # Standardize Data
    print "Standardizing Data"
    std_df = data.clean_data(dataframe)

    # Seed Random Number Generator with '0'
    random.seed(0)

    # Randomly select k data instances and use for initial means
    print "Selecting Initial Centers"
    cluster_centers = [
        # randrange over the full length so the last row can also be selected
        std_df.iloc[random.randrange(len(std_df))] for i in xrange(k)
    ]

    # Plot the initial Setup
    plot_setup(cluster_centers, col_1_index, col_2_index, output_path, std_df)
    plt.clf()

    # Do K-Means Algorithm
    # Keep Count of Iterations
    membership = {}
    iteration = 0
    while True:
        plt_name = "InitialAssignment"
        plt_title = "Initial Assignment"

        if iteration > 0:
            plt_name = "Iteration{0}".format(iteration)
            plt_title = "Iteration #{0}".format(iteration)

        # Assign Membership
        print "Assigning Membership"
        membership = assign_membership(std_df, cluster_centers, columns)

        # Plot Cluster Assignments
        plt.clf()
        print "Plotting Assignments"
        plot_iteration(cluster_centers, cluster_colors, col_1_index,
                       col_2_index, k, membership, output_path, std_df,
                       plt_name, plt_title)

        # Update Center
        # For each 'cluster' compute the mean point
        new_cluster_centers = []
        print "Computing new Cluster Centers"
        for cluster_i in xrange(len(cluster_centers)):
            indices = membership[cluster_i]
            new_cluster_centers.append(std_df.iloc[indices].mean())

        # Compute the largest shift between the old and new centers; if it is below eps, exit
        difference = 0.0
        for cluster_i in xrange(len(cluster_centers)):
            old_point = cluster_centers[cluster_i]
            new_point = new_cluster_centers[cluster_i]
            difference = max(difference, (old_point - new_point).abs().max())

        print "Iteration #{0}, Difference: {1}".format(iteration, difference)
        if difference <= np.spacing(1):  # Matlab's Eps == np.spacing(1)
            break

        cluster_centers = new_cluster_centers
        iteration += 1

    # Plot final cluster assignments
    plot_iteration(cluster_centers, cluster_colors, col_1_index, col_2_index,
                   k, membership, output_path, std_df, "FinalAssignment",
                   "Final Cluster Assignments")
Example #6
with open(CONFIGFILE, "r") as f:
    config = yaml.safe_load(f)  # yaml.load() without an explicit Loader is no longer supported


ap = ArgumentParser()
ap.add_argument('--inspect_data', action='store_true', default=False,
                help="plot training data for inspection")
ap.add_argument('--train', action='store_true', default=False,
                help="Run training")
ap.add_argument('--test', action='store_true', default=False,
                help="Run test")
args = ap.parse_args()

df = load_data()
milk = clean_data(df)
train, test = train_test_split(milk)

if args.inspect_data:
    print("RAW DATA")
    print(df.head())
    milk.plot()
    plt.show()
elif args.train:
    model = LSTMPredictor(config)
    model.fit(train['Milk Production'].values.reshape(1, -1))
    model.close()
elif args.test:
    model = LSTMPredictor(config)
    y_pred = model.infer(train['Milk Production'].values.reshape(1, -1), 12)
    y_pred = list(y_pred)
Example #7
from data import import_data, clean_data, prepare_model_data
from models import model_statistics
from model_diagnostics import ensemble_prediction
from model_diagnostics import partial_dependence, partial_dependence_loop, plot_top_partial_dependences

import numpy as np
import pandas as pd
import pickle
from sklearn.metrics import accuracy_score, f1_score
from tensorflow import keras

# Data
df = import_data()
clean_df = clean_data(df)

X_train, X_test, y_train, y_test, scalers, column_names = prepare_model_data(
    df=clean_df, y_col='Bankrupt?')

# Load Models
rf_clf = pickle.load(open('Models/rf.sav', 'rb'))
xgb_clf = pickle.load(open('Models/xgb.sav', 'rb'))
nn_clf = keras.models.load_model('Models/nn.h5')
knn_clf = pickle.load(open('Models/knn.sav', 'rb'))
lr_clf = pickle.load(open('Models/lr.sav', 'rb'))
model_stats_10 = pd.read_csv('Models/model_stats_10.csv')
model_stats_25 = pd.read_csv('Models/model_stats_25.csv')
model_stats_50 = pd.read_csv('Models/model_stats_50.csv')
model_stats_75 = pd.read_csv('Models/model_stats_75.csv')

# Ensemble Prediction
# train_ens_votes, train_ens_pred = ensemble_prediction(
Example #8
def main():
    #output directory
    output_dir = Path('../output/')

    #setup logging
    output_dir.mkdir(parents=True, exist_ok=True)
    logfile_path = Path(output_dir / "output.log")
    setup_logging(logfile=logfile_path)

    #reading the config file
    config_file = Path('../config.ini')
    reading_config(config_file)

    #dataset paths
    rworldnews_path = Path(Config.get("rworldnews"))
    millionnews_path = Path(Config.get("millionnews"))

    #loading the dataset
    raw_data = load_data(rworldnews_path)
    #raw_data = load_data(millionnews_path)

    dates, labels, news = clean_data(raw_data)
    id_to_word, word_to_id = dictionary(news, threshold=5)
    training_loader, validation_loader, testing_loader = data_loaders(rworldnews_path)

    #tensorboard

    #loading pretrained embeddings
    pretrained_emb_file = Path(Config.get("pretrained_emb_path"))
    pretrained_embeddings, emb_dim = load_pretrained_embeddings(pretrained_emb_file, id_to_word)

    #text classification model
    num_classes = 2
    model = TextClassifier(pretrained_embeddings, emb_dim, num_classes)

    #load the optimizer
    learning_rate = Config.get("learning_rate")
    optimizer = adam_optimizer(model, learning_rate)

    #load the loss function
    criterion = cross_entropy

    #load checkpoint
    checkpoint_file = Path(output_dir / Config.get("checkpoint_file"))
    checkpoint_stocks = load_checkpoint(checkpoint_file)

    #using available device(gpu/cpu)
    model = model.to(Config.get("device"))
    pretrained_embeddings = pretrained_embeddings.to(Config.get("device"))

    #initializing the model and optimizer from the saved checkpoint.
    start_epoch = 1
    if checkpoint_stocks is not None:
        start_epoch = checkpoint_stocks['epoch'] + 1
        model.load_state_dict(checkpoint_stocks['model'])
        optimizer.load_state_dict(checkpoint_stocks['optimizer'])
        logger.info('Initialized model and the optimizer from loaded checkpoint.')
    
    del checkpoint_stocks

    #stock prediction model
    model = StockPrediction(model, optimizer, criterion, 
                            training_loader, validation_loader, testing_loader, output_dir)

    #training and testing the model
    epochs = Config.get("epochs")
    validate_every = Config.get("validate_every")
    model.train(epochs, validate_every, start_epoch)
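
# The resume logic above expects a checkpoint dict with 'epoch', 'model', and
# 'optimizer' entries; a minimal sketch of the corresponding save side under that
# assumption (save_checkpoint is illustrative and not shown in the original):
import torch

def save_checkpoint(checkpoint_file, epoch, model, optimizer):
    # persist exactly the fields the resume logic reads back
    torch.save({
        'epoch': epoch,
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
    }, checkpoint_file)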
Example #9
import data
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split  # sklearn.cross_validation has been removed from scikit-learn
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn import metrics
from sklearn import preprocessing

df = data.load_data('C:\\Users\\catic\\Documents\\EECE 2300\\python\\crime_term_project\\data\\raw\\communities.data.txt')
df2 = data.summarize_data(df)
df_attributes = data.label_data('C:\\Users\\catic\\Documents\\EECE 2300\\python\\crime_term_project\\data\\raw\\communities.attributes.txt', df2)
cleaned_df = data.clean_data(df_attributes)
cleaned_df = cleaned_df.replace('?', np.nan)
cleaned_df = cleaned_df.dropna(axis=0)


x = cleaned_df.drop(['communityname','ViolentCrimesPerPop'], axis = 1)
y = cleaned_df['ViolentCrimesPerPop']
x_labels = x.columns
x_as_array = x.values
min_max_scale = preprocessing.MinMaxScaler()
x_scaled = min_max_scale.fit_transform(x_as_array)
x = pd.DataFrame(x_scaled)
x.columns = x_labels

train_x, test_x, train_y, test_y = train_test_split(x,y, test_size=.3, random_state=1)

def linreg(x, y):
    """
    Function for linear regression
Example #10
    args = parser.parse_args()

    if(not(args.plot_raw_data) and not(args.plot_standardized_data) and
           not(args.perform_pca) and not(args.perform_kmeans)):
        parser.print_help()


    plt.style.use(args.style)

    df = data.read_data(args.data_filepath)

    if(args.plot_raw_data):
        plotting.plot_all_data(df, "Raw Data", "Raw", os.path.join(args.output_folderpath, "raw"), data.column_names)

    if(args.plot_standardized_data):
        clean_df = data.clean_data(df)
        plotting.plot_all_data(clean_df, "Standardized Data", "Clean", os.path.join(args.output_folderpath, "clean"), data.column_names)

    if(args.perform_pca):
        num_dimensions = args.num_dimensions
        projected_df = pca.perform_pca(df, num_dimensions)

        output_path = os.path.join(args.output_folderpath, "PCA")
        if(not(os.path.exists(output_path))):
            os.makedirs(output_path)

        print "Saving Graphs"
        plotting.plot_all_data(projected_df, "PCA {0}-D".format(num_dimensions), "PCA", output_path, projected_df.columns)


    if(args.perform_kmeans):
Example #11
# Creating a data frame from scraped content using pandas
dataset = pd.DataFrame(data_content)

# Creating column headings
headers = rows[0].find_all('th')
headers = [header.get_text().strip('\n') for header in headers]
headers += [
    'Total Area', 'Percentage Water', 'Total Nominal GDP', 'Per Capita GDP'
]
dataset.columns = headers

# Dropping columns from dataset that we don't want to use
drop_columns = ['Rank', 'Date', 'Source']
dataset.drop(drop_columns, axis=1, inplace=True)
dataset.sample(3)
dataset.to_csv("dataset.csv", index=False)

# Reading the dataset using pandas
dataset = pd.read_csv("dataset.csv")

# Renaming headings
dataset.rename(columns={'Country(or dependent territory)': 'Country'},
               inplace=True)
dataset.rename(
    columns={'% of worldpopulation': 'Percentage of World Population'},
    inplace=True)
dataset.rename(columns={'Total Area': 'Total Area (km2)'}, inplace=True)

# Formatting data
data.clean_data(dataset)
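
# data.clean_data here is project-specific and not shown; a sketch of the kind of
# formatting such a helper might apply to this dataset (column names and steps are
# assumptions, not the original implementation):
def clean_data_sketch(df):
    for col in ['Total Area (km2)', 'Total Nominal GDP', 'Per Capita GDP']:
        if col in df.columns:
            # strip thousands separators and coerce the column to numeric values
            df[col] = pd.to_numeric(
                df[col].astype(str).str.replace(',', '', regex=False), errors='coerce')
    return df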