def show_distinct_rows(request):
    # First, read the data
    data_df = read_df(request, 'test')
    json_df = data_df.toPandas()
    json_df.to_json()
    # Second, check the data for duplicate rows
    distinct_df = data_df.distinct()  # Get the total number of distinct records
    # Remove any duplicates if they exist
    if data_df.count() != distinct_df.count():
        non_duplicates_df = data_df.dropDuplicates()
    else:
        non_duplicates_df = data_df
    model_data = DistinctRows.objects.create(total_count=data_df.count(),
                                             distinct_rows=distinct_df.count())
    model_data.save()
    context = {
        'all_data': json_df,
        'count': data_df.count(),
        'distinct_count': distinct_df.count(),
        'distinct_df': non_duplicates_df
    }
    return render(request, 'show_distinct_rows.html', context)
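# A toy illustration of the duplicate check used in show_distinct_rows(): distinct() gives
# the de-duplicated view that is only counted, while dropDuplicates() actually removes the
# extra rows. The toy data and the reuse of the shared `Spark` object from cluster() below
# are assumptions for illustration, not part of the original views.
def duplicate_rows_example():
    toy_df = Spark.sqlContext.createDataFrame(
        [(1, 'flu'), (1, 'flu'), (2, 'cold')], ['id', 'diagnosis'])
    # 3 rows in total but only 2 distinct ones, so duplicates exist and are dropped
    if toy_df.count() != toy_df.distinct().count():
        toy_df = toy_df.dropDuplicates()
    return toy_df  # two rows remain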
def show_missing_observations(request):
    # Read all the custom fields
    unique_fields = custom_fields(request)
    # First, read the data
    data_df = read_df(request, 'test')
    json_df = data_df.toPandas()
    json_df.to_json()
    # Get the Date column
    date_column = None
    for col in data_df.columns:
        if 'Date' in str(col):
            date_column = col
    # Show the number of missing observations per column as a percentage
    df_percentage = data_df.agg(*[
        (1 - (fn.count(c) / fn.count('*'))).alias(c + '_missing')
        for c in data_df.columns
    ]).collect()
    df_percentage = df_percentage[0]
    # Drop the rows whose missing observations exceed a certain threshold
    data_less_rows = data_df.dropna(thresh=3)
    # Calculate the mean for every field except the Date column
    means = data_less_rows.agg(*[
        fn.mean(c).alias(c) for c in data_less_rows.columns if c != date_column
    ]).toPandas().to_dict('records')[0]
    means[date_column] = 'missing'
    for key, value in means.items():
        if means[key] is None:
            means[key] = 'missing'
    # Fill the empty observations with the mean of the column
    data_less_rows = data_less_rows.fillna(means)
    clean_json_df = data_less_rows.toPandas()
    clean_json_df.to_json()
    columns = data_less_rows.columns
    data = []
    for item in df_percentage:
        data.append(item)
    columns_json = json.dumps(columns)
    data_json = json.dumps(data)
    model_data = MissingObservations.objects.create(
        missing_columns=columns_json, missing_data=data_json)
    model_data.save()
    context = {
        'all_data': json_df,
        'missing_percentages': df_percentage,
        'clean_data': clean_json_df
    }
    return render(request, 'missing_observations.html', context)
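# A small, self-contained illustration of the missing-percentage aggregation used in
# show_missing_observations(). The toy data and the use of the shared `Spark` object from
# cluster() below are assumptions for illustration only.
def missing_percentage_example():
    toy_df = Spark.sqlContext.createDataFrame(
        [(1, 10.0, None), (2, None, 5.0), (3, 30.0, 6.0)],
        ['id', 'temp', 'humidity'])
    # fn.count(c) skips nulls while fn.count('*') counts every row, so
    # 1 - count(c) / count('*') is the fraction of missing values in column c
    # (here 0.0 for 'id' and roughly 0.33 for 'temp' and 'humidity')
    return toy_df.agg(*[
        (1 - (fn.count(c) / fn.count('*'))).alias(c + '_missing')
        for c in toy_df.columns
    ]).collect()[0]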
def show_distinct_ids(request):
    # Read all the custom fields
    unique_fields = custom_fields(request)
    # First, read the data
    data_df = read_df(request, 'test')
    json_df = data_df.toPandas()
    json_df.to_json()
    # Second, check the data for duplicates while ignoring the ID attribute
    distinct_rows = data_df.select([
        c for c in data_df.columns if c != unique_fields['index']
    ]).distinct()
    # Drop rows that are identical but may have different ids
    if data_df.count() != distinct_rows.count():
        unique_rows_df = data_df.dropDuplicates(
            subset=[c for c in data_df.columns if c != unique_fields['index']])
    else:
        unique_rows_df = data_df
    # Count the number of distinct id values
    distinct_ids = unique_rows_df.agg(
        fn.countDistinct(unique_fields['index']).alias('distinct'))
    ids = distinct_ids.select('distinct').collect()
    distinct = ids[0].distinct
    # A mismatch here means there are rows that share an id but are not duplicates
    if unique_rows_df.count() != distinct:
        clean_df = unique_rows_df.withColumn('New_id',
                                             fn.monotonically_increasing_id())
    else:
        clean_df = unique_rows_df
    clean_json = clean_df.toPandas()
    clean_json.to_json()
    model_data = DistinctIds.objects.create(total_ids=data_df.count(),
                                            distinct_ids=distinct)
    model_data.save()
    context = {
        'all_data': json_df,
        'distinct_ids': distinct,
        'clean_df': clean_json
    }
    return render(request, 'show_distinct_ids.html', context)
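# A toy illustration of the id-repair step in show_distinct_ids(): when distinct rows share
# an id, a fresh surrogate id is generated with monotonically_increasing_id(). The toy data
# and the shared `Spark` object are assumptions for illustration only.
def distinct_id_example():
    toy_df = Spark.sqlContext.createDataFrame(
        [(1, 'flu'), (1, 'cold'), (2, 'flu')], ['patient_id', 'diagnosis'])
    # Two distinct ids but three non-duplicate rows, so a surrogate id column is added
    n_ids = toy_df.agg(
        fn.countDistinct('patient_id').alias('distinct')).collect()[0].distinct
    if toy_df.count() != n_ids:
        toy_df = toy_df.withColumn('New_id', fn.monotonically_increasing_id())
    return toy_df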
def cluster(request):
    unique_fields = custom_fields(request)
    # First, read the data
    data_df = read_df(request, 'clean')
    data_df.cache()
    json_df = data_df.toPandas()
    json_df.to_json()
    # Create a tuple of (id, items) for every row of the Data Frame
    dd = []
    for p in data_df.columns:
        dd.append(p)
    data = []
    for row in json_df.itertuples():
        id = row[1]
        items = []
        for column in range(2, (len(dd) + 1)):
            items.append(row[column])
        data.append((id, items))
    # Create a Data Frame from the list of (id, items) tuples
    final_data = Spark.sqlContext.createDataFrame(data, ["id", "items"])
    # Create the FPGrowth instance with its arguments and train the model
    fpGrowth = FPGrowth(itemsCol='items', minSupport=0.5, minConfidence=0.6)
    model = fpGrowth.fit(final_data)
    # Frequent item sets
    itemSets = model.freqItemsets
    # Generated association rules
    assocRules = model.associationRules
    # Examine the input items against all association rules and summarise the consequents as a prediction
    prediction = model.transform(final_data)
    context = {
        'all_data': json_df,
        'itemSets': itemSets,
        'assocRules': assocRules,
        'predicted': prediction
    }
    return render(request, 'show_clusters.html', context)
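# A minimal FPGrowth example on toy transactions, mirroring the shape of the (id, items)
# DataFrame that cluster() builds. The toy items and the shared `Spark` object are
# assumptions for illustration; thresholds match the ones used above.
def fpgrowth_example():
    toy_df = Spark.sqlContext.createDataFrame(
        [(0, ['fever', 'cough']),
         (1, ['fever', 'cough', 'fatigue']),
         (2, ['fever'])],
        ['id', 'items'])
    model = FPGrowth(itemsCol='items', minSupport=0.5, minConfidence=0.6).fit(toy_df)
    model.freqItemsets.show()       # item sets appearing in at least 50% of the transactions
    model.associationRules.show()   # rules with confidence of at least 0.6
    model.transform(toy_df).show()  # per-row predicted consequents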
def pre_process(request):
    # Read all the custom fields
    unique_fields = custom_fields(request)
    company_name = request.user.project.company
    created_by = request.user
    project = request.user.project
    # First, read the data
    data_df = read_df(request, 'test')
    json_df = data_df.toPandas()
    json_df.to_json()
    # Get the Date column
    date_column = None
    for col in data_df.columns:
        if 'Date' in str(col):
            date_column = col
    # Second, check the data for duplicate rows
    distinct_df = data_df.distinct()  # Get the total number of distinct records
    # Remove any duplicates if they exist
    if data_df.count() != distinct_df.count():
        non_duplicates_df = data_df.dropDuplicates()
    else:
        non_duplicates_df = data_df
    # Third, check the data for duplicates while ignoring the ID attribute
    distinct_rows = non_duplicates_df.select([
        c for c in non_duplicates_df.columns if c != unique_fields['index']
    ]).distinct()
    # Drop rows that are identical but may have different ids
    if non_duplicates_df.count() != distinct_rows.count():
        unique_rows_df = non_duplicates_df.dropDuplicates(subset=[
            c for c in non_duplicates_df.columns if c != unique_fields['index']
        ])
    else:
        unique_rows_df = non_duplicates_df
    # Count the number of distinct id values
    distinct_ids = unique_rows_df.agg(
        fn.countDistinct(unique_fields['index']).alias('distinct'))
    ids = distinct_ids.select('distinct').collect()
    distinct = ids[0].distinct
    # A mismatch here means there are rows that share an id but are not duplicates
    if unique_rows_df.count() != distinct:
        clean_df = unique_rows_df.withColumn('New_id',
                                             fn.monotonically_increasing_id())
    else:
        clean_df = unique_rows_df
    # Show the number of missing observations per column as a percentage
    df_percentage = clean_df.agg(*[
        (1 - (fn.count(c) / fn.count('*'))).alias(c + '_missing')
        for c in clean_df.columns
    ]).collect()
    # Flatten the Row into a plain list of strings so it can be saved to the db
    df = str(df_percentage)
    new = df.replace("[", "").replace("Row", "").replace("(", "").replace(
        "]", "").replace(")", "").split(",")
    new = json.dumps(new)
    # Correlation could instead be used to decide which attributes to drop
    # Drop the rows whose missing observations exceed a certain threshold
    data_less_rows = clean_df.dropna(thresh=3)
    # Calculate the mean for every field except the Date column
    means = data_less_rows.agg(*[
        fn.mean(c).alias(c) for c in data_less_rows.columns if c != date_column
    ]).toPandas().to_dict('records')[0]
    means[date_column] = 'missing'
    for key, value in means.items():
        if means[key] is None:
            means[key] = 'missing'
    # If no Date column was found, drop the None key added above
    try:
        means.pop(None)
    except KeyError:
        pass
    # Fill the empty observations with the mean of the column
    data_less_rows = data_less_rows.fillna(means)
    clean_json_df = data_less_rows.toPandas()
    clean_json_df.to_json()
    table_name = str(company_name) + '_Clean'
    total_rows = data_df.count()
    distinct_rows = distinct_rows.count()
    # Save the results to the database for easier reuse
    DistinctRows.objects.create(total_count=total_rows,
                                distinct_rows=distinct_rows,
                                created_by=created_by,
                                project=project)
    DistinctIds.objects.create(total_ids=total_rows,
                               distinct_ids=distinct,
                               created_by=created_by,
                               project=project)
    MissingObservations.objects.create(missing_columns=new,
                                       created_by=created_by,
                                       project=project)
    # Final step is to save the pre-processed DF to the DB
    data_less_rows.write.format('jdbc').options(
        url='jdbc:mysql://localhost:3306/disease',
        dbtable=table_name,
        user='******',
        password='******').mode('append').save()
    context = {
        'all_data': json_df,
        'rows_count': total_rows,
        'distinct_rows': distinct_rows,
        'distinct_rows_without_id': distinct_rows,
        'distinct_ids': distinct,
        'missing_percentages': new,
        'clean_data': clean_json_df
    }
    return render(request, 'show_distinct.html', context)
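# A minimal sketch of the read_df(request, kind) helper that these views rely on; its real
# implementation is not shown in this module. The table-naming scheme and the reuse of the
# shared `Spark` object and JDBC options from pre_process() above are assumptions.
def read_df_sketch(request, kind):
    # Hypothetical naming scheme: '<company>_test' for raw data, '<company>_Clean' for the
    # pre-processed table written by pre_process()
    table_name = str(request.user.project.company) + '_' + kind
    return Spark.sqlContext.read.format('jdbc').options(
        url='jdbc:mysql://localhost:3306/disease',
        dbtable=table_name,
        user='******',
        password='******').load()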
def pipeline(request):
    unique_fields = custom_fields(request)
    date_column = CustomFields.objects.first()
    date_column = date_column.date_column
    # First, read the data
    data_df = read_df(request, 'clean')
    json_df = data_df.toPandas()
    json_df.to_json()
    # Cast all the columns to numeric
    new_df = data_df.select(
        [col(c).cast("double").alias(c) for c in data_df.columns])
    new_df = new_df.fillna(0.0)
    new_df.show()
    # Split the data into training and test sets
    train, test = new_df.randomSplit([0.7, 0.3])
    # Feature processing
    featuresCols = new_df.columns
    featuresCols.remove(unique_fields['prediction'])
    try:
        featuresCols.remove(date_column)
    except ValueError:
        pass
    # Concatenate all feature columns into a single feature vector in a new column 'rawFeatures'
    vectorAssembler = VectorAssembler(inputCols=featuresCols,
                                      outputCol='rawFeatures')
    # Model training
    standardScaler = StandardScaler(inputCol="rawFeatures",
                                    outputCol="features")
    lr = LinearRegression(labelCol=unique_fields['prediction'],
                          maxIter=10,
                          regParam=.01)
    # Model tuning
    paramGrid = ParamGridBuilder() \
        .addGrid(lr.maxIter, [10, 100, 1000]) \
        .addGrid(lr.regParam, [0.1, 0.01]) \
        .addGrid(lr.fitIntercept, [False, True]) \
        .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
        .build()
    # Define an evaluation metric.
    # This tells the CrossValidator how well we are doing by comparing the true labels with predictions
    evaluator = RegressionEvaluator(metricName="rmse",
                                    labelCol=lr.getLabelCol(),
                                    predictionCol=lr.getPredictionCol())
    # Declare the CrossValidator, which runs model tuning for us
    cv = CrossValidator(estimator=lr,
                        evaluator=evaluator,
                        estimatorParamMaps=paramGrid)
    stages = [vectorAssembler, standardScaler, cv]
    # Train the pipeline
    pipeline = Pipeline(stages=stages)
    model = pipeline.fit(train)
    predictions = model.transform(test)
    rmse = evaluator.evaluate(predictions)
    print("RMSE on our test set is: " + str(rmse))
    predictions.show()
    predicted_df = predictions.toPandas()
    predicted_df.to_json()
    # rmse = 23
    context = {'all_data': json_df, 'rmse': rmse, 'predicted': predicted_df}
    return render(request, 'show_predictions.html', context)
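# A hedged follow-up to pipeline(): one way the winning hyper-parameters could be inspected
# once the pipeline has been fit. `model` is the fitted PipelineModel from pipeline(), and
# the stage index assumes the [vectorAssembler, standardScaler, cv] ordering used above.
def inspect_best_model(model):
    cv_model = model.stages[2]    # the fitted CrossValidatorModel
    best_lr = cv_model.bestModel  # the LinearRegressionModel that won the grid search
    print(cv_model.avgMetrics)    # average RMSE per parameter combination
    print(best_lr.coefficients, best_lr.intercept)
    return best_lr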