def _execute_session(self):
    """Profile the dataset of the current run: count rows and columns, analyze each column, and persist the profile with its hash sum."""
    dataset = self.dataset_run.dataset
    data_df = get_data(dataset)
    column_types = get_schema(data_df)

    profile = {
        "column": {},
        "row_count": len(data_df.index),
        "column_count": len(list(data_df))
    }
    dataset_profile = ActualDatasetProfile.objects.create(
        dataset_run=self.dataset_run,
        row_count=profile["row_count"],
        column_count=profile["column_count"])

    # Analyze every column and collect the per-column results in the profile.
    columns = list(data_df)
    for index, column in enumerate(columns):
        analyzer = AnalyzerActualColumn(data_df, column_types, dataset_profile,
                                        column, profile["row_count"], index)
        profile["column"][column] = analyzer.execute()

    # Fingerprint the profile and persist it on the stored dataset profile.
    profile["hash_sum"] = calculate_hash_sum(profile)
    dataset_profile.hash_sum = profile["hash_sum"]
    dataset_profile.save()
    return dataset_profile, profile
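
# The profile above is fingerprinted with the project's calculate_hash_sum before
# being saved. A minimal, hypothetical sketch of such a fingerprint is shown below
# (example_profile_hash is illustrative only; the project's implementation may differ).
import hashlib
import json

def example_profile_hash(profile: dict) -> str:
    # Serialize with sorted keys so the hash does not depend on key order.
    payload = json.dumps(profile, sort_keys=True, default=str).encode("utf-8")
    return hashlib.sha256(payload).hexdigest()

print(example_profile_hash({"row_count": 3, "column_count": 2, "column": {}}))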
def get_data_response(dataset_key, sql_query=None):
    """Query the given connection with the given query and return a formatted dict response."""
    # Use the given dataset key to retrieve a connection that can be used to
    # access any supported database type.
    dataset = Dataset.objects.get(key=dataset_key)
    data_df = get_data(dataset)

    if sql_query:
        # Create a PySpark SparkSession.
        spark = SparkSession.builder.master("local[1]").appName(
            "SAEF").getOrCreate()

        # Convert the pandas dataframe to a spark dataframe and run the given SQL query.
        spark_df = spark.createDataFrame(data_df)
        spark_df.createOrReplaceTempView("dataset")
        sql_df = spark.sql(sql_query)
        data_df = sql_df.toPandas()

    response = {
        "SQL query": sql_query,
        "column_names": list(data_df),
        "value": data_df.to_dict('records')
    }
    return response
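
# Self-contained sketch of the pandas -> Spark -> SQL -> pandas round trip used
# above, on a toy dataframe; the column names and query are illustrative only.
import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("SAEF").getOrCreate()
toy_df = pd.DataFrame({"id": [1, 2, 3], "value": [10, 20, 30]})

spark_df = spark.createDataFrame(toy_df)
spark_df.createOrReplaceTempView("dataset")
result_df = spark.sql("SELECT id, value FROM dataset WHERE value > 15").toPandas()
print(result_df.to_dict("records"))  # [{'id': 2, 'value': 20}, {'id': 3, 'value': 30}]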
def prepare_train(save_dir):
    """Load the prepared train/dev examples from save_dir and build the default training options."""
    train_examples = get_data(save_dir + "train.json")
    dev_examples = get_data(save_dir + "dev.json")

    opt = {}
    opt["word_emb_path"] = save_dir + 'word_emb_matrix.json'
    opt["ner2id_path"] = save_dir + 'ner2id.json'
    # word2id = get_data(save_dir + 'word2id.json')
    opt["tune_idx_path"] = save_dir + "tune_idx.json"
    opt["pos2id_path"] = save_dir + 'pos2id.json'
    opt["word_dim"], opt["ner_dim"], opt["pos_dim"], opt["elmo_dim"] = 100, 12, 8, 1024
    opt["drop_rate"] = 0.3
    opt["hidden_size"] = 125
    opt["biattention_size"] = 250
    opt["lr"] = 0.002
    opt["use_elmo"] = False
    opt["fix_word_embedding"] = False
    opt["save_dir"] = save_dir
    return train_examples, dev_examples, opt
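
# Hedged usage sketch: "./data/prepared/" is a hypothetical directory that holds
# train.json, dev.json and the embedding/id files referenced in the options above.
train_examples, dev_examples, opt = prepare_train("./data/prepared/")
print(len(train_examples), "train /", len(dev_examples), "dev examples")
print("hidden size:", opt["hidden_size"], "lr:", opt["lr"])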
def task_extract_metadata(**kwargs):
    """Extract current dataset metadata, including column count, row count and column names and types."""
    dataset_run = kwargs["dataset_run"]
    dataset = dataset_run.dataset

    try:
        data_df, timestamp = get_data(dataset, get_timestamp=True)
        column_types = get_schema(data_df)

        result = {
            "timestamp": timestamp.strftime("%Y-%m-%d %H:%M:%S"),
            "columns": column_types,
            "column_count": len(list(data_df)),
            "row_count": len(data_df.index)
        }

        dataset_run.status = DatasetRun.Status.SUCCEEDED
        return result
    except Exception as e:
        logger.error(f"Error while extracting metadata from {dataset}: {e}")
        return {"error": type(e).__name__, "message": str(e)}
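
# Hedged usage sketch: the task runner would normally supply dataset_run through
# kwargs; here a hypothetical run is fetched by primary key purely for illustration.
run = DatasetRun.objects.get(pk=1)
metadata = task_extract_metadata(dataset_run=run)
if "error" in metadata:
    print("Extraction failed:", metadata["message"])
else:
    print(metadata["column_count"], "columns,", metadata["row_count"], "rows")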
def data_overview(dataset):
    """
    Return information used to get an overview of the dataset's data (column types and data preview).
    Also return the timestamp of when this information is from.
    """
    try:
        data_preview, timestamp = get_data(dataset, preview=True, get_timestamp=True)
        column_types = get_schema(data_preview)

        # Handle null values and convert the dataframe into a list of tuples.
        data_preview.fillna("[null]", inplace=True)
        data_preview = list(data_preview.itertuples(index=False, name=None))
        timestamp = timestamp.strftime("%Y-%m-%d %H:%M:%S")
    except Exception:
        # If the data overview cannot be retrieved from either the datalake or the
        # datastore, return empty/None values to alert the user.
        data_preview, column_types, timestamp = [], None, None
    return timestamp, column_types, data_preview
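
# Hedged usage sketch: the dataset is looked up by key as in get_data_response
# above; "my_dataset" is a hypothetical key.
dataset = Dataset.objects.get(key="my_dataset")
timestamp, column_types, data_preview = data_overview(dataset)
if column_types is None:
    print("Data overview unavailable for", dataset)
else:
    print(f"Preview from {timestamp}: {len(data_preview)} rows")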
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

from util.data_util import get_data
from util.features_extractor import update_labels
from util.plot import plot_confusion_matrix
from util.preprocessor import preprocess
from util.save_util import save_classifier

save_path_prefix = "./resources/"

"""Data preparation"""
data_path = "./data/train_data.csv"
data_headers = ["polarity", "id", "date", "query", "user", "text"]
train_size = 10000
test_size = train_size * 0.2

x_test, y_test, x_train, y_train = get_data(data_path, train_size, test_size,
                                            data_headers)
x_test, y_test = preprocess(x_test, y_test)
x_train, y_train = preprocess(x_train, y_train)
train_labels, test_labels = update_labels(y_train, y_test)

"""TF-IDF"""
tfidf = TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 1))
vectorizer = tfidf.fit(x_train["text"])
features_train = pd.DataFrame(vectorizer.transform(x_train["text"]).todense(),
                              columns=tfidf.get_feature_names())
features_test = pd.DataFrame(vectorizer.transform(x_test["text"]).todense(),
                             columns=tfidf.get_feature_names())

"""Support Vector Machine Classifier"""
clf = SVC(kernel='linear').fit(features_train.values, train_labels)
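
"""Evaluation (sketch)"""
# Hedged sketch: scores the fitted SVM on the held-out TF-IDF features with plain
# scikit-learn; the project's plot_confusion_matrix and save_classifier helpers are
# not called here because their exact signatures are not shown above.
from sklearn.metrics import accuracy_score, confusion_matrix

predictions = clf.predict(features_test.values)
print("accuracy:", accuracy_score(test_labels, predictions))
print(confusion_matrix(test_labels, predictions))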
from sklearn.metrics import confusion_matrix, classification_report

from util.data_util import get_data
from util.model import get_model_word2vec_cnn
# The module paths below are assumed to mirror the TF-IDF script above.
from util.features_extractor import get_word2vec_features, update_labels
from util.preprocessor import preprocess

"""Data preparation"""
data_path = "./data/train_data.csv"
data_headers = ["polarity", "id", "date", "query", "user", "text"]
train_size = 20000
test_size = train_size * 0.2
batch_size = 200
train_epochs = 30

x_test, y_test, x_train, y_train = get_data(data_path, train_size, test_size,
                                            data_headers, skip_rows=1)
x_test, y_test = preprocess(x_test, y_test)
x_train, y_train = preprocess(x_train, y_train)
features_train, features_test = get_word2vec_features(x_train, x_test)
train_labels, test_labels = update_labels(y_train, y_test)

"""Training and evaluation"""
model = get_model_word2vec_cnn(features_train[0].shape)
history = model.fit(features_train, train_labels,
                    validation_data=(features_test, test_labels),
                    epochs=train_epochs,
                    # The call was truncated in the source; batch_size is assumed
                    # from the otherwise unused variable defined above.
                    batch_size=batch_size)
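
"""Evaluation (sketch)"""
# Hedged sketch: assumes get_model_word2vec_cnn returns a Keras model with a single
# sigmoid output and that the labels are binary 0/1; predictions are thresholded at
# 0.5 before printing the reports suggested by the imports above.
predictions = (model.predict(features_test) > 0.5).astype("int32").ravel()
print(confusion_matrix(test_labels, predictions))
print(classification_report(test_labels, predictions))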