def _execute_session(self):
        dataset = self.dataset_run.dataset
        data_df = get_data(dataset)
        column_types = get_schema(data_df)

        profile = {
            "column": {},
            "row_count": len(data_df.index),
            "column_count": len(list(data_df))
        }

        dataset_profile = ActualDatasetProfile.objects.create(
            dataset_run=self.dataset_run,
            row_count=profile["row_count"],
            column_count=profile["column_count"])

        columns = list(data_df)
        for index, column in enumerate(columns):
            analyzer = AnalyzerActualColumn(data_df, column_types,
                                            dataset_profile, column,
                                            profile["row_count"], index)
            profile["column"][column] = analyzer.execute()

        profile["hash_sum"] = calculate_hash_sum(profile)

        dataset_profile.hash_sum = profile["hash_sum"]
        dataset_profile.save()

        return dataset_profile, profile
Example #2
def get_data_response(dataset_key, sql_query=None):
    """Query the given connection with the given query and return a formatted dict response."""
    # Use the given dataset key to retrieve a connection that can be used to access any supported database type.
    dataset = Dataset.objects.get(key=dataset_key)
    data_df = get_data(dataset)

    if sql_query:
        # Create a PySpark SparkSession.
        spark = SparkSession.builder.master("local[1]").appName(
            "SAEF").getOrCreate()

        # Convert the pandas dataframe to a spark dataframe and run the given SQL query.
        spark_df = spark.createDataFrame(data_df)
        spark_df.createOrReplaceTempView("dataset")
        sql_df = spark.sql(sql_query)

        data_df = sql_df.toPandas()

    response = {
        "SQL query": sql_query,
        "column_names": list(data_df),
        "value": data_df.to_dict('records')
    }

    return response
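
# A hedged usage sketch for get_data_response: the dataset key and the SQL query below are made-up
# illustrations; the query runs against the "dataset" temp view registered inside the function.
example_response = get_data_response("sales_2021",  # hypothetical dataset key
                                      sql_query="SELECT * FROM dataset LIMIT 10")
print(example_response["column_names"])
print(example_response["value"][:3])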
Example #3
def prepare_train(save_dir):
    train_examples = get_data(save_dir + "train.json")
    dev_examples = get_data(save_dir + "dev.json")

    opt = {}
    opt["word_emb_path"] = save_dir + 'word_emb_matrix.json'
    opt["ner2id_path"] = save_dir + 'ner2id.json'
    # word2id = get_data(save_dir + 'word2id.json')
    opt["tune_idx_path"] = save_dir + "tune_idx.json"
    opt["pos2id_path"] = save_dir + 'pos2id.json'
    opt["word_dim"], opt["ner_dim"], opt["pos_dim"], opt[
        "elmo_dim"] = 100, 12, 8, 1024
    opt["drop_rate"] = 0.3
    opt["hidden_size"] = 125
    opt["biattention_size"] = 250
    opt["lr"] = 0.002
    opt["use_elmo"] = False
    opt["fix_word_embedding"] = False
    opt["save_dir"] = save_dir

    return train_examples, dev_examples, opt
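
# A minimal sketch of calling prepare_train; the save_dir path is a made-up example and is assumed
# to already contain the preprocessed JSON files referenced above (train.json, dev.json, ...).
train_examples, dev_examples, opt = prepare_train("./preprocessed/")
print(opt["word_emb_path"], opt["hidden_size"], opt["lr"])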
Example #4
def task_extract_metadata(**kwargs):
    """Extract current dataset metadata, including column count, row count and column names and types."""
    dataset_run = kwargs["dataset_run"]
    dataset = dataset_run.dataset

    try:
        data_df, timestamp = get_data(dataset, get_timestamp=True)
        column_types = get_schema(data_df)

        result = {
            "timestamp": timestamp.strftime("%Y-%m-%d %H:%M:%S"),
            "columns": column_types,
            "column_count": len(list(data_df)),
            "row_count": len(data_df.index)
        }
        dataset_run.status = DatasetRun.Status.SUCCEEDED

        return result
    except Exception as e:
        logger.error(f"Error while extracting metadata from {dataset}: {e}")
        return {"error": type(e).__name__, "message": str(e)}
Example #5
def data_overview(dataset):
    """
    Return information used to get an overview of the datasets data (column types and data preview). Also return the
    timestamp of when this information is from.
    """

    try:
        data_preview, timestamp = get_data(dataset,
                                           preview=True,
                                           get_timestamp=True)
        column_types = get_schema(data_preview)

        # Handle null values and convert the dataframe into a list of tuples.
        data_preview.fillna("[null]", inplace=True)
        data_preview = list(data_preview.itertuples(index=False, name=None))

        timestamp = timestamp.strftime("%Y-%m-%d %H:%M:%S")
    except Exception:
        # If the data overview cannot be retrieved from either the datalake or the datastore,
        # return empty/None values to alert the user.
        data_preview, column_types, timestamp = [], None, None

    return timestamp, column_types, data_preview
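
# Hedged usage sketch: column_types is None when the preview could not be retrieved, so it doubles
# as the failure check; "dataset" below stands in for a Dataset instance supplied by the caller.
timestamp, column_types, data_preview = data_overview(dataset)
if column_types is None:
    print("Data overview unavailable")
else:
    print(f"Preview of {len(data_preview)} rows as of {timestamp}")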
Example #6
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

from util.data_util import get_data
from util.features_extractor import update_labels
from util.plot import plot_confusion_matrix
from util.preprocessor import preprocess
from util.save_util import save_classifier

save_path_prefix = "./resources/"
"""Data preparation"""
data_path = "./data/train_data.csv"
data_headers = ["polarity", "id", "date", "query", "user", "text"]

train_size = 10000
test_size = train_size * 0.2

x_test, y_test, x_train, y_train = get_data(data_path, train_size, test_size,
                                            data_headers)

x_test, y_test = preprocess(x_test, y_test)
x_train, y_train = preprocess(x_train, y_train)

train_labels, test_labels = update_labels(y_train, y_test)
"""TFiDF"""
tfidf = TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 1))
vectorizer = tfidf.fit(x_train["text"])

features_train = pd.DataFrame(vectorizer.transform(x_train["text"]).todense(),
                              columns=tfidf.get_feature_names())
features_test = pd.DataFrame(vectorizer.transform(x_test["text"]).todense(),
                             columns=tfidf.get_feature_names())
"""Support Vector Machine Classifier"""
clf = SVC(kernel='linear').fit(features_train.values, train_labels)
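
# A possible follow-up (not part of the original example): evaluate the fitted classifier on the
# held-out TF-IDF features with standard scikit-learn metrics. The plot_confusion_matrix and
# save_classifier helpers imported above could be used here as well, but their signatures are not
# shown in this example.
from sklearn.metrics import accuracy_score, confusion_matrix

predictions = clf.predict(features_test.values)
print("accuracy:", accuracy_score(test_labels, predictions))
print(confusion_matrix(test_labels, predictions))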
Example #7
from util.data_util import get_data
from util.features_extractor import update_labels
from util.model import get_model_word2vec_cnn
from util.preprocessor import preprocess

from sklearn.metrics import confusion_matrix, classification_report
"""Data preparation"""
data_path = "./data/train_data.csv"
data_headers = ["polarity", "id", "date", "query", "user", "text"]

train_size = 20000
test_size = train_size * 0.2
batch_size = 200
train_epochs = 30

x_test, y_test, x_train, y_train = get_data(data_path,
                                            train_size,
                                            test_size,
                                            data_headers,
                                            skip_rows=1)

x_test, y_test = preprocess(x_test, y_test)
x_train, y_train = preprocess(x_train, y_train)

features_train, features_test = get_word2vec_features(x_train, x_test)
train_labels, test_labels = update_labels(y_train, y_test)
"""Training and evaluation"""
model = get_model_word2vec_cnn(features_train[0].shape)

history = model.fit(features_train,
                    train_labels,
                    validation_data=(features_test, test_labels),
                    epochs=train_epochs,