Example 1
# The code below is based on the baseline model's extract_features_history:
# extract_features_history <- function(dt, ref_ids_escalated)

from pyspark.sql import Window
from pyspark.sql import functions as F

from config.data_paths import data_dir
from config.env import *
from source.etl.constants_and_parameters import TIME_INTERVAL
from source.utils.ml_tools.categorical_encoders import encode_categorical_using_mean_response_rate_inplace
from source.utils.reader_writers.reader_writers import (SparkRead, SparkWrite)

spark_read = SparkRead(spark=spark)
spark_write = SparkWrite()

case_status_history = spark_read.parquet(
    path=data_dir.make_interim_path('case_status_history'))

test = spark_read.parquet(path=data_dir.make_interim_path('test'))

ref_ids_escalated = (case_status_history.filter(
    F.col("inverse_time_to_next_escalation") > 0).select(
        'reference_id').distinct())
ref_ids_escalated.count()
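# Illustrative sketch (not from the original script): ref_ids_escalated is the
# distinct set of cases that escalated at least once; such a lookup table is
# typically joined back onto an event table to add a binary escalation label.
labelled_history_sketch = (
    case_status_history
    .join(ref_ids_escalated.withColumn('is_escalated_case', F.lit(1)),
          on='reference_id', how='left')
    .fillna(0, subset=['is_escalated_case']))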

history_with_cutoff_times = spark_read.parquet(
    path=data_dir.make_processed_path('history_with_cutoff_times'))

history_with_cutoff_times.show()
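# Illustrative sketch (assumption, not the original feature logic): the
# base_table_case_status_history_features built below presumably aggregates the
# pre-cutoff status history per case; the column names used here are assumed
# from the other examples, not confirmed for this table.
history_features_sketch = (
    history_with_cutoff_times
    .groupBy('reference_id')
    .agg(F.count('*').alias('n_status_events'),
         F.max('seconds_since_case_start').alias('seconds_to_last_event')))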

base_table_case_status_history_features = (
Example 2

# The code below is based on the baseline model's extract_features_history:
# extract_features_history <- function(dt, ref_ids_escalated)

from pyspark.sql import functions as F

from config.data_paths import data_dir
from config.env import *
from source.utils.reader_writers.reader_writers import (
    SparkRead,
    SparkWrite
)

spark_read = SparkRead(spark=spark)
spark_write = SparkWrite()

milestone_features = spark_read.parquet(
    path=data_dir.make_feature_path('milestone')
)
print(f'rows in milestone features {milestone_features.count()}')
comments_features = spark_read.parquet(
    path=data_dir.make_feature_path('comments')
)
print(f'rows in comments features {comments_features.count()}')

case_status_history_features = spark_read.parquet(
    path=data_dir.make_feature_path('case_status_history_features')
)
print(f'rows in case_status_history features {case_status_history_features.count()}')

metadata_features = spark_read.parquet(
    path=data_dir.make_feature_path('metadata')
)
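# Illustrative sketch (assumption, not the original script): the feature tables
# read above are presumably combined into a single modelling table by joining
# on reference_id, along the lines of:
combined_features_sketch = (
    milestone_features
    .join(comments_features, on='reference_id', how='outer')
    .join(case_status_history_features, on='reference_id', how='outer')
    .join(metadata_features, on='reference_id', how='outer'))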
Example 3
# The code below is based on the baseline model's extract_features_history:
# extract_features_history <- function(dt, ref_ids_escalated)

from pyspark.sql import Window
from pyspark.sql import functions as F

from config.data_paths import data_dir
from config.env import *
from source.etl.constants_and_parameters import TIME_INTERVAL
from source.utils.ml_tools.categorical_encoders import (
    one_hot_encode_categorical, label_encode_categorical_inplace,
    encode_categorical_using_mean_response_rate_inplace)
from source.utils.reader_writers.reader_writers import (SparkRead, SparkWrite)

spark_read = SparkRead(spark=spark)
spark_write = SparkWrite()

comments = spark_read.parquet(path=data_dir.make_interim_path('comments'))
case_status_history = spark_read.parquet(
    path=data_dir.make_interim_path('case_status_history'))

ref_ids_escalated = (case_status_history.filter(
    F.col("inverse_time_to_next_escalation") > 0).select(
        'reference_id').distinct())
ref_ids_escalated.count()

comments_with_cutoff_times = spark_read.parquet(
    path=data_dir.make_processed_path('comments_with_cutoff_times'))

comments_with_cutoff_times.show()
comments_with_cutoff_times.groupby('comment_type').count().orderBy(
Example 4
from pyspark.sql import Window
from pyspark.sql import functions as F

from config.data_paths import data_dir
from config.env import *
from source.utils.reader_writers.reader_writers import (SparkRead, SparkWrite)

spark_read = SparkRead(spark=spark)
spark_write = SparkWrite()

case_status_history = spark_read.parquet(
    path=data_dir.make_interim_path('case_status_history'))

test = spark_read.parquet(path=data_dir.make_interim_path('test')).withColumn(
    'inverse_time_to_next_escalation',
    F.col('inverse_time_to_next_escalation').cast('double'))

milestones = spark_read.parquet(path=data_dir.make_interim_path('milestones'))

comments = spark_read.parquet(path=data_dir.make_interim_path('comments'))

case_status_history.show()

escalation_starts = (case_status_history.filter(
    F.col('is_escalate') == 'Y').groupby('reference_id').agg(
        F.min('seconds_since_case_start').alias('escalation_start'),
        F.max('seconds_since_case_start').alias('case_end')))

escalation_starts.count()  # 646
escalation_starts.filter(F.col('reference_id') == 100087).show()
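# Illustrative sketch (assumption, not the original script): the
# *_with_cutoff_times tables read in the other examples are presumably built by
# attaching a per-case cutoff such as escalation_start to the event history and
# keeping only events observed before it.
history_before_escalation_sketch = (
    case_status_history
    .join(escalation_starts.select('reference_id', 'escalation_start'),
          on='reference_id', how='inner')
    .filter(F.col('seconds_since_case_start') < F.col('escalation_start')))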
Example 5

# The code below is based on the baseline model's extract_features_summary:
# extract_features_summary <- function(dt, ref_ids_escalated)

from pyspark.sql import functions as F

from config.data_paths import data_dir
from config.env import *
from source.utils.ml_tools.categorical_encoders import (
    label_encode_categorical_inplace,
    encode_categorical_using_mean_response_rate_inplace)
from source.utils.reader_writers.reader_writers import (SparkRead, SparkWrite)

spark_read = SparkRead(spark=spark)
spark_write = SparkWrite()

case_status_history = spark_read.parquet(
    path=data_dir.make_interim_path('case_status_history'))

test = spark_read.parquet(path=data_dir.make_interim_path('test'))

ref_ids_escalated = (case_status_history.filter(
    F.col("inverse_time_to_next_escalation") > 0).select(
        'reference_id').distinct())
ref_ids_escalated.count()

metadata = spark_read.parquet(path=data_dir.make_interim_path('metadata'))

ref_ids_escalated.show()
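# Illustrative sketch (assumption, not the project implementation):
# encode_categorical_using_mean_response_rate_inplace, imported above, suggests
# mean-response-rate (target) encoding: each categorical value is replaced by
# the mean escalation rate observed for that value. 'product_category' is a
# hypothetical column name, and metadata is assumed to carry reference_id.
metadata_with_label = (
    metadata
    .join(ref_ids_escalated.withColumn('escalated', F.lit(1.0)),
          on='reference_id', how='left')
    .fillna(0.0, subset=['escalated']))
mean_response_by_category = (
    metadata_with_label
    .groupBy('product_category')
    .agg(F.avg('escalated').alias('product_category_mean_response')))
encoded_metadata_sketch = metadata_with_label.join(
    mean_response_by_category, on='product_category', how='left')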

Example 6

# *****************************************************************************************
# Demographic-based product features

# from config.spark_setup import launch_spark

from pyspark.sql import Window
from pyspark.sql import functions as F

from config.data_paths import data_dir
from config.env import *
from source.utils.reader_writers.reader_writers import (
    SparkRead,
    SparkWrite
)

spark_read = SparkRead(spark=spark)
spark_write = SparkWrite()

response_file = spark_read.parquet(
    data_dir.make_interim_path('response_file')
)
milestones = spark_read.parquet(
    data_dir.make_interim_path('milestones')
)

milestones.orderBy('reference_id', 'seconds_since_case_start').show()

# EDA: milestones
# milestone_id distributions
milestones.create_distribution(
    groupby_columns=['milestone_id'],
    numeric_column='seconds_since_case_start',
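# Illustrative sketch (assumption): create_distribution looks like a project
# helper; in plain PySpark a comparable milestone_id distribution of
# seconds_since_case_start could be computed roughly as:
milestone_distribution_sketch = (
    milestones
    .groupBy('milestone_id')
    .agg(F.count('*').alias('n'),
         F.avg('seconds_since_case_start').alias('mean_seconds'),
         F.expr('percentile_approx(seconds_since_case_start, 0.5)')
          .alias('median_seconds')))
milestone_distribution_sketch.orderBy(F.col('n').desc()).show()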
Example 7

# The code below is based on the baseline model's extract_features_history:
# extract_features_history <- function(dt, ref_ids_escalated)

from pyspark.sql import Window
from pyspark.sql import functions as F

from config.data_paths import data_dir
from config.env import *
from source.etl.constants_and_parameters import TIME_INTERVAL
from source.utils.ml_tools.categorical_encoders import (
    label_encode_categorical_inplace,
    encode_categorical_using_mean_response_rate_inplace)
from source.utils.reader_writers.reader_writers import (SparkRead, SparkWrite)

spark_read = SparkRead(spark=spark)
spark_write = SparkWrite()

test = spark_read.parquet(path=data_dir.make_interim_path('test'))

milestones_with_cutoff_times = spark_read.parquet(
    path=data_dir.make_processed_path('milestones_with_cutoff_times'))

case_status_history = spark_read.parquet(
    path=data_dir.make_interim_path('case_status_history'))
ref_ids_escalated = (case_status_history.filter(
    F.col("inverse_time_to_next_escalation") > 0).select(
        'reference_id').distinct())
ref_ids_escalated.count()

milestones_with_cutoff_times.show()