Example #1
    def setUp(self):
        """
        Start Spark, define config and path to test data
        """
        self.config = json.loads("""{"steps_per_floor": 21}""")
        self.spark, *_ = start_spark(app_name='nlp_clause_test')
        self.test_data_path = 'tests/test_data/segment_test'
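Because setUp starts a fresh Spark session for the test class, a matching tearDown is presumably needed so sessions do not leak between test runs; a minimal sketch, not shown in the original example:

    def tearDown(self):
        """Stop the Spark session started in setUp."""
        self.spark.stop()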
Example #2
    def setUp(self):
        """Start Spark, define config and path to test data
        """
        self.config = json.loads("""{"Max_Temp_": 21}""")
        self.spark, *_ = start_spark()
        self.test_data_path = (
            '/Users/LRK/project-folder/GreenFlag/sparkjob/test_data/')
Example #3
def main():
    """Main ETL script definition.
    """
    # start Spark application and get Spark session, logger and config
    spark, log, config, sc = start_spark(
        app_name='analysis',
        files=['configs/etl_config.json'])
    log.warn('***analysis is up-and-running***')
    # load data
    df = load(spark, config["start_date"], config["stop_date"], config["folder"])
    log.warn('***data loaded***')
    # daily tasks
    if config["daily"]:
        df_visit_per_hour = visit_per_hour(df, config["stop_date"])
        save(df_visit_per_hour, 'out/visit_per_hour', config["stop_date"])
        df_visitor_per_hour = visitor_per_hour(df, config["stop_date"])
        save(df_visitor_per_hour, 'out/visitor_per_hour', config["stop_date"])
        df_referral_path = referral_path(df, sc, config["stop_date"])
        save_json(df_referral_path, 'out/referral_path', config["stop_date"])
        
    # monthly tasks
    if config["monthly"]:
        df_hourly_visit_pattern = hourly_visit_pattern(df, config["stop_date"])
        save(df_hourly_visit_pattern, 'out/hourly_visit_pattern', config["stop_date"])
        df_popular_os = popular_os(df, config["stop_date"])
        save(df_popular_os, 'out/popular_os', config["stop_date"])
        df_popular_browser = popular_browser(df, config["stop_date"])
        save(df_popular_browser, 'out/popular_browser', config["stop_date"])
        df_country_dist = country_dist(df, config["stop_date"])
        save(df_country_dist, 'out/country_dist', config["stop_date"])
        df_average_visit_duration = average_visit_duration(df, config["stop_date"])
        save(df_average_visit_duration, 'out/average_visit_duration', config["stop_date"])
        df_popular_page = popular_page(df, config["stop_date"])
        save(df_popular_page, 'out/popular_page', config["stop_date"])
Example #4
def main():
    """Main ETL script definition.

    :return: None
    """
    # start Spark application and get Spark session, logger and config
    spark, log, config, environment = start_spark(
        app_name='my_etl_job',
        files=['configs/etl_config.json', 'configs/transformation.sql'])

    # log that main ETL job is starting
    log.warn('etl_job is up-and-running')

    # Create ETL Components
    try:
        tasks = [
            Extract(config['extract']),
            Transform(config['transform']),
            Load(config['load']),
            Impala(config['impala'])
        ]
    except KeyError as e:
        # re-raise so `tasks` is never referenced below when the config is incomplete
        print("Some component missing: " + repr(e))
        raise

    Executor(spark, log, tasks, environment).run()

    # log the success and terminate Spark application
    log.warn('etl_job is finished')
    spark.stop()
    return None
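Executor and the Extract/Transform/Load/Impala components are project code that is not shown here. A rough sketch of what such an executor could look like, where the task interface (a run(spark, environment) method) is an assumption rather than the project's actual API:

class Executor(object):
    """Hypothetical sequential runner for the ETL components above."""

    def __init__(self, spark, log, tasks, environment):
        self.spark = spark
        self.log = log
        self.tasks = tasks
        self.environment = environment

    def run(self):
        # execute each registered component in order, logging progress
        for task in self.tasks:
            self.log.warn('running %s' % type(task).__name__)
            task.run(self.spark, self.environment)  # assumed task interface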
Example #5
def main():
    """Main ETL script definition.

    :return: None
    """

    job_name = sys.argv[1]

    # start Spark application and get Spark session, logger and config
    spark, log, config = start_spark(app_name=job_name)

    # log that main ETL job is starting
    log.warn('%s is up-and-running' % job_name)

    # execute ETL pipeline
    data = extract_data(spark, config['data_source'])

    #dynamically load transformations from settings
    data = transform_data(data, config['transformations'], log)

    #data_transformed = transform_data(data, config['steps_per_floor'])
    load_data(data, config['data_output'])

    # log the success and terminate Spark application
    log.warn('%s is finished' % job_name)
    spark.stop()
    return None
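The comment about dynamically loading transformations suggests that transform_data dispatches on names taken from the config. A minimal sketch of that idea, where the registry and the transformation names are purely illustrative and not from the original project:

def transform_data(df, transformations, log):
    """Apply a config-driven list of named transformations to a DataFrame."""
    registry = {
        'drop_nulls': lambda d: d.dropna(),          # illustrative entries only
        'deduplicate': lambda d: d.dropDuplicates(),
    }
    for name in transformations:
        log.warn('applying transformation: %s' % name)
        df = registry[name](df)
    return df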
Example #6
def main():
    input_path = 'E:\\tmp\\game_csv'
    output_path = 'E:\\tmp\\output'
    spark_session, log, config = start_spark(
        app_name='nlp_tokenization', files=['./configs/file_list_config.json'])
    data_frame = load_data(spark_session, input_path)
    writer_csv(transform_data(data_frame), output_path)
Example #7
def main(begin_date, end_date):
    """
    Spark job for the brand Xuzhang "Shoubai" (mobile Baidu) user analysis.
    """
    spark, log, config = start_spark(
        app_name="brand_xuzhang_gen_pv_data_%s_%s" % (begin_date, end_date),
        master='yarn',
        spark_config={
            "spark.yarn.queue": "brand",
            "spark.shuffle.dce.enable": "true",
            "spark.executor.memory": "8g",
            "spark.executor.cores": 1,
            "spark.executor.instances": 500,
            "spark.default.parallelism": 1000,
            "spark.sql.shuffle.partitions": 1000,
        })

    output_path = "/app/ecom/brand/majian06/moirai/gen_pv_data/%s-%s" % (begin_date, end_date)

    # execute ETL pipeline
    log.warn('job etl is up-and-running')
    data = extract_data(spark)
    data_transformed = transform_data(spark, data)
    load_data(data_transformed, output_path)

    # log the success and terminate Spark application
    log.warn('job etl is finished')
    spark.stop()
    return None
Example #8
def main():
    spark, config = start_spark(app_name='my_etl_job',
                                files=['Config/etl_config.json'])
    path = config['file']['load']['path']

    data = extract(spark, path)
    transform_data = transform(data)
    load(transform_data)
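The nested lookup config['file']['load']['path'] implies a configuration of roughly the following shape; the path value is a placeholder, since the actual Config/etl_config.json is not shown:

import json

config = json.loads("""{
    "file": {
        "load": {
            "path": "data/input/"
        }
    }
}""")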
Example #9
def main():
    """Main analysis script definition.
    :return: None
    """
    # start Spark application and get Spark session, logger and config
    spark, log, config = start_spark(
        app_name='bcg_case_study',
        files=['configs/case_study_config.json'])

    # log that main Analysis job is starting
    log.warn('bcg_case_study_job started running')

    # Execute config queries
    primary_person_path = "..\\..\\Data\\Primary_Person_use.csv"
    # primary_person_path = config['primary_person_csv_path']
    primary_person_df = extract(spark, primary_person_path, log)

    units_path = "..\\..\\Data\\Units_use.csv"
    # units_path = config['units_csv_path']
    units_df = extract(spark, units_path, log)

    damages_path = "..\\..\\Data\\Damages_use.csv"
    # damages_path = config['damages_csv_path']
    damages_df = extract(spark, damages_path, log)

    charges_path = "..\\..\\Data\\Charges_use.csv"
    # charges_path = config['charges_csv_path']
    charges_df = extract(spark, charges_path, log)

    # ANALYSIS - 1
    analysis_1(primary_person_df, log)

    # ANALYSIS - 2
    analysis_2(units_df, log)

    # ANALYSIS - 3
    analysis_3(units_df, primary_person_df, log)

    # ANALYSIS - 4
    analysis_4(units_df, log)

    # ANALYSIS - 5
    analysis_5(units_df, primary_person_df, log)

    # ANALYSIS - 6
    analysis_6(units_df, primary_person_df, log)

    # ANALYSIS - 7
    analysis_7(units_df, damages_df, log)

    # ANALYSIS - 8
    analysis_8(units_df, charges_df, primary_person_df, log)

    # Log the success and terminate Spark application
    log.warn('bcg_case_study job is finished')
    spark.stop()
    return None
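extract is not shown in this example; since it is pointed at CSV files, it is presumably a thin wrapper around the DataFrame reader, roughly like this (signature inferred from the calls above):

def extract(spark, path, log):
    """Hypothetical CSV reader consistent with the calls above."""
    log.warn('reading %s' % path)
    return spark.read.csv(path, header=True, inferSchema=True)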
Example #10
def main():
    """Main ETL script definition.

    :return: None
    """
    # start Spark application and get Spark session, logger and config
    spark, log, config = start_spark(app_name='my_etl_job',
                                     files=['configs/etl_config.json'])

    # log that main ETL job is starting
    log.warn('etl_job is up-and-running')

    # execute Today_Load ETL
    url = 'tests/test_data/energy/NOP_LOAD_FORECAST_20180214_04_input.csv'
    df_NOP_0214_04 = extract_data_csv(spark, url)

    groupbyList = ["CONGESTION_ZONE", "FORECAST_DT", "HOUR_NUM"]
    targetColumn = "NOP"
    resultColumnName = "TODAY_LOAD"
    df_NOP_0214_04_GB = groupby_data(df_NOP_0214_04, groupbyList, targetColumn,
                                     resultColumnName)
    #df_NOP_0214_04_GB.show()

    #execute Prev_Day_Load ETL
    url = 'tests/test_data/energy/NOP_LOAD_FORECAST_20180213_11_input.csv'
    df_NOP_0213_11 = extract_data_csv(spark, url)

    groupbyList = ["CONGESTION_ZONE", "FORECAST_DT", "HOUR_NUM"]
    targetColumn = "NOP"
    resultColumnName = "PREV_DAY_LOAD"
    df_NOP_0213_11_GB = groupby_data(df_NOP_0213_11, groupbyList, targetColumn,
                                     resultColumnName)

    #execute Hour_Load ETL
    url = 'tests/test_data/energy/LFG_ST_Hourly_20180213_input.csv'
    df_LFG_0213 = extract_data_csv(spark, url)

    groupbyList = ["CONGESTION_ZONE", "FORECAST_DT", "HOUR_NUM"]
    sumList = ["UNADJ_LOAD", "DISTRIB_LOSS_LOAD", "TRANSMISSION_LOSS_LOAD"]
    resultColumnName = "ADJ_LOAD"
    df_LFG_0213_GB = groupby_agg_data(df_LFG_0213, groupbyList, sumList,
                                      resultColumnName)

    #Join three DataFrames
    joinList = ["CONGESTION_ZONE", "FORECAST_DT", "HOUR_NUM"]
    df_join = join_data(df_NOP_0214_04_GB, df_NOP_0213_11_GB, joinList, 'left')
    df_join_three = join_data(df_join, df_LFG_0213_GB, joinList, 'left')
    output = order_data(df_join_three, joinList)

    #Write output to output.csv
    load_data(output)

    # log the success and terminate Spark application
    log.warn('test_etl_job is finished')
    spark.stop()
    return None
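groupby_data, groupby_agg_data, join_data and order_data are helpers from the surrounding project. Judging only by how they are called above, they could be sketched like this (signatures inferred, not taken from the original source):

from pyspark.sql import functions as F

def groupby_data(df, groupby_list, target_column, result_column_name):
    # sum one column per group and expose it under a new name
    return df.groupBy(*groupby_list).agg(F.sum(target_column).alias(result_column_name))

def groupby_agg_data(df, groupby_list, sum_list, result_column_name):
    # add the component columns together, then sum the total per group
    total = sum(F.col(c) for c in sum_list)
    return df.groupBy(*groupby_list).agg(F.sum(total).alias(result_column_name))

def join_data(df_left, df_right, join_list, how):
    # join on the shared key columns
    return df_left.join(df_right, on=join_list, how=how)

def order_data(df, order_list):
    # sort the joined result by the key columns
    return df.orderBy(*order_list)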
Example #11
    def setUp(self):
        """Start Spark, define config and path to test data
        """

        print(os.getcwd())
        print(os.listdir())

        self.config = json.loads("""{"steps_per_floor": 21}""")
        self.spark, *_ = start_spark()
        self.test_data_path = 'tests/test_data/'
Example #12
def run_test():
    """Running test function

    :return: None
    """
    # start Spark application and get Spark session, logger and config
    spark, log, config = start_spark(app_name='my_etl_test_job',
                                     files=['configs/etl_config.json'])
    create_test_data(spark)
    spark.stop()

    return None
Example #13
def main():
    spark_session, log, config = start_spark(
        app_name='nlp_clause', files=['./configs/sentence_spilt_config.json'])
    # for local testing
    # input_file = 'E:/tmp/review_csv/output'
    # output_file = 'E:/tmp/output_review'
    # data = load_data(spark_session, input_file)
    # data_transform = transform_data(data)

    data = load_data(spark_session, config['input_path'])
    data_transform = transform_data(data)
    writer_csv(data_transform, config['output_path'])
Example #14
    def setUp(self):
        """Start Spark, define config and path to test data
        """
        self.config = json.loads("""{
            "start_date": "20160801",
            "stop_date": "20160804",
            "daily": true,
            "monthly": true,
            "folder": "tests/test_data/"
            }""")
        self.spark, _, _, self.sc = start_spark()
        self.test_data_path = self.config["folder"]
        self.input_data = load(self.spark, self.config["start_date"],
                               self.config["stop_date"], self.test_data_path + "ga/")
Example #15
def main():
    """Main ETL script definition.

    :return: None
    """

    parser = argparse.ArgumentParser()
    parser.add_argument('--ftp_user', dest='ftp_user', help='FTP user name')
    parser.add_argument('--ftp_password',
                        dest='ftp_password',
                        help='FTP Password')
    parser.add_argument('--api_key',
                        dest='api_key',
                        help='Google Maps API Key')

    known_args = parser.parse_args()

    # start Spark application and get Spark session, logger and config
    spark, log, config = start_spark(app_name='my_etl_job',
                                     files=['configs/etl_config.json'])

    # log that main ETL job is starting
    log.warn('etl_job is up-and-running')

    # execute ETL pipeline
    data = extract_data_module(spark)  # extract_data(spark)
    customer_data = read_from_postgres(spark, "localhost", "golang_user", "go",
                                       "customer_price_list")
    customer_data.show()
    data.show()
    data_transformed = transform_data(data, config['steps_per_floor'])
    load_data(data_transformed)
    # start the geocoding portion
    address1 = Row(id='123456', address='14 Maryland Blvd Toronto')
    address2 = Row(id='789012',
                   address='Suite 2300 100 Wellington St West Toronto')
    address3 = Row(id='345678', address='10 Bay Street Toronto')
    address4 = Row(id='901234', address='373 Glebeholme Blvd Toronto')
    addresses = [address1, address2, address3, address4]
    address_df = spark.createDataFrame(addresses)
    geo_enriched_data = address_df.withColumn(
        "PlaceID", geocode_address_udf(col("address"),
                                       lit(known_args.api_key)))
    geo_enriched_data.show()
    file_name = get_chd_file(known_args.ftp_user, known_args.ftp_password)
    print(file_name)
    # log the success and terminate Spark application
    log.warn('test_etl_job is finished')
    spark.stop()
    return None
Example #16
def main():
    """Main ETL script definition.

    :return: None
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--sf_user', dest='sf_user', help='SF user')
    parser.add_argument('--sf_password',
                        dest='sf_password',
                        help='SF password')
    parser.add_argument('--sf_token', dest='sf_token', help='SF token')
    known_args = parser.parse_args()

    # start Spark application and get Spark session, logger and config
    spark, log, config = start_spark(app_name='my_etl_job',
                                     files=['configs/etl_config.json'])

    # log that main ETL job is starting
    log.warn('etl_job is up-and-running')
    sf_user = known_args.sf_user
    sf_password = known_args.sf_password
    salesforce = Salesforce(username=sf_user,
                            password=sf_password,
                            security_token='')

    # query = "select id, name, annual_sales__c,average_check__c, chain_id__c, chain_name__c, chd_id__c, confidence_level__c, county__c,credit_rating__c,dayparts__c ,dma_name__c ,group_health_system__c ,hours__c ,location_name__c ,menu_items__c ,msa_name__c ,number_of_employees__c ,number_of_rooms__c ,number_of_seats__c ,operation_type__c ,parent_id__c,phone ,units__c ,website,years_in_business__c,yelp_url__c,chd_chain_id__c,Google_Place_ID__c,Qualification_Status__c,current_month_match__c,CustomerStatus__c, ShippingCity,ShippingLatitude,ShippingLongitude,ShippingPostalCode,ShippingState,ShippingStreet,market_segment_list__c,Current_Chd_Name__c, Data_Update_Case__c, exclude_from_chd_match__c, Current_Chd_Shipping_Street__c, Current_Chd_Shipping_City__c, Current_Chd_Shipping_State__c,Current_Chd_Shipping_Postal_Code__c from Account"
    query = "SELECT id, chd_id__c, Google_Place_ID__c from Account"  #Google_Place_ID__c
    # accounts_spark = get_sf_df(query, salesforce, spark)

    query_to_geocode = "SELECT id, Name, ShippingAddress from Account where chd_id__c = null AND google_place_id__c = null"
    accounts_to_geocode_list = salesforce.query_all(query_to_geocode)
    accounts_to_geocode_records = accounts_to_geocode_list['records']
    accounts_to_geocode_pdf = pd.DataFrame(accounts_to_geocode_records)
    # accounts_to_geocode_pdf = convert_simple_salesforce_ordered_dictionary_to_pandas_dataframe(accounts_to_geocode_records)
    accounts_to_geocode_pdf['parsed_address'] = accounts_to_geocode_pdf[
        'ShippingAddress'].apply(lambda x: json.dumps(x))
    accounts_to_geocode_pdf = accounts_to_geocode_pdf.drop(
        ["attributes", "ShippingAddress"], axis=1)
    accounts_to_geocode_spark = spark.createDataFrame(accounts_to_geocode_pdf)
    accounts_to_geocode_spark.printSchema()
    print(accounts_to_geocode_spark.count())
    accounts_to_geocode_spark.write.parquet('tests/chd/sf_accounts_to_geocode',
                                            mode='overwrite')

    accounts = salesforce.query_all(query)
    accounts_pandas = pd.DataFrame(accounts['records'])

    accounts_spark = spark.createDataFrame(accounts_pandas)
    accounts_spark.printSchema()
    accounts_spark.write.parquet('tests/chd/sf_accounts', mode='overwrite')
Example #17
def main():
    # gameid = ['69698', '5151', '60187', '47330', '54928', '10497', '12492', '55307', '2301', '70056', '50500', '74870',
    #           '34768', '35141', '91972', '6922', '69383', '85118', '85452', '69411', '85552', '31074', '69405', '70215',
    #           '59520', '66187', '10056', '85846', '33973', '71417']
    # input_path = 'E:\\tmp\\review_csv'
    # input_path = 'E:\\tmp\\csv_test'
    # output_path = 'E:\\tmp\\output_review'
    spark_session, log, config = start_spark(app_name='nlp_tokenization',
                                             files=['./configs/file_list_config.json'])
    gameid = ('69698', '5151', '60187', '47330', '54928', '10497', '12492', '55307', '2301', '70056', '50500', '74870',
              '34768', '35141', '91972', '6922', '69383', '85118', '85452', '69411', '62422', '31074', '69405', '70215',
              '59520', '66187', '10056', '85846', '33973', '71417')
    data_frame = load_data(spark_session, config[''], str(gameid))
    writer_csv(data_frame, config[''], id.strip())
Example #18
def main():

    spark, log, config = start_spark(app_name='my_spark_app', files=[
        'configs/etl_config.json'])

    log.warn('etl job is up and running')

    # execute ETL pipeline
    data = extract_data(spark)
    data_transformed = transform_data(data, config['steps_per_floor_'])
    load_data(data_transformed)

    log.warn('test etl job finished')
    spark.stop()
    return None
Example #19
def main():
    """

    :return:
    """
    user_dict = 'user_dict.txt'
    user_dict_path = resolve_filename(get_module_res(user_dict))
    print(get_module_res(user_dict).name)
    jieba.load_userdict(user_dict_path)
    stop_path = resolve_filename(get_module_res('stop_word.txt'))
    spark_session, log, config = start_spark(
        app_name='nlp_tokenization', files=['configs/file_list_config.json'])
    stop_words = stop_word(spark_session, stop_path)
    # input_path=config['file_input'], out_put=config['file_output']
    participle(session=spark_session, stop_words=stop_words)
Example #20
def main():

    spark, sc = start_spark(app_name="PySpark - AMRDEF", config='localhost')

    # job-translate-amrdef
    data = translator(
        "file:////home/ivan/Documents/Primestone/Esquemas/AMRDEF_sample_modif_flags_actualdates.xml",
        spark)
    print("\n" * 10, "data translation done", "\n" * 10)

    # job-enrich
    data = enricher(data, spark)
    print("\n" * 10, "data enrichment done", "\n" * 10)

    # job-clean
    data = cleaner(data, spark)
    print("\n" * 10, "data cleaning done", "\n" * 10)
Example #21
def main():
    """Main ETL script definition.

    :return: None
    """
    # start Spark application and get Spark session, logger, config and audit
    spark, logger, config_dict, audit = start_spark(
        app_name='PHM_alinity_i_205_results'
    )

    # log that main ETL job is starting
    logger.info('etl_job is up-and-running')

    # execute ETL pipeline

    data = extract_data(spark, config_dict)
    data_transformed = transform_data(data)

    partition_list = data_transformed.agg(collect_set('transaction_date')).collect()[0][0]
    logger.debug("Partitions to dedup: ")
    logger.debug(str(partition_list))

    cleansed_bucket = config_dict['etl_cleansed_bucket']
    cleansed_key_prefix = config_dict['etl_cleansed_key'] + "/transaction_date="
    s3_utils = S3Utils(spark, config_dict)
    paths_list = []

    for partition_suffix in partition_list:
        if s3_utils.is_key_prefix_empty(cleansed_bucket, cleansed_key_prefix + str(partition_suffix)):
            # append each full S3 path as a single element (extend would add it character by character)
            paths_list.append(config_dict['s3a_prefix'] + cleansed_bucket + cleansed_key_prefix
                              + str(partition_suffix) + "/*")
            logger.debug("Partitions to dedup: " + str(paths_list))

    if paths_list:
        data_cleansed = spark.read.format("parquet").load(paths_list)
        data_deduped = deduplicate_data(data_transformed, data_cleansed)
        load_data(data_deduped, config_dict['s3a_prefix'] + config_dict['etl_cleansed_bucket']
                  + "/" + config_dict['etl_cleansed_key'])
    else:
        load_data(data_transformed, config_dict['s3a_prefix'] + config_dict['etl_cleansed_bucket']
                  + "/" + config_dict['etl_cleansed_key'])

    # log the success and terminate Spark application
    logger.info('test_etl_job is finished')
    stop_spark(spark, config_dict, audit)
    return None
Example #22
def main():
    spark, sql_context, log, config = start_spark(
        app_name='radiography_analysis',
        files=['configs/radiography_analysis_config.json'])

    log.warn('Running radiography analysis...')

    # extracting and transforming the dataset
    [data_normal, data_covid19, data_lung_opacity,
     data_viral_pneumonia] = extract_data(spark)
    data_initial = transform_data(data_normal, data_covid19, data_lung_opacity,
                                  data_viral_pneumonia, sql_context)

    # percentage of samples (different categories)
    data_transformed = transform_percentage_of_samples(data_initial)
    load_data(data_transformed, "percentage_of_samples")

    # take one sample of each group
    data_transformed = transform_take_samples(data_initial)
    load_data(data_transformed, "take_samples")

    # colour distribution
    data_transformed = transform_colour_distribution(data_initial)
    load_data(data_transformed, "colour_distribution")

    # ML classification (distributed)
    data_transformed = transform_ml_classification(data_initial, spark)
    load_data(data_transformed, "ml_classification")

    # The trained model is available in -> /keras_model
    # DL model compiling/training (not distributed)
    # [data_transformed_matrix, data_transformed_acc] = transform_dl_classification(data_initial, spark)
    # load_data(data_transformed_matrix, "dl_classification_matrix")
    # load_data(data_transformed_acc, "dl_classification_accuracy")

    # DL model inference (distributed)
    data_transformed = transform_dl_model_inference(data_initial)
    load_data(data_transformed, "dl_inference")

    log.warn('Terminating radiography analysis...')

    spark.stop()
    return None
Example #23
    def setUp(self):
        """Start Spark, define config and path to test data
        """
        self.spark, self.log, *_ = start_spark(app_name='unittest_etl_job')

        # note: "ingredient" is repeated below; json.loads keeps only the last value for a duplicated key
        self.config = json.loads("""{
                                      "extract"  : {"uri": "tests/test_data/udf_test_data/recipes_negative.json",
                                                    "clean": "True"},
                                      "transform": {"udfs_required":["tominutes"],
                                                    "ingredient": "beef",
                                                    "ingredient": 30,
                                                    "ingredient": 60,
                                                    "ingredient": 60
                                      },
                                      "load"     : {
                                                    "load_path": "output/report.csv"
                                                    }
                                    }
                      """)
Example #24
    def setUp(self):
        """Start Spark, define config and path to test data
        """
        self.spark, *_ = start_spark(app_name='my_etl_job')

        self.config = json.loads("""{
                              "extract"  : {"uri": "tests/test_data/udf_test_data/recipes_positive.json",
                                            "clean": "True",
                                            "temptable": "recipes"},
                              "transform": {"sql_path": "configs/transformation.sql",
                                            "udfs_required":["tominutes"]},
                              "load"     : {"database": "hellofresh",
                                            "tablename": "recipes",
                                            "load_path": "user/hive/warehouse/hellofresh.db/recipes",
                                            "partition_cols": {"difficulty": "string"}
                                            },
                              "impala"     : {"impala_host": "localhost"}
                            }
                            
                            """)
Example #25
def main():
    """Main ETL script definition.
    :return: None
    """
    # start Spark application and get Spark session, logger and config
    spark, log, config = start_spark(
        app_name='my_etl_job',
        files=['configs/etl_config.json'])
    # log that main ETL job is starting
    log.warn('etl_job is up-and-running')

    # execute ETL pipeline
    data = extract_data(spark)
    data_transformed = transform_data(data, config['steps_per_floor'])
    load_data(data_transformed)

    # log the success and terminate Spark application
    log.warn('test_etl_job is finished')
    spark.stop()
    return None
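The steps_per_floor value read here matches the test configuration in Example #1. A minimal transform_data consistent with that call might look like the following, where the floor input column is an assumption about the source data rather than part of the original project:

from pyspark.sql.functions import col

def transform_data(df, steps_per_floor):
    # illustrative only: derive a step count from an assumed 'floor' column
    return df.withColumn('steps_to_desk', col('floor') * steps_per_floor)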
Example #26
def main():
    """
    Main ETL script definition.

    :return: None
    """
    platform = sys.argv[1] if len(sys.argv) > 1 else "local"

    if platform not in ["local", "emr"]:
        platform = "local"

    config_path = "./configs/etl_config.json"

    # start Spark application and get Spark session, logger and config
    spark, log, config = start_spark(app_name="spark-app", files=[config_path])

    # log that main ETL job is starting
    log.warn("spark-app is up-and-running")

    if platform == "local":
        spark.sparkContext.addPyFile("jobs/common.py")

    spark.conf.set("spark.sql.crossJoin.enabled", "true")

    # read config
    config = config[platform]

    # execute ETL pipeline

    # extract
    data_frames = extract_data(spark, log, config)

    # transform
    data_frames = transform_data(spark, log, config, data_frames)

    # load
    load_data(spark, log, config, data_frames)

    # log the success and terminate Spark application
    spark.stop()
    return None
Example #27
def main():
    """

    :return:
    """
    pattern = u'[\s+\.\!\/_,$%^*(+\"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+'
    stop_words_file = './dependencies/stop_word.txt'
    user_dict = './dependencies/user_dict.txt'
    stop_words = stop_word(stop_words_file)
    file_path = 'E:\\workspaces_learn\\taptap-spider'
    spark_session, log, config = start_spark(app_name='nlp_tokenization',
                                             files=['./configs/file_list_config.json'])
    file_list = os.listdir(file_path)
    for file in file_list:
        if file.startswith('app_review'):
            if file.endswith('.csv'):
                file_split = file.split('.')[0]
                file = file_path + '/' + file
                sentences_list = load_data(spark_session, file)
                word_split(sentences_list=sentences_list, stop_words=stop_words, user_dict=user_dict, pattern=pattern,
                           session=spark_session, file=file_split)
Example #28
def main():
    """Main ETL script definition.

    :return: None
    """
    # start Spark application and get Spark session, logger and config
    spark, log, config = start_spark(
        app_name='process_text_job',
        files=['configs/process_text_config.json'])

    # log that main ETL job is starting
    log.warn('process_text_job is up-and-running')

    # execute ETL pipeline
    data = extract_data(spark, config['input_path'])
    data_transformed = transform_data(data)
    load_data(data_transformed, config['output_path'])

    # log the success and terminate Spark application
    log.info('process_text_job is finished')
    spark.stop()
    return None
Example #29
def main():
    """Main ETL script definition.

    :return: None
    """
    # start Spark application and get Spark session, logger and config
    spark, log, config = start_spark(app_name='dpl_ecf',
                                     files=['configs/etl_config.json'])

    # log that main ETL job is starting
    log.warn('dpl_ecf is up-and-running')

    # execute ETL pipeline
    data_frames = extract_data(spark, config)
    data_frames_drop, data_frames_transpose = transform_data(
        data_frames, config)
    load_data(data_frames_drop, data_frames_transpose, config)

    # log the success and terminate Spark application
    log.warn('dpl_ecf is finished')
    spark.stop()
    return None
Example #30
    def __init__(self, app_name, files, jar_packages, dependencies="packages.zip"):
        # start Spark application and get Spark session, logger and config
        self.app_name = app_name
        spark, log, config = start_spark(
            app_name=app_name,
            files=[files],
            master="192.168.122.3:7077",
            jar_packages=jar_packages,
            dependencies=dependencies
        )
        self.kafka_server = config["kafka_server"]
        self.es_server = config["es_server"]
        self.log = log
        self.spark = spark
        self.config = config
        self.es_reader = (spark.read
                          .format("org.elasticsearch.spark.sql")
                          .option("es.nodes", self.es_server)
                          .option("es.net.http.auth.user", "elastic")
                          .option("es.read.field.as.array.include", "winlog.keywords,etl_pipeline")
                          .option("es.read.field.exclude", "tags,user,z_original_message,z_logstash_pipeline"))
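The reader built in __init__ is only configured, not executed. A hedged usage sketch, in which the enclosing class name, the config file, the jar coordinates and the index pattern are all placeholders because the original snippet does not show them:

# hypothetical usage of the class above (its real name is not shown)
job = EsSparkJob(app_name='winlog_etl',
                 files='configs/etl_config.json',
                 jar_packages=['org.elasticsearch:elasticsearch-spark-20_2.11:7.10.2'])  # example coordinates
events_df = job.es_reader.load('winlogbeat-*')  # triggers the actual Elasticsearch read
events_df.printSchema()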