Example #1
def main():
    spark_conf = SparkConf().setAppName("Text Preprocesser").set(
        "spark.cores.max", "30")

    global sc
    sc = SparkContext(conf=spark_conf)
    sc.setLogLevel("ERROR")
    sc.addFile(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +
        "/lib/util.py")
    sc.addFile(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +
        "/config/config.py")

    global sql_context
    sql_context = SQLContext(sc)

    start_time = time.time()
    preprocess_all()
    end_time = time.time()
    print(
        colored(
            "Preprocessing run time (seconds): {0}".format(end_time -
                                                           start_time),
            "magenta"))
Example #2
def main():
    #spark_conf = SparkConf().setAppName("Text Preprocesser").set("spark.cores.max", "30")

    global sc
    #sc = SparkContext(conf=spark_conf)
    sc_conf = SparkConf()
    sc_conf.set("spark.redis.host", config.REDIS_SERVER)
    sc_conf.set("spark.redis.port", "6379")
    sc = SparkContext(conf=sc_conf)
    sc.setLogLevel("ERROR")
    sc.addFile(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +
        "/lib/util.py")
    sc.addFile(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +
        "/config/config.py")

    global sql_context
    sql_context = SQLContext(sc)

    start_time = time.time()
    preprocess_files(config.S3_BUCKET, config.S3_FOLDER_EXTRACTED)
    end_time = time.time()
    print(
        colored(
            "Preprocessing run time (seconds): {0}".format(end_time -
                                                           start_time),
            "magenta"))
Example #3
def main():
    spark_conf = SparkConf().setAppName("Spark Custom MinHashLSH").set(
        "spark.cores.max", "30")

    global sc
    sc = SparkContext(conf=spark_conf)
    sc.setLogLevel("ERROR")
    sc.addFile(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +
        "/lib/min_hash.py")
    sc.addFile(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +
        "/lib/locality_sensitive_hash.py")
    sc.addFile(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +
        "/lib/util.py")
    sc.addFile(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +
        "/config/config.py")

    global sql_context
    sql_context = SQLContext(sc)

    start_time = time.time()
    run_minhash_lsh()
    end_time = time.time()
    print(
        colored(
            "Spark Custom MinHashLSH run time (seconds): {0}".format(
                end_time - start_time), "magenta"))
Example #4
def main():
    spark_conf = SparkConf().setAppName("Spark Custom MinHashLSH").set("spark.cores.max", "30")

    global sc
    global sql_context    

    sc = SparkContext(conf=spark_conf)
    sc.setLogLevel("ERROR")
    sql_context = SQLContext(sc)
    sc.addFile(os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + "/lib/util.py")
    sc.addFile(os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + "/config/config.py")


    start_time = time.time()
    similarity_scores_df = compare_text()

    config = configparser.ConfigParser()
    config.read('../config/db_properties.ini')
    similarity_scores_df.write.jdbc(
        config['postgres']['url'],
        config['postgres']['table'],
        mode='overwrite',
        properties={
            'user': config['postgres']['user'],
            'password': config['postgres']['password']
        })

    end_time = time.time()
    print(
        colored(
            "Spark MinHash run time (seconds): {0}".format(end_time -
                                                           start_time),
            "magenta"))
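The keys read from ../config/db_properties.ini above imply a layout along these lines; the section and key names come straight from the code, while the values are placeholders:

[postgres]
url = jdbc:postgresql://<host>:5432/<database>
table = <target_table>
user = <db_user>
password = <db_password>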
Example #5

if __name__ == '__main__':

    # Creating the streaming spark context and spark session
    sp_context = SparkContext('local[2]', "Read_Stream")
    ssp_context = StreamingContext(sp_context, 5.000)
    sp_sess = SparkSession.builder.appName('Read_Data').getOrCreate()

    # Adding the files to Spark Context
    # sp_context.addFile("calc_stats_1.py")
    # sp_context.addFile("calc_stats_2.py")
    # sp_context.addFile("init_stats.py")
    # sp_context.addFile("model.py")
    # sp_context.addFile("chances_of_winning.py")
    sp_context.addFile('metrics.py')

    # Reading the CSV files using Spark session
    players = sp_sess.read.csv(play_path, header=True, inferSchema=True)
    teams = sp_sess.read.csv(team_path, header=True, inferSchema=True)

    # Importing the files
    from metrics import *
    """global player_chemistry
    global player_ratings
    global regr_player
    global player_profile
    global match_details
"""
    # Initializing the player chem and ratings
    player_chemistry = init_chemistry(players)
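The StreamingContext above is created with a 5-second batch interval, but the snippet never attaches a stream to it. A minimal sketch of the usual wiring, assuming a text stream on localhost:9999 (host and port are illustrative, not part of the original example):

    # Illustrative only: attach a socket text stream to the StreamingContext
    # created above and process it in 5-second batches.
    lines = ssp_context.socketTextStream("localhost", 9999)
    lines.pprint()                  # print a sample of each batch

    ssp_context.start()             # begin receiving data
    ssp_context.awaitTermination()  # block until the stream is stopped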
Example #6
    # Writing the JSON strings to files
    write_to_file("player_rate.json", player_rate_json)
    write_to_file("player_profile.json", player_prof_json)
    write_to_file("match_details.json", match_details_json)


if __name__ == '__main__':

    # Creating the streaming spark context and spark session
    sp_context = SparkContext('local[2]', "Read_Stream")
    ssp_context = StreamingContext(sp_context, 5.000)
    sp_sess = SparkSession.builder.appName('Read_Data').getOrCreate()

    # Adding the files to Spark Context
    sp_context.addFile("calc_stats_1.py")
    sp_context.addFile("calc_stats_2.py")
    sp_context.addFile("init_stats.py")
    sp_context.addFile("model.py")
    sp_context.addFile("chances_of_winning.py")

    # Reading the CSV files using Spark session
    players = sp_sess.read.csv(play_path, header=True, inferSchema=True)
    teams = sp_sess.read.csv(team_path, header=True, inferSchema=True)

    # Importing the files
    from calc_stats_1 import *
    from calc_stats_2 import *
    from init_stats import *
    from model import *
    from chances_of_winning import *
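Both streaming drivers load the two CSV files with header=True and inferSchema=True, so the column types are inferred from the data rather than declared. A quick, illustrative way to check what was actually inferred (not part of the original code):

    # Illustrative only: inspect the schemas Spark inferred for the two inputs.
    players.printSchema()
    teams.printSchema()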
Example #7
                    "own_goals": i['own_goals'],
                    "yellow_cards": i['yellow_cards'],
                    "red_cards": i['red_cards']
                }
                json_object = json.dumps(dictionary, indent=4)
                # Writing to sample.json
                with open("output_req_2.json", "w") as outfile:
                    print("Writing....to JSON")
                    outfile.write(json_object)
                break


if __name__ == "__main__":
    sp_context = SparkContext('local[2]', "UI")
    sp_sess = SparkSession.builder.appName('user_input').getOrCreate()
    sp_context.addFile("model.py")
    input_file = sys.argv[1]
    with open(input_file, 'r') as file:
        content = file.read()
        input_data = eval(content)
        if input_data["req_type"] == 1:
            # calling predict function:
            """
            output = predict(input_)
            """
            predict_helper(input_data)

        elif input_data["req_type"] == 2:
            # calling profile function
            player_profile_helper(input_data)
Example #8
                json_object = json.dumps(dictionary, indent=4) 
                # Writing to sample.json 
                with open("output_req_3.json", "w") as outfile:
                    outfile.write(json_object) 
                return
    with open("output_req_3.json", "w") as outfile:
        dicte = {"match_found":False}
        outfile.write(json.dumps(dicte, indent=4))    



if __name__ == "__main__":
    sp_context = SparkContext('local[2]', "UI")
    sp_sess = SparkSession.builder.appName('user_input').getOrCreate()
    sp_context.addFile("metrics.py")
    input_file = sys.argv[1]
    with open(input_file, 'r') as file:
        content = file.read()
        input_data = eval(content)
        if 'req_type' not in input_data:
            # calling match info function 
            match_data_helper(input_data)
        else:
            if input_data["req_type"] == 1:
                # calling predict function:
                """
                output = predict(input_)
                """
                predict_helper(input_data)
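Both UI entry points parse the request file with eval(), which will execute any expression found in that file. For dict-style input a safer drop-in is ast.literal_eval from the standard library; the sketch below is a suggestion, not part of the original code:

    # Safer alternative to eval() for reading the request dictionary.
    # ast.literal_eval accepts only Python literals (dicts, lists, strings,
    # numbers, booleans, None), so arbitrary code in the file is rejected.
    import ast

    with open(input_file, 'r') as file:
        input_data = ast.literal_eval(file.read())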