from pyspark.sql import Row
from stockRdd import StockRdd
from dateInterval import DateInterval, DateIntervalManager

sample_data_rdd = sc.textFile("file:///var/data/stocks/historical_data/*.csv").distinct()

yesterday_date = DateInterval.getYesterdayDate()

dailyDateIntervalDictionaryToCalculateFor = DateIntervalManager.createDailyIntervalDictionaryForPastYear(yesterday_date)

number_of_days_in_dictionary = dailyDateIntervalDictionaryToCalculateFor.getNumberOfDaysInDictionary()

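# Require data points on roughly 4 of every 7 calendar days before a symbol is kept;
#   e.g. a 365-day dictionary works out to int((4.0 / 7.0) * 365) = 208 required days.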
minimum_number_of_days = int((4.0 / 7.0) * float(number_of_days_in_dictionary))

mapStockCsvToKeyValueClosure = StockRdd.getMapStockCsvToKeyValueForDatesInDictionaryClosure(dailyDateIntervalDictionaryToCalculateFor)
symbol_creation_function_closure = StockRdd.getSymbolDataInstanceForDateDictionaryDataPointsClosure(dailyDateIntervalDictionaryToCalculateFor, yesterday_date)

symbol_down_stocks_data_filtered = sample_data_rdd.map(mapStockCsvToKeyValueClosure)\
                                           .filter(lambda line: line is not None)\
                                           .reduceByKey(lambda a, b: a + b)\
                                           .map(lambda symbol_and_points: (symbol_and_points[0], StockRdd.sort_and_compute_deltas(list(symbol_and_points[1]))))\
                                           .filter(lambda symbol_and_deltas: len(list(symbol_and_deltas[1])) > minimum_number_of_days)\
                                           .map(symbol_creation_function_closure)\
                                           .filter(lambda symbol_and_instance: symbol_and_instance[1].getTodayPrice() is not None)\
                                           .map(StockRdd.getDownStocksDataTuple)\
                                           .filter(lambda data_tuple: data_tuple[1] is not None)\
                                           .filter(lambda data_tuple: data_tuple[1] != float("inf"))

symbol_down_stocks_data_filtered_rows = symbol_down_stocks_data_filtered\
                                            .map(lambda data_tuple: Row(symbol=data_tuple[0],
                                                                        span_unit_delta_percentage_ratio=data_tuple[1],
                                                                        today_price=data_tuple[2],
                                                                        today_unit_delta_percentage=data_tuple[3]))
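
# A possible follow-up (a sketch, assuming a HiveContext named sqlContext is available, as created in
#   the setup snippet below): build a DataFrame from the Row RDD so the down stocks can be inspected or queried.
symbol_down_stocks_data_frame = sqlContext.createDataFrame(symbol_down_stocks_data_filtered_rows)
symbol_down_stocks_data_frame.orderBy("span_unit_delta_percentage_ratio").show(20)
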
from pyspark import SparkContext
from pyspark.sql import HiveContext

sc = SparkContext(spark_url, spark_context_name, pyFiles=included_python_files_package)
sqlContext = HiveContext(sc)

# Initialize the RDD with the stock data files
sample_data_rdd = sc.textFile(data_files).distinct()

# Create a dictionary of date intervals: 26 two-week spans covering the past year
dateDictionaryToCalculateFor = DateIntervalManager.createDateIntervalDictionaryForPastYear(today_date)

# We want to ensure that any stocks being calculated existed during the entire period
number_of_days_in_dictionary = dateDictionaryToCalculateFor.getNumberOfDaysInDictionary()
minimum_number_of_days_for_stock = int((4.0 / 7.0) * float(number_of_days_in_dictionary))

# map_stock_csv_to_key_value_closure is a function closure that maps each CSV line to a (symbol, data) key-value
#   pair, returning None for lines whose dates fall outside of the time-frame we are concerned about
map_stock_csv_to_key_value_closure = StockRdd.getMapStockCsvToKeyValueForDatesInDictionaryClosure(dateDictionaryToCalculateFor)
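
# For illustration only -- a hypothetical sketch of what such a closure might look like (the real
#   implementation lives in StockRdd; the CSV layout and containsDate helper assumed here are guesses):
def get_map_stock_csv_to_key_value_closure_sketch(date_dictionary):
    def map_stock_csv_line(line):
        fields = line.split(",")
        symbol, date_string, close_price = fields[0], fields[1], fields[-1]
        if not date_dictionary.containsDate(date_string):       # assumed helper on the date-interval dictionary
            return None                                         # dropped by the .filter(... is not None) step
        return (symbol, [(date_string, float(close_price))])    # lists concatenate during reduceByKey
    return map_stock_csv_line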

# symbol_creation_function_closure is a function closure which will convert a symbol's list of CSV data lines
#   into a SymbolData object that can return the data points we need to cluster a stock
symbol_creation_function_closure = StockRdd.getSymbolDataInstanceForDateDictionaryDataPointsClosure(dateDictionaryToCalculateFor, today_date)

# symbol_cluster_data_closure is a function closure which will convert a SymbolData object into the list of
#   data points that the stocks should be clustered by
symbol_cluster_data_closure = StockRdd.getDataToClusterByDateDictionariesClosure(dateDictionaryToCalculateFor)

# symbol_has_none_values_closure is a function closure which will return False if any of the values in the list
#   of data points returned by symbol_cluster_data_closure is None; it will return True otherwise
symbol_has_none_values_closure = StockRdd.getDoesSymbolTupleHaveNoNoneValueClosure()
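
# A sketch of how these closures could chain together, mirroring the down-stocks pipeline shown earlier
#   (an assumption about the wiring, not necessarily the exact pipeline used in the original program):
symbol_cluster_data_rdd = sample_data_rdd.map(map_stock_csv_to_key_value_closure)\
                                         .filter(lambda line: line is not None)\
                                         .reduceByKey(lambda a, b: a + b)\
                                         .map(lambda symbol_and_points: (symbol_and_points[0], StockRdd.sort_and_compute_deltas(list(symbol_and_points[1]))))\
                                         .filter(lambda symbol_and_deltas: len(list(symbol_and_deltas[1])) > minimum_number_of_days_for_stock)\
                                         .map(symbol_creation_function_closure)\
                                         .map(symbol_cluster_data_closure)\
                                         .filter(symbol_has_none_values_closure)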

print "\n\n\n\nAbout to Map and Reduce Data\n\n\n\n"