def __init__(self):
    """Set up the network locations, reference dates and zip-to-table map.

    Attributes set:
        report_path, temp_in_files, temp_out_files, zip_path: UNC directory
            strings, each ending in a backslash.
        most_recent_sunday: value of ``find_most_recent_sunday()`` rendered
            by ``datetime_to_string_y_m_d(..., "-")``.
        first_of_month: value of
            ``return_ymd_first_of_the_current_month("-")``.
        monthly_zip: list of ``(zip file name, table name)`` pairs.
    """
    self.report_path = "\\\\psstats03\\reports\\"
    self.temp_in_files = "\\\\filer01\\public\\Data_Analytics\\Data_Distributions\\temporary_files\\in_files\\"
    self.temp_out_files = "\\\\filer01\\public\\Data_Analytics\\Data_Distributions\\temporary_files\\out_files\\"
    # NOTE(review): this path spells the folder "Data Distributions" (with a
    # space) while the paths above use "Data_Distributions" -- confirm which
    # one is intended.
    self.zip_path = "\\\\filer01\\public\\Data_Analytics\\Data Distributions\\temporary_files\\out_files\\"

    sunday = dates.DateFunctions().find_most_recent_sunday()
    self.most_recent_sunday = dates.DateFunctions().datetime_to_string_y_m_d(
        sunday, "-")
    self.first_of_month = dates.DateFunctions(
    ).return_ymd_first_of_the_current_month("-")

    # Each category appears twice: once plain and once as "... Overall".
    self.monthly_zip = [
        ("Ryan Data Dist ONET 40.zip", 'onet_forty'),
        ("Ryan Data Dist ONET 40 Overall.zip", 'onet_forty_overall'),
        ("Ryan Data Dist Month by Month NAICS 40.zip", 'naics_codes'),
        ("Ryan Data Dist Month by Month NAICS 40 Overall.zip",
         'naics_codes_overall'),
        ("Ryan Data Dist Metros.zip", 'metro_main'),
        ("Ryan Data Dist Metros Overall.zip", 'metro_main_overall'),
        ("Ryan Data Dist Degree Affinities.zip", 'degree_affinities'),
        ("Ryan Data Dist Degree Affinities Overall.zip",
         'degree_affinities_overall'),
        ("Ryan Data Dist Company Sizes.zip", 'company_size'),
        ("Ryan Data Dist Company Sizes Overall.zip", 'company_size_overall'),
        ("Ryan Data Dist Age Buckets.zip", 'age_ranges'),
        ("Ryan Data Dist Age Buckets Overall.zip", 'age_ranges_overall'),
        ("Ryan Data Dist YE.zip", 'years_experience'),
        ("Ryan Data Dist YE Overall.zip", 'years_experience_overall'),
        ("Ryan Data Dist Rollups.zip", 'rollups'),
        ("Ryan Data Dist Rollups Overall.zip", 'rollups_overall'),
    ]
def find_weekly_date_values():
    """Return the weekly reporting window as two ``YYYYMMDD`` strings.

    Returns:
        tuple: ``(closest_sunday, weeks_prior)`` where the first item is the
        value of ``find_most_recent_sunday()`` and the second is what
        ``create_prior_date_by_weeks(closest_sunday, 6)`` produces, both
        formatted with ``"%Y%m%d"``.
    """
    stamp = "%Y%m%d"
    sunday = dates.DateFunctions().find_most_recent_sunday()
    earlier = dates.DateFunctions().create_prior_date_by_weeks(sunday, 6)
    return str(sunday.strftime(stamp)), str(earlier.strftime(stamp))
def switchDates(self, weekly):
    """Relabel and reformat the start/end date pickers.

    Args:
        weekly: only the literal ``True`` selects the weekly presentation;
            any other value selects the monthly one.
    """
    if weekly is not True:
        # Monthly presentation: month/year pickers with fixed captions.
        self.startDateLabel.setText('From 1st of ')
        self.startDate.setDisplayFormat('MM.yyyy')
        self.endDateLabel.setText('Through 31st of')
        self.endDate.setDisplayFormat('MM.yyyy')
        return

    # Weekly presentation: captions show the end_week() value of each date.
    first_week = df.DateFunctions().end_week(
        self.startDate.date().toPyDate())
    self.startDateLabel.setText('From Week starting ' + str(first_week))
    self.startDate.setDisplayFormat('MM.dd.yyyy')

    final_week = df.DateFunctions().end_week(self.endDate.date().toPyDate())
    self.endDateLabel.setText('Through Week starting ' + str(final_week))
    self.endDate.setDisplayFormat('MM.dd.yyyy')
def update_database_with_monthly_values(self, dictionary, data_table,
                                        mapping=None, rollup_metro=False):
    """Insert monthly profile counts from a nested dict into ``data_table``.

    Args:
        dictionary: ``{date_key: {category: value}}``. With
            ``rollup_metro=False`` each value is stored as the profile count;
            with ``rollup_metro=True`` each value is an iterable of
            ``(rollup, profiles)`` pairs. ``None`` categories are skipped.
        data_table: destination table name. It is concatenated into the SQL
            text (identifiers cannot be bound as parameters), so it must come
            from trusted code.
        mapping: optional ``{category: stored value}`` translation. Required
            when ``rollup_metro`` is True.
        rollup_metro: only the literals ``False`` and ``True`` select a
            branch; any other value inserts nothing.

    The connection is now closed even when an insert raises; in that case
    nothing is committed.
    """
    conn = sqlite3.connect(self.sql_data_base)
    try:
        c = conn.cursor()
        if rollup_metro is False:
            print('Placing data into ' + data_table + ' \n')
            insert_sql = 'INSERT INTO ' + data_table + ' VALUES (?,?,?)'
            for keys in dictionary:
                use_date = daten.DateFunctions().return_full_year_month_day(
                    str(keys))
                categories = {
                    new_key for new_key in dictionary[keys]
                    if new_key is not None
                }
                for value in categories:
                    profiles = dictionary[keys][value]
                    stored = value if mapping is None else mapping[value]
                    c.execute(insert_sql, (use_date, stored, profiles))
        elif rollup_metro is True:
            insert_sql = 'INSERT INTO ' + data_table + ' VALUES (?,?,?,?)'
            for keys in dictionary:
                # NOTE: unlike the branch above, the key is passed as-is
                # (no str() conversion), matching the previous behaviour.
                use_date = daten.DateFunctions().return_full_year_month_day(
                    keys)
                categories = {
                    new_key for new_key in dictionary[keys]
                    if new_key is not None
                }
                for value in categories:
                    for pair in dictionary[keys][value]:
                        rollup = pair[0]
                        profiles = pair[1]
                        c.execute(insert_sql,
                                  (use_date, rollup, mapping[value], profiles))
        conn.commit()
    finally:
        conn.close()
def extract_data(self, full_zip_file, file_name):
    """Extract one CSV member from the zip and bucket its rows.

    Members whose name contains the string returned by
    ``return_year_month_as_string("")`` are ignored. For every other member
    the file is extracted into ``self.working_directory`` and each row with
    at least 16 fields is passed to ``self.data_pull``; the result goes to
    the internal lists when its element 6 is the literal ``False`` and to
    the customer lists otherwise, together with a ``YYYY-MM`` string sliced
    from the file name.

    Args:
        full_zip_file: object providing ``extract(name, directory)``.
        file_name: member name; the date is read from the characters that
            follow a prefix as long as ``'JobTitleMatchingV2_'``.
    """
    current_year_month = dates.DateFunctions().return_year_month_as_string(
        "")
    if current_year_month in file_name:
        return

    full_zip_file.extract(file_name, self.working_directory)
    print("Working on " + file_name)

    # The date stamp depends only on the file name, so slice it once.
    prefix_length = len('JobTitleMatchingV2_')
    stamp = file_name[prefix_length:prefix_length + 10]
    final_date = stamp[0:4] + '-' + stamp[4:6]

    with open(self.working_directory + "\\" + file_name) as reader:
        for row in csv.reader(reader, delimiter=',', quotechar='"'):
            # Get rid of bad rows
            if len(row) < 16:
                continue
            data = self.data_pull(row)
            if data[6] is not False:
                self.is_customer_data.append(data)
                self.is_customer_dates.append(final_date)
            else:
                self.is_internal_data.append(data)
                self.is_internal_dates.append(final_date)
def create_weekly_statistics(self, active_list, total_list):
    """Compare the reference week's active share with the pooled other weeks.

    Args:
        active_list: sequence of ``(date, active_count)`` entries.
        total_list: sequence of ``(date, total_count)`` entries, read at the
            same positions as ``active_list``.

    Returns:
        tuple: ``(last_sunday, current active, current inactive,
        current_ratio, past active, past inactive, past_ratio,
        final_stats[0], final_stats[1], current_ratio - past_ratio)``, where
        "inactive" is total minus active and ``final_stats`` is whatever
        ``calculate_chi_square()`` returns.

    Raises:
        IndexError: no examined entry is dated two Sundays ago.
        ZeroDivisionError: a total used as a denominator is zero.

    Both ratios are now active / total. Previously the current ratio was
    active / inactive, so the reported difference mixed two measures.
    """
    last_sunday = self.find_weekly_date_values()[0]
    two_sundays_ago = str(
        dates.DateFunctions().find_date_two_sundays_ago().strftime("%Y%m%d"))

    current_distribution = []
    past_active = 0
    past_total = 0
    # NOTE(review): the range stops one short of the end, so the final entry
    # of active_list is never examined -- confirm this is intentional.
    for i in xrange(0, len(active_list) - 1):
        active = active_list[i][1]
        total = total_list[i][1]
        if str(active_list[i][0]) == two_sundays_ago:
            current_distribution.append(active)
            current_distribution.append(total - active)
        else:
            past_active += active
            past_total += total

    if len(current_distribution) < 2:
        # Fail here rather than with an opaque index error at the return.
        raise IndexError(
            "no entry dated " + two_sundays_ago + " found in active_list")

    current_distribution.append(past_active)
    current_distribution.append(past_total - past_active)

    current_active = current_distribution[0]
    current_inactive = current_distribution[1]
    current_ratio = float(current_active) / float(
        current_active + current_inactive)
    past_ratio = float(past_active) / float(past_total)

    stats = chi.ChiSquaredIndependence(current_distribution)
    stats.calculate_expected_values()
    final_stats = stats.calculate_chi_square()

    return (last_sunday, current_distribution[0], current_distribution[1],
            current_ratio, current_distribution[2], current_distribution[3],
            past_ratio, final_stats[0], final_stats[1],
            current_ratio - past_ratio)
def solver_alt(self):
    """Build a ``pull_stats.Solver`` from the current selections and show it.

    The solver's canvas is re-parented to a new ``AnswerPopup``, the popup
    takes the solver's ``v_box_data`` layout, and the popup is shown.
    """
    self.answer = gui_classes.AnswerPopup()

    begin = df.DateFunctions().end_week(
        self.dates.startDate.date().toPyDate())
    end = df.DateFunctions().end_week(self.dates.endDate.date().toPyDate())

    # Read the widgets in the same order the solver receives them.
    selected_index = self.combo.currentIndex()
    variable = self.variables.variable_box.currentText()
    country = self.variables.metroCountry.currentText()
    state = self.variables.metroSt.currentText()
    city = self.variables.metroCity.currentText()

    self.data = pull_stats.Solver(selected_index, begin, end, variable,
                                  country, state, city)
    self.data.dist.canvas.setParent(self.answer)
    self.answer.setLayout(self.data.v_box_data)
    self.answer.show()
def __init__(self, min_date, max_date):
    """Store connection settings, the date window and the search query.

    Args:
        min_date: kept unchanged as ``self.min_date``.
        max_date: kept unchanged as ``self.max_date``.
    """
    # NOTE(review): host, user and password are literals in source -- confirm
    # whether they should come from configuration instead.
    self.host = 'digger'
    self.user = '******'
    self.password = '******'
    self.port = 8089
    self.date_format = df.DateFunctions()
    self.min_date = min_date
    self.max_date = max_date

    table_fields = [
        "AccountID", "AccountName", "Algorithm", "AllQueries", "Created",
        "Email", "JobIndex", "NumSearches", "NumViewMore", "PayscaleTitle",
        "Query", "SalesForceAccountID", "SubscriptionType",
    ]
    self.search_string = ("search sourcetype=PSPJobMatching | table " +
                          ", ".join(table_fields))
def update_database_with_weekly_values(self, count_dict, data_table,
                                       medians=False):
    """Insert weekly values from ``count_dict`` into ``data_table``.

    Args:
        count_dict: ``{date_key: value}``. With ``medians=False`` the value
            is stored directly in a two-column row. With ``medians=True``
            the value is indexed and items 1, 2 and 3 are stored as the
            25th percentile, median and 75th percentile.
        data_table: destination table name. It is concatenated into the SQL
            text, so it must come from trusted code.
        medians: only the literals ``False`` and ``True`` do any work; any
            other value returns without touching the database.

    The medians branch used to insert into a hard-coded ``eac_weekly`` table
    while logging ``data_table``; it now writes to ``data_table``. The
    connection is closed even when an insert raises, in which case nothing
    is committed.
    """
    if medians is False:
        insert_sql = 'INSERT INTO ' + data_table + ' VALUES (?,?)'
    elif medians is True:
        insert_sql = 'INSERT INTO ' + data_table + ' VALUES (?,?,?,?)'
    else:
        return

    # Inserting data into the created tables
    print('Placing data into ' + data_table + ' \n')
    conn = sqlite3.connect(self.sql_data_base)
    try:
        c = conn.cursor()
        for keys in count_dict:
            use_date = daten.DateFunctions().return_full_year_month_day(
                str(keys))
            if medians is True:
                entry = count_dict[keys]
                twenty_fifth = entry[1]
                median = entry[2]
                seventy_fifth = entry[3]
                params = (use_date, twenty_fifth, median, seventy_fifth)
            else:
                params = (use_date, count_dict[keys])
            c.execute(insert_sql, params)
        conn.commit()
    finally:
        conn.close()
def getData(self, query, start, weekly):
    """Run ``query``, collect its rows as data points and list them in a box.

    Each row's first column is converted with ``from_date`` and appended to
    ``self.data_points_x``; its second column is appended to
    ``self.data_points_y``. Afterwards a text box listing every point as
    ``x<TAB>y`` is added to ``self.hboxNums`` behind a "Data points" label.

    Args:
        query: SQL text. The monthly path passes it through ``str()`` first,
            as before.
        start: first expected date. In weekly mode it is first normalised
            with ``end_week``.
        weekly: only the literal ``True`` selects the weekly path (7-day
            step, rows whose date text starts with "9" are skipped); any
            other value selects the monthly path (1-month step).

    Missing periods are NOT zero-filled: the code that appended zeros was
    already disabled, so only rows returned by the query become data points.
    """
    to_date = df.DateFunctions().from_date

    if weekly is True:
        start = df.DateFunctions().end_week(start)
        step = relativedelta(days=+7)
        rows = self.c.execute(query)
    else:
        step = relativedelta(months=+1)
        rows = self.c.execute(str(query))

    for row in rows:
        # Keeping out values from before 2010 (weekly series only)
        if weekly is True and str(row[0]).startswith("9"):
            continue
        row_date = to_date(row[0])
        # Walk the expected-date cursor up to this row. With gap filling
        # disabled this has no visible effect; it is kept so that zero-fill
        # can be restored here without restructuring the loop.
        while start < row_date:
            start = start + step
        self.data_points_x.append(row_date)
        self.data_points_y.append(row[1])
        start = row_date + step

    # puts data in box for user retrieval
    text = QtGui.QTextEdit()
    text.setText("".join(
        str(x_value) + "\t" + str(y_value) + "\n"
        for x_value, y_value in zip(self.data_points_x, self.data_points_y)))
    self.hboxNums.addWidget(QtGui.QLabel("Data points"))
    self.hboxNums.addWidget(text)
def __init__(self, start, end, rollup, country, state, city):
    """Load and graph ``metro_rollups`` profiles for one rollup in one metro.

    Args:
        start: lower date bound, concatenated into the query as text.
        end: upper date bound, concatenated into the query as text.
        rollup: rollup name matched against the ``rollup`` column.
        country, state, city: passed to ``getMetroKey`` to resolve the metro
            key; ``city`` and ``state`` also appear in the figure title.
    """
    super(RollupMetro, self).__init__()
    self.key = self.getMetroKey(country, state, city)

    # getData() only accepts finished SQL text, so the value cannot be bound
    # as a parameter. Double any embedded single quote so a name containing
    # an apostrophe cannot end the string literal early.
    rollup_literal = ("%s" % (rollup,)).replace("'", "''")
    query = "SELECT date, profiles FROM metro_rollups" + " WHERE date >= " + \
        start + " AND date <= " + end + " AND rollup='" + rollup_literal + \
        "' AND metro_key=" + self.key + " ORDER BY date ASC"

    self.getData(query, df.DateFunctions().from_date(start), False)
    self.graph()
    self.fig.suptitle(rollup + ' Rollup in ' + city + ', ' + state +
                      ' Metro Area Profiles')
    self.start = start
    self.end = end
    self.rollup = rollup
def __init__(self, start, end, country=0, state=0, city=0, metrokey=0):
    """Load and graph ``metro_main`` profiles for one metro key.

    Args:
        start: lower date bound, concatenated into the query as text.
        end: upper date bound, concatenated into the query as text.
        country, state, city: used to look the key up through
            ``getMetroKey`` when ``metrokey`` equals 0.
        metrokey: key to use directly when it does not equal 0.

    The title names the city and state unless ``city`` equals 0, in which
    case it reads 'All Metro Area Profiles'.
    """
    super(Metro, self).__init__()

    key = metrokey
    if metrokey == 0:
        key = self.getMetroKey(country, state, city)

    query = "SELECT date, profiles FROM metro_main WHERE date >= " + start
    query = query + " AND date <= " + end
    query = query + " AND key=" + key
    query = query + " ORDER BY date ASC"

    self.getData(query, df.DateFunctions().from_date(start), False)
    self.graph()

    title = (city + ', ' + state + ' Metro Area Profiles'
             if city != 0 else 'All Metro Area Profiles')
    self.fig.suptitle(title)

    self.start = start
    self.end = end
def find_monthly_date_values():
    """Return the first of the month six months back and of this month.

    Returns:
        tuple: ``(six_months_back, first_of_month)``. ``first_of_month`` is
        the string from ``return_ymd_first_of_the_current_month("")``; the
        other item is ``YYYYMM01`` for the month six months earlier.
    """
    first_of_month = dates.DateFunctions(
    ).return_ymd_first_of_the_current_month("")
    month = int(first_of_month[4:6])
    year = int(first_of_month[:4])

    # Count months from year 0 so the step back rolls over years naturally.
    month_index = year * 12 + (month - 1) - 6
    start_year, zero_based_month = divmod(month_index, 12)

    window_start = str(start_year) + ("%02d" % (zero_based_month + 1)) + "01"
    return window_start, first_of_month
def __init__(self, index, start, end, other):
    """Load and graph one monthly distribution series.

    Args:
        index: selector; ``index - 4`` picks the table, filter column and
            display name from the list below.
        start: lower date bound, concatenated into the query as text.
        end: upper date bound, concatenated into the query as text.
        other: value matched against the filter column. For ``index`` 6 the
            first 3 characters are the code and the text from position 6 on
            is the display name; for ``index`` 8 the split is at 10 and 13.

    Raises:
        IndexError: ``index - 4`` is outside the list of known series.
    """
    super(MonthlyDist, self).__init__()

    # (table, filter column, display name)
    series = [
        ("company_size", "size_range", 'Company Size'),
        ("degree_affinities", "level", 'Degree Level'),
        ("naics_codes", "code", 'NAICS Code'),
        ("age_ranges", "age_range", 'Years Old'),
        ("onet_forty", "code", 'ONET Code'),
        ("years_experience", "experience_range", 'Years Experience'),
        ("rollups", "name", 'Rollup'),
    ]
    self.table, self.column, display_name = series[index - 4]

    name = None
    if index == 6:
        name = other[6:]
        other = other[:3]
    if index == 8:
        name = other[13:]
        other = other[:10]
    self.index = index
    self.other = other

    # getData() only accepts finished SQL text, so the value cannot be bound
    # as a parameter. Double any embedded single quote so a value containing
    # an apostrophe cannot end the string literal early.
    other_literal = ("%s" % (other,)).replace("'", "''")
    query = "SELECT date, profiles FROM " + self.table + " WHERE date >= " \
        + start + " AND date <= " + end + " AND " + self.column + "='" + \
        other_literal + "' ORDER BY date ASC"

    self.getData(query, df.DateFunctions().from_date(start), False)
    self.start = start
    self.end = end
    self.graph()

    # The former ``index == 11`` title branch was unreachable: index 11 has
    # no entry in the series list, so the lookup above raises first.
    if index == 6 or index == 8:
        self.fig.suptitle(other + ' ' + name + ' Profiles')
    else:
        self.fig.suptitle(display_name + ' ' + other + ' Profiles')
def __init__(self, index, start, end):
    """Load and graph one weekly series.

    Args:
        index: position in the series list below (table, column, title).
        start: lower date bound, concatenated into the query as text.
        end: upper date bound, concatenated into the query as text.
    """
    super(WeeklyDist, self).__init__()

    # (table, value column, figure title)
    series = [
        ("active_profiles", "profiles", 'All Active Profiles'),
        ("overall_profiles", "profiles", 'All Profiles'),
        ("eac_weekly", "median", 'Median EAC'),
        ("combined_salary_weekly", "median", 'Median Combined Salary'),
    ]
    self.table, self.column, title = series[index]

    query = "SELECT date, " + self.column + " FROM " + self.table
    query = query + " WHERE date >= " + start
    query = query + " AND date <= " + end
    query = query + " ORDER BY date ASC"

    self.getData(query, df.DateFunctions().from_date(start), True)
    self.start = start
    self.end = end
    self.graph()
    self.fig.suptitle(title)
import datetime

import data_distributions as data_dist
import DateFunctions.date_functions as dates
import data_dist_database as db

analytics_file_path = '\\\\filer01\\public\\Data_Analytics\\Data_Distributions\\temporary_files\\in_files\\'

#Find the most recent sunday. This is for the week by week data pulls
#since the weeks end on sunday
most_recent_sunday = dates.DateFunctions().find_most_recent_sunday()
most_recent_sunday = dates.DateFunctions().datetime_to_string_y_m_d(
    most_recent_sunday, "-")


def find_weekly_query_start_date(table):
    """Return the day after the table's most recent date as ``YYYY-MM-DD``.

    The stored value is read as ``YYYYMMDD``. The next day is computed with
    real calendar arithmetic; adding 1 to the number, as before, produced
    invalid dates at month ends (20140831 became "2014-08-32").

    Args:
        table: table name passed to ``pull_most_recent_date_value``.
    """
    stored_value = db.DatabaseWork().pull_most_recent_date_value(table)
    try:
        last_run = datetime.datetime.strptime(
            str(stored_value)[:8], "%Y%m%d")
    except ValueError:
        # Not a parseable YYYYMMDD value: keep the previous numeric
        # behaviour rather than introduce a new failure mode.
        digits = str(stored_value + 1)
        return digits[:4] + "-" + digits[4:6] + "-" + digits[6:8]
    next_day = last_run + datetime.timedelta(days=1)
    return next_day.strftime("%Y-%m-%d")


def find_monthly_query_start_date(table):
    # NOTE(review): the rest of this function is outside the visible
    # excerpt; the visible statements are reproduced unchanged.
    last_run_date = str(db.DatabaseWork().pull_most_recent_date_value(table))
    year = int(last_run_date[:4])
    month = int(last_run_date[4:6]) + 1
    if month == 13:
        month = 1
def create_monthly_statistics(self, active_list, total_list, category):
    """Yield one statistics row per key comparing last month with the rest.

    Entries whose date text contains last month's ``YYYYMM`` prefix are
    "current"; all others are "past". Each entry is accumulated through
    ``self.create_monthly_values_dictionary(entry, target_dict)``.

    Args:
        active_list: entries for active profiles; ``entry[0]`` is the date.
        total_list: entries for all profiles; ``entry[0]`` is the date.
        category: label repeated as the first field of every row.

    Yields:
        tuple: ``(category, key, current active, current total,
        current_ratio, past active, past total, past_ratio, final_stats[0],
        final_stats[1], current_ratio - past_ratio)``. ``final_stats`` is the
        result of ``calculate_chi_square()``, or ``("N/A", "N/A")`` when any
        of the four cell counts is 5 or less.

    The small-cell check used to read ``number in distribution_list <= 5``,
    a chained comparison that never flagged small cells, so the "N/A" path
    was never taken. It now tests each count directly.
    """
    past_actives = {}
    past_totals = {}
    current_actives = {}
    current_total = {}
    last_month = dates.DateFunctions().return_ymd_first_of_last_month(
        "")[:6]

    # Same four passes, in the same order, as the previous implementation.
    passes = (
        (active_list, past_actives, False),
        (total_list, past_totals, False),
        (active_list, current_actives, True),
        (total_list, current_total, True),
    )
    for entries, target, want_current in passes:
        for entry in entries:
            if (last_month in str(entry[0])) is want_current:
                self.create_monthly_values_dictionary(entry, target)

    for keys in past_totals:
        current_active_count = current_actives.get(keys, 0)
        if keys in current_total:
            current_total_count = current_total[keys] - current_active_count
        else:
            current_total_count = 0

        try:
            current_ratio = float(current_active_count) / float(
                current_active_count + current_total_count)
        except ZeroDivisionError:
            current_ratio = 0

        past_active_count = past_actives.get(keys, 0)
        past_ratio = float(past_active_count) / float(past_totals[keys])

        distribution_list = [
            current_active_count,
            current_total_count,
            past_active_count,
            past_totals[keys] - past_active_count,
        ]

        # Skip the chi-square test when any cell count is 5 or less.
        if all(number > 5 for number in distribution_list):
            stats = chi.ChiSquaredIndependence(distribution_list)
            stats.calculate_expected_values()
            final_stats = stats.calculate_chi_square()
        else:
            final_stats = ("N/A", "N/A")

        yield (category, keys, current_active_count,
               current_total_count + current_active_count, current_ratio,
               past_active_count, past_totals[keys], past_ratio,
               final_stats[0], final_stats[1], current_ratio - past_ratio)
def __init__(self, index, start, end, other_var=0, country=0, state=0,
             city=0):
    """Create the distribution object selected by ``index`` and lay it out.

    ``index`` below 4 builds a ``WeeklyDist``, 4 to 10 a ``MonthlyDist``,
    11 a ``RollupMetro`` and 12 a ``Metro``. Weekly series receive the dates
    converted with ``return_full_year_month_day``; the others receive the
    ``start_month`` of each date converted the same way.

    Args:
        index: series selector.
        start, end: date bounds before conversion.
        other_var: extra selection passed to ``MonthlyDist``/``RollupMetro``.
        country, state, city: metro selection for the metro series.
    """

    def as_ymd(value):
        return df.DateFunctions().return_full_year_month_day(value)

    def month_start_ymd(value):
        return df.DateFunctions().return_full_year_month_day(
            df.DateFunctions().start_month(value))

    self.v_box_data = None

    if index < 4:
        self.dist = WeeklyDist(index, as_ymd(start), as_ymd(end))
    elif index < 11:
        self.dist = MonthlyDist(index, month_start_ymd(start),
                                month_start_ymd(end), other_var)
    elif index == 11:
        self.dist = RollupMetro(month_start_ymd(start), month_start_ymd(end),
                                other_var, country, state, city)
    elif index == 12:
        self.dist = Metro(month_start_ymd(start), month_start_ymd(end),
                          country, state, city)

    self.dist.special()
    self.layout()
import sys sys.path.append("C:\\hg\\payscale\\users\\ryanm\\PayScaleAnalytics\\") import datetime import csv import os import smtplib from email.mime.multipart import MIMEMultipart from email.mime.text import MIMEText import DataDistributions.data_dist_database as db import DateFunctions.date_functions as df import DataDistributions.data_distributions as dd alert_writing_path = '\\\\filer01\\public\\Data_Analytics\\Data_Distributions\\Distributions_Alert_Report\\' today = datetime.datetime.today() string_today = df.DateFunctions().datetime_to_string_y_m_d(today, "-") print "Running weekly queries..." dd.DataDistributions().run_weekly_analysis_tool_queries() dd.DataDistributions().update_weekly_database() if today.day < 9: print "Running monthly queries..." dd.DataDistributions().run_monthly_analysis_tool_queries() dd.DataDistributions().update_monthly_database() monthly_reports = [ 'onet_forty', 'onet_forty_overall', 'naics_codes', 'naics_codes_overall', 'metro_main', 'metro_main_overall', 'degree_affinities', 'degree_affinities_overall', 'company_size', 'company_size_overall', 'age_ranges', 'age_ranges_overall', 'years_experience',
import csv

import DateFunctions.date_functions as df

# Copy out.txt to fixed_dates.txt, rewriting the "ActivityDateTime" column of
# every data row with rob_date_return(). The header row is copied unchanged.
# Files are opened in binary mode, as in the original script.
with open("C:\\users\\ryanm\\big_fancy_file\\out.txt", 'rb') as R, \
        open("C:\\users\\ryanm\\big_fancy_file\\fixed_dates.txt", 'wb') as W:
    reader = csv.reader(R, delimiter=',')
    writer = csv.writer(W, delimiter=',', lineterminator='\n')

    header = next(reader)
    writer.writerow(header)

    # The column position never changes, so look it up once. A header
    # without the column now fails here even when there are no data rows.
    date_column = header.index("ActivityDateTime")

    for row in reader:
        row[date_column] = df.DateFunctions().rob_date_return(
            row[date_column])
        writer.writerow(row)