def get_capacity_data():
    """Collect per-user average capacity for several user groups.

    Capacity is ``avg(ground_truth_rate / speed)`` over a user's rows in
    ``valid_tests`` with ``speed = 100``.

    Returns:
        A ``defaultdict(list)`` keyed by group name (``"partner"``, ``"hot"``,
        ``"bezeq"``, ``"all"``, ``"not_hot"``).  Each value is a list of
        ``(user_name, capacity)`` tuples.  A group whose query returns no
        rows gets no key.
    """
    query_template = """
        select user_name, avg(ground_truth_rate / speed) capacity
        from valid_tests
        where {user_filter}speed = 100
        group by user_name
    """
    # (result key, extra ``where`` clause restricting the users).
    group_filters = (
        ("partner", "user_name in (select * from partner_users) and "),
        ("hot", "user_name in (select * from hot_users) and "),
        ("bezeq", "user_name in (select * from bezeq_users) and "),
        ("all", ""),
        # NOTE: "not_hot" must exclude hot_users; excluding bezeq_users here
        # would contradict the key name.
        ("not_hot", "user_name not in (select * from hot_users) and "),
    )

    result = defaultdict(list)
    engine = get_engine()
    cur = engine.cursor()
    for group, user_filter in group_filters:
        cur.execute(query_template.format(user_filter=user_filter))
        for user_name, capacity in cur.fetchall():
            result[group].append((user_name, capacity))
    return result
def get_household_data() -> List[Dict[str, float]]:
    """Pair each LAN user's capacity with their household size.

    Joins per-user, per-speed capacities (``avg(ground_truth_rate) / speed``
    over LAN tests) with the household size recorded in ``testers``.

    Returns:
        One dict per joined row, with the household size under ``NUM_USERS``
        and the capacity under ``CAPACITY``, both converted to ``float``.
    """
    household_query = """
        with user_household_sizes as (
            select "יוזר" as user_name, "מס_נפשות_בבית"::int as "household_size"
            from testers
            where "מס_נפשות_בבית" is not null
            order by "מס_נפשות_בבית"::int desc
        ),
        user_capacities as (
            select user_name, avg(ground_truth_rate) / speed capacity
            from valid_tests
            where connection = 'LAN'
            group by user_name, speed
        )
        select user_capacities.user_name, capacity, household_size
        from user_capacities
        join user_household_sizes on user_capacities.user_name = user_household_sizes.user_name
        ;
    """
    connection = get_engine()
    cur = connection.cursor()
    cur.execute(household_query)
    # The user name is selected but not needed in the output.
    return [
        {NUM_USERS: float(household_size), CAPACITY: float(capacity)}
        for _user_name, capacity, household_size in cur.fetchall()
    ]
def main():
    """Parse the command line and run the selected engine's ``install``.

    The ``--engine``/``-e`` option (default ``'default'``) is looked up in
    ``utils.get_engines()``.  When it is known, the object returned by
    ``utils.get_engine(name)`` is instantiated and its ``install`` method is
    called with the parsed arguments; otherwise an error line is printed.
    """
    # NOTE(review): the usage string advertises "[paths...]" but no positional
    # argument is declared, so argparse rejects any path given — confirm.
    parser = argparse.ArgumentParser(prog="download",
                                     usage="%(prog)s [options] [paths...]\n",
                                     add_help=True)
    # Named ``main_group`` so it does not shadow this function.
    main_group = parser.add_argument_group("Main")
    main_group.add_argument('--cache', help='Cache package', dest='cache',
                            action='store_true')
    parser.add_argument('--engine', '-e', help='foo help', dest='engine',
                        default='default')
    args = parser.parse_args()

    if args.engine in utils.get_engines():
        utils.get_engine(args.engine)().install(args)
    else:
        # Call form works on both Python 2 and 3; the statement form is a
        # SyntaxError on Python 3.
        print('Error: unsupported engine.')
def get_user_tests_in_time_interval_evening() -> Dict[str, List[TestResult]]:
    """Load the evening random sample and hand it to ``fetch_sample_results``.

    Selects ``user_name, result, speed, infra, isp`` from
    ``test_random_sample_evening`` and returns whatever
    ``fetch_sample_results`` builds from the executed cursor.
    """
    evening_sample_query = """
        select user_name, result, speed, infra, isp
        from test_random_sample_evening
        ;
    """
    connection = get_engine()
    cur: cursor = connection.cursor()
    cur.execute(evening_sample_query)
    return fetch_sample_results(cur)
def count_users(vendor: VendorUsers, speed: int) -> int:
    """Count distinct users of a vendor that have valid tests at ``speed``.

    Args:
        vendor: Its ``value`` is spliced into the SQL as the relation listing
            the vendor's user names.
        speed: Value the ``speed`` column must equal.

    Returns:
        ``count(distinct user_name)`` over the matching ``valid_tests`` rows.
    """
    connection = get_engine()
    cur: cursor = connection.cursor()
    count_query = f"""
        select count(distinct user_name)
        from valid_tests
        where user_name in (select * from {vendor.value})
        and speed = {speed}
        ;
    """
    cur.execute(count_query)
    # An aggregate without ``group by`` yields one single-column row.
    first_row = cur.fetchall()[0]
    return first_row[0]
def get_ground_truth_speeds_by_vendor(vendor: VendorUsers, speed: int):
    """Return the ground-truth rates of a vendor's LAN tests at ``speed``.

    Args:
        vendor: Its ``value`` is spliced into the SQL as the relation listing
            the vendor's user names.
        speed: Value the ``speed`` column must equal.

    Returns:
        A list with the ``ground_truth_rate`` of every matching
        ``valid_tests`` row whose ``connection`` is ``'LAN'``.
    """
    connection = get_engine()
    cur: cursor = connection.cursor()
    rates_query = f"""
        select ground_truth_rate
        from valid_tests
        where speed = {speed}
        and user_name in (select * from {vendor.value})
        and connection = 'LAN'
        ;
    """
    cur.execute(rates_query)
    rates = []
    for record in cur.fetchall():
        rates.append(record[0])
    return rates
async def update_schema_for(payload: Dict[str, Any], webhook: str):
    """Register ORM models for a webhook payload and create their tables.

    Args:
        payload: Webhook body, passed to ``extract_github_objects``.
        webhook: Webhook type, passed to ``extract_github_objects``.
    """
    Base = declarative_base()

    # Marshal JSON into SQL-able data
    objects = extract_github_objects(payload, webhook)

    # NB: This has to be before create_all since it passively registers the tables
    for name, obj in objects:
        generate_orm(name, obj, Base)

    # Set up link to DB and emit the DDL for everything registered on Base.
    Base.metadata.create_all(get_engine(connection_string()))
def read_users():
    """Return the distinct values of the ``"יוזר"`` column of ``testers``.

    Returns:
        A list holding the first (only) column of every row returned.
    """
    connection = utils.get_engine()
    cur = connection.cursor()
    cur.execute('''
        select distinct "יוזר" from testers;
    ''')
    return [record[0] for record in cur.fetchall()]
def get_speed_test_ratios(speed_test: str) -> List[float]:
    """Return ground-truth / speed-test rate ratios for one test website.

    Only rows of ``valid_tests`` are used where both rates are non-zero, the
    ratio lies in ``[0.01, 100]``, ``true_or_null(is_classic_test)`` and
    ``is_classic_resource(file_name)`` hold, and ``website`` equals
    ``speed_test``.

    Args:
        speed_test: Value the ``website`` column must equal.

    Returns:
        The ``ratio`` column of every matching row.
    """
    # The website is bound as a query parameter rather than spliced into the
    # SQL text, so a quote in the value cannot break or alter the statement.
    # NOTE(review): assumes a DB-API driver with "%s" placeholders (the
    # ``cursor`` annotation suggests psycopg2) — confirm.
    ratios_query = """
        select ground_truth_rate / speed_test_rate as ratio
        from valid_tests
        where true_or_null(is_classic_test)
        and is_classic_resource(file_name)
        and speed_test_rate != 0
        and ground_truth_rate != 0
        and website = %s
        and (ground_truth_rate / speed_test_rate) between 0.01 and 100
        ;
    """
    postgres_engine = get_engine()
    cur: cursor = postgres_engine.cursor()
    cur.execute(ratios_query, (speed_test,))
    return [row[0] for row in cur.fetchall()]
def calculate_ci_stats_for_user_group(user_group: List[UserStats],
                                      vendor: Vendor,
                                      tests: Dict[str, List[TestResult]],
                                      k: int,
                                      default_rates: List[float],
                                      pure: bool,
                                      evening: bool):
    """Compute, print and persist confidence-interval results for a group.

    Steps, in order:
      1. Flatten the group's tests and compute per-user CI results.
      2. For every ratio in ``default_rates`` print how many users are
         counted by ``count_defaulted_users_by_upper_bound`` and their share
         of the group.
      3. Build a DataFrame of the results sorted by the upper-bound column,
         copy it (as header-less CSV) into the vendor's CI table and push it
         to the matching spreadsheet.

    Args:
        user_group: Users to analyse; its length is the denominator of the
            printed default rate (an empty group with non-empty
            ``default_rates`` raises ``ZeroDivisionError``).
        vendor: Supplies ``infra`` for the printed labels and the table name.
        tests: Passed to ``flatten_tests`` together with ``user_group``.
        k: Forwarded to ``calcuate_ci_for_user_group``.
        default_rates: Ratios forwarded one by one to
            ``count_defaulted_users_by_upper_bound``.
        pure: When true, the target table name gets a ``pure_`` prefix.
        evening: When true, output is suffixed with " (evening)" and the
            table name with ``_evening``.
    """
    test_random_sample = flatten_tests(user_group, tests)
    users_with_ci_results = calcuate_ci_for_user_group(user_group,
                                                       test_random_sample, k)

    suffix = " (evening)" if evening else ""
    infra_title = vendor.infra.capitalize()
    print(f"{infra_title} users (n={len(user_group)})" + suffix)
    for def_rate in default_rates:
        defaulted_users = count_defaulted_users_by_upper_bound(
            users_with_ci_results, def_rate)
        print(
            f"""defaulted {infra_title} with default ratio of: {def_rate}: {defaulted_users}"""
            + suffix)
        print(
            f"{infra_title} default rate: {defaulted_users / len(user_group)}"
            + suffix)
    print()

    columns = [
        USER_NAME_HEBREW_KEY, USER_SPEED_PROGRAM_KEY_HEBREW, ISP_KEY_HEBREW,
        INFRASTRUCTURE_KEY_HEBREW, SAMPLE_AVERAGE_SPEED_KEY_HEBREW,
        LOWER_BOUND_KEY_HEBREW, UPPER_BOUND_KEY_HEBREW,
        CONFIDENCE_LEVEL_KEY_HEBREW
    ]
    # ``from_records`` is a classmethod; no throw-away instance is needed.
    data = pd.DataFrame.from_records(
        [u.to_dict() for u in users_with_ci_results],
        columns=columns,
    ).sort_values(UPPER_BOUND_KEY_HEBREW)
    csv_no_header = data.to_csv(sep=",",
                                columns=columns,
                                index=False,
                                header=False)

    # Table name: [pure_]<infra>_ci[_evening]
    ci_table_name = vendor.infra.lower() + "_ci"
    if evening:
        ci_table_name += "_evening"
    if pure:
        ci_table_name = "pure_" + ci_table_name
    print(f"copying to: '{ci_table_name}'")
    copy_csv_to_table(StringIO(csv_no_header), get_engine(), ci_table_name)

    spreadsheet_title = get_sheet_title(vendor,
                                        is_pure=pure,
                                        is_evening=evening)
    update_sheet(spreadsheet_title, data)
async def update_schema_for(payload: Dict[str, Any], webhook: str):
    """Register ORM models for an allowlisted webhook and create their tables.

    Args:
        payload: Webhook body, passed to ``extract_github_objects``.
        webhook: Webhook type; must be in ``ACCEPTABLE_WEBHOOKS``.

    Returns:
        A ``{"statusCode": 200, "body": ...}`` dict when the webhook is not
        allowlisted; ``None`` after the schema has been updated.
    """
    # Only look at allowlisted webhooks
    if webhook not in ACCEPTABLE_WEBHOOKS:
        return {"statusCode": 200, "body": f"not processing {webhook}"}

    Base = declarative_base()

    # Marshal JSON into SQL-able data
    objects = extract_github_objects(payload, webhook)

    # NB: This has to be before create_all since it passively registers the tables
    for name, obj in objects:
        generate_orm(name, obj, Base)

    # Set up link to DB and emit the DDL for everything registered on Base.
    Base.metadata.create_all(get_engine(connection_string()))
def calc_intervals_speed_test_website_comparisons():
    """Print ratio statistics and confidence intervals per speed-test website.

    For every website in the hard-coded list, runs the query produced by
    ``get_speed_test_websites_rates(website)``, takes the first column of
    each row as the rates, and prints their mean, sample standard deviation
    (``ddof=1``) and count.  It then prints the bounds returned by
    ``calc_confidence_interval`` at confidence levels 0.95, 0.99 and 0.999.
    All printed figures are rounded to ``DECIMAL_PLACES``.
    """
    postgres_engine = get_engine()
    cur: cursor = postgres_engine.cursor()
    confidence_levels = [.95, .99, .999]
    # Built from adjacent literals instead of a backslash continuation inside
    # one literal, which would embed the source indentation in the output.
    interval_message = (
        "ברמת סמך של {}% יחס מהירות בפועל "
        "למהירות בדיקה באתר בדיקת המהירות של {} הוא בין {} ל-{}"
    )
    for website in ["גוגל", "בזק", "אוקלה", "הוט", "נטפליקס"]:
        cur.execute(get_speed_test_websites_rates(website))
        rates = [row[0] for row in cur.fetchall()]
        print("רווח סמך עבור אתר בדיקת מהירות: {}".format(website))
        print("יחס ממוצע: {}".format(round(np.mean(rates), DECIMAL_PLACES)))
        print("סטיית תקן (מדגם): {}".format(
            round(np.std(rates, ddof=1), DECIMAL_PLACES)))
        print("מספר דגימות (N): {}".format(len(rates)))
        for confidence in confidence_levels:
            # Only the bounds are reported; the other two values are unused.
            _mean, lower_bound, upper_bound, _h = calc_confidence_interval(
                rates, confidence)
            lower_bound = round(lower_bound, DECIMAL_PLACES)
            upper_bound = round(upper_bound, DECIMAL_PLACES)
            print(interval_message.format(confidence * 100, website,
                                          lower_bound, upper_bound))
        print()
async def handle_webhook(payload: Dict[str, Any], type: str):
    """Upsert the objects carried by an allowlisted webhook.

    Args:
        payload: Webhook body, passed to ``extract_github_objects``.
        type: Webhook type; must be in ``ACCEPTABLE_WEBHOOKS``.

    Returns:
        ``{"statusCode": 200, "body": "ok"}`` after writing, or a 200
        response with a "not processing" body for other webhook types.
    """
    # Only look at allowlisted webhooks (checked before any DB setup so
    # ignored webhooks cost nothing)
    if type not in ACCEPTABLE_WEBHOOKS:
        return {"statusCode": 200, "body": f"not processing {type}"}

    # Marshal JSON into SQL-able data.  Materialised once because it is
    # iterated twice (log line + write loop); a lazy iterable would otherwise
    # be exhausted by the log line.
    objects = list(extract_github_objects(payload, type))

    print("Writing", ", ".join([name for name, _ in objects]))

    engine = get_engine(connection_string())
    with engine.connect() as conn:
        for tablename, obj in objects:
            # Some of the data is not already in the right form (e.g. dates and
            # lists, so fix that up here)
            obj = transform_data(obj)

            # Lightweight table construct: the name plus one column per key.
            model = table(tablename, *[column(k) for k in obj.keys()])
            upsert(conn, model, obj)

    return {"statusCode": 200, "body": "ok"}
async def handle_webhook(payload: Dict[str, Any], type: str):
    """Upsert webhook objects, restricted to the hard-coded schema.

    Objects whose table is missing from ``existing_schema`` are skipped, and
    keys not listed for the table are dropped; both cases are reported on
    stdout.

    Args:
        payload: Webhook body, passed to ``extract_github_objects``.
        type: Webhook type, passed to ``extract_github_objects``.

    Returns:
        ``{"statusCode": 200, "body": "ok"}``.
    """
    engine = get_engine(connection_string())

    # Marshal JSON into SQL-able data.  Materialised once because it is
    # iterated twice (log line + write loop).
    objects = list(extract_github_objects(payload, type))

    print("Writing", ", ".join([name for name, _ in objects]))

    with engine.connect() as conn:
        for tablename, obj in objects:
            if tablename not in existing_schema:
                print(
                    f"Skipping write of {tablename} since it doesn't exist in hardcoded schema"
                )
                continue

            # Some of the data is not already in the right form (e.g. dates and
            # lists, so fix that up here)
            obj = transform_data(obj)

            # Remove non-existent fields
            known_columns = existing_schema[tablename]
            newdata = {}
            for key, value in obj.items():
                if key in known_columns:
                    newdata[key] = value
                else:
                    print(
                        f"Dropping key '{key}' with value '{value}' since it doesn't exist in table {tablename}"
                    )

            # Declare the table construct from the *filtered* keys so it
            # never names a column that was just dropped.
            model = table(tablename, *[column(k) for k in newdata])
            upsert(conn, model, newdata)

    return {"statusCode": 200, "body": "ok"}
def get_new_data():
    """Download the metrobus feed and store it as historical points.

    Reads ``METROBUSES_API_URL``, reduces the raw body with
    ``filter_json_raw_data`` and passes the result, together with the
    engine, to ``create_historical_points``.
    """
    engine = get_engine()
    response = urlopen(METROBUSES_API_URL)
    try:
        metrobuses_raw_data = response.read()
    finally:
        # Release the HTTP connection even if reading fails.
        response.close()
    metrobuses_required_data = filter_json_raw_data(metrobuses_raw_data)
    create_historical_points(engine, metrobuses_required_data)
from keras.layers import Dense, Input
from keras.models import Model
from keras.wrappers.scikit_learn import KerasRegressor, KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from utils import get_engine
from sklearn import linear_model
import matplotlib.pyplot as plt

# Script entry point: loads a few rows of the fraud table and splits them into
# features (X) and labels (Y).
# NOTE(review): ``pd``, ``np`` and ``to_categorical`` are used below but are
# not imported in the visible part of the file — confirm they are imported
# earlier.
if __name__ == '__main__':
    # load dataset
    table = "data_fraud_little"
    engine = get_engine()
    # Only the first 10 rows are fetched.
    dataframe = pd.read_sql_query("select * from {table} limit 10".format(table=table),engine)
    dataset = dataframe.values
    # NOTE(review): the message says "one row" but two rows are printed.
    print("First one row of the dataset")
    print("Shape [{}]".format(dataset.shape))
    print(dataset[0:2,:])
    # split into input (X) and output (Y) variables
    data_dimensions = 45
    #first dimension is the index, must be removed!!!!
    # Features are columns 1..44; column 0 is skipped as the index.
    X = dataset[:, 1:data_dimensions]
    # Labels are taken from column 45.
    Y = dataset[:, data_dimensions]
    # Percentage of rows whose label equals 1.
    print("Fraud {}% ".format(float(np.sum(Y==1))*100.0/Y.shape[0]))
    print("Total #samples:",Y.shape[0])
    # NOTE(review): ``nb_classes`` is presumably the older Keras keyword for
    # the number of classes — verify against the installed version.
    Y = to_categorical(Y, nb_classes=None)
# NOTE(review): the two columns below are the tail of a model class whose
# header lies outside this excerpt; they are kept exactly as found.
id = Column(Integer, primary_key=True)
name = Column(String(50), nullable=False, unique=True)


# post model:
class Post(Base):
    """Row of the ``posts`` table.

    Links to ``categories`` and ``authors`` through foreign keys and to
    ``Tag`` through the ``post_tag`` association (back-reference ``posts``).
    """
    __tablename__ = 'posts'

    id = Column(Integer, primary_key=True)
    title = Column(String(100), nullable=False)
    content = Column(Text, nullable=False)
    # ``datetime.now`` is passed uncalled so it is evaluated per insert/update.
    created_on = Column(DateTime, default=datetime.now)
    updated_on = Column(DateTime, default=datetime.now, onupdate=datetime.now)
    category_id = Column(Integer, ForeignKey('categories.id'))
    author_id = Column(Integer, ForeignKey('authors.id'))
    tags = relationship('Tag', secondary=post_tag, backref='posts')


if __name__ == '__main__':
    # default message
    msg = 'action not found!'

    # get action from command line; a missing argument falls through to the
    # default message instead of raising IndexError
    action = sys.argv[1] if len(sys.argv) > 1 else None

    if action == 'create':
        Base.metadata.create_all(get_engine())
        msg = 'models created successfully.'
    elif action == 'drop':
        Base.metadata.drop_all(get_engine())
        msg = 'models destroyed.'

    print(msg)
from typing import List, Union, Optional, Generator
from ipwhois import IPWhois
from utils import get_engine, get_rows

# DDL for the whois cache table: one row per CIDR (enforced by the unique
# constraint), holding the JSON payload and a last-updated timestamp.
CREATE_WHOIS_CACHE = """ create table if not exists whois_data ( cidr cidr, reference_filename text, data json, last_updated timestamptz, unique (cidr) ); """
# Runs at import time: makes sure the cache table exists before any lookup.
engine = get_engine()
engine.cursor().execute(CREATE_WHOIS_CACHE)
engine.commit()


# NOTE(review): this function continues beyond the visible excerpt; only the
# part shown here is reproduced, unchanged.
@ttl_cache(1000)
def whois_lookup(ip: str, file_name: str, use_cache=True) -> Generator[None, None, Optional[dict]]:
    """Perform Whois lookup for a given IP

    :ip: Ip to peform whois lookup
    :returns Optional[dict] with whois data
    """
    ip_obj = ipaddress.ip_address(ip)
    if ip_obj.is_private is True:
        print(ip_obj, "is private")
class TestDataValidity(unittest.TestCase):
    """Regression checks pinned to the current contents of the database.

    The tests compare CI tables, random-sample tables and the summary
    spreadsheets against hard-coded expected rows.
    """

    # Long spreadsheet column headers shared by every summary row.
    _BELOW_HALF = "מ. משתמשים שמהירות הגלישה הממוצעת שלהם נמוכה מחצי הבטחת החבילה"
    _BELOW_HALF_EVENING = "מ. משתמשים שמהירות הגלישה הממוצעת שלהם נמוכה מחצי הבטחת החבילה בשעות הערב"
    _BELOW_THIRD = "מ. משתמשים שמהירות הגלישה הממוצעת שלהם נמוכה משליש הבטחת החבילה"
    _BELOW_THIRD_EVENING = "מ. משתמשים שמהירות הגלישה הממוצעת שלהם נמוכה משליש הבטחת החבילה בשעות הערב"

    @classmethod
    def setUpClass(cls):
        # Connect when the tests run, not when the module is imported, so
        # collecting the tests does not need a database.
        cls.engine = get_engine()
        cls.cur = cls.engine.cursor()

    @classmethod
    def tearDownClass(cls):
        cls.cur.close()

    def get_rows(self, query: str) -> List[tuple]:
        """Execute ``query`` on the shared cursor and return all rows."""
        self.cur.execute(query)
        return list(self.cur.fetchall())

    @classmethod
    def _summary_row(cls, label_key, vendor, users, half, half_evening,
                     third, third_evening):
        """Build one expected spreadsheet row keyed by the sheet's headers."""
        return {
            label_key: vendor,
            "מספר משתמשים": users,
            cls._BELOW_HALF: half,
            cls._BELOW_HALF_EVENING: half_evening,
            cls._BELOW_THIRD: third,
            cls._BELOW_THIRD_EVENING: third_evening,
        }

    def test_ci_table_vendors(self):
        """Every CI row must match its vendor by ISP or by infrastructure."""
        expectations = (
            (("bezeq_ci", "bezeq_ci_evening"),
             "Bezeq International-Ltd", "BEZEQ"),
            (("hot_ci", "hot_ci_evening"),
             "Hot-Net internet services Ltd.", "HOT"),
            (("partner_ci", "partner_ci_evening"),
             "Partner Communications Ltd.", "PARTNER"),
        )
        for table_names, expected_isp, expected_infra in expectations:
            vendors = []
            for table_name in table_names:
                vendors += self.get_rows(f"""
                    select {table_name}.ספקית, {table_name}.תשתית
                    from {table_name}
                    ;
                """)
            for isp, infra in vendors:
                self.assertTrue(isp == expected_isp
                                or infra == expected_infra)

    def test_ci_evening_tables(self):
        """Evening CI rows must differ from all-day rows for the same user."""
        partner_evening_numbers = self.get_rows("""
            select
                partner_ci_evening.שם_משתמש,
                partner_ci_evening.גבול_עליון,
                partner_ci_evening.גבול_תחתון,
                partner_ci_evening.מהירות_ממוצעת_מדגם
            from partner_ci_evening
            ;
        """)
        partner_numbers = self.get_rows("""
            select
                partner_ci.שם_משתמש,
                partner_ci.גבול_עליון,
                partner_ci.גבול_תחתון,
                partner_ci.מהירות_ממוצעת_מדגם
            from partner_ci
            ;
        """)
        partner_evening_numbers.sort(key=lambda columns: columns[0])
        partner_numbers.sort(key=lambda columns: columns[0])

        # NOTE(review): zip stops at the shorter table, so a row-count
        # mismatch between the two tables is not detected here.
        for eve_num, num in zip(partner_evening_numbers, partner_numbers):
            eve_name, eve_upper, eve_lower, eve_average = eve_num
            name, upper, lower, average = num
            self.assertEqual(eve_name, name)
            self.assertNotEqual(eve_average, average)
            self.assertNotEqual(eve_lower, lower)
            self.assertNotEqual(eve_upper, upper)

    def test_random_sample(self):
        """Check sample sizes, speeds and pinned rows of the random sample."""
        random_sample = self.get_rows(
            "select user_name, count(*) from test_random_sample group by user_name"
        )
        for _user_name, count in random_sample:
            self.assertEqual(300, count)

        speeds = self.get_rows("select speed from test_random_sample")
        for speed, in speeds:
            self.assertIn(speed, [100, 200, 500, 1000])

        # random sample data persistency
        randomized_valid_test = self.get_rows(
            "select ground_truth_rate, user_name, file_name, timestamp, random_index"
            " from randomized_valid_tests order by random_index")
        second = (6.437308051433778, 'ben_b', 'go', 1540301748578,
                  1.1055979456386922e-06)
        last_row = (63.250679088980895, 'etl', 'firefox', 1571828370256,
                    0.9999999137277591)
        row_minus_pi = (46.146673387096776, 'ArielG', 'go', 1543489131958,
                        0.8610400669148355)
        row_minus_1000 = (30.569313143358247, 'dor_p', 'dlink',
                          1563453594052, 0.9995655430256711)
        self.assertEqual(second, randomized_valid_test[1])
        self.assertEqual(last_row, randomized_valid_test[-1])
        self.assertEqual(row_minus_pi, randomized_valid_test[-314159])
        self.assertEqual(row_minus_1000, randomized_valid_test[-1000])

        sample = self.get_rows("select user_name, result, timestamp, file_name"
                               " from test_random_sample order by timestamp")
        first_row = ('admin', 18.813411540900443, 1529870019339, 'my-sql')
        last_row = ('dan_florentin', 36.70050094950409, 1587377434453,
                    'dlink')
        row_minus_1000 = ('artium', 37.905948297764226, 1584833539085,
                          'dlink')
        self.assertEqual(first_row, sample[0])
        self.assertEqual(last_row, sample[-1])
        self.assertEqual(row_minus_1000, sample[-1000])

        public_access_resources = [
            "file_name", "amazon-workSpaces", "windows-games",
            "windows-games-studio", "my-sql", "dlink", "vlc", "go", "firefox",
            "quicktime"
        ]
        for _, _, _, filename in sample:
            self.assertIn(filename, public_access_resources)

    def test_evening_random_sample(self):
        """Check time of day, sample sizes and speeds of the evening sample."""
        evening_timestamps = self.get_rows(
            "select timestamp from test_random_sample_evening")
        for timestamp, in evening_timestamps:
            self.assertIn(
                "Evening",
                self.get_rows(f"select get_time_of_day({timestamp})")[0][0])

        random_sample_evening = self.get_rows(
            "select user_name, count(*) from test_random_sample_evening group by user_name"
        )
        for _user_name, count in random_sample_evening:
            self.assertEqual(300, count,
                             f"user: {_user_name} has {count} tests")

        speeds = self.get_rows("select speed from test_random_sample_evening")
        for speed, in speeds:
            self.assertIn(speed, [100, 200, 500, 1000])

    def test_ci_tables(self):
        """Compare pinned rows (index 0, -1 and 10) of the CI tables."""
        # Test CI Tables data persistency
        bezeq_ci = self.get_rows("select * from bezeq_ci")
        bezeq_ci_first_row = ('yarden', '100',
                              'Cellcom Fixed Line Communication L.P.',
                              'BEZEQ', '24.17307459253051',
                              '22.419327471344598', '25.92682171371644',
                              '0.9989583333333333')
        bezeq_ci_last_row = ('raz', '100', 'Hot-Net internet services Ltd.',
                             'BEZEQ', '66.30938669799932',
                             '61.388106035193466', '71.23066736080521',
                             '0.9989583333333333')
        bezeq_ci_tenth_row = ('nimrod', '100', '013 NetVision Ltd', 'BEZEQ',
                              '31.192815980664005', '29.223790648991308',
                              '33.16184131233672', '0.9989583333333333')
        self.assertEqual(bezeq_ci_first_row, bezeq_ci[0])
        self.assertEqual(bezeq_ci_last_row, bezeq_ci[-1])
        self.assertEqual(bezeq_ci_tenth_row, bezeq_ci[10])

        pure_bezeq_ci = self.get_rows("select * from pure_bezeq_ci")
        pure_bezeq_ci_first = ('michael', '100', 'Bezeq International-Ltd',
                               'BEZEQ', '29.66160497821917',
                               '29.077337604229974', '30.245872352208366',
                               '0.9970588235294118')
        pure_bezeq_ci_last = ('alon', '100', 'Bezeq International-Ltd',
                              'BEZEQ', '64.8046254918595',
                              '60.96868027095032', '68.64057071276866',
                              '0.9970588235294118')
        pure_bezeq_ci_tenth = ('rina', '100', 'Bezeq International-Ltd',
                               'BEZEQ', '52.732937250037054',
                               '48.413901608636756', '57.05197289143735',
                               '0.9970588235294118')
        self.assertEqual(pure_bezeq_ci_first, pure_bezeq_ci[0])
        self.assertEqual(pure_bezeq_ci_last, pure_bezeq_ci[-1])
        self.assertEqual(pure_bezeq_ci_tenth, pure_bezeq_ci[10])

        hot_ci = self.get_rows("select * from hot_ci")
        hot_ci_first_row = ('rom', '100', 'Hot-Net internet services Ltd.',
                            'HOT', '12.34002004187883', '10.041289542437283',
                            '14.638750541320395', '0.9985714285714286')
        hot_ci_last_row = ('barak', '100', '013 NetVision Ltd', 'HOT',
                           '114.77583966117818', '104.20295342751203',
                           '125.3487258948444', '0.9985714285714286')
        hot_ci_tenth_row = ('alon_s', '100', 'ITC NG ltd', 'HOT',
                            '49.53545118889848', '43.974820709604955',
                            '55.09608166819204', '0.9985714285714286')
        self.assertEqual(hot_ci_first_row, hot_ci[0])
        self.assertEqual(hot_ci_last_row, hot_ci[-1])
        self.assertEqual(hot_ci_tenth_row, hot_ci[10])

    def test_summary_ci_tables(self):
        """Compare both summary spreadsheets against their expected rows."""
        isp_or_infra_summary_table = read_sheet(
            "ממצאי רווח סמך (ספקית או תשתית)")
        isp_and_infra_summary_table = read_sheet(
            "ממצאי רווח סמך (ספקית + תשתית)")

        or_key = "ספקית או תשתית"
        expected_table_isp_or_infra = [
            self._summary_row(or_key, "בזק", 48, "23 (47.91%)",
                              "26 (54.16%)", "11 (22.91%)", "11 (22.91%)"),
            self._summary_row(or_key, "הוט", 35, "13 (37.14%)",
                              "15 (42.85%)", "8 (22.85%)", "10 (28.57%)"),
            self._summary_row(or_key, "פרטנר", 14, "7 (50.00%)",
                              "8 (57.14%)", "2 (14.28%)", "2 (14.28%)"),
        ]

        and_key = "ספקית + תשתית"
        expected_table_isp_and_infra = [
            self._summary_row(and_key, "בזק", 17, "7 (41.17%)",
                              "7 (41.17%)", "2 (11.76%)", "2 (11.76%)"),
            self._summary_row(and_key, "הוט", 20, "8 (40.00%)",
                              "10 (50.00%)", "5 (25.00%)", "6 (30.00%)"),
            self._summary_row(and_key, "פרטנר", 6, "2 (33.33%)",
                              "3 (50.00%)", "2 (33.33%)", "2 (33.33%)"),
        ]

        self.assertEqual(expected_table_isp_or_infra,
                         isp_or_infra_summary_table)
        self.assertEqual(expected_table_isp_and_infra,
                         isp_and_infra_summary_table)
class CITablesInit:
    """(Re)creates the twelve confidence-interval result tables.

    One table exists per combination of vendor (bezeq, hot, partner),
    all-day/evening and regular/"pure"; all share the same eight text
    columns.
    """

    engine = get_engine()

    # Column names shared by every CI table; all columns are of type text.
    _CI_COLUMNS = (
        "שם_משתמש",
        "תכנית",
        "ספקית",
        "תשתית",
        "מהירות_ממוצעת_מדגם",
        "גבול_תחתון",
        "גבול_עליון",
        "רמת_סמך",
    )
    _VENDORS = ("bezeq", "hot", "partner")

    @classmethod
    def _ci_table_names(cls):
        """Return the table names as [pure_]<vendor>_ci[_evening]."""
        return [
            f"{prefix}{vendor}_ci{suffix}"
            for vendor in cls._VENDORS
            for suffix in ("", "_evening")
            for prefix in ("", "pure_")
        ]

    @classmethod
    def _build_init_sql(cls):
        """Return one SQL script that drops and recreates every CI table."""
        column_defs = ",\n".join(
            f'    "{column_name}" text' for column_name in cls._CI_COLUMNS)
        statements = [
            f"DROP TABLE IF EXISTS {table_name};\n"
            f"CREATE TABLE {table_name} (\n{column_defs}\n);"
            for table_name in cls._ci_table_names()
        ]
        return "\n".join(statements) + "\n"

    def init_all_ci_tables(self):
        """Drop and recreate all CI tables, then commit.

        Existing rows in these tables are discarded.
        """
        self.engine.cursor().execute(self._build_init_sql())
        self.engine.commit()
class RandomSample:
    """Builds the seeded random-sample tables derived from ``valid_tests``."""

    engine = get_engine()

    def execute_persistent(self, query):
        """Run ``query`` on a fresh cursor and commit."""
        cur = self.engine.cursor()
        cur.execute(query)
        self.engine.commit()

    def iterate_lines(self, query):
        """Run ``query`` and yield its rows one by one."""
        cur = self.engine.cursor()
        cur.execute(query)
        yield from cur.fetchall()

    def create_user_stats_view(self):
        """Create or replace the ``user_stats`` view.

        Per user: first test time, that time plus 30 and plus 60 days, and
        the number of tests, over LAN tests on classic resources with speed
        not in (15, 30, 40).
        """
        user_stats_sql = """
            create or replace view user_stats as (
                select
                    user_name,
                    min(to_israel_dst_aware(timestamp)) first_test,
                    min(to_israel_dst_aware(timestamp)) + interval '30' day first_test_plus_30_days,
                    min(to_israel_dst_aware(timestamp)) + interval '60' day first_test_plus_60_days,
                    count(*) num_test
                from valid_tests
                where connection = 'LAN'
                and speed not in (15, 30, 40)
                and is_classic_resource(file_name)
                group by user_name
            );
        """
        self.execute_persistent(user_stats_sql)

    def create_randomized_valid_test_table(self):
        """Rebuild ``randomized_valid_tests`` with a fixed random seed.

        Every ``valid_tests`` row gets a ``random_index`` and an ``evening``
        flag; the table is indexed on timestamp, user name and random index.
        """
        randomized_sql = """
            drop table if exists randomized_valid_tests;
            select setseed(0.314159265359);
            create table if not exists randomized_valid_tests as (
                select random() as random_index, is_evening(timestamp) evening, *
                from valid_tests
                order by random_index
            );
            create index on randomized_valid_tests(timestamp);
            create index on randomized_valid_tests(user_name);
            create index on randomized_valid_tests(random_index);
            ;
        """
        self.execute_persistent(randomized_sql)

    def create_random_sample(self):
        """Rebuild ``test_random_sample``.

        For each user, inserts up to 300 rows (lowest ``random_index``
        first) from the user's first 30 days of LAN tests on classic
        resources, for users with ``num_test >= 700``.
        """
        sample_sql = """
            drop table if exists test_random_sample;
            create table if not exists test_random_sample(
                id serial primary key,
                user_name text,
                result float,
                speed integer,
                isp text,
                infra text,
                connection text,
                file_name text,
                timestamp bigint
            );
            do $$
            declare
                uname text;
            begin
                for uname in (select distinct user_name from valid_tests order by user_name) loop
                    insert into test_random_sample(user_name, result, speed, isp, infra, connection, file_name, timestamp)
                    select randomized_valid_tests.user_name, ground_truth_rate, speed, isp, infrastructure, connection, file_name, timestamp
                    from randomized_valid_tests
                    join user_stats on randomized_valid_tests.user_name = user_stats.user_name
                    where randomized_valid_tests.user_name = uname
                    and to_israel_dst_aware(timestamp) between first_test and first_test_plus_30_days
                    and connection = 'LAN'
                    and num_test >= 700
                    and is_classic_resource(file_name)
                    order by random_index
                    limit 300;
                end loop;
            end;
            $$;
        """
        self.execute_persistent(sample_sql)

    def create_random_sample_evening(self):
        """Rebuild ``test_random_sample_evening``.

        Same as ``create_random_sample`` but restricted to rows flagged
        ``evening`` and drawn from the user's first 60 days.
        """
        evening_sample_sql = """
            drop table if exists test_random_sample_evening;
            create table if not exists test_random_sample_evening(
                id serial primary key,
                user_name text,
                result float,
                speed integer,
                isp text,
                infra text,
                connection text,
                file_name text,
                timestamp bigint)
            ;
            do $$
            declare
                uname text;
            begin
                for uname in (select distinct user_name from valid_tests order by user_name) loop
                    insert into test_random_sample_evening(user_name, result, speed, isp, infra, connection, file_name, timestamp)
                    select randomized_valid_tests.user_name, ground_truth_rate, speed, isp, infrastructure, connection, file_name, timestamp
                    from randomized_valid_tests
                    join user_stats on randomized_valid_tests.user_name = user_stats.user_name
                    where randomized_valid_tests.user_name = uname
                    and to_israel_dst_aware(timestamp) between first_test and first_test_plus_60_days
                    and connection = 'LAN'
                    and num_test >= 700
                    and is_classic_resource(file_name)
                    and evening is True
                    order by random_index
                    limit 300;
                end loop;
            end;
            $$;
        """
        self.execute_persistent(evening_sample_sql)
def speed_test_website_scatter():
    """Plot, save and show average actual/test speed ratios per test website.

    Three series are drawn on one scatter plot: all resources, classic
    (public) resources only, and the ``israel_cache`` file only.  Each point
    is ``avg(ground_truth_rate / speed_test_rate)`` for one website; rows
    with a ratio outside ``[0.01, 100]``, non-positive rates or website
    ``'atnt'`` are excluded.  A dotted line at ``y = 1`` marks an exact
    prediction.  The figure is saved under the path returned by
    ``create_snapshot_path("website_comparison_scatter")`` and then shown.
    """
    engine = get_engine()
    query_template = """
        select
            website_to_hebrew(website) "אתר בדיקה",
            avg(ground_truth_rate / speed_test_rate) "יחס מהירות בפועל למהירות בדיקה"
        from valid_tests
        where (ground_truth_rate / speed_test_rate) between 0.01 and 100
        and true_or_null(is_classic_test)
        and ground_truth_rate > 0
        and speed_test_rate > 0
        {resource_filter}
        and not website = 'atnt'
        group by {group_by}
        order by "יחס מהירות בפועל למהירות בדיקה"
        ;
    """
    # The three series differ only in the resource filter and, for the first
    # one, in grouping by the translated rather than the raw website name.
    all_resources_query = query_template.format(
        resource_filter="", group_by="website_to_hebrew(website)")
    public_servers_query = query_template.format(
        resource_filter="and is_classic_resource(file_name)",
        group_by="website")
    israel_cache_query = query_template.format(
        resource_filter="and file_name = 'israel_cache'",
        group_by="website")

    cur = engine.cursor()
    labels = "כל המקורות", "קבצים ציבוריים", "שרת מטמון ישראל"
    queries = [all_resources_query, public_servers_query, israel_cache_query]
    for label, query in zip(labels, queries):
        print(f"handeling: {label}")
        cur.execute(query)
        rows = cur.fetchall()
        plot.scatter(x=[normalize_hebrew(row[0]) for row in rows],
                     y=[row[1] for row in rows],
                     label=normalize_hebrew(label))

    # The reference line spans between two hard-coded website names.
    first_on_x, last_on_x = normalize_hebrew('הוט'), normalize_hebrew('גוגל')
    plot.hlines(y=1,
                xmin=first_on_x,
                xmax=last_on_x,
                colors='aqua',
                linestyles='dotted',
                lw=2,
                label=normalize_hebrew('חיזוי מדויק'))
    plot.legend(loc="best")
    # NOTE(review): this label reads "test / actual" while the plotted value
    # is actual / test — confirm which wording is intended.
    plot.ylabel(normalize_hebrew('יחס בדיקת אתר למהירות בפועל'))

    # NOTE(review): this directory is created but the figure is saved under
    # create_snapshot_path(...) below — confirm it is still needed.
    os.makedirs(Path("question_snapshots") / Path('ground_truth_violin_plots'),
                exist_ok=True)
    snapshots_path = create_snapshot_path("website_comparison_scatter")
    fig_path = snapshots_path / Path("השוואת ממוצעי אתרי בדיקה" + ".png")
    plot.savefig(fig_path)
    print(f"saving {fig_path}")
    plot.show()