Example #1
0
def assign_servers_test_output(df_train, df_test, percentile, confidence,
                               apps_server):
    """Mine association rules on training data and apply them to test data.

    Frequent patterns are mined from ``df_train['pairs']`` with FP-growth,
    using as support threshold the ``percentile`` quantile of the per-pair
    row counts (rows grouped by ``pairs2``). The resulting rules are formatted
    with ``format_rules`` and handed to ``server_association`` together with
    ``df_test``.

    Args:
        df_train: training frame; must provide the 'Date', 'pairs' and
            'pairs2' columns. An 'hour' column is added/overwritten in place.
        df_test: frame passed unchanged to ``server_association``.
        percentile: quantile (0..1) of the pair frequencies used as the
            minimum support for pattern mining.
        confidence: minimum confidence for ``generate_association_rules``.
        apps_server: forwarded to ``format_rules`` and ``server_association``
            (presumably the number of apps per server -- confirm there).

    Returns:
        The six values produced by ``server_association``:
        ``(server_df, server_assignments, total_latency, total_latency_model,
        avg_latency, avg_latency_model)``.
    """
    # The frame is mutated on purpose: it is passed on to format_rules below,
    # which may rely on the 'hour' column being present.
    df_train['hour'] = pd.DatetimeIndex(df_train['Date']).hour

    transactions = list(df_train['pairs'])

    # Only the per-pair frequency is needed for the support threshold. The
    # previous per-pair latency/duration/packet aggregates were never read
    # (and the latency ratio divided by the grand packet total), so they are
    # no longer computed.
    pair_frequency = df_train.groupby('pairs2')['Date'].count()
    support_threshold = pair_frequency.quantile(percentile)

    patterns = pyfpgrowth.find_frequent_patterns(transactions,
                                                 support_threshold)
    rules = pyfpgrowth.generate_association_rules(patterns, confidence)

    # Format the rules, bringing back the latency information from df_train.
    formated_rules = format_rules(rules, df_train, apps_server)

    # Apply the rules learned on the training data to the test data.
    (server_df, server_assignments, total_latency, total_latency_model,
     avg_latency, avg_latency_model) = server_association(
         formated_rules, df_test, apps_server)

    return (server_df, server_assignments, total_latency, total_latency_model,
            avg_latency, avg_latency_model)
Example #2
0
def do_form_2(request, session):
    """Process the CUNY subject lists from form 2 and generate form 3.

    Looks up the transfer rules whose source/destination institutions are the
    ones stored in ``session`` and whose subjects match the ones posted in
    ``request.form``, then renders them for review.

    Args:
        request: the current request; ``request.form`` supplies the
            'source_subject' / 'destination_subject' lists and the
            'all-source-subjects' / 'all-destination-subjects' flags.
        session: mapping holding 'source_institutions' and
            'destination_institutions' (sequences of institution ids).

    Returns:
        The rendered 'review_rules.html' template: the rules table, or an
        error/"Session Expired" page when the inputs are unusable or no rule
        matches.
    """
    if DEBUG:
        print(f'*** do_form_2({session})')
        # Session dump is debugging output only, so keep it behind DEBUG.
        for k, v in session.items():
            print(f'{k}: {v}')

    # Validate everything that does not need the database first, so that no
    # connection is opened (and leaked) on the early-return paths.
    try:
        source_institutions = list(session['source_institutions'])
        destination_institutions = list(session['destination_institutions'])
    except KeyError:
        # the session is expired or invalid. Go back to Step 1.
        return render_template('review_rules.html',
                               result=Markup("""
                                                           <h1>Session Expired</h1>
                                                           <p>
                                                             <a href="/" class="button">
                                                                Main Menu</a>
                                                             <a href="/review_rules"
                                                                  class="restart button">Restart
                                                              </a>
                                                           </p>

                                                           """))

    # Be sure there is the possibility there will be some rules
    source_subject_list = request.form.getlist('source_subject')
    destination_subject_list = request.form.getlist('destination_subject')

    if not source_subject_list:
        return render_template(
            'review_rules.html',
            result=Markup(
                '<h1 class="error">No sending disciplines selected.</h1>'))
    if not destination_subject_list:
        return render_template(
            'review_rules.html',
            result=Markup(
                '<h1 class="error">No receiving disciplines selected.</h1>'))

    # Prepare the query to get the set of rules that match the institutions
    # and cuny_subjects selected. Every user-supplied value is passed as a
    # bound parameter; only '%s' placeholders are interpolated into the SQL.
    source_institution_params = ', '.join('%s' for _ in source_institutions)
    destination_institution_params = ', '.join(
        '%s' for _ in destination_institutions)
    rule_query_args = source_institutions + destination_institutions

    if request.form.get('all-source-subjects'):
        source_subjects_clause = ''
    else:
        source_subject_params = ', '.join('%s' for _ in source_subject_list)
        source_subjects_clause = f"""
      and id in (select rule_id from subject_rule_map where subject in ({source_subject_params}))"""
        rule_query_args = rule_query_args + list(source_subject_list)

    # Rules may have been selected that don't transfer to any of the selected
    # destination subjects; this clause lets the per-rule query filter them.
    if request.form.get('all-destination-subjects'):
        destination_subjects_clause = ''
        destination_subject_args = []
    else:
        destination_subject_params = ', '.join(
            '%s' for _ in destination_subject_list)
        destination_subjects_clause = f" and dc.cuny_subject in ({destination_subject_params})"
        destination_subject_args = list(destination_subject_list)

    # Get all the rules where,
    #  - The source and destination institutions have been selected
    #  and
    #  - The source_subjects have been selected
    q = f"""
  select *
    from transfer_rules
   where source_institution in ({source_institution_params})
     and destination_institution in ({destination_institution_params})
     {source_subjects_clause}
  order by source_institution, destination_institution, subject_area, group_number"""

    selected_rules = []
    conn = PgConnection()
    cursor = conn.cursor()
    try:
        cursor.execute(q, rule_query_args)

        if cursor.rowcount < 1:
            return render_template(
                'review_rules.html',
                result=Markup(
                    '<h1 class="error">There are no matching rules.</h1>'))

        all_rules = cursor.fetchall()

        for rule in all_rules:
            # It's possible some of the selected rules don't have destination
            # courses in any of the selected disciplines, so that has to be
            # checked first; such rules are dropped.
            cursor.execute(
                f"""
   select  dc.course_id,
           dc.offer_nbr,
           dc.offer_count,
           dc.discipline,
           dc.catalog_number,
           dn.discipline_name,
           dc.cuny_subject,
           dc.cat_num,
           dc.transfer_credits,
           dc.credit_source,
           dc.is_mesg,
           dc.is_bkcr
      from destination_courses dc, cuny_disciplines dn
      where dc.rule_id = %s
        and dn.institution = %s
        and dn.discipline = dc.discipline
        {destination_subjects_clause}
       order by discipline, cat_num
    """, [rule.id, rule.destination_institution] + destination_subject_args)
            if cursor.rowcount < 1:
                continue
            destination_courses = [
                Destination_Course._make(c) for c in cursor.fetchall()
            ]

            cursor.execute(
                """
         select  sc.course_id,
                 sc.offer_nbr,
                 sc.offer_count,
                 sc.discipline,
                 sc.catalog_number,
                 dn.discipline_name,
                 sc.cuny_subject,
                 sc.cat_num,
                 sc.min_credits,
                 sc.max_credits,
                 sc.min_gpa,
                 sc.max_gpa
         from source_courses sc, cuny_disciplines dn
        where sc.rule_id = %s
          and dn.institution = %s
          and dn.discipline = sc.discipline
        order by discipline, cat_num
        """, (rule.id, rule.source_institution))
            # Always rebuild the list (empty when there are no rows) so a rule
            # never inherits the source courses of the previous rule.
            source_courses = [
                Source_Course._make(c) for c in cursor.fetchall()
            ]

            # Create the Transfer_Rule tuple suitable for passing to
            # format_rules, and add it to the list of rules to pass.
            selected_rules.append(
                Transfer_Rule._make([
                    rule.id, rule.source_institution,
                    rule.destination_institution, rule.subject_area,
                    rule.group_number, rule.source_disciplines,
                    rule.source_subjects, rule.review_status, source_courses,
                    destination_courses
                ]))
    finally:
        # Release the database resources on every exit path.
        cursor.close()
        conn.close()

    if not selected_rules:
        num_rules = 'No matching transfer rules found.'
    elif len(selected_rules) == 1:
        num_rules = 'There is one matching transfer rule.'
    else:
        num_rules = f'There are {len(selected_rules):,} matching transfer rules.'

    rules_table = format_rules(selected_rules, scrollable=True)

    result = f"""
  {header(title='Review Rules: Review Selected Rules',
          nav_items=[{'type': 'link',
          'href': '/',
          'text': 'Main Menu'},
          {'type': 'link',
           'href': '/review_rules',
           'text': 'Change Colleges'},
           {'type': 'button',
            'class': 'back-button',
            'text': 'Change Subjects'

           }])}
    <details open>
      <summary>Instructions (click to open/close)</summary>
      <hr>
      {num_rules}
      <p>
      Blanket Credit courses are <span class="blanket">highlighted like this</span>.
      </p>
      <p>
        Rules that are <span class="credit-mismatch">highlighted like this</span> have a different
        number of credits taken from the number of credits transferred.
        Hover over the “=>” to see the numbers of credits.
      </p>
      <p>
        Credits in parentheses give the number of credits transferred where that does not match the
        nominal number of credits for a course.
      </p>
      <p>
        Rules that are <span class="evaluated">highlighted like this</span> are ones that you have
        reviewed but not yet submitted.
      </p>
      <p class="call-to-action">
        Click on a rule to review it
      </p>
    </details>
    <fieldset id="verification-fieldset"><legend>Review Reviews</legend>
        <p id="num-pending">You have not reviewed any transfer rules yet.</p>
        <button type="text" id="send-email" disabled="disabled">
        Review Your Reviews
      </button>
      <form method="post" action="#" id="review-form">
        Waiting for rules to finish loading ...
      </form>
    </fieldset>
    <div id="rules-table-div" class="selection-table-div table-height">
    {rules_table}
    </div>
  """
    return render_template('review_rules.html', result=Markup(result))
# Give the aggregated per-pair columns readable names.
pairs_count.columns = [
    'pairs', 'frequency', 'avg_norm_latency', 'total_duration', 'total_packets'
]
# Per-pair normalised latency: the pair's total duration divided by the
# pair's own total packets (not by the grand total over all pairs).
pairs_count['norm_latency'] = (
    pairs_count['total_duration'] / pairs_count['total_packets']) * 100

# We only want the list of pairs seen at each timestamp: each timestamp is a
# 'transaction' and the two IP addresses are the 'items' bought together.
data_l = list(df['pairs'])

# Support thresholds taken from the distribution of pair frequencies.
per_40 = np.percentile(pairs_count['frequency'], 40)
per_80 = np.percentile(pairs_count['frequency'], 80)
# The higher (80th percentile) threshold generates fewer patterns.
patterns80 = pyfpgrowth.find_frequent_patterns(data_l, per_80)

# Minimum rule confidence; 0.7 is a deliberately high threshold for testing.
confidence = 0.7
rules80 = pyfpgrowth.generate_association_rules(patterns80, confidence)

# format_rules takes the mined rules, the original data and the number of
# apps that can fit on a server.
rules_df_80per_70con = format_rules(rules80, df, 20)

# Check that the number of rules is as expected.
print(len(rules_df_80per_70con))
print(rules_df_80per_70con)
Example #4
0
def assign_servers_hourly(df, percentile, confidence, apps_server,
                          df_test=None):
    """Mine association rules per hour of day and apply them to test data.

    For every hour present in ``df['Date']`` the rows of that hour are mined
    with FP-growth (support threshold = ``percentile`` quantile of the
    per-pair row counts). The resulting rules are formatted with
    ``format_rules`` and applied with ``server_association`` to the rows of
    ``df_test`` that fall in the same hour. Hours that yield no rules are
    skipped, so every row of the summary refers to a single, known hour.

    Args:
        df: training frame; must provide 'Date', 'pairs' and 'pairs2'. An
            'hour' column is added/overwritten in place.
        percentile: quantile (0..1) of the pair frequencies used as minimum
            support.
        confidence: minimum confidence for ``generate_association_rules``.
        apps_server: forwarded to ``format_rules`` and ``server_association``.
        df_test: frame the rules are evaluated on; assumed to carry a 'Date'
            column like ``df`` (confirm against callers). When omitted, the
            module-level ``df_test`` is used, which is what this function
            implicitly relied on before the parameter existed.

    Returns:
        ``(server_assign_list, model_output, total_latency_sum,
        total_latency_model_sum, avg_latency_mean, avg_latency_model_mean)``
        where ``model_output`` is a DataFrame with one row per modelled hour.

    Raises:
        NameError: if ``df_test`` is not given and no module-level ``df_test``
            exists.
    """
    if df_test is None:
        try:
            df_test = globals()['df_test']
        except KeyError:
            raise NameError(
                "name 'df_test' is not defined; pass df_test explicitly"
            ) from None

    # The frame is mutated on purpose: format_rules receives df below and may
    # rely on the 'hour' column.
    df['hour'] = pd.DatetimeIndex(df['Date']).hour
    # Hours of the test rows are derived from the test frame itself; a mask
    # built from df cannot be applied to a different frame.
    test_hours = pd.DatetimeIndex(df_test['Date']).hour

    modelled_hours = []
    server_assign_list = []
    total_latency_list = []
    total_latency_model_list = []
    avg_latency_list = []
    avg_latency_model_list = []

    # Iterate over the hours that actually occur instead of assuming they are
    # exactly 0..n-1.
    for hour in sorted(df['hour'].dropna().unique()):
        train_slice = df[df['hour'] == hour]
        transactions = list(train_slice['pairs'])

        # Only the per-pair frequency is needed for the support threshold.
        pair_frequency = train_slice.groupby('pairs2')['Date'].count()
        support_threshold = pair_frequency.quantile(percentile)

        patterns = pyfpgrowth.find_frequent_patterns(transactions,
                                                     support_threshold)
        rules = pyfpgrowth.generate_association_rules(patterns, confidence)
        if not rules:
            # No rules for this hour: skip it, keeping rules and test data
            # aligned for the remaining hours.
            continue

        # Format the rules, bringing back the latency information from df.
        formated_rules = format_rules(rules, df, apps_server)

        # Apply the rules of this hour to the unseen test rows of this hour.
        test_slice = df_test[test_hours == hour]
        (_server_df, server_assignments, total_latency, total_latency_model,
         avg_latency, avg_latency_model) = server_association(
             formated_rules, test_slice, apps_server)

        modelled_hours.append(hour)
        server_assign_list.append(server_assignments)
        total_latency_list.append(total_latency)
        total_latency_model_list.append(total_latency_model)
        avg_latency_list.append(avg_latency)
        avg_latency_model_list.append(avg_latency_model)

    # Bring together the latencies of the actual data and of the model.
    model_output = pd.DataFrame({
        'hours': modelled_hours,
        'total_latency': total_latency_list,
        'total_latency_model': total_latency_model_list,
        'avg_latency': avg_latency_list,
        'avg_latency_model': avg_latency_model_list,
    })
    model_output['avg_latency_per_reduction'] = (
        (model_output['avg_latency'] - model_output['avg_latency_model']) /
        model_output['avg_latency']) * 100

    return (server_assign_list, model_output,
            model_output['total_latency'].sum(),
            model_output['total_latency_model'].sum(),
            model_output['avg_latency'].mean(),
            model_output['avg_latency_model'].mean())
Example #5
0
# Give the aggregated per-pair columns readable names.
pairs_count.columns = [
    'pairs', 'frequency', 'avg_norm_latency', 'total_duration', 'total_packets'
]
# Per-pair normalised latency: the pair's total duration divided by the
# pair's own total packets (not by the grand total over all pairs).
pairs_count['norm_latency'] = (
    pairs_count['total_duration'] / pairs_count['total_packets']) * 100

# We only want the list of pairs seen at each timestamp: each timestamp is a
# 'transaction' and the two IP addresses are the 'items' bought together.
data_l = list(df['pairs'])

# Support thresholds taken from the distribution of pair frequencies.
per_40 = np.percentile(pairs_count['frequency'], 40)
per_80 = np.percentile(pairs_count['frequency'], 80)
patterns40 = pyfpgrowth.find_frequent_patterns(data_l, per_40)
# The higher (80th percentile) threshold generates fewer patterns.
patterns80 = pyfpgrowth.find_frequent_patterns(data_l, per_80)

# Minimum rule confidence; 0.7 is a deliberately high threshold for testing.
confidence = 0.7
rules40 = pyfpgrowth.generate_association_rules(patterns40, confidence)
rules80 = pyfpgrowth.generate_association_rules(patterns80, confidence)

# format_rules (loaded from format_rules.py) takes the mined rules, the
# original data and the number of apps that can fit on a server.
rules_df_80per_70con = format_rules(rules80, df, 20)

# server_association is loaded from server_assocation.py. Other call sites
# unpack six values (a leading server_df); taking the last five keeps this
# script working whether or not that leading value is returned.
(*_, server_assignments80, total_latency, total_latency80, avg_latency,
 avg_latency80) = server_association(rules_df_80per_70con, df, 20)