Beispiel #1
0
def do_eval(model, train, dev, input_model=None):
    """Restore the newest checkpoint and log F1 / exact-match for both splits.

    Args:
        model: QA model that has an instance variable 'answer' that returns
            answer span and takes placeholders question, question_length,
            paragraph, paragraph_length.
        train: Training set.
        dev: Development set.
        input_model: Optional object passed through unchanged to
            ``multibatch_prediction_truth``.
    """

    def _score(session, dataset):
        # Predict over FLAGS.eval_batches batches and return (F1, EM).
        predicted, expected = multibatch_prediction_truth(
            session, model, dataset, FLAGS.eval_batches,
            input_model=input_model)
        return f1(predicted, expected), exact_match(predicted, expected)

    checkpoint_dir = os.path.join(FLAGS.train_dir, FLAGS.model_name)
    parameter_space_size()
    saver = tf.train.Saver()
    # TODO add loop to run over all checkpoints in folder,
    with tf.Session() as session:
        newest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
        saver.restore(session, newest_checkpoint)
        print('Evaluation in progress.', flush=True)

        # The training split is scored first, then the development split.
        start_evaluate = timer()
        train_f1, train_em = _score(session, train)
        dev_f1, dev_em = _score(session, dev)

        logging.info(f'Train/Dev F1: {train_f1:.3f}/{dev_f1:.3f}')
        logging.info(f'Train/Dev EM: {train_em:.3f}/{dev_em:.3f}')
        logging.info(f'Time to evaluate: {timer() - start_evaluate:.1f} sec')
Beispiel #2
0
 def match_payment(self):
     """
     Match visitors payment fields.

     For each previous visitor: if both sides carry a fingerprint and both
     fingerprints are flagged as globally unique, only the fingerprints are
     compared. Otherwise a 0 is recorded as soon as any card field is missing
     or differs, and the fingerprints are additionally compared whenever both
     are present.

     :return score: best payment match score over the previous visitors,
         multiplied by the ``payment_methods`` weight; 0 when nothing could be
         compared (no previous visitors, or identical card fields without
         fingerprints on both sides)
     """
     card_fields = ("brand", "expMonth", "expYear", "last4", "country")
     match_scores = []
     new_payment = self.new_v["payment_methods"]
     for prev_v in self.prev_vs:
         prev_payment = prev_v["payment_methods"]
         both_fingerprinted = (
             prev_payment["fingerprint"] and new_payment["fingerprint"]
         )
         if (
             both_fingerprinted
             and prev_payment["global_unique_fingerprint"]
             and new_payment["global_unique_fingerprint"]
         ):
             match_scores.append(
                 exact_match(prev_payment["fingerprint"], new_payment["fingerprint"])
             )
             # NOTE(review): this stops looking at the remaining previous
             # visitors even when the fingerprints did not match - confirm
             # that `break` (rather than `continue`) is intended.
             break

         # Any missing or differing card field counts as a mismatch (once).
         if any(
             not prev_payment[field]
             or not new_payment[field]
             or prev_payment[field] != new_payment[field]
             for field in card_fields
         ):
             match_scores.append(0)
         if both_fingerprinted:
             match_scores.append(
                 exact_match(prev_payment["fingerprint"], new_payment["fingerprint"])
             )
     # `default` avoids a ValueError from max() when no score was collected.
     return max(match_scores, default=0) * self.weights["payment_methods"]
Beispiel #3
0
    def match_identity_information(self):
        """
        Match visitors identification fields.

        Every identity field of the new visitor is compared with
        ``exact_match`` against each previous visitor, and the per-field
        results are combined by ``generate_match_score`` using the
        ``visitor_users`` weights.

        :return score: best identity match score over all previous visitors,
            or 0.0 when there are no previous visitors to compare against
        """
        identity_fields = ("email", "Phone", "First_name", "Last_name", "username")
        new_id = self.new_v["visitor_users"]
        match_scores = []
        for prev_v in self.prev_vs:
            prev_id = prev_v["visitor_users"]
            results = {
                field: exact_match(new_id[field], prev_id[field])
                for field in identity_fields
            }
            match_scores.append(generate_match_score(results, self.weights["visitor_users"]))

        # `default` avoids a ValueError from max() when prev_vs is empty.
        return max(match_scores, default=0.0)
Beispiel #4
0
    def match_address(self):
        """
        Match visitors address fields.

        ``Line1`` and ``Line2`` are scored as ``1.0 - edit_distance(...)``;
        the remaining fields are scored with ``exact_match``. The per-field
        results are combined by ``generate_match_score`` using the
        ``visitor_addresses`` weights.

        :return score: best address match score over all previous visitors,
            or 0.0 when there are no previous visitors to compare against
        """
        fuzzy_fields = ("Line1", "Line2")
        exact_fields = ("City", "Country", "Postal_code", "State")
        new_address = self.new_v["visitor_addresses"]
        match_scores = []
        for prev_v in self.prev_vs:
            prev_address = prev_v["visitor_addresses"]
            results = {
                field: 1.0 - edit_distance(prev_address[field], new_address[field])
                for field in fuzzy_fields
            }
            results.update(
                (field, exact_match(prev_address[field], new_address[field]))
                for field in exact_fields
            )
            match_scores.append(generate_match_score(results, self.weights["visitor_addresses"]))

        # `default` avoids a ValueError from max() when prev_vs is empty.
        return max(match_scores, default=0.0)
Beispiel #5
0
    def match_telemetry(self):
        """
        Match visitors telemetry fields.

        Every IP record of each previous visitor is paired with every IP
        record of the new visitor. A pair raises the red flag when the two
        locations are more than 10 distance units apart and the implied speed
        exceeds ``self.MAX_PLAUSIBLE_SPEED`` (or no time elapsed at all);
        flagged pairs contribute no distance score.

        :return: dict with
            ``score``: best telemetry match score over all previous visitors
            (0.0 when there are none) and
            ``ip_timing_red_flag``: True if IP timing/geographic location
            values indicate it can't be the same person
        """
        ip_timing_red_flag = False
        match_scores = []
        for prev_v in self.prev_vs:
            distance_scores, ip_matches = [], []
            for previous_ip_data in prev_v["ips"]:
                for new_ip_data in self.new_v["ips"]:
                    distance = haversine_distance(previous_ip_data["props"], new_ip_data["props"])
                    time_delta = timestamp_difference(
                        previous_ip_data["updated_at"], new_ip_data["updated_at"]
                    )
                    ip_matches.append(exact_match(previous_ip_data["ip"], new_ip_data["ip"]))
                    # A zero time delta over a real distance is treated as
                    # implausible instead of dividing by zero.
                    if distance > 10 and (
                        time_delta == 0
                        or (distance / time_delta) > self.MAX_PLAUSIBLE_SPEED
                    ):
                        ip_timing_red_flag = True
                    elif distance < 10:
                        # Distance scores are normalised distances (0 = same
                        # place), so a nearby pair scores 0, not 1.
                        distance_scores.append(0)
                    else:
                        distance_scores.append(min(1000, distance) / 1000)

            # With no usable pair (no IPs, or every pair red-flagged) there
            # is no evidence of proximity.
            if distance_scores:
                geographic_proximity = 1 - (sum(distance_scores) / len(distance_scores))
            else:
                geographic_proximity = 0.0

            results = {
                "ip_match": max(ip_matches, default=0),
                "geographic_proximity": geographic_proximity,
                "creation_time_proximity": 1
                - min(
                    1,
                    timestamp_difference(
                        prev_v["visitors"]["createdAt"], self.new_v["visitors"]["createdAt"]
                    )
                    / 86400,  # seconds in a day
                ),
                "visitor_age_proximity": 1
                - age_difference(
                    prev_v["visitors"]["createdAt"], self.new_v["visitors"]["createdAt"]
                ),
            }

            match_scores.append(generate_match_score(results, self.weights["telemetry"]))

        # `default` avoids a ValueError from max() when prev_vs is empty.
        return {
            "score": max(match_scores, default=0.0),
            "ip_timing_red_flag": ip_timing_red_flag,
        }
Beispiel #6
0
def do_train(model, train, dev, input_model=None):
    """Train a model, periodically checkpointing, logging and evaluating it.

    Resumes from the latest checkpoint in ``FLAGS.train_dir/FLAGS.model_name``
    when one exists, then consumes training batches until the global step
    reaches ``FLAGS.max_steps``.

    Args:
        model: QA model that has an instance variable 'answer' that returns
            answer span and takes placeholders question, question_length,
            paragraph, paragraph_length. This function uses its
            ``fill_feed_dict``, ``loss`` and ``train`` members.
        train: Training set; must provide ``get_batch`` and ``epoch``.
        dev: Development set; must provide ``get_batch``.
        input_model: Optional model whose ``run(question)`` output replaces
            the first element of every training batch before it is fed to
            ``model``. Also forwarded to ``multibatch_prediction_truth``.
    """
    parameter_space_size()

    checkpoint_dir = os.path.join(FLAGS.train_dir, FLAGS.model_name)
    summary_writer = tf.summary.FileWriter(checkpoint_dir)

    # The writer buffers events; close it on every exit path so that nothing
    # is lost when training finishes or fails.
    try:
        losses = []
        init = tf.global_variables_initializer()
        # merge_all() returns None when the graph defines no summaries.
        summary = tf.summary.merge_all()

        saver = tf.train.Saver()

        # Training session
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True

        with tf.Session(config=config) as sess:
            sess.run(init)
            latest_ckpt = tf.train.latest_checkpoint(checkpoint_dir)
            if latest_ckpt:
                saver.restore(sess, latest_ckpt)

            start = timer()
            epoch = -1
            for i in itertools.count():
                feed_dict_inputs = train.get_batch(FLAGS.batch_size, replace=False)
                if input_model:
                    # feed into siamese model instead
                    question = feed_dict_inputs[0]
                    M = input_model.run(question)
                    feed_dict_inputs[0] = M
                feed_dict = model.fill_feed_dict(*feed_dict_inputs,
                                                 is_training=True)

                if epoch != train.epoch:
                    epoch = train.epoch
                    print(f'Epoch {epoch}')

                fetch_dict = {
                    'step': tf.train.get_global_step(),
                    'loss': model.loss,
                    'train': model.train
                }
                # `step` only exists after the first iteration, hence i > 0.
                if summary is not None and i > 0 and (step + 1) % 20 == 0:
                    fetch_dict['summary'] = summary
                result = sess.run(fetch_dict, feed_dict)
                step = result['step']
                if 'summary' in result:
                    summary_writer.add_summary(result['summary'], step)

                if step > 0 and (step == 50 or (step % 300 == 0)):
                    saver.save(sess, os.path.join(checkpoint_dir, 'model'), step)

                # Moving Average loss
                losses.append(result['loss'])
                if step == 1 or step == 10 or step == 50 or step == 100 or step % FLAGS.print_every == 0:
                    mean_loss = sum(losses) / len(losses)
                    losses = []
                    print(f'Step {step}, loss {mean_loss:.2f}')

                # Train/Dev Evaluation
                if step != 0 and (step == 200 or step % 600 == 0):
                    # NOTE(review): unlike the training batch above, this dev
                    # batch is not passed through `input_model` - confirm
                    # that is intended when `input_model` is given.
                    feed_dict = model.fill_feed_dict(
                        *dev.get_batch(FLAGS.batch_size))
                    fetch_dict = {'loss': model.loss}
                    dev_loss = sess.run(fetch_dict, feed_dict)['loss']
                    start_evaluate = timer()
                    prediction, truth = multibatch_prediction_truth(
                        sess,
                        model,
                        dev,
                        num_batches=20,
                        random=True,
                        input_model=input_model)
                    dev_f1 = f1(prediction, truth)
                    dev_em = exact_match(prediction, truth)
                    prediction, truth = multibatch_prediction_truth(
                        sess,
                        model,
                        train,
                        num_batches=20,
                        random=True,
                        input_model=input_model)
                    train_f1 = f1(prediction, truth)
                    train_em = exact_match(prediction, truth)
                    summary_writer.add_summary(
                        tf.Summary(value=[
                            tf.Summary.Value(tag='F1_DEV', simple_value=dev_f1)
                        ]), step)
                    summary_writer.add_summary(
                        tf.Summary(value=[
                            tf.Summary.Value(tag='F1_TR', simple_value=train_f1)
                        ]), step)
                    print(
                        f'Step {step}, Dev loss {dev_loss:.2f}, Train/Dev F1: {train_f1:.3f}/{dev_f1:.3f}, Train/Dev EM: {train_em:.3f}/{dev_em:.3f}, Time to evaluate: {timer() - start_evaluate:.1f} sec'
                    )

                if step > 0 and step % FLAGS.global_steps_per_timing == 0:
                    time_iter = timer() - start
                    print(
                        f'INFO:global_step/sec: {FLAGS.global_steps_per_timing/time_iter:.2f}'
                    )
                    start = timer()

                # Also stop when a restored checkpoint is already past a
                # positive max_steps; an equality test alone would then loop
                # forever. Non-positive or None limits behave as before.
                max_steps = FLAGS.max_steps
                overshot = max_steps is not None and 0 < max_steps < step
                if step == max_steps or overshot:
                    break
    finally:
        summary_writer.close()
Beispiel #7
0
 def identify_top_fingerprint_match(self):
     """
     Iterate through all potential fingerprint matches, return the score of the
     fingerprint that matches the new fingerprint most closely.

     :return: tuple ``(similarity, results)`` for the best-scoring previous
         fingerprint, where ``results`` is the per-field comparison dict. A
         similarity of exactly 1.0 is returned immediately; on ties the later
         fingerprint wins. ``(0.0, {})`` when there are no previous
         fingerprints.
     """
     max_similarity, max_results = 0.0, {}
     for prev_fp in self.prev_fps:
         results = self._compare_fingerprint(prev_fp)
         similarity = self.estimate_fingerprint_match(results)
         if similarity == 1.0:
             return similarity, results
         if similarity >= max_similarity:
             max_similarity, max_results = similarity, results
     # Must run after the loop: returning inside it would only ever score
     # the first previous fingerprint.
     return max_similarity, max_results

 def _compare_fingerprint(self, prev_fp):
     """
     Compare one previous fingerprint with ``self.new_fp`` field by field.

     :param prev_fp: previous fingerprint, indexed by the field names below
     :return: dict mapping each field name to the result of its comparator,
         called as ``comparator(prev_fp[field], self.new_fp[field])``
     """
     # (field, comparator) pairs; the order fixes the key order of the result.
     field_comparators = (
         ("browserVersion", less_than_or_equal),
         ("browserMajorVersion", less_than_or_equal),
         ("isIE", exact_match),
         ("isChrome", exact_match),
         ("isFirefox", exact_match),
         ("isSafari", exact_match),
         ("isOpera", exact_match),
         ("engine", exact_match),
         ("engineVersion", less_than_or_equal),
         ("osVersion", less_than_or_equal),
         ("isWindows", exact_match),
         ("isMac", exact_match),
         ("isLinux", exact_match),
         ("isUbuntu", exact_match),
         ("isSolaris", exact_match),
         ("IsMobile", exact_match),
         ("isMobileMajor", exact_match),
         ("isMobileAndroid", exact_match),
         ("isMobileOpera", exact_match),
         ("isMobileWindows", exact_match),
         ("isMobileBlackBerry", exact_match),
         ("isMobileIOS", exact_match),
         ("isIphone", exact_match),
         ("isIpad", exact_match),
         ("isIpod", exact_match),
         ("colorDepth", less_than_or_equal),
         ("currentResolution", exact_match),
         ("plugins", match_set),
         ("isJava", asymmetric_match),
         ("isFlash", asymmetric_match),
         ("isSilverlight", asymmetric_match),
         ("mimeTypes", match_set),
         ("isMimeTypes", asymmetric_match),
         ("fonts", match_set),
         ("isLocalStorage", asymmetric_match),
         ("isSessionStorage", asymmetric_match),
         ("isCookie", asymmetric_match),
         ("timeZone", exact_match),
         ("language", exact_match),
         ("systemLanguage", exact_match),
         ("isCanvas", asymmetric_match),
     )
     return {
         field: compare(prev_fp[field], self.new_fp[field])
         for field, compare in field_comparators
     }