def _summarize_targets(geoentities):
    """Return per-target (lowest truthy admin level, highest population) maps.

    Both results are dicts keyed by ``entity.target``. A target is absent
    from a dict when none of its entities carried a usable value.
    """
    lowest_admin_level = {}
    highest_population = {}
    for entity in geoentities:
        name = entity.target

        level = entity.admin_level
        # Falsy levels (None and 0) are skipped, exactly as the original
        # per-row scan did with ``if g.admin_level``.
        if level and (name not in lowest_admin_level
                      or level < lowest_admin_level[name]):
            lowest_admin_level[name] = level

        population = entity.population
        # None is skipped so the comparison never mixes None with numbers.
        if population is not None and (name not in highest_population
                                       or population > highest_population[name]):
            highest_population[name] = population
    return lowest_admin_level, highest_population


def run(geoentities):
    """Score every geoentity with the trained classifier.

    Builds one feature row per element of ``geoentities``, feeds the rows to
    ``classifier.predict_proba`` and stores ``row[1]`` of each prediction on
    the matching entity as ``probability``.

    Args:
        geoentities: indexable sequence of objects exposing ``target``,
            ``admin_level``, ``population``, ``cluster_frequency``,
            ``country_code``, ``country_rank``, ``edit_distance``,
            ``has_mpoly``, ``has_pcode``, ``matches_topic``, ``popularity``
            and ``median_distance_from_all_other_points``.

    Returns:
        None. Results are written onto the entities in place.

    Any exception is caught and reported through ``fail()``.
    """
    try:
        print("starting ai.predict")
        connection.close()

        start = datetime.now()
        classifier = DNNLinearCombinedClassifier(
            model_dir=MODEL_DIR,
            linear_feature_columns=wide_columns,
            dnn_feature_columns=deep_columns,
            dnn_hidden_units=[100, 50]
        )
        print("classifier: %s" % (classifier,))
        print("creating the classifier took %s seconds"
              % (datetime.now() - start).total_seconds())

        # NOTE(review): predictions are matched to entities by position below,
        # which presumably relies on get_fake_df() returning empty columns --
        # confirm.
        df = get_fake_df()

        print("about to populate data frame for prediction")
        start_df = datetime.now()

        # One pass over the entities instead of rescanning the whole list for
        # every row.
        lowest_by_target, highest_by_target = _summarize_targets(geoentities)

        for geoentity in geoentities:
            name = geoentity.target
            admin_level = geoentity.admin_level
            # The original read these two values from ``g``, the variable
            # leaked by a Python 2 list comprehension, i.e. always the LAST
            # entity in the list. They must come from the current entity.
            population = geoentity.population

            lowest_admin_level = lowest_by_target.get(name, -99)
            is_highest_population = (
                bool(population) and population == highest_by_target.get(name)
            )

            df['admin_level'].append(str(admin_level or "None"))
            df['cluster_frequency'].append(geoentity.cluster_frequency or 0)
            df['country_code'].append(geoentity.country_code or "UNKNOWN")
            df['country_rank'].append(geoentity.country_rank or 999)
            df['edit_distance'].append(str(geoentity.edit_distance))
            df['has_mpoly'].append(str(geoentity.has_mpoly or False))
            df['has_pcode'].append(str(geoentity.has_pcode or False))
            df['is_country'].append(str(admin_level == 0))
            df['is_lowest_admin_level'].append(str(lowest_admin_level == admin_level))
            df['is_highest_population'].append(str(is_highest_population))
            df['median_distance'].append(geoentity.median_distance_from_all_other_points)
            df['matches_topic'].append(str(geoentity.matches_topic or "False"))
            df['population'].append(population)
            df['popularity'].append(geoentity.popularity)

        print("populating df took %s minutes"
              % ((datetime.now() - start_df).total_seconds() / 60))

        predictions = classifier.predict_proba(input_fn=lambda: input_fn(df))
        for index, row in enumerate(predictions):
            # row[1] is presumably the probability of the positive class.
            geoentities[index].probability = row[1]
    except Exception as e:
        fail("EXCEPTION in scripts.ai.predict.run: " + str(e))
def train():
    """Rebuild the classifier from verified features plus CSV input data.

    Steps, in order:
      1. Load the verified ``Feature`` rows and shuffle them.
      2. Delete ``MODEL_DIR`` and create a fresh classifier there.
      3. Split the features in half into a training and a test frame.
      4. For every ``.csv`` file in ``PATH_TO_DIRECTORY_OF_INPUT_DATA``, load
         it and assign the first half of each column to the training frame
         and the second half to the test frame.
      5. Fit for 200 steps, evaluate for 10 steps and print the metrics.

    Takes no arguments and returns None. Any exception is caught and
    reported through ``fail()``.
    """
    try:
        start = datetime.now()
        print("starting appbkto.scripts.predict.train")
        connection.close()

        features = list(Feature.objects.filter(verified=True).values(
            "id",
            "featureplace__id",
            "featureplace__place__admin_level",
            "featureplace__correct",
            "featureplace__place_id",
            "featureplace__cluster_frequency",
            "featureplace__place__country_code",
            "featureplace__country_rank",
            "featureplace__place__mpoly",
            "featureplace__place__pcode",
            "featureplace__popularity",
            "featureplace__place__population",
            "featureplace__median_distance",
            "featureplace__place__topic_id",
            "topic_id"
        ))
        print("features: %s %s" % (type(features), len(features)))

        # Start from a clean model directory so no old checkpoint is reused.
        rmtree(MODEL_DIR, ignore_errors=True)

        print("creating classifier")
        classifier = DNNLinearCombinedClassifier(
            model_dir=MODEL_DIR,
            linear_feature_columns=wide_columns,
            dnn_feature_columns=deep_columns,
            dnn_hidden_units=[100, 50]
        )
        print("classifier: %s" % (classifier,))

        number_of_features = len(features)

        print("training with real data")
        print("shuffle the features")
        shuffle(features)

        # Floor division keeps the slice index an integer.
        half = number_of_features // 2
        print("half is %s" % (half,))
        df_train = get_df_from_features(features[:half])
        df_test = get_df_from_features(features[half:])

        for filename in listdir(PATH_TO_DIRECTORY_OF_INPUT_DATA):
            print("filename for import : %s" % (filename,))
            if filename.endswith(".csv"):
                df = get_df_from_csv(PATH_TO_DIRECTORY_OF_INPUT_DATA + "/" + filename)

                # Row count is taken from an arbitrary column; a frame with
                # no columns counts as zero rows instead of raising.
                row_count = len(next(iter(df.values()))) if df else 0
                half = row_count // 2
                print("half: %s" % (half,))

                for column_name in df:
                    # Purely diagnostic: an empty or missing column must not
                    # abort the whole training run.
                    try:
                        existing_type = type(df_train[column_name][0])
                        incoming_type = type(df[column_name][0])
                    except (KeyError, IndexError):
                        existing_type = incoming_type = None
                    if existing_type is not incoming_type:
                        print("mismatch type for  %s" % (column_name,))
                        print("type(df_train[column_name][0]): %s" % (existing_type,))
                        print("type(df[column_name][:half][0]): %s" % (incoming_type,))

                    # NOTE(review): this REPLACES the column built from the
                    # verified features (and from any earlier CSV) rather than
                    # extending it -- confirm that overwriting is intended.
                    df_train[column_name] = df[column_name][:half]
                    df_test[column_name] = df[column_name][half:]

        print("fitting")
        try:
            classifier.fit(input_fn=lambda: input_fn(df_train), steps=200)
        except Exception as e:
            # NOTE(review): if fail() returns, evaluation below still runs on
            # a model whose fit failed -- confirm this is wanted.
            fail("EXCEPTION fitting model in scripts.ai.predict.train: " + str(e))
        print("\nfitted")

        results = classifier.evaluate(input_fn=lambda: input_fn(df_test), steps=10)
        for key in sorted(results):
            print("%s: %s" % (key, results[key]))

        print("took %s minutes to train"
              % ((datetime.now() - start).total_seconds() / 60))
    except Exception as e:
        fail("EXCEPTION in ai.predict.train: " + str(e))