def main():
    '''
    Prepares the dataset for training: preprocesses the data, builds and
    engineers the features, and writes the resulting .csv files into the
    data folder, then trains the models and compares their scores.

    Returns:
        compare_models : DataFrame of train and prediction scores per model
    '''
    data_lst = ["val", "train"]
    for i in data_lst:
        # Preprocessing the data
        my_input = "../data/{input}.csv".format(input=i)
        preprocess.execute(input_file=my_input,
                           output_file='../data/{input}_bf.csv'.format(input=i))
        # Preparing the data for training (build_features overwrites the
        # preprocessed file in place)
        my_input = "../data/{input}_bf.csv".format(input=i)
        build_features.execute(input_file=my_input,
                               output_file='../data/{input}_bf.csv'.format(input=i))
        print("{input}_bf.csv file has been created".format(input=i))
    train_acc = train.execute()
    pred_acc = predict.execute()
    compare_models = pd.DataFrame({
        'Model': ['RandForest', 'ExtTree', 'GraBoo', 'AdaBoo'],
        'Train Score': train_acc,
        'Prediction Score': pred_acc
    })
    print(compare_models)
    return compare_models
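# A minimal, hypothetical entry point for the pipeline above -- a sketch
# assuming the preprocess, build_features, train and predict modules and
# pandas (as pd) are importable from the surrounding package:
import pandas as pd

if __name__ == '__main__':
    main()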
def test_DistancePoint100FromCentre(self):
    df = preprocess.execute("../../Data/TestData/bee-data_NT_test.csv")
    # .iloc[99] is the 100th trajectory point
    Distance = df['DistanceCentre'].iloc[99]
    print("Distance:", Distance)
    self.assertAlmostEqual(Distance, 36.1062682861314)
def test_locationDensity(self):
    df = preprocess.execute(
        "../../Data/TestData/bee-data_NT_test_locationDensity.csv")
    arena = classArena.classArena(df)
    # getSegment args: (traj, lseg, ovlp, cum_dist_end_prev)
    dt_first_segment, cum_dist_end_segment, end_trajectory = cs.getSegment(
        df, 7, 0, 0)
    length_first_segment = cs.getSegmentLength(dt_first_segment)
    features_first_segment = sg.Segment(dt_first_segment,
                                        length_first_segment, arena, 0)
    feat = features_first_segment.getFeature(
        enums.eFeature.LocationDensity)
    SumDistanceBetweenEachPairPoints = 1 + math.sqrt(1**2 + 2**2) + 2 \
        + 2 + math.sqrt(1**2 + 2**2) \
        + 1
    nCr = 6  # number of point pairs: C(4, 2)
    LocationDensity_TrueValue = SumDistanceBetweenEachPairPoints / nCr
    self.assertEqual(feat.value, LocationDensity_TrueValue)
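# Location density here is the mean pairwise distance between the segment's
# points. A standalone sketch of that arithmetic, using four hypothetical
# points on a 1 x 2 rectangle whose pairwise distances match the sum in the
# test above (the actual test CSV coordinates are not shown here):
import itertools
import math

points = [(0, 0), (1, 0), (0, 2), (1, 2)]  # illustrative only
pair_dists = [math.dist(p, q) for p, q in itertools.combinations(points, 2)]
location_density = sum(pair_dists) / len(pair_dists)
# sum(pair_dists) == 1 + sqrt(5) + 2 + 2 + sqrt(5) + 1 and len == C(4, 2) == 6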
def test_calcCentralDisplacement_withinCorrectRange(self):
    df = preprocess.execute("../../Data/TestData/bee-data_NT_test.csv")
    arena = classArena.classArena(df)
    dt_segment, cum_dist_end_segment, end_trajectory = cs.getSegment(
        df, 10, 0, 0)
    for i in range(0, 20):
        dt_segment, cum_dist_end_segment, end_trajectory = cs.getSegment(
            df, 10, 0.3, cum_dist_end_segment)
        length_segment = cs.getSegmentLength(dt_segment)
        features_segment = sg.Segment(dt_segment, length_segment, arena, 0)
        cent_displ = features_segment.getFeature(
            enums.eFeature.CentralDisplacement).value
        self.assertLessEqual(cent_displ, 1)  # test never bigger than arena size
        self.assertGreater(cent_displ, 0)  # test positive
        self.assertGreaterEqual(
            cent_displ,
            (features_segment.ellipse.centre[0] - arena.centre_x) * 2 /
            arena.diameter)  # test not less than normalised ellipse-centre x
        self.assertGreaterEqual(
            cent_displ,
            (features_segment.ellipse.centre[1] - arena.centre_y) * 2 /
            arena.diameter)  # test not less than normalised ellipse-centre y
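# The bounds in the test above hold if central displacement is the distance
# of the ellipse centre from the arena centre, normalised by the arena
# radius (diameter / 2): a Euclidean distance is never less than either of
# its signed components. A sketch of that assumed definition:
import math

def central_displacement(ellipse_centre, arena_centre, arena_diameter):
    # assumed formula: offset from arena centre, scaled to [0, 1]
    dx = ellipse_centre[0] - arena_centre[0]
    dy = ellipse_centre[1] - arena_centre[1]
    return math.hypot(dx, dy) * 2 / arena_diameter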
def execute(data_dir, data_file, categorize=False):
    """Builds features.

    Args:
        data_dir (str): relative path to data subdirectory
        data_file (str): name of csv data file
        categorize (bool): set to True if Age and Fare should be categorized
    """
    # Read preprocessed data:
    data = preprocess.execute(data_dir + data_file)

    # Replace sex strings with binary value:
    data["Sex"] = data["Sex"].replace("male", 0)
    data["Sex"] = data["Sex"].replace("female", 1)

    if categorize:
        # Convert age into categories:
        data["Age"] = data["Age"].astype(int)
        data.loc[data["Age"] <= 19, "Age"] = 0
        data.loc[(data["Age"] > 19) & (data["Age"] <= 25), "Age"] = 1
        data.loc[(data["Age"] > 25) & (data["Age"] <= 32), "Age"] = 2
        data.loc[(data["Age"] > 32) & (data["Age"] <= 42), "Age"] = 3
        data.loc[(data["Age"] > 42), "Age"] = 4

        # Convert fare into categories:
        data.loc[data["Fare"] <= 7.854, "Fare"] = 0
        data.loc[(data["Fare"] > 7.854) & (data["Fare"] <= 10.5), "Fare"] = 1
        data.loc[(data["Fare"] > 10.5) & (data["Fare"] <= 22.225), "Fare"] = 2
        data.loc[(data["Fare"] > 22.225) & (data["Fare"] <= 39.688), "Fare"] = 3
        data.loc[(data["Fare"] > 39.688), "Fare"] = 4

    # Embarked: C = Cherbourg, Q = Queenstown, S = Southampton.
    # Replace the labels with numbers from 1 to 3, in order of first appearance:
    embarked_dict = {}
    embarked_dict_values = 0
    for i in data.Embarked:
        if i not in embarked_dict:
            embarked_dict_values = embarked_dict_values + 1
            embarked_dict[i] = embarked_dict_values
    for i in embarked_dict.keys():
        data["Embarked"].replace(i, embarked_dict[i], inplace=True)

    # Add columns FamilySize and IsAlone:
    data["FamilySize"] = data["SibSp"] + data["Parch"] + 1
    data["IsAlone"] = 0
    data.loc[data["FamilySize"] == 1, "IsAlone"] = 1

    data.to_csv(data_dir + data_file + processed_suffix + "_" + str(categorize),
                sep=";", index=False)
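# The chained .loc masks above amount to binning into right-closed
# intervals, which pd.cut expresses directly. A sketch of the same Fare
# categorisation, assuming pandas is imported as pd and numpy as np:
import numpy as np
import pandas as pd

fare_bins = [-np.inf, 7.854, 10.5, 22.225, 39.688, np.inf]
# pd.cut's default right-closed bins reproduce the "> lower, <= upper" masks
data["Fare"] = pd.cut(data["Fare"], bins=fare_bins,
                      labels=[0, 1, 2, 3, 4]).astype(int)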
def test_getSegmentLength(self):
    df = preprocess.execute("../../Data/TestData/bee-data_NT_test.csv")
    first_segment, cum_dist_end_segment, end_trajectory = cs.getSegment(
        df, 10, 0, 0)
    second_segment, cum_dist_end_segment, end_trajectory = cs.getSegment(
        df, 10, 0.3, cum_dist_end_segment)
    length_segment = cs.getSegmentLength(second_segment)
    self.assertAlmostEqual(length_segment, 10.4350527761)
def test_FindingCorrectSecondSegment(self):
    df = preprocess.execute("../../Data/TestData/bee-data_NT_test.csv")
    first_segment, cum_dist_end_segment, end_trajectory = cs.getSegment(
        df, 10, 0, 0)
    second_segment, cum_dist_end_segment, end_trajectory = cs.getSegment(
        df, 10, 0.3, cum_dist_end_segment)
    self.assertAlmostEqual(second_segment['CumulativeDistance'].iloc[0],
                           7.0969949712)
    self.assertAlmostEqual(second_segment['CumulativeDistance'].iloc[-1],
                           17.5320477473)
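# The two tests above are consistent with segment length being the
# difference of the cumulative distances at a segment's ends
# (17.5320477473 - 7.0969949712 == 10.4350527761), and with an overlap of
# 0.3 starting the next segment roughly overlap * lseg before the previous
# segment's end, snapped to trajectory points. A sketch of that assumed
# arithmetic:
def segment_length(cum_dist_start, cum_dist_end):
    # path length covered by the segment
    return cum_dist_end - cum_dist_start

def next_segment_start(cum_dist_end_prev, lseg, overlap):
    # assumed rule: re-enter the previous segment by overlap * lseg
    return cum_dist_end_prev - overlap * lseg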
def test_iQRangeDistanceCentre(self):
    df = preprocess.execute("../../Data/TestData/bee-data_NT_test.csv")
    arena = classArena.classArena(df)
    dt_first_segment, cum_dist_end_segment, end_trajectory = cs.getSegment(
        df, 10, 0, 0)
    dt_second_segment, cum_dist_end_segment, end_trajectory = cs.getSegment(
        df, 10, 0.3, cum_dist_end_segment)
    len_second_segment = cs.getSegmentLength(dt_second_segment)
    second_segment_features = sg.Segment(dt_second_segment,
                                         len_second_segment, arena, 0)
    feat = second_segment_features.getFeature(enums.eFeature.IQRange)
    self.assertAlmostEqual(feat.value, 0.0164803959758471)
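# A sketch of an interquartile range over the segment's distance-from-centre
# values, assuming numpy is available (the feature's exact percentile
# interpolation mode is not shown in the source):
import numpy as np

def iq_range(distances):
    # IQR = 75th percentile - 25th percentile
    q75, q25 = np.percentile(distances, [75, 25])
    return q75 - q25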
def test_checkCorrectingRotation(self):
    df = preprocess.execute(
        "../../Data/TestData/bee-data_NT_test_maxloop.csv")
    arena = classArena.classArena(df)
    # getSegment args: (traj, lseg, ovlp, cum_dist_end_prev)
    dt_first_segment, cum_dist_end_segment, end_trajectory = cs.getSegment(
        df, 20, 0, 0)
    length_first_segment = cs.getSegmentLength(dt_first_segment)
    features_first_segment = sg.Segment(dt_first_segment,
                                        length_first_segment, arena, 0)
    feat = features_first_segment.getFeature(enums.eFeature.MeanSpeed)
    self.assertEqual(feat.value, 118.75)
def test_sumAbsAngles(self):
    df = preprocess.execute(
        "../../Data/TestData/bee-data_NT_test_sum_abs_angles.csv")
    arena = classArena.classArena(df)
    # getSegment args: (traj, lseg, ovlp, cum_dist_end_prev)
    dt_first_segment, cum_dist_end_segment, end_trajectory = cs.getSegment(
        df, 7, 0, 0)
    length_first_segment = cs.getSegmentLength(dt_first_segment)
    features_first_segment = sg.Segment(dt_first_segment,
                                        length_first_segment, arena, 0)
    feat = features_first_segment.getFeature(
        enums.eFeature.SumAbsoluteAngles)
    # a closed convex loop turns through a total of 2 * pi
    self.assertEqual(feat.value, 2 * math.pi)
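# A standalone sketch of summing absolute turning angles, using a
# hypothetical square loop (the test CSV is not shown); the exterior angles
# of any closed convex path sum to 2 * pi:
import math

points = [(0, 0), (1, 0), (1, 1), (0, 1), (0, 0), (1, 0)]  # illustrative
headings = [math.atan2(y2 - y1, x2 - x1)
            for (x1, y1), (x2, y2) in zip(points, points[1:])]
total = 0.0
for h1, h2 in zip(headings, headings[1:]):
    # wrap each heading change into [-pi, pi) before taking its magnitude
    total += abs((h2 - h1 + math.pi) % (2 * math.pi) - math.pi)
# total == 2 * math.pi for this square loop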
def test_MedianDistanceCentre(self):
    df = preprocess.execute("../../Data/TestData/bee-data_NT_test.csv")
    arena = classArena.classArena(df)
    dt_first_segment, cum_dist_end_segment, end_trajectory = cs.getSegment(
        df, 10, 0, 0)
    dt_second_segment, cum_dist_end_segment, end_trajectory = cs.getSegment(
        df, 10, 0.3, cum_dist_end_segment)
    length_second_segment = cs.getSegmentLength(dt_second_segment)
    features_second_segment = sg.Segment(dt_second_segment,
                                         length_second_segment, arena, 0)
    feat = features_second_segment.getFeature(
        enums.eFeature.MedianDistanceFromCentre)
    self.assertAlmostEqual(feat.value, 0.8628325515)
def test_pathEfficiency(self):
    df = preprocess.execute(
        "../../Data/TestData/bee-data_NT_test_sum_abs_angles.csv")
    arena = classArena.classArena(df)
    # getSegment args: (traj, lseg, ovlp, cum_dist_end_prev)
    dt_first_segment, cum_dist_end_segment, end_trajectory = cs.getSegment(
        df, 20, 0, 0)
    length_first_segment = cs.getSegmentLength(dt_first_segment)
    features_first_segment = sg.Segment(dt_first_segment,
                                        length_first_segment, arena, 0)
    feat = features_first_segment.getFeature(enums.eFeature.PathEfficiency)
    # net displacement of 1 over a path of length 7
    PathEfficiency_TrueValue = 1.0 / 7.0
    self.assertEqual(feat.value, PathEfficiency_TrueValue)
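# A standalone sketch of path efficiency as net displacement divided by
# path length, with hypothetical points chosen so the result is 1/7 (the
# test CSV coordinates are not shown):
import math

def path_efficiency(points):
    path_len = sum(math.dist(p, q) for p, q in zip(points, points[1:]))
    net_displacement = math.dist(points[0], points[-1])
    return net_displacement / path_len

print(path_efficiency([(0, 0), (4, 0), (1, 0)]))  # path 4 + 3, ends 1 away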
def segmentIndividualFilenames(df, exp_name, arena, segment_length, overlap):
    """Preprocesses and segments each file's trajectory separately,
    concatenating the per-file segment lists into one list."""
    list_segments = []
    unique_filename = df.filename.unique()
    for iFile in unique_filename:
        logging.info(".....................Filename: " + iFile +
                     ".....................")
        df_file = df[df['filename'] == iFile]
        df_file.reset_index(drop=True, inplace=True)
        df_file = preprocess.execute(df_file, experiment_name=exp_name,
                                     arena=arena)
        temp_list_segments = sendSectionDfSegment(
            df=df_file, using_light=True, arena=arena,
            segment_length=segment_length, overlap=overlap)
        list_segments = list_segments + temp_list_segments
    return list_segments
def test_areaFormula(self):
    df = preprocess.execute("../../Data/TestData/bee-data_NT_test.csv")
    arena = classArena.classArena(df)
    dt_first_segment, cum_dist_end_segment, end_trajectory = cs.getSegment(
        df, 10, 0, 0)
    dt_second_segment, cum_dist_end_segment, end_trajectory = cs.getSegment(
        df, 10, 0.3, cum_dist_end_segment)
    length_second_segment = cs.getSegmentLength(dt_second_segment)
    features_second_segment = sg.Segment(dt_second_segment,
                                         length_second_segment, arena, 0)
    # The four points lie on the unit circle, so the minimum enclosing
    # ellipse is that circle and its area is pi:
    points = np.array([[-1, 0, 0, 1], [0, 1, -1, 0]]).T
    ellipse = features_second_segment.findMinEnclosingEllipse(points)
    min_enclosing_ellipse_area = \
        features_second_segment.calcMinEnclosingEllipseArea(ellipse.radii)
    self.assertAlmostEqual(min_enclosing_ellipse_area, math.pi)
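# A sketch of the area formula being exercised: an ellipse with semi-axes
# a and b has area pi * a * b, so the unit circle (a == b == 1) gives pi:
import math

def ellipse_area(radii):
    a, b = radii
    return math.pi * a * b

print(ellipse_area((1.0, 1.0)))  # 3.141592653589793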
# To run this code you need the following folder structure:
# ./
# |
# |---- [to_process]  --> this must contain your json files
# |---- [CSV]         --> files after pre-processing
# |---- [Bulk]        --> one grouped file containing all csv
# |---- [Final_files] --> summary result files (Tweets & Users)

if __name__ == '__main__':
    # Pre-processing:
    # Reads all json files from the input directory and produces new files
    # after data cleaning. Also produces a bulk file containing all data.
    pre.execute()

    # Processing:
    # Reads the bulk file and calculates the number of tweets and users
    # found in each day. The result is two csv files: Tweets and Users.
    pro.execute()

    # Final merge of Tweets and Users datasets
    # todo: Implement a for loop to cover all tweets and users csv files
    rootdir = './Final-files'
    file = 'processed-tweets.csv'
    filepath = rootdir + os.sep + file
    dfTweets = pd.read_csv(filepath, names=['Date', 'Hour', 'Tweets'])
    rootdir = './Final-files'
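# A hypothetical sketch of the todo above: read both summary files and
# merge them on Date and Hour. The users file name and its column layout
# are assumptions, mirrored from the tweets file read above:
import os

import pandas as pd

rootdir = './Final-files'
dfTweets = pd.read_csv(os.path.join(rootdir, 'processed-tweets.csv'),
                       names=['Date', 'Hour', 'Tweets'])
dfUsers = pd.read_csv(os.path.join(rootdir, 'processed-users.csv'),  # assumed name
                      names=['Date', 'Hour', 'Users'])  # assumed columns
dfMerged = pd.merge(dfTweets, dfUsers, on=['Date', 'Hour'], how='outer')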