コード例 #1
0
def main():
    '''
    Runs the whole pipeline: prepares the datasets, then trains, predicts and
    prints a comparison table of the model scores.

    For each of the "val" and "train" datasets, ../data/<name>.csv is passed
    to preprocess.execute and then to build_features.execute. Both steps write
    to ../data/<name>_bf.csv, so the feature-building step reads the
    preprocessed file and overwrites it with the final result.

    Returns:
        None. The comparison table is printed, not returned.
    '''
    for name in ("val", "train"):
        raw_path = "../data/{input}.csv".format(input=name)
        prepared_path = '../data/{input}_bf.csv'.format(input=name)

        # Preprocessing the data
        preprocess.execute(input_file=raw_path, output_file=prepared_path)

        # Preparing the data for training (rewrites the same file in place)
        build_features.execute(input_file=prepared_path,
                               output_file=prepared_path)
        print("{input}_bf.csv file has been created".format(input=name))

    # train.execute() runs before predict.execute(); the dict literal keeps
    # that evaluation order.
    scores = {
        'Model': ['RandForest', 'ExtTree', 'GraBoo', 'AdaBoo'],
        'Train Score': train.execute(),
        'Prediction Score': predict.execute(),
    }
    print(pd.DataFrame(scores))
コード例 #2
0
    def test_DistancePoint100FromCentre(self):
        """DistanceCentre at row position 99 matches the reference value."""
        preprocessed = preprocess.execute(
            "../../Data/TestData/bee-data_NT_test.csv")

        distance = preprocessed['DistanceCentre'].iloc[99]
        print("Distance:", distance)
        self.assertAlmostEqual(distance, 36.1062682861314)
コード例 #3
0
    def test_locationDensity(self):
        """LocationDensity of the first segment equals the hand-computed value.

        The expected value is the sum of six hand-computed point-to-point
        distances divided by 6 (presumably the number of point pairs in the
        fixture -- confirm against the test data file).
        """
        df = preprocess.execute(
            "../../Data/TestData/bee-data_NT_test_locationDensity.csv")
        arena = classArena.classArena(df)

        # getSegment arguments: (traj, lseg, ovlp, cum_dist_end_prev)
        dt_first_segment, _, _ = cs.getSegment(df, 7, 0, 0)
        length_first_segment = cs.getSegmentLength(dt_first_segment)
        features_first_segment = sg.Segment(dt_first_segment,
                                            length_first_segment, arena, 0)

        feat = features_first_segment.getFeature(
            enums.eFeature.LocationDensity)

        SumDistanceBetweenEachPairPoints = 1 + math.sqrt(1**2 + 2**2) + 2 \
                                            + 2 + math.sqrt(1**2 + 2**2) \
                                            + 1

        nCr = 6

        LocationDensity_TrueValue = SumDistanceBetweenEachPairPoints / nCr

        # The expectation is built from square roots, so compare with a
        # tolerance: exact float equality would depend on the implementation
        # summing the terms in precisely this order.
        self.assertAlmostEqual(feat.value, LocationDensity_TrueValue)
コード例 #4
0
    def test_calcCentralDisplacement_withinCorrectRange(self):
        """CentralDisplacement stays within its bounds on 20 successive segments.

        For every segment the value must be in (0, 1] and at least as large as
        the x and y offsets of the segment's ellipse centre from the arena
        centre, each scaled by 2 / arena.diameter.
        """
        data = preprocess.execute("../../Data/TestData/bee-data_NT_test.csv")
        arena = classArena.classArena(data)

        # Only the cumulative distance of the first segment is used: it seeds
        # the loop below.
        _first_data, cum_dist_end, _first_end = cs.getSegment(data, 10, 0, 0)
        for _step in range(20):
            segment_data, cum_dist_end, _end_trajectory = cs.getSegment(
                data, 10, 0.3, cum_dist_end)
            segment = sg.Segment(segment_data,
                                 cs.getSegmentLength(segment_data), arena, 0)
            displacement = segment.getFeature(
                enums.eFeature.CentralDisplacement).value

            # test never bigger than arena size
            self.assertLessEqual(displacement, 1)
            # test positive
            self.assertGreater(displacement, 0)

            # test greater than ellipse centre x
            x_offset = (segment.ellipse.centre[0] -
                        arena.centre_x) * 2 / arena.diameter
            self.assertGreaterEqual(displacement, x_offset)

            # test greater than ellipse centre y
            y_offset = (segment.ellipse.centre[1] -
                        arena.centre_y) * 2 / arena.diameter
            self.assertGreaterEqual(displacement, y_offset)
コード例 #5
0
def execute(data_dir, data_file, categorize=False):
    """Build model features from the preprocessed data and write them to CSV.

    The data returned by ``preprocess.execute(data_dir + data_file)`` is
    changed as follows:

    * ``Sex`` is encoded as 0 ("male") / 1 ("female").
    * With ``categorize=True``, ``Age`` and ``Fare`` are replaced by bin
      indices 0-4.
    * ``Embarked`` labels are replaced by integers starting at 1, numbered in
      order of first appearance in the column.
    * ``FamilySize`` (``SibSp + Parch + 1``) and ``IsAlone`` (1 when
      ``FamilySize == 1``, else 0) are added.

    The result is written, ";"-separated and without the index, to
    ``data_dir + data_file + processed_suffix + "_" + str(categorize)``.

    Args:
        data_dir (str): relative path to data subdirectory
        data_file (str): name of csv data file
        categorize (bool): set to True if Age and Fare should be categorized

    Returns:
        None. The only output is the CSV file.
    """

    # Read preprocessed data:
    data = preprocess.execute(data_dir + data_file)

    # Replace sex strings with binary value:
    data["Sex"] = data["Sex"].replace({"male": 0, "female": 1})

    if categorize:
        # The bin indices (0-4) are all below the first edge of both columns,
        # so a row that has already been binned can never match one of the
        # later, higher ranges.

        # Convert age into categories:
        data["Age"] = data["Age"].astype(int)
        data.loc[data["Age"] <= 19, "Age"] = 0
        data.loc[(data["Age"] > 19) & (data["Age"] <= 25), "Age"] = 1
        data.loc[(data["Age"] > 25) & (data["Age"] <= 32), "Age"] = 2
        data.loc[(data["Age"] > 32) & (data["Age"] <= 42), "Age"] = 3
        data.loc[(data["Age"] > 42), "Age"] = 4

        # Convert fare into categories:
        data.loc[data["Fare"] <= 7.854, "Fare"] = 0
        data.loc[(data["Fare"] > 7.854) & (data["Fare"] <= 10.5), "Fare"] = 1
        data.loc[(data["Fare"] > 10.5) & (data["Fare"] <= 22.225), "Fare"] = 2
        data.loc[(data["Fare"] > 22.225) & (data["Fare"] <= 39.688),
                 "Fare"] = 3
        data.loc[(data["Fare"] > 39.688), "Fare"] = 4

    # Embarked: C = Cherbourg, Q = Queenstown, S = Southampton
    # Number the labels from 1 in order of first appearance; dict.fromkeys
    # keeps that order while dropping repeats.
    embarked_codes = {
        label: code
        for code, label in enumerate(dict.fromkeys(data["Embarked"]), start=1)
    }
    # Assign the result back instead of calling replace(..., inplace=True) on
    # data["Embarked"]: an in-place call on a column selection is not
    # guaranteed to update `data` (it never does under pandas Copy-on-Write).
    # A single dict-based replace also avoids one replacement feeding into
    # the next.
    data["Embarked"] = data["Embarked"].replace(embarked_codes)

    # Add columns FamilySize and IsAlone:
    data["FamilySize"] = data["SibSp"] + data["Parch"] + 1
    data["IsAlone"] = (data["FamilySize"] == 1).astype(int)

    data.to_csv(data_dir + data_file + processed_suffix + "_" +
                str(categorize),
                sep=";",
                index=False)
コード例 #6
0
    def test_getSegmentLength(self):
        """getSegmentLength of the second segment matches the reference value."""
        data = preprocess.execute("../../Data/TestData/bee-data_NT_test.csv")

        # The first segment is only needed for the cumulative distance that
        # the second getSegment call takes as its last argument.
        _first, cum_dist_end_first, _end_first = cs.getSegment(data, 10, 0, 0)
        second_segment, _cum_dist_end, _end_second = cs.getSegment(
            data, 10, 0.3, cum_dist_end_first)

        self.assertAlmostEqual(cs.getSegmentLength(second_segment),
                               10.4350527761)
コード例 #7
0
    def test_FindingCorrectSecondSegment(self):
        """The second segment starts and ends at the reference distances.

        Checks the first and last ``CumulativeDistance`` values of the segment
        returned by the second getSegment call.
        """
        data = preprocess.execute("../../Data/TestData/bee-data_NT_test.csv")

        # The first call supplies the cumulative distance passed to the second.
        _first, cum_dist_end_first, _end_first = cs.getSegment(data, 10, 0, 0)
        second_segment, _cum_dist_end, _end_second = cs.getSegment(
            data, 10, 0.3, cum_dist_end_first)

        cumulative = second_segment['CumulativeDistance']
        self.assertAlmostEqual(cumulative.iloc[0], 7.0969949712)
        self.assertAlmostEqual(cumulative.iloc[-1], 17.5320477473)
コード例 #8
0
    def test_iQRangeDistanceCentre(self):
        """IQRange feature of the second segment matches the reference value."""
        data = preprocess.execute("../../Data/TestData/bee-data_NT_test.csv")
        arena = classArena.classArena(data)

        # The first call supplies the cumulative distance passed to the second.
        _first, cum_dist_end_first, _end_first = cs.getSegment(data, 10, 0, 0)
        second_data, _cum_dist_end, _end_second = cs.getSegment(
            data, 10, 0.3, cum_dist_end_first)

        second_segment = sg.Segment(second_data,
                                    cs.getSegmentLength(second_data), arena, 0)

        iq_range = second_segment.getFeature(enums.eFeature.IQRange)
        self.assertAlmostEqual(iq_range.value, 0.0164803959758471)
コード例 #9
0
    def test_checkCorrectingRotation(self):
        """MeanSpeed of the first segment of the maxloop data equals 118.75."""
        data = preprocess.execute(
            "../../Data/TestData/bee-data_NT_test_maxloop.csv")
        arena = classArena.classArena(data)

        # getSegment arguments: (traj, lseg, ovlp, cum_dist_end_prev)
        first_data, _cum_dist_end, _end_trajectory = cs.getSegment(
            data, 20, 0, 0)
        first_segment = sg.Segment(first_data,
                                   cs.getSegmentLength(first_data), arena, 0)

        mean_speed = first_segment.getFeature(enums.eFeature.MeanSpeed)

        self.assertEqual(mean_speed.value, 118.75)
コード例 #10
0
    def test_sumAbsAngles(self):
        """SumAbsoluteAngles of the first segment equals 2 * pi."""
        df = preprocess.execute(
            "../../Data/TestData/bee-data_NT_test_sum_abs_angles.csv")
        arena = classArena.classArena(df)

        # getSegment arguments: (traj, lseg, ovlp, cum_dist_end_prev)
        dt_first_segment, _, _ = cs.getSegment(df, 7, 0, 0)
        length_first_segment = cs.getSegmentLength(dt_first_segment)
        features_first_segment = sg.Segment(dt_first_segment,
                                            length_first_segment, arena, 0)

        feat = features_first_segment.getFeature(
            enums.eFeature.SumAbsoluteAngles)

        # A sum of angles is accumulated in floating point, so compare with a
        # tolerance rather than requiring bit-for-bit equality with 2 * pi.
        self.assertAlmostEqual(feat.value, 2 * math.pi)
コード例 #11
0
    def test_MedianDistanceCentre(self):
        """MedianDistanceFromCentre of the second segment matches the reference."""
        data = preprocess.execute("../../Data/TestData/bee-data_NT_test.csv")
        arena = classArena.classArena(data)

        # The first call supplies the cumulative distance passed to the second.
        _first, cum_dist_end_first, _end_first = cs.getSegment(data, 10, 0, 0)
        second_data, _cum_dist_end, _end_second = cs.getSegment(
            data, 10, 0.3, cum_dist_end_first)

        second_segment = sg.Segment(second_data,
                                    cs.getSegmentLength(second_data), arena, 0)

        median_distance = second_segment.getFeature(
            enums.eFeature.MedianDistanceFromCentre)

        self.assertAlmostEqual(median_distance.value, 0.8628325515)
コード例 #12
0
    def test_pathEfficiency(self):
        """PathEfficiency of the first segment equals 1/7."""
        df = preprocess.execute(
            "../../Data/TestData/bee-data_NT_test_sum_abs_angles.csv")
        arena = classArena.classArena(df)

        # getSegment arguments: (traj, lseg, ovlp, cum_dist_end_prev)
        dt_first_segment, _, _ = cs.getSegment(df, 20, 0, 0)
        length_first_segment = cs.getSegmentLength(dt_first_segment)
        features_first_segment = sg.Segment(dt_first_segment,
                                            length_first_segment, arena, 0)

        feat = features_first_segment.getFeature(enums.eFeature.PathEfficiency)

        PathEfficiency_TrueValue = 1.0 / 7.0

        # 1/7 is not exactly representable as a float, so compare with a
        # tolerance instead of requiring the implementation to reproduce the
        # exact same rounding.
        self.assertAlmostEqual(feat.value, PathEfficiency_TrueValue)
コード例 #13
0
def segmentIndividualFilenames(df, exp_name, arena, segment_length, overlap):
    """Preprocess and segment the rows of each distinct filename in ``df``.

    For every unique value of ``df.filename`` the matching rows are selected,
    re-indexed from 0, passed through ``preprocess.execute`` and then through
    ``sendSectionDfSegment`` (always with ``using_light=True``).

    Args:
        df: DataFrame with a ``filename`` column.
        exp_name: forwarded to ``preprocess.execute`` as ``experiment_name``.
        arena: forwarded to both ``preprocess.execute`` and
            ``sendSectionDfSegment``.
        segment_length: forwarded to ``sendSectionDfSegment``.
        overlap: forwarded to ``sendSectionDfSegment``.

    Returns:
        list: the per-file segment lists concatenated in the order the
        filenames first appear in ``df``.
    """
    all_segments = []
    for file_name in df.filename.unique():
        logging.info(".....................Filename: " + file_name +
                     ".....................")
        rows_for_file = df[df['filename'] == file_name].reset_index(drop=True)
        preprocessed = preprocess.execute(rows_for_file,
                                          experiment_name=exp_name,
                                          arena=arena)
        all_segments = all_segments + sendSectionDfSegment(
            df=preprocessed,
            using_light=True,
            arena=arena,
            segment_length=segment_length,
            overlap=overlap)
    return all_segments
コード例 #14
0
    def test_areaFormula(self):
        """Minimum enclosing ellipse of four unit-distance points has area pi.

        A Segment is built from the second segment of the test data only to
        get access to its ellipse helpers; the ellipse itself is fitted to the
        four points (-1, 0), (0, 1), (0, -1) and (1, 0).
        """
        data = preprocess.execute("../../Data/TestData/bee-data_NT_test.csv")
        arena = classArena.classArena(data)

        # The first call supplies the cumulative distance passed to the second.
        _first, cum_dist_end_first, _end_first = cs.getSegment(data, 10, 0, 0)
        second_data, _cum_dist_end, _end_second = cs.getSegment(
            data, 10, 0.3, cum_dist_end_first)

        second_segment = sg.Segment(second_data,
                                    cs.getSegmentLength(second_data), arena, 0)

        x_coords = [-1, 0, 0, 1]
        y_coords = [0, 1, -1, 0]
        points = np.array([x_coords, y_coords]).T  # one (x, y) row per point

        ellipse = second_segment.findMinEnclosingEllipse(points)
        area = second_segment.calcMinEnclosingEllipseArea(ellipse.radii)

        self.assertAlmostEqual(area, math.pi)
コード例 #15
0
ファイル: mainProcess.py プロジェクト: LeoArruda/twitter
# To run this code you need the following folder structure
#     ./
#      |
#      |---- [to_process]   --> this must contain your json files
#      |---- [CSV]          --> after pre-processing files
#      |---- [Bulk]         --> one grouped file containing all csv
#      |---- [Final-files]  --> summary result files (Tweets & Users)
#

if __name__ == '__main__':

    # Pre processing
    # Will read all json files from input directory and will produce new files
    # after data cleaning.
    # Will also produce a Bulkfile containing all data.
    pre.execute()

    # Processing
    # Will read bulk file and calculate the number of tweets and users
    # found in each day.
    # The result is two csv files. Tweets and Users csv
    pro.execute()

    # Final merge of Tweets and Users Datasets
    # todo: Implement a for loop to cover all tweets and users csv files

    rootdir = './Final-files'
    file = 'processed-tweets.csv'
    filepath = rootdir + os.sep + file
    dfTweets = pd.read_csv(filepath, names=['Date', 'Hour', 'Tweets'])
    rootdir = './Final-files'