예제 #1
0
 def test_combined_fractions_greater_than_one_throw(self, df):
     """Dev and test fractions summing to 1.01 must raise ``ValueError``."""
     # 0.5 + 0.51 exceeds the whole data set by a small margin.
     oversized_fractions = {"dev_fraction": 0.5, "test_fraction": 0.51}
     with pytest.raises(ValueError):
         train_dev_test_split(df, **oversized_fractions)
예제 #2
0
# ## Read configuration
# Both environment variables are required; a missing one raises KeyError.

INPUT_FILE = os.environ['INPUT_FILE']
TARGET_DIRECTORY = os.environ['TARGET_DIRECTORY']

# ## Read raw data

df = pd.read_csv(INPUT_FILE)

# ## Filter to data containing complaints only
# Rows whose 'Consumer complaint narrative' is null are dropped.

df = df[df['Consumer complaint narrative'].notnull()]

# ## Split data into train, dev and test subsets

train, dev, test = train_dev_test_split(
    df, dev_fraction=0.2, test_fraction=0.1
)

# ## Create target directory
# If necessary. `makedirs(..., exist_ok=True)` also creates missing parent
# directories and avoids the check-then-create race of `exists` + `mkdir`.

os.makedirs(TARGET_DIRECTORY, exist_ok=True)

# ## Write subsets to disk
# Quote all fields to avoid weird character shenanigans.
# Paths are built with `os.path.join` so a trailing separator in
# TARGET_DIRECTORY does not produce a doubled slash.

train.to_csv(os.path.join(TARGET_DIRECTORY, 'train.csv'), quoting=csv.QUOTE_ALL)
dev.to_csv(os.path.join(TARGET_DIRECTORY, 'dev.csv'), quoting=csv.QUOTE_ALL)
test.to_csv(os.path.join(TARGET_DIRECTORY, 'test.csv'), quoting=csv.QUOTE_ALL)
예제 #3
0
 def test_same_random_seed_returns_same_split(self, df):
     """Two splits made with ``random_state=42`` must be frame-for-frame equal."""
     first_train, first_dev, first_test = train_dev_test_split(
         df, random_state=42
     )
     second_train, second_dev, second_test = train_dev_test_split(
         df, random_state=42
     )
     # Compare in train, dev, test order so the first mismatch is reported.
     paired_subsets = (
         (first_train, second_train),
         (first_dev, second_dev),
         (first_test, second_test),
     )
     for first, second in paired_subsets:
         assert_frame_equal(first, second)
예제 #4
0
 def test_no_duplicates_in_splits(self, df):
     """Concatenating train, dev and test must not contain duplicated rows."""
     train, dev, test = train_dev_test_split(df)
     combined = pd.concat([train, dev, test])
     # drop_duplicates compares row values, so equal lengths mean every
     # row in the concatenation is distinct.
     distinct_rows = combined.drop_duplicates()
     assert len(distinct_rows) == len(combined)
예제 #5
0
 def test_splits_have_expected_lengths(self, df):
     """A default split of the ``df`` fixture yields 60, 20 and 20 rows."""
     train, dev, test = train_dev_test_split(df)
     # Checked in train, dev, test order; the first wrong size fails the test.
     expected_sizes = ((train, 60), (dev, 20), (test, 20))
     for subset, expected_rows in expected_sizes:
         assert len(subset) == expected_rows
예제 #6
0
 def test_total_length_of_splits_unchanged(self, df):
     """The three splits together hold exactly as many rows as the input."""
     train, dev, test = train_dev_test_split(df)
     combined_rows = sum(len(subset) for subset in (train, dev, test))
     assert combined_rows == len(df)
예제 #7
0
 def test_train_dev_test_split_returns_six_objects(self, df):
     """Check that the split result contains exactly three objects.

     NOTE(review): the test name says "six" but the assertion checks for
     three; the name looks stale -- confirm and rename.
     """
     split_result = train_dev_test_split(df)
     object_count = len(split_result)
     assert object_count == 3