Example #1
    def process_downloaded_dataset(self):
        df = pd.read_csv(os.path.join(self.raw_dataset_path, "HIGGS.csv.gz"), header=None)

        df.columns = [
            "label",
            "lepton_pT",
            "lepton_eta",
            "lepton_phi",
            "missing_energy_magnitude",
            "missing_energy_phi",
            "jet_1_pt",
            "jet_1_eta",
            "jet_1_phi",
            "jet_1_b-tag",
            "jet_2_pt",
            "jet_2_eta",
            "jet_2_phi",
            "jet_2_b-tag",
            "jet_3_pt",
            "jet_3_eta",
            "jet_3_phi",
            "jet_3_b-tag",
            "jet_4_pt",
            "jet_4_eta",
            "jet_4_phi",
            "jet_4_b-tag",
            "m_jj",
            "m_jjj",
            "m_lv",
            "m_jlv",
            "m_bb",
            "m_wbb",
            "m_wwbb",
        ]

        df["label"] = df["label"].astype("int32")
        if self.add_validation_set:
            df["split"] = [0] * 10000000 + [1] * 500000 + [2] * 500000
        else:
            df["split"] = [0] * 10500000 + [2] * 500000

        makedirs(self.processed_temp_path, exist_ok=True)
        df.to_parquet(
            os.path.join(self.processed_temp_path, self.parquet_filename),
            engine="pyarrow",
            row_group_size=50000,
            index=False,
        )

        rename(self.processed_temp_path, self.processed_dataset_path)
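
The hard-coded counts rely on the fixed row order of HIGGS.csv.gz (11,000,000 rows in total); the convention used throughout these examples is split == 0 for training, 1 for validation, and 2 for test. A minimal sketch of consuming that convention downstream, with a placeholder path standing in for processed_dataset_path and parquet_filename:

import os

import pandas as pd

# Placeholder location; the real file lives under processed_dataset_path
# with the name given by parquet_filename.
parquet_path = os.path.join("higgs_processed", "higgs.parquet")

df = pd.read_parquet(parquet_path, engine="pyarrow")

# 0 = training, 1 = validation, 2 = test.
train_df = df[df["split"] == 0]
val_df = df[df["split"] == 1]
test_df = df[df["split"] == 2]
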
Example #2
 def process_downloaded_dataset(self):
     """Read the training and test directories and write out
     a csv containing the training path and the label.
     """
     makedirs(self.processed_temp_path, exist_ok=True)
     for dataset in ["training", "testing"]:
         print(f'>>> create ludwig formatted {dataset} data')
         labels, data = self.read_source_dataset(dataset,
                                                 self.raw_dataset_path)
         self.write_output_dataset(
             labels, data, os.path.join(self.processed_temp_path, dataset))
     self.output_training_and_test_data()
     rename(self.processed_temp_path, self.processed_dataset_path)
     print('>>> completed data preparation')
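
read_source_dataset and write_output_dataset are defined elsewhere on the class; per the docstring, the end result is a CSV pairing each training path with its label. A hypothetical sketch of producing such a path/label CSV (the helper name and arguments here are illustrative, not the actual Ludwig implementation):

import csv


def write_path_label_csv(pairs, output_csv):
    # Illustrative helper: write (image_path, label) rows to a CSV file.
    with open(output_csv, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["image_path", "label"])
        writer.writerows(pairs)


# Example usage with made-up values:
write_path_label_csv(
    [("training/0/img_0001.png", 0), ("training/1/img_0002.png", 1)],
    "mnist_training.csv",
)
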
Example #3
    def process_downloaded_dataset(self):
        train_df = pd.read_csv(os.path.join(self.raw_dataset_path, "adult.data"), header=None)
        # skiprows=1: the first line of adult.test is a non-data comment line
        test_df = pd.read_csv(os.path.join(self.raw_dataset_path, "adult.test"), header=None, skiprows=1)

        # age: continuous.
        # workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked.
        # fnlwgt: continuous.
        # education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.
        # education-num: continuous.
        # marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.
        # occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.
        # relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.
        # race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.
        # sex: Female, Male.
        # capital-gain: continuous.
        # capital-loss: continuous.
        # hours-per-week: continuous.
        # native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.
        # income: >50K, <=50K.
        columns = [
            "age",
            "workclass",
            "fnlwgt",
            "education",
            "education-num",
            "marital-status",
            "occupation",
            "relationship",
            "race",
            "sex",
            "capital-gain",
            "capital-loss",
            "hours-per-week",
            "native-country",
            "income",
        ]
        train_df.columns = columns
        test_df.columns = columns
        # Remove the trailing period on the income field in adult.test (not in adult.data)
        test_df["income"] = test_df["income"].str.rstrip(".")

        train_df["split"] = 0
        test_df["split"] = 2

        df = pd.concat([train_df, test_df])

        makedirs(self.processed_temp_path, exist_ok=True)
        df.to_csv(os.path.join(self.processed_temp_path, self.csv_filename), index=False)

        rename(self.processed_temp_path, self.processed_dataset_path)
Example #4
    def process_downloaded_dataset(self, header=0):
        zip_file = ZipFile(
            os.path.join(self.raw_dataset_path, "orange_small_train.data.zip"))
        train_df = pd.read_csv(zip_file.open("orange_small_train.data"),
                               sep="\t")

        zip_file = ZipFile(
            os.path.join(self.raw_dataset_path, "orange_small_test.data.zip"))
        test_df = pd.read_csv(zip_file.open("orange_small_test.data"),
                              sep="\t")

        train_df = process_categorical_features(train_df, categorical_features)
        train_df = process_numerical_features(train_df, categorical_features)

        targets = (pd.read_csv(os.path.join(
            self.raw_dataset_path,
            f"orange_small_train_{self.task_name}.labels"),
                               header=None)[0].astype(str).apply(
                                   lambda x: "true" if x == "1" else "false"))

        train_idcs = pd.read_csv(os.path.join(
            self.raw_dataset_path,
            f"stratified_train_idx_{self.task_name}.txt"),
                                 header=None)[0]

        val_idcs = pd.read_csv(os.path.join(
            self.raw_dataset_path,
            f"stratified_test_idx_{self.task_name}.txt"),
                               header=None)[0]

        processed_train_df = train_df.iloc[train_idcs].copy()
        processed_train_df["target"] = targets.iloc[train_idcs]
        processed_train_df["split"] = 0

        processed_val_df = train_df.iloc[val_idcs].copy()
        processed_val_df["target"] = targets.iloc[val_idcs]
        processed_val_df["split"] = 1

        test_df["target"] = ""
        test_df["split"] = 2

        df = pd.concat([processed_train_df, processed_val_df, test_df])

        makedirs(self.processed_temp_path, exist_ok=True)
        df.to_csv(os.path.join(self.processed_temp_path, self.csv_filename),
                  index=False)

        rename(self.processed_temp_path, self.processed_dataset_path)
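
The stratified_train_idx_*.txt and stratified_test_idx_*.txt files are read from the raw dataset, i.e. they ship with the download rather than being computed here. Purely as an illustration of how index files of this shape could be produced (not how the shipped files were actually generated), a stratified split with scikit-learn might look roughly like this, assuming a task name such as churn:

import pandas as pd
from sklearn.model_selection import train_test_split

# Hypothetical inputs, for illustration only.
labels = pd.read_csv("orange_small_train_churn.labels", header=None)[0]

train_idx, val_idx = train_test_split(
    range(len(labels)),
    test_size=0.2,      # illustrative ratio
    stratify=labels,    # preserve the class balance in both partitions
    random_state=0,
)

pd.Series(train_idx).to_csv("stratified_train_idx_churn.txt", index=False, header=False)
pd.Series(val_idx).to_csv("stratified_test_idx_churn.txt", index=False, header=False)
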
Example #5
    def process_downloaded_dataset(self):

        stores_df = pd.read_csv(os.path.join(self.raw_dataset_path, "store.csv"))

        train_df = pd.read_csv(os.path.join(self.raw_dataset_path, "train.csv"), low_memory=False)
        train_df = preprocess_df(train_df, stores_df)

        train_df["split"] = -1
        train_df.loc[train_df["Year"] == 2014, "split"] = 0
        train_df.loc[train_df["Year"] == 2015, "split"] = 2
        train_df.drop(train_df[train_df["split"] == -1].index, inplace=True)
        df = train_df

        makedirs(self.processed_temp_path, exist_ok=True)
        df.to_csv(os.path.join(self.processed_temp_path, self.csv_filename), index=False)
        rename(self.processed_temp_path, self.processed_dataset_path)
Example #6
    def process_downloaded_dataset(self):
        """The final method where we create a concatenated CSV file with both training ant dest data."""
        train_file = self.config["split_filenames"]["train_file"]
        test_file = self.config["split_filenames"]["test_file"]

        train_df = pd.read_csv(os.path.join(self.raw_dataset_path, train_file))
        test_df = pd.read_csv(os.path.join(self.raw_dataset_path, test_file))

        train_df["split"] = 0
        test_df["split"] = 2

        df = pd.concat([train_df, test_df])

        makedirs(self.processed_temp_path, exist_ok=True)
        df.to_csv(os.path.join(self.processed_temp_path, self.csv_filename),
                  index=False)
        rename(self.processed_temp_path, self.processed_dataset_path)
Example #7
    def process_downloaded_dataset(self):
        makedirs(self.processed_temp_path, exist_ok=True)

        # create a dictionary matching image_path --> list of captions
        image_to_caption = defaultdict(list)
        with open(
            f"{self.raw_dataset_path}/Flickr8k.token.txt",
            "r"
        ) as captions_file:
            image_to_caption = defaultdict(list)
            for line in captions_file:
                line = line.split("#")
                # the regex is to format the string to fit properly in a csv
                line[1] = line[1].strip("\n01234.\t ")
                line[1] = re.sub('\"', '\"\"', line[1])
                line[1] = '\"' + line[1] + '\"'
                image_to_caption[line[0]].append(line[1])
        # create csv file with 7 columns: image_path, 5 captions, and split
        with open(
                os.path.join(self.processed_temp_path, self.csv_filename),
                'w'
        ) as output_file:
            output_file.write('image_path,caption0,caption1,caption2,')
            output_file.write('caption3,caption4,split\n')
            splits = ["train", "dev", "test"]
            for i in range(len(splits)):
                split = splits[i]
                with open(
                    f"{self.raw_dataset_path}/Flickr_8k.{split}Images.txt",
                    "r"
                ) as split_file:
                    for image_name in split_file:
                        image_name = image_name.strip('\n')
                        if image_name in image_to_caption:
                            output_file.write('{},{},{},{},{},{},{}\n'.format(
                                # Note: image folder is named Flicker8k_Dataset
                                "{}/Flicker8k_Dataset/{}".format(
                                    self.raw_dataset_path, image_name
                                ),
                                *image_to_caption[image_name],
                                i
                            ))
        # Note: csv is stored in /processed while images are stored in /raw
        rename(self.processed_temp_path, self.processed_dataset_path)
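
The regex-based escaping above (doubling embedded double quotes and wrapping each caption in quotes) hand-rolls what the standard csv module already provides. A sketch of the same row-writing step with csv.writer instead, using illustrative stand-ins for the loop variables:

import csv

# Stand-ins for the variables used inside the loop above.
image_path = "raw/Flicker8k_Dataset/example.jpg"
captions = ["A child in a pink dress.", 'She said "hello".', "c2", "c3", "c4"]
split_index = 0

with open("flickr8k.csv", "w", newline="") as output_file:
    writer = csv.writer(output_file)
    writer.writerow(["image_path", "caption0", "caption1", "caption2",
                     "caption3", "caption4", "split"])
    # csv.writer quotes fields and escapes embedded double quotes automatically.
    writer.writerow([image_path, *captions, split_index])
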
Example #8
    def process_downloaded_dataset(self):
        makedirs(self.processed_temp_path, exist_ok=True)

        dataset_name = self.config["kaggle_dataset_name"]
        for url in self.config["kaggle_dataset_files"]:
            file_name = os.path.join(self.raw_dataset_path, dataset_name, url)
            # TODO(shreya): DataFrame created twice: here + CSVMixin. Figure out
            # options for using it once.
            df = pd.read_csv(
                file_name,
                header=0,
                names=[
                    "image_path",
                    "insurance_company",
                    "cost_of_vehicle",
                    "min_coverage",
                    "expiry_date",
                    "max_coverage",
                    "condition",
                    "amount",
                ],
            )
            df["image_path"] = df["image_path"].apply(
                lambda x: os.path.join(self.raw_dataset_path, dataset_name, "trainImages", os.path.basename(x))
            )
            df.to_csv(
                os.path.join(self.processed_temp_path, self.csv_filename),
                columns=[
                    "image_path",
                    "insurance_company",
                    "cost_of_vehicle",
                    "min_coverage",
                    "expiry_date",
                    "max_coverage",
                    "condition",
                    "amount",
                ],
            )

        # Note: csv is stored in /processed while images are stored in /raw
        rename(self.processed_temp_path, self.processed_dataset_path)
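
Note that df.to_csv inside the loop targets the same output path on every iteration, so if kaggle_dataset_files contained more than one entry, only the last file's rows would survive. If several files were expected, one option would be to accumulate them first; a hypothetical sketch with placeholder names standing in for the instance attributes:

import os

import pandas as pd

# Placeholders for self.raw_dataset_path, dataset_name, and the config list.
raw_dataset_path = "raw"
dataset_name = "example-kaggle-dataset"
kaggle_dataset_files = ["train.csv"]

frames = [
    pd.read_csv(os.path.join(raw_dataset_path, dataset_name, name))
    for name in kaggle_dataset_files
]
combined = pd.concat(frames, ignore_index=True)
combined.to_csv(os.path.join("processed", "output.csv"), index=False)
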
Example #9
    def process_downloaded_dataset(self):
        df = pd.read_csv(os.path.join(self.raw_dataset_path, "covtype.data.gz"), header=None)

        # Elevation                               quantitative    meters                       Elevation in meters
        # Aspect                                  quantitative    azimuth                      Aspect in degrees azimuth
        # Slope                                   quantitative    degrees                      Slope in degrees
        # Horizontal_Distance_To_Hydrology        quantitative    meters                       Horz Dist to nearest surface water features      # noqa: E501
        # Vertical_Distance_To_Hydrology          quantitative    meters                       Vert Dist to nearest surface water features      # noqa: E501
        # Horizontal_Distance_To_Roadways         quantitative    meters                       Horz Dist to nearest roadway                     # noqa: E501
        # Hillshade_9am                           quantitative    0 to 255 index               Hillshade index at 9am, summer solstice          # noqa: E501
        # Hillshade_Noon                          quantitative    0 to 255 index               Hillshade index at noon, summer solstice         # noqa: E501
        # Hillshade_3pm                           quantitative    0 to 255 index               Hillshade index at 3pm, summer solstice          # noqa: E501
        # Horizontal_Distance_To_Fire_Points      quantitative    meters                       Horz Dist to nearest wildfire ignition points    # noqa: E501
        # Wilderness_Area (4 binary columns)      qualitative     0 (absence) or 1 (presence)  Wilderness area designation                      # noqa: E501
        # Soil_Type (40 binary columns)           qualitative     0 (absence) or 1 (presence)  Soil Type designation
        # Cover_Type (7 types)                    integer         1 to 7                       Forest Cover Type designation                    # noqa: E501
        columns = [
            "Elevation",
            "Aspect",
            "Slope",
            "Horizontal_Distance_To_Hydrology",
            "Vertical_Distance_To_Hydrology",
            "Horizontal_Distance_To_Roadways",
            "Hillshade_9am",
            "Hillshade_Noon",
            "Hillshade_3pm",
            "Horizontal_Distance_To_Fire_Points",
            "Wilderness_Area_1",
            "Wilderness_Area_2",
            "Wilderness_Area_3",
            "Wilderness_Area_4",
            "Soil_Type_1",
            "Soil_Type_2",
            "Soil_Type_3",
            "Soil_Type_4",
            "Soil_Type_5",
            "Soil_Type_6",
            "Soil_Type_7",
            "Soil_Type_8",
            "Soil_Type_9",
            "Soil_Type_10",
            "Soil_Type_11",
            "Soil_Type_12",
            "Soil_Type_13",
            "Soil_Type_14",
            "Soil_Type_15",
            "Soil_Type_16",
            "Soil_Type_17",
            "Soil_Type_18",
            "Soil_Type_19",
            "Soil_Type_20",
            "Soil_Type_21",
            "Soil_Type_22",
            "Soil_Type_23",
            "Soil_Type_24",
            "Soil_Type_25",
            "Soil_Type_26",
            "Soil_Type_27",
            "Soil_Type_28",
            "Soil_Type_29",
            "Soil_Type_30",
            "Soil_Type_31",
            "Soil_Type_32",
            "Soil_Type_33",
            "Soil_Type_34",
            "Soil_Type_35",
            "Soil_Type_36",
            "Soil_Type_37",
            "Soil_Type_38",
            "Soil_Type_39",
            "Soil_Type_40",
            "Cover_Type",
        ]
        df.columns = columns

        # Map the 40 soil types to a single integer
        # instead of 40 binary columns
        st_cols = [
            "Soil_Type_1",
            "Soil_Type_2",
            "Soil_Type_3",
            "Soil_Type_4",
            "Soil_Type_5",
            "Soil_Type_6",
            "Soil_Type_7",
            "Soil_Type_8",
            "Soil_Type_9",
            "Soil_Type_10",
            "Soil_Type_11",
            "Soil_Type_12",
            "Soil_Type_13",
            "Soil_Type_14",
            "Soil_Type_15",
            "Soil_Type_16",
            "Soil_Type_17",
            "Soil_Type_18",
            "Soil_Type_19",
            "Soil_Type_20",
            "Soil_Type_21",
            "Soil_Type_22",
            "Soil_Type_23",
            "Soil_Type_24",
            "Soil_Type_25",
            "Soil_Type_26",
            "Soil_Type_27",
            "Soil_Type_28",
            "Soil_Type_29",
            "Soil_Type_30",
            "Soil_Type_31",
            "Soil_Type_32",
            "Soil_Type_33",
            "Soil_Type_34",
            "Soil_Type_35",
            "Soil_Type_36",
            "Soil_Type_37",
            "Soil_Type_38",
            "Soil_Type_39",
            "Soil_Type_40",
        ]
        st_vals = []
        for _, row in df[st_cols].iterrows():
            st_vals.append(row.to_numpy().nonzero()[0].item(0))
        df = df.drop(columns=st_cols)
        df["Soil_Type"] = st_vals

        # Map the 4 wilderness areas to a single integer
        # instead of 4 binary columns
        wa_cols = ["Wilderness_Area_1", "Wilderness_Area_2", "Wilderness_Area_3", "Wilderness_Area_4"]
        wa_vals = []
        for _, row in df[wa_cols].iterrows():
            wa_vals.append(row.to_numpy().nonzero()[0].item(0))
        df = df.drop(columns=wa_cols)
        df["Wilderness_Area"] = wa_vals

        if not self.use_tabnet_split:
            # first 11340 records used for training data subset
            # next 3780 records used for validation data subset
            # last 565892 records used for testing data subset
            df["split"] = [0] * 11340 + [1] * 3780 + [2] * 565892
        else:
            # Split used in the tabNet paper
            # https://github.com/google-research/google-research/blob/master/tabnet/download_prepare_covertype.py
            train_val_indices, test_indices = train_test_split(range(len(df)), test_size=0.2, random_state=0)
            train_indices, val_indices = train_test_split(train_val_indices, test_size=0.2 / 0.6, random_state=0)

            df["split"] = 0
            df.loc[val_indices, "split"] = 1
            df.loc[test_indices, "split"] = 2

        makedirs(self.processed_temp_path, exist_ok=True)
        df.to_csv(os.path.join(self.processed_temp_path, self.csv_filename), index=False)
        rename(self.processed_temp_path, self.processed_dataset_path)
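
The two iterrows loops above walk every row in Python just to find the position of the single 1 in each one-hot group. Assuming exactly one column is set per row (the same assumption the loops make), the mapping can be expressed as a vectorized argmax; a sketch operating on the df built in the example above, before the one-hot columns are dropped:

import numpy as np

st_cols = [f"Soil_Type_{i}" for i in range(1, 41)]
wa_cols = [f"Wilderness_Area_{i}" for i in range(1, 5)]

# Position of the single 1 in each one-hot group, equivalent to
# row.to_numpy().nonzero()[0].item(0) per row but without a Python-level loop.
df["Soil_Type"] = np.argmax(df[st_cols].to_numpy(), axis=1)
df["Wilderness_Area"] = np.argmax(df[wa_cols].to_numpy(), axis=1)
df = df.drop(columns=st_cols + wa_cols)
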
Example #10
 def process_downloaded_dataset(self):
     rename(self.raw_dataset_path, self.processed_dataset_path)
Example #11
import logging
import os
import shutil

import yaml

from ludwig.api import LudwigModel
from ludwig.datasets import twitter_bots
from ludwig.utils.fs_utils import rename
from ludwig.visualize import confusion_matrix, learning_curves

if __name__ == "__main__":
    # Cleans out prior results
    shutil.rmtree("./results", ignore_errors=True)
    shutil.rmtree(".visualizations", ignore_errors=True)

    # Loads the dataset
    dataset = twitter_bots.TwitterBots(cache_dir=".")
    training_set, val_set, test_set = dataset.load(split=True)
    # Moves profile images into the local directory so that relative paths in the dataset resolve.
    rename(os.path.join(dataset.processed_dataset_path, "profile_images"),
           "./profile_images")

    with open("./config.yaml") as f:
        config = yaml.safe_load(f.read())

    model = LudwigModel(config, logging_level=logging.INFO)

    train_stats, preprocessed_data, output_directory = model.train(
        dataset=training_set)

    # Generates predictions and performance statistics for the test set.
    test_stats, predictions, output_directory = model.evaluate(
        test_set, collect_predictions=True, collect_overall_stats=True)

    confusion_matrix(
        [test_stats],