Esempio n. 1
0
def load_training_data(data_path: str,
                       row_subset: float = 1,
                       train_split: float = 0.7,
                       shuffle: bool = False,
                       seed=None):
    """
    Load the training set and divide it into training and test splits.
    The "linked_id" column is the value that we want to predict.

    :param data_path: path to the dataset to load;
    :param row_subset: use only the specified fraction of rows in the dataset (value in (0, 1]);
    :param train_split: fraction of rows placed in the training set;
    :param shuffle: if True, shuffle the rows before splitting or subsetting the data;
    :param seed: random seed used for shuffling and for the train/test split, for reproducibility;
    :return: tuple (X_train, X_test, y_train, y_test).
    """
    # Clamp invalid fractions back to "use the whole dataset".
    if row_subset <= 0 or row_subset > 1:
        row_subset = 1

    # BUG FIX: the original read from the module-level global `training_file`,
    # silently ignoring the `data_path` argument; load from the argument instead.
    data = read_file(data_path, set_record_id_as_index=True)
    if shuffle:
        data = data.sample(frac=1, random_state=seed)
    # Obtain the specified subset of rows (ceil so row_subset=1 keeps every row);
    data = data.iloc[:int(np.ceil(len(data) * row_subset))]

    X = data.drop(columns="linked_id")
    y = data["linked_id"]
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=train_split,
                                                        shuffle=shuffle,
                                                        random_state=seed)

    return X_train, X_test, y_train, y_test
Esempio n. 2
0
from evaluation_script import read_file
from test_er import clean_data, index_data, \
    predict_er_parallel, \
    prediction_dict_to_kaggle_df

if __name__ == "__main__":

    # Startup of Ray;
    # NOTE(review): `psutil` and `ray` are used here but are not in the visible
    # import block — confirm they are imported elsewhere in this file.
    # logical=False counts physical cores only (no hyper-threads).
    num_cpus = psutil.cpu_count(logical=False)
    # Stop any Ray session left over from a previous run before re-initializing.
    ray.shutdown()
    ray.init(num_cpus=num_cpus)

    #%% 1. Build the dataset;
    # NOTE(review): absolute, machine-specific path — update before running elsewhere.
    training_file = "/Users/marti/Downloads/oracle-polimi-contest-2019/data/entity-resolution_advanced-topics-training_data.csv"
    train = read_file(training_file, set_record_id_as_index=True)

    # "linked_id" is the target column; everything else is a feature.
    X_train = train.drop(columns="linked_id")
    y_train = train["linked_id"]

    test_file = "/Users/marti/Downloads/oracle-polimi-contest-2019/data/test_data.csv"
    X_test = read_file(test_file, set_record_id_as_index=True)

    #%% 2. Clean data;
    X_train = clean_data(X_train)
    X_test = clean_data(X_test)

    #%% 3. Create indices for the data;
    # index_data takes and returns both frames together so they share indexing.
    X_train, X_test = index_data([X_train, X_test])

    #%% 4. Compute the predictions;
def kaggle_sol_to_df(kaggle_df: pd.DataFrame) -> pd.DataFrame:
    """
    Expand a Kaggle-style solution frame into one row per prediction.

    Each input row holds a "queried_record_id" and a space-separated list of
    candidates in "predicted_record_id"; the output has one
    (queried_record_id, predicted_record_id) pair per candidate.
    """
    indexed = kaggle_df.set_index("queried_record_id")
    expanded_rows = []
    for record_id, row in indexed.iterrows():
        # One output row for every space-separated candidate id.
        for candidate in row["predicted_record_id"].split(" "):
            expanded_rows.append([record_id, candidate])
    return pd.DataFrame(expanded_rows,
                        columns=["queried_record_id", "predicted_record_id"])


#############################
#############################
# Load the data sets used by the script statements below.

# NOTE(review): relative path — assumes the working directory is a sibling of
# `panama-papers-polimi`; confirm before running.
training_filez = "../panama-papers-polimi/data/entity-resolution_advanced-topics-training_data.csv"
train = read_file(training_filez, set_record_id_as_index=False)
#X_train_parra = train.drop(columns="linked_id")
# Only the record ids are kept from the raw training file here.
y_train = train["record_id"]
#X_train, X_test, y_train, y_test = load_training_data(training_file, shuffle=True, row_subset=0.01, seed=42)

trainn = pd.read_csv(
    '../panama-papers-polimi/data/panama_train_expanded_2.csv')
#trainn = read_file(training_file, set_record_id_as_index=True)

# NOTE(review): train and test intentionally (?) load the SAME expanded csv —
# confirm this is a self-match setup and not a copy-paste mistake.
testt = pd.read_csv('../panama-papers-polimi/data/panama_train_expanded_2.csv')
#testt = read_file(testing_file, set_record_id_as_index=True)

# Work on copies so the raw frames stay untouched.
X_train = trainn.copy()
X_test = testt.copy()
X_train = X_train[[