# Standard imports used by both examples below; the project-specific helpers
# (build_environmental_data, VectorModel, NearestCentroidModel) are assumed to
# be imported from the surrounding repository modules.
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


def run(train_csv, train_tensor, test_csv, test_tensor, metric='euclidean'):
    print("K means\n")
    """
        Construction du dataset train.
    """
    df = pd.read_csv(train_csv,
                     sep=';',
                     header='infer',
                     quotechar='"',
                     low_memory=True)

    df = df[['Longitude','Latitude','glc19SpId','scName']]\
       .dropna(axis=0, how='all')\
       .astype({'glc19SpId': 'int64'})
    # target pandas series of the species identifiers (there are 505 labels)
    target_df = df['glc19SpId']

    # building the environmental data
    env_df = build_environmental_data(df[['Latitude', 'Longitude']],
                                      patches_dir=train_tensor)
    X_train = env_df.values
    y_train = target_df.values
    """
        Construction du dataset test.
    """
    df = pd.read_csv(test_csv,
                     sep=';',
                     header='infer',
                     quotechar='"',
                     low_memory=True)

    df = df[['Longitude','Latitude','glc19SpId','scName']]\
       .dropna(axis=0, how='all')\
       .astype({'glc19SpId': 'int64'})
    # target pandas series of the species identifiers (there are 505 labels)
    target_df = df['glc19SpId']

    # building the environmental data
    env_df = build_environmental_data(df[['Latitude', 'Longitude']],
                                      patches_dir=test_tensor)
    X_test = env_df.values
    y_test = target_df.values
    """
        Entrainement modèle.
    """
    # Standardize the features by removing the mean and scaling to unit variance
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    # apply the same train-fitted scaling to the test features
    X_test = scaler.transform(X_test)
    classifier = VectorModel(metric=metric)
    classifier.fit(X_train, y_train)
    """
         Évaluation et Calcul de score.
    """

    y_predicted = classifier.predict(X_test)
    print(f'Top30 score:{classifier.top30_score(y_predicted, y_test)}')
    print(f'MRR score:{classifier.mrr_score(y_predicted, y_test)}')
    print('Params:', classifier.get_params())
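
# A minimal usage sketch for run() above. The file and directory paths below are
# hypothetical placeholders (not part of the original project); adjust them to
# your own data layout before running.
if __name__ == '__main__':
    run(train_csv='../data/occurrences/train.csv',    # hypothetical path
        train_tensor='../data/train_envtensors',      # hypothetical path
        test_csv='../data/occurrences/test.csv',      # hypothetical path
        test_tensor='../data/test_envtensors',        # hypothetical path
        metric='euclidean')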
# Example #2
    df = df[['Longitude','Latitude','glc19SpId','scName']]\
           .dropna(axis=0, how='all')\
           .astype({'glc19SpId': 'int64'})

    # target pandas series of the species identifiers (there are 505 labels)
    target_df = df['glc19SpId']

    # correspondence table between ids and the species taxonomic names
    # (Taxref names with year of discovery)
    taxonomic_names = pd.read_csv('../data/occurrences/taxaName_glc19SpId.csv',
                                  sep=';',
                                  header='infer',
                                  quotechar='"',
                                  low_memory=True)

    # building the environmental data
    env_df = build_environmental_data(df[['Latitude', 'Longitude']],
                                      patches_dir='example_envtensors')
    X = env_df.values
    y = target_df.values
    # Standardize the features by removing the mean and scaling to unit variance
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    # Evaluate the model on a single random train/test split:
    print("Test nearest centroid model")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    classifier = NearestCentroidModel(metric='euclidean')
    classifier.fit(X_train, y_train)
    y_predicted = classifier.predict(X_test)
    print(f'Top30 score:{classifier.top30_score(y_predicted, y_test)}')
    print(f'MRR score:{classifier.mrr_score(y_predicted, y_test)}')
    print('Params:', classifier.get_params())
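
# Hedged sketch of the two reported metrics, assuming y_predicted holds, for each
# test sample, the candidate labels ranked from most to least likely. The actual
# VectorModel/NearestCentroidModel methods may differ in signature and details.
import numpy as np

def top30_score_sketch(y_predicted, y_true):
    # fraction of samples whose true label appears among the 30 top-ranked labels
    hits = [y in np.asarray(ranked)[:30] for ranked, y in zip(y_predicted, y_true)]
    return float(np.mean(hits))

def mrr_score_sketch(y_predicted, y_true):
    # mean reciprocal rank: average of 1 / (rank of the true label), 0 if absent
    reciprocal_ranks = []
    for ranked, y in zip(y_predicted, y_true):
        ranked = list(ranked)
        rank = ranked.index(y) + 1 if y in ranked else None
        reciprocal_ranks.append(1.0 / rank if rank is not None else 0.0)
    return float(np.mean(reciprocal_ranks))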