def output_predictions(dataset: Dataset, y_preds: np.ndarray,
                       csv_out: str) -> None:
    """Writes predictions to file.

    Writes predictions made on `dataset` to a specified file on disk.
    `dataset.ids` are used to format predictions. The produced CSV file will
    have format as follows

    | ID          | Task1Name    | Task2Name    |
    | ----------- | ------------ | ------------ |
    | identifier1 | prediction11 | prediction12 |
    | identifier2 | prediction21 | prediction22 |

    Parameters
    ----------
    dataset: dc.data.Dataset
        Dataset on which predictions have been made. Only `dataset.ids` and
        `dataset.get_task_names()` are read.
    y_preds: np.ndarray
        Predictions to output. Reshaped to `(len(y_preds), n_tasks)` before
        writing, so a 1D array is accepted for a single-task dataset.
    csv_out: str
        Name of file to write predictions to. An existing file is overwritten.

    Raises
    ------
    ValueError
        If the number of prediction rows differs from the number of ids, or
        if `y_preds` cannot be reshaped to `(len(y_preds), n_tasks)`.
    """
    data_ids = dataset.ids
    # Query the task names once; copy to a list so the header concatenation
    # below works for any sequence type.
    task_names = list(dataset.get_task_names())
    n_tasks = len(task_names)
    y_preds = np.reshape(y_preds, (len(y_preds), n_tasks))
    # Explicit check instead of `assert`: asserts are stripped under `-O`, and
    # `zip` below would then silently truncate to the shorter sequence.
    if len(y_preds) != len(data_ids):
        raise ValueError(
            "Number of predictions (%d) does not match number of ids (%d)." %
            (len(y_preds), len(data_ids)))
    # newline="" lets the csv module control line endings itself; without it
    # platforms that translate "\n" emit "\r\r\n" and produce blank rows.
    with open(csv_out, "w", newline="") as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(["ID"] + task_names)
        csvwriter.writerows(
            [mol_id] + list(y_pred)
            for mol_id, y_pred in zip(data_ids, y_preds))
def __init__(self, dataset: Dataset):
    """Initializes the transformer from the label/weight statistics of `dataset`.

    Computes, for every distinct label value in `dataset.y`, the total weight
    carried by that class and derives an integer duplication ratio relative
    to the heaviest class. Results are stored on the instance as

    - `self.classes`: sorted list of the distinct values found in `dataset.y`.
    - `self.duplication_ratio`: list aligned with `self.classes`; entry `i` is
      `int(largest_class_weight / class_weight_i)`, or 0 when class `i` has
      no weight.

    Parameters
    ----------
    dataset: dc.data.Dataset
        Dataset to compute the balancing statistics from. Must be singletask.

    Raises
    ------
    ValueError
        If `dataset` has more than one task, or if `dataset.y` / `dataset.w`
        are neither 1D nor 2D.
    """
    super(DuplicateBalancingTransformer, self).__init__(transform_X=True,
                                                        transform_y=True,
                                                        transform_w=True,
                                                        transform_ids=True,
                                                        dataset=dataset)

    if len(dataset.get_task_names()) > 1:
        raise ValueError(
            "This transformation is only defined for singletask datsets.")

    # Get the labels/weights
    y = dataset.y
    w = dataset.w
    # Normalize shapes so that both arrays are (N, 1)
    if len(y.shape) == 1:
        y = np.reshape(y, (len(y), 1))
    if len(w.shape) == 1:
        w = np.reshape(w, (len(w), 1))
    if len(y.shape) != 2:
        raise ValueError("y must be of shape (N,) or (N, n_tasks)")
    if len(w.shape) != 2:
        raise ValueError("w must be of shape (N,) or (N, n_tasks)")
    # Classes are collected before filtering, so a class whose samples all
    # have zero weight is still listed (and gets a duplication ratio of 0).
    self.classes = sorted(np.unique(y))

    # Remove labels with zero weights. The same mask must be applied to `w`:
    # boolean indexing flattens `y` and shortens it, so leaving `w` untouched
    # makes `w[y == c]` below fail with a mask-length mismatch whenever at
    # least one weight is zero.
    nonzero = w != 0
    y = y[nonzero]
    w = w[nonzero]

    # Note that we may have 0 elements of a given class since we remove those
    # labels with zero weight. Indexing works because y and w are now 1D
    # arrays of equal length.
    class_weights = [np.sum(w[y == c]) for c in self.classes]
    weight_largest = max(class_weights)
    # This is the right ratio since int(N/num_c) * num_c \approx N
    # for all classes
    duplication_ratio = [
        int(weight_largest / float(c_weight)) if c_weight > 0 else 0
        for c_weight in class_weights
    ]
    self.duplication_ratio = duplication_ratio