コード例 #1
0
ファイル: evaluate.py プロジェクト: wruijun/deepchem
def output_predictions(dataset: Dataset, y_preds: np.ndarray,
                       csv_out: str) -> None:
    """Write predictions made on `dataset` to a CSV file.

    Each row pairs one entry of `dataset.ids` with the predictions for that
    entry, one column per task. The produced CSV file has the following
    layout:

    | ID          | Task1Name    | Task2Name    |
    | ----------- | ------------ | ------------ |
    | identifier1 | prediction11 | prediction12 |
    | identifier2 | prediction21 | prediction22 |

    Parameters
    ----------
    dataset: dc.data.Dataset
        Dataset on which predictions have been made.
    y_preds: np.ndarray
        Predictions to output. Reshaped to `(len(y_preds), n_tasks)` before
        writing.
    csv_out: str
        Name of file to write predictions to.

    Raises
    ------
    AssertionError
        If the number of prediction rows differs from the number of ids.
    """
    data_ids = dataset.ids
    # Materialize the task names as a plain list once. If they arrive as a
    # NumPy array, `["ID"] + names` is handed to NumPy's `+` instead of list
    # concatenation, which either raises or mangles the header.
    task_names = list(dataset.get_task_names())
    n_tasks = len(task_names)
    y_preds = np.reshape(y_preds, (len(y_preds), n_tasks))
    assert len(y_preds) == len(data_ids), (
        "Number of predictions does not match number of dataset ids.")
    # The csv module does its own line-ending handling; newline="" stops the
    # text layer from translating it a second time (extra blank rows).
    with open(csv_out, "w", newline="") as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(["ID"] + task_names)
        csvwriter.writerows([mol_id] + list(y_pred)
                            for mol_id, y_pred in zip(data_ids, y_preds))
コード例 #2
0
ファイル: duplicate.py プロジェクト: yingyingjin/deepchem
    def __init__(self, dataset: Dataset):
        """Compute per-class duplication ratios from a single-task dataset.

        Sets `self.classes` to the sorted unique labels of `dataset.y` and
        `self.duplication_ratio` to, for each class, the integer ratio of the
        largest class weight total to that class's weight total (0 for a
        class whose total weight is not positive).

        Parameters
        ----------
        dataset: Dataset
            Dataset whose `y` and `w` are used to derive the ratios. It must
            report at most one task name.

        Raises
        ------
        ValueError
            If the dataset has more than one task, or if `y` or `w` is
            neither 1-D nor 2-D.
        """
        super(DuplicateBalancingTransformer, self).__init__(transform_X=True,
                                                            transform_y=True,
                                                            transform_w=True,
                                                            transform_ids=True,
                                                            dataset=dataset)

        if len(dataset.get_task_names()) > 1:
            raise ValueError(
                "This transformation is only defined for singletask datsets.")

        # Get the labels/weights
        y = dataset.y
        w = dataset.w
        # Normalize shapes so that y and w are both (N, 1)
        if len(y.shape) == 1:
            y = np.reshape(y, (len(y), 1))
        if len(w.shape) == 1:
            w = np.reshape(w, (len(w), 1))
        if len(y.shape) != 2:
            raise ValueError("y must be of shape (N,) or (N, n_tasks)")
        if len(w.shape) != 2:
            raise ValueError("w must be of shape (N,) or (N, n_tasks)")
        # Classes are collected before filtering, so a class whose samples
        # all have zero weight still appears (with a ratio of 0).
        self.classes = sorted(np.unique(y))
        # Remove entries with zero weight. The same mask is applied to both
        # arrays so they stay aligned (and both become 1-D); filtering only y
        # would leave `y == c` with a different length from w below.
        nonzero = w != 0
        y = y[nonzero]
        w = w[nonzero]
        # Total weight per class; may be 0 for classes removed by the mask.
        class_weights = [np.sum(w[y == c]) for c in self.classes]
        weight_largest = max(class_weights)
        # This is the right ratio since int(N/num_c) * num_c \approx N
        # for all classes
        self.duplication_ratio = [
            int(weight_largest / float(c_weight)) if c_weight > 0 else 0
            for c_weight in class_weights
        ]