Example 1
def generate_sparse(qa_queue, length, l_matrix_fname, r_matrix_fname):
    logging.info("Start consumer")
    Qs = None
    As = None
    count = 0
    while True:
        try:
            Qs_temp, As_temp = qa_queue.get(timeout=120)
            count += Qs_temp.shape[0]
            if Qs is None:
                Qs = Qs_temp
                As = As_temp
            else:
                Qs = sparse_vstack((Qs, Qs_temp))
                As = sparse_vstack((As, As_temp))
            if count == length:
                raise Empty
            if count % INF_FREQ == 0:
                logging.info("loading: %d/%d, %.2f%%" % (count, length, count / length * 100))
        except Empty:
            logging.info("loading: %d/%d, %.2f%%" % (count, length, count / length * 100))
            break

    logging.info("Stop consumer")
    with open(l_matrix_fname, 'wb') as f:
        pkl.dump(Qs, f, protocol=4)
    with open(r_matrix_fname, 'wb') as f:
        pkl.dump(As, f, protocol=4)
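A minimal end-to-end sketch of driving the consumer above. The producer side, INF_FREQ, and the imports are assumptions mirroring what the snippet expects; they are not taken from the original module:

import logging
import pickle as pkl
from multiprocessing import Queue
from queue import Empty

import numpy as np
from scipy.sparse import csr_matrix, vstack as sparse_vstack

INF_FREQ = 1000
logging.basicConfig(level=logging.INFO)

qa_queue = Queue()
for _ in range(4):                       # pretend a producer pushed 4 (Q, A) batches
    qa_queue.put((csr_matrix(np.eye(2)), csr_matrix(np.ones((2, 3)))))

# consumes 8 rows in total, then pickles the stacked Qs/As matrices
generate_sparse(qa_queue, length=8,
                l_matrix_fname='Qs.pkl', r_matrix_fname='As.pkl')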
Example 2
    def _load_pdbbind_desc(self, desc_path, pdbbind_version=2016,
                           train_set='refined', test_set='core',
                           train_blacklist=None, fold_size=None):
        """
        TODO: write the docs

        """

        df = pd.read_csv(desc_path, index_col='pdbid')

        # generate dense representation of sparse descriptor in CSV
        cols = list(map(str, range(len(self.descriptor_generator))))
        if 'sparse' in df.columns:
            # convert strings to np.arrays
            df['sparse'] = df['sparse'].map(
                lambda x: np.fromstring(x[1:-1], dtype=np.uint64, sep=','))
            cols = 'sparse'  # sparse array will have one column
            # fold only if necessary
            if fold_size:
                df['sparse'] = df['sparse'].map(lambda x: fold(x, fold_size))
            # convert to sparse csr_matrix
            df['sparse'] = df['sparse'].map(
                partial(sparse_to_csr_matrix,
                        size=len(self.descriptor_generator)))

        if isinstance(train_set, six.string_types):
            train_idx = df['%i_%s' % (pdbbind_version, train_set)]
        else:
            train_idx = df[['%i_%s' % (pdbbind_version, s)
                            for s in train_set]].any(axis=1)
        if train_blacklist:
            train_idx &= ~df.index.isin(train_blacklist)
        train_idx &= ~df['%i_%s' % (pdbbind_version, test_set)]

        # load sparse matrices as training is usually faster on them
        if 'sparse' in df.columns:
            self.train_descs = sparse_vstack(df.loc[train_idx, cols].values,
                                             format='csr')
        else:
            self.train_descs = df.loc[train_idx, cols].values
        self.train_target = df.loc[train_idx, 'act'].values

        test_idx = df['%i_%s' % (pdbbind_version, test_set)]
        if 'sparse' in df.columns:
            self.test_descs = sparse_vstack(df.loc[test_idx, cols].values,
                                            format='csr')
        else:
            self.test_descs = df.loc[test_idx, cols].values
        self.test_target = df.loc[test_idx, 'act'].values
Example 3
def ZfromN(normals, mask, Mx, My):
    """
    Compute (integrate) the depth map of a normal map.
    
    The reconstruction is up to a scaling factor.
    """
    b = -normals
    b[:,2] = 0
    b = b.T.ravel()

    N = normals.shape[0]
    ij = list(range(N))
    X = coo_matrix((normals[:,0], (ij, ij)), shape=Mx.shape)
    Y = coo_matrix((normals[:,1], (ij, ij)), shape=Mx.shape)
    Z = coo_matrix((normals[:,2], (ij, ij)), shape=Mx.shape)
    A = sparse_vstack((Z.dot(Mx),
                       Z.dot(My),
                       Y.dot(Mx) - X.dot(My)))
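    # Row blocks of A encode, per pixel: n_z * dz/dx = -n_x, n_z * dz/dy = -n_y,
    # and the cross constraint n_y * dz/dx - n_x * dz/dy = 0, consistent with a
    # surface normal proportional to (-dz/dx, -dz/dy, 1).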
    # Is the 3rd constraint really useful?

    surf = sparse_lsqr(A, b)
    surf = surf[0]
    surf -= surf.min()

    out = np.zeros(mask.shape, np.float32)
    out[mask] = surf.ravel()

    return out
Example 4
    def build(self, ligands, protein=None):
        """Builds descriptors for series of ligands

        Parameters
        ----------
        ligands: iterable of oddt.toolkit.Molecule objects or a single Molecule
            A list or other iterable of ligands for which to build descriptors,
            or a single molecule.

        protein: oddt.toolkit.Molecule or None (default=None)
            Default protein to use as reference

        """
        if protein:
            self.protein = protein
        if is_molecule(ligands):
            ligands = [ligands]
        out = []
        for mol in ligands:
            if self.protein is None:
                out.append(self.func(mol))
            else:
                out.append(self.func(mol, protein=self.protein))
        if self.sparse:
            # out = list(map(partial(sparse_to_csr_matrix, size=self.shape), out))
            return sparse_vstack(map(
                partial(sparse_to_csr_matrix, size=self.shape), out),
                                 format='csr')
        else:
            return np.vstack(out)
Example 5
def test_sparse_densify():
    """FP densify"""
    sparse_fp = [0, 33, 49, 53, 107, 156, 161, 203, 215, 230, 251, 269, 299,
                 323, 331, 376, 389, 410, 427, 430, 450, 484, 538, 592, 593,
                 636, 646, 658, 698, 699, 702, 741, 753, 807, 850, 861, 882,
                 915, 915, 915, 969, 969, 1023]

    # count vectors
    dense = sparse_to_dense(sparse_fp, size=1024, count_bits=True)
    csr = sparse_to_csr_matrix(sparse_fp, size=1024, count_bits=True)
    assert_array_equal(dense.reshape(1, -1), csr.toarray())
    resparsed = dense_to_sparse(dense)
    resparsed_csr = csr_matrix_to_sparse(csr)
    assert_array_equal(sparse_fp, resparsed)
    assert_array_equal(sparse_fp, resparsed_csr)

    # bool vectors
    dense = sparse_to_dense(sparse_fp, size=1024, count_bits=False)
    csr = sparse_to_csr_matrix(sparse_fp, size=1024, count_bits=False)
    assert_array_equal(dense.reshape(1, -1), csr.toarray())
    resparsed = dense_to_sparse(dense)
    resparsed_csr = csr_matrix_to_sparse(csr)
    assert_array_equal(np.unique(sparse_fp), resparsed)
    assert_array_equal(np.unique(sparse_fp), resparsed_csr)

    # test stacking
    np.random.seed(0)
    sparse_fps = np.random.randint(0, 1024, size=(20, 100))
    dense = np.vstack([sparse_to_dense(fp, size=1024) for fp in sparse_fps])
    csr = sparse_vstack(sparse_to_csr_matrix(fp, size=1024) for fp in sparse_fps)
    assert_array_equal(dense, csr.toarray())

    # test exceptions
    with pytest.raises(ValueError):
        csr_matrix_to_sparse(np.array([1, 2, 3]))
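A rough, dependency-free sketch of the densify semantics the test above checks; the helper name and behavior here are inferred from the assertions, not taken from the oddt API:

import numpy as np

def sparse_to_dense_sketch(indices, size, count_bits=True):
    # count how many times each index occurs (count vector) ...
    dense = np.bincount(indices, minlength=size)
    # ... or clip to a 0/1 presence vector
    return dense if count_bits else (dense > 0).astype(np.uint8)

fp = [0, 33, 915, 915, 1023]
print(sparse_to_dense_sketch(fp, size=1024)[915])                    # 2 (count)
print(sparse_to_dense_sketch(fp, size=1024, count_bits=False)[915])  # 1 (presence)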
Example 6
def test_sparse_densify():
    """FP densify"""
    sparse_fp = [
        0, 33, 49, 53, 107, 156, 161, 203, 215, 230, 251, 269, 299, 323, 331,
        376, 389, 410, 427, 430, 450, 484, 538, 592, 593, 636, 646, 658, 698,
        699, 702, 741, 753, 807, 850, 861, 882, 915, 915, 915, 969, 969, 1023
    ]

    # count vectors
    dense = sparse_to_dense(sparse_fp, size=1024, count_bits=True)
    csr = sparse_to_csr_matrix(sparse_fp, size=1024, count_bits=True)
    assert_array_equal(dense.reshape(1, -1), csr.toarray())
    resparsed = dense_to_sparse(dense)
    assert_array_equal(sparse_fp, resparsed)

    # bool vectors
    dense = sparse_to_dense(sparse_fp, size=1024, count_bits=False)
    csr = sparse_to_csr_matrix(sparse_fp, size=1024, count_bits=False)
    assert_array_equal(dense.reshape(1, -1), csr.toarray())
    resparsed = dense_to_sparse(dense)
    assert_array_equal(np.unique(sparse_fp), resparsed)

    # test stacking
    np.random.seed(0)
    sparse_fps = np.random.randint(0, 1024, size=(20, 100))
    dense = np.vstack([sparse_to_dense(fp, size=1024) for fp in sparse_fps])
    csr = sparse_vstack(
        sparse_to_csr_matrix(fp, size=1024) for fp in sparse_fps)
    assert_array_equal(dense, csr.toarray())
Example 7
    def _run_interface(self, runtime):
        from scipy.sparse import vstack as sparse_vstack

        # Calculate the physical coordinates of target grid
        targetnii = nb.load(self.inputs.in_target)
        allmask = np.ones_like(targetnii.dataobj, dtype="uint8")

        weights = []
        coeffs = []

        for cname in self.inputs.in_coeff:
            coeff_nii = nb.load(cname)
            wmat = grid_bspline_weights(targetnii, coeff_nii)
            weights.append(wmat)
            coeffs.append(coeff_nii.get_fdata(dtype="float32").reshape(-1))

        data = np.zeros(targetnii.shape, dtype="float32")
        data[allmask == 1] = np.squeeze(
            np.vstack(coeffs).T) @ sparse_vstack(weights)

        hdr = targetnii.header.copy()
        hdr.set_data_dtype("float32")
        self._results["out_field"] = fname_presuffix(self.inputs.in_target,
                                                     suffix="_field",
                                                     newpath=runtime.cwd)
        targetnii.__class__(data, targetnii.affine,
                            hdr).to_filename(self._results["out_field"])

        # Generate warp field
        phaseEncDim = "ijk".index(self.inputs.pe_dir[0])
        phaseEncSign = [1.0, -1.0][len(self.inputs.pe_dir) != 2]

        data *= phaseEncSign * self.inputs.ro_time

        fieldshape = tuple(list(data.shape[:3]) + [3])
        self._results["out_warp"] = fname_presuffix(self.inputs.in_target,
                                                    suffix="_xfm",
                                                    newpath=runtime.cwd)
        # Compose a vector field
        field = np.zeros((data.size, 3), dtype="float32")
        field[..., phaseEncDim] = data.reshape(-1)
        aff = targetnii.affine.copy()
        aff[:3, 3] = 0.0
        # Multiplying by the affine implicitly applies the voxel size to the shift map
        field = nb.affines.apply_affine(aff, field).reshape(fieldshape)
        warpnii = targetnii.__class__(
            field[:, :, :, np.newaxis, :].astype("float32"), targetnii.affine,
            None)
        warpnii.header.set_intent("vector", (), "")
        warpnii.header.set_xyzt_units("mm")
        warpnii.to_filename(self._results["out_warp"])
        return runtime
Example 8
    def fit(self, spatialimage):
        r"""
        Generate the interpolation matrix (and the VSM with it).

        Implements Eq. :math:`\eqref{eq:1}`, interpolating :math:`f(\mathbf{s})`
        for all voxels in the target-image's extent.

        Returns
        -------
        updated : :obj:`bool`
            ``True`` if the internal field representation was fit,
            ``False`` if cache was valid and will be reused.

        """
        # Calculate the physical coordinates of target grid
        if isinstance(spatialimage, (str, bytes, Path)):
            spatialimage = nb.load(spatialimage)

        if self.shifts is not None:
            newaff = spatialimage.affine
            newshape = spatialimage.shape

            if np.all(newshape == self.shifts.shape) and np.allclose(
                    newaff, self.shifts.affine):
                return False

        weights = []
        coeffs = []

        # Generate tensor-product B-Spline weights
        for level in listify(self.coeffs):
            self.xfm.reference = spatialimage
            moved_cs = level.__class__(level.dataobj,
                                       self.xfm.matrix @ level.affine,
                                       level.header)
            wmat = grid_bspline_weights(spatialimage, moved_cs)
            weights.append(wmat)
            coeffs.append(level.get_fdata(dtype="float32").reshape(-1))

        # Interpolate the VSM (voxel-shift map)
        vsm = np.zeros(spatialimage.shape[:3], dtype="float32")
        vsm = (
            np.squeeze(np.hstack(coeffs).T) @ sparse_vstack(weights)).reshape(
                vsm.shape)

        # Cache
        self.shifts = nb.Nifti1Image(vsm, spatialimage.affine, None)
        self.shifts.header.set_intent("estimate", name="Voxel shift")
        self.shifts.header.set_xyzt_units("mm")
        return True
Example 9
def compute_feature_matrix(df, vectorizer, combine=None):
    fq1 = vectorizer.transform(df.loc[:, Fields.question1])
    fq2 = vectorizer.transform(df.loc[:, Fields.question2])

    combine = combine or 'diff'

    if combine == 'stack':
        return sparse_vstack([fq1, fq2])

    if combine == 'intersect':
        return fq1.multiply(fq2)

    if combine == 'diff':
        return abs(fq1 - fq2).tocsr()

    raise ValueError("Unknown combine mode: %r" % combine)
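A minimal usage sketch; the `Fields` holder here is a hypothetical stand-in for whatever column-name constants the original project defines:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

class Fields:
    question1 = 'question1'
    question2 = 'question2'

df = pd.DataFrame({'question1': ['how do i cook rice', 'what is python'],
                   'question2': ['how to cook rice', 'what is java']})
vectorizer = TfidfVectorizer().fit(pd.concat([df['question1'], df['question2']]))

X_diff = compute_feature_matrix(df, vectorizer)                    # |tfidf(q1) - tfidf(q2)|
X_stack = compute_feature_matrix(df, vectorizer, combine='stack')  # rows of q1, then rows of q2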
Example 10
def get_upsampling(train_df, count_threshold, tfidf_vector):
    train_df_copy = train_df.copy().reset_index(
        drop=True)  # needed for tfidf_vector.getrow(index)
    dfs_by_target = []
    max_df_size = 0
    sizes = train_df_copy.target.value_counts().to_dict()
    for target in train_df_copy.target.unique():
        df = train_df_copy[train_df_copy['target'] == target]
        if sizes[target] >= count_threshold:
            dfs_by_target.append(df)
        else:
            dfs_by_target.append(df.sample(max(sizes.values()), replace=True))
    X_train = sparse_vstack(
        [tfidf_vector.getrow(i) for df in dfs_by_target for i in df.index])
    y_train = pd.concat(dfs_by_target, axis=0).target
    return X_train, y_train
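A minimal usage sketch; the data frame and the TF-IDF matrix are toy stand-ins whose rows are aligned, which is what the function assumes:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

train_df = pd.DataFrame({'text': ['good', 'great', 'fine', 'bad'],
                         'target': ['pos', 'pos', 'pos', 'neg']})
tfidf_vector = TfidfVectorizer().fit_transform(train_df['text'])   # one row per example

X_train, y_train = get_upsampling(train_df, count_threshold=2, tfidf_vector=tfidf_vector)
print(X_train.shape[0], y_train.value_counts().to_dict())          # 'neg' upsampled to 3 rows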
Example 11
    def restoreMaskedBins(self):
        """
        Puts back into the matrix the bins that were removed.
        """
        if len(self.orig_bin_ids) == 0:
            return
        # the rows to add, as an empty sparse matrix
        M = self.matrix.shape[0]
        N = len(self.orig_bin_ids) - M
        rows_mat = csr_matrix((N, M))
        # cols to add
        cols_mat = csr_matrix((M + N, N))

        # add the rows and cols at the end of the
        # current matrix
        self.matrix = sparse_vstack([self.matrix, rows_mat])
        self.matrix = sparse_hstack([self.matrix, cols_mat], format='csr')

        # the new matrix has the right number of cols and rows, now
        # they need to be reordered to be back in their original places
        rows = cols = np.argsort(self.orig_bin_ids)
        self.matrix = self.matrix[rows, :][:, cols]
        self.cut_intervals = [self.orig_cut_intervals[x] for x in rows]
        self.interval_trees, self.chrBinBoundaries = \
            self.intervalListToIntervalTree(self.cut_intervals)
        # set as nan_bins the masked bins that were restored
        self.nan_bins = self.orig_bin_ids[M:]

        if self.correction_factors is not None:
            # add missing values as nans at end of array
            self.correction_factors = np.concatenate(
                [self.correction_factors,
                 np.repeat(np.nan, N)])
            # reorder array
            self.correction_factors = self.correction_factors[rows]

        # reset orig bins ids and cut intervals
        self.orig_bin_ids = []
        self.orig_cut_intervals = []
        log.info("masked bins were restored\n")
Example 12
def testMultiModel(X, y, numModels):
    activeIndexTuple = y.nonzero()
    activeIndexValues = activeIndexTuple[0]
    activeTotalCount = activeIndexValues.shape[0]

    X_active = X[activeIndexValues, :]

    fs = frozenset(activeIndexValues)

    allIndices = [k for k in range(len(y))]
    nonActiveIndices = list(filter(lambda q: q not in fs, allIndices))
    nonActiveIndexValues = np.array(nonActiveIndices, dtype=np.int64)
    X_nonActive = X[nonActiveIndexValues, :]

    modelRangeList = getRangeList(len(nonActiveIndices), numModels)

    returnList = []

    for modelIndex in range(numModels):
        currentZerosList = modelRangeList[modelIndex]
        currentZerosArray = np.array(currentZerosList, dtype=np.int64)
        X_nonActiveCurrent = X_nonActive[currentZerosArray, :]

        #X_model = np.append(X_active, X_nonActiveCurrent)

        X_model = sparse_vstack([X_active, X_nonActiveCurrent]).tolil()

        y_model = [1] * X_active.shape[0]
        y_model.extend([0] * X_nonActiveCurrent.shape[0])

        print("Sub model X = " + str(X_model.shape))
        print("Sub model y = " + str(len(y_model)))

        print("Constructing model #" + str(modelIndex))

        returnList.append((X_model, y_model))

    return returnList
Example 13
def cross_validation_fold(index, splits_in, splits_out):
    """
    k-fold cross-validation "fold": performs validation using exactly
    one of the splits as validation set and the rest of the dataset
    as training data.
    :param index: Index of the split to use as validation data
    :param splits_in: List of splits of the original dataset inputs
    :param splits_out: List of splits of the original dataset outputs
    :return: The accuracy score for a LinearSVC trained on all the
    splits except <index> and then validated on split <index>
    """
    validation_in = splits_in[index]
    validation_out = splits_out[index]
    cf = LabelPowerset(LinearSVC())

    # train on all splits except split <index>
    cf.fit(np.vstack(splits_in[:index] + splits_in[index + 1:]),
           sparse_vstack(splits_out[:index] + splits_out[index + 1:]))

    # validate on split <index>
    return validate(cf,
                    validation_in,
                    validation_out,
                    return_predictions=False)
Example 14
    def _run_interface(self, runtime):
        from sklearn import linear_model as lm
        from scipy.sparse import vstack as sparse_vstack

        # Load in the fieldmap
        fmapnii = nb.load(self.inputs.in_data)
        data = fmapnii.get_fdata(dtype="float32")
        mask = (
            nb.load(self.inputs.in_mask).get_fdata() > 0
            if isdefined(self.inputs.in_mask)
            else np.ones_like(data, dtype=bool)
        )
        bs_spacing = [np.array(sp, dtype="float32") for sp in self.inputs.bs_spacing]

        # Recenter the fieldmap
        if self.inputs.recenter == "mode":
            from scipy.stats import mode

            data -= mode(data[mask], axis=None)[0][0]
        elif self.inputs.recenter == "median":
            data -= np.median(data[mask])
        elif self.inputs.recenter == "mean":
            data -= np.mean(data[mask])

        # Calculate the spatial location of control points
        bs_levels = []
        ncoeff = []
        weights = None
        for sp in bs_spacing:
            level = bspline_grid(fmapnii, control_zooms_mm=sp)
            bs_levels.append(level)
            ncoeff.append(level.dataobj.size)

            weights = (
                gbsw(fmapnii, level)
                if weights is None
                else sparse_vstack((weights, gbsw(fmapnii, level)))
            )

        regressors = weights.T.tocsr()[mask.reshape(-1), :]

        # Fit the model
        model = lm.Ridge(alpha=self.inputs.ridge_alpha, fit_intercept=False)
        model.fit(regressors, data[mask])

        interp_data = np.zeros_like(data)
        interp_data[mask] = np.array(model.coef_) @ regressors.T  # Interpolation

        # Store outputs
        out_name = fname_presuffix(
            self.inputs.in_data, suffix="_field", newpath=runtime.cwd
        )
        hdr = fmapnii.header.copy()
        hdr.set_data_dtype("float32")
        fmapnii.__class__(interp_data, fmapnii.affine, hdr).to_filename(out_name)
        self._results["out_field"] = out_name

        index = 0
        self._results["out_coeff"] = []
        for i, (n, bsl) in enumerate(zip(ncoeff, bs_levels)):
            out_level = out_name.replace("_field.", f"_coeff{i:03}.")
            bsl.__class__(
                np.array(model.coef_, dtype="float32")[index : index + n].reshape(
                    bsl.shape
                ),
                bsl.affine,
                bsl.header,
            ).to_filename(out_level)
            index += n
            self._results["out_coeff"].append(out_level)

        # Write out fitting-error map
        self._results["out_error"] = out_name.replace("_field.", "_error.")
        fmapnii.__class__(
            data * mask - interp_data, fmapnii.affine, fmapnii.header
        ).to_filename(self._results["out_error"])

        if not self.inputs.extrapolate:
            return runtime

        if np.all(mask):
            self._results["out_extrapolated"] = self._results["out_field"]
            return runtime

        extrapolators = weights.tocsc()[:, ~mask.reshape(-1)]
        interp_data[~mask] = np.array(model.coef_) @ extrapolators  # Extrapolation
        self._results["out_extrapolated"] = out_name.replace("_field.", "_extra.")
        fmapnii.__class__(interp_data, fmapnii.affine, hdr).to_filename(
            self._results["out_extrapolated"]
        )
        return runtime
Example 15
    def _generate_feats(self, data, mode):
        # lexical feats
        #if mode == "train":
        #    self.tfidf_vect = TfidfVectorizer(ngram_range = self.ngram_rng, min_df = self.min_df, use_idf = self.use_idf)
        #    self.tfidf_vect.fit([x[1:-1] for x in list(data.text)]) # the x[1:-1] strips the initial and final [ and ] from the texts
        #feats = self.tfidf_vect.transform([x[1:-1] for x in list(data.text)])

        feats = self.transformer_model.encode(
            [x[1:-1] for x in list(data.text)])
        feats = np.array(feats)

        if self.use_utterance_feats:
            # utterance feats
            ut_feats = np.zeros((data.shape[0], 3))

            current_mid = data.iloc[0, 8]
            current_max_timestamp = max(
                data[data.meeting_id == current_mid].timestamp)
            for i in range(data.shape[0]):
                text = data.iloc[i, 2][1:-1]
                timestamp = data.iloc[i, 1]
                next_timestamp = data.iloc[i + 1, 1] if (
                    i + 1 < data.shape[0]
                    and data.iloc[i + 1, 8] == data.iloc[i, 8]) else None
                # The first condition handles the end of the data frame (last
                # utterance of the last meeting); the second handles the break
                # between two meetings (last utterance of every meeting).
                # Without the second, the last timestamp of meeting X (e.g.
                # 1853.2) would be followed by 0.0 as the first of meeting Y,
                # giving a negative difference that breaks things downstream.

                ut_feats[i, 0] = len(text.split(" "))  # length in words
                # 2.0 is just an arbitrary approximate duration for the last
                # utterance of each meeting
                ut_feats[i, 1] = (next_timestamp - timestamp
                                  if next_timestamp is not None else 2.0)
                ut_feats[i, 2] = timestamp / current_max_timestamp

                if next_timestamp is None and i + 1 < data.shape[0]:
                    # this is a breaking point between meetings, so update the
                    # values used in the next iteration
                    current_mid = data.iloc[i + 1, 8]
                    current_max_timestamp = max(
                        data[data.meeting_id == current_mid].timestamp)

            feats = csr_matrix(sparse_hstack([feats, csr_matrix(ut_feats)]))

        # expand all utterance-level feats to include feats of the prev and next utterances
        prev_context_feat_mats, next_context_feat_mats = [], []
        # prev context
        for offset in range(1, self.prev_context_len + 1):
            context_feats = feats[:-offset, :]
            padding = csr_matrix(np.zeros((offset, feats.shape[1])))
            final = sparse_vstack((padding, context_feats))
            prev_context_feat_mats.append(final)

        # next context
        for offset in range(1, self.next_context_len + 1):
            context_feats = feats[offset:, :]
            padding = csr_matrix(np.zeros((offset, feats.shape[1])))
            final = sparse_vstack((context_feats, padding))
            next_context_feat_mats.append(final)

        #feats = sparse_hstack([feats] + prev_context_feat_mats + next_context_feat_mats)

        if self.do_scaling:
            if mode == "train":
                self.scaler = StandardScaler(with_mean=False)
                self.scaler.fit(feats)
            feats = self.scaler.transform(feats)

        return feats
Example 16
    def restoreMaskedBins(self):
        """
        Puts back into the matrix the bins that were removed.


        Examples
        --------
        >>> from scipy.sparse import coo_matrix
        >>> row, col = np.triu_indices(5)
        >>> cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1),
        ... ('a', 20, 30, 1), ('a', 30, 40, 1), ('b', 40, 50, 1)]
        >>> hic = hiCMatrix()
        >>> hic.nan_bins = []
        >>> matrix = np.array([
        ... [ 0, 10,  5, 3, 0],
        ... [ 0,  0, 15, 5, 1],
        ... [ 0,  0,  0, 7, 3],
        ... [ 0,  0,  0, 0, 1],
        ... [ 0,  0,  0, 0, 0]], dtype=np.int32)

        make the matrix symmetric:
        >>> hic.matrix = csr_matrix(matrix + matrix.T)
        >>> hic.setMatrix(csr_matrix(matrix + matrix.T), cut_intervals)

        Mask a bin:
        >>> hic.maskBins([3])
        >>> hic.matrix.todense()
        matrix([[ 0, 10,  5,  0],
                [10,  0, 15,  1],
                [ 5, 15,  0,  3],
                [ 0,  1,  3,  0]], dtype=int32)
        >>> hic.cut_intervals
        [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1), ('b', 40, 50, 1)]

        >>> hic.restoreMaskedBins()
        >>> hic.matrix.todense()
        matrix([[ 0., 10.,  5.,  0.,  0.],
                [10.,  0., 15.,  0.,  1.],
                [ 5., 15.,  0.,  0.,  3.],
                [ 0.,  0.,  0.,  0.,  0.],
                [ 0.,  1.,  3.,  0.,  0.]])

        >>> hic.cut_intervals
        [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1), ('a', 30, 40, 1), ('b', 40, 50, 1)]
        """
        if len(self.orig_bin_ids) == 0:
            return
        # the rows to add, as an empty sparse matrix
        M = self.matrix.shape[0]
        N = len(self.orig_bin_ids) - M
        rows_mat = csr_matrix((N, M))
        # cols to add
        cols_mat = csr_matrix((M + N, N))

        # add the rows and cols at the end of the
        # current matrix
        self.matrix = sparse_vstack([self.matrix, rows_mat])
        self.matrix = sparse_hstack([self.matrix, cols_mat], format='csr')

        # the new matrix has the right number of cols and rows, now
        # they need to be reordered to be back in their original places
        rows = cols = np.argsort(self.orig_bin_ids)
        self.matrix = self.matrix[rows, :][:, cols]
        self.cut_intervals = [self.orig_cut_intervals[x] for x in rows]
        self.interval_trees, self.chrBinBoundaries = \
            self.intervalListToIntervalTree(self.cut_intervals)
        # set as nan_bins the masked bins that were restored
        self.nan_bins = self.orig_bin_ids[M:]

        if self.correction_factors is not None:
            # add missing values as nans at end of array
            self.correction_factors = np.concatenate(
                [self.correction_factors,
                 np.repeat(np.nan, N)])
            # reorder array
            self.correction_factors = self.correction_factors[rows]

        # reset orig bins ids and cut intervals
        self.orig_bin_ids = []
        self.orig_cut_intervals = []
        log.info("masked bins were restored\n")
Example 17
lsi_model = gensim.models.LsiModel.load('exports/lsi.model')
lsi_releasenotes_vecs = np.zeros((1,num_topics))
lsi_reviews_vecs = np.zeros((1,num_topics))

count = 0
for doc in releasenotes:
    if gensim_tfidf[doc.id - 3717281]:
        lda_releasenotes_vecs = np.vstack((lda_releasenotes_vecs, gensim.matutils.corpus2dense(lda_model[[gensim_tfidf[doc.id-3717281]]],num_topics).T))
    else:
        lda_releasenotes_vecs = np.vstack((lda_releasenotes_vecs,np.zeros((1,num_topics))))
    lsi_releasenotes_vecs = np.vstack((lsi_releasenotes_vecs, gensim.matutils.corpus2dense(lsi_model[[gensim_tfidf[doc.id - 3717281]]], num_topics).T))
    if count == 0:
        tfidf_releasenotes_vecs = tfidf[doc.id - 3717281]
    else:
        tfidf_releasenotes_vecs = sparse_vstack((tfidf_releasenotes_vecs, tfidf[doc.id - 3717281]))
    count += 1

lda_releasenotes_vecs = np.delete(lda_releasenotes_vecs,0,0)
lsi_releasenotes_vecs = np.delete(lsi_releasenotes_vecs,0,0)

count = 0
for doc in reviews:
    if gensim_tfidf[doc.id - 3717281]:
        lda_reviews_vecs = np.vstack((lda_reviews_vecs, gensim.matutils.corpus2dense(lda_model[[gensim_tfidf[doc.id - 3717281]]], num_topics).T))
        # we could send the whole corpus of this app_id to the model in one step
    else:
        lda_reviews_vecs = np.vstack((lda_reviews_vecs, np.zeros((1, num_topics))))
    lsi_reviews_vecs = np.vstack((lsi_reviews_vecs, gensim.matutils.corpus2dense(lsi_model[[gensim_tfidf[doc.id - 3717281]]], num_topics).T))
    if count == 0:
        tfidf_reviews_vecs = tfidf[doc.id - 3717281]
    else:
        tfidf_reviews_vecs = sparse_vstack((tfidf_reviews_vecs, tfidf[doc.id - 3717281]))
    count += 1
Example 18
def vstack(vectors):
    # dispatch to scipy for sparse inputs, numpy otherwise
    if scipy.sparse.issparse(vectors[0]):
        return sparse_vstack(vectors)
    else:
        return np.vstack(vectors)
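A brief usage sketch of the dispatcher above; it assumes the surrounding module imports `numpy as np`, `scipy.sparse`, and `from scipy.sparse import vstack as sparse_vstack`:

import numpy as np
import scipy.sparse
from scipy.sparse import vstack as sparse_vstack

dense_rows = [np.array([[1, 0, 2]]), np.array([[0, 3, 0]])]
sparse_rows = [scipy.sparse.coo_matrix(r) for r in dense_rows]

print(type(vstack(dense_rows)))    # numpy.ndarray, shape (2, 3)
print(type(vstack(sparse_rows)))   # scipy sparse matrix, shape (2, 3)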