コード例 #1
0
 def __init__(self,
              complexity=5,
              n_clusters=10,
              min_subarray_size=4,
              max_subarray_size=10,
              estimator=None,
              class_estimator=None,
              clusterer=None,
              pos_block_size=300,
              neg_block_size=300,
              n_jobs=-1):
     """Store the configuration and build the internal vectorizer.

     Parameters
     ----------
     complexity : int (default 5)
         Complexity passed to the internal ``Vectorizer``.

     n_clusters : int (default 10)
         Stored as ``self.n_clusters``.

     min_subarray_size : int (default 4)
         Stored as ``self.min_subarray_size``.

     max_subarray_size : int (default 10)
         Stored as ``self.max_subarray_size``.

     estimator : estimator object or None (default None)
         Stored as ``self.estimator``. If None, a new
         ``SGDClassifier(warm_start=True)`` is created for this instance.

     class_estimator : estimator object or None (default None)
         Stored as ``self.class_estimator``. If None, a new
         ``SGDClassifier()`` is created for this instance.

     clusterer : clusterer object or None (default None)
         Stored as ``self.clusterer``. If None, a new
         ``MiniBatchKMeans()`` is created for this instance.

     pos_block_size : int (default 300)
         Stored as ``self.pos_block_size``.

     neg_block_size : int (default 300)
         Stored as ``self.neg_block_size``.

     n_jobs : int (default -1)
         Stored as ``self.n_jobs``.
     """
     self.complexity = complexity
     self.n_clusters = n_clusters
     self.min_subarray_size = min_subarray_size
     self.max_subarray_size = max_subarray_size
     self.pos_block_size = pos_block_size
     self.neg_block_size = neg_block_size
     self.n_jobs = n_jobs
     self.vectorizer = Vectorizer(complexity=complexity,
                                  auto_weights=True,
                                  nbits=15)
     # Default arguments are evaluated once, when the function is defined.
     # Building the default models here instead gives every instance its
     # own objects, so fitting one instance cannot leak state into another.
     if estimator is None:
         estimator = SGDClassifier(warm_start=True)
     if class_estimator is None:
         class_estimator = SGDClassifier()
     if clusterer is None:
         clusterer = MiniBatchKMeans()
     self.estimator = estimator
     self.class_estimator = class_estimator
     self.clusterer = clusterer
     # The clusterer has not been fit yet at construction time.
     self.clusterer_is_fit = False
コード例 #2
0
    def __init__(self,
                 n_differences=1,
                 enhance=True,
                 vectorizer=Vectorizer(complexity=3),
                 n_jobs=-1,
                 random_state=1):
        """Generate sequences starting from input sequences that are 'better' if enhance is set to True
        ('worse' otherwise) given the set of sequences used in the fit phase.

        Parameters
        ----------
        n_differences : int (default 1)
            Number of characters that differ for the generated sequence from the original input sequence.

        enhance : bool (default True)
            If set to True then the score computed by the estimator will be higher for the sequences
            generated than for the input sequences. If False than the score will be lower.

        vectorizer : EDeN sequence vectorizer
            The vectorizer to map sequences to sparse vectors.

        n_jobs : int (default -1)
            The number of cores to use in parallel. -1 indicates all available.

        random_state: int (default 1)
            The random seed.
        """

        self.random_state = random_state
        self.n_jobs = n_jobs
        self.n_differences = n_differences
        self.enhance = enhance
        self.vectorizer = vectorizer
        self.estimator = None
コード例 #3
0
    def _order_clusters(self, clusters, complexity=3):
        sep = ' ' * (complexity * 2)
        # join all sequences in a cluster with enough space that
        # kmers dont interfere
        cluster_seqs = []
        for cluster_id in clusters:
            if len(clusters[cluster_id]) > 0:
                seqs = [s for h, s in clusters[cluster_id]]
                seq = sep.join(seqs)
                cluster_seqs.append(seq)

        # vectorize the seqs and compute their gram matrix K
        cluster_vecs = Vectorizer(complexity).transform(cluster_seqs)
        gram_matrix = metrics.pairwise.pairwise_kernels(
            cluster_vecs, metric='linear')
        c = linkage(gram_matrix, method='single')
        orders = []
        for id1, id2 in c[:, 0:2]:
            if id1 < len(cluster_seqs):
                orders.append(int(id1))
            if id2 < len(cluster_seqs):
                orders.append(int(id2))
        return orders