def get_data(assay_id, pos_count=100,neg_count=100, selectall=False): ''' :param assay_id: :param pos_count: :param neg_count: :return: pos/neg count restrict the number of graphs that are returned ''' active_X = pipe(assay_id, download_active, babel_load, vectorize) inactive_X = pipe(assay_id, download_inactive, babel_load, vectorize) X = vstack((active_X, inactive_X)) y = np.array([1] * active_X.shape[0] + [-1] * inactive_X.shape[0]) esti = SGDClassifier(average=True, class_weight='balanced', shuffle=True, n_jobs=4, loss='log') esti.fit(X,y) if not selectall: select_p= lambda x: selection_iterator(x,np.random.choice(active_X.shape[0], pos_count, replace=False)) select_n= lambda x: selection_iterator(x,np.random.choice(inactive_X.shape[0], neg_count, replace=False)) print "selecting pos graphs: %d/%d neg graphs in set %d/%d" % (pos_count, active_X.shape[0], neg_count, inactive_X.shape[0]) else: select_p = lambda x:x select_n = lambda x:x graphs_p = list(pipe(assay_id, download_active,load_sdf, select_p,lambda x: map(rdkmol_to_nx,x))) graphs_n = list(pipe(assay_id, download_active,load_sdf, select_n,lambda x: map(rdkmol_to_nx,x))) print {'active':active_X.shape[0], 'inactive':inactive_X.shape[0]} return X, y, graphs_p, graphs_n,esti
def _select_data_matrices(self, iterable_pos, iterable_neg,
                          n_active_learning_iterations=2,
                          size_positive=-1,
                          size_negative=100,
                          lower_bound_threshold_positive=-1,
                          upper_bound_threshold_positive=1,
                          lower_bound_threshold_negative=-1,
                          upper_bound_threshold_negative=1):
    """Iteratively build a data matrix via active learning.

    Repeatedly vectorizes a selection of positive/negative instances, fits
    self.estimator on it, and uses the trained model (via
    self._bounded_selection) to pick the instances for the next round.
    A size of -1 means "use all instances of that class" (no selection).

    :param iterable_pos: iterable of positive instances (consumed via tee)
    :param iterable_neg: iterable of negative instances (consumed via tee)
    :param n_active_learning_iterations: number of select/fit rounds; the
        matrix built in the last round is returned without a final fit
    :param size_positive: number of positives to select per round, or -1 for all
    :param size_negative: number of negatives to select per round, or -1 for all
    :param lower_bound_threshold_positive: score bound passed to _bounded_selection
    :param upper_bound_threshold_positive: score bound passed to _bounded_selection
    :param lower_bound_threshold_negative: score bound passed to _bounded_selection
    :param upper_bound_threshold_negative: score bound passed to _bounded_selection
    :return: (data_matrix, y) from the final iteration
    """
    # select the initial ids simply as the first occurrences
    if size_positive != -1:
        positive_ids = range(size_positive)
    if size_negative != -1:
        negative_ids = range(size_negative)
    # iterate: select instances according to current model and create novel
    # data matrix to fit the model in next round
    for i in range(n_active_learning_iterations):
        # make data matrix on selected instances
        # if this is the first iteration or we need to select positives
        if i == 0 or size_positive != -1:
            # tee into 3: rebind the name for the next round, one copy to
            # vectorize now, one copy for the post-fit bounded selection below
            iterable_pos, iterable_pos_, iterable_pos__ = tee(iterable_pos, 3)
            if size_positive == -1:
                # if we take all positives
                data_matrix_pos = self._data_matrix(iterable_pos_,
                                                    fit_vectorizer=self.fit_vectorizer)
            else:
                # otherwise use selection
                data_matrix_pos = self._data_matrix(selection_iterator(iterable_pos_, positive_ids),
                                                    fit_vectorizer=self.fit_vectorizer)
        # if this is the first iteration or we need to select negatives
        if i == 0 or size_negative != -1:
            iterable_neg, iterable_neg_, iterable_neg__ = tee(iterable_neg, 3)
            if size_negative == -1:
                # if we take all negatives
                # NOTE: fit_vectorizer=False — the vectorizer is fitted only on
                # positives (see the positive branch above)
                data_matrix_neg = self._data_matrix(iterable_neg_, fit_vectorizer=False)
            else:
                # otherwise use selection
                data_matrix_neg = self._data_matrix(selection_iterator(iterable_neg_, negative_ids),
                                                    fit_vectorizer=False)
        # assemble data matrix
        data_matrix, y = self._assemble_data_matrix(data_matrix_pos, data_matrix_neg)
        # stop the fitting procedure at the last-1 iteration and return data_matrix,y
        if i == n_active_learning_iterations - 1:
            break
        # fit the estimator on selected instances
        self.estimator.fit(data_matrix, y)
        # use the trained estimator to select the next instances
        if size_positive != -1:
            positive_ids = self._bounded_selection(iterable_pos__, size=size_positive,
                                                   lower_bound_threshold=lower_bound_threshold_positive,
                                                   upper_bound_threshold=upper_bound_threshold_positive)
        if size_negative != -1:
            negative_ids = self._bounded_selection(iterable_neg__, size=size_negative,
                                                   lower_bound_threshold=lower_bound_threshold_negative,
                                                   upper_bound_threshold=upper_bound_threshold_negative)
    return data_matrix, y
def _active_learning_data_matrices(self, iterable_pos, iterable_neg,
                                   n_active_learning_iterations=2,
                                   size_positive=-1,
                                   size_negative=100,
                                   lower_bound_threshold_positive=-1,
                                   upper_bound_threshold_positive=1,
                                   lower_bound_threshold_negative=-1,
                                   upper_bound_threshold_negative=1):
    """Iteratively build a data matrix via active learning.

    Each round vectorizes the currently selected positives/negatives, fits
    self.estimator, and asks self._bounded_selection for the ids to use in
    the next round. A size of -1 means "use all instances of that class".

    NOTE(review): this appears to duplicate _select_data_matrices with
    shorter local names (X/Xpos/Xneg) — consider consolidating.

    :param iterable_pos: iterable of positive instances (consumed via tee)
    :param iterable_neg: iterable of negative instances (consumed via tee)
    :param n_active_learning_iterations: number of select/fit rounds; the
        matrix of the final round is returned without a final fit
    :param size_positive: positives selected per round, or -1 for all
    :param size_negative: negatives selected per round, or -1 for all
    :return: (X, y) from the final iteration
    """
    # select the initial ids simply as the first occurrences
    if size_positive != -1:
        positive_ids = range(size_positive)
    if size_negative != -1:
        negative_ids = range(size_negative)
    # iterate: select instances according to current model and create novel data matrix to fit the model in next round
    for i in range(n_active_learning_iterations):
        # make data matrix on selected instances
        # if this is the first iteration or we need to select positives
        if i == 0 or size_positive != -1:
            # tee into 3: rebind for the next round, vectorize one copy now,
            # keep one copy for the post-fit bounded selection below
            iterable_pos, iterable_pos_, iterable_pos__ = tee(iterable_pos, 3)
            if size_positive == -1:
                # if we take all positives
                Xpos = self._data_matrix(iterable_pos_, fit_vectorizer=self.fit_vectorizer)
            else:
                # otherwise use selection
                Xpos = self._data_matrix(selection_iterator(iterable_pos_, positive_ids),
                                         fit_vectorizer=self.fit_vectorizer)
        # if this is the first iteration or we need to select negatives
        if i == 0 or size_negative != -1:
            iterable_neg, iterable_neg_, iterable_neg__ = tee(iterable_neg, 3)
            if size_negative == -1:
                # if we take all negatives
                # NOTE: vectorizer is never fitted on negatives (fit_vectorizer=False)
                Xneg = self._data_matrix(iterable_neg_, fit_vectorizer=False)
            else:
                # otherwise use selection
                Xneg = self._data_matrix(selection_iterator(iterable_neg_, negative_ids),
                                         fit_vectorizer=False)
        # assemble data matrix
        X, y = self._assemble_data_matrix(Xpos, Xneg)
        # stop the fitting procedure at the last-1 iteration and return X,y
        if i == n_active_learning_iterations - 1:
            break
        # fit the estimator on selected instances
        self.estimator.fit(X, y)
        # use the trained estimator to select the next instances
        if size_positive != -1:
            positive_ids = self._bounded_selection(
                iterable_pos__, size=size_positive,
                lower_bound_threshold=lower_bound_threshold_positive,
                upper_bound_threshold=upper_bound_threshold_positive)
        if size_negative != -1:
            negative_ids = self._bounded_selection(
                iterable_neg__, size=size_negative,
                lower_bound_threshold=lower_bound_threshold_negative,
                upper_bound_threshold=upper_bound_threshold_negative)
    return X, y
def get_sequences_with_names(size=9999, rand=True):
    """Load sequences from the RFAM fasta file.

    :param size: maximum number of sequences to return
    :param rand: if True, return a random subset of `size` sequences;
                 otherwise return the first `size` sequences in file order
    :return: an iterator over (at most `size`) sequences
    """
    records = list(fasta_to_sequence("../toolsdata/%s.fa" % RFAM))
    if not rand:
        # deterministic prefix of the file
        return itertools.islice(records, size)
    # shuffle index positions, then pick the first `size` of them
    # (Python 2: range() returns a list, so it can be shuffled in place)
    order = range(len(records))
    random.shuffle(order)
    return selection_iterator(records, order[:size])
def make_data(assay_id, repeats=3, train_sizes=[50]): X,y,graphs_p,graphs_n, esti = get_data(assay_id,selectall=True) print 'indicator of tak-ease:' print eden_tricks.task_difficulty(X,y) for size in train_sizes: for repeat in range(repeats): poslist = np.random.permutation(range(len(graphs_p)))[:size] neglist = np.random.permutation(range(len(graphs_n)))[:size] #r={} #r['pos']= list(selection_iterator(graphs_p, poslist)) #r['neg']= list(selection_iterator(graphs_n, neglist)) neg= list(selection_iterator(graphs_n, neglist)) pos= list(selection_iterator(graphs_p, poslist)) for samplerid, sampler in enumerate(make_samplers_chem()): yield task(samplerid,size,repeat,sampler,copy.deepcopy(neg),copy.deepcopy(pos)) yield esti