def __init__(self, *args):
    """
    Initialise the test case and load the shared alignment fixture.

    The fixture file is read with ``DataLoader.loadSequence``; the four
    trailing items of its result are stored as ``seq_x``, ``ann_x``,
    ``seq_y`` and ``ann_y``.  ``seq_xs`` / ``seq_ys`` hold the output of
    ``Fasta.alnToSeq`` for the two aligned sequences (presumably the
    sequences with alignment gaps removed -- confirm against ``Fasta``).

    @param args: positional arguments forwarded to ``unittest.TestCase``.
    """
    unittest.TestCase.__init__(self, *args)

    alignment_path = "data/test_data/sequences/alignment.fa"
    loader = DataLoader()
    # The first item of the loaded record is not needed by the tests.
    (
        _,
        self.seq_x,
        self.ann_x,
        self.seq_y,
        self.ann_y,
    ) = loader.loadSequence(alignment_path)

    self.seq_xs = Fasta.alnToSeq(self.seq_x)
    self.seq_ys = Fasta.alnToSeq(self.seq_y)
def __init__(self, *args):
    """
    Initialise the test case and load the shared alignment fixture.

    Reads ``data/test_data/sequences/alignment.fa`` through
    ``DataLoader.loadSequence`` and keeps the two aligned sequences and
    their annotations on the instance, together with the results of
    ``Fasta.alnToSeq`` for each sequence (presumably the ungapped
    sequences -- confirm against ``Fasta``).

    @param args: positional arguments forwarded to ``unittest.TestCase``.
    """
    unittest.TestCase.__init__(self, *args)

    fixture = "data/test_data/sequences/alignment.fa"
    # Only the last four items of the record are used; the first is dropped.
    (
        _,
        self.seq_x,
        self.ann_x,
        self.seq_y,
        self.ann_y,
    ) = DataLoader().loadSequence(fixture)

    self.seq_xs = Fasta.alnToSeq(self.seq_x)
    self.seq_ys = Fasta.alnToSeq(self.seq_y)
def load_data(preparer):
    """
    Collect training samples from the simulated model-training sequences.

    Every record that ``DataLoader.loadDirectory`` yields for
    ``data/sequences/model_train_seq/simulated`` is handed to
    ``preparer.prepare_training_data``.  The first two items of each result
    are concatenated across all records; the third item is discarded.

    @param preparer: object providing
        ``prepare_training_data(s_x, a_x, s_y, a_y)`` returning three items.
    @return: tuple ``(data, target)`` with the accumulated lists.
    """
    # NOTE: 'data/sequences/train_sequences' was used as the source
    # directory in an earlier, now disabled, variant of this loader.
    records = DataLoader().loadDirectory(
        'data/sequences/model_train_seq/simulated'
    )

    data, target = [], []
    for _, s_x, a_x, s_y, a_y in records:
        samples, labels, _ = preparer.prepare_training_data(
            s_x, a_x, s_y, a_y
        )
        data += samples
        target += labels
    return data, target
def load_data(preparer):
    """
    Build the ``(data, target)`` training lists from the simulated sequences.

    Records are loaded with ``DataLoader.loadDirectory`` from
    ``data/sequences/model_train_seq/simulated`` and each one is converted
    by ``preparer.prepare_training_data``.  Only the first two items of
    every conversion result are kept and appended to the running lists;
    the third item is ignored.

    @param preparer: object providing
        ``prepare_training_data(s_x, a_x, s_y, a_y)`` returning three items.
    @return: tuple ``(data, target)`` with the accumulated lists.
    """
    # NOTE: 'data/sequences/train_sequences' was used as the source
    # directory in an earlier, now disabled, variant of this loader.
    loader = DataLoader()
    data, target = [], []
    for _, s_x, a_x, s_y, a_y in loader.loadDirectory(
        'data/sequences/model_train_seq/simulated'
    ):
        samples, labels, _ = preparer.prepare_training_data(
            s_x, a_x, s_y, a_y
        )
        data += samples
        target += labels
    return data, target
def __init__(
    self,
    preparer,
    filename="data/clf/randomforest.clf",
    training_data_dir="data/sequences/train_sequences",
    params=None,
    autotrain=True,
    memoization=False,
    inverted=False,
    use_global_classifier=False,
):
    """
    Set up the classifier: load it, train it, or reuse the shared one.

    When ``use_global_classifier`` is true and the module-level
    ``_global_classifier`` is already set, that object is reused.
    Otherwise, with ``autotrain`` enabled and ``filename`` present on disk,
    the classifier is loaded from that file; in every other case a fresh
    classifier is created via ``_get_classifier`` and, if ``autotrain`` is
    enabled, trained on ``training_data_dir`` and saved to ``filename``.

    @param preparer: data preparer; must provide ``prepare_training_data``.
    @param filename: path the classifier is loaded from / saved to.
    @param training_data_dir: directory given to
        ``DataLoader.loadDirectory`` when training.
    @param params: classifier parameters; defaults to
        ``config.classifiers[config.classifier_index][2]`` when ``None``.
    @param autotrain: load an existing classifier file or train a new one.
    @param memoization: stored as ``self.memoization``.
    @param inverted: stored as ``self.inverted``.
    @param use_global_classifier: share the classifier through the
        module-level ``_global_classifier``.
    @rtype : PairClassifier
    """
    global _global_classifier

    # ``_preparer`` is reset before ``preparer`` is assigned; presumably
    # ``preparer`` is a property backed by ``_preparer`` -- confirm on class.
    self._preparer = None
    self.preparer = preparer
    self.default_filename = filename
    self.training_data_dir = training_data_dir
    if params is None:
        self.params = config.classifiers[config.classifier_index][2]
    else:
        self.params = params
    self.mem = {}
    self.memoization = memoization
    self.inverted = inverted

    # Reuse the shared classifier when one exists and sharing is requested.
    if use_global_classifier and _global_classifier is not None:
        self.classifier = _global_classifier
        return

    if autotrain and path.exists(self.default_filename):
        # NOTE(review): when the path exists but is not a regular file,
        # nothing is loaded or created in this branch, so
        # ``self.classifier`` is not assigned by this constructor.
        if path.isfile(self.default_filename):
            self.load(self.default_filename)
    else:
        self.classifier = self._get_classifier()
        if autotrain:
            sys.stderr.write('Training clasifier\n')
            loader = DataLoader()
            data, target, weights = [], [], []
            for _, s_x, a_x, s_y, a_y in loader.loadDirectory(
                self.training_data_dir
            ):
                d, t, w = self.preparer.prepare_training_data(
                    s_x, a_x, s_y, a_y
                )
                data += d
                target += t
                weights += w
            self.fit(data, target, array(weights))
            # Persist the trained model so the next run can load it.
            self.save(self.default_filename)

    if use_global_classifier:
        _global_classifier = self.classifier
def __init__(
    self,
    preparer,
    filename="data/clf/randomforest.clf",
    training_data_dir="data/sequences/train_sequences",
    params=None,
    autotrain=True,
    memoization=False,
    inverted=False,
    use_global_classifier=False,
):
    """
    Create the pair classifier by loading, training or sharing a model.

    A module-level ``_global_classifier`` is reused when
    ``use_global_classifier`` is true and it has already been set.
    Otherwise the model is loaded from ``filename`` (``autotrain`` enabled
    and the file exists) or freshly created through ``_get_classifier``;
    a fresh model is trained on ``training_data_dir`` and saved to
    ``filename`` only when ``autotrain`` is enabled.

    @param preparer: data preparer; must provide ``prepare_training_data``.
    @param filename: path the classifier is loaded from / saved to.
    @param training_data_dir: directory given to
        ``DataLoader.loadDirectory`` when training.
    @param params: classifier parameters; defaults to
        ``config.classifiers[config.classifier_index][2]`` when ``None``.
    @param autotrain: load an existing classifier file or train a new one.
    @param memoization: stored as ``self.memoization``.
    @param inverted: stored as ``self.inverted``.
    @param use_global_classifier: share the classifier through the
        module-level ``_global_classifier``.
    @rtype : PairClassifier
    """
    global _global_classifier

    # ``_preparer`` is cleared first; presumably ``preparer`` is a property
    # that writes to it -- confirm on the class definition.
    self._preparer = None
    self.preparer = preparer
    self.default_filename = filename
    self.training_data_dir = training_data_dir
    if params is None:
        self.params = config.classifiers[config.classifier_index][2]
    else:
        self.params = params
    self.mem = {}
    self.memoization = memoization
    self.inverted = inverted

    build_own = _global_classifier is None or not use_global_classifier
    if not build_own:
        self.classifier = _global_classifier
        return

    if autotrain and path.exists(self.default_filename):
        # NOTE(review): a path that exists but is not a regular file is
        # neither loaded nor replaced, leaving ``self.classifier``
        # unassigned by this constructor.
        if path.isfile(self.default_filename):
            self.load(self.default_filename)
    else:
        self.classifier = self._get_classifier()
        if autotrain:
            sys.stderr.write('Training clasifier\n')
            data, target, weights = [], [], []
            sequences = DataLoader().loadDirectory(self.training_data_dir)
            for _, s_x, a_x, s_y, a_y in sequences:
                d, t, w = self.preparer.prepare_training_data(
                    s_x, a_x, s_y, a_y
                )
                data += d
                target += t
                weights += w
            self.fit(data, target, array(weights))
            # Persist the trained model so the next run can load it.
            self.save(self.default_filename)

    if use_global_classifier:
        _global_classifier = self.classifier
def compare_with_source(source_fname, realigned_fname, seq1, seq2):
    """
    Return the fraction of positions on which two alignments agree.

    Both files are read with ``DataLoader.getSequences`` for the names
    ``seq1 + '$'`` and ``seq2 + '$'``.  An alignment function is built for
    each result with ``create_alignemnt_function``; positions up to the
    shorter first sequence are compared, and the number of equal values is
    divided by the length of the longer first sequence.

    @param source_fname: file holding the reference alignment.
    @param realigned_fname: file holding the realigned sequences.
    @param seq1: name of the first sequence (``'$'`` is appended).
    @param seq2: name of the second sequence (``'$'`` is appended).
    @return: agreement ratio as a float.
    """
    loader = DataLoader()
    src = loader.getSequences(source_fname, [seq1 + '$', seq2 + '$'])
    realigned = loader.getSequences(realigned_fname, [seq1 + '$', seq2 + '$'])

    src_f = create_alignemnt_function(src)
    realigned_f = create_alignemnt_function(realigned)

    src_len = len(src[0])
    realigned_len = len(realigned[0])

    # Only positions present in both alignments can be compared ...
    agreeing = sum(
        1
        for pos in range(min(src_len, realigned_len))
        if src_f(pos) == realigned_f(pos)
    )
    # ... but the score is normalised by the longer alignment.
    return float(agreeing) / max(src_len, realigned_len)
def score(fname):
    """
    Score the pairwise realignments of the three simulated sequences.

    For each unordered pair drawn from ``sequence1``, ``sequence2`` and
    ``sequence3`` the file ``fname.format(first, second)`` is read with
    ``DataLoader.getSequences``; all results are passed to ``compare``.

    @param fname: file name template with two ``{}`` placeholders, e.g.
        ``'data/sequences/simulated_alignment.{}_{}.realigned.fa'``.
    @return: whatever ``compare`` returns for the loaded alignments.
    """
    names = ['sequence1', 'sequence2', 'sequence3']
    loader = DataLoader()

    # Pairs in the order (1, 2), (1, 3), (2, 3).  The inequality guard
    # never filters anything for these distinct names; kept for parity.
    pairs = [
        (first, second)
        for idx, first in enumerate(names[:-1])
        for second in names[idx + 1:]
        if first != second
    ]
    alignments = [
        loader.getSequences(
            fname.format(first, second), [first + '$', second + '$']
        )
        for first, second in pairs
    ]
    return compare(*alignments)
def main(preparer_index):
    """
    Plot both classifiers of one preparer configuration on simulated data.

    The entry ``config.preparers[preparer_index]`` supplies, by position,
    the factory of the first preparer (called with the window size), the
    factory of the second, "indel" preparer (called with ``0`` and the
    window size) and a name fragment used in the classifier file names.
    Data prepared from the simulated alignment is handed to ``plot_clf``
    for each preparer; every figure is saved next to its classifier file
    with a ``_test`` suffix, and finally the figures are shown.

    @param preparer_index: index into ``config.preparers``.
    """
    # NOTE: an earlier variant also fitted classifiers on
    # 'sequences/train_sequences/simulated_alignment0.fa' and drew 0/1
    # graphs via compute_01graph_data; those steps are disabled.
    path_to_data = "data/"
    entry = config.preparers[preparer_index]

    dp = entry[0](constants.window_size)
    clf_fname = 'data/clf/{}{}{}.clf'.format(
        PairClassifier.get_name(), entry[2], constants.window_size
    )
    idp = entry[1](0, constants.window_size)
    iclf_fname = 'data/clf/{}{}{}{}.clf'.format(
        PairClassifier.get_name(), entry[2], constants.window_size, '_indel'
    )

    alignment_file = path.join(
        path_to_data,
        'sequences/model_train_seq/simulated/simulated_alignment.fa',
    )
    _, s_x, a_x, s_y, a_y = DataLoader().loadSequence(alignment_file)
    px, py, _ = dp.prepare_training_data(s_x, a_x, s_y, a_y)
    ipx, ipy, _ = idp.prepare_training_data(s_x, a_x, s_y, a_y)

    # Same plot-and-save step for the first and the indel classifier.
    for preparer, xs, ys, target_fname in (
        (dp, px, py, clf_fname),
        (idp, ipx, ipy, iclf_fname),
    ):
        plot_clf(preparer, xs, ys, target_fname)
        plt.savefig(
            path.splitext(target_fname)[0] + "_test" + pic_suffix,
            transparent=True,
            bbox_inches='tight',
        )
    plt.show()
def main(preparer_index):
    """
    Draw and save test plots for the two classifiers of one configuration.

    ``config.preparers[preparer_index]`` is read by position: item 0 builds
    the first preparer from the window size, item 1 builds the "indel"
    preparer from ``0`` and the window size, and item 2 is a name fragment
    that goes into both classifier file names.  The simulated alignment is
    loaded once, prepared by both preparers and passed to ``plot_clf``;
    each figure is saved beside its classifier file with ``_test`` appended
    to the base name, then all figures are displayed.

    @param preparer_index: index into ``config.preparers``.
    """
    # NOTE: disabled earlier experiments fitted classifiers on
    # 'sequences/train_sequences/simulated_alignment0.fa' and plotted
    # compute_01graph_data output for them.
    path_to_data = "data/"
    preparer_cfg = config.preparers[preparer_index]

    dp = preparer_cfg[0](constants.window_size)
    clf_fname = 'data/clf/{}{}{}.clf'.format(
        PairClassifier.get_name(),
        preparer_cfg[2],
        constants.window_size,
    )
    idp = preparer_cfg[1](0, constants.window_size)
    iclf_fname = 'data/clf/{}{}{}{}.clf'.format(
        PairClassifier.get_name(),
        preparer_cfg[2],
        constants.window_size,
        '_indel',
    )

    loader = DataLoader()
    _, s_x, a_x, s_y, a_y = loader.loadSequence(
        path.join(
            path_to_data,
            'sequences/model_train_seq/simulated/simulated_alignment.fa',
        )
    )
    px, py, _ = dp.prepare_training_data(s_x, a_x, s_y, a_y)
    ipx, ipy, _ = idp.prepare_training_data(s_x, a_x, s_y, a_y)

    jobs = [
        (dp, px, py, clf_fname),
        (idp, ipx, ipy, iclf_fname),
    ]
    for preparer, samples, labels, clf_path in jobs:
        plot_clf(preparer, samples, labels, clf_path)
        # Figure goes next to the classifier file: <base>_test<pic_suffix>.
        figure_path = path.splitext(clf_path)[0] + "_test" + pic_suffix
        plt.savefig(figure_path, transparent=True, bbox_inches='tight')
    plt.show()
def train(self, dirname):
    """
    Train on every sequence record found in a directory.

    @param dirname: directory handed to ``DataLoader.loadDirectory``.
    @return: the result of ``self.train_multi`` for the loaded records.
    """
    loaded = DataLoader().loadDirectory(dirname)
    return self.train_multi(loaded)