def test_unit_train_classify(tmpdir):
    """Train a classifier through the CLI entry point and check its predictions.

    The 'train' command is invoked with the random forest test config, the
    BGC0000015 class table and the BGC0000015 Pfam domain table. The saved
    model is then loaded back and asked to predict classes for the same
    domains, grouped per 'sequence_id'.

    :param tmpdir: temporary directory (converted with str()) that receives 'model.pkl'
    """
    model_file = os.path.join(str(tmpdir), 'model.pkl')
    cli_args = [
        'train',
        '--model', get_test_file('random_forest_test.json'),
        '--classes', get_test_file('BGC0000015.classes.csv'),
        '--output', model_file,
        get_test_file('BGC0000015.pfam.csv'),
    ]
    run(cli_args)
    assert os.path.exists(model_file)

    wrapper = SequenceModelWrapper.load(model_file)
    domain_table = pd.read_csv(get_test_file('BGC0000015.pfam.csv'))
    # One sample per sequence: predict() receives a list of per-sequence frames.
    per_sequence = [group for _, group in domain_table.groupby('sequence_id')]
    predicted = wrapper.predict(per_sequence)

    assert isinstance(predicted, pd.DataFrame)
    assert list(predicted.columns) == ['class1', 'class2', 'class3', 'class4']
    assert len(predicted.index) == 2

    # Expected above-threshold pattern for each of the two predicted rows.
    expected_flags = (
        [True, False, True, False],
        [False, True, False, True],
    )
    for row, flags in enumerate(expected_flags):
        assert list(predicted.iloc[row] > 0.5) == flags
def run(self, inputs, output, model, target, classes, config, log, validation, verbose):
    """Build a model from its config, fit it on the input samples and save it.

    :param inputs: training input(s), forwarded to the util sample readers
    :param output: destination passed to the fitted wrapper's save()
    :param model: model config passed to SequenceModelWrapper.from_config
    :param target: target passed to util.read_samples (used only without classes)
    :param classes: optional class CSV; when truthy, it is read, indexed by
        'sequence_id', cast to int8 and used to label the samples
    :param config: iterable convertible with dict(); forwarded as the config 'vars'
    :param log: optional progress log path, forwarded as debug_progress_path
    :param validation: validation input(s), read the same way as the training inputs
    :param verbose: verbosity flag forwarded to fit()
    """
    wrapper = SequenceModelWrapper.from_config(model, vars=dict(config))

    if not classes:
        # Plain mode: labels come from the requested target.
        fit_samples, fit_y = util.read_samples(inputs, target)
        val_samples, val_y = util.read_samples(validation, target)
    else:
        # Class mode: labels come from the per-sequence class table.
        class_table = util.read_compatible_csv(classes)
        class_table = class_table.set_index('sequence_id').astype('int8')

        fit_samples, fit_y = util.read_samples_with_classes(inputs, class_table)
        logging.info('Training samples:\n%s', fit_y.sum())

        val_samples, val_y = util.read_samples_with_classes(validation, class_table)
        if len(val_y) > 0:
            logging.info('Validation samples:\n%s', val_y.sum())

    wrapper.fit(
        samples=fit_samples,
        y=fit_y,
        debug_progress_path=log,
        validation_samples=val_samples,
        validation_y=val_y,
        verbose=verbose,
    )
    wrapper.save(output)

    if log:
        logging.info('Progress log saved to: %s', log)
    logging.info('Trained model saved to: %s', output)
def test_unit_train_detect(model, tmpdir):
    """Train a detector through the CLI entry point and check its predictions.

    The 'train' command is invoked with the given model config, the PFAM2VEC
    config variable pointing at the test pfam2vec file, and the positive
    (BGC0000015) and negative Pfam domain tables. The saved model is loaded
    back and must score the positive table above 0.5 on average and the
    negative table below 0.5 on average.

    :param model: name of the model config test file
    :param tmpdir: temporary directory (converted with str()) that receives 'model.pkl'
    """
    model_file = os.path.join(str(tmpdir), 'model.pkl')
    cli_args = [
        'train',
        '--model', get_test_file(model),
        '--config', 'PFAM2VEC', get_test_file('pfam2vec.test.tsv'),
        '--output', model_file,
        get_test_file('BGC0000015.pfam.csv'),
        get_test_file('negative.pfam.csv'),
    ]
    run(cli_args)
    assert os.path.exists(model_file)

    detector = SequenceModelWrapper.load(model_file)
    positive = pd.read_csv(get_test_file('BGC0000015.pfam.csv'))
    negative = pd.read_csv(get_test_file('negative.pfam.csv'))
    positive_scores = detector.predict(positive)
    negative_scores = detector.predict(negative)

    scored = ((positive_scores, positive), (negative_scores, negative))
    # Predictions come back as Series...
    for scores, _ in scored:
        assert isinstance(scores, pd.Series)
    # ...aligned one-to-one with the rows of the input table.
    for scores, table in scored:
        assert scores.index.equals(table.index)

    assert positive_scores.mean() > 0.5
    assert negative_scores.mean() < 0.5
def __init__(self, classifier, score_threshold=0.5):
    """Load a trained classifier model by name.

    :param classifier: classifier name (string), resolved to a file with
        util.get_model_path(name, 'classifier')
    :param score_threshold: threshold stored on the instance (default 0.5)
    :raises ValueError: if classifier is None or not a string
    """
    is_valid_name = classifier is not None and isinstance(classifier, six.string_types)
    if not is_valid_name:
        raise ValueError(
            'Expected classifier name, got {}'.format(classifier))

    self.classifier_name = classifier
    self.score_threshold = score_threshold

    self.model = SequenceModelWrapper.load(
        util.get_model_path(self.classifier_name, 'classifier'))
    # Starts empty; NOTE(review): presumably accumulated elsewhere in the class.
    self.total_class_counts = pd.Series()
def print_model(self, name, model_path):
    """Log basic information about a stored model.

    Logs a separator line and the model name, then tries to load the model
    and log its wrapped model type, version and timestamp (raw value and
    ISO-formatted local time).

    :param name: model name to log
    :param model_path: path passed to SequenceModelWrapper.load
    :return: True if the model was loaded and described, False if anything
        failed along the way (the failure is logged as a warning)
    """
    logging.info("-" * 80)
    logging.info('Model: %s', name)
    try:
        loaded = SequenceModelWrapper.load(model_path)
        type_name = type(loaded.model).__name__
        logging.info('Type: %s', type_name)
        logging.info('Version: %s', loaded.version)
        logging.info(
            'Timestamp: %s (%s)',
            loaded.timestamp,
            datetime.fromtimestamp(loaded.timestamp).isoformat(),
        )
    except Exception as e:
        # Best effort: a model that cannot be loaded or described is reported, not fatal.
        logging.warning('Model not supported: %s', e)
        return False
    else:
        return True
def __init__(self, classifier, score_threshold=0.5):
    """Load a trained classifier model by name or by file path.

    The argument is treated as a path when it exists on disk or contains
    the OS path separator, and is not a directory. In that case the
    classifier name becomes the file name without its extension. Otherwise
    the argument is a name resolved with util.get_model_path(name, 'classifier').

    :param classifier: classifier name or path to a model file (string)
    :param score_threshold: threshold stored on the instance (default 0.5)
    :raises ValueError: if classifier is None or not a string
    """
    is_valid = classifier is not None and isinstance(classifier, six.string_types)
    if not is_valid:
        raise ValueError('Expected classifier name or path, got {}'.format(classifier))

    looks_like_path = os.path.exists(classifier) or os.path.sep in classifier
    if looks_like_path and not os.path.isdir(classifier):
        classifier_path = classifier
        # Name the classifier after the file, dropping the extension.
        classifier_name = os.path.splitext(os.path.basename(classifier_path))[0]
    else:
        classifier_name = classifier
        classifier_path = util.get_model_path(classifier_name, 'classifier')

    self.classifier_name = classifier_name
    self.score_threshold = score_threshold
    self.model = SequenceModelWrapper.load(classifier_path)
    # Starts empty; NOTE(review): presumably accumulated elsewhere in the class.
    self.total_class_counts = pd.Series()
def __init__(self, detector, label=None, score_threshold=0.5, merge_max_protein_gap=0,
             merge_max_nucl_gap=0, min_nucl=1, min_proteins=1, min_domains=1, min_bio_domains=0):
    """Load a trained detector model by name and store the filtering settings.

    :param detector: detector name (string), resolved to a file with
        util.get_model_path(name, 'detector')
    :param label: detector label; falls back to the detector name when falsy
    :param score_threshold: threshold stored on the instance (default 0.5)
    :param merge_max_protein_gap: stored as-is on the instance
    :param merge_max_nucl_gap: stored as-is on the instance
    :param min_nucl: stored as-is on the instance
    :param min_proteins: stored as-is on the instance
    :param min_domains: stored as-is on the instance
    :param min_bio_domains: stored as-is on the instance
    :raises ValueError: if detector is None or not a string
    """
    self.score_threshold = score_threshold

    is_valid_name = detector is not None and isinstance(detector, six.string_types)
    if not is_valid_name:
        raise ValueError('Expected detector name, got {}'.format(detector))

    self.detector_name = detector
    self.detector_label = label if label else self.detector_name
    self.score_column = util.format_bgc_score_column(self.detector_name)

    # Merging / filtering settings, kept verbatim for later use.
    self.merge_max_protein_gap = merge_max_protein_gap
    self.merge_max_nucl_gap = merge_max_nucl_gap
    self.min_nucl = min_nucl
    self.min_proteins = min_proteins
    self.min_domains = min_domains
    self.min_bio_domains = min_bio_domains

    self.model = SequenceModelWrapper.load(
        util.get_model_path(self.detector_name, 'detector'))
    self.num_detected = 0
def __init__(self, detector, label=None, score_threshold=0.5, merge_max_protein_gap=0,
             merge_max_nucl_gap=0, min_nucl=1, min_proteins=1, min_domains=1, min_bio_domains=0):
    """Load a trained detector model by name or by file path and store the settings.

    The argument is treated as a path when it exists on disk or contains
    the OS path separator, and is not a directory. In that case the
    detector name becomes the file name without its extension. Otherwise
    the argument is a name resolved with util.get_model_path(name, 'detector').

    :param detector: detector name or path to a model file (string)
    :param label: detector label; falls back to the detector name when falsy
    :param score_threshold: threshold stored on the instance (default 0.5)
    :param merge_max_protein_gap: stored as-is on the instance
    :param merge_max_nucl_gap: stored as-is on the instance
    :param min_nucl: stored as-is on the instance
    :param min_proteins: stored as-is on the instance
    :param min_domains: stored as-is on the instance
    :param min_bio_domains: stored as-is on the instance
    :raises ValueError: if detector is None or not a string
    """
    self.score_threshold = score_threshold

    is_valid = detector is not None and isinstance(detector, six.string_types)
    if not is_valid:
        raise ValueError('Expected detector name or path, got {}'.format(detector))

    looks_like_path = os.path.exists(detector) or os.path.sep in detector
    if looks_like_path and not os.path.isdir(detector):
        model_path = detector
        # Name the detector after the file, dropping the extension.
        detector_name = os.path.splitext(os.path.basename(model_path))[0]
    else:
        detector_name = detector
        model_path = util.get_model_path(detector_name, 'detector')

    self.detector_name = detector_name
    self.detector_label = label if label else self.detector_name
    self.score_column = util.format_bgc_score_column(self.detector_name)

    # Merging / filtering settings, kept verbatim for later use.
    self.merge_max_protein_gap = merge_max_protein_gap
    self.merge_max_nucl_gap = merge_max_nucl_gap
    self.min_nucl = min_nucl
    self.min_proteins = min_proteins
    self.min_domains = min_domains
    self.min_bio_domains = min_bio_domains

    self.model = SequenceModelWrapper.load(model_path)
    self.num_detected = 0