def taskmain(predictions_path, prefix_len=0, suffix_len=0, count=1, random_only=False, n_processes=5, out=None):
    """Generate a decoy (shuffled-sequence) search space for a scored predictions file.

    Builds ``count`` decoy glycopeptides per prediction with
    ``build_shuffle_sequences`` and writes them to a SqliteDict database
    whose path is derived from the input (``...decoy.db``), together with a
    decoy-tagged copy of the predictions' metadata.

    Parameters
    ----------
    predictions_path: str
        Path to a scored predictions file readable by
        ``prediction_tools.prepare_model_file``.
    prefix_len, suffix_len: int
        Forwarded to ``build_shuffle_sequences``; presumably the number of
        residues at each terminus held fixed while shuffling — confirm
        against that function.
    count: int
        Decoys to generate per prediction; recorded in the output metadata
        as ``decoy_ratio``.
    random_only: bool
        Forwarded to ``build_shuffle_sequences`` (semantics defined there).
    n_processes: int
        If > 1, decoys are built concurrently on a ``multiprocessing.Pool``.
    out: str or None
        Optional alternate path stem for the output database; when None the
        stem is taken from ``predictions_path``.

    Returns
    -------
    str
        Path of the decoy database that was written.
    """
    start_time = time.time()
    predictions = prediction_tools.prepare_model_file(predictions_path)
    # Output path: everything before the last "scored" in the chosen stem,
    # with "decoy.db" appended.
    decoy_path = (predictions_path if out is None else out).rsplit("scored", 1)[0] + "decoy.db"
    metadata = predictions.metadata
    # Deep-copy so the tag/decoy_ratio edits below do not mutate the
    # metadata attached to the live predictions object.
    metadata = copy.deepcopy(metadata)
    metadata["tag"] = (metadata["tag"] if metadata["tag"] not in ["", None] else "") + "decoy"
    metadata["decoy_ratio"] = count
    decoy_logger.info("Saving metadata")
    # flag='n': always start a brand-new database, discarding any previous
    # contents at decoy_path.
    metadata_store = sqlitedict.SqliteDict(decoy_path, tablename="metadata", flag='n')
    metadata_store.update(metadata)
    metadata_store.commit()
    enzyme = predictions.metadata.get("enzyme")
    # Flatten the per-enzyme cleavage-site lists into single flat lists.
    starts = sum([enz["cleavage_start"] for enz in enzyme], [])
    ends = sum([enz["cleavage_end"] for enz in enzyme], [])
    decoy_logger.info("Building random sequence builder")
    decoy_logger.info("Parameters: {}".format(dict(prefix_len=prefix_len, suffix_len=suffix_len,
                                                   count=count, random_only=random_only)))
    modification_table = modification.ModificationTable.bootstrap()
    builder = RandomGlycopeptideBuilder(
        # NOTE: the literal default is 10e-6 == 1e-5, i.e. 10 ppm expressed
        # as a fraction.
        ppm_error=predictions.metadata.get("ms1_ppm_tolerance", 10e-6),
        # Map modification names from the metadata to ModificationTable
        # entries. (Python 2 map() yields a list; see itertools.imap below.)
        constant_modifications=map(modification_table.__getitem__,
                                   predictions.metadata.get("constant_modifications", [])),
        # NOTE(review): "variable_modifciations" is misspelled — this may
        # intentionally match the same misspelling on the metadata writer
        # side; verify against the code that produces the metadata before
        # "fixing" the key.
        variable_modifications=map(modification_table.__getitem__,
                                   predictions.metadata.get("variable_modifciations", [])),
        glycans=glycans_from_predictions(predictions),
        cleavage_start=starts,
        cleavage_end=ends)
    # Bind all per-run options now; the pool workers only receive rows.
    task_fn = functools.partial(build_shuffle_sequences, prefix_len=prefix_len,
                                suffix_len=suffix_len, count=count, builder=builder,
                                random_only=random_only)
    decoy_logger.info("Beginning generation")
    # flag='w': open the same database file for writing, second table.
    decoy_sequence_store = sqlitedict.SqliteDict(decoy_path, tablename="theoretical_search_space",
                                                 flag='w')
    total_missing = 0
    preds_missed = []
    pool = None
    if n_processes > 1:
        pool = multiprocessing.Pool(n_processes)
        # Large chunksize amortizes IPC overhead per dispatched row.
        dispatcher = functools.partial(pool.imap_unordered, chunksize=1000)
        decoy_logger.info("Building decoys concurrently on %d processes", n_processes)
    else:
        decoy_logger.info("Building decoys sequentially")
        # Python 2 lazy map; the sequential analogue of imap_unordered above.
        dispatcher = itertools.imap
    i = 0          # running key for the decoy store
    pred_iter = 0  # index of the prediction a result is attributed to
    for decoy_tuple in (
            itertools.chain.from_iterable(
                dispatcher(task_fn, predictions.iterrows()))):
        decoys, missing = decoy_tuple
        for decoy in decoys:
            decoy_sequence_store[i] = decoy
            i += 1
            # Periodic commit keeps the pending transaction bounded.
            if i % 10000 == 0:
                decoy_sequence_store.commit()
        total_missing += missing
        if missing > 0:
            decoy_logger.info("%d missing", missing)
            # NOTE(review): with imap_unordered, results arrive out of input
            # order, but pred_iter advances in input order — the identifier
            # recorded here may belong to a different prediction than the
            # one that produced `missing` when n_processes > 1. Confirm.
            # (.ix is deprecated pandas indexing — Py2-era code.)
            preds_missed.append((predictions.ix[pred_iter].Glycopeptide_identifier, missing))
        pred_iter += 1
    if pool is not None:
        pool.close()
        pool.join()
    decoy_sequence_store.commit()
    decoy_sequence_store.close()
    # Record the miss statistics alongside the metadata written earlier.
    metadata_store["total_missing_decoys"] = total_missing
    metadata_store["predictions_missing_decoys"] = preds_missed
    metadata_store.commit()
    metadata_store.close()
    decoy_logger.info("Done. %d Sequences missing. %f seconds elapsed", total_missing,
                      time.time() - start_time)
    return decoy_path
from glycresoft_ms2_classification import prediction_tools
from glycresoft_ms2_classification.ms import default_loader

# Analysis snippet: load scored predictions, keep only confident matches,
# then pull one prediction's matched spectrum and its glycan mass.
_model_file = prediction_tools.prepare_model_file("./ResultOf20131219_006_isos.scored_fdr.json")
predictions = _model_file.query("MS2_Score >= 0.5")

# Observed spectra from the matched pickle, indexed by scan id.
observed = default_loader("./20131219_006.mzML.20150125-173417matched.pkl").index_by_scan_ids()

# Work with the third surviving prediction (positional index 2).
_row = predictions.iloc[2]
spec = observed[_row.scan_id_range[0]]
drop_mass = _row.glycanMass