def look_for_pssms_in_dir(directory):
    """Find the models that are in the specified directory.

    Model files are expected to be named ``<dataset>-<cross fold index>.pssm``.
    The index taken from the file name is decremented by one before it is
    used as part of the key in the returned dict.

    Args:
        directory: Path of the directory to search for ``*-*.pssm`` files.

    Returns:
        A dict mapping ``(dataset, cross_fold_idx)`` tuples to the list of
        models produced by ``parse_models`` for the corresponding file.
        Files whose name or contents cannot be parsed are logged (with the
        traceback) and skipped.
    """
    logging.info('Looking for PSSMs in: %s' % directory)
    parsed_models = dict()
    for filename in glob.glob(os.path.join(directory, '*-*.pssm')):
        try:
            base, _ext = os.path.splitext(os.path.basename(filename))
            # Raises ValueError unless the base name holds exactly one '-'
            # followed by an integer; such files are skipped below.
            dataset, cross_fold_idx = base.split('-')
            cross_fold_idx = int(cross_fold_idx) - 1
            logging.info(
                'Read models from: %s for dataset: %s and cross-fold validation set: %d',
                filename, dataset, cross_fold_idx
            )
            # Consume the parser completely while the file is still open so
            # the handle can be released straight away.
            with open(filename) as model_file:
                parsed_models[(dataset, cross_fold_idx)] = list(parse_models(model_file))
        except Exception:
            # Best effort: one bad file must not stop the scan, but do not
            # swallow KeyboardInterrupt/SystemExit as a bare except would.
            logging.warning('Could not parse: %s' % filename, exc_info=True)
    return parsed_models
def __init__(self, pssm_file, log):
    """Load the first model from a PSSM file and compute its scores.

    Args:
        pssm_file: Path to the PSSM file. Its base name must match
            ``Pssm.pssm_file_name_re``; group 1 is stored as ``tag`` and
            group 2 is converted to the integer ``pssm_idx``.
        log: Object whose ``site_numbers[pssm_idx]`` unpacks into
            ``(num_sites, num_seqs_with_site, num_seqs)``.

    Raises:
        AttributeError: If the base name of ``pssm_file`` does not match
            ``Pssm.pssm_file_name_re`` (the match object is ``None``).
        StopIteration: If ``parse_models`` yields no model for the file.
    """
    self.pssm_file = pssm_file
    # Image file names are derived by substituting the '.pssm' text in the path.
    self.png_file = pssm_file.replace('.pssm', '.png')
    self.eps_file = pssm_file.replace('.pssm', '.eps')

    re_match = Pssm.pssm_file_name_re.match(os.path.basename(pssm_file))
    self.tag = re_match.group(1)
    self.pssm_idx = int(re_match.group(2))
    self.num_sites, self.num_seqs_with_site, self.num_seqs = log.site_numbers[self.pssm_idx]

    # Only the first model in the file is used; read it while the file is
    # open and release the handle immediately afterwards.
    with open(self.pssm_file) as model_file:
        self.model = next(parse_models(model_file))

    self.emissions = calculate_emissions(self.model)
    self.gap_probs = calculate_gap_probs(self.model)

    self.first_order_entropy_score = calculate_first_order_entropy_score(self.emissions)
    self.information_content_score = calculate_information_content_score(self.emissions)
    # Fraction of the sequences that contain at least one site.
    self.num_seqs_with_site_score = float(self.num_seqs_with_site) / float(self.num_seqs)
    # The first-order entropy score carries weight 1.5, the other two weight 1.
    self.overall_score = weighted_geometric_mean(
        (
            self.first_order_entropy_score,
            self.information_content_score,
            self.num_seqs_with_site_score,
        ),
        [1.5, 1., 1.]
    )
    logging.info(
        '%s; %8g; %8g; %8g; %8g',
        self.pssm_file,
        self.first_order_entropy_score,
        self.information_content_score,
        self.num_seqs_with_site_score,
        self.overall_score
    )
def look_for_pssms_in_dir(directory):
    """Find the models that are in the specified directory.

    Model files are expected to be named ``<dataset>-<cross fold index>.pssm``.
    The index taken from the file name is decremented by one before it is
    used as part of the key in the returned dict.

    Args:
        directory: Path of the directory to search for ``*-*.pssm`` files.

    Returns:
        A dict mapping ``(dataset, cross_fold_idx)`` tuples to the list of
        models produced by ``parse_models`` for the corresponding file.
        Files whose name or contents cannot be parsed are logged (with the
        traceback) and skipped.
    """
    logging.info('Looking for PSSMs in: %s' % directory)
    parsed_models = dict()
    for filename in glob.glob(os.path.join(directory, '*-*.pssm')):
        try:
            base, _ext = os.path.splitext(os.path.basename(filename))
            # Raises ValueError unless the base name holds exactly one '-'
            # followed by an integer; such files are skipped below.
            dataset, cross_fold_idx = base.split('-')
            cross_fold_idx = int(cross_fold_idx) - 1
            logging.info(
                'Read models from: %s for dataset: %s and cross-fold validation set: %d',
                filename, dataset, cross_fold_idx
            )
            # Consume the parser completely while the file is still open so
            # the handle can be released straight away.
            with open(filename) as model_file:
                parsed_models[(dataset, cross_fold_idx)] = list(parse_models(model_file))
        except Exception:
            # Best effort: one bad file must not stop the scan, but do not
            # swallow KeyboardInterrupt/SystemExit as a bare except would.
            logging.warning('Could not parse: %s' % filename, exc_info=True)
    return parsed_models
help="File in which the gapped PSSMs are stored.")
# Base name for the generated logo images.
# NOTE(review): the help text says the extension will be "-0.png", but the
# file names built below are '<basename>-<index>.<image type>'.
option_parser.add_option(
    "-l", "--logo-files-basename", dest="logo_files_basename",
    help="basename of files to write logos to. Extension will be -0.png")
# Image format, used as the file extension of each logo (defaults to 'png').
option_parser.add_option("-t", "--image-type", dest="image_type", default='png',
                         help="type of images to write")
options, args = option_parser.parse_args()

# Log the value and help text of every option that stores into a destination.
for option in option_parser.option_list:
    if option.dest:
        logging.info('%s: %s (%s)', option.dest, str(getattr(options, option.dest)), option.help)

# Load PSSMs
logging.info('Loading PSSMs: %s', options.models_file)
# NOTE(review): the file object passed to parse_models is never explicitly
# closed; consider a `with` block around the list(...) call.
pssms = list(parse_models(open(options.models_file)))

# Write one logo image per parsed PSSM, numbered by its position in the file.
for i, p in enumerate(pssms):
    filename = '%s-%d.%s' % (options.logo_files_basename, i, options.image_type)
    logging.info('Creating image for PSSM: %s', filename)
    emissions, gap_probs = emissions_and_gaps_from_semi_parsed(p)
    # The gap probabilities are passed as the `transparencies` argument.
    logo_image = logo.pssm_as_image(emissions, transparencies=gap_probs)
    logo_image.save(filename)
#model_dir = '/home/reid/Analysis/GappedPssms/apr-2009/single-gap' #sequence_dir = '/home/reid/Data/GappedPssms/apr-2009/' fisher_p_values = list() for fragment, pssm in pssms(): sequence_file = os.path.join(sequence_dir, sequence_filename_fmt % fragment) model_file = os.path.join(model_dir, '%s-%s.pssm' % (fragment, pssm)) logging.info('Loading sequences: %s', sequence_file) sequences = list(sequences_from_fasta(sequence_file)) numpy_seqs = map(seq_to_numpy, sequences) logging.info('Loaded %d sequences', len(sequences)) logging.info('Parsing PSSMs: %s', model_file) pssms = list(parse_models(open(model_file))) logging.info('Building models') models = [ build_hmm_from_semi_parsed(parsed, p_binding_site=p_binding_site) for parsed in pssms ] def nucleotide_dist(): return numpy.zeros(4) + .25 base_dists = DictOf(nucleotide_dist) min_site_length = 20 logging.info('Analysing sequences') for hmm, traits in models:
help="basename of files to write logos to. Extension will be -0.png"
)
# Image format, used as the file extension of each logo (defaults to 'png').
option_parser.add_option(
    "-t", "--image-type", dest="image_type", default='png',
    help="type of images to write"
)
options, args = option_parser.parse_args()

# Log the value and help text of every option that stores into a destination.
for option in option_parser.option_list:
    if option.dest:
        logging.info('%s: %s (%s)', option.dest, str(getattr(options, option.dest)), option.help)

# Load PSSMs
logging.info('Loading PSSMs: %s', options.models_file)
# NOTE(review): the file object passed to parse_models is never explicitly
# closed; consider a `with` block around the list(...) call.
pssms = list(parse_models(open(options.models_file)))

# Write one logo image per parsed PSSM, named
# '<logo_files_basename>-<index>.<image_type>'.
for i, p in enumerate(pssms):
    filename = '%s-%d.%s' % (options.logo_files_basename, i, options.image_type)
    logging.info('Creating image for PSSM: %s', filename)
    emissions, gap_probs = emissions_and_gaps_from_semi_parsed(p)
    # The gap probabilities are passed as the `transparencies` argument.
    logo_image = logo.pssm_as_image(
        emissions, transparencies=gap_probs
    )
    logo_image.save(filename)
#sequence_dir = '/home/reid/Data/GappedPssms/apr-2009/' fisher_p_values = list() for fragment, pssm in pssms(): sequence_file = os.path.join(sequence_dir, sequence_filename_fmt % fragment) model_file = os.path.join(model_dir, '%s-%s.pssm' % (fragment, pssm)) logging.info('Loading sequences: %s', sequence_file) sequences = list(sequences_from_fasta(sequence_file)) numpy_seqs = map(seq_to_numpy, sequences) logging.info('Loaded %d sequences', len(sequences)) logging.info('Parsing PSSMs: %s', model_file) pssms = list(parse_models(open(model_file))) logging.info('Building models') models = [ build_hmm_from_semi_parsed(parsed, p_binding_site=p_binding_site) for parsed in pssms ] def nucleotide_dist(): return numpy.zeros(4) + .25 base_dists = DictOf(nucleotide_dist) min_site_length = 20 logging.info('Analysing sequences') for hmm, traits in models:
logging.info('Looking for PSSMs in: %s' % options.model_dir)
# A dict of parsed models indexed by (dataset, cross-validation set index) tuple.
pssms_for_method = dict()
for file in os.listdir(options.model_dir):
    base, ext = os.path.splitext(file)
    if '.pssm' == ext:
        # File names are expected to look like '<dataset>-<index>.pssm'; the
        # index is used as it appears in the name (no offset is applied).
        dataset, cross_fold_idx = base.split('-')
        cross_fold_idx = int(cross_fold_idx)
        logging.info(
            'Reading models from: %s for dataset: %s and cross-fold validation set: %d',
            file, dataset, cross_fold_idx
        )
        # Consume the parser completely while the file is open so that the
        # handle is released before moving on to the next file.
        with open(os.path.join(options.model_dir, file)) as model_file:
            pssms_for_method[(dataset, cross_fold_idx)] = list(parse_models(model_file))

#
# Generate a ROC point for each p_binding_site for these PSSMs
#
num_roc_points = int(options.num_points)
# Each bound is read from its own option; the two were previously crossed
# (min was taken from max_p_binding_site and vice versa).
min_p_binding_site = float(options.min_p_binding_site)
max_p_binding_site = float(options.max_p_binding_site)