コード例 #1
0
ファイル: test_harness.py プロジェクト: JohnReid/biopsy
def look_for_pssms_in_dir(directory):
    """Find the models that are in the specified directory (named <dataset>-<cross fold index>.pssm).

    Every file in ``directory`` matching ``*-*.pssm`` is handed to
    ``parse_models``. A file whose name or content cannot be parsed is
    reported with a warning (including the traceback) and skipped, so one
    bad file does not stop the scan.

    :param directory: Directory to search for ``.pssm`` files.
    :returns: A dict mapping ``(dataset, cross_fold_idx)`` to the list of
        models parsed from the corresponding file, where ``cross_fold_idx``
        is the integer found in the file name minus one.
    """
    logging.info('Looking for PSSMs in: %s' % directory)
    parsed_models = dict()
    for pssm_path in glob.glob(os.path.join(directory, '*-*.pssm')):
        try:
            base, _ext = os.path.splitext(os.path.basename(pssm_path))
            dataset, cross_fold_idx = base.split('-')
            # The dict key uses the index from the file name shifted down by one.
            cross_fold_idx = int(cross_fold_idx) - 1
            logging.info(
                'Read models from: %s for dataset: %s and cross-fold validation set: %d',
                pssm_path,
                dataset,
                cross_fold_idx
            )
            # Close the file deterministically instead of leaking the handle.
            with open(pssm_path) as pssm_file:
                parsed_models[(dataset, cross_fold_idx)] = list(parse_models(pssm_file))
        except Exception:
            # Only ordinary errors are swallowed; KeyboardInterrupt and
            # SystemExit still propagate. The traceback goes to the log
            # rather than to stdout.
            logging.warning('Could not parse: %s' % pssm_path, exc_info=True)
    return parsed_models
コード例 #2
0
ファイル: score_pssms.py プロジェクト: pombredanne/biopsy
    def __init__(self, pssm_file, log):
        """Load the first model in ``pssm_file`` and compute its scores.

        :param pssm_file: Path to the ``.pssm`` file. Its base name is matched
            against ``Pssm.pssm_file_name_re``; group 1 becomes ``self.tag``
            and group 2 (converted to int) becomes ``self.pssm_idx``.
        :param log: Object whose ``site_numbers[self.pssm_idx]`` unpacks to
            ``(num_sites, num_seqs_with_site, num_seqs)``.
        :raises AttributeError: If the base name does not match the pattern
            (the match object is ``None``).
        :raises StopIteration: If ``parse_models`` yields no model.
        """
        self.pssm_file = pssm_file
        # Image files sit next to the PSSM file, differing only in extension.
        self.png_file = pssm_file.replace('.pssm', '.png')
        self.eps_file = pssm_file.replace('.pssm', '.eps')

        re_match = Pssm.pssm_file_name_re.match(os.path.basename(pssm_file))
        self.tag = re_match.group(1)
        self.pssm_idx = int(re_match.group(2))
        self.num_sites, self.num_seqs_with_site, self.num_seqs = log.site_numbers[self.pssm_idx]

        # Only the first model is used; the context manager closes the file
        # as soon as it has been read, and the builtin next() replaces the
        # Python-2-only .next() method.
        with open(self.pssm_file) as model_stream:
            self.model = next(parse_models(model_stream))
        self.emissions = calculate_emissions(self.model)
        self.gap_probs = calculate_gap_probs(self.model)
        self.first_order_entropy_score = calculate_first_order_entropy_score(self.emissions)
        self.information_content_score = calculate_information_content_score(self.emissions)
        # Fraction of the sequences that contain at least one site.
        self.num_seqs_with_site_score = float(self.num_seqs_with_site) / float(self.num_seqs)
        # Combine the three scores; the entropy score carries weight 1.5,
        # the other two weight 1.
        self.overall_score = weighted_geometric_mean(
            (self.first_order_entropy_score, self.information_content_score, self.num_seqs_with_site_score),
            [1.5, 1., 1.]
        )
        logging.info(
            '%s; %8g; %8g; %8g; %8g',
            self.pssm_file,
            self.first_order_entropy_score,
            self.information_content_score,
            self.num_seqs_with_site_score,
            self.overall_score
        )
コード例 #3
0
ファイル: test_harness.py プロジェクト: pombredanne/biopsy
def look_for_pssms_in_dir(directory):
    """Find the models that are in the specified directory (named <dataset>-<cross fold index>.pssm).

    Each file in ``directory`` matching ``*-*.pssm`` is parsed with
    ``parse_models``. Files whose name or content cannot be parsed are
    logged as warnings (with traceback) and skipped.

    :param directory: Directory to search for ``.pssm`` files.
    :returns: A dict keyed by ``(dataset, cross_fold_idx)`` whose values are
        the lists of models parsed from each file; ``cross_fold_idx`` is the
        integer in the file name minus one.
    """
    logging.info('Looking for PSSMs in: %s' % directory)
    parsed_models = dict()
    for pssm_path in glob.glob(os.path.join(directory, '*-*.pssm')):
        try:
            base, _ext = os.path.splitext(os.path.basename(pssm_path))
            dataset, cross_fold_idx = base.split('-')
            # Keys use the file-name index shifted down by one.
            cross_fold_idx = int(cross_fold_idx) - 1
            logging.info(
                'Read models from: %s for dataset: %s and cross-fold validation set: %d',
                pssm_path, dataset, cross_fold_idx)
            # The context manager closes the file once parsing has finished.
            with open(pssm_path) as pssm_file:
                parsed_models[(dataset, cross_fold_idx)] = list(
                    parse_models(pssm_file))
        except Exception:
            # Let KeyboardInterrupt/SystemExit through; record the traceback
            # in the log instead of printing it to stdout.
            logging.warning('Could not parse: %s' % pssm_path, exc_info=True)
    return parsed_models
コード例 #4
0
                         help="File in which the gapped PSSMs are stored.")
# Register the logo output options on the parser.
option_parser.add_option(
    "-l", "--logo-files-basename",
    dest="logo_files_basename",
    help="basename of files to write logos to. Extension will be -0.png"
)
option_parser.add_option(
    "-t", "--image-type",
    dest="image_type",
    default='png',
    help="type of images to write"
)

# Parse the command line, then log every option that has a destination
# together with its parsed value and help text.
options, args = option_parser.parse_args()
for option in option_parser.option_list:
    if not option.dest:
        continue
    logging.info(
        '%s: %s (%s)',
        option.dest,
        str(getattr(options, option.dest)),
        option.help
    )

# Load PSSMs
logging.info('Loading PSSMs: %s', options.models_file)
# Read every model up front so the file can be closed before any image is written.
with open(options.models_file) as models_file:
    pssms = list(parse_models(models_file))

# Write one logo per PSSM to <logo_files_basename>-<index>.<image_type>.
for i, p in enumerate(pssms):
    filename = '%s-%d.%s' % (options.logo_files_basename, i,
                             options.image_type)
    logging.info('Creating image for PSSM: %s', filename)

    emissions, gap_probs = emissions_and_gaps_from_semi_parsed(p)
    # The gap probabilities are passed as the image's transparencies.
    logo_image = logo.pssm_as_image(emissions, transparencies=gap_probs)

    logo_image.save(filename)
コード例 #5
0
#model_dir = '/home/reid/Analysis/GappedPssms/apr-2009/single-gap'
#sequence_dir = '/home/reid/Data/GappedPssms/apr-2009/'

fisher_p_values = list()
for fragment, pssm in pssms():
    sequence_file = os.path.join(sequence_dir,
                                 sequence_filename_fmt % fragment)
    model_file = os.path.join(model_dir, '%s-%s.pssm' % (fragment, pssm))

    logging.info('Loading sequences: %s', sequence_file)
    sequences = list(sequences_from_fasta(sequence_file))
    numpy_seqs = map(seq_to_numpy, sequences)
    logging.info('Loaded %d sequences', len(sequences))

    logging.info('Parsing PSSMs: %s', model_file)
    pssms = list(parse_models(open(model_file)))

    logging.info('Building models')
    models = [
        build_hmm_from_semi_parsed(parsed, p_binding_site=p_binding_site)
        for parsed in pssms
    ]

    def nucleotide_dist():
        return numpy.zeros(4) + .25

    base_dists = DictOf(nucleotide_dist)

    min_site_length = 20
    logging.info('Analysing sequences')
    for hmm, traits in models:
コード例 #6
0
  help="basename of files to write logos to. Extension will be -0.png"
)
# Register the image type option on the parser.
option_parser.add_option("-t",
                         "--image-type",
                         dest="image_type",
                         default='png',
                         help="type of images to write")

# Parse the command line, then log every option that has a destination
# together with its parsed value and help text.
options, args = option_parser.parse_args()
for option in option_parser.option_list:
    if not option.dest:
        continue
    logging.info('%s: %s (%s)',
                 option.dest,
                 str(getattr(options, option.dest)),
                 option.help)


# Load PSSMs
logging.info('Loading PSSMs: %s', options.models_file)
# Materialise all models inside the context manager so the file is closed
# before image generation starts.
with open(options.models_file) as models_file:
    pssms = list(parse_models(models_file))

# Write one logo per PSSM to <logo_files_basename>-<index>.<image_type>.
for i, p in enumerate(pssms):
    filename = '%s-%d.%s' % (options.logo_files_basename, i, options.image_type)
    logging.info('Creating image for PSSM: %s', filename)

    emissions, gap_probs = emissions_and_gaps_from_semi_parsed(p)
    # The gap probabilities are passed as the image's transparencies.
    logo_image = logo.pssm_as_image(
        emissions,
        transparencies=gap_probs
    )

    logo_image.save(filename)
コード例 #7
0
#sequence_dir = '/home/reid/Data/GappedPssms/apr-2009/'


fisher_p_values = list()
for fragment, pssm in pssms():
    sequence_file = os.path.join(sequence_dir, sequence_filename_fmt % fragment)
    model_file = os.path.join(model_dir, '%s-%s.pssm' % (fragment, pssm))

    logging.info('Loading sequences: %s', sequence_file)
    sequences = list(sequences_from_fasta(sequence_file))
    numpy_seqs = map(seq_to_numpy, sequences)
    logging.info('Loaded %d sequences', len(sequences))


    logging.info('Parsing PSSMs: %s', model_file)
    pssms = list(parse_models(open(model_file)))


    logging.info('Building models')
    models = [
      build_hmm_from_semi_parsed(parsed, p_binding_site=p_binding_site)
      for parsed in pssms
    ]

    def nucleotide_dist():
        return numpy.zeros(4) + .25
    base_dists = DictOf(nucleotide_dist)

    min_site_length = 20
    logging.info('Analysing sequences')
    for hmm, traits in models:
コード例 #8
0
ファイル: test_harness.py プロジェクト: JohnReid/HMM
logging.info('Looking for PSSMs in: %s' % options.model_dir)
# A dict of parsed models indexed by (dataset, cross-validation set index) tuple.
pssms_for_method = dict()
for file in os.listdir(options.model_dir):
    base, ext = os.path.splitext(file)
    if '.pssm' != ext:
        continue
    # The base name is split into exactly two parts: <dataset>-<cross fold index>.
    dataset, cross_fold_idx = base.split('-')
    cross_fold_idx = int(cross_fold_idx)
    logging.info(
        'Reading models from: %s for dataset: %s and cross-fold validation set: %d',
        file,
        dataset,
        cross_fold_idx
    )
    # Parse inside a context manager so each model file is closed promptly
    # rather than leaking one handle per file.
    with open(os.path.join(options.model_dir, file)) as models_file:
        pssms_for_method[(dataset, cross_fold_idx)] = list(
            parse_models(models_file)
        )






#
# Generate a ROC point for each p_binding_site for these PSSMs
#
#p_binding_sites = [.1]
num_roc_points = int(options.num_points)
# Read each bound from the option of the same name; assigning the "max"
# option to the "min" variable (and vice versa) would invert the range.
# NOTE(review): confirm that no later code relied on the bounds being crossed.
min_p_binding_site = float(options.min_p_binding_site)
max_p_binding_site = float(options.max_p_binding_site)