Ejemplo n.º 1
0
def parse_hmmer_output(filename):
    """Parse a HMMER result file and return a (pdbid, chainid, rawscore) tuple.

    Delegates the actual file parsing to ``result_from_file``.  When that
    helper yields no result, returns ('Unknown', 'Unknown', None) so callers
    can distinguish "file unparsable" from a real hit.

    :param filename: path of the HMMER output file to parse
    :return: (pdbid, chainid, rawscore) where rawscore is a float, or
        ('Unknown', 'Unknown', None) on a missing result
    """
    logger.debug('parsing from file: ' + filename)
    result = result_from_file(filename)
    # Fix: identity test with `is None` instead of `== None` (PEP 8);
    # equality could invoke an arbitrary __eq__ on the result object.
    if result is None:
        return 'Unknown', 'Unknown', None
    pdbid, chainid = result.pdbid, result.chain
    # full_seq_score.score is presumably a string field -- hence float()
    rawscore = float(result.full_seq_score.score)
    return pdbid, chainid, rawscore
Ejemplo n.º 2
0
def parse_smurf_output(filename):
    """Parse a SMURF result file and return a (pdbid, chainid, rawscore) tuple.

    The first line is expected to carry a ``pdbid_chain`` identifier in
    character columns 1-6 (e.g. ``>1abc_A``).  Returns (None, None, None)
    when that identifier is too short to be valid.  A second line starting
    with ``Sequence`` means no score is available, in which case rawscore
    is None.

    :param filename: path of the SMURF output file to parse
    :return: (pdbid, chainid, rawscore); rawscore is a float or None
    """
    logger.debug('parsing from file: ' + filename)
    with open(filename, 'r') as f:
        # skip the leading marker character, take the 6-char identifier
        pdb_str = f.readline()[1:7]
        if len(pdb_str) < 6:
            return None, None, None
        pdbid, chainid = pdb_str.split('_')
        read_value = f.readline()
        # Idiom fix: startswith() instead of slicing read_value[0:8] and
        # comparing -- clearer and equivalent.
        if read_value.startswith('Sequence'):
            return pdbid, chainid, None
        # the numeric score starts at a fixed column offset of 11
        rawscore = float(read_value[11:])
    return pdbid, chainid, rawscore
# ---------------------------------------------------------------------------
# Top-level script setup.  NOTE(review): `parsed_args`, `logger`,
# `target_level`, `target_sunid`, `output_dir`, `aligner`,
# `representative_field`, the FASTA/NRPDB/SCOP filename constants and the
# *_from_file helpers are all defined outside this fragment -- confirm
# against the surrounding script(s).  This file appears to concatenate
# several scraped script fragments, so some of the boilerplate below repeats.
# ---------------------------------------------------------------------------

# determine the amount of logging info to output
if parsed_args.verbose:
    from logging import DEBUG
    from gargamel.logger import console_handler
    console_handler.setLevel(DEBUG)

# configuration summary
config = {'target_level' : target_level,
          'target_sunid' : target_sunid,
          'output_dir' : output_dir,
          #'logging_level' : level,
          'aligner' : aligner,
          'representative_field' : representative_field}

logger.debug('Program configuration: ' + str(config))

# create a dictionary from PDB IDs to residue sequences
logger.debug('Building a mapping from PDBID to residue sequence...')
sequences = sequences_from_file(FASTA_FILENAME)

# get the set of non-redundant PDB chains from the NRPDB file, and use only
# these chains for training
logger.debug('Getting non-redundant set of PDB chains...')
nrpdbs = nrpdbs_from_file(NRPDB_FILENAME, representative_field)
logger.debug('There are ' + str(len(nrpdbs)) + ' non-redundant chains.')

# then, filter the query_pdbids and the trained_pdbids to be only reps
# then subtract out the trained_pdbids

# get all the PDB IDs on which to test (don't test already trained superfamily)
# NOTE(review): `representative_field` is (re)assigned here although it was
# already read above -- this looks like the start of a second scraped fragment.
representative_field = parsed_args.repfield

# determine the amount of logging info to output
if parsed_args.verbose:
    from logging import DEBUG
    from gargamel.logger import console_handler
    console_handler.setLevel(DEBUG)

# configuration summary
config = {'target_level' : target_level,
          'target_sunid' : target_sunid,
          'output_dir' : output_dir,
          #'logging_level' : level,
          'aligner' : aligner,
          'representative_field' : representative_field}
logger.debug('Program configuration: ' + str(config))

# create a dictionary from PDB IDs to residue sequences
logger.debug('Building a mapping from PDBID to residue sequence...')
sequences = sequences_from_file(FASTA_FILENAME)


# get the set of non-redundant PDB chains from the NRPDB file

logger.debug('Getting non-redundant set of PDB chains...')
nrpdbs = nrpdbs_from_file(NRPDB_FILENAME, representative_field)

# logger.debug('nrpdbs: ' + str(nrpdbs))

# get all the records from the SCOP classification file
# this now has (pdbid,chain) tuples
# output file name for the negative-control results (third fragment start)
output_filename = 'negative_controls'

# determine the amount of logging info to output
if parsed_args.verbose:
    from logging import DEBUG
    from gargamel.logger import console_handler
    console_handler.setLevel(DEBUG)

# configuration summary
config = {'target_level' : target_level,
          'target_sunid' : target_sunid,
          'output_dir' : output_dir,
          #'logging_level' : level,
          'representative_field' : representative_field}

logger.debug('Program configuration: ' + str(config))


# get the set of non-redundant PDB chains from the NRPDB file, and use only
# these chains for training
logger.debug('Getting non-redundant set of PDB chains...')
nrpdbs = nrpdbs_from_file(NRPDB_FILENAME, representative_field)
logger.debug('There are ' + str(len(nrpdbs)) + ' non-redundant chains.')

# then, filter the query_pdbids and the trained_pdbids to be only reps
# then subtract out the trained_pdbids

# get all the PDB IDs on which to test (don't test already trained superfamily)
logger.debug('Getting records to test from SCOP Classification file...')
original_query_pdbids = all_pdbids_from_file(SCOP_CLASSIFICATION_FILE)
# keep only query PDB IDs that have a non-redundant representative;
# `nrpdbs` presumably maps a pdbid to its representative -- TODO confirm
query_pdbids = set([nrpdbs[x] for x in original_query_pdbids if x in nrpdbs])
Ejemplo n.º 6
0
# ---------------------------------------------------------------------------
# Top-level script setup for the alignment/HMM query stage.
# NOTE(review): `parsed_args`, `logger`, `aligner`, `target_level`,
# `target_sunid`, the filename constants, the SMURF_LITE* constants and the
# *_from_file helpers come from outside this fragment -- confirm against the
# surrounding script(s).
# ---------------------------------------------------------------------------
output_dir = parsed_args.outputdir.rstrip('/')  # remove trailing slash
query_file = parsed_args.query_file


# determine the amount of logging info to output
if parsed_args.verbose:
    from logging import DEBUG
    from gargamel.logger import console_handler
    console_handler.setLevel(DEBUG)

# configuration summary
config = {'output_dir' : output_dir,
          #'logging_level' : level,
          'aligner' : aligner,
          'query_file': query_file}
logger.debug('Program configuration: ' + str(config))

# end the program if the output dir doesn't exist (exit status 2)
logger.debug('Checking whether output directory exists...')
if not os.path.isdir(output_dir):
    logger.critical('Output directory ' + output_dir + ' does not yet exist')
    logger.critical('Please run generate-matt-alignments.py and ' + \
                     'generate-hmm.py first')
    sys.exit(2)



        
# TODO get base fasta filename, put this in a loop over all queries        

base_fasta_filename = os.path.basename(query_file)
pdb_dir = parsed_args.pdbdir.rstrip('/')  # remove trailing slash
representative_field = parsed_args.repfield

# determine the amount of logging info to output
if parsed_args.verbose:
    from logging import DEBUG
    from gargamel.logger import console_handler
    console_handler.setLevel(DEBUG)

# summary of the program configuration
config = {'output_dir' : output_dir,
          'pdb_dir' : pdb_dir,
          'representative_field' : representative_field,
          'target_level' : target_level,
          'target_sunid' : target_sunid}
logger.debug('Program configuration: ' + str(config))

# get the set of non-redundant PDB chains from the NRPDB file, and use only
# these chains for training

nrpdbs = nrpdbs_from_file(NRPDB_FILENAME, representative_field)

# get all the records from the SCOP classification file
logger.debug('Getting PDB IDs from SCOP Classification file...')
all_pdbs = all_pdbids_from_file_in(SCOP_CLASSIFICATION_FILE, target_level,
                                     target_sunid)
logger.debug('all_pdbs: ' + str(all_pdbs))
# abort (exit status 1) when the SCOP query matched nothing
if len(all_pdbs) == 0:
    logger.critical('Nothing in all_pdbs for target level: ' + str(target_level) + ' and target_sunid: ' + str(target_sunid))
    sys.exit(1)
# NOTE(review): the assignments below duplicate the fragment above but build
# a hierarchy instead of a flat pdbid list -- likely a different script.
pdb_dir = parsed_args.pdbdir.rstrip('/')  # remove trailing slash
representative_field = parsed_args.repfield

# determine the amount of logging info to output
if parsed_args.verbose:
    from logging import DEBUG
    from gargamel.logger import console_handler
    console_handler.setLevel(DEBUG)

# summary of the program configuration
config = {'output_dir' : output_dir,
          'pdb_dir' : pdb_dir,
          'representative_field' : representative_field,
          'target_level' : target_level,
          'target_sunid' : target_sunid}
logger.debug('Program configuration: ' + str(config))

# get the set of non-redundant PDB chains from the NRPDB file, and use only
# these chains for training

nrpdbs = nrpdbs_from_file(NRPDB_FILENAME, representative_field)

# get all the records from the SCOP classification file
logger.debug('Getting PDB IDs from SCOP Classification file...')
hierarchy = hierarchy_sets_from_file(SCOP_CLASSIFICATION_FILE, target_level,
                                     target_sunid)
logger.debug('hierarchy: ' + str(hierarchy))
if len(hierarchy) == 0:
    logger.critical('Nothing in hierarchy for target level: ' + str(target_level) + ' and target_sunid: ' + str(target_sunid))
    sys.exit(1)
# SimEv / smurf-lite parameters read from the command line
output_dir = parsed_args.outputdir.rstrip('/')  # remove trailing slash
aligner = parsed_args.aligner
smurf_lite_threshold = parsed_args.smurf_lite_threshold
simev_frequency = parsed_args.simev_frequency
simev_count = parsed_args.simev_count
simev_threshold = parsed_args.simev_threshold

# determine the amount of logging info to output
if parsed_args.verbose:
    from logging import DEBUG
    from gargamel.logger import console_handler
    console_handler.setLevel(DEBUG)

# summary of the program configuration
config = {'output_dir' : output_dir, 'aligner' : aligner}
logger.debug('Program configuration: ' + str(config))

# end the program if the output dir doesn't exist
logger.debug('Checking whether output directory exists...')
if not os.path.isdir(output_dir):
    logger.critical('Output directory ' + output_dir + ' does not yet exist')
    logger.critical('Please run generate-matt-alignments.py first')
    sys.exit(2)


# determine which executable and multiple alignment file to use for the
# hmmbuild step, and determine the name of the HMM file
# NOTE(review): only the SMURF_LITE branch is visible here; other aligner
# branches presumably follow in the original script (fragment is truncated).
if aligner == SMURF_LITE:
    executable = SMURF_LITE_HMMBUILD_EXECUTABLE
    preparse_executable = SMURF_LITE_PREPARSE_EXECUTABLE
    hmm_filename = os.path.basename(output_dir) + '_smurf-lite.hmm+'
Ejemplo n.º 10
0
# ---------------------------------------------------------------------------
# Top-level script setup for the positive-control stage.  NOTE(review):
# `parsed_args`, `logger`, `target_level`, `target_sunid`, the filename
# constants and the *_from_file helpers come from outside this fragment --
# confirm against the surrounding script(s).
# ---------------------------------------------------------------------------
output_dir = parsed_args.outputdir.rstrip('/')  # remove trailing slash
aligner = parsed_args.aligner
smurf_lite_threshold = parsed_args.smurf_lite_threshold
simev_frequency = parsed_args.simev_frequency
simev_count = parsed_args.simev_count
simev_threshold = parsed_args.simev_threshold

# determine the amount of logging info to output
if parsed_args.verbose:
    from logging import DEBUG
    from gargamel.logger import console_handler
    console_handler.setLevel(DEBUG)

# summary of the program configuration
config = {'output_dir' : output_dir, 'aligner' : aligner}
logger.debug('Program configuration: ' + str(config))

# end the program if the output dir doesn't exist (exit status 2)
logger.debug('Checking whether output directory exists...')
if not os.path.isdir(output_dir):
    logger.critical('Output directory ' + output_dir + ' does not yet exist')
    logger.critical('Please run generate-matt-alignments.py first')
    sys.exit(2)

logger.debug('Determining which hierarchy levels were left out during '
             'training...')
logger.debug('  output_dir contains: ' + str(os.listdir(output_dir)))
# keep only the entries of output_dir that are themselves directories.
# NOTE(review): under Python 3 `filter` is lazy, so str(sunids) below would
# log '<filter object ...>' rather than the contents -- this code presumably
# targets Python 2; verify before porting.
sunids = filter(lambda x: os.path.isdir(os.path.join(output_dir, x)),
                os.listdir(output_dir))
logger.debug('  sunids: ' + str(sunids))
representative_field = parsed_args.repfield

# output file name for the positive-control results
output_filename = 'positive_controls'

# determine the amount of logging info to output
if parsed_args.verbose:
    from logging import DEBUG
    from gargamel.logger import console_handler
    console_handler.setLevel(DEBUG)

# configuration summary
config = {'target_level' : target_level,
          'target_sunid' : target_sunid,
          'output_dir' : output_dir,
          'representative_field' : representative_field}
logger.debug('Program configuration: ' + str(config))



# get the set of non-redundant PDB chains from the NRPDB file

logger.debug('Getting non-redundant set of PDB chains...')
nrpdbs = nrpdbs_from_file(NRPDB_FILENAME, representative_field)

# logger.debug('nrpdbs: ' + str(nrpdbs))

# get all the records from the SCOP classification file
# this now has (pdbid,chain) tuples
logger.debug('Getting records to test from SCOP Classification file...')
hierarchy = hierarchy_sets_from_file(SCOP_CLASSIFICATION_FILE, target_level,
                                     target_sunid)
Ejemplo n.º 12
0
# ---------------------------------------------------------------------------
# Top-level script setup: parse command-line arguments and enumerate the
# per-family result directories.  NOTE(review): `AlignmentArgumentParser`,
# `PROGRAM_DESCRIPTION`, `logger` and `STATUS_NO_DIR` come from outside this
# fragment -- confirm against the surrounding script.
# ---------------------------------------------------------------------------
argparser = AlignmentArgumentParser(PROGRAM_DESCRIPTION)
parsed_args = argparser.parse_args()

# the directory containing the results from the smurf/hmmer alignment tests
output_dir = parsed_args.outputdir
aligner = parsed_args.aligner
# logger.debug('Aligner: ' + aligner)
# determine the amount of logging info to output
if parsed_args.verbose:
    from logging import DEBUG
    from gargamel.logger import console_handler
    console_handler.setLevel(DEBUG)

# a summary of the runtime configuration of this program
config = {'output_dir' : output_dir}
logger.debug('Program configuration: ' + str(config))

# check if the output dir exists
if not os.path.isdir(output_dir):
    logger.critical('Directory ' + output_dir + ' does not exist.')
    sys.exit(STATUS_NO_DIR)

# find all subdirectories of the output directory
logger.debug('Determining which family directories exist...')
logger.debug('  output_dir contains: ' + str(os.listdir(output_dir)))
# NOTE(review): under Python 3 `filter` is lazy, so str(subdirectories)
# below would log '<filter object ...>' -- presumably Python 2 code; verify.
subdirectories = filter(lambda x: os.path.isdir(os.path.join(output_dir, x)),
                        os.listdir(output_dir))
logger.debug('  subdirectories: ' + str(subdirectories))
families = filter(lambda x: all(filter(lambda y: y.isdigit(), x)),