def write_config(self):
    """Write the configuration file with all the user specified variables.

    Logs a warning through ``self.mainlog`` when ``self.config_file``
    already exists, then overwrites it. The file consists of a generation
    timestamp comment followed by ``#define NAME value`` lines for the base
    directory, target species, E-value threshold and the program paths.

    The file is written inside a ``with`` block so the handle is closed
    even if one of the writes raises (for example when an attribute is not
    a string and the concatenation fails).
    """
    if file_funcs.file_exists(self.config_file, self.mainlog):
        self.mainlog.warning(
            'Config file ' + self.config_file +
            ' already exists. It will be overwritten!')
    with open(self.config_file, 'w') as handle:
        # We write this in a format that is similar to that used by
        # structure
        gen_time = str(datetime.datetime.now())
        handle.write('// Generated by \'setup\' at ' + gen_time + '\n')
        handle.write('#define BASE ' + self.base + '\n')
        handle.write('#define TARGET_SPECIES ' + self.target_species + '\n')
        handle.write('#define EVAL_THRESHOLD ' + self.eval_thresh + '\n')
        handle.write('\n// Program paths\n')
        handle.write('#define BASH ' + self.bash_path + '\n')
        handle.write('#define GZIP ' + self.gzip_path + '\n')
        handle.write('#define SUM ' + self.sum_path + '\n')
        handle.write('#define TBLASTX ' + self.tblastx_path + '\n')
        handle.write('#define PASTA ' + self.pasta_path + '\n')
        handle.write('#define HYPHY ' + self.hyphy_path + '\n')
        handle.write('#define CLUSTALO ' + self.clustalo_path + '\n')
        handle.write('#define FASTTREE ' + self.fasttree_path + '\n')
    self.mainlog.info('Wrote configuration into ' + self.config_file)
    return
def write_config(self):
    """Write the configuration file with all the user specified variables.

    Logs a warning through ``self.mainlog`` when ``self.config_file``
    already exists, then overwrites it. The file consists of a generation
    timestamp comment followed by ``#define NAME value`` lines for the base
    directory, target species, E-value threshold and the program paths.

    The file is written inside a ``with`` block so the handle is closed
    even if one of the writes raises (for example when an attribute is not
    a string and the concatenation fails).
    """
    if file_funcs.file_exists(self.config_file, self.mainlog):
        self.mainlog.warning(
            'Config file ' + self.config_file +
            ' already exists. It will be overwritten!')
    with open(self.config_file, 'w') as handle:
        # We write this in a format that is similar to that used by
        # structure
        gen_time = str(datetime.datetime.now())
        handle.write('// Generated by \'setup\' at ' + gen_time + '\n')
        handle.write('#define BASE ' + self.base + '\n')
        handle.write('#define TARGET_SPECIES ' + self.target_species + '\n')
        handle.write('#define EVAL_THRESHOLD ' + self.eval_thresh + '\n')
        handle.write('\n// Program paths\n')
        handle.write('#define BASH ' + self.bash_path + '\n')
        handle.write('#define GZIP ' + self.gzip_path + '\n')
        handle.write('#define SUM ' + self.sum_path + '\n')
        handle.write('#define TBLASTX ' + self.tblastx_path + '\n')
        handle.write('#define PASTA ' + self.pasta_path + '\n')
        handle.write('#define HYPHY ' + self.hyphy_path + '\n')
    self.mainlog.info('Wrote configuration into ' + self.config_file)
    return
def download_files(self):
    """Download every file in ``self.urls`` that is missing or stale.

    Each URL is paired with the remote checksum at the same position in
    ``self.cksums``. A local file whose CRC32 matches the remote checksum
    is skipped. Otherwise the file is fetched with ``self.get_file``
    until its checksum matches, and its path is appended to
    ``self.to_convert``. ``self.session.quit()`` is called once all URLs
    have been processed.
    """
    for url, remote_cksum in zip(self.urls, self.cksums):
        species_dir = self.make_species_dir(url)
        # Work from inside the species directory
        os.chdir(species_dir)
        local_file = file_funcs.local_name(url)

        if file_funcs.file_exists(local_file, self.mainlog):
            # An existing file is only kept when its checksum is current
            existing_cksum = file_funcs.calculate_crc32(
                local_file, self.mainlog)
            if file_funcs.checksum_is_same(
                    existing_cksum, remote_cksum, self.mainlog):
                self.mainlog.info(
                    local_file + ' already exists and is current, skipping.')
                continue
            self.mainlog.info(
                local_file + ' exists, but is out of date. Updating.')
        else:
            self.mainlog.info(local_file + ' does not exist. Downloading.')

        # A missing file is treated the same as a stale one: keep
        # fetching until the local checksum agrees with the remote one
        while True:
            self.get_file(url)
            fresh_cksum = file_funcs.calculate_crc32(
                local_file, self.mainlog)
            if file_funcs.checksum_is_same(
                    fresh_cksum, remote_cksum, self.mainlog):
                break
        # Save a record for those that need to be converted
        self.to_convert.append(
            os.path.join(self.base, species_dir, local_file))

    self.mainlog.info('Done downloading CDS files from Ensembl.')
    # We are done with the connection, log out
    self.session.quit()
def parse_subs(f, log):
    """Parse the input substitutions file.

    Each line is split on tabs. The first field must be an integer
    position. A second field (the variant ID) is optional; when it is
    absent a warning is logged. Only the positions are returned.

    Args:
        f: Path to the substitutions file.
        log: Logger used for error, warning and info messages.

    Returns:
        A sorted list of integer positions, or False when the file does
        not exist. An empty file yields an empty list (which is falsy, so
        callers that test the result for truth treat it as invalid).

    Raises:
        SystemExit: With status 1 when the first field of a line is not
            an integer.
    """
    # Does the file exist?
    if not file_funcs.file_exists(f, log):
        log.error('File ' + f + ' does not exist.')
        return False
    subs_data = []
    with open(f, 'r') as subfile:
        for lineno, line in enumerate(subfile, start=1):
            fields = line.strip().split('\t')
            # Check the fields. The first one should be numeric
            try:
                pos = int(fields[0])
            except ValueError:
                log.error(
                    'Line ' + str(lineno) + ' of input file ' + f +
                    ': First field is not an integer.')
                raise SystemExit(1)
            # If there is only one item in the list, then the SNP ID
            # is absent. The ID is not used here, so we only warn.
            if len(fields) == 1:
                log.warning(
                    'Variant on line ' + str(lineno) + ' of input file ' +
                    f + ' does not have an ID. ' +
                    'Using the empty string (\'\') as an ID.')
            subs_data.append(pos)
    # Count the parsed positions rather than reusing the loop index, which
    # is unbound when the file has no lines at all
    log.info(
        'Input file ' + f + ' contains ' + str(len(subs_data)) +
        ' positions to predict.')
    # Sort the substitutions
    subs_data.sort()
    return subs_data
def valid_tree(f, log):
    """Check that the phylogenetic tree is valid.

    This only checks the tree structure and doesn't check any of the
    branch lengths or names.

    Args:
        f: Path to the Newick tree file.
        log: Logger used for error messages.

    Returns:
        True when ``Phylo.read`` parses the file, False when the file does
        not exist or cannot be read as a single Newick tree.
    """
    if not file_funcs.file_exists(f, log):
        log.error('File ' + f + ' does not exist')
        return False
    # Phylo.read() raises a NewickError when the tree is not valid, and a
    # ValueError when the file holds no tree or more than one tree. Both
    # mean the input is unusable, so report them the same way.
    try:
        Phylo.read(f, 'newick')
    except (NewickError, ValueError):
        log.error('Input file ' + f + ' is not a valid Newick tree file!')
        return False
    return True
def valid_fasta(f, log):
    """Check if the FASTA supplied is valid.

    Args:
        f: Path to the FASTA file.
        log: Logger used for error messages.

    Returns:
        True when ``SeqIO.read`` can read the file as a single-record
        FASTA, False when the file does not exist or ``SeqIO.read`` raises
        a ValueError.
    """
    # Does the file exist?
    if not file_funcs.file_exists(f, log):
        log.error('File ' + f + ' does not exist.')
        return False
    # SeqIO.read() raises a ValueError both when there are no records and
    # when there is more than one, so include its message instead of
    # always claiming the file has too many records.
    try:
        SeqIO.read(f, 'fasta')
    except ValueError as err:
        log.error(
            'Input file ' + f +
            ' could not be read as a single-record FASTA file (' +
            str(err) + '). ' +
            'This script only accepts single-record FASTA files.')
        return False
    return True
def valid_msa(f, log):
    """Report whether ``f`` is a readable FASTA multiple sequence alignment.

    Returns True when ``AlignIO.read`` accepts the file. Returns False,
    after logging an error through ``log``, when the file does not exist
    or ``AlignIO.read`` raises a ValueError.
    """
    if file_funcs.file_exists(f, log):
        # AlignIO.read() raises a ValueError if the alignment is not in
        # the right format, or if not all the sequences are the same
        # length
        try:
            AlignIO.read(f, 'fasta')
        except ValueError:
            message = (
                'Input file ' + f +
                ' is not a valid FASTA alignment!'
                ' Check the length of each sequence.')
            log.error(message)
            return False
        return True
    log.error('File ' + f + ' does not exist.')
    return False
def validate_args(args, log):
    """Validate (and normalise in place) the parsed command-line arguments.

    The ``base`` entry is defaulted to ``'.'`` and made absolute relative
    to the current working directory. Further checks depend on
    ``args['action']``:

    * ``setup``: optionally lists the allowable species, checks the target
      species, resolves the config path (``~`` expansion, then made
      absolute) and checks the config and base directories.
    * ``fetch``: checks the config file, validates the username as an
      e-mail address, prompts for a missing username/password unless
      ``convert_only`` is set, and checks the base directory.
    * ``align``: checks the config file and the output directory.
    * ``predict``: checks the config file, output directory, tree,
      alignment and substitutions file.

    Any other action only gets the ``base`` normalisation.

    Args:
        args: Dictionary of arguments; modified in place.
        log: Logger handed to the file-checking helpers.

    Returns:
        ``(args, None)`` when everything checks out, otherwise
        ``(False, message)`` where ``message`` explains the problem (or,
        for ``list_species``, holds the species listing).
    """
    # Check the base argument. If it starts with something other than a /
    # then it is a relative path, and we should fix it
    if 'base' not in args:
        args['base'] = '.'
    if not args['base'].startswith('/'):
        # Add the cwd onto it, since the script fails otherwise
        args['base'] = os.path.join(os.getcwd(), args['base'])

    # argparse should have checked for missing arguments by now.
    # If the arguments do not check out, return a message
    if args['action'] == 'setup':
        if args['list_species']:
            return (
                False,
                'The list of allowable species names is \n' +
                '\n'.join(SPECIES_LIST))
        if args['target'] not in SPECIES_LIST:
            return (
                False,
                ('The species name you provided is not in the list of '
                 'allowable species.'))
        # The config filename can be a relative path or start with a
        # tilde. The tilde must be expanded first: a '~' path does not
        # start with '/', so testing for a relative path first would
        # wrongly prefix it with the cwd.
        if args['config'].startswith('~'):
            # os.path.expanduser() will transform ~/... into /home/user/...
            args['config'] = os.path.expanduser(args['config'])
        if not args['config'].startswith('/'):
            args['config'] = os.path.join(os.getcwd(), args['config'])
        if not check_args.valid_dir(os.path.dirname(args['config'])):
            return (
                False,
                'You cannot create a configuration file in that directory.')
        if not check_args.valid_dir(args['base']):
            return (
                False,
                'Base directory is not readable/writable, or does not exist.')
    elif args['action'] == 'fetch':
        # If config is supplied:
        if args['config']:
            if not file_funcs.file_exists(args['config'], log):
                return (
                    False,
                    'The specified configuration file does not exist!')
        if args['user']:
            # A supplied username has to be a valid e-mail address
            if not check_args.valid_email(args['user']):
                return (False, 'Username is not a valid e-mail address.')
        elif not args['convert_only']:
            # Username not supplied, and we need to access JGI
            args['user'] = input('Username for JGI Genomes Portal: ')
        # Same with password: only prompt when we actually need to fetch
        if not args['password'] and not args['convert_only']:
            args['password'] = getpass.getpass(
                'Password for JGI Genomes Portal: ')
        if not check_args.valid_dir(args['base']):
            return (
                False,
                'Base directory is not readable/writable, or does not exist.')
    elif args['action'] == 'align':
        # If config is supplied:
        if args['config']:
            if not file_funcs.file_exists(args['config'], log):
                return (
                    False,
                    'The specified configuration file does not exist!')
        if not check_args.valid_dir(args['output']):
            return (
                False,
                'Output directory is not readable/writable, or does not exist.'
            )
    elif args['action'] == 'predict':
        # If config is supplied:
        if args['config']:
            if not file_funcs.file_exists(args['config'], log):
                return (
                    False,
                    'The specified configuration file does not exist!')
        if not check_args.valid_dir(args['output']):
            return (
                False,
                'Output directory is not readable/writable, or does not exist.'
            )
        if not parse_input.valid_tree(args['tree'], log):
            return (False, 'The input Newick tree is not valid.')
        if not parse_input.valid_msa(args['alignment'], log):
            return (False, 'The input MSA file provided is not valid.')
        if not parse_input.parse_subs(args['substitutions'], log):
            return (
                False,
                'The input substitutions file provided is not valid.')
    return (args, None)
def fetch_cds(self):
    """Download every CDS file in ``self.urls`` that is missing or stale.

    Each URL is paired with the remote MD5 at the same position in
    ``self.md5s``. A local file whose MD5 matches the remote one is
    skipped. Otherwise the file is fetched with ``self.download_file``
    until its MD5 matches, and its path is appended to the `to_convert'
    attribute.
    """
    self.mainlog.debug(
        'Downloading files from ' + str(len(self.urls)) + ' species')
    for url, remote_md5 in zip(self.urls, self.md5s):
        # Get a local name of the CDS
        local_file = file_funcs.local_name(url)
        species_dir = self.make_species_dir(url)
        os.chdir(species_dir)

        if file_funcs.file_exists(local_file, self.mainlog):
            # The file is already here; keep it only if the MD5s agree
            existing_md5 = file_funcs.calculate_md5(
                local_file, self.mainlog)
            if file_funcs.checksum_is_same(
                    existing_md5, remote_md5, self.mainlog):
                self.mainlog.info(local_file + ' is current. Skipping.')
                continue
            self.mainlog.info(local_file + ' is out of date. Downloading.')
        else:
            self.mainlog.info(
                local_file + ' does not yet exist. Downloading.')

        # Same procedure for new and outdated files: try to download
        # until the MD5s check out
        while True:
            self.download_file(url)
            fresh_md5 = file_funcs.calculate_md5(local_file, self.mainlog)
            if file_funcs.checksum_is_same(
                    fresh_md5, remote_md5, self.mainlog):
                break
        # Tack it onto the list of files to convert
        self.to_convert.append(
            os.path.join(self.base, species_dir, local_file))

    self.mainlog.info('Done downloading CDS files from Phytozome.')