Beispiel #1
0
 def write_config(self):
     """Write the configuration file with all the user specified variables.

     The file consists of ``#define NAME value`` lines preceded by a
     ``//`` comment carrying the generation timestamp. If a file already
     exists at ``self.config_file`` a warning is logged and the file is
     overwritten.

     All values are concatenated as-is, so every attribute read here must
     already be a string. The content is assembled before the file is
     opened, so a bad attribute cannot leave a truncated file behind, and
     the handle is closed even if writing fails.
     """
     if file_funcs.file_exists(self.config_file, self.mainlog):
         self.mainlog.warning('Config file ' + self.config_file +
                              ' already exists. It will be overwritten!')
     #   We write this in a format that is similar to that used by structure
     gen_time = str(datetime.datetime.now())
     lines = [
         '// Generated by \'setup\' at ' + gen_time + '\n',
         '#define BASE ' + self.base + '\n',
         '#define TARGET_SPECIES ' + self.target_species + '\n',
         '#define EVAL_THRESHOLD ' + self.eval_thresh + '\n',
         '\n// Program paths\n',
         '#define BASH ' + self.bash_path + '\n',
         '#define GZIP ' + self.gzip_path + '\n',
         '#define SUM ' + self.sum_path + '\n',
         '#define TBLASTX ' + self.tblastx_path + '\n',
         '#define PASTA ' + self.pasta_path + '\n',
         '#define HYPHY ' + self.hyphy_path + '\n',
         '#define CLUSTALO ' + self.clustalo_path + '\n',
         '#define FASTTREE ' + self.fasttree_path + '\n',
     ]
     #   The context manager flushes and closes the handle on every path
     with open(self.config_file, 'w') as handle:
         handle.writelines(lines)
     self.mainlog.info('Wrote configuration into ' + self.config_file)
     return
Beispiel #2
0
 def write_config(self):
     """Write the configuration file with all the user specified variables.

     The file consists of ``#define NAME value`` lines preceded by a
     ``//`` comment carrying the generation timestamp. If a file already
     exists at ``self.config_file`` a warning is logged and the file is
     overwritten.

     All values are concatenated as-is, so every attribute read here must
     already be a string. The content is assembled before the file is
     opened, so a bad attribute cannot leave a truncated file behind, and
     the handle is closed even if writing fails.
     """
     if file_funcs.file_exists(self.config_file, self.mainlog):
         self.mainlog.warning(
             'Config file ' +
             self.config_file +
             ' already exists. It will be overwritten!')
     #   We write this in a format that is similar to that used by structure
     gen_time = str(datetime.datetime.now())
     lines = [
         '// Generated by \'setup\' at ' + gen_time + '\n',
         '#define BASE ' + self.base + '\n',
         '#define TARGET_SPECIES ' + self.target_species + '\n',
         '#define EVAL_THRESHOLD ' + self.eval_thresh + '\n',
         '\n// Program paths\n',
         '#define BASH ' + self.bash_path + '\n',
         '#define GZIP ' + self.gzip_path + '\n',
         '#define SUM ' + self.sum_path + '\n',
         '#define TBLASTX ' + self.tblastx_path + '\n',
         '#define PASTA ' + self.pasta_path + '\n',
         '#define HYPHY ' + self.hyphy_path + '\n',
     ]
     #   The context manager flushes and closes the handle on every path
     with open(self.config_file, 'w') as handle:
         handle.writelines(lines)
     self.mainlog.info('Wrote configuration into ' + self.config_file)
     return
Beispiel #3
0
 def download_files(self):
     """Download every file whose local copy is missing or out of date.

     ``self.urls`` and ``self.cksums`` are walked in lockstep. For each
     URL the working directory is changed to the directory returned by
     ``self.make_species_dir``. A local file whose CRC32 matches the
     remote checksum is skipped; otherwise the file is fetched repeatedly
     until the checksums agree, and its path is appended to
     ``self.to_convert``. The session is closed once all URLs are handled.
     """
     for url, remote_cksum in zip(self.urls, self.cksums):
         species_dir = self.make_species_dir(url)
         #   Work from inside the species directory
         os.chdir(species_dir)
         fname = file_funcs.local_name(url)
         if file_funcs.file_exists(fname, self.mainlog):
             #   An existing file is only kept when its checksum matches
             current_cksum = file_funcs.calculate_crc32(fname, self.mainlog)
             if file_funcs.checksum_is_same(
                     current_cksum, remote_cksum, self.mainlog):
                 self.mainlog.info(
                     fname + ' already exists and is current, skipping.')
                 continue
             self.mainlog.info(fname +
                               ' exists, but is out of date. Updating.')
         else:
             #   A missing file is handled like a checksum mismatch
             self.mainlog.info(fname + ' does not exist. Downloading.')
         #   Keep fetching until the downloaded copy verifies
         while True:
             self.get_file(url)
             fetched_cksum = file_funcs.calculate_crc32(fname, self.mainlog)
             if file_funcs.checksum_is_same(
                     fetched_cksum, remote_cksum, self.mainlog):
                 break
         #   Record the fresh file so that it gets converted later
         self.to_convert.append(
             os.path.join(self.base, species_dir, fname))
     self.mainlog.info('Done downloading CDS files from Ensembl.')
     #   The FTP connection is no longer needed, log out
     self.session.quit()
     return
Beispiel #4
0
def parse_subs(f, log):
    """Parse the input substitutions file.

    Each line is split on tabs. The first field must be an integer
    position. A line with only one field has no variant ID, which is
    reported with a warning; the ID itself is not returned.

    Returns the positions as a sorted list of integers, or False when the
    file does not exist. An empty file yields an empty list. If the first
    field of any line is not an integer, an error is logged and the
    program exits with status 1.
    """
    #   Does the file exist?
    if not file_funcs.file_exists(f, log):
        log.error('File ' + f + ' does not exist.')
        return False
    subs_data = []
    with open(f, 'r') as subfile:
        for line_num, line in enumerate(subfile, start=1):
            fields = line.strip().split('\t')
            #   Check the fields. The first one should be numeric
            try:
                pos = int(fields[0])
            except ValueError:
                log.error(
                    'Line ' +
                    str(line_num) +
                    ' of input file ' + f
                    + ': First field is not an integer.')
                #   SystemExit directly: the exit() builtin is only
                #   installed by the site module
                raise SystemExit(1)
            #   A single field means that the SNP ID is absent
            if len(fields) == 1:
                log.warning(
                    'Variant on line ' +
                    str(line_num) +
                    ' of input file ' +
                    f +
                    ' does not have an ID. ' +
                    'Using the empty string (\'\') as an ID.')
            subs_data.append(pos)
    #   Count the parsed positions rather than reusing the loop index,
    #   which is unbound when the file has no lines
    log.info(
        'Input file ' +
        f +
        ' contains ' +
        str(len(subs_data)) +
        ' positions to predict.')
    # Sort the substitutions
    subs_data.sort()
    return subs_data
Beispiel #5
0
def valid_tree(f, log):
    """Return True if ``f`` exists and parses as a Newick tree.

    Only the tree structure is checked; branch lengths and names are not
    inspected. Every failure is logged through ``log`` and reported as
    False.
    """
    if file_funcs.file_exists(f, log):
        #   Phylo.read() raises a NewickError when the tree is not valid
        try:
            Phylo.read(f, 'newick')
        except NewickError:
            message = ''.join(
                ['Input file ', f, ' is not a valid Newick tree file!'])
            log.error(message)
            return False
        return True
    log.error('File ' + f + ' does not exist')
    return False
Beispiel #6
0
def valid_fasta(f, log):
    """Check if the FASTA supplied is a valid single-record file.

    Returns True when ``f`` exists and ``SeqIO.read`` can load it, and
    False otherwise. Every failure is logged through ``log``.

    Per the Biopython documentation, ``SeqIO.read`` raises ValueError
    unless the file holds exactly one record, so the handler covers both
    an empty file and a multi-record file. The logged message includes
    the parser's own error text to tell the two cases apart.
    """
    #   Does the file exist?
    if not file_funcs.file_exists(f, log):
        log.error('File ' + f + ' does not exist.')
        return False
    #   Start checking it
    try:
        SeqIO.read(f, 'fasta')
    except ValueError as err:
        log.error(
            'Input file ' +
            f +
            ' does not contain exactly one record (' +
            str(err) +
            '). This script only accepts single-record FASTA files.')
        return False
    return True
Beispiel #7
0
def valid_msa(f, log):
    """Return True if ``f`` exists and loads as a FASTA alignment.

    All sequences should be the same length, and the file should be in
    FASTA format. Every failure is logged through ``log`` and reported as
    False.
    """
    if file_funcs.file_exists(f, log):
        #   AlignIO.read() raises a ValueError if the alignment is not in
        #   the right format, or if the sequences differ in length
        try:
            AlignIO.read(f, 'fasta')
        except ValueError:
            message = ''.join([
                'Input file ',
                f,
                ' is not a valid FASTA alignment!',
                ' Check the length of each sequence.'])
            log.error(message)
            return False
        return True
    log.error('File ' + f + ' does not exist.')
    return False
Beispiel #8
0
def validate_args(args, log):
    """Validate the parsed command line arguments for the chosen action.

    ``args`` is a dictionary and is modified in place: a missing or
    relative ``base`` is turned into an absolute path, the ``setup``
    action normalizes ``config``, and the ``fetch`` action prompts for a
    missing username/password unless ``convert_only`` is set.

    Checks per action:
      * setup: species name, config file directory, base directory.
      * fetch: config file, username (e-mail address), base directory.
      * align: config file, output directory.
      * predict: config file, output directory, tree, alignment and
        substitutions files.

    Returns ``(args, None)`` when everything checks out, or
    ``(False, message)`` describing the first problem found. An action
    that is none of the above gets only the ``base`` normalization.
    """
    #   Check the base argument. If it starts with something other than a /
    #   then it is a relative path, and we should fix it
    if 'base' not in args:
        args['base'] = '.'
    if not args['base'].startswith('/'):
        #   Add the cwd onto it, since the script fails otherwise
        args['base'] = os.path.join(os.getcwd(), args['base'])
    #   Then check the action
    #   argparse should have checked for missing arguments by now
    #   If the arguments do not check out, return a message
    action = args['action']
    if action == 'setup':
        if args['list_species']:
            return (False, 'The list of allowable species names is \n' +
                    '\n'.join(SPECIES_LIST))
        if args['target'] not in SPECIES_LIST:
            return (False,
                    ('The species name you provided is not in the list of '
                     'allowable species.'))
        #   Check the filename for the config file. It can be a relative path
        #   or start with a tilde. The tilde has to be expanded first: a
        #   path like ~/x does not start with /, so testing for a relative
        #   path first would glue the cwd in front of the literal tilde.
        #   os.path.expanduser() will transform ~/... into /home/user/...
        #   and leaves any other path untouched
        config_path = os.path.expanduser(args['config'])
        if not config_path.startswith('/'):
            config_path = os.path.join(os.getcwd(), config_path)
        args['config'] = config_path
        if not check_args.valid_dir(os.path.dirname(args['config'])):
            return (
                False,
                'You cannot create a configuration file in that directory.')
        if not check_args.valid_dir(args['base']):
            return (
                False,
                'Base directory is not readable/writable, or does not exist.')
    elif action == 'fetch':
        #   If config is supplied:
        if args['config']:
            if not file_funcs.file_exists(args['config'], log):
                return (False,
                        'The specified configuration file does not exist!')
        #   If username is supplied:
        if args['user']:
            #   Check if it's valid
            if not check_args.valid_email(args['user']):
                return (False, 'Username is not a valid e-mail address.')
        #   Username not supplied, and we need to access JGI
        elif not args['convert_only']:
            args['user'] = input('Username for JGI Genomes Portal: ')
        #   Same with password: only prompt when it is needed
        if not args['password'] and not args['convert_only']:
            args['password'] = getpass.getpass(
                'Password for JGI Genomes Portal: ')
        if not check_args.valid_dir(args['base']):
            return (
                False,
                'Base directory is not readable/writable, or does not exist.')
    #   Check the arguments passed to align
    elif action == 'align':
        #   If config is supplied:
        if args['config']:
            if not file_funcs.file_exists(args['config'], log):
                return (False,
                        'The specified configuration file does not exist!')
        if not check_args.valid_dir(args['output']):
            return (
                False,
                'Output directory is not readable/writable, or does not exist.'
            )
    #   Check arguments to predict
    elif action == 'predict':
        #   If config is supplied:
        if args['config']:
            if not file_funcs.file_exists(args['config'], log):
                return (False,
                        'The specified configuration file does not exist!')
        if not check_args.valid_dir(args['output']):
            return (
                False,
                'Output directory is not readable/writable, or does not exist.'
            )
        if not parse_input.valid_tree(args['tree'], log):
            return (False, 'The input Newick tree is not valid.')
        if not parse_input.valid_msa(args['alignment'], log):
            return (False, 'The input MSA file provided is not valid.')
        if not parse_input.parse_subs(args['substitutions'], log):
            return (False,
                    'The input substitutions file provided is not valid.')
    return (args, None)
Beispiel #9
0
 def fetch_cds(self):
     """Download every CDS file whose local copy is missing or stale.

     ``self.urls`` and ``self.md5s`` are walked in lockstep. For each URL
     the working directory is changed to the directory returned by
     ``self.make_species_dir``. A local file whose MD5 matches the remote
     MD5 is skipped; otherwise the file is downloaded repeatedly until
     the MD5s agree, and its path is appended to the `to_convert'
     attribute.
     """
     species_count = str(len(self.urls))
     self.mainlog.debug(
         'Downloading files from ' + species_count + ' species')
     for url, remote_md5 in zip(self.urls, self.md5s):
         #   Local name of the CDS, and the directory it lives in
         fname = file_funcs.local_name(url)
         species_dir = self.make_species_dir(url)
         os.chdir(species_dir)
         if file_funcs.file_exists(fname, self.mainlog):
             #   An existing file is only kept when its MD5 matches
             current_md5 = file_funcs.calculate_md5(fname, self.mainlog)
             if file_funcs.checksum_is_same(
                     current_md5, remote_md5, self.mainlog):
                 self.mainlog.info(fname + ' is current. Skipping.')
                 continue
             self.mainlog.info(fname + ' is out of date. Downloading.')
         else:
             #   A missing file is handled like an outdated one
             self.mainlog.info(fname + ' does not yet exist. Downloading.')
         #   Try to download it until the MD5s check out
         while True:
             self.download_file(url)
             fetched_md5 = file_funcs.calculate_md5(fname, self.mainlog)
             if file_funcs.checksum_is_same(
                     fetched_md5, remote_md5, self.mainlog):
                 break
         #   Tack it onto the list of files to convert
         self.to_convert.append(
             os.path.join(self.base, species_dir, fname))
     self.mainlog.info('Done downloading CDS files from Phytozome.')
     return