def download_file(self, url):
    """Fetches a remote file. Uses the cookies file from cURL to
    authenticate."""
    self.mainlog.debug('Fetching ' + url)
    # Build the full URL to fetch
    full_url = self.DL_BASE + url
    # And build the command to download it
    cmd = [
        'curl',
        full_url,
        '-b',
        self.cookie.name,
        '-o',
        file_funcs.local_name(url)
        ]
    # Then download it
    p = subprocess.Popen(
        cmd,
        shell=False,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)
    out, err = p.communicate()
    self.mainlog.debug('Done fetching ' + url)
    return
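# For reference, the argument list above is equivalent to running a shell
# command of the form below (DL_BASE and the cookie file are instance
# attributes set elsewhere in this class, outside this excerpt):
#
#   curl <DL_BASE><url> -b <cookie_file> -o <local_name>
#
# Passing the list to Popen with shell=False sidesteps shell quoting issues
# with special characters in the URL.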
def get_xml_urls(self):
    """Gets the URLs and the MD5s of the CDS files from the XML file from
    Phytozome. Stores these data in `urls' and `md5s' respectively."""
    self.mainlog.debug('Fetching XML')
    # Create another temporary named file for the XML output
    xml_out = tempfile.NamedTemporaryFile(
        mode='w+t',
        prefix='BAD_Mutations_JGI_XML_',
        suffix='.xml',
        delete=False)
    self.mainlog.debug('XML will be stored in ' + xml_out.name)
    # Use cURL to download the XML, passing the cookies we generated
    # earlier to authenticate.
    cmd = [
        'curl',
        self.XML_URL,
        '-b',
        self.cookie.name,
        '-o',
        xml_out.name
        ]
    # Execute the command
    p = subprocess.Popen(
        cmd,
        shell=False,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)
    out, err = p.communicate()
    self.mainlog.debug('cURL stdout: ' + out.decode('utf-8'))
    self.mainlog.debug('cURL stderr: ' + err.decode('utf-8'))
    # Then, read the XML back from the file. cURL wrote through a separate
    # file descriptor, so rewind our handle to the start before reading.
    xml_out.seek(0)
    xml = xml_out.read()
    # This suffix is what we want the filenames to end with.
    # It can change, depending on the target of the LRT.
    suffix = '.cds.fa.gz'
    self.mainlog.debug('The XML I got was \n\n' + xml)
    # Create an element tree out of it, so we can easily step
    # through the data
    xml_tree = ElementTree.fromstring(xml)
    # Step through it and extract all CDS URLs
    for elem in xml_tree.findall('.//file'):
        # If the URL ends in the right suffix, then save it
        if elem.attrib.get('url').endswith(suffix):
            url = elem.attrib.get('url')
            md5 = elem.attrib.get('md5')
            # Check to see that the file is in the list of
            # species to download
            local_filename = file_funcs.local_name(url)
            species_name = file_funcs.species_name(local_filename)
            if species_name in self.TO_FETCH:
                self.urls.append(url)
                self.md5s.append(md5)
    self.mainlog.debug('Found ' + str(len(self.urls)) + ' files to fetch')
    return
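# A minimal sketch of the XML shape the loop above assumes. The 'file'
# element name and its 'url' and 'md5' attributes come from the findall()
# and attrib calls; the surrounding container elements are assumptions about
# the Phytozome directory listing, which may nest differently:
#
#   <organismDownloads>
#     <folder name="...">
#       <file url="/Species_name/.../Species_name.cds.fa.gz" md5="..."/>
#     </folder>
#   </organismDownloads>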
def download_files(self):
    """Iterate through the list of URLs and download the appropriate files.
    Computes the CRC-32 sum of existing files and compares it to the remote
    checksum to decide whether or not to download."""
    # For each URL we have:
    for u, c in zip(self.urls, self.cksums):
        target_dir = self.make_species_dir(u)
        # cd into it
        os.chdir(target_dir)
        # What is the local file name?
        lname = file_funcs.local_name(u)
        # If it exists, we check if the checksums are the same
        if file_funcs.file_exists(lname, self.mainlog):
            local_cksum = file_funcs.calculate_crc32(lname, self.mainlog)
            crc32_same = file_funcs.checksum_is_same(
                local_cksum, c, self.mainlog)
            if crc32_same:
                self.mainlog.info(
                    lname + ' already exists and is current, skipping.')
                continue
            else:
                self.mainlog.info(
                    lname + ' exists, but is out of date. Updating.')
                same = False
                while not same:
                    self.get_file(u)
                    new_local_cksum = file_funcs.calculate_crc32(
                        lname, self.mainlog)
                    same = file_funcs.checksum_is_same(
                        new_local_cksum, c, self.mainlog)
                # And save a record for those that need to be converted
                self.to_convert.append(
                    os.path.join(self.base, target_dir, lname))
        # If the file doesn't exist, then it's the same
        # as if the checksum were different
        else:
            self.mainlog.info(lname + ' does not exist. Downloading.')
            same = False
            while not same:
                self.get_file(u)
                new_local_cksum = file_funcs.calculate_crc32(
                    lname, self.mainlog)
                same = file_funcs.checksum_is_same(
                    new_local_cksum, c, self.mainlog)
            self.to_convert.append(
                os.path.join(self.base, target_dir, lname))
    self.mainlog.info('Done downloading CDS files from Ensembl.')
    # We are done with the FTP connection, log out
    self.session.quit()
    return
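# A minimal sketch, in comments since the real helper lives in file_funcs,
# of how a CRC-32 helper like calculate_crc32() can be computed with the
# standard library. This assumes the remote checksums are plain CRC-32 sums;
# the actual file_funcs implementation may differ:
#
#   import zlib
#
#   def calculate_crc32(fname):
#       crc = 0
#       with open(fname, 'rb') as f:
#           # Read in chunks so large CDS files don't go into memory at once
#           for chunk in iter(lambda: f.read(65536), b''):
#               crc = zlib.crc32(chunk, crc)
#       return crc & 0xffffffff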
def get_file(self, fname):
    """Download the file specified by `fname'."""
    with open(file_funcs.local_name(fname), 'wb') as handle:
        self.session.retrbinary('RETR ' + fname, handle.write)
    return
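# Note: ftplib's retrbinary() issues an FTP RETR and streams the response in
# binary-safe blocks to the callback, so writing through a 'wb' handle keeps
# the gzipped CDS files byte-identical to the remote copies.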
def fetch_cds(self):
    """Iterates through the urls and md5s instance attributes and downloads
    the appropriate files. Checks the local MD5 against the remote MD5 and
    downloads the remote file if they differ. Appends the filenames of each
    updated file to the `to_convert' attribute."""
    self.mainlog.debug(
        'Downloading files from ' + str(len(self.urls)) + ' species')
    for u, m in zip(self.urls, self.md5s):
        # Get a local name of the CDS
        lname = file_funcs.local_name(u)
        target_dir = self.make_species_dir(u)
        os.chdir(target_dir)
        # Check to see if the file already exists
        if file_funcs.file_exists(lname, self.mainlog):
            # Get the MD5
            lmd5 = file_funcs.calculate_md5(lname, self.mainlog)
            # Compare the MD5s
            md5s_same = file_funcs.checksum_is_same(lmd5, m, self.mainlog)
            # If they are the same, skip it, and move on
            if md5s_same:
                self.mainlog.info(lname + ' is current. Skipping.')
                continue
            else:
                self.mainlog.info(lname + ' is out of date. Downloading.')
                # Try to download it until the MD5s check out
                same = False
                while not same:
                    self.download_file(u)
                    new_lmd5 = file_funcs.calculate_md5(
                        lname, self.mainlog)
                    same = file_funcs.checksum_is_same(
                        new_lmd5, m, self.mainlog)
                # Tack it onto the list of files to convert
                self.to_convert.append(
                    os.path.join(self.base, target_dir, lname))
        else:
            self.mainlog.info(lname + ' does not yet exist. Downloading.')
            # And the same procedure as if the file were updated
            same = False
            while not same:
                self.download_file(u)
                new_lmd5 = file_funcs.calculate_md5(
                    lname, self.mainlog)
                same = file_funcs.checksum_is_same(
                    new_lmd5, m, self.mainlog)
            self.to_convert.append(
                os.path.join(self.base, target_dir, lname))
    self.mainlog.info('Done downloading CDS files from Phytozome.')
    return
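# A minimal usage sketch for the Phytozome path, kept in comments since the
# constructor and the cookie/sign-on step fall outside this excerpt; the
# class name and constructor arguments below are assumptions for
# illustration only:
#
#   fetcher = Phytozome(base, mainlog)  # hypothetical constructor
#   fetcher.get_xml_urls()   # populate self.urls and self.md5s
#   fetcher.fetch_cds()      # download or update each CDS file
#   # fetcher.to_convert now lists the local paths that were (re)downloaded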