def _get_metadata_and_fulltex_dir(self):
    """Collect metadata (XML) and fulltext (PDF) paths of all journal items.

    For each package name in ``self.files_list``, parses the
    ``dataset.xml`` manifest found in the matching unpacked directory
    and appends one ``{'xml': ..., 'pdf': ...}`` dict per
    ``journal-item`` element to ``self.found_articles``.  Packages
    whose manifest cannot be parsed are reported and skipped.
    """
    # Prints stuff
    print >> sys.stdout, "\nRetrieving journal items directories."
    # Create progress bar
    p_bar = progress_bar(len(self.files_list))
    # Print stuff
    sys.stdout.write(p_bar.next())
    sys.stdout.flush()
    for name in self.files_list:
        # The unpacked directory is named after the package file,
        # i.e. everything before the first dot.  Hoisted out of the
        # inner loop: it is invariant per package.
        pkg_dir = name.split('.')[0]
        dataset_link = join(self.path_unpacked, pkg_dir, 'dataset.xml')
        try:
            dataset_xml = parse(dataset_link)
        except Exception:
            # Best effort: report the broken manifest and continue
            # with the next package instead of aborting the harvest.
            register_exception(alert_admin=True,
                               prefix="Elsevier error reading dataset.xml file.")
            self.logger.error("Error reading dataset.xml file: %s" % (dataset_link,))
            print >> sys.stdout, "\nError reading dataset.xml file: %s" % (dataset_link,)
            continue
        journal_items = dataset_xml.getElementsByTagName('journal-item')
        self.logger.info("Getting metadata and fulltex directories for %i journal items." % (len(journal_items),))
        for journal_item in journal_items:
            # 'ml' holds the article XML pathname, 'web-pdf' the PDF
            # pathname, both relative to the unpacked package dir.
            xml_pathname = join(self.path_unpacked, pkg_dir,
                                xml_to_text(journal_item.getElementsByTagName('ml')[0].getElementsByTagName('pathname')[0]))
            pdf_pathname = join(self.path_unpacked, pkg_dir,
                                xml_to_text(journal_item.getElementsByTagName('web-pdf')[0].getElementsByTagName('pathname')[0]))
            self.found_articles.append(dict(xml=xml_pathname, pdf=pdf_pathname))
        self.logger.info("Got metadata and fulltex directories of %i journals." % (len(self.found_articles),))
        # Print stuff
        sys.stdout.write(p_bar.next())
        sys.stdout.flush()
def _download_file_listing(self):
    """Download every ".ready" listing file in ``self.files_list``.

    Each file is fetched over FTP into ``CFG_READY_PACKAGES`` and its
    local path recorded in ``self.path_r_pkg``.

    :return: ``self.path_r_pkg``, the list of local paths.
    :raise NoNewFiles: when ``self.files_list`` is empty.
    """
    # Guard clause: nothing to do.
    if not self.files_list:
        print >> sys.stdout, "No new packages to download."
        self.logger.info("No new packages to download.")
        raise NoNewFiles
    # Prints stuff
    print >> sys.stdout, "\nDownloading %i \".ready\" files." % (len(self.files_list))
    # Create progress bar
    p_bar = progress_bar(len(self.files_list))
    # Print stuff
    sys.stdout.write(p_bar.next())
    sys.stdout.flush()
    for filename in self.files_list:
        self.logger.info("Downloading: %s" % (filename,))
        pkg_path = join(CFG_READY_PACKAGES, filename)
        self.path_r_pkg.append(pkg_path)
        try:
            ready_file = open(pkg_path, 'wb')
            try:
                self.ftp.retrbinary('RETR %s' % (filename,), ready_file.write)
            finally:
                # Close even when the transfer fails, so the file
                # handle is never leaked.
                ready_file.close()
        except Exception:
            # Narrowed from a bare "except:" (which also swallowed
            # KeyboardInterrupt/SystemExit): log and carry on with
            # the remaining files.
            self.logger.error("Error downloading file: %s" % (filename,))
            print >> sys.stdout, "\nError downloading %s file!" % (filename,)
            print >> sys.stdout, sys.exc_info()
        # Print stuff
        sys.stdout.write(p_bar.next())
        sys.stdout.flush()
    return self.path_r_pkg
def _download_tars(self, check_integrity=True):
    """Download every tar package named in ``self.retrieved_packages``.

    Each package is fetched over FTP into ``CFG_TAR_FILES`` and its
    local path recorded in ``self.retrieved_packages_unpacked``.

    NOTE(review): a second ``_download_tars`` is defined later in this
    file and shadows this one on the class — confirm which definition
    is intended to survive.

    :param check_integrity: when true, verify the packages' checksums
        against the server before downloading.
    :return: ``self.retrieved_packages_unpacked``.
    """
    if check_integrity:
        check_pkgs_integrity(self.retrieved_packages, self.logger, self.ftp)
    print >> sys.stdout, "\nDownloading %i tar packages." \
        % (len(self.retrieved_packages))
    # Create progress bar sized on the collection we actually iterate
    # (was len(self.files_list), which is a different list).
    p_bar = progress_bar(len(self.retrieved_packages))
    # Print stuff
    sys.stdout.write(p_bar.next())
    sys.stdout.flush()
    for filename in self.retrieved_packages.iterkeys():
        self.logger.info("Downloading tar package: %s" % (filename,))
        unpack_path = join(CFG_TAR_FILES, filename)
        self.retrieved_packages_unpacked.append(unpack_path)
        try:
            tar_file = open(unpack_path, 'wb')
            try:
                self.ftp.retrbinary('RETR %s' % filename, tar_file.write)
            finally:
                # Close even on transfer failure: no leaked handle.
                tar_file.close()
        except Exception:
            # Narrowed from a bare "except:"; also fixed the typo
            # "faild" in the admin-alert prefix.
            register_exception(alert_admin=True,
                               prefix="Elsevier package download failed.")
            self.logger.error("Error downloading tar file: %s" % (filename,))
            print >> sys.stdout, "\nError downloading %s file!" % (filename,)
            print >> sys.stdout, sys.exc_info()
        # Print stuff
        sys.stdout.write(p_bar.next())
        sys.stdout.flush()
    return self.retrieved_packages_unpacked
def _get_packages(self):
    """Read every downloaded ".ready" listing and record the package
    filenames it announces together with their MD5 checksums in
    ``self.retrieved_packages``.

    :return: ``self.retrieved_packages`` (filename -> md5 mapping).
    """
    # Prints stuff
    print >> sys.stdout, "\nRetrieving packages names."
    # One progress tick per ".ready" listing processed.
    bar = progress_bar(len(self.files_list))
    sys.stdout.write(bar.next())
    sys.stdout.flush()
    for ready_pkg in self.path_r_pkg:
        self.logger.info("Retrieved package name: %s" % (ready_pkg,))
        manifest = parse(ready_pkg)
        # Each <dataset-package-file> pairs a package filename with
        # the md5 the server computed for it.
        for entry in manifest.getElementsByTagName('dataset-package-file'):
            name_node = entry.getElementsByTagName('filename')[0]
            md5_node = entry.getElementsByTagName('md5')[0]
            self.retrieved_packages[xml_to_text(name_node)] = xml_to_text(md5_node)
        # Advance the bar once per listing.
        sys.stdout.write(bar.next())
        sys.stdout.flush()
    return self.retrieved_packages
def _download_tars(self, check_integrity=True):
    """Download every tar package named in ``self.files_list``.

    Resets ``self.retrieved_packages_unpacked``, then fetches each
    package over FTP into ``CFG_TAR_FILES`` and records its local path.

    NOTE(review): this re-defines ``_download_tars`` declared earlier
    in the file; this later definition is the one bound on the class.

    :param check_integrity: when true, verify the packages' checksums
        against the server before downloading.
    :return: ``self.retrieved_packages_unpacked``.
    :raise NoNewFiles: when ``self.files_list`` is empty.
    """
    self.retrieved_packages_unpacked = []
    # Prints stuff
    if self.files_list:
        if check_integrity:
            check_pkgs_integrity(self.files_list, self.logger, self.ftp)
        print >> sys.stdout, "\nDownloading %i tar packages." \
            % (len(self.files_list))
        # Create progress bar
        p_bar = progress_bar(len(self.files_list))
        # Print stuff
        sys.stdout.write(p_bar.next())
        sys.stdout.flush()
        for filename in self.files_list:
            self.logger.info("Downloading tar package: %s" % (filename,))
            unpack_path = join(CFG_TAR_FILES, filename)
            self.retrieved_packages_unpacked.append(unpack_path)
            try:
                tar_file = open(unpack_path, 'wb')
                try:
                    self.ftp.retrbinary('RETR %s' % filename, tar_file.write)
                finally:
                    # Close even on transfer failure: no leaked handle.
                    tar_file.close()
            except Exception:
                # Narrowed from a bare "except:": log and continue
                # with the remaining packages.
                self.logger.error("Error downloading tar file: %s" % (filename,))
                print >> sys.stdout, "\nError downloading %s file!" % (filename,)
                print >> sys.stdout, sys.exc_info()
            # Print stuff
            sys.stdout.write(p_bar.next())
            sys.stdout.flush()
        return self.retrieved_packages_unpacked
    else:
        print >> sys.stdout, "No new packages to download."
        self.logger.info("No new packages to download.")
        raise NoNewFiles