Example #1
0
  def test_parallel_local_download(self):
    # Split the matched files across two downloaders and fetch them
    # concurrently with DownloadThread workers.
    source = LocalDownload(self.examples)
    (files, dirs) = source.list()
    source.match([r'^test'], files, dirs)
    head = [source.files_to_download[0]]
    tail = source.files_to_download[1:]
    source.close()

    dl_a = LocalDownload(self.examples)
    dl_a.files_to_download = head
    dl_b = LocalDownload(self.examples)
    dl_b.files_to_download = tail
    th_a = DownloadThread(dl_a, self.utils.data_dir)
    th_b = DownloadThread(dl_b, self.utils.data_dir)
    for th in (th_a, th_b):
      th.start()
    for th in (th_a, th_b):
      th.join()
    # One file went to the first worker, the remaining two to the second,
    # and every downloaded file must exist in the data directory.
    self.assertTrue(len(th_a.downloader.files_to_download) == 1)
    self.assertTrue(os.path.exists(self.utils.data_dir + '/' + head[0]['name']))
    self.assertTrue(len(th_b.downloader.files_to_download) == 2)
    self.assertTrue(os.path.exists(self.utils.data_dir + '/' + tail[0]['name']))
    self.assertTrue(os.path.exists(self.utils.data_dir + '/' + tail[1]['name']))
Example #2
0
    def test_parallel_local_download(self):
        """Download matched files with two concurrent DownloadThread workers."""
        scanner = LocalDownload(self.examples)
        (found_files, found_dirs) = scanner.list()
        scanner.match([r'^test'], found_files, found_dirs)
        first_batch = [scanner.files_to_download[0]]
        second_batch = scanner.files_to_download[1:]
        scanner.close()

        worker1 = LocalDownload(self.examples)
        worker1.files_to_download = first_batch
        worker2 = LocalDownload(self.examples)
        worker2.files_to_download = second_batch
        thread1 = DownloadThread(worker1, self.utils.data_dir)
        thread2 = DownloadThread(worker2, self.utils.data_dir)
        threads = [thread1, thread2]
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()
        # First worker handled one file, second worker the other two; all
        # of them must now be present in the data directory.
        self.assertTrue(len(thread1.downloader.files_to_download) == 1)
        self.assertTrue(
            os.path.exists(self.utils.data_dir + '/' + first_batch[0]['name']))
        self.assertTrue(len(thread2.downloader.files_to_download) == 2)
        self.assertTrue(
            os.path.exists(self.utils.data_dir + '/' + second_batch[0]['name']))
        self.assertTrue(
            os.path.exists(self.utils.data_dir + '/' + second_batch[1]['name']))
Example #3
0
    def wf_download(self):
        """
        Download remote files or use an available local copy from last production directory if possible.

        Builds the appropriate downloader(s) from the session configuration
        (single protocol or ``protocol = multi``), determines the release,
        skips files already present in the offline directory or copyable from
        the last production directory, then runs the downloads in threads.

        Side effects: sets ``self.downloaded_files`` and the session fields
        ``download_files``, ``release`` and ``remoterelease``.

        :return: False if the protocol is unsupported or a download error
                 occurred; True otherwise (including "no need to update").
        """
        logging.info("Workflow:wf_download")
        flow = self.get_flow(Workflow.FLOW_DOWNLOAD)
        downloader = None
        cf = self.session.config
        self.session.previous_release = self.session.get("previous_release")

        if cf.get("protocol") == "multi":
            """
            Search for:
            protocol = multi
            remote.file.0.protocol = directftp
            remote.file.0.server = ftp.ncbi.org
            remote.file.0.path = /musmusculus/chr1/chr1.fa

            => http://ftp2.fr.debian.org/debian/README.html?key1=value&key2=value2
            remote.file.1.protocol = directhttp
            remote.file.1.server = ftp2.fr.debian.org
            remote.file.1.path = debian/README.html
            remote.file.1.method =  GET
            remote.file.1.params.keys = key1,key2
            remote.file.1.params.key1 = value1
            remote.file.1.params.key2 = value2

            => http://ftp2.fr.debian.org/debian/README.html
                #POST PARAMS:
                  key1=value
                  key2=value2
            remote.file.1.protocol = directhttp
            remote.file.1.server = ftp2.fr.debian.org
            remote.file.1.path = debian/README.html
            remote.file.1.method =  POST
            remote.file.1.params.keys = key1,key2
            remote.file.1.params.key1 = value1
            remote.file.1.params.key2 = value2

            ......
            """
            downloader = MultiDownload()
            downloaders = []
            # Creates multiple downloaders, one per remote.file.<i>.path entry
            i = 0
            rfile = cf.get("remote.file." + str(i) + ".path")
            while rfile is not None:
                # Per-file protocol/server/credentials override the global ones
                if cf.get("remote.file." + str(i) + ".protocol") is not None:
                    protocol = cf.get("remote.file." + str(i) + ".protocol")
                else:
                    protocol = cf.get("protocol")
                if cf.get("remote.file." + str(i) + ".server") is not None:
                    server = cf.get("remote.file." + str(i) + ".server")
                else:
                    server = cf.get("server")
                subdownloader = self.get_handler(protocol, server, "", [cf.get("remote.file." + str(i) + ".path")])
                if cf.get("remote.file." + str(i) + ".credentials") is not None:
                    credentials = cf.get("remote.file." + str(i) + ".credentials")
                else:
                    credentials = cf.get("server.credentials")
                if credentials is not None:
                    subdownloader.set_credentials(credentials)
                if protocol == "directhttp":
                    subdownloader.method = cf.get("remote.file." + str(i) + ".method")
                    if subdownloader.method is None:
                        subdownloader.method = "GET"
                    if cf.get("remote.file." + str(i) + ".name"):
                        subdownloader.save_as = cf.get("remote.file." + str(i) + ".name")
                    else:
                        subdownloader.save_as = cf.get("remote.file." + str(i) + ".path")
                    if cf.get("remote.file." + str(i) + ".method"):
                        subdownloader.method = cf.get("remote.file." + str(i) + ".method").strip().upper()
                    subdownloader.params = {}
                    keys = cf.get("remote.file." + str(i) + ".params.keys")
                    if keys is not None:
                        keys = keys.split(",")
                        for key in keys:
                            param = cf.get("remote.file." + str(i) + ".params." + key.strip())
                            # BUGFIX: was 'subdownloader.param[...]' — the dict
                            # initialized above is named 'params'; the old code
                            # raised AttributeError whenever params.keys was set.
                            subdownloader.params[key.strip()] = param.strip()
                downloaders.append(subdownloader)
                i += 1
                rfile = cf.get("remote.file." + str(i) + ".path")
            downloader.add_downloaders(downloaders)

        else:
            """
            Simple case, one downloader with regexp
            """
            protocol = cf.get("protocol")
            if protocol == "directhttp" or protocol == "directftp":
                downloader = self.get_handler(cf.get("protocol"), cf.get("server"), "/", [cf.get("remote.dir")[:-1]])
                downloader.method = cf.get("url.method")
                if downloader.method is None:
                    downloader.method = "GET"
                downloader.save_as = cf.get("target.name")
                keys = cf.get("url.params")
                if keys is not None:
                    keys = keys.split(",")
                    # BUGFIX: the params dict was never initialized here and the
                    # old code wrote to the non-existent 'param' attribute,
                    # raising AttributeError whenever url.params was set.
                    downloader.params = {}
                    for key in keys:
                        param = cf.get(key.strip() + ".value")
                        downloader.params[key.strip()] = param.strip()
            else:
                downloader = self.get_handler(cf.get("protocol"), cf.get("server"), cf.get("remote.dir"))

        if downloader is None:
            logging.error("Protocol " + cf.get("protocol") + " not supported")
            return False

        (file_list, dir_list) = downloader.list()

        downloader.match(cf.get("remote.files").split(), file_list, dir_list)
        # Compute the local save path for each file; when the remote.files
        # regexp captures groups, the save path is rebuilt from those groups.
        for f in downloader.files_to_download:
            f["save_as"] = f["name"]
            for p in cf.get("remote.files").split():
                res = re.match("/" + p, f["name"])
                if res is not None and res.groups() is not None and len(res.groups()) >= 1:
                    f["save_as"] = "/".join(res.groups())
                    break

        self.session.set("download_files", downloader.files_to_download)
        if self.session.get("release") is None:
            # Not defined, or could not get it earlier
            # Set release to most recent file to download
            release_dict = Utils.get_more_recent_file(downloader.files_to_download)
            if release_dict is None:
                today = datetime.datetime.now()
                release_dict = {"year": today.year, "month": today.month, "day": today.day}

            release = str(release_dict["year"]) + "-" + str(release_dict["month"]) + "-" + str(release_dict["day"])
            self.session.set("release", release)
            self.session.set("remoterelease", release)
            # We restart from scratch, check if directory with this release already exists
            if self.options.get_option(Options.FROMSCRATCH):
                index = 0
                # Release directory exits, set index to 1
                if os.path.exists(self.session.get_full_release_directory()):
                    index = 1
                for x in range(1, 100):
                    if os.path.exists(self.session.get_full_release_directory() + "__" + str(x)):
                        index = x + 1

                # If we found a directory for this release:   XX or XX__Y
                if index > 0:
                    self.session.set("release", release + "__" + str(index))
                    release = release + "__" + str(index)
            logging.info("Workflow:wf_download:release:remoterelease:" + self.session.get("remoterelease"))
            logging.info("Workflow:wf_download:release:release:" + release)
            MongoConnector.banks.update({"name": self.bank.name}, {"$set": {"status.release.progress": str(release)}})
            self.download_go_ahead = False
            if self.options.get_option(Options.FROM_TASK) == "download":
                # We want to download again in same release, that's fine, we do not care it is the same release
                self.download_go_ahead = True
            if not self.download_go_ahead and self.session.previous_release == self.session.get("remoterelease"):
                logging.info("Workflow:wf_release:same_as_previous_session")
                return self.no_need_to_update()

        self.banks = MongoConnector.banks
        self.bank.bank = self.banks.find_one({"name": self.name})

        nb_prod_dir = len(self.bank.bank["production"])
        offline_dir = self.session.get_offline_directory()

        copied_files = []

        # Check if already in offlinedir
        keep_files = []
        if os.path.exists(offline_dir):
            for file_to_download in downloader.files_to_download:
                # If file is in offline dir and has same date and size, do not download again
                if os.path.exists(offline_dir + "/" + file_to_download["name"]):
                    try:
                        file_stat = os.stat(offline_dir + "/" + file_to_download["name"])
                        f_stat = datetime.datetime.fromtimestamp(
                            os.path.getmtime(offline_dir + "/" + file_to_download["name"])
                        )
                        year = str(f_stat.year)
                        month = str(f_stat.month)
                        day = str(f_stat.day)
                        if (
                            str(file_stat.st_size) != str(file_to_download["size"])
                            or str(year) != str(file_to_download["year"])
                            or str(month) != str(file_to_download["month"])
                            or str(day) != str(file_to_download["day"])
                        ):
                            logging.debug("Workflow:wf_download:different_from_offline:" + file_to_download["name"])
                            keep_files.append(file_to_download)
                        else:
                            logging.debug("Workflow:wf_download:offline:" + file_to_download["name"])
                    except Exception as e:
                        # Could not get stats on file: remove the stale copy
                        # and schedule a fresh download
                        os.remove(offline_dir + "/" + file_to_download["name"])
                        keep_files.append(file_to_download)
                else:
                    keep_files.append(file_to_download)
            downloader.files_to_download = keep_files

        self.download_go_ahead = False
        if self.options.get_option(Options.FROM_TASK) == "download":
            # We want to download again in same release, that's fine, we do not care it is the same release
            self.download_go_ahead = True

        if not self.options.get_option(Options.FROMSCRATCH) and not self.download_go_ahead and nb_prod_dir > 0:
            # Get last production
            last_production = self.bank.bank["production"][nb_prod_dir - 1]
            # Get session corresponding to production directory
            last_production_session = self.banks.find_one(
                {"name": self.name, "sessions.id": last_production["session"]}, {"sessions.$": 1}
            )
            last_production_dir = os.path.join(
                last_production["data_dir"], cf.get("dir.version"), last_production["release"]
            )
            # Checks if some files can be copied instead of downloaded
            downloader.download_or_copy(last_production_session["sessions"][0]["files"], last_production_dir)
            if len(downloader.files_to_download) == 0:
                return self.no_need_to_update()

            logging.debug("Workflow:wf_download:Copy files from " + last_production_dir)
            copied_files = downloader.files_to_copy
            Utils.copy_files(downloader.files_to_copy, offline_dir)

        downloader.close()

        DownloadThread.NB_THREAD = int(self.session.config.get("files.num.threads"))

        if cf.get("protocol") == "multi":
            thlist = DownloadThread.get_threads_multi(downloader.downloaders, offline_dir)
        else:
            thlist = DownloadThread.get_threads(downloader, offline_dir)

        running_th = []
        for th in thlist:
            running_th.append(th)
            th.start()

        while len(running_th) > 0:
            try:
                # Join all threads using a timeout so it doesn't block.
                # BUGFIX: the old comprehension kept the return value of
                # join() (always None) instead of the thread itself, so the
                # list degenerated to [None, ...], the debug log printed Nones
                # and the KeyboardInterrupt handler crashed on t.downloader.
                for t in running_th:
                    if t is not None:
                        t.join(1000)
                running_th = [t for t in running_th if t is not None and t.isAlive()]
                logging.debug("Workflow:wf_download:Download:Threads:" + str(running_th))
            except KeyboardInterrupt:
                logging.warn("Ctrl-c received! Sending kill to threads...")
                logging.warn("Running tasks will continue and process will stop.")
                for t in running_th:
                    t.downloader.kill_received = True
        logging.info("Workflow:wf_download:Download:Threads:Over")
        is_error = False
        for th in thlist:
            if th.error:
                is_error = True
                downloader.error = True
                break
        self.downloaded_files = downloader.files_to_download + copied_files

        if downloader.error:
            logging.error("An error occured during download")
            return False

        return True