Example #1
 def test_copy(self):
   from_dir = os.path.dirname(os.path.realpath(__file__))
   local_file = 'biomaj_tests.py'
   files_to_copy = [ {'root': from_dir, 'name': local_file}]
   to_dir = self.utils.data_dir
   Utils.copy_files(files_to_copy, to_dir)
   self.assertTrue(os.path.exists(to_dir+'/biomaj_tests.py'))
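Note: test_copy exercises Utils.copy_files, which takes a list of {'root': ..., 'name': ...} dicts and copies each root/name file into the target directory. A minimal sketch of that contract, assuming plain file copies (illustrative only, not the actual BioMAJ implementation):

import os
import shutil

def copy_files(files_to_copy, to_dir):
    # Hypothetical sketch of the Utils.copy_files contract shown in the tests.
    for f in files_to_copy:
        src = os.path.join(f['root'], f['name'])
        dst = os.path.join(to_dir, f['name'])
        # 'name' may contain subdirectories (e.g. 'bank/test.fasta.gz')
        os.makedirs(os.path.dirname(dst), exist_ok=True)
        shutil.copyfile(src, dst)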
Example #2
 def test_copy(self):
     from_dir = os.path.dirname(os.path.realpath(__file__))
     local_file = "biomaj_tests.py"
     files_to_copy = [{"root": from_dir, "name": local_file}]
     to_dir = self.utils.data_dir
     Utils.copy_files(files_to_copy, to_dir)
     self.assertTrue(os.path.exists(to_dir + "/biomaj_tests.py"))
Example #3
    def test_uncompress(self):
        from_file = {"root": os.path.dirname(os.path.realpath(__file__)), "name": "bank/test.fasta.gz"}

        to_dir = self.utils.data_dir
        Utils.copy_files([from_file], to_dir)
        Utils.uncompress(os.path.join(to_dir, from_file["name"]))
        self.assertTrue(os.path.exists(to_dir + "/bank/test.fasta"))
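Utils.uncompress is expected to expand the archive in place, so bank/test.fasta.gz becomes bank/test.fasta in the same directory. A rough gzip-only sketch of that behavior, assuming the real utility handles more archive formats:

import gzip
import os
import shutil

def uncompress(archive_path):
    # Gzip-only sketch: write the decompressed data next to the archive,
    # then drop the archive; return a status like the callers below expect.
    if not archive_path.endswith('.gz'):
        return False
    try:
        with gzip.open(archive_path, 'rb') as f_in, \
                open(archive_path[:-3], 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        os.remove(archive_path)
        return True
    except OSError:
        return False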
Example #4
 def test_copy(self):
     from_dir = os.path.dirname(os.path.realpath(__file__))
     local_file = 'biomaj_tests.py'
     files_to_copy = [{'root': from_dir, 'name': local_file}]
     to_dir = self.utils.data_dir
     Utils.copy_files(files_to_copy, to_dir)
     self.assertTrue(os.path.exists(to_dir + '/biomaj_tests.py'))
Example #5
  def test_uncompress(self):
    from_file = { 'root': os.path.dirname(os.path.realpath(__file__)),
                  'name': 'bank/test.fasta.gz'
                  }

    to_dir = self.utils.data_dir
    Utils.copy_files([from_file], to_dir)
    Utils.uncompress(os.path.join(to_dir, from_file['name']))
    self.assertTrue(os.path.exists(to_dir+'/bank/test.fasta'))
Example #6
    def test_uncompress(self):
        from_file = {
            'root': os.path.dirname(os.path.realpath(__file__)),
            'name': 'bank/test.fasta.gz'
        }

        to_dir = self.utils.data_dir
        Utils.copy_files([from_file], to_dir)
        Utils.uncompress(os.path.join(to_dir, from_file['name']))
        self.assertTrue(os.path.exists(to_dir + '/bank/test.fasta'))
Example #7
 def wf_uncompress(self):
     '''
     Uncompress downloaded files if they are archives and no.extract = false
     '''
     logging.info('Workflow:wf_uncompress')
     no_extract = self.session.config.get('no.extract')
     if no_extract is None or no_extract == 'false':
         for file in self.downloaded_files:
             if 'save_as' not in file:
                 file['save_as'] = file['name']
             Utils.uncompress(self.session.get_offline_directory() + '/' + file['save_as'])
     return True
Example #8
    def download(self, local_dir):
        '''
        Copy local files to local_dir

        :param local_dir: Directory where files should be copied
        :type local_dir: str
        :return: list of downloaded files
        '''
        logging.debug('Local:Download')
        Utils.copy_files(self.files_to_download, local_dir, lock=self.mkdir_lock)

        return self.files_to_download
Example #9
    def download(self, local_dir):
        '''
        Copy local files to local_dir

        :param local_dir: Directory where files should be copied
        :type local_dir: str
        :return: list of downloaded files
        '''
        logging.debug('Local:Download')
        Utils.copy_files(self.files_to_download,
                         local_dir,
                         lock=self.mkdir_lock)

        return self.files_to_download
Example #10
 def test_get_more_recent_file(self):
     files = [{
         'name': '/test1',
         'year': '2013',
         'month': '11',
         'day': '10',
         'size': 10
     }, {
         'name': '/test2',
         'year': '2013',
         'month': '11',
         'day': '12',
         'size': 10
     }, {
         'name': '/test/test1',
         'year': '1988',
         'month': '11',
         'day': '10',
         'size': 10
     }, {
         'name': '/test/test11',
         'year': '2013',
         'month': '9',
         'day': '23',
         'size': 10
     }]
     release = Utils.get_more_recent_file(files)
     self.assertTrue(release['year'] == '2013')
     self.assertTrue(release['month'] == '11')
     self.assertTrue(release['day'] == '12')
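Each variant of this test asserts that Utils.get_more_recent_file returns the entry with the latest (year, month, day). A plausible implementation matching that behavior (an assumption, sketched from the tests alone):

def get_more_recent_file(files):
    # Return the dict with the most recent year/month/day, or None if empty.
    if not files:
        return None
    return max(files, key=lambda f: (int(f['year']), int(f['month']), int(f['day'])))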
Example #11
 def test_get_more_recent_file(self):
     files = [
         {"name": "/test1", "year": "2013", "month": "11", "day": "10", "size": 10},
         {"name": "/test2", "year": "2013", "month": "11", "day": "12", "size": 10},
         {"name": "/test/test1", "year": "1988", "month": "11", "day": "10", "size": 10},
         {"name": "/test/test11", "year": "2013", "month": "9", "day": "23", "size": 10},
     ]
     release = Utils.get_more_recent_file(files)
     self.assertTrue(release["year"] == "2013")
     self.assertTrue(release["month"] == "11")
     self.assertTrue(release["day"] == "12")
Example #12
 def test_get_more_recent_file(self):
   files = [
         {'name':'/test1', 'year': '2013', 'month': '11', 'day': '10', 'size': 10},
         {'name':'/test2', 'year': '2013', 'month': '11', 'day': '12', 'size': 10},
         {'name':'/test/test1', 'year': '1988', 'month': '11', 'day': '10', 'size': 10},
         {'name':'/test/test11', 'year': '2013', 'month': '9', 'day': '23', 'size': 10}
         ]
   release = Utils.get_more_recent_file(files)
   self.assertTrue(release['year']=='2013')
   self.assertTrue(release['month']=='11')
   self.assertTrue(release['day']=='12')
Example #13
 def wf_stats(self):
     '''
     Get some stats from current release data dir
     '''
     logging.info('Workflow:wf_stats')
     do_stats = self.bank.config.get('data.stats')
     if do_stats is None or do_stats == '0':
         self.session.set('fullsize', 0)
         return True
     prod_dir = self.session.get_full_release_directory()
     dir_size = Utils.get_folder_size(prod_dir)
     self.session.set('fullsize', dir_size)
     return True
Example #14
 def wf_stats(self):
     """
     Get some stats from current release data dir
     """
     logging.info("Workflow:wf_stats")
     do_stats = self.bank.config.get("data.stats")
     if do_stats is None or do_stats == "0":
         self.session.set("fullsize", 0)
         return True
     prod_dir = self.session.get_full_release_directory()
     dir_size = Utils.get_folder_size(prod_dir)
     self.session.set("fullsize", dir_size)
     return True
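wf_stats delegates the measurement to Utils.get_folder_size; a typical recursive implementation of such a helper (assumed, not taken from BioMAJ):

import os

def get_folder_size(folder):
    # Sum the sizes of all regular files below folder, in bytes.
    total = 0
    for path, dirs, files in os.walk(folder):
        for name in files:
            total += os.path.getsize(os.path.join(path, name))
    return total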
Example #15
    def wf_copy(self):
        '''
        Copy files from offline directory to release directory
        '''
        logging.info('Workflow:wf_copy')
        from_dir = os.path.join(self.session.config.get('data.dir'),
                      self.session.config.get('offline.dir.name'))
        regexp = self.session.config.get('local.files').split()
        to_dir = os.path.join(self.session.config.get('data.dir'),
                      self.session.config.get('dir.version'),
                      self.session.get_release_directory(), 'flat')

        local_files = Utils.copy_files_with_regexp(from_dir,to_dir,regexp, True)
        self.session._session['files'] = local_files
        if len(self.session._session['files']) == 0:
            logging.error('Workflow:wf_copy:No file match in offline dir')
            return False
        return True
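Utils.copy_files_with_regexp walks the source directory and copies every file whose relative path matches one of the regular expressions, returning the matched entries. The trailing boolean passed above is not specified in these snippets, so this simplified sketch ignores it (an assumed contract, not the actual BioMAJ code):

import os
import re
import shutil

def copy_files_with_regexp(from_dir, to_dir, regexps):
    # Copy files whose path relative to from_dir matches any regexp.
    matched = []
    for path, dirs, files in os.walk(from_dir):
        for name in files:
            rel = os.path.relpath(os.path.join(path, name), from_dir)
            if any(re.match(regexp, rel) for regexp in regexps):
                dst = os.path.join(to_dir, rel)
                os.makedirs(os.path.dirname(dst), exist_ok=True)
                shutil.copyfile(os.path.join(path, name), dst)
                matched.append({'name': rel})
    return matched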
Example #16
def main():

    parser = argparse.ArgumentParser()

    parser.add_argument('-s',
                        '--scan',
                        dest="directory",
                        help="Directory to scan")
    parser.add_argument('--type', dest="ftype", help="Files type")
    parser.add_argument(
        '--tags',
        dest="tags",
        action="append",
        default=[],
        help="tags, format key:value, can be repeated multiple times")

    args = parser.parse_args()

    if not os.path.exists(args.directory):
        sys.exit(1)

    res = {}
    for (path, dirs, files) in os.walk(args.directory):
        for file in files:
            filename = os.path.join(path, file)
            (file_format, mime) = Utils.detect_format(filename)
            if file_format is not None:
                file_format = file_format.replace('application/', '')
            filename = filename.replace(args.directory + '/', '')
            if file_format is not None:
                if file_format not in res:
                    res[file_format] = [filename]
                else:
                    res[file_format].append(filename)

    f_type = ''
    if args.ftype:
        f_type = args.ftype
    tags = ''
    if args.tags:
        tags = ','.join(args.tags)
    for fformat in res.keys():
        print('##BIOMAJ#' + fformat + '#' + f_type + '#' + tags + '#' + ','.join(
            res[fformat]))
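The script prints one summary line per detected format, in the form ##BIOMAJ#<format>#<type>#<tags>#<comma-separated files>. A consumer could split such a line back apart like this (illustrative example with made-up values):

line = '##BIOMAJ#fasta#genome#species:mouse#chr1.fa,chr2.fa'
_, _, marker, fformat, ftype, tags, files = line.split('#', 6)
assert marker == 'BIOMAJ'
print(fformat, ftype, tags.split(','), files.split(','))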
Example #17
    def wf_copy(self):
        """
        Copy files from offline directory to release directory
        """
        logging.info("Workflow:wf_copy")
        from_dir = os.path.join(self.session.config.get("data.dir"), self.session.config.get("offline.dir.name"))
        regexp = self.session.config.get("local.files").split()
        to_dir = os.path.join(
            self.session.config.get("data.dir"),
            self.session.config.get("dir.version"),
            self.session.get_release_directory(),
            "flat",
        )

        local_files = Utils.copy_files_with_regexp(from_dir, to_dir, regexp, True)
        self.session._session["files"] = local_files
        if len(self.session._session["files"]) == 0:
            logging.error("Workflow:wf_copy:No file match in offline dir")
            return False
        return True
Example #18
def main():

    parser = argparse.ArgumentParser()

    parser.add_argument("-s", "--scan", dest="directory", help="Directory to scan")
    parser.add_argument("--type", dest="ftype", help="Files type")
    parser.add_argument(
        "--tags",
        dest="tags",
        action="append",
        default=[],
        help="tags, format key:value, can be repeated multiple times",
    )

    args = parser.parse_args()

    if not os.path.exists(args.directory):
        sys.exit(1)

    res = {}
    for (path, dirs, files) in os.walk(args.directory):
        for file in files:
            filename = os.path.join(path, file)
            (file_format, mime) = Utils.detect_format(filename)
            if file_format is not None:
                file_format = file_format.replace("application/", "")
            filename = filename.replace(args.directory + "/", "")
            if file_format is not None:
                if file_format not in res:
                    res[file_format] = [filename]
                else:
                    res[file_format].append(filename)

    f_type = ""
    if args.ftype:
        f_type = args.ftype
    tags = ""
    if args.tags:
        tags = ",".join(args.tags)
    for fformat in res.keys():
        print "##BIOMAJ#" + fformat + "#" + f_type + "#" + tags + "#" + ",".join(res[fformat])
Example #19
 def wf_uncompress(self):
     """
     Uncompress downloaded files if they are archives and no.extract = false
     """
     logging.info("Workflow:wf_uncompress")
     no_extract = self.session.config.get("no.extract")
     if no_extract is None or no_extract == "false":
         for file in self.downloaded_files:
             if "save_as" not in file:
                 file["save_as"] = file["name"]
             nb_try = 1
             not_ok = True
             while nb_try < 3 and not_ok:
                 status = Utils.uncompress(self.session.get_offline_directory() + "/" + file["save_as"])
                 if status:
                     not_ok = False
                 else:
                     logging.warn("Workflow:wf_uncompress:Failure:" + file["name"] + ":" + str(nb_try))
                     nb_try += 1
             if not_ok:
                 logging.error("Workflow:wf_uncompress:Failure:" + file["name"])
                 return False
     return True
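This variant wraps Utils.uncompress in a small retry loop (two attempts before giving up). The same pattern, factored into a generic helper (a sketch, not part of BioMAJ):

def retry(action, attempts=2):
    # Call action() until it returns a truthy status, at most `attempts` times.
    for _ in range(attempts):
        if action():
            return True
    return False

# Usage: ok = retry(lambda: Utils.uncompress(archive_path))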
Example #20
 def test_mimes(self):
   fasta_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),'bank/test2.fasta')
   (mime, encoding) = Utils.detect_format(fasta_file)
   self.assertTrue('application/fasta' == mime)
Example #21
 def test_mimes(self):
     fasta_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                               'bank/test2.fasta')
     (mime, encoding) = Utils.detect_format(fasta_file)
     self.assertTrue('application/fasta' == mime)
Example #22
    def list(self, directory=''):
        '''
        List FTP directory

        :return: tuple of files and dirs in the current directory, with details
        '''
        logging.debug('Download:List:' + self.url + self.rootdir + directory)
        #self.crl.setopt(pycurl.URL, self.url+self.rootdir+directory)
        try:
            self.crl.setopt(pycurl.URL, self.url + self.rootdir + directory)
        except Exception as a:
            self.crl.setopt(pycurl.URL,
                            (self.url + self.rootdir + directory).encode(
                                'ascii', 'ignore'))

        if self.proxy is not None:
            self.crl.setopt(pycurl.PROXY, self.proxy)
            if self.proxy_auth is not None:
                self.crl.setopt(pycurl.PROXYUSERPWD, self.proxy_auth)

        if self.credentials is not None:
            self.crl.setopt(pycurl.USERPWD, self.credentials)
        output = BytesIO()
        # lets assign this buffer to pycurl object
        self.crl.setopt(pycurl.WRITEFUNCTION, output.write)
        self.crl.setopt(pycurl.HEADERFUNCTION, self.header_function)

        self.crl.setopt(pycurl.CONNECTTIMEOUT, 300)
        # Download should not take more than 5 minutes
        self.crl.setopt(pycurl.TIMEOUT, self.timeout)
        self.crl.setopt(pycurl.NOSIGNAL, 1)
        try:
            self.crl.perform()
        except Exception as e:
            logging.error('Could not get errcode:' + str(e))

        # Figure out what encoding was sent with the response, if any.
        # Check against lowercased header name.
        encoding = None
        if 'content-type' in self.headers:
            content_type = self.headers['content-type'].lower()
            match = re.search(r'charset=(\S+)', content_type)
            if match:
                encoding = match.group(1)
        if encoding is None:
            # Default encoding for HTML is iso-8859-1.
            # Other content types may have different default encoding,
            # or in case of binary data, may have no encoding at all.
            encoding = 'iso-8859-1'

        # lets get the output in a string
        result = output.getvalue().decode(encoding)

        # FTP LIST output is separated by \r\n
        # lets split the output in lines
        #lines = result.split(r'[\r\n]+')
        lines = re.split(r'[\n\r]+', result)
        # lets walk through each line
        rfiles = []
        rdirs = []

        for line in lines:
            rfile = {}
            # split the line into its whitespace-separated parts
            parts = line.split()
            # the individual fields in this list of parts
            if not parts: continue
            rfile['permissions'] = parts[0]
            rfile['group'] = parts[2]
            rfile['user'] = parts[3]
            rfile['size'] = parts[4]
            rfile['month'] = Utils.month_to_num(parts[5])
            rfile['day'] = parts[6]
            rfile['hash'] = hashlib.md5(line.encode('utf-8')).hexdigest()
            try:
                rfile['year'] = int(parts[7])
            except Exception as e:
                # Specific FTP case: date info could not be parsed
                curdate = datetime.now()
                rfile['year'] = curdate.year
                # Year not given; a month greater than the current one means the previous year
                if rfile['month'] > curdate.month:
                    rfile['year'] = curdate.year - 1
                # Same month but later day => previous year
                if rfile['month'] == curdate.month and int(
                        rfile['day']) > curdate.day:
                    rfile['year'] = curdate.year - 1
            rfile['name'] = parts[8]
            if len(parts) >= 10 and parts[9] == '->':
                # Symlink, add to files AND dirs as we don't know the type of the link
                rdirs.append(rfile)

            is_dir = False
            if re.match('^d', rfile['permissions']):
                is_dir = True

            if not is_dir:
                rfiles.append(rfile)
            else:
                rdirs.append(rfile)
        return (rfiles, rdirs)
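The listing relies on self.header_function to fill self.headers before the body is decoded. pycurl's documentation suggests a callback along these lines (a sketch; the actual method in this class may differ, and it assumes self.headers is reset to an empty dict before each perform()):

def header_function(self, header_line):
    # HTTP headers are transferred as bytes; per the standard they are iso-8859-1.
    header_line = header_line.decode('iso-8859-1')
    # Skip the status line and the blank line terminating the headers.
    if ':' not in header_line:
        return
    name, value = header_line.split(':', 1)
    # Store under a lowercased name, matching the 'content-type' lookup above.
    self.headers[name.strip().lower()] = value.strip()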
Example #23
    def list(self, directory=''):
        '''
        Try to get file headers to get last_modification and size
        '''
        for file in self.files_to_download:
            self.crl.setopt(pycurl.HEADER, True)
            if self.credentials is not None:
                self.crl.setopt(pycurl.USERPWD, self.credentials)

            if self.proxy is not None:
                self.crl.setopt(pycurl.PROXY, self.proxy)
                if self.proxy_auth is not None:
                    self.crl.setopt(pycurl.PROXYUSERPWD, self.proxy_auth)

            self.crl.setopt(pycurl.NOBODY, True)
            try:
                self.crl.setopt(pycurl.URL, self.url+self.rootdir+file['name'])
            except Exception as a:
                self.crl.setopt(pycurl.URL, (self.url+self.rootdir+file['name']).encode('ascii','ignore'))
            #self.crl.setopt(pycurl.URL, self.url+self.rootdir+file['name'])
            output = BytesIO()
            # lets assign this buffer to pycurl object
            self.crl.setopt(pycurl.WRITEFUNCTION, output.write)
            self.crl.setopt(pycurl.HEADERFUNCTION, self.header_function)
            self.crl.perform()

            # Figure out what encoding was sent with the response, if any.
            # Check against lowercased header name.
            encoding = None
            if 'content-type' in self.headers:
                content_type = self.headers['content-type'].lower()
                match = re.search(r'charset=(\S+)', content_type)
                if match:
                    encoding = match.group(1)
            if encoding is None:
                # Default encoding for HTML is iso-8859-1.
                # Other content types may have different default encoding,
                # or in case of binary data, may have no encoding at all.
                encoding = 'iso-8859-1'

            # lets get the output in a string
            result = output.getvalue().decode(encoding)

            lines = re.split(r'[\n\r]+', result)
            for line in lines:
                parts = line.split(':')
                if parts[0].strip() == 'Content-Length':
                    file['size'] = parts[1].strip()
                if parts[0].strip() == 'Last-Modified':
                    # Sun, 06 Nov 1994
                    res = re.match(r'(\w+),\s+(\d+)\s+(\w+)\s+(\d+)', parts[1].strip())
                    if res:
                        file['day'] = res.group(2)
                        file['month'] = Utils.month_to_num(res.group(3))
                        file['year'] = res.group(4)
                        continue
                    #Sunday, 06-Nov-94
                    res = re.match(r'(\w+),\s+(\d+)-(\w+)-(\d+)', parts[1].strip())
                    if res:
                        file['day'] = res.group(2)
                        file['month'] = Utils.month_to_num(res.group(3))
                        file['year'] = str(2000 + int(res.group(4)))
                        continue
                    #Sun Nov  6 08:49:37 1994
                    res = re.match(r'(\w+)\s+(\w+)\s+(\d+)\s+\d{2}:\d{2}:\d{2}\s+(\d+)', parts[1].strip())
                    if res:
                        file['day'] = res.group(3)
                        file['month'] = Utils.month_to_num(res.group(2))
                        file['year'] = res.group(4)
                        continue
        return (self.files_to_download,[])
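The listings above hand abbreviated month names ('Nov', 'Jul') to Utils.month_to_num. Assuming three-letter English abbreviations, it could be as simple as (hypothetical sketch):

def month_to_num(month):
    # Map 'Jan'..'Dec' (case-insensitive, longer names truncated) to 1..12.
    months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
              'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    return months.index(month[:3].title()) + 1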
Example #24
 def test_copy_with_regexp(self):
   from_dir = os.path.dirname(os.path.realpath(__file__))
   to_dir = self.utils.data_dir
   Utils.copy_files_with_regexp(from_dir, to_dir, [r'.*\.py'])
   self.assertTrue(os.path.exists(to_dir+'/biomaj_tests.py'))
Example #25
 def test_copy_with_regexp(self):
     from_dir = os.path.dirname(os.path.realpath(__file__))
     to_dir = self.utils.data_dir
     Utils.copy_files_with_regexp(from_dir, to_dir, [r'.*\.py'])
     self.assertTrue(os.path.exists(to_dir + '/biomaj_tests.py'))
Example #26
    def list(self, directory=''):
        '''
        List FTP directory

        :return: tuple of files and dirs in the current directory, with details
        '''
        logging.debug('Download:List:'+self.url+self.rootdir+directory)
        #self.crl.setopt(pycurl.URL, self.url+self.rootdir+directory)
        try:
            self.crl.setopt(pycurl.URL, self.url+self.rootdir+directory)
        except Exception as a:
            self.crl.setopt(pycurl.URL, (self.url+self.rootdir+directory).encode('ascii','ignore'))

        if self.proxy is not None:
            self.crl.setopt(pycurl.PROXY, self.proxy)
            if self.proxy_auth is not None:
                self.crl.setopt(pycurl.PROXYUSERPWD, self.proxy_auth)

        if self.credentials is not None:
            self.crl.setopt(pycurl.USERPWD, self.credentials)

        output = BytesIO()
        # lets assign this buffer to pycurl object
        self.crl.setopt(pycurl.WRITEFUNCTION, output.write)
        self.crl.setopt(pycurl.HEADERFUNCTION, self.header_function)
        self.crl.perform()
        # Figure out what encoding was sent with the response, if any.
        # Check against lowercased header name.
        encoding = None
        if 'content-type' in self.headers:
            content_type = self.headers['content-type'].lower()
            match = re.search(r'charset=(\S+)', content_type)
            if match:
                encoding = match.group(1)
        if encoding is None:
            # Default encoding for HTML is iso-8859-1.
            # Other content types may have different default encoding,
            # or in case of binary data, may have no encoding at all.
            encoding = 'iso-8859-1'

        # lets get the output in a string
        result = output.getvalue().decode(encoding)
        '''
        'http.parse.dir.line': r'<a[\s]+href="([\S]+)/".*alt="\[DIR\]">.*([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})',
        'http.parse.file.line': r'<a[\s]+href="([\S]+)".*([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})[\s]+([\d\.]+[MKG]{0,1})',
        'http.group.dir.name': 1,
        'http.group.dir.date': 2,
        'http.group.file.name': 1,
        'http.group.file.date': 2,
        'http.group.file.size': 3,
        '''

        rfiles = []
        rdirs = []

        dirs = re.findall(self.config.get('http.parse.dir.line'), result)
        if dirs is not None and len(dirs)>0:
            for dir in dirs:
                rfile = {}
                rfile['permissions'] = ''
                rfile['group'] = ''
                rfile['user'] = ''
                rfile['size'] = '0'
                date = dir[int(self.config.get('http.group.dir.date'))-1]
                dirdate = date.split()
                parts = dirdate[0].split('-')
                #19-Jul-2014 13:02
                rfile['month'] = Utils.month_to_num(parts[1])
                rfile['day'] = parts[0]
                rfile['year'] = parts[2]
                rfile['name'] = dir[int(self.config.get('http.group.dir.name'))-1]
                rdirs.append(rfile)

        files = re.findall(self.config.get('http.parse.file.line'), result)
        if files is not None and len(files)>0:
            for file in files:
                rfile = {}
                rfile['permissions'] = ''
                rfile['group'] = ''
                rfile['user'] = ''
                rfile['size'] = file[int(self.config.get('http.group.file.size'))-1]
                date = file[int(self.config.get('http.group.file.date'))-1]
                dirdate = date.split()
                parts = dirdate[0].split('-')
                #19-Jul-2014 13:02
                rfile['month'] = Utils.month_to_num(parts[1])
                rfile['day'] = parts[0]
                rfile['year'] = parts[2]
                rfile['name'] = file[int(self.config.get('http.group.file.name'))-1]
                rfiles.append(rfile)

        return (rfiles, rdirs)
Example #27
    def list(self, directory=''):
        '''
        List FTP directory

        :return: tuple of files and dirs in the current directory, with details
        '''
        logging.debug('Download:List:'+self.url+self.rootdir+directory)
        #self.crl.setopt(pycurl.URL, self.url+self.rootdir+directory)
        try:
            self.crl.setopt(pycurl.URL, self.url+self.rootdir+directory)
        except Exception as a:
            self.crl.setopt(pycurl.URL, (self.url+self.rootdir+directory).encode('ascii', 'ignore'))

        if self.proxy is not None:
            self.crl.setopt(pycurl.PROXY, self.proxy)
            if self.proxy_auth is not None:
                self.crl.setopt(pycurl.PROXYUSERPWD, self.proxy_auth)

        if self.credentials is not None:
            self.crl.setopt(pycurl.USERPWD, self.credentials)
        output = BytesIO()
        # lets assign this buffer to pycurl object
        self.crl.setopt(pycurl.WRITEFUNCTION, output.write)
        self.crl.setopt(pycurl.HEADERFUNCTION, self.header_function)
        self.crl.perform()
        # Figure out what encoding was sent with the response, if any.
        # Check against lowercased header name.
        encoding = None
        if 'content-type' in self.headers:
            content_type = self.headers['content-type'].lower()
            match = re.search(r'charset=(\S+)', content_type)
            if match:
                encoding = match.group(1)
        if encoding is None:
            # Default encoding for HTML is iso-8859-1.
            # Other content types may have different default encoding,
            # or in case of binary data, may have no encoding at all.
            encoding = 'iso-8859-1'

        # lets get the output in a string
        result = output.getvalue().decode(encoding)

        # FTP LIST output is separated by \r\n
        # lets split the output in lines
        #lines = result.split(r'[\r\n]+')
        lines = re.split(r'[\n\r]+', result)
        # lets walk through each line
        rfiles = []
        rdirs = []

        for line in lines:
            rfile = {}
            # split the line into its whitespace-separated parts
            parts = line.split()
            # the individual fields in this list of parts
            if not parts: continue
            rfile['permissions'] = parts[0]
            rfile['group'] = parts[2]
            rfile['user'] = parts[3]
            rfile['size'] = parts[4]
            rfile['month'] = Utils.month_to_num(parts[5])
            rfile['day'] = parts[6]
            try:
                rfile['year'] = int(parts[7])
            except Exception as e:
                # Specific FTP case: date info could not be parsed
                curdate = datetime.now()
                rfile['year'] = curdate.year
                # Year not given; a month greater than the current one means the previous year
                if rfile['month'] > curdate.month:
                    rfile['year'] = curdate.year - 1
                # Same month but later day => previous year
                if rfile['month'] == curdate.month and int(rfile['day']) > curdate.day:
                    rfile['year'] = curdate.year - 1
            rfile['name'] = parts[8]
            if len(parts) >= 10 and parts[9] == '->':
                # Symlink, add to files AND dirs as we don't know the type of the link
                rdirs.append(rfile)

            is_dir = False
            if re.match('^d', rfile['permissions']):
                is_dir = True

            if not is_dir:
                rfiles.append(rfile)
            else:
                rdirs.append(rfile)
        return (rfiles, rdirs)
Example #28
    def wf_download(self):
        """
        Download remote files or use an available local copy from last production directory if possible.
        """
        logging.info("Workflow:wf_download")
        flow = self.get_flow(Workflow.FLOW_DOWNLOAD)
        downloader = None
        cf = self.session.config
        self.session.previous_release = self.session.get("previous_release")

        if cf.get("protocol") == "multi":
            """
            Search for:
            protocol = multi
            remote.file.0.protocol = directftp
            remote.file.0.server = ftp.ncbi.org
            remote.file.0.path = /musmusculus/chr1/chr1.fa

            => http://ftp2.fr.debian.org/debian/README.html?key1=value&key2=value2
            remote.file.1.protocol = directhttp
            remote.file.1.server = ftp2.fr.debian.org
            remote.file.1.path = debian/README.html
            remote.file.1.method =  GET
            remote.file.1.params.keys = key1,key2
            remote.file.1.params.key1 = value1
            remote.file.1.params.key2 = value2

            => http://ftp2.fr.debian.org/debian/README.html
                #POST PARAMS:
                  key1=value
                  key2=value2
            remote.file.1.protocol = directhttp
            remote.file.1.server = ftp2.fr.debian.org
            remote.file.1.path = debian/README.html
            remote.file.1.method =  POST
            remote.file.1.params.keys = key1,key2
            remote.file.1.params.key1 = value1
            remote.file.1.params.key2 = value2

            ......
            """
            downloader = MultiDownload()
            downloaders = []
            # Creates multiple downloaders
            i = 0
            rfile = cf.get("remote.file." + str(i) + ".path")
            while rfile is not None:
                if cf.get("remote.file." + str(i) + ".protocol") is not None:
                    protocol = cf.get("remote.file." + str(i) + ".protocol")
                else:
                    protocol = cf.get("protocol")
                if cf.get("remote.file." + str(i) + ".server") is not None:
                    server = cf.get("remote.file." + str(i) + ".server")
                else:
                    server = cf.get("server")
                subdownloader = self.get_handler(protocol, server, "", [cf.get("remote.file." + str(i) + ".path")])
                if cf.get("remote.file." + str(i) + ".credentials") is not None:
                    credentials = cf.get("remote.file." + str(i) + ".credentials")
                else:
                    credentials = cf.get("server.credentials")
                if credentials is not None:
                    subdownloader.set_credentials(credentials)
                if protocol == "directhttp":
                    subdownloader.method = cf.get("remote.file." + str(i) + ".method")
                    if subdownloader.method is None:
                        subdownloader.method = "GET"
                    if cf.get("remote.file." + str(i) + ".name"):
                        subdownloader.save_as = cf.get("remote.file." + str(i) + ".name")
                    else:
                        subdownloader.save_as = cf.get("remote.file." + str(i) + ".path")
                    if cf.get("remote.file." + str(i) + ".method"):
                        subdownloader.method = cf.get("remote.file." + str(i) + ".method").strip().upper()
                    subdownloader.params = {}
                    keys = cf.get("remote.file." + str(i) + ".params.keys")
                    if keys is not None:
                        keys = keys.split(",")
                        for key in keys:
                            param = cf.get("remote.file." + str(i) + ".params." + key.strip())
                            subdownloader.params[key.strip()] = param.strip()
                downloaders.append(subdownloader)
                i += 1
                rfile = cf.get("remote.file." + str(i) + ".path")
            downloader.add_downloaders(downloaders)

        else:
            """
            Simple case, one downloader with regexp
            """
            protocol = cf.get("protocol")
            if protocol == "directhttp" or protocol == "directftp":
                downloader = self.get_handler(cf.get("protocol"), cf.get("server"), "/", [cf.get("remote.dir")[:-1]])
                downloader.method = cf.get("url.method")
                if downloader.method is None:
                    downloader.method = "GET"
                downloader.save_as = cf.get("target.name")
                keys = cf.get("url.params")
                if keys is not None:
                    keys = keys.split(",")
                    downloader.params = {}
                    for key in keys:
                        param = cf.get(key.strip() + ".value")
                        downloader.params[key.strip()] = param.strip()
            else:
                downloader = self.get_handler(cf.get("protocol"), cf.get("server"), cf.get("remote.dir"))

        if downloader is None:
            logging.error("Protocol " + cf.get("protocol") + " not supported")
            return False

        (file_list, dir_list) = downloader.list()

        downloader.match(cf.get("remote.files").split(), file_list, dir_list)
        for f in downloader.files_to_download:
            f["save_as"] = f["name"]
            for p in cf.get("remote.files").split():
                res = re.match("/" + p, f["name"])
                if res is not None and res.groups() is not None and len(res.groups()) >= 1:
                    f["save_as"] = "/".join(res.groups())
                    break

        self.session.set("download_files", downloader.files_to_download)
        if self.session.get("release") is None:
            # Not defined, or could not get it earlier
            # Set release to most recent file to download
            release_dict = Utils.get_more_recent_file(downloader.files_to_download)
            if release_dict is None:
                today = datetime.datetime.now()
                release_dict = {"year": today.year, "month": today.month, "day": today.day}

            release = str(release_dict["year"]) + "-" + str(release_dict["month"]) + "-" + str(release_dict["day"])
            self.session.set("release", release)
            self.session.set("remoterelease", release)
            # We restart from scratch, check if directory with this release already exists
            if self.options.get_option(Options.FROMSCRATCH):
                index = 0
                # Release directory exists, set index to 1
                if os.path.exists(self.session.get_full_release_directory()):
                    index = 1
                for x in range(1, 100):
                    if os.path.exists(self.session.get_full_release_directory() + "__" + str(x)):
                        index = x + 1

                # while os.path.exists(self.session.get_full_release_directory()+'__'+str(index)):
                #  index += 1
                # If we found a directory for this release:   XX or XX__Y
                if index > 0:
                    self.session.set("release", release + "__" + str(index))
                    release = release + "__" + str(index)
            logging.info("Workflow:wf_download:release:remoterelease:" + self.session.get("remoterelease"))
            logging.info("Workflow:wf_download:release:release:" + release)
            MongoConnector.banks.update({"name": self.bank.name}, {"$set": {"status.release.progress": str(release)}})
            self.download_go_ahead = False
            if self.options.get_option(Options.FROM_TASK) == "download":
                # We want to download again within the same release; that's fine, we do not care that it is the same release
                self.download_go_ahead = True
            if not self.download_go_ahead and self.session.previous_release == self.session.get("remoterelease"):
                logging.info("Workflow:wf_release:same_as_previous_session")
                return self.no_need_to_update()

        self.banks = MongoConnector.banks
        self.bank.bank = self.banks.find_one({"name": self.name})

        nb_prod_dir = len(self.bank.bank["production"])
        offline_dir = self.session.get_offline_directory()

        copied_files = []

        # Check if already in offlinedir
        keep_files = []
        if os.path.exists(offline_dir):
            for file_to_download in downloader.files_to_download:
                # If file is in offline dir and has same date and size, do not download again
                if os.path.exists(offline_dir + "/" + file_to_download["name"]):
                    try:
                        file_stat = os.stat(offline_dir + "/" + file_to_download["name"])
                        f_stat = datetime.datetime.fromtimestamp(
                            os.path.getmtime(offline_dir + "/" + file_to_download["name"])
                        )
                        year = str(f_stat.year)
                        month = str(f_stat.month)
                        day = str(f_stat.day)
                        if (
                            str(file_stat.st_size) != str(file_to_download["size"])
                            or str(year) != str(file_to_download["year"])
                            or str(month) != str(file_to_download["month"])
                            or str(day) != str(file_to_download["day"])
                        ):
                            logging.debug("Workflow:wf_download:different_from_offline:" + file_to_download["name"])
                            keep_files.append(file_to_download)
                        else:
                            logging.debug("Workflow:wf_download:offline:" + file_to_download["name"])
                    except Exception as e:
                        # Could not get stats on file
                        os.remove(offline_dir + "/" + file_to_download["name"])
                        keep_files.append(file_to_download)
                else:
                    keep_files.append(file_to_download)
            downloader.files_to_download = keep_files

        self.download_go_ahead = False
        if self.options.get_option(Options.FROM_TASK) == "download":
            # We want to download again within the same release; that's fine, we do not care that it is the same release
            self.download_go_ahead = True

        if not self.options.get_option(Options.FROMSCRATCH) and not self.download_go_ahead and nb_prod_dir > 0:
            # for prod in self.bank.bank['production']:
            #  if self.session.get('release') == prod['release']:
            #    logging.info('Workflow:wf_release:same_as_previous_production_dir')
            #    return self.no_need_to_update()

            # Get last production
            last_production = self.bank.bank["production"][nb_prod_dir - 1]
            # Get session corresponding to production directory
            last_production_session = self.banks.find_one(
                {"name": self.name, "sessions.id": last_production["session"]}, {"sessions.$": 1}
            )
            last_production_dir = os.path.join(
                last_production["data_dir"], cf.get("dir.version"), last_production["release"]
            )
            # Checks if some files can be copied instead of downloaded
            downloader.download_or_copy(last_production_session["sessions"][0]["files"], last_production_dir)
            if len(downloader.files_to_download) == 0:
                return self.no_need_to_update()

            # release_dir = os.path.join(self.session.config.get('data.dir'),
            #              self.session.config.get('dir.version'),
            #              self.session.get_release_directory())
            logging.debug("Workflow:wf_download:Copy files from " + last_production_dir)
            copied_files = downloader.files_to_copy
            Utils.copy_files(downloader.files_to_copy, offline_dir)

        downloader.close()

        DownloadThread.NB_THREAD = int(self.session.config.get("files.num.threads"))

        if cf.get("protocol") == "multi":
            thlist = DownloadThread.get_threads_multi(downloader.downloaders, offline_dir)
        else:
            thlist = DownloadThread.get_threads(downloader, offline_dir)

        running_th = []
        for th in thlist:
            running_th.append(th)
            th.start()

        while len(running_th) > 0:
            try:
                # Join all threads using a timeout so it doesn't block
                for t in running_th:
                    if t is not None:
                        t.join(1000)
                # Keep only the threads that are still alive
                running_th = [t for t in running_th if t is not None and t.is_alive()]
                logging.debug("Workflow:wf_download:Download:Threads:" + str(running_th))
            except KeyboardInterrupt:
                logging.warn("Ctrl-c received! Sending kill to threads...")
                logging.warn("Running tasks will continue and process will stop.")
                for t in running_th:
                    t.downloader.kill_received = True
        logging.info("Workflow:wf_download:Download:Threads:Over")
        # for th in thlist:
        #  th.join()
        is_error = False
        for th in thlist:
            if th.error:
                is_error = True
                downloader.error = True
                break
        self.downloaded_files = downloader.files_to_download + copied_files
        # self.downloaded_files = downloader.download(offline_dir) + copied_files

        # downloader.close()

        if downloader.error:
            logging.error("An error occured during download")
            return False

        return True
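The download threads above are joined with a timeout so the main thread can still react to Ctrl-C. The same pattern in isolation, using the modern is_alive spelling (a sketch under the assumption, taken from wf_download above, that each thread exposes a downloader with a kill_received flag):

def join_all(threads):
    # Join with a timeout so a KeyboardInterrupt reaches the main thread.
    alive = [t for t in threads if t is not None]
    while alive:
        try:
            for t in alive:
                t.join(1.0)
            alive = [t for t in alive if t.is_alive()]
        except KeyboardInterrupt:
            for t in alive:
                # Cooperative stop flag, as set on downloaders in wf_download.
                t.downloader.kill_received = True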
Example #29
    def list(self, directory=''):
        '''
        List FTP directory

        :return: tuple of files and dirs in the current directory, with details
        '''
        logging.debug('Download:List:' + self.url + self.rootdir + directory)
        #self.crl.setopt(pycurl.URL, self.url+self.rootdir+directory)
        try:
            self.crl.setopt(pycurl.URL, self.url + self.rootdir + directory)
        except Exception as a:
            self.crl.setopt(pycurl.URL,
                            (self.url + self.rootdir + directory).encode(
                                'ascii', 'ignore'))

        if self.proxy is not None:
            self.crl.setopt(pycurl.PROXY, self.proxy)
            if self.proxy_auth is not None:
                self.crl.setopt(pycurl.PROXYUSERPWD, self.proxy_auth)

        if self.credentials is not None:
            self.crl.setopt(pycurl.USERPWD, self.credentials)

        output = BytesIO()
        # lets assign this buffer to pycurl object
        self.crl.setopt(pycurl.WRITEFUNCTION, output.write)
        self.crl.setopt(pycurl.HEADERFUNCTION, self.header_function)
        self.crl.perform()
        # Figure out what encoding was sent with the response, if any.
        # Check against lowercased header name.
        encoding = None
        if 'content-type' in self.headers:
            content_type = self.headers['content-type'].lower()
            match = re.search(r'charset=(\S+)', content_type)
            if match:
                encoding = match.group(1)
        if encoding is None:
            # Default encoding for HTML is iso-8859-1.
            # Other content types may have different default encoding,
            # or in case of binary data, may have no encoding at all.
            encoding = 'iso-8859-1'

        # lets get the output in a string
        result = output.getvalue().decode(encoding)
        '''
        'http.parse.dir.line': r'<a[\s]+href="([\S]+)/".*alt="\[DIR\]">.*([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})',
        'http.parse.file.line': r'<a[\s]+href="([\S]+)".*([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})[\s]+([\d\.]+[MKG]{0,1})',
        'http.group.dir.name': 1,
        'http.group.dir.date': 2,
        'http.group.file.name': 1,
        'http.group.file.date': 2,
        'http.group.file.size': 3,
        '''

        rfiles = []
        rdirs = []

        dirs = re.findall(self.config.get('http.parse.dir.line'), result)
        if dirs is not None and len(dirs) > 0:
            for founddir in dirs:
                rfile = {}
                rfile['permissions'] = ''
                rfile['group'] = ''
                rfile['user'] = ''
                rfile['size'] = '0'
                date = founddir[int(self.config.get('http.group.dir.date')) -
                                1]
                dirdate = date.split()
                parts = dirdate[0].split('-')
                #19-Jul-2014 13:02
                rfile['month'] = Utils.month_to_num(parts[1])
                rfile['day'] = parts[0]
                rfile['year'] = parts[2]
                rfile['name'] = founddir[
                    int(self.config.get('http.group.dir.name')) - 1]
                rdirs.append(rfile)

        files = re.findall(self.config.get('http.parse.file.line'), result)
        if files is not None and len(files) > 0:
            for foundfile in files:
                rfile = {}
                rfile['permissions'] = ''
                rfile['group'] = ''
                rfile['user'] = ''
                rfile['size'] = foundfile[
                    int(self.config.get('http.group.file.size')) - 1]
                date = foundfile[int(self.config.get('http.group.file.date')) -
                                 1]
                if self.config.get('http.parse.file.date.format'):
                    date_object = datetime.datetime.strptime(
                        date,
                        self.config.get('http.parse.file.date.format').replace(
                            '%%', '%'))
                    rfile['month'] = date_object.month
                    rfile['day'] = date_object.day
                    rfile['year'] = date_object.year
                else:
                    dirdate = date.split()
                    parts = dirdate[0].split('-')
                    #19-Jul-2014 13:02
                    rfile['month'] = Utils.month_to_num(parts[1])
                    rfile['day'] = parts[0]
                    rfile['year'] = parts[2]
                rfile['name'] = foundfile[
                    int(self.config.get('http.group.file.name')) - 1]
                filehash = (rfile['name'] + str(date) +
                            str(rfile['size'])).encode('utf-8')
                rfile['hash'] = hashlib.md5(filehash).hexdigest()
                rfiles.append(rfile)
                print("###OSALLOU " + str(rfile))

        return (rfiles, rdirs)
Example #30
    def list(self, directory=''):
        '''
        Try to get file headers to get last_modification and size
        '''
        for file in self.files_to_download:
            self.crl.setopt(pycurl.HEADER, True)
            if self.credentials is not None:
                self.crl.setopt(pycurl.USERPWD, self.credentials)

            if self.proxy is not None:
                self.crl.setopt(pycurl.PROXY, self.proxy)
                if self.proxy_auth is not None:
                    self.crl.setopt(pycurl.PROXYUSERPWD, self.proxy_auth)

            self.crl.setopt(pycurl.NOBODY, True)
            try:
                self.crl.setopt(pycurl.URL, self.url+self.rootdir+file['name'])
            except Exception as a:
                self.crl.setopt(pycurl.URL, (self.url+self.rootdir+file['name']).encode('ascii', 'ignore'))
            #self.crl.setopt(pycurl.URL, self.url+self.rootdir+file['name'])
            output = BytesIO()
            # lets assign this buffer to pycurl object
            self.crl.setopt(pycurl.WRITEFUNCTION, output.write)
            self.crl.setopt(pycurl.HEADERFUNCTION, self.header_function)
            self.crl.perform()

            # Figure out what encoding was sent with the response, if any.
            # Check against lowercased header name.
            encoding = None
            if 'content-type' in self.headers:
                content_type = self.headers['content-type'].lower()
                match = re.search(r'charset=(\S+)', content_type)
                if match:
                    encoding = match.group(1)
            if encoding is None:
                # Default encoding for HTML is iso-8859-1.
                # Other content types may have different default encoding,
                # or in case of binary data, may have no encoding at all.
                encoding = 'iso-8859-1'

            # lets get the output in a string
            result = output.getvalue().decode(encoding)

            lines = re.split(r'[\n\r]+', result)
            for line in lines:
                parts = line.split(':')
                if parts[0].strip() == 'Content-Length':
                    file['size'] = parts[1].strip()
                if parts[0].strip() == 'Last-Modified':
                    # Sun, 06 Nov 1994
                    res = re.match(r'(\w+),\s+(\d+)\s+(\w+)\s+(\d+)', parts[1].strip())
                    if res:
                        file['hash'] = hashlib.md5(str(res.group(0)).encode('utf-8')).hexdigest()
                        file['day'] = res.group(2)
                        file['month'] = Utils.month_to_num(res.group(3))
                        file['year'] = res.group(4)
                        continue
                    #Sunday, 06-Nov-94
                    res = re.match(r'(\w+),\s+(\d+)-(\w+)-(\d+)', parts[1].strip())
                    if res:
                        file['hash'] = hashlib.md5(str(res.group(0)).encode('utf-8')).hexdigest()
                        file['day'] = res.group(2)
                        file['month'] = Utils.month_to_num(res.group(3))
                        file['year'] = str(2000 + int(res.group(4)))
                        continue
                    #Sun Nov  6 08:49:37 1994
                    res = re.match(r'(\w+)\s+(\w+)\s+(\d+)\s+\d{2}:\d{2}:\d{2}\s+(\d+)', parts[1].strip())
                    if res:
                        file['hash'] = hashlib.md5(str(res.group(0)).encode('utf-8')).hexdigest()
                        file['day'] = res.group(3)
                        file['month'] = Utils.month_to_num(res.group(2))
                        file['year'] = res.group(4)
                        continue
        return (self.files_to_download, [])