# Code example #1 (score: 0)
    def list(self, directory=''):
        '''
        List content of a remote HTTP directory.

        Entries are extracted from the HTML index page using the regular
        expressions configured in ``http.parse.dir.line`` /
        ``http.parse.file.line`` together with the ``http.group.*``
        group indices (name/date/size).

        :param directory: sub-path appended to ``self.rootdir``
        :type directory: str
        :return: tuple (files, dirs); each element is a list of dicts with
                 keys name, size, day, month, year, permissions, group,
                 user and, for files, hash
        '''
        logging.debug('Download:List:' + self.url + self.rootdir + directory)
        try:
            self.crl.setopt(pycurl.URL, self.url + self.rootdir + directory)
        except Exception:
            # pycurl can reject non-ascii URLs; retry with ascii-only bytes
            self.crl.setopt(pycurl.URL,
                            (self.url + self.rootdir + directory).encode(
                                'ascii', 'ignore'))

        if self.proxy is not None:
            self.crl.setopt(pycurl.PROXY, self.proxy)
            if self.proxy_auth is not None:
                self.crl.setopt(pycurl.PROXYUSERPWD, self.proxy_auth)

        if self.credentials is not None:
            self.crl.setopt(pycurl.USERPWD, self.credentials)

        output = BytesIO()
        # collect response body and headers through pycurl callbacks
        self.crl.setopt(pycurl.WRITEFUNCTION, output.write)
        self.crl.setopt(pycurl.HEADERFUNCTION, self.header_function)
        self.crl.perform()

        # Figure out what encoding was sent with the response, if any.
        # Check against lowercased header name.
        encoding = None
        if 'content-type' in self.headers:
            content_type = self.headers['content-type'].lower()
            match = re.search(r'charset=(\S+)', content_type)
            if match:
                encoding = match.group(1)
        if encoding is None:
            # Default encoding for HTML is iso-8859-1.
            # Other content types may have different default encoding,
            # or in case of binary data, may have no encoding at all.
            encoding = 'iso-8859-1'

        result = output.getvalue().decode(encoding)

        rfiles = []
        rdirs = []

        dirs = re.findall(self.config.get('http.parse.dir.line'), result)
        if dirs:
            for founddir in dirs:
                rfile = {}
                rfile['permissions'] = ''
                rfile['group'] = ''
                rfile['user'] = ''
                rfile['size'] = '0'
                date = founddir[
                    int(self.config.get('http.group.dir.date')) - 1]
                dirdate = date.split()
                parts = dirdate[0].split('-')
                # e.g. 19-Jul-2014 13:02
                rfile['month'] = Utils.month_to_num(parts[1])
                rfile['day'] = parts[0]
                rfile['year'] = parts[2]
                rfile['name'] = founddir[
                    int(self.config.get('http.group.dir.name')) - 1]
                rdirs.append(rfile)

        files = re.findall(self.config.get('http.parse.file.line'), result)
        if files:
            for foundfile in files:
                rfile = {}
                rfile['permissions'] = ''
                rfile['group'] = ''
                rfile['user'] = ''
                rfile['size'] = foundfile[
                    int(self.config.get('http.group.file.size')) - 1]
                date = foundfile[
                    int(self.config.get('http.group.file.date')) - 1]
                if self.config.get('http.parse.file.date.format'):
                    # explicit strptime format, stored with %% escaping
                    date_object = datetime.datetime.strptime(
                        date,
                        self.config.get('http.parse.file.date.format').replace(
                            '%%', '%'))
                    rfile['month'] = date_object.month
                    rfile['day'] = date_object.day
                    rfile['year'] = date_object.year
                else:
                    dirdate = date.split()
                    parts = dirdate[0].split('-')
                    # e.g. 19-Jul-2014 13:02
                    rfile['month'] = Utils.month_to_num(parts[1])
                    rfile['day'] = parts[0]
                    rfile['year'] = parts[2]
                rfile['name'] = foundfile[
                    int(self.config.get('http.group.file.name')) - 1]
                # hash identifies the remote entry (name + date + size)
                filehash = (rfile['name'] + str(date) +
                            str(rfile['size'])).encode('utf-8')
                rfile['hash'] = hashlib.md5(filehash).hexdigest()
                rfiles.append(rfile)

        return (rfiles, rdirs)
# Code example #2 (score: 0)
# File: ftp.py — Project: lecorguille/biomaj
    def list(self, directory=''):
        '''
        List the remote FTP directory via pycurl.

        Runs an FTP LIST on ``self.url + self.rootdir + directory`` and
        parses each returned unix ls-style line into an entry dict.

        :param directory: sub-path appended to ``self.rootdir``
        :type directory: str
        :return: tuple of file and dirs in current directory with details
        '''
        logging.debug('Download:List:' + self.url + self.rootdir + directory)
        # pycurl may reject a non-ascii URL; retry with ascii-only bytes
        try:
            self.crl.setopt(pycurl.URL, self.url + self.rootdir + directory)
        except Exception as a:
            self.crl.setopt(pycurl.URL,
                            (self.url + self.rootdir + directory).encode(
                                'ascii', 'ignore'))

        if self.proxy is not None:
            self.crl.setopt(pycurl.PROXY, self.proxy)
            if self.proxy_auth is not None:
                self.crl.setopt(pycurl.PROXYUSERPWD, self.proxy_auth)

        if self.credentials is not None:
            self.crl.setopt(pycurl.USERPWD, self.credentials)
        output = BytesIO()
        # collect response body and headers through pycurl callbacks
        self.crl.setopt(pycurl.WRITEFUNCTION, output.write)
        self.crl.setopt(pycurl.HEADERFUNCTION, self.header_function)

        self.crl.setopt(pycurl.CONNECTTIMEOUT, 300)
        # Download should not take more than 5 minutes
        self.crl.setopt(pycurl.TIMEOUT, self.timeout)
        self.crl.setopt(pycurl.NOSIGNAL, 1)
        # NOTE(review): a failed perform() is only logged; parsing then runs
        # on an empty buffer and the method returns empty lists.
        try:
            self.crl.perform()
        except Exception as e:
            logging.error('Could not get errcode:' + str(e))

        # Figure out what encoding was sent with the response, if any.
        # Check against lowercased header name.
        encoding = None
        if 'content-type' in self.headers:
            content_type = self.headers['content-type'].lower()
            match = re.search('charset=(\S+)', content_type)
            if match:
                encoding = match.group(1)
        if encoding is None:
            # Default encoding for HTML is iso-8859-1.
            # Other content types may have different default encoding,
            # or in case of binary data, may have no encoding at all.
            encoding = 'iso-8859-1'

        # decode the raw listing into a string
        result = output.getvalue().decode(encoding)

        # FTP LIST output lines are separated by \r\n
        lines = re.split(r'[\n\r]+', result)
        rfiles = []
        rdirs = []

        # Each non-empty line is whitespace-split; fields 0,2,3,4,5,6 map to
        # permissions/group/user/size/month/day (field 1 is skipped).
        for line in lines:
            rfile = {}
            parts = line.split()
            if not parts: continue
            rfile['permissions'] = parts[0]
            rfile['group'] = parts[2]
            rfile['user'] = parts[3]
            rfile['size'] = parts[4]
            rfile['month'] = Utils.month_to_num(parts[5])
            rfile['day'] = parts[6]
            # hash of the raw listing line identifies the entry
            rfile['hash'] = hashlib.md5(line.encode('utf-8')).hexdigest()
            try:
                rfile['year'] = int(parts[7])
            except Exception as e:
                # Field 7 may be a time instead of a year (recent entries):
                # assume the current year by default.
                curdate = datetime.now()
                rfile['year'] = curdate.year
                # Year omitted: a month greater than the current one means
                # the entry is from the previous year
                if rfile['month'] > curdate.month:
                    rfile['year'] = curdate.year - 1
                # Same month but later day => previous year
                if rfile['month'] == curdate.month and int(
                        rfile['day']) > curdate.day:
                    rfile['year'] = curdate.year - 1
            rfile['name'] = parts[8]
            if len(parts) >= 10 and parts[9] == '->':
                # Symlink, add to files AND dirs as we don't know the type of the link
                rdirs.append(rfile)

            is_dir = False
            if re.match('^d', rfile['permissions']):
                is_dir = True

            if not is_dir:
                rfiles.append(rfile)
            else:
                rdirs.append(rfile)
        return (rfiles, rdirs)
# Code example #3 (score: 0)
# File: http.py — Project: markiskander/biomaj
    def list(self, directory=''):
        '''
        List a remote HTTP directory by parsing its HTML index page.

        Uses the regular expressions configured in ``http.parse.dir.line``
        and ``http.parse.file.line`` plus the ``http.group.*`` group
        indices to extract entry name/date/size.

        :param directory: sub-path appended to ``self.rootdir``
        :type directory: str
        :return: tuple (files, dirs) of entry dicts
        '''
        logging.debug('Download:List:' + self.url + self.rootdir + directory)
        try:
            self.crl.setopt(pycurl.URL, self.url + self.rootdir + directory)
        except Exception:
            # pycurl can reject non-ascii URLs; retry with ascii-only bytes
            self.crl.setopt(pycurl.URL,
                            (self.url + self.rootdir + directory).encode(
                                'ascii', 'ignore'))

        if self.proxy is not None:
            self.crl.setopt(pycurl.PROXY, self.proxy)
            if self.proxy_auth is not None:
                # Bug fix: was 'curl.setopt' — undefined name, raised
                # NameError whenever proxy authentication was configured
                self.crl.setopt(pycurl.PROXYUSERPWD, self.proxy_auth)

        if self.credentials is not None:
            self.crl.setopt(pycurl.USERPWD, self.credentials)

        output = BytesIO()
        # collect response body and headers through pycurl callbacks
        self.crl.setopt(pycurl.WRITEFUNCTION, output.write)
        self.crl.setopt(pycurl.HEADERFUNCTION, self.header_function)
        self.crl.perform()

        # Figure out what encoding was sent with the response, if any.
        # Check against lowercased header name.
        encoding = None
        if 'content-type' in self.headers:
            content_type = self.headers['content-type'].lower()
            match = re.search(r'charset=(\S+)', content_type)
            if match:
                encoding = match.group(1)
        if encoding is None:
            # Default encoding for HTML is iso-8859-1.
            # Other content types may have different default encoding,
            # or in case of binary data, may have no encoding at all.
            encoding = 'iso-8859-1'

        result = output.getvalue().decode(encoding)

        rfiles = []
        rdirs = []

        dirs = re.findall(self.config.get('http.parse.dir.line'), result)
        if dirs:
            # renamed from 'dir'/'file' loop vars: avoid shadowing builtins
            for founddir in dirs:
                rfile = {}
                rfile['permissions'] = ''
                rfile['group'] = ''
                rfile['user'] = ''
                rfile['size'] = '0'
                date = founddir[int(self.config.get('http.group.dir.date')) - 1]
                dirdate = date.split()
                parts = dirdate[0].split('-')
                # e.g. 19-Jul-2014 13:02
                rfile['month'] = Utils.month_to_num(parts[1])
                rfile['day'] = parts[0]
                rfile['year'] = parts[2]
                rfile['name'] = founddir[
                    int(self.config.get('http.group.dir.name')) - 1]
                rdirs.append(rfile)

        files = re.findall(self.config.get('http.parse.file.line'), result)
        if files:
            for foundfile in files:
                rfile = {}
                rfile['permissions'] = ''
                rfile['group'] = ''
                rfile['user'] = ''
                rfile['size'] = foundfile[
                    int(self.config.get('http.group.file.size')) - 1]
                date = foundfile[int(self.config.get('http.group.file.date')) - 1]
                dirdate = date.split()
                parts = dirdate[0].split('-')
                # e.g. 19-Jul-2014 13:02
                rfile['month'] = Utils.month_to_num(parts[1])
                rfile['day'] = parts[0]
                rfile['year'] = parts[2]
                rfile['name'] = foundfile[
                    int(self.config.get('http.group.file.name')) - 1]
                rfiles.append(rfile)

        return (rfiles, rdirs)
# Code example #4 (score: 0)
# File: ftp.py — Project: pfem-bioinfo/biomaj
    def list(self, directory=''):
        '''
        Fetch and parse a remote FTP directory listing.

        :param directory: sub-path appended to ``self.rootdir``
        :type directory: str
        :return: tuple of file and dirs in current directory with details
        '''
        logging.debug('Download:List:' + self.url + self.rootdir + directory)
        target = self.url + self.rootdir + directory
        try:
            self.crl.setopt(pycurl.URL, target)
        except Exception:
            # pycurl may refuse the unicode URL: retry with ascii-only bytes
            self.crl.setopt(pycurl.URL, target.encode('ascii', 'ignore'))

        if self.proxy is not None:
            self.crl.setopt(pycurl.PROXY, self.proxy)
            if self.proxy_auth is not None:
                self.crl.setopt(pycurl.PROXYUSERPWD, self.proxy_auth)

        if self.credentials is not None:
            self.crl.setopt(pycurl.USERPWD, self.credentials)
        raw = BytesIO()
        # route body and headers into our buffers
        self.crl.setopt(pycurl.WRITEFUNCTION, raw.write)
        self.crl.setopt(pycurl.HEADERFUNCTION, self.header_function)
        self.crl.perform()

        # Determine the response charset from the Content-Type header,
        # falling back to the HTML default when absent.
        encoding = None
        if 'content-type' in self.headers:
            found = re.search('charset=(\S+)',
                              self.headers['content-type'].lower())
            if found:
                encoding = found.group(1)
        if encoding is None:
            encoding = 'iso-8859-1'

        listing = raw.getvalue().decode(encoding)

        rfiles = []
        rdirs = []

        # LIST output lines are CR/LF separated, unix 'ls -l' style
        for line in re.split(r'[\n\r]+', listing):
            fields = line.split()
            if not fields:
                continue
            entry = {
                'permissions': fields[0],
                'group': fields[2],
                'user': fields[3],
                'size': fields[4],
                'month': Utils.month_to_num(fields[5]),
                'day': fields[6],
            }
            try:
                entry['year'] = int(fields[7])
            except Exception:
                # Recent entries carry a time instead of a year: assume the
                # current year, or the previous one when that would place
                # the entry in the future.
                today = datetime.now()
                entry['year'] = today.year
                if entry['month'] > today.month:
                    entry['year'] = today.year - 1
                if entry['month'] == today.month and int(entry['day']) > today.day:
                    entry['year'] = today.year - 1
            entry['name'] = fields[8]
            if len(fields) >= 10 and fields[9] == '->':
                # Symlink, add to files AND dirs as we don't know the type of the link
                rdirs.append(entry)

            if entry['permissions'].startswith('d'):
                rdirs.append(entry)
            else:
                rfiles.append(entry)
        return (rfiles, rdirs)
# Code example #5 (score: 0)
# File: direct.py — Project: markiskander/biomaj
    def list(self, directory=''):
        '''
        Issue a HEAD request per file to retrieve its size and last
        modification date.

        Updates each dict in ``self.files_to_download`` in place with
        ``size``/``day``/``month``/``year`` parsed from the
        Content-Length and Last-Modified response headers.

        :param directory: unused, kept for interface compatibility
        :type directory: str
        :return: tuple (self.files_to_download, [])
        '''
        for rfile in self.files_to_download:
            self.crl.setopt(pycurl.HEADER, True)
            if self.credentials is not None:
                self.crl.setopt(pycurl.USERPWD, self.credentials)

            if self.proxy is not None:
                self.crl.setopt(pycurl.PROXY, self.proxy)
                if self.proxy_auth is not None:
                    # Bug fix: was 'curl.setopt' — undefined name, raised
                    # NameError whenever proxy authentication was configured
                    self.crl.setopt(pycurl.PROXYUSERPWD, self.proxy_auth)

            self.crl.setopt(pycurl.NOBODY, True)
            try:
                self.crl.setopt(pycurl.URL,
                                self.url + self.rootdir + rfile['name'])
            except Exception:
                # pycurl can reject non-ascii URLs; retry with ascii bytes
                self.crl.setopt(pycurl.URL,
                                (self.url + self.rootdir +
                                 rfile['name']).encode('ascii', 'ignore'))
            output = BytesIO()
            # collect the headers (NOBODY: no body is transferred)
            self.crl.setopt(pycurl.WRITEFUNCTION, output.write)
            self.crl.setopt(pycurl.HEADERFUNCTION, self.header_function)
            self.crl.perform()

            # Figure out what encoding was sent with the response, if any.
            encoding = None
            if 'content-type' in self.headers:
                content_type = self.headers['content-type'].lower()
                match = re.search(r'charset=(\S+)', content_type)
                if match:
                    encoding = match.group(1)
            if encoding is None:
                # Default encoding for HTML is iso-8859-1.
                encoding = 'iso-8859-1'

            result = output.getvalue().decode(encoding)

            lines = re.split(r'[\n\r]+', result)
            for line in lines:
                parts = line.split(':')
                if parts[0].strip() == 'Content-Length':
                    rfile['size'] = parts[1].strip()
                if parts[0].strip() == 'Last-Modified':
                    # Sun, 06 Nov 1994
                    res = re.match(r'(\w+),\s+(\d+)\s+(\w+)\s+(\d+)',
                                   parts[1].strip())
                    if res:
                        rfile['day'] = res.group(2)
                        rfile['month'] = Utils.month_to_num(res.group(3))
                        rfile['year'] = res.group(4)
                        continue
                    # Sunday, 06-Nov-94
                    res = re.match(r'(\w+),\s+(\d+)-(\w+)-(\d+)',
                                   parts[1].strip())
                    if res:
                        rfile['day'] = res.group(2)
                        rfile['month'] = Utils.month_to_num(res.group(3))
                        rfile['year'] = str(2000 + int(res.group(4)))
                        continue
                    # Sun Nov  6 08:49:37 1994
                    res = re.match(r'(\w+)\s+(\w+)\s+(\d+)\s+'
                                   r'\d{2}:\d{2}:\d{2}\s+(\d+)',
                                   parts[1].strip())
                    if res:
                        rfile['day'] = res.group(3)
                        rfile['month'] = Utils.month_to_num(res.group(2))
                        rfile['year'] = res.group(4)
                        continue
        return (self.files_to_download, [])
# Code example #6 (score: 0)
    def list(self, directory=''):
        '''
        Probe every file to download with a HEAD request and fill in its
        size, last modification date and hash from the response headers.

        :param directory: unused, kept for interface compatibility
        :type directory: str
        :return: tuple (self.files_to_download, [])
        '''
        # (regexp, day group, month group, year extractor) — the three
        # Last-Modified date layouts handled below
        date_formats = (
            # Sun, 06 Nov 1994
            ('(\w+),\s+(\d+)\s+(\w+)\s+(\d+)', 2, 3,
             lambda m: m.group(4)),
            # Sunday, 06-Nov-94
            ('(\w+),\s+(\d+)-(\w+)-(\d+)', 2, 3,
             lambda m: str(2000 + int(m.group(4)))),
            # Sun Nov  6 08:49:37 1994
            ('(\w+)\s+(\w+)\s+(\d+)\s+\d{2}:\d{2}:\d{2}\s+(\d+)', 3, 2,
             lambda m: m.group(4)),
        )
        for entry in self.files_to_download:
            self.crl.setopt(pycurl.HEADER, True)
            if self.credentials is not None:
                self.crl.setopt(pycurl.USERPWD, self.credentials)

            if self.proxy is not None:
                self.crl.setopt(pycurl.PROXY, self.proxy)
                if self.proxy_auth is not None:
                    self.crl.setopt(pycurl.PROXYUSERPWD, self.proxy_auth)

            self.crl.setopt(pycurl.NOBODY, True)
            target = self.url + self.rootdir + entry['name']
            try:
                self.crl.setopt(pycurl.URL, target)
            except Exception:
                # retry with an ascii-only URL if pycurl refused it
                self.crl.setopt(pycurl.URL, target.encode('ascii', 'ignore'))
            raw = BytesIO()
            self.crl.setopt(pycurl.WRITEFUNCTION, raw.write)
            self.crl.setopt(pycurl.HEADERFUNCTION, self.header_function)
            self.crl.perform()

            # Response charset from Content-Type, HTML default otherwise
            encoding = None
            if 'content-type' in self.headers:
                found = re.search('charset=(\S+)',
                                  self.headers['content-type'].lower())
                if found:
                    encoding = found.group(1)
            if encoding is None:
                encoding = 'iso-8859-1'

            header_text = raw.getvalue().decode(encoding)

            for header_line in re.split(r'[\n\r]+', header_text):
                parts = header_line.split(':')
                if parts[0].strip() == 'Content-Length':
                    entry['size'] = parts[1].strip()
                if parts[0].strip() == 'Last-Modified':
                    value = parts[1].strip()
                    # try each known date layout, first match wins
                    for pattern, day_grp, month_grp, year_of in date_formats:
                        res = re.match(pattern, value)
                        if res:
                            entry['hash'] = hashlib.md5(
                                str(res.group(0)).encode('utf-8')).hexdigest()
                            entry['day'] = res.group(day_grp)
                            entry['month'] = Utils.month_to_num(
                                res.group(month_grp))
                            entry['year'] = year_of(res)
                            break
        return (self.files_to_download, [])