Esempio n. 1
0
    def __init__(self, db):
        """Start with an empty browsing state bound to the given database handle.

        :param db: database handle; it is stored on the instance and also
            passed to ``Findex``.
        """
        # Per-request state: nothing parsed and nothing fetched yet.
        self.env = {}
        self.files = None

        self.db = db
        self.findex = Findex(self.db)
Esempio n. 2
0
class Searcher():
    """Build and run keyword searches against the ``Files`` table."""

    def __init__(self, cfg, db):
        """Store the configuration and database handle and create a ``Findex`` helper.

        :param cfg: configuration object; only stored on the instance here.
        :param db: database handle used for every query in this class.
        """
        self.cfg = cfg
        self.db = db
        self.findex = Findex(self.db)

    def _key_check(self, keyword):
        """Validate a search keyword and blank out the blocked characters.

        :param keyword: either the keyword string itself or a mapping whose
            ``'key'`` entry is a sequence with the keyword as first element.
        :returns: the keyword with ``-``, ``,``, ``+``, ``_`` and ``%``
            replaced by spaces.
        :raises SearchException: when the mapping has no usable ``'key'``
            entry or the keyword holds fewer than 4 characters once the
            surrounding whitespace is ignored.
        """
        too_short = 'Search query must contain 4 characters or more'

        if isinstance(keyword, dict):
            if 'key' not in keyword or not keyword['key']:
                raise SearchException(too_short)

            keyword = keyword['key'][0]

        for blocked in ('-', ',', '+', '_', '%'):
            keyword = keyword.replace(blocked, ' ')

        # Blocked characters become spaces, so a query such as "----" would
        # pass a plain length check as four blanks. Measure the keyword
        # without its surrounding padding; the returned value is unchanged.
        if len(keyword.strip()) < 4:
            raise SearchException(too_short)

        return keyword

    def search(self, vars):
        """Run a search described by a mapping of request parameters.

        Only the entries ``key``, ``protocols``, ``hosts``, ``cats``,
        ``exts``, ``size``, ``path`` and ``host`` are read; each is expected
        to be a list of strings. At most 600 rows are fetched.

        :param vars: mapping of parameter names to lists of values.
        :returns: dict with ``sdata`` (the filters that were applied plus a
            ``filtered`` flag), ``results`` (``data`` rows and
            ``load_dbtime`` in seconds) and ``key`` (the escaped keyword).
        :raises SearchException: when the keyword is rejected by
            ``_key_check`` or none of the requested hosts can be found.
        :raises ValueError: propagated from ``int()`` when a category, size
            or host id is not numeric.
        """
        val = self._key_check(vars).lower()

        filtered = False
        start_dbtime = datetime.now()

        # to-do: move this to API (or make api.py use this class)
        q = self.db.query(Files)

        # Name of the attribute the fetched rows are sorted on (descending);
        # an empty string keeps the order returned by the database.
        sort = ''

        sdata = {
            'protocols': [],
            'hosts': [],
            'exts': [],
            'cats': [],
            'fsize': 0
        }

        if 'protocols' in vars:
            plookup = {'ftp': 0, 'http': 1, 'smb': 2}

            protocols_ids = []
            for p in [z.lower() for z in vars['protocols']]:
                if p in plookup and plookup[p] not in protocols_ids:
                    protocols_ids.append(plookup[p])

            # Unknown protocol names are ignored; if none is recognised the
            # list stays empty.
            if protocols_ids:
                sdata['protocols'] = protocols_ids
        else:
            sdata['protocols'] = [0, 1, 2]

        if 'hosts' in vars:
            dhosts = vars['hosts']

            # An empty list carries no host restriction (indexing it used to
            # raise IndexError); a leading '*' means "all hosts".
            if isinstance(dhosts, list) and dhosts and dhosts[0] != '*':
                host_ids = []
                for host in dhosts:
                    host_results = self.db.query(Hosts).filter(Hosts.address == host).filter(Hosts.protocol.in_(sdata['protocols'])).all()
                    host_ids.extend(host_result.id for host_result in host_results)

                if not host_ids:
                    raise SearchException('Could not find any host entries for specified host(s)')

                sdata['hosts'] = host_ids

        if sdata['hosts']:
            q = q.filter(Files.host_id.in_(sdata['hosts']))
            filtered = True

        if 'cats' in vars:
            clookup = {
                'unknown': 0,
                'documents': 1,
                'movies': 2,
                'music': 3,
                'pictures': 4
            }

            # Categories may be given by name or directly as a numeric id.
            dformats = [
                clookup[cat] if cat in clookup else int(cat)
                for cat in [z.lower() for z in vars['cats']]
            ]

            q = q.filter(Files.file_format.in_(dformats))
            sdata['cats'].extend(dformats)
        else:
            sdata['cats'] = [0, 1, 2, 3, 4]

        # Leaving out any of the five known categories counts as filtering.
        if any(i not in sdata['cats'] for i in (0, 1, 2, 3, 4)):
            filtered = True

        if 'exts' in vars:
            exts = vars['exts']

            if isinstance(exts, list):
                exts = [z.replace('.', '') for z in exts if z]

                q = q.filter(Files.file_ext.in_(exts))
                filtered = True
                sdata['exts'].extend(exts)
        elif '.' in val:
            # "name.ext" typed as the keyword: search on the name part and
            # filter on the extension.
            name_part, ext_part = val.split('.', 1)
            ext = ext_part.replace(',', '').strip()
            q = q.filter(Files.file_ext == ext)
            sdata['exts'].append(ext)

            val = self._key_check(name_part)
            filtered = True

        if 'size' in vars:
            fsize = vars['size']

            if isinstance(fsize, list):
                fsize = int(fsize[0])

                # Bucket index -> (exclusive lower, inclusive upper) bound in
                # bytes; None means no upper bound. The upper bound is
                # inclusive so a file exactly on a boundary lands in one
                # bucket instead of none. Any other index applies no filter.
                sizes = {
                    1: (0, 8388600),
                    2: (8388600, 134220000),
                    3: (134220000, 536870912),
                    4: (536870912, 2147483648),
                    5: (2147483648, 8589934592),
                    6: (8589934592, None)
                }

                if fsize in sizes:
                    low, high = sizes[fsize]
                    q = q.filter(Files.file_size > low)
                    if high is not None:
                        q = q.filter(Files.file_size <= high)
                    filtered = True

                sdata['fsize'] = fsize
                sort = 'file_size'

        if 'path' in vars:
            path = vars['path']

            if isinstance(path, list):
                path = path[0]

                if len(path) > 3:
                    path = quote_plus(path)
                    q = q.filter(Files.file_path.like(path + '%'))
                    filtered = True

        if 'host' in vars:
            host = vars['host']

            if isinstance(host, list):
                host = int(host[0])
                q = q.filter(Files.host_id == host)
                filtered = True

        q = q.filter(Files.searchable.like('%' + val + '%')).limit(600)

        results = {}
        results['data'] = q.all()
        results['load_dbtime'] = (datetime.now() - start_dbtime).total_seconds()

        if sort:
            # Sorting happens in Python, on the rows already limited above.
            results['data'] = sorted(results['data'], key=lambda k: getattr(k, sort), reverse=True)

        # to-do: dont do this here
        for r in results['data']:
            host = self.findex.get_host_objects(r.host_id)
            setattr(r, 'host', host)

        results['data'] = self.findex.set_humanize(results['data'])
        results['data'] = self.findex.set_icons(results['data'])
        sdata['filtered'] = filtered

        return {'sdata': sdata, 'results': results, 'key': jinja2.escape(vars['key'][0])}
Esempio n. 3
0
class Browser:
    """Turn an incoming ``host/path`` string into a sorted directory listing."""

    def __init__(self, db):
        """Bind the browser to a database handle with empty request state.

        :param db: database handle; stored on the instance and passed to
            ``Findex``.
        """
        self.db = db
        self.findex = Findex(db)

        # Request state, filled in by parse_incoming_path() and fetch_files().
        self.env = {}
        # Listing of the current directory, set by fetch_files().
        self.files = None

    def parse_incoming_path(self, path):
        """Split ``host/dir/.../name`` into the entries of ``self.env``.

        Sets ``isdir``, ``host``, ``file_path`` (always starting and ending
        with ``/``), ``file_path_quoted`` and, when ``path`` does not end
        with ``/``, ``file_name``.

        :param path: incoming path whose first segment is the host.
        """
        spl = path.split("/")

        self.env["isdir"] = path.endswith("/")
        self.env["host"] = spl[0]
        # Everything between the host and the last segment is the directory.
        self.env["file_path"] = "/" + "/".join(spl[1:-1])

        if not self.env["isdir"]:
            self.env["file_name"] = spl[-1]
        if self.env["file_path"] != "/":
            self.env["file_path"] += "/"

        self.env["file_path_quoted"] = quote_plus(self.env["file_path"])

    def fetch_files(self):
        """Load the files of the parsed host and directory into ``self.files``.

        Also stores the id of the matching host as ``self.env["host_id"]``.

        :raises BrowseException: when the host address is unknown or the
            lookup returns no files.
        """
        host = self.db.query(Hosts).filter_by(address=self.env["host"]).first()

        if not host:
            raise BrowseException("No host found")

        self.env["host_id"] = host.id
        files = self.findex.get_files_objects(host_id=host.id, file_path=self.env["file_path_quoted"])

        if not files:
            raise BrowseException("No files found")

        self.files = files

    def prepare_files(self, sort=None):
        """Order ``self.files`` for display and add a ``..`` entry below the root.

        :param sort: accepted but not used by this method.
        """
        # Two stable passes: alphabetical first, then folders on top.
        self.files = sorted(self.files, key=lambda k: k.file_name)
        self.files = sorted(self.files, key=lambda k: k.file_isdir, reverse=True)

        if not self.env["file_path"] == "/":  # add CDUP dirs
            x = Files(
                file_name="..",
                file_path="../",
                file_ext="",
                file_format=-1,
                file_isdir=True,
                file_modified=datetime.now(),
                file_perm=None,
                searchable=None,
                file_size=0,
                host=self.env["host"],
            )
            setattr(x, "file_name_human", "..")
            self.files.insert(0, x)

        self.files = self.findex.set_icons(self.files)

    def sort(self):
        """Return the summed ``file_size`` of ``self.files`` (non-recursive).

        Despite its name this method does not reorder anything. The total
        used to be computed and thrown away; it is now returned.
        """
        return sum(f.file_size for f in self.files)

    def generate_action_fetches(self):
        """Build copy-paste ``wget`` and ``lftp`` commands for the current directory.

        :returns: dict with the command strings under ``wget`` and ``lftp``.
        """
        target = ("ftp://%s" % self.env["host"]) + self.env["file_path"]

        # The URL sits inside a single-quoted shell word. Close the quote,
        # emit an escaped apostrophe and reopen it, so an apostrophe in the
        # host or path cannot end the argument early.
        target = target.replace("'", "'\\''")

        # Authentication options are not wired up; both extras stay empty.
        wget_extras = ""
        lftp_extras = ""

        wget = "wget %s-r -nH --no-parent '%s'" % (wget_extras, target)
        lftp = "lftp %s-e 'mirror' '%s'" % (lftp_extras, target)

        return dict(wget=wget, lftp=lftp)

    def breadcrumbs(self):
        """Return the host followed by each non-empty segment of ``file_path``."""
        data = [self.env["host"]]
        data += [z for z in self.env["file_path"].split("/")[1:] if z]

        return data

    def output_json(self):
        """Return one ``[D|F] <url><path><name>`` line per file, joined by newlines.

        The result is plain text, not JSON.
        """
        # NOTE(review): ``self.folder`` is never assigned in this class, and
        # prepare_files() sets ``file_name_human`` rather than
        # ``filename_human``. This method looks like a leftover from a
        # different data model -- confirm the attribute names before use.
        data = []

        for source_file in self.files:
            if source_file.filename_human == "..":
                continue

            data.append(
                "[%s] %s%s%s"
                % (
                    "D" if source_file.is_directory else "F",
                    self.folder.source.crawl_url,
                    source_file.filepath_human,
                    source_file.filename_human,
                )
            )

        return "\n".join(data)
Esempio n. 4
0
 def __init__(self, cfg, db):
     """Keep the configuration and database handle and create a ``Findex`` helper.

     :param cfg: configuration object; only stored on the instance here.
     :param db: database handle; stored and passed to ``Findex`` as ``db``.
     """
     self.db = db
     self.cfg = cfg
     self.findex = Findex(db=db)
Esempio n. 5
0
 def __init__(self, cfg, db):
     """Keep the configuration and database handle and create a ``Findex`` helper.

     :param cfg: configuration object; only stored on the instance here.
     :param db: database handle; stored and passed to ``Findex``.
     """
     self.db = db
     self.cfg = cfg
     self.findex = Findex(db)
Esempio n. 6
0
class Browse():
    """Request handlers that render the host overview, directory listings and file lookups."""

    def __init__(self, cfg, db):
        """Keep the configuration and database handle and create a ``Findex`` helper.

        :param cfg: configuration object; only stored on the instance here.
        :param db: database handle; stored and passed to ``Findex`` as ``db``.
        """
        self.db = db
        self.cfg = cfg
        self.findex = Findex(db=db)

    @data_strap
    def hosts(self, env):
        """Render ``main/browse_hosts`` with every row of ``Hosts``."""
        all_hosts = self.db.query(Hosts).all()
        return jinja2_template('main/browse_hosts', env=env, data={'hosts': all_hosts})

    @data_strap
    def browse(self, path, env):
        """Render the directory listing for ``path``.

        ``env['load_dbtime']`` is set to the seconds spent fetching files, or
        stays 0 when the fetch does not complete. Any exception raised along
        the way results in the ``main/error`` template with a fixed message.
        """
        env['load_dbtime'] = 0
        browser = Browser(db=self.db)

        try:
            browser.parse_incoming_path(path)

            query_started = datetime.now()
            browser.fetch_files()
            elapsed = datetime.now() - query_started
            env['load_dbtime'] = elapsed.total_seconds()

            browser.prepare_files()

            listing = {}
            listing['files'] = browser.files
            listing['breadcrumbs'] = browser.breadcrumbs()
            listing['action_fetches'] = browser.generate_action_fetches()
            listing['env'] = browser.env

            return jinja2_template('main/browse_dir', env=env, data=listing)
        except Exception:
            # Every failure is reported to the user with the same message.
            return jinja2_template('main/error',
                                   env=env,
                                   data={'error': 'no files were found'})

    @data_strap
    def goto(self, path, env):
        """Render ``main/browse_goto`` for the file whose numeric id is ``path``.

        Returns a fixed error string when the id is not numeric, no file or
        host is found, or anything else raises.
        """
        failure = 'error :( we could always stay here'

        try:
            matches = self.findex.get_files_objects(id=int(path))
            if not matches:
                return failure

            entry = matches[0]
            owner = self.db.query(Hosts).filter_by(id=entry.host_id).first()
            if not (entry and owner):
                return failure

            return jinja2_template('main/browse_goto',
                                   env=env,
                                   data={'file': entry, 'host': owner})
        except Exception:
            return failure
Esempio n. 7
0
class Browse():
    """Request handlers that render the host overview, directory listings and file lookups."""

    def __init__(self, cfg, db):
        """Keep the configuration and database handle and create a ``Findex`` helper.

        :param cfg: configuration object; only stored on the instance here.
        :param db: database handle; stored and passed to ``Findex`` as ``db``.
        """
        self.db = db
        self.cfg = cfg
        self.findex = Findex(db=db)

    @data_strap
    def hosts(self, env):
        """Render ``main/browse_hosts`` with every row of ``Hosts``."""
        payload = {'hosts': self.db.query(Hosts).all()}
        return jinja2_template('main/browse_hosts', env=env, data=payload)

    @data_strap
    def browse(self, path, env):
        """Render the directory listing for ``path``.

        ``env['load_dbtime']`` is set to the seconds spent fetching files, or
        stays 0 when the fetch does not complete. Any exception raised along
        the way results in the ``main/error`` template with a fixed message.
        """
        env['load_dbtime'] = 0
        browser = Browser(db=self.db)

        try:
            browser.parse_incoming_path(path)

            fetch_started = datetime.now()
            browser.fetch_files()
            fetch_duration = datetime.now() - fetch_started
            env['load_dbtime'] = fetch_duration.total_seconds()

            browser.prepare_files()

            page = {}
            page['files'] = browser.files
            page['breadcrumbs'] = browser.breadcrumbs()
            page['action_fetches'] = browser.generate_action_fetches()
            page['env'] = browser.env

            return jinja2_template('main/browse_dir', env=env, data=page)
        except Exception:
            # Every failure is reported to the user with the same message.
            message = {'error': 'no files were found'}
            return jinja2_template('main/error', env=env, data=message)

    @data_strap
    def goto(self, path, env):
        """Render ``main/browse_goto`` for the file whose numeric id is ``path``.

        Returns a fixed error string when the id is not numeric, no file or
        host is found, or anything else raises.
        """
        fallback = 'error :( we could always stay here'

        try:
            found = self.findex.get_files_objects(id=int(path))
            if not found:
                return fallback

            record = found[0]
            origin = self.db.query(Hosts).filter_by(id=record.host_id).first()
            if not (record and origin):
                return fallback

            payload = {'file': record, 'host': origin}
            return jinja2_template('main/browse_goto', env=env, data=payload)
        except Exception:
            return fallback
Esempio n. 8
0
    def __init__(self, db):
        """Bind the instance to a database handle with nothing parsed or fetched yet.

        :param db: database handle; it is stored on the instance and also
            passed to ``Findex``.
        """
        self.files = None
        self.env = {}

        self.db = db
        self.findex = Findex(self.db)
Esempio n. 9
0
class Browser():
    """Turn an incoming ``host/path`` string into a sorted directory listing."""

    def __init__(self, db):
        """Bind the browser to a database handle with empty request state.

        :param db: database handle; stored on the instance and passed to
            ``Findex``.
        """
        self.db = db
        self.findex = Findex(db)

        # Request state, filled in by parse_incoming_path() and fetch_files().
        self.env = {}
        # Listing of the current directory, set by fetch_files().
        self.files = None

    def parse_incoming_path(self, path):
        """Split ``host/dir/.../name`` into the entries of ``self.env``.

        Sets ``isdir``, ``host``, ``file_path`` (always starting and ending
        with ``/``), ``file_path_quoted`` and, when ``path`` does not end
        with ``/``, ``file_name``.

        :param path: incoming path whose first segment is the host.
        """
        spl = path.split('/')

        self.env['isdir'] = path.endswith('/')
        self.env['host'] = spl[0]
        # Everything between the host and the last segment is the directory.
        self.env['file_path'] = '/' + '/'.join(spl[1:-1])

        if not self.env['isdir']:
            self.env['file_name'] = spl[-1]
        if self.env['file_path'] != '/':
            self.env['file_path'] += '/'

        self.env['file_path_quoted'] = quote_plus(self.env['file_path'])

    def fetch_files(self):
        """Load the files of the parsed host and directory into ``self.files``.

        Also stores the id of the matching host as ``self.env['host_id']``.

        :raises BrowseException: when the host address is unknown or the
            lookup returns no files.
        """
        host = self.db.query(Hosts).filter_by(
            address=self.env['host']
        ).first()

        if not host:
            raise BrowseException('No host found')

        self.env['host_id'] = host.id
        files = self.findex.get_files_objects(host_id=host.id, file_path=self.env['file_path_quoted'])

        if not files:
            raise BrowseException('No files found')

        self.files = files

    def prepare_files(self, sort=None):
        """Order ``self.files`` for display and add a ``..`` entry below the root.

        :param sort: accepted but not used by this method.
        """
        # Two stable passes: alphabetical first, then folders on top.
        self.files = sorted(self.files, key=lambda k: k.file_name)
        self.files = sorted(self.files, key=lambda k: k.file_isdir, reverse=True)

        if not self.env['file_path'] == '/':  # add CDUP dirs
            x = Files(
                file_name='..', file_path='../', file_ext='', file_format=-1,
                file_isdir=True, file_modified=datetime.now(), file_perm=None, searchable=None,
                file_size=0, host=self.env['host']
            )
            setattr(x, 'file_name_human', '..')
            self.files.insert(0, x)

        self.files = self.findex.set_icons(self.files)

    def sort(self):
        """Return the summed ``file_size`` of ``self.files`` (non-recursive).

        Despite its name this method does not reorder anything. The total
        used to be computed and thrown away; it is now returned.
        """
        return sum(f.file_size for f in self.files)

    def generate_action_fetches(self):
        """Build copy-paste ``wget`` and ``lftp`` commands for the current directory.

        :returns: dict with the command strings under ``wget`` and ``lftp``.
        """
        target = ('ftp://%s' % self.env['host']) + self.env['file_path']

        # The URL sits inside a single-quoted shell word. Close the quote,
        # emit an escaped apostrophe and reopen it, so an apostrophe in the
        # host or path cannot end the argument early.
        target = target.replace("'", "'\\''")

        # Authentication options are not wired up; both extras stay empty.
        wget_extras = ''
        lftp_extras = ''

        wget = 'wget %s-r -nH --no-parent \'%s\'' % (wget_extras, target)
        lftp = 'lftp %s-e \'mirror\' \'%s\'' % (lftp_extras, target)

        return dict(wget=wget, lftp=lftp)

    def breadcrumbs(self):
        """Return the host followed by each non-empty segment of ``file_path``."""
        data = [self.env['host']]
        data += [z for z in self.env['file_path'].split('/')[1:] if z]

        return data

    def output_json(self):
        """Return one ``[D|F] <url><path><name>`` line per file, joined by newlines.

        The result is plain text, not JSON.
        """
        # NOTE(review): ``self.folder`` is never assigned in this class, and
        # prepare_files() sets ``file_name_human`` rather than
        # ``filename_human``. This method looks like a leftover from a
        # different data model -- confirm the attribute names before use.
        data = []

        for source_file in self.files:
            if source_file.filename_human == '..':
                continue

            data.append('[%s] %s%s%s' % (
                'D' if source_file.is_directory else 'F',
                self.folder.source.crawl_url,
                source_file.filepath_human,
                source_file.filename_human))

        return '\n'.join(data)
Esempio n. 10
0
class Searcher():
    """Build and run keyword searches against the ``Files`` table."""

    def __init__(self, cfg, db):
        """Store the configuration and database handle and create a ``Findex`` helper.

        :param cfg: configuration object; only stored on the instance here.
        :param db: database handle used for every query in this class.
        """
        self.cfg = cfg
        self.db = db
        self.findex = Findex(self.db)

    def _key_check(self, keyword):
        """Validate a search keyword and blank out the blocked characters.

        :param keyword: either the keyword string itself or a mapping whose
            ``'key'`` entry is a sequence with the keyword as first element.
        :returns: the keyword with ``-``, ``,``, ``+``, ``_`` and ``%``
            replaced by spaces.
        :raises SearchException: when the mapping has no usable ``'key'``
            entry or the keyword holds fewer than 4 characters once the
            surrounding whitespace is ignored.
        """
        too_short = 'Search query must contain 4 characters or more'

        if isinstance(keyword, dict):
            if 'key' not in keyword or not keyword['key']:
                raise SearchException(too_short)

            keyword = keyword['key'][0]

        for blocked in ('-', ',', '+', '_', '%'):
            keyword = keyword.replace(blocked, ' ')

        # Blocked characters become spaces, so a query such as "----" would
        # pass a plain length check as four blanks. Measure the keyword
        # without its surrounding padding; the returned value is unchanged.
        if len(keyword.strip()) < 4:
            raise SearchException(too_short)

        return keyword

    def search(self, vars):
        """Run a search described by a mapping of request parameters.

        Only the entries ``key``, ``protocols``, ``hosts``, ``cats``,
        ``exts``, ``size``, ``path`` and ``host`` are read; each is expected
        to be a list of strings. At most 600 rows are fetched.

        :param vars: mapping of parameter names to lists of values.
        :returns: dict with ``sdata`` (the filters that were applied plus a
            ``filtered`` flag), ``results`` (``data`` rows and
            ``load_dbtime`` in seconds) and ``key`` (the escaped keyword).
        :raises SearchException: when the keyword is rejected by
            ``_key_check`` or none of the requested hosts can be found.
        :raises ValueError: propagated from ``int()`` when a category, size
            or host id is not numeric.
        """
        val = self._key_check(vars).lower()

        filtered = False
        start_dbtime = datetime.now()

        # to-do: move this to API (or make api.py use this class)
        q = self.db.query(Files)

        # Name of the attribute the fetched rows are sorted on (descending);
        # an empty string keeps the order returned by the database.
        sort = ''

        sdata = {
            'protocols': [],
            'hosts': [],
            'exts': [],
            'cats': [],
            'fsize': 0
        }

        if 'protocols' in vars:
            plookup = {'ftp': 0, 'http': 1, 'smb': 2}

            protocols_ids = []
            for p in [z.lower() for z in vars['protocols']]:
                if p in plookup and plookup[p] not in protocols_ids:
                    protocols_ids.append(plookup[p])

            # Unknown protocol names are ignored; if none is recognised the
            # list stays empty.
            if protocols_ids:
                sdata['protocols'] = protocols_ids
        else:
            sdata['protocols'] = [0, 1, 2]

        if 'hosts' in vars:
            dhosts = vars['hosts']

            # An empty list carries no host restriction (indexing it used to
            # raise IndexError); a leading '*' means "all hosts".
            if isinstance(dhosts, list) and dhosts and dhosts[0] != '*':
                host_ids = []
                for host in dhosts:
                    host_results = self.db.query(Hosts).filter(
                        Hosts.address == host).filter(
                            Hosts.protocol.in_(sdata['protocols'])).all()
                    host_ids.extend(
                        host_result.id for host_result in host_results)

                if not host_ids:
                    raise SearchException(
                        'Could not find any host entries for specified host(s)'
                    )

                sdata['hosts'] = host_ids

        if sdata['hosts']:
            q = q.filter(Files.host_id.in_(sdata['hosts']))
            filtered = True

        if 'cats' in vars:
            clookup = {
                'unknown': 0,
                'documents': 1,
                'movies': 2,
                'music': 3,
                'pictures': 4
            }

            # Categories may be given by name or directly as a numeric id.
            dformats = [
                clookup[cat] if cat in clookup else int(cat)
                for cat in [z.lower() for z in vars['cats']]
            ]

            q = q.filter(Files.file_format.in_(dformats))
            sdata['cats'].extend(dformats)
        else:
            sdata['cats'] = [0, 1, 2, 3, 4]

        # Leaving out any of the five known categories counts as filtering.
        if any(i not in sdata['cats'] for i in (0, 1, 2, 3, 4)):
            filtered = True

        if 'exts' in vars:
            exts = vars['exts']

            if isinstance(exts, list):
                exts = [z.replace('.', '') for z in exts if z]

                q = q.filter(Files.file_ext.in_(exts))
                filtered = True
                sdata['exts'].extend(exts)
        elif '.' in val:
            # "name.ext" typed as the keyword: search on the name part and
            # filter on the extension.
            name_part, ext_part = val.split('.', 1)
            ext = ext_part.replace(',', '').strip()
            q = q.filter(Files.file_ext == ext)
            sdata['exts'].append(ext)

            val = self._key_check(name_part)
            filtered = True

        if 'size' in vars:
            fsize = vars['size']

            if isinstance(fsize, list):
                fsize = int(fsize[0])

                # Bucket index -> (exclusive lower, inclusive upper) bound in
                # bytes; None means no upper bound. The upper bound is
                # inclusive so a file exactly on a boundary lands in one
                # bucket instead of none. Any other index applies no filter.
                sizes = {
                    1: (0, 8388600),
                    2: (8388600, 134220000),
                    3: (134220000, 536870912),
                    4: (536870912, 2147483648),
                    5: (2147483648, 8589934592),
                    6: (8589934592, None)
                }

                if fsize in sizes:
                    low, high = sizes[fsize]
                    q = q.filter(Files.file_size > low)
                    if high is not None:
                        q = q.filter(Files.file_size <= high)
                    filtered = True

                sdata['fsize'] = fsize
                sort = 'file_size'

        if 'path' in vars:
            path = vars['path']

            if isinstance(path, list):
                path = path[0]

                if len(path) > 3:
                    path = quote_plus(path)
                    q = q.filter(Files.file_path.like(path + '%'))
                    filtered = True

        if 'host' in vars:
            host = vars['host']

            if isinstance(host, list):
                host = int(host[0])
                q = q.filter(Files.host_id == host)
                filtered = True

        q = q.filter(Files.searchable.like('%' + val + '%')).limit(600)

        results = {}
        results['data'] = q.all()
        results['load_dbtime'] = (datetime.now() -
                                  start_dbtime).total_seconds()

        if sort:
            # Sorting happens in Python, on the rows already limited above.
            results['data'] = sorted(results['data'],
                                     key=lambda k: getattr(k, sort),
                                     reverse=True)

        # to-do: dont do this here
        for r in results['data']:
            host = self.findex.get_host_objects(r.host_id)
            setattr(r, 'host', host)

        results['data'] = self.findex.set_humanize(results['data'])
        results['data'] = self.findex.set_icons(results['data'])
        sdata['filtered'] = filtered

        return {
            'sdata': sdata,
            'results': results,
            'key': jinja2.escape(vars['key'][0])
        }