def tag_copier(path, cliargs):
    """This is the tag copier worker function.
    It gets a path from the Queue and searches index for the
    same path and copies any existing tags (from index2)
    Updates index's doc's tag and tag_custom fields.
    """

    doclist = []

    # doc search (matching path) in index for existing tags from index2
    # filename
    f = os.path.basename(path[0])
    # parent path
    p = os.path.abspath(os.path.join(path[0], os.pardir))

    data = {
        "size": 1,
        "_source": ['tag', 'tag_custom'],
        "query": {
            "query_string": {
                "query":
                "filename: \"" + f + "\" AND path_parent: \"" + p + "\""
            }
        }
    }

    # check if file or directory
    if path[3] == 'directory':
        # search ES
        res = es.search(index=cliargs['index'],
                        doc_type='directory',
                        body=data,
                        request_timeout=config['es_timeout'])
    else:
        res = es.search(index=cliargs['index'],
                        doc_type='file',
                        body=data,
                        request_timeout=config['es_timeout'])

    # mark task done and return if no matching path in index
    if len(res['hits']['hits']) == 0:
        return True

    # existing tag in index2
    docid = res['hits']['hits'][0]['_id']

    # update tag and tag_custom fields in index
    d = {
        '_op_type': 'update',
        '_index': cliargs['index'],
        '_type': path[3],
        '_id': docid,
        'doc': {
            'tag': path[1],
            'tag_custom': path[2]
        }
    }
    doclist.append(d)

    index_bulk_add(es, doclist, config, cliargs)
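
# Illustrative usage sketch (not part of the original module): tag_copier above
# reads path[0]..path[3], so each queue item is assumed to be a
# (path, tag, tag_custom, doctype) tuple. The rq Queue q, sample path and tag
# values below are placeholders for illustration only.
def enqueue_tag_copy_example(q, cliargs):
    """Illustrative only: enqueue one tag-copy job on an assumed rq Queue q."""
    sample = ('/data/projects/report.txt', 'archive', ['finance'], 'file')
    q.enqueue(tag_copier, args=(sample, cliargs))
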
def get_metadata(path, cliargs):
    """Look up path in index2 and return the matching directory doc's _source
    (or "" if not found) plus a list of _source dicts for all file docs whose
    path_parent is that path, collected via the scroll API.
    """
    dir_source = ""
    filename = escape_chars(os.path.basename(path))
    parent_dir = escape_chars(os.path.abspath(os.path.join(path, os.pardir)))
    fullpath = escape_chars(os.path.abspath(path))

    data = {
        "size": 1,
        "query": {
            "query_string": {
                "query":
                "filename: " + filename + " AND path_parent: " + parent_dir
            }
        }
    }
    res = es.search(index=cliargs['index2'],
                    doc_type='directory',
                    body=data,
                    request_timeout=config['es_timeout'])
    try:
        dir_source = res['hits']['hits'][0]['_source']
    except IndexError:
        pass

    data = {"query": {"query_string": {"query": "path_parent: " + fullpath}}}
    files_source = []
    res = es.search(index=cliargs['index2'],
                    doc_type='file',
                    scroll='1m',
                    size=config['es_scrollsize'],
                    body=data,
                    request_timeout=config['es_timeout'])

    while res['hits']['hits']:
        for hit in res['hits']['hits']:
            files_source.append(hit['_source'])
        # get es scroll id
        scroll_id = res['_scroll_id']
        # use es scroll api
        res = es.scroll(scroll_id=scroll_id,
                        scroll='1m',
                        request_timeout=config['es_timeout'])

    return dir_source, files_source
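
# Illustrative usage (not part of the original module); the sample path is a
# placeholder:
#
#   dir_source, files_source = get_metadata('/data/projects', cliargs)
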
def calc_dir_size(dirlist, cliargs):
    """This is the calculate directory size worker function.
    It gets a directory list from the Queue search ES for all 
    files in each directory (recursive) and sums their filesizes 
    to create a total filesize and item count for each dir, 
    then pdates dir doc's filesize and items fields.
    """
    doclist = []

    for path in dirlist:
        totalsize = 0
        totalitems = 1  # 1 for itself
        totalitems_files = 0
        totalitems_subdirs = 0
        # file doc search with aggregate for sum filesizes
        # escape special characters
        newpath = escape_chars(path[1])
        # create wildcard string and check for / (root) path
        if newpath == '\/':
            newpathwildcard = '\/*'
        else:
            newpathwildcard = newpath + '\/*'

        # check if / (root) path
        if newpath == '\/':
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        "query": "path_parent: " + newpath + "*",
                        "analyze_wildcard": "true"
                    }
                },
                "aggs": {
                    "total_size": {
                        "sum": {
                            "field": "filesize"
                        }
                    }
                }
            }
        else:
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        'query': 'path_parent: ' + newpath + ' OR path_parent: ' + newpathwildcard,
                        'analyze_wildcard': 'true'
                    }
                },
                "aggs": {
                    "total_size": {
                        "sum": {
                            "field": "filesize"
                        }
                    }
                }
            }

        # search ES and start scroll
        res = es.search(index=cliargs['index'], doc_type='file', body=data,
                        request_timeout=config['es_timeout'])

        # total items sum
        totalitems_files += res['hits']['total']

        # total file size sum
        totalsize += res['aggregations']['total_size']['value']

        # directory doc search (subdirs)

        # check if / (root) path
        if newpath == '\/':
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        "query": "path_parent: " + newpath + "*",
                        "analyze_wildcard": "true"
                    }
                }
            }
        else:
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        'query': 'path_parent: ' + newpath + ' OR path_parent: ' + newpathwildcard,
                        'analyze_wildcard': 'true'
                    }
                }
            }

        # search ES and start scroll
        res = es.search(index=cliargs['index'], doc_type='directory', body=data,
                        request_timeout=config['es_timeout'])

        # total items sum
        totalitems_subdirs += res['hits']['total']

        # total items
        totalitems += totalitems_files + totalitems_subdirs

        # update filesize and items fields for directory (path) doc
        d = {
            '_op_type': 'update',
            '_index': cliargs['index'],
            '_type': 'directory',
            '_id': path[0],
            'doc': {'filesize': totalsize, 'items': totalitems,
                    'items_files': totalitems_files,
                    'items_subdirs': totalitems_subdirs}
        }
        # add total cost per gb to doc
        if cliargs['costpergb']:
            d = cost_per_gb(d, path[1], path[2], path[3], path[4], 'directory')
        doclist.append(d)

    index_bulk_add(es, doclist, config, cliargs)
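
# Worked example (illustrative, not part of the original module): for a
# directory '/data/projects', and assuming escape_chars() backslash-escapes
# the '/' characters for Lucene query_string syntax, calc_dir_size above
# builds roughly:
#
#   newpath         -> '\/data\/projects'
#   newpathwildcard -> '\/data\/projects\/*'
#   query           -> 'path_parent: \/data\/projects OR path_parent: \/data\/projects\/*'
#
# i.e. it matches docs directly under the path plus docs in any subdirectory,
# and the total_size aggregation sums their filesize fields.
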
def calc_hot_dirs(dirlist, cliargs):
    """This is the calculate hotdirs worker function.
    It gets a directory list from the Queue, iterates over the path list
    and searches index2 for the same path and calculates change percent
    between the two. If path not in index2, change percent is 100%.
    Updates index's directory doc's change_percent fields.
    """
    doclist = []

    for path in dirlist:
        # doc search (matching path) in index2
        # filename
        f = os.path.basename(path[1])
        # parent path
        p = os.path.abspath(os.path.join(path[1], os.pardir))

        data = {
            "size": 1,
            "_source": ['filesize', 'items', 'items_files', 'items_subdirs'],
            "query": {
                "query_string": {
                    "query": "filename: \"" + f + "\" AND path_parent: \"" + p + "\""
                }
            }
        }

        # search ES
        res = es.search(index=cliargs['hotdirs'], doc_type='directory', body=data,
                        request_timeout=config['es_timeout'])

        # calculate change percent

        # set change percent to 100% if no matching path in index2
        if len(res['hits']['hits']) == 0:
            changepercent_filesize = 100.0
            changepercent_items = 100.0
            changepercent_items_files = 100.0
            changepercent_items_subdirs = 100.0
        else:
            source = res['hits']['hits'][0]['_source']
            # ((new - old) / old) * 100
            try:
                # check if path size in index2 was 0 bytes and set change percent to 100%
                if path[2] > 0 and source['filesize'] == 0:
                    changepercent_filesize = 100.0
                else:
                    changepercent_filesize = round(((path[2] - source['filesize'])
                                                    / source['filesize']) * 100.0, 2)
            except ZeroDivisionError:
                changepercent_filesize = 0.0
            try:
                # check if path items in index2 was 0 and set change percent to 100%
                if path[3] > 0 and source['items'] == 0:
                    changepercent_items = 100.0
                else:
                    changepercent_items = round(((path[3] - source['items'])
                                                 / source['items']) * 100.0, 2)
            except ZeroDivisionError:
                changepercent_items = 0.0
            try:
                # check if path file items in index2 was 0 and set change percent to 100%
                if path[4] > 0 and source['items_files'] == 0:
                    changepercent_items_files = 100.0
                else:
                    changepercent_items_files = round(((path[4] - source['items_files'])
                                                       / source['items_files']) * 100.0, 2)
            except ZeroDivisionError:
                changepercent_items_files = 0.0
            try:
                # check if path subdir items in index2 was 0 and set change percent to 100%
                if path[5] > 0 and source['items_subdirs'] == 0:
                    changepercent_items_subdirs = 100.0
                else:
                    changepercent_items_subdirs = round(((path[5] - source['items_subdirs'])
                                                         / source['items_subdirs']) * 100.0, 2)
            except ZeroDivisionError:
                changepercent_items_subdirs = 0.0

        # update fields in index
        d = {
            '_op_type': 'update',
            '_index': cliargs['index'],
            '_type': 'directory',
            '_id': path[0],
            'doc': {'change_percent_filesize': changepercent_filesize,
                    'change_percent_items': changepercent_items,
                    'change_percent_items_files': changepercent_items_files,
                    'change_percent_items_subdirs': changepercent_items_subdirs}
        }
        doclist.append(d)

    index_bulk_add(es, doclist, config, cliargs)
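
# A minimal standalone sketch (not part of the original module) of the
# change-percent rule calc_hot_dirs applies above: ((new - old) / old) * 100,
# with 100% when the old value was zero but the new one is not, and 0% when
# both are zero. Assumes non-negative sizes/counts, as in the fields used above.
def change_percent_sketch(new, old):
    """Illustrative helper mirroring the zero-handling in calc_hot_dirs."""
    if old == 0:
        return 100.0 if new > 0 else 0.0
    return round(((new - old) / old) * 100.0, 2)
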
def es_bulk_add(worker_name, dirlist, filelist, cliargs, totalcrawltime=None):
    """Bulk add dir and file docs to ES. When chunked files are enabled, fold
    each chunked dir's crawl_time into the directory doc already in the index
    instead of indexing a duplicate. Optionally index a per-worker stats doc.
    """
    if cliargs['chunkfiles']:
        updated_dirlist = []
        # check for existing directory docs in index and update crawl time only (dirchunk)
        for d in dirlist:
            try:
                # this key determines if it's part of a chunked dir
                path = d['chunkpath']
                crawltime = d['crawl_time']
                f = os.path.basename(path)
                # parent path
                p = os.path.abspath(os.path.join(path, os.pardir))

                data = {
                    "size": 1,
                    "_source": ['crawl_time'],
                    "query": {
                        "query_string": {
                            "query":
                            "filename: \"" + f + "\" AND path_parent: \"" + p +
                            "\""
                        }
                    }
                }

                es.indices.refresh(index=cliargs['index'])
                res = es.search(index=cliargs['index'],
                                doc_type='directory',
                                body=data,
                                request_timeout=config['es_timeout'])

                if len(res['hits']['hits']) == 0:
                    continue

                docid = res['hits']['hits'][0]['_id']
                current_crawltime = res['hits']['hits'][0]['_source'][
                    'crawl_time']
                updated_crawltime = current_crawltime + crawltime

                # update crawltime in index
                d = {
                    '_op_type': 'update',
                    '_index': cliargs['index'],
                    '_type': 'directory',
                    '_id': docid,
                    'doc': {
                        'crawl_time': updated_crawltime
                    }
                }
            except KeyError:
                pass  # not part of a chunked dir

            updated_dirlist.append(d)

        dirlist = updated_dirlist

    starttime = time.time()

    docs = dirlist + filelist
    index_bulk_add(es, docs, config, cliargs)

    if not cliargs['noworkerdocs']:
        data = {
            "worker_name": worker_name,
            "dir_count": len(dirlist),
            "file_count": len(filelist),
            "bulk_time": round(time.time() - starttime, 6),
            "crawl_time": round(totalcrawltime, 6),
            "indexing_date": datetime.utcnow().isoformat()
        }
        es.index(index=cliargs['index'], doc_type='worker', body=data)
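
# Illustrative note (not part of the original module): the try/except KeyError
# in es_bulk_add above treats any dir entry that carries a 'chunkpath' key as
# part of a chunked directory and folds its crawl_time into the directory doc
# already present in the index. A chunked dir entry is therefore assumed to
# look roughly like:
#
#   {'chunkpath': '/data/projects', 'crawl_time': 0.42, ...}
#
# where the values shown are placeholders.
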
Example #6
def calc_dir_size(dirlist, cliargs):
    """This is the calculate directory size worker function.
    It gets a directory list from the Queue and searches ES for all
    subdirs in each directory (recursive) and sums their filesize and
    items fields to create a total filesize and item count for each directory doc.
    Updates directory doc's filesize and items fields.
    """

    # check if other bots are idle and throw them some jobs (dir paths)
    if len(dirlist) >= cliargs['batchsize']:
        workers_idle = 0
        half_workers_idle = False
        workers = Worker.all(connection=redis_conn)
        num_workers = len(workers)
        for w in workers:
            if w._state == "idle":
                workers_idle += 1
            if workers_idle > num_workers // 2:
                half_workers_idle = True
                break
        q_len = len(q_calc)
        if q_len == 0 and half_workers_idle:
            # take half the paths randomly
            shuffle(dirlist)
            n = len(dirlist) // 2
            tossdirs = dirlist[:n]
            dirlist = dirlist[n:]
            q_crawl.enqueue(calc_dir_size, args=(
                tossdirs,
                cliargs,
            ))

    doclist = []
    for path in dirlist:
        totalitems = 1  # 1 for itself
        # file doc search with aggregate for sum filesizes
        # escape special characters
        newpath = escape_chars(path[1])
        parentpath = escape_chars(
            os.path.abspath(os.path.join(path[1], os.pardir)))
        pathbasename = escape_chars(os.path.basename(path[1]))

        # create wildcard string and check for / (root) path
        if newpath == '\/':
            newpathwildcard = '\/*'
        else:
            newpathwildcard = newpath + '\/*'

        # check if / (root) path
        if newpath == '\/':
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        "query": "path_parent: " + newpath + "*",
                        "analyze_wildcard": "true"
                    }
                },
                "aggs": {
                    "total_size": {
                        "sum": {
                            "field": "filesize"
                        }
                    }
                }
            }
        else:
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        'query':
                        '(path_parent: ' + parentpath + ' AND filename: ' +
                        pathbasename + ') OR path_parent: ' + newpath +
                        ' OR path_parent: ' + newpathwildcard,
                        'analyze_wildcard':
                        'true'
                    }
                },
                "aggs": {
                    "total_size": {
                        "sum": {
                            "field": "filesize"
                        }
                    },
                    "total_files": {
                        "sum": {
                            "field": "items_files"
                        }
                    },
                    "total_subdirs": {
                        "sum": {
                            "field": "items_subdirs"
                        }
                    }
                }
            }

        # search ES and start scroll for all directory doc search (subdirs)
        res = es.search(index=cliargs['index'],
                        doc_type='directory',
                        body=data,
                        request_timeout=config['es_timeout'])

        # total file size sum
        totalsize = res['aggregations']['total_size']['value']

        # total items sum for all subdirs count
        totalitems_subdirs = res['aggregations']['total_subdirs']['value']

        # total items sum for all files count
        totalitems_files = res['aggregations']['total_files']['value']

        totalitems += totalitems_subdirs + totalitems_files

        # update filesize and items fields for directory (path) doc
        d = {
            '_op_type': 'update',
            '_index': cliargs['index'],
            '_type': 'directory',
            '_id': path[0],
            'doc': {
                'filesize': totalsize,
                'items': totalitems,
                'items_files': totalitems_files,
                'items_subdirs': totalitems_subdirs
            }
        }
        doclist.append(d)

    index_bulk_add(es, doclist, config, cliargs)
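
# A minimal standalone sketch (not part of the original module) of the
# idle-worker check used in calc_dir_size above, assuming the rq library's
# Worker.all() API and a Redis connection named redis_conn. It mirrors the
# ">50% of workers idle" threshold but uses rq's public get_state() instead
# of the private _state attribute.
def more_than_half_workers_idle_sketch(redis_conn):
    """Illustrative only: return True when more than half of the rq workers
    registered on redis_conn report an idle state."""
    from rq import Worker  # assumed importable, as in the worker code above
    workers = Worker.all(connection=redis_conn)
    if not workers:
        return False
    idle = sum(1 for w in workers if w.get_state() == 'idle')
    return idle > len(workers) // 2
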
Example #7
def calc_dir_size(dirlist, cliargs):
    """This is the calculate directory size worker function.
    It gets a directory list from the Queue and searches ES for all
    subdirs in each directory (recursive) and sums their filesize and
    items fields to create a total filesize and item count for each directory doc.
    Updates directory doc's filesize and items fields.
    """

    doclist = []
    for path in dirlist:
        totalitems = 1  # 1 for itself
        # file doc search with aggregate for sum filesizes
        # escape special characters
        newpath = escape_chars(path[1])
        parentpath = escape_chars(
            os.path.abspath(os.path.join(path[1], os.pardir)))
        pathbasename = escape_chars(os.path.basename(path[1]))

        # create wildcard string and check for / (root) path
        if newpath == '\/':
            newpathwildcard = '\/*'
        else:
            newpathwildcard = newpath + '\/*'

        # check if / (root) path
        if newpath == '\/':
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        "query": "path_parent: " + newpath + "*",
                        "analyze_wildcard": "true"
                    }
                },
                "aggs": {
                    "total_size": {
                        "sum": {
                            "field": "filesize"
                        }
                    }
                }
            }
        else:
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        'query':
                        '(path_parent: ' + parentpath + ' AND filename: ' +
                        pathbasename + ') OR path_parent: ' + newpath +
                        ' OR path_parent: ' + newpathwildcard,
                        'analyze_wildcard':
                        'true'
                    }
                },
                "aggs": {
                    "total_size": {
                        "sum": {
                            "field": "filesize"
                        }
                    },
                    "total_files": {
                        "sum": {
                            "field": "items_files"
                        }
                    },
                    "total_subdirs": {
                        "sum": {
                            "field": "items_subdirs"
                        }
                    }
                }
            }

        # search ES and start scroll for all directory doc search (subdirs)
        res = es.search(index=cliargs['index'],
                        doc_type='directory',
                        body=data,
                        request_timeout=config['es_timeout'])

        # total file size sum
        totalsize = res['aggregations']['total_size']['value']

        # total items sum for all subdirs count
        totalitems_subdirs = res['aggregations']['total_subdirs']['value']

        # total items sum for all files count
        totalitems_files = res['aggregations']['total_files']['value']

        totalitems += totalitems_subdirs + totalitems_files

        # update filesize and items fields for directory (path) doc
        d = {
            '_op_type': 'update',
            '_index': cliargs['index'],
            '_type': 'directory',
            '_id': path[0],
            'doc': {
                'filesize': totalsize,
                'items': totalitems,
                'items_files': totalitems_files,
                'items_subdirs': totalitems_subdirs
            }
        }
        doclist.append(d)

    index_bulk_add(es, doclist, config, cliargs)