Example #1
    def load(self, job_path, deployment_type, listener_port, elb_name_suffix, cf_params):
        env = cf_params['env']
        region = cf_params['region']
        cluster = cf_params.get('cluster', 'lambda')
        has_ecs_service = deployment_type == 'ecs_service'

        filenames = []
        for subdir, dirs, files in os.walk(job_path):
            for fn in files:
                filenames.append(os.path.join(subdir, fn))

        pool = ThreadPool(32)
        pool.imap(self.process_cf_file, ((cf_params, cluster, elb_name_suffix, env, fn, has_ecs_service, listener_port, region)
                            for idx, fn in enumerate(filenames)), chunksize=1)
        pool.close()
        pool.join()
        logging.info("Completed update of %s" % job_path)

        contains_failure = False
        while not self.q.empty():
            job_result = self.q.get()
            logging.info(job_result)
            if "Failed" in job_result:
                contains_failure = True
        if contains_failure:
            logging.error("One or more CF stacks failed!")
            sys.exit(1)
        else:
            logging.info("All CF stacks deployed successfully!")
Example #2
    def test_synchronize(self):
        demo = LockDemo()
        pool = ThreadPool(2)
        pool.imap(demo.bar, range(2))
        sleep(0.04)
        assert_that(demo.call_count, equal_to(1))
        sleep(0.05)
        assert_that(demo.call_count, equal_to(2))
def find_suppliers_with_details(client, framework_slug):
    pool = ThreadPool(30)

    records = find_suppliers(client, framework_slug)
    records = pool.imap(partial(add_supplier_info, client), records)
    records = pool.imap(partial(add_framework_info, client, framework_slug), records)
    records = pool.imap(partial(add_submitted_draft_counts, client, framework_slug), records)

    return get_csv_rows(records, framework_slug)
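
The chained pool.imap calls above build a lazy pipeline on a single pool: each stage consumes the iterator returned by the previous one, so records stream through the helpers instead of being collected into a list between steps. A minimal sketch of the same chaining, with a hypothetical add_field stage standing in for the real helper functions:

from functools import partial
from multiprocessing.pool import ThreadPool

def add_field(field_name, record):
    # Hypothetical stage: annotate the record and pass it on.
    return dict(record, **{field_name: True})

pool = ThreadPool(4)
records = ({"supplier_id": i} for i in range(10))
records = pool.imap(partial(add_field, "supplier_info"), records)
records = pool.imap(partial(add_field, "framework_info"), records)
records = pool.imap(partial(add_field, "draft_counts"), records)

# Draining the last stage pulls every record through the whole pipeline.
for record in records:
    print(record)

pool.close()
pool.join()
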
Example #4
    def test_synchronize_with_same_param(self):
        demo = LockDemo()
        pool = ThreadPool(3)
        pool.imap(demo.foo2, (1, 1))
        pool.apply_async(demo.foo1)
        sleep(0.04)
        assert_that(demo.call_count, equal_to(1))
        sleep(0.05)
        assert_that(demo.call_count, equal_to(2))
        sleep(0.05)
        assert_that(demo.call_count, equal_to(3))
def find_all_labs(client):
    pool = ThreadPool(20)
    records = find_suppliers(client, FRAMEWORK_SLUG)
    records = pool.imap(add_framework_info(client, FRAMEWORK_SLUG), records)
    records = filter(lambda record: record['onFramework'], records)
    records = pool.imap(add_draft_services(client, FRAMEWORK_SLUG), records)
    services = itertools.chain.from_iterable(record['services'] for record in records)
    services = filter(
        lambda record: record['lot'] == 'user-research-studios' and record['status'] == 'submitted',
        services)

    return services
def find_suppliers_with_details(client, content_loader, framework_slug, supplier_ids=None):
    pool = ThreadPool(30)

    content_loader.load_manifest(framework_slug, 'declaration', 'declaration')
    declaration_content = content_loader.get_manifest(framework_slug, 'declaration')

    records = find_suppliers(client, framework_slug, supplier_ids)
    records = pool.imap(add_supplier_info(client), records)
    records = pool.imap(add_framework_info(client, framework_slug), records)
    records = pool.imap(add_draft_counts(client, framework_slug), records)
    records = map(add_failed_questions(declaration_content), records)

    return records
def find_services_by_lot(client, framework_slug, lot_slug):
    pool = ThreadPool(30)
    service_adder = add_draft_services(client, framework_slug,
                                       lot=lot_slug,
                                       status="submitted")

    records = find_suppliers(client, framework_slug)
    records = pool.imap(add_supplier_info(client), records)
    records = pool.imap(add_framework_info(client, framework_slug), records)
    records = pool.imap(service_adder, records)
    records = filter(lambda record: len(record["services"]) > 0, records)

    return records
Example #8
    def get_used_properties(self, set_ids=None, article_ids=None, **filters):
        """
        Returns a sequence of property names in use in the specified set(s) (or set ids)
        """
        if set_ids is not None:
            filters["sets"] = set_ids

        if article_ids is not None:
            filters["ids"] = article_ids

        all_properties = self.get_properties()
        flexible_properties = set(all_properties) - set(ALL_FIELDS)

        body = {"query": {"bool": {"must": [
            build_filter(**filters),
            {"exists": {"field": "fakeprop"}}
        ]}}}

        bodies = (copy.deepcopy(body) for _ in range(len(flexible_properties)))
        pool = ThreadPool()
        results = pool.imap(self._get_used_properties, zip(bodies, flexible_properties))

        try:
            for found, prop in zip(results, flexible_properties):
                if found:
                    yield prop
        finally:
            pool.close()
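
imap passes exactly one argument to the worker (multiprocessing.pool offers starmap but no lazy istarmap), which is why the bodies and property names are zipped into tuples above and presumably unpacked inside _get_used_properties. A small sketch of that convention, with a hypothetical probe worker:

from multiprocessing.pool import ThreadPool

def probe(args):
    body, prop = args          # the worker unpacks the bundled arguments itself
    return prop, len(body)     # stand-in for the real existence query

bodies = ['{"query": 1}', '{"query": 2}', '{"query": 3}']
props = ["author", "tags", "score"]

pool = ThreadPool()
try:
    for prop, size in pool.imap(probe, zip(bodies, props)):
        print(prop, size)
finally:
    pool.close()
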
Example #9
def run_tidy(sha="HEAD", is_rev_range=False):
    diff_cmdline = ["git", "diff" if is_rev_range else "show", sha]

    # Figure out which paths changed in the given diff.
    changed_paths = subprocess.check_output(diff_cmdline + ["--name-only", "--pretty=format:"]).splitlines()
    changed_paths = [p for p in changed_paths if p]

    # Produce a separate diff for each file and run clang-tidy-diff on it
    # in parallel.
    def tidy_on_path(path):
        patch_file = tempfile.NamedTemporaryFile()
        cmd = diff_cmdline + [
            "--src-prefix=%s/" % ROOT,
            "--dst-prefix=%s/" % ROOT,
            "--",
            path]
        subprocess.check_call(cmd, stdout=patch_file, cwd=ROOT)
        cmdline = [CLANG_TIDY_DIFF,
                   "-clang-tidy-binary", CLANG_TIDY,
                   "-p0",
                   "--",
                   "-DCLANG_TIDY"] + compile_flags.get_flags()
        return subprocess.check_output(
            cmdline,
            stdin=open(patch_file.name),
            cwd=ROOT)
    pool = ThreadPool(multiprocessing.cpu_count())
    try:
        return "".join(pool.imap(tidy_on_path, changed_paths))
    except KeyboardInterrupt as ki:
        sys.exit(1)
    finally:
        pool.terminate()
        pool.join()
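
The "".join(pool.imap(tidy_on_path, changed_paths)) call above relies on imap yielding results in input order even when the underlying tasks finish in a different order; imap_unordered would instead yield them as they complete. A quick illustration with a hypothetical worker:

import time
from multiprocessing.pool import ThreadPool

def work(n):
    time.sleep(0.01 * (5 - n))   # later inputs finish sooner
    return n

pool = ThreadPool(4)
print(list(pool.imap(work, range(5))))            # always [0, 1, 2, 3, 4]
print(list(pool.imap_unordered(work, range(5))))  # same values, in completion order
pool.close()
pool.join()
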
Example #10
    def make_requests(self, type, args, plugins=None):
        pool = ThreadPool(6)
        if not plugins:
            plugins = [p['name'] for p in self.plugins]
        reqs = [(type, p, arg) for p in plugins for arg in args]
        # not sure why returning this doesn't work
        for x in pool.imap(self._run, reqs):
            yield x
def get_reviews_from_imdb(movie_start_id, movie_end_id):
    """
    save movies reviews in storage.

    Args:
        movie_start_id: the start of the range of the movies.
        movie_end_id:   the end of the range of the movies.
    """

    thread_pool = ThreadPool()
    block_size = round((movie_end_id - movie_start_id) / MAX_THREADS)
    for results in thread_pool.imap(get_reviews_from_single_movie, xrange(movie_start_id, movie_end_id), block_size):
        if results is not None:
            storage.add_reviews(filter(lambda result: result[0] != "", results))
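
Here block_size is passed as the third positional argument of imap, chunksize: the id range is cut into chunks of that size and each chunk is handed to a worker thread as a single task, which reduces per-item dispatch overhead while results are still yielded one item at a time. A minimal sketch with a hypothetical fetch worker:

from multiprocessing.pool import ThreadPool

def fetch(movie_id):
    # Hypothetical worker standing in for get_reviews_from_single_movie.
    return movie_id % 7

pool = ThreadPool(4)
# With chunksize=25, the 100 ids are dispatched as four batches of 25 items.
for result in pool.imap(fetch, range(100), 25):
    pass
pool.close()
pool.join()
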
def run_attack(attack, pipe):
    """
    Given an attack function to run, and a Connection object through which to
    communicate, receive a network and set of fractions to remove, and simulate
    attacks on that network for each fraction of nodes. Puts S1/N back through
    the pipe for each fraction.
    """
    network = pipe.recv()
    fractions = pipe.recv()

    N = len(network)
    nodes_to_remove = [int(round(f * N)) for f in fractions]

    thread_pool = ThreadPool(5)
    results = thread_pool.imap(lambda x: attack(network, x), nodes_to_remove)

    for res in results:
        pipe.send(gc_size(res, N))
    pipe.close()
def evaluate_retrieval(model, entity_model, eval_items):
    pool = ThreadPool()

    def entity_rank((entity, word_idxs, doc_len)):
        if word_idxs:
            entity_id = entity_model.entities[entity]
            scores = entity_model.score(model,
                                        entity_model.vectors,
                                        word_idxs)
            rank = np.sum(scores >= scores[entity_id])
        else:
            rank = entity_model.vectors.shape[0]

        return int(np.log2(doc_len)), np.log2(rank)

    ranks = defaultdict(list)
    for size, rank in pool.imap(entity_rank, eval_items):
        ranks[size].append(rank)

    sorted_ranks = sorted(ranks.iteritems())
    logging.info('%s overall score: %.3f by size: %s',
                 type(entity_model).__name__,
                 np.mean(np.hstack(ranks.values())),
                 ' '.join('%d: %.3f' % (k, np.mean(v)) for k, v in sorted_ranks))
Example #14
    def get_schools_private(self, iterable):
        pool = ThreadPool(64)
        list(pool.imap(self.handler_schools_private, iterable))
Example #15
    def test_synchronize_with_different_param(self):
        demo = LockDemo()
        pool = ThreadPool(2)
        pool.imap(demo.foo2, range(2))
        sleep(0.02)
        assert_that(demo.call_count, equal_to(2))
Example #16
    def get_data_public(self):
        pool = ThreadPool(64)
        list(pool.imap(self.handler_data_public, self.__schools_links_public))
Example #17
def check_proxy(proxies):
    """Return validation array for a list of proxies."""
    pool = ThreadPool(processes=512)
    proxy_source = itertools.product(proxies, PROXY_MAP)
    return [p for p in pool.imap(_is_valid_proxy, proxy_source) if p]
Example #18
def main():

    args = parse_args()

    socket.setdefaulttimeout(args.timeout)

    couch_server = mkcouch(args.couch)

    sessions_db_name = args.sessions_db_name
    try:
        sessions_db = couch_server.create(sessions_db_name)
    except couchdb.PreconditionFailed:
        sessions_db = couch_server[sessions_db_name]

    if args.resume or args.resume is None:
        session_id = args.resume
        if session_id is None:
            current_doc = sessions_db['$current']
            session_id = current_doc['session_id']
        print('Resuming session %s' % session_id)
        session_doc = sessions_db[session_id]
        site_host = session_doc['site']
        scheme, host = scheme_and_host(site_host)
        db_name = session_doc['db_name']
        session_doc['resumed_at'] = datetime.utcnow().isoformat()
        if args.start:
            start_page_name = args.start
        else:
            start_page_name = session_doc.get('last_page_name', args.start)
        if args.desc:
            descending = True
        else:
            descending = session_doc.get('descending', False)
        sessions_db[session_id] = session_doc
    else:
        site_host = args.site
        db_name = args.db
        start_page_name = args.start
        descending = args.desc
        if not site_host:
            print('Site to scrape is not specified')
            raise SystemExit(1)
        scheme, host = scheme_and_host(site_host)
        if not db_name:
            db_name = host.replace('.', '-')
        session_id = '-'.join((db_name,
                               str(int(time.time())),
                               str(int(1000*random.random()))))
        print('Starting session %s' % session_id)
        sessions_db[session_id] = {
            'created_at': datetime.utcnow().isoformat(),
            'site': site_host,
            'db_name': db_name,
            'descending': descending
        }
        current_doc = sessions_db.get('$current', {})
        current_doc['session_id'] = session_id
        sessions_db['$current'] = current_doc


    site = mwclient.Site((scheme, host), path=args.site_path, ext=args.site_ext)

    update_siteinfo(site, couch_server, db_name)

    if args.siteinfo_only:
        return

    try:
        db = couch_server.create(db_name)
    except couchdb.PreconditionFailed:
        db = couch_server[db_name]

    set_show_func(db)

    def titles_from_args(titles):
        for title in titles:
            if title.startswith('@'):
                with open(os.path.expanduser(title[1:])) as f:
                    for line in f:
                        yield line.strip()
            else:
                yield title

    def titles_from_recent_changes(timestamp):
        changes = site.recentchanges(start=timestamp,
                                     namespace=0,
                                     show='!minor|!redirect|!anon')
        for change in changes:
            title = change.get('title')
            if title:
                doc = db.get(title)
                doc_revid = doc.get('parse', {}).get('revid') if doc else None
                revid = change.get('revid')
                if doc_revid == revid:
                    continue
                yield title

    if args.titles:
        pages = (site.Pages[title.decode('utf8')]
                 for title in titles_from_args(args.titles))
    elif args.changes_since or args.recent:
        if args.recent:
            recent_days = args.recent_days
            changes_since = datetime.strftime(
                datetime.utcnow() + timedelta(days=-recent_days),
                '%Y%m%d%H%M%S')
        else:
            changes_since = args.changes_since.ljust(14, '0')
        print('Getting recent changes (since %s)' % changes_since)
        pages = (site.Pages[title]
                 for title in titles_from_recent_changes(changes_since))
    else:
        print('Starting at %s' % start_page_name)
        pages = site.allpages(start=start_page_name,
                              dir='descending' if descending else 'ascending')

    #threads are updating the same session document,
    #we don't want to have conflicts
    lock = RLock()

    def inc_count(count_name):
        with lock:
            session_doc = sessions_db[session_id]
            count = session_doc.get(count_name, 0)
            session_doc[count_name] = count + 1
            sessions_db[session_id] = session_doc

    def update_session(title):
        with lock:
            session_doc = sessions_db[session_id]
            session_doc['last_page_name'] = title
            session_doc['updated_at'] = datetime.utcnow().isoformat()
            sessions_db[session_id] = session_doc

    def process(page):
        title = page.name
        if not page.exists:
            print('Not found: %s' % title)
            inc_count('not_found')
            if args.delete_not_found:
                try:
                    del db[title]
                except couchdb.ResourceNotFound:
                    print('%s was not in the database' % title)
                except couchdb.ResourceConflict:
                    print('Conflict while deleting %s' % title)
                else:
                    print('%s removed from the database' % title)
            return
        try:
            aliases = set()
            redirect_count = 0
            while page.redirect:
                redirect_count += 1
                redirect_target = redirects_to(site, page.name)
                frag = redirect_target.fragment
                if frag:
                    alias = (title, frag)
                else:
                    alias = title
                aliases.add(alias)

                page = redirect_target.page
                print('%s ==> %s' % (
                    title,
                    page.name + (('#'+frag) if frag else '')))

                if redirect_count >= 10:
                    print('Too many redirect levels: %r' % aliases)
                    break

                title = page.name

            if page.redirect:
                print('Failed to resolve redirect %s' % title)
                inc_count('failed_redirect')
                return

            doc = db.get(title)
            if doc:
                current_aliases = set()
                for alias in doc.get('aliases', ()):
                    if isinstance(alias, list):
                        alias = tuple(alias)
                    current_aliases.add(alias)
                if not aliases.issubset(current_aliases):
                    merged_aliases = aliases|current_aliases
                    #remove aliases without fragment if one with fragment is present
                    #this is mostly to cleanup aliases in old scrapes
                    to_remove = set()
                    for alias in merged_aliases:
                        if isinstance(alias, tuple):
                            to_remove.add(alias[0])
                    merged_aliases = merged_aliases - to_remove
                    doc['aliases'] = list(merged_aliases)
                    db[title] = doc
                revid = doc.get('parse', {}).get('revid')
                if page.revision == revid:
                    print('%s is up to date (rev. %s), skipping' %
                          (title, revid))
                    inc_count('up_to_date')
                    return
                else:
                    inc_count('updated')
                    print('New rev. %s is available for %s (have rev. %s)' %
                          (page.revision, title, revid))

            parse = site.api('parse', page=title)
        except KeyboardInterrupt as ki:
            print('Caught KeyboardInterrupt', ki)
            thread.interrupt_main()
        except couchdb.ResourceConflict:
            print('Update conflict, skipping: %s' % title)
            return
        except Exception:
            print('Failed to process %s:' % title)
            traceback.print_exc()
            inc_count('error')
            return
        if doc:
            doc.update(parse)
        else:
            inc_count('new')
            doc = parse
            if aliases:
                doc['aliases'] = list(aliases)
        try:
            db[title] = doc
        except couchdb.ResourceConflict:
            print('Update conflict, skipping: %s' % title)
            return

    import pylru
    seen = pylru.lrucache(10000)

    def ipages(pages):
        for index, page in enumerate(pages):
            title = page.name
            print('%7s %s' % (index, title))
            if title in seen:
                print('Already saw %s, skipping' % (title,))
                continue
            seen[title] = True
            update_session(title)
            yield page


    with flock(os.path.join(tempfile.gettempdir(),
                            hashlib.sha1(host).hexdigest())):
        if args.speed:
            pool = ThreadPool(processes=args.speed*2)
            for _result in pool.imap(process, ipages(pages)):
                pass

        else:
            for page in ipages(pages):
                process(page)
    def untile_image(self, output_destination):
        """
        Downloads image tiles and joins them.
        These processes are done in parallel.
        """
        self.num_tiles = self.x_tiles * self.y_tiles
        self.num_downloaded = 0
        self.num_joined = 0

        # Progressbars for downloading and joining.
        download_progressbar = None
        joining_progressbar = None
        if progressbar:
            download_progressbar = progressbar.ProgressBar(
                widgets=['Loading tiles: ',
                         progressbar.Counter(), '/', str(self.num_tiles), ' ',
                         progressbar.Bar('>', left='[', right=']'), ' ',
                         progressbar.ETA()],
                maxval=self.num_tiles
            )
            joining_progressbar = progressbar.ProgressBar(
                widgets=['Joining tiles: ',
                         progressbar.Counter(), '/', str(self.num_tiles), ' ',
                         progressbar.Bar('>', left='[', right=']'), ' ',
                         progressbar.ETA()],
                maxval=self.num_tiles
            )
            download_progressbar.start()
            if self.no_download:
                download_progressbar.finish()
                joining_progressbar.start()

        def update_progressbars():
            # Update UI info
            if progressbar:
                if self.num_downloaded < self.num_tiles:
                    download_progressbar.update(self.num_downloaded)
                elif not download_progressbar.finished:
                    download_progressbar.finish()
                    joining_progressbar.start()
                    joining_progressbar.update(self.num_joined)  # Some images are already joined!
                else:
                    joining_progressbar.update(self.num_joined)

        def local_tile_path(col, row):
            return os.path.join(self.tile_dir, "{}_{}.{}".format(col, row, self.ext))

        def download(tile_position):
            col, row = tile_position
            url = self.get_tile_url(col, row)
            destination = local_tile_path(col, row)
            if not progressbar:
                self.log.debug("Loading tile (row {:3}, col {:3})".format(row, col))
            try:
                download_url(url, destination)
            except urllib.error.HTTPError as e:
                self.num_downloaded += 1
                self.log.warning(
                    "{}. Tile {} (row {}, col {}) does not exist on the server."
                    .format(e, url, row, col)
                )
                return (None, None)
            self.num_downloaded += 1
            return tile_position

        # Download tiles in self.nthreads parallel threads.
        tile_positions = itertools.product(range(self.x_tiles), range(self.y_tiles))
        if not self.no_download:
            pool = ThreadPool(processes=self.nthreads)
            self.downloaded_iterator = pool.imap(download, tile_positions)
        else:
            self.downloaded_iterator = tile_positions
            self.num_downloaded = self.num_tiles

        def jplarge(self, joining_progressbar):
            """
            Faster untiling algorithm, assembling columns separately,
            then assembling those into the final image. Cuts down on the cost
            of constantly opening two huge final images.
            """
            # Do tile joining in parallel with the downloading.
            # Use 4 temporary files for the joining process.
            tmpimgs = []
            finalimage = []
            tempinfo = {'tmp_': tmpimgs, 'final_': finalimage}
            for i in range(2):
                for f in iter(tempinfo):
                    fhandle = tempfile.NamedTemporaryFile(suffix='.jpg', prefix=f, dir=self.tile_dir, delete=False)
                    tempinfo[f].append(fhandle.name)
                    fhandle.close()
                    self.log.debug("Created temporary image file: " + tempinfo[f][i])

            # Index of the temp image currently used for input; toggles between 0 and 1.
            active_tmp = 0
            active_final = 0

            # Join tiles into a single image in parallel to them being downloaded.
            try:
                subproc = None # Popen class of the most recently called subprocess.
                current_col = 0
                tile_in_column = 0
                for i, (col, row) in enumerate(self.downloaded_iterator):
                    if col is None:
                        self.log.debug("Missing col tile!")
                        continue # Tile failed to download.

                    if col == current_col:
                        if not progressbar:
                            self.log.debug("Adding tile (row {:3}, col {:3}) to the image".format(row, col))

                        # As the very first step create an (almost) empty temp column image,
                        # with the target column dimensions.
                        # Don't reuse old tempfile without overwriting it first -
                        # if the file is broken, we want an empty space instead of an image from previous iteration.
                        if tile_in_column == 0 and not current_col == self.x_tiles - 1:
                            subproc = subprocess.Popen([self.jpegtran,
                                '-copy', 'all',
                                '-crop', '{:d}x{:d}+0+0'.format(self.tile_size, self.height),
                                '-outfile', tmpimgs[active_tmp],
                                local_tile_path(col, row)
                            ])
                            subproc.wait()
                        # Last column may have different width - create tempfile with correct dimensions
                        elif tile_in_column == 0 and current_col == self.x_tiles - 1:
                            subproc = subprocess.Popen([self.jpegtran,
                                '-copy', 'all',
                                '-crop', '{:d}x{:d}+0+0'.format(self.width - ((self.x_tiles - 1) * self.tile_size), self.height),
                                '-outfile', tmpimgs[active_tmp],
                                local_tile_path(col, row)
                            ])
                            subproc.wait()
                        # Not working on a complete column - just keep adding images.
                        else:
                            subproc = subprocess.Popen([self.jpegtran,
                                '-perfect',
                                '-copy', 'all',
                                '-drop', '+{:d}+{:d}'.format(0, row * self.tile_size), local_tile_path(col, row),
                                '-outfile', tmpimgs[active_tmp],
                                tmpimgs[(active_tmp + 1) % 2]
                            ])
                            subproc.wait()

                        self.num_joined += 1
                        update_progressbars()

                        # After untiling of a first column,
                        # create a full sized temp image with the just untiled column
                        if tile_in_column == self.y_tiles - 1 and current_col == 0:
                            subproc = subprocess.Popen([self.jpegtran,
                                '-perfect',
                                '-copy', 'all',
                                '-crop', '{:d}x{:d}+0+0'.format(self.width, self.height),
                                '-outfile', finalimage[active_final],
                                tmpimgs[active_tmp]
                            ])
                            subproc.wait()
                            current_col += 1
                            tile_in_column = 0
                            active_final = (active_final + 1) % 2
                            active_tmp = (active_tmp + 1) % 2
                        # Drop the just-untiled column (other than the first) into the full sized temp image.
                        elif tile_in_column == self.y_tiles - 1 and not current_col == 0:
                            subproc = subprocess.Popen([self.jpegtran,
                                '-perfect',
                                '-copy', 'all',
                                '-drop', '+{:d}+{:d}'.format(current_col * self.tile_size, 0), tmpimgs[active_tmp],
                                '-outfile', finalimage[active_final],
                                finalimage[(active_final + 1) % 2]
                            ])
                            subproc.wait()
                            current_col += 1
                            tile_in_column = 0
                            active_final = (active_final + 1) % 2
                            active_tmp = (active_tmp + 1) % 2
                        # No column completely untiled, keep working
                        else:
                            tile_in_column += 1
                            active_tmp = (active_tmp + 1) % 2  # toggle between the two temp images

                # Optimize the final image and write it to the destination
                subproc = subprocess.Popen([self.jpegtran,
                    '-copy', 'all',
                    '-optimize',
                    '-outfile', output_destination,
                    finalimage[(active_final + 1) % 2]
                ])
                subproc.wait()

                num_missing = self.num_tiles - self.num_joined
                if num_missing > 0:
                    self.log.warning(
                        "Image '{3}' is missing {0} tile{1}. "
                        "You might want to download the image at a different zoom level "
                        "(currently {2}) to get the missing part{1}."
                        .format(num_missing, '' if num_missing == 1 else 's', self.zoom_level,
                                output_destination)
                    )
                if progressbar and joining_progressbar.start_time is not None:
                    joining_progressbar.finish()

            except KeyboardInterrupt:
                # Kill the jpegtran subprocess.
                if subproc and subproc.poll() is None:
                    subproc.kill()
                raise
            finally:
                #Delete the temporary images.
                for i in range(2):
                    os.unlink(tmpimgs[i])
                    os.unlink(finalimage[i])

        # Select untiling algorithm
        # if self.algorithm == 'jt_xl':
            # jplarge(self, joining_progressbar)
        # elif self.algorithm == 'jt_std':
            # jpstandard(self, joining_progressbar)

        jplarge(self, joining_progressbar)
def process_batch_results(options):

    ppresults = PostProcessingResults()

    ##%% Expand some options for convenience

    output_dir = options.output_dir

    ##%% Prepare output dir

    os.makedirs(output_dir, exist_ok=True)

    ##%% Load ground truth if available

    ground_truth_indexed_db = None

    if options.ground_truth_json_file and len(
            options.ground_truth_json_file) > 0:

        ground_truth_indexed_db = IndexedJsonDb(
            options.ground_truth_json_file,
            b_normalize_paths=True,
            filename_replacements=options.ground_truth_filename_replacements)

        # Mark images in the ground truth as positive or negative
        n_negative, n_positive, n_unknown, n_ambiguous = mark_detection_status(
            ground_truth_indexed_db,
            negative_classes=options.negative_classes,
            unknown_classes=options.unlabeled_classes)
        print(
            'Finished loading and indexing ground truth: {} negative, {} positive, {} unknown, {} ambiguous'
            .format(n_negative, n_positive, n_unknown, n_ambiguous))

    ##%% Load detection results

    if options.api_detection_results is None:
        detection_results, other_fields = load_api_results(
            options.api_output_file,
            normalize_paths=True,
            filename_replacements=options.api_output_filename_replacements)
        ppresults.api_detection_results = detection_results
        ppresults.api_other_fields = other_fields

    else:
        print('Bypassing detection results loading...')
        assert options.api_other_fields is not None
        detection_results = options.api_detection_results
        other_fields = options.api_other_fields

    detection_categories_map = other_fields['detection_categories']
    if 'classification_categories' in other_fields:
        classification_categories_map = other_fields[
            'classification_categories']
    else:
        classification_categories_map = {}

    # Add a column (pred_detection_label) to indicate predicted detection status, not separating out the classes
    if options.include_almost_detections:
        detection_results['pred_detection_label'] = DetectionStatus.DS_ALMOST
        confidences = detection_results['max_detection_conf']
        detection_results.loc[
            confidences >= options.confidence_threshold,
            'pred_detection_label'] = DetectionStatus.DS_POSITIVE
        detection_results.loc[
            confidences < options.almost_detection_confidence_threshold,
            'pred_detection_label'] = DetectionStatus.DS_NEGATIVE
    else:
        detection_results['pred_detection_label'] = \
        np.where(detection_results['max_detection_conf'] >= options.confidence_threshold,
                 DetectionStatus.DS_POSITIVE, DetectionStatus.DS_NEGATIVE)

    n_positives = sum(detection_results['pred_detection_label'] ==
                      DetectionStatus.DS_POSITIVE)
    print(
        'Finished loading and preprocessing {} rows from detector output, predicted {} positives'
        .format(len(detection_results), n_positives))

    if options.include_almost_detections:
        n_almosts = sum(detection_results['pred_detection_label'] ==
                        DetectionStatus.DS_ALMOST)
        print('...and {} almost-positives'.format(n_almosts))

    ##%% If we have ground truth, remove images we can't match to ground truth

    if ground_truth_indexed_db is not None:

        b_match = [False] * len(detection_results)

        detector_files = detection_results['file'].tolist()

        # fn = detector_files[0]; print(fn)
        for i_fn, fn in enumerate(detector_files):

            # assert fn in ground_truth_indexed_db.filename_to_id, 'Could not find ground truth for row {} ({})'.format(i_fn,fn)
            if fn in ground_truth_indexed_db.filename_to_id:
                b_match[i_fn] = True

        print('Confirmed filename matches to ground truth for {} of {} files'.
              format(sum(b_match), len(detector_files)))

        detection_results = detection_results[b_match]
        detector_files = detection_results['file'].tolist()

        assert len(
            detector_files
        ) > 0, 'No detection files available, possible ground truth path issue?'

        print('Trimmed detection results to {} files'.format(
            len(detector_files)))

    ##%% Sample images for visualization

    images_to_visualize = detection_results

    if options.num_images_to_sample > 0 and options.num_images_to_sample <= len(
            detection_results):

        images_to_visualize = images_to_visualize.sample(
            options.num_images_to_sample, random_state=options.sample_seed)

    output_html_file = ''

    style_header = """<head>
        <style type="text/css">
        <!--
        a { text-decoration:none; }
        body { font-family:segoe ui, calibri, "trebuchet ms", verdana, arial, sans-serif; }
        div.contentdiv { margin-left:20px; }
        -->
        </style>
        </head>"""

    ##%% Fork here depending on whether or not ground truth is available

    # If we have ground truth, we'll compute precision/recall and sample tp/fp/tn/fn.
    #
    # Otherwise we'll just visualize detections/non-detections.

    if ground_truth_indexed_db is not None:

        ##%% Detection evaluation: compute precision/recall

        # numpy array of detection probabilities
        p_detection = detection_results['max_detection_conf'].values
        n_detections = len(p_detection)

        # numpy array of bools (0.0/1.0), and -1 as null value
        gt_detections = np.zeros(n_detections, dtype=float)

        for i_detection, fn in enumerate(detector_files):
            image_id = ground_truth_indexed_db.filename_to_id[fn]
            image = ground_truth_indexed_db.image_id_to_image[image_id]
            detection_status = image['_detection_status']

            if detection_status == DetectionStatus.DS_NEGATIVE:
                gt_detections[i_detection] = 0.0
            elif detection_status == DetectionStatus.DS_POSITIVE:
                gt_detections[i_detection] = 1.0
            else:
                gt_detections[i_detection] = -1.0

        # Don't include ambiguous/unknown ground truth in precision/recall analysis
        b_valid_ground_truth = gt_detections >= 0.0

        p_detection_pr = p_detection[b_valid_ground_truth]
        gt_detections_pr = gt_detections[b_valid_ground_truth]

        print('Including {} of {} values in p/r analysis'.format(
            np.sum(b_valid_ground_truth), len(b_valid_ground_truth)))

        precisions, recalls, thresholds = precision_recall_curve(
            gt_detections_pr, p_detection_pr)

        # For completeness, include the result at a confidence threshold of 1.0
        thresholds = np.append(thresholds, [1.0])

        precisions_recalls = pd.DataFrame(
            data={
                'confidence_threshold': thresholds,
                'precision': precisions,
                'recall': recalls
            })

        # Compute and print summary statistics
        average_precision = average_precision_score(gt_detections_pr,
                                                    p_detection_pr)
        print('Average precision: {:.1%}'.format(average_precision))

        # Thresholds go up throughout precisions/recalls/thresholds; find the last
        # value where recall is at or above target.  That's our precision @ target recall.
        target_recall = 0.9
        b_above_target_recall = np.where(recalls >= target_recall)
        if not np.any(b_above_target_recall):
            precision_at_target_recall = 0.0
        else:
            i_target_recall = np.argmax(b_above_target_recall)
            precision_at_target_recall = precisions[i_target_recall]
        print('Precision at {:.1%} recall: {:.1%}'.format(
            target_recall, precision_at_target_recall))

        cm = confusion_matrix(
            gt_detections_pr,
            np.array(p_detection_pr) > options.confidence_threshold)

        # Flatten the confusion matrix
        tn, fp, fn, tp = cm.ravel()

        precision_at_confidence_threshold = tp / (tp + fp)
        recall_at_confidence_threshold = tp / (tp + fn)
        f1 = 2.0 * (precision_at_confidence_threshold * recall_at_confidence_threshold) / \
            (precision_at_confidence_threshold + recall_at_confidence_threshold)

        print(
            'At a confidence threshold of {:.1%}, precision={:.1%}, recall={:.1%}, f1={:.1%}'
            .format(options.confidence_threshold,
                    precision_at_confidence_threshold,
                    recall_at_confidence_threshold, f1))

        ##%% Collect classification results, if they exist

        classifier_accuracies = []

        # Mapping of classnames to idx for the confusion matrix.
        #
        # The lambda is actually kind of a hack, because we assume that
        # the following code does not reassign classname_to_idx
        classname_to_idx = collections.defaultdict(
            lambda: len(classname_to_idx))

        # Confusion matrix as defaultdict of defaultdict
        #
        # Rows / first index is ground truth, columns / second index is predicted category
        classifier_cm = collections.defaultdict(
            lambda: collections.defaultdict(lambda: 0))

        # iDetection = 0; fn = detector_files[iDetection]; print(fn)
        assert len(detector_files) == len(detection_results)
        for iDetection, fn in enumerate(detector_files):

            image_id = ground_truth_indexed_db.filename_to_id[fn]
            image = ground_truth_indexed_db.image_id_to_image[image_id]
            detections = detection_results['detections'].iloc[iDetection]
            pred_class_ids = [det['classifications'][0][0] \
                for det in detections if 'classifications' in det.keys()]
            pred_classnames = [
                classification_categories_map[pd] for pd in pred_class_ids
            ]

            # If this image has classification predictions, and an unambiguous class
            # annotated, and is a positive image...
            if len(pred_classnames) > 0 \
                    and '_unambiguous_category' in image.keys() \
                    and image['_detection_status'] == DetectionStatus.DS_POSITIVE:

                # The unambiguous category, we make this a set for easier handling afterward
                gt_categories = set([image['_unambiguous_category']])
                pred_categories = set(pred_classnames)

                # Compute the accuracy as intersection over union,
                # i.e. (# of categories in both prediction and GT)
                #      divided by (# of categories in either prediction or GT)
                #
                # In case of only one GT category, the result will be 1.0, if
                # prediction is one category and this category matches GT
                #
                # It is 1.0/(# of predicted top-1 categories), if the GT is
                # one of the predicted top-1 categories.
                #
                # It is 0.0, if none of the predicted categories is correct

                classifier_accuracies.append(
                    len(gt_categories & pred_categories) /
                    len(gt_categories | pred_categories))
                image['_classification_accuracy'] = classifier_accuracies[-1]

                # Distribute this accuracy across all predicted categories in the
                # confusion matrix
                assert len(gt_categories) == 1
                gt_class_idx = classname_to_idx[list(gt_categories)[0]]
                for pred_category in pred_categories:
                    pred_class_idx = classname_to_idx[pred_category]
                    classifier_cm[gt_class_idx][pred_class_idx] += 1

        # ...for each file in the detection results

        # If we have classification results
        if len(classifier_accuracies) > 0:

            # Build confusion matrix as array from classifier_cm
            all_class_ids = sorted(classname_to_idx.values())
            classifier_cm_array = np.array(
                [[classifier_cm[r_idx][c_idx] for c_idx in all_class_ids]
                 for r_idx in all_class_ids],
                dtype=float)
            classifier_cm_array /= (
                classifier_cm_array.sum(axis=1, keepdims=True) + 1e-7)

            # Print some statistics
            print("Finished computation of {} classification results".format(
                len(classifier_accuracies)))
            print("Mean accuracy: {}".format(np.mean(classifier_accuracies)))

            # Prepare confusion matrix output

            # Get confusion matrix as string
            sio = io.StringIO()
            np.savetxt(sio, classifier_cm_array * 100, fmt='%5.1f')
            cm_str = sio.getvalue()
            # Get fixed-size classname for each idx
            idx_to_classname = {v: k for k, v in classname_to_idx.items()}
            classname_list = [
                idx_to_classname[idx]
                for idx in sorted(classname_to_idx.values())
            ]
            classname_headers = [
                '{:<5}'.format(cname[:5]) for cname in classname_list
            ]

            # Prepend class name on each line and add to the top
            cm_str_lines = [' ' * 16 + ' '.join(classname_headers)]
            cm_str_lines += [
                '{:>15}'.format(cn[:15]) + ' ' + cm_line
                for cn, cm_line in zip(classname_list, cm_str.splitlines())
            ]

            # Print formatted confusion matrix
            print("Confusion matrix: ")
            print(*cm_str_lines, sep='\n')

            # Plot confusion matrix

            # To manually add more space at bottom: plt.rcParams['figure.subplot.bottom'] = 0.1
            #
            # Add 0.5 to figsize for every class. For two classes, this will result in
            # fig = plt.figure(figsize=[4,4])
            fig = vis_utils.plot_confusion_matrix(classifier_cm_array,
                                                  classname_list,
                                                  normalize=False,
                                                  title='Confusion matrix',
                                                  cmap=plt.cm.Blues,
                                                  vmax=1.0,
                                                  use_colorbar=True,
                                                  y_label=True)
            cm_figure_relative_filename = 'confusion_matrix.png'
            cm_figure_filename = os.path.join(output_dir,
                                              cm_figure_relative_filename)
            plt.savefig(cm_figure_filename)
            plt.close(fig)

        # ...if we have classification results

        ##%% Render output

        # Write p/r table to .csv file in output directory
        pr_table_filename = os.path.join(output_dir, 'prec_recall.csv')
        precisions_recalls.to_csv(pr_table_filename, index=False)

        # Write precision/recall plot to .png file in output directory
        t = 'Precision-Recall curve: AP={:0.1%}, P@{:0.1%}={:0.1%}'.format(
            average_precision, target_recall, precision_at_target_recall)
        fig = vis_utils.plot_precision_recall_curve(precisions, recalls, t)
        pr_figure_relative_filename = 'prec_recall.png'
        pr_figure_filename = os.path.join(output_dir,
                                          pr_figure_relative_filename)
        plt.savefig(pr_figure_filename)
        # plt.show(block=False)
        plt.close(fig)

        ##%% Sampling

        # Sample true/false positives/negatives with correct/incorrect top-1
        # classification and render to html

        # Accumulate html image structs (in the format expected by write_html_image_lists)
        # for each category, e.g. 'tp', 'fp', ..., 'class_bird', ...
        images_html = collections.defaultdict(lambda: [])
        # Add default entries by accessing them for the first time
        [images_html[res] for res in ['tp', 'tpc', 'tpi', 'fp', 'tn', 'fn']]
        for res in images_html.keys():
            os.makedirs(os.path.join(output_dir, res), exist_ok=True)

        image_count = len(images_to_visualize)

        # Each element will be a list of 2-tuples, with elements [collection name,html info struct]
        rendering_results = []

        # Each element will be a three-tuple with elements file,max_conf,detections
        files_to_render = []

        # Assemble the information we need for rendering, so we can parallelize without
        # dealing with Pandas
        # i_row = 0; row = images_to_visualize.iloc[0]
        for _, row in images_to_visualize.iterrows():

            # Filenames should already have been normalized to either '/' or '\'
            files_to_render.append(
                [row['file'], row['max_detection_conf'], row['detections']])

        def render_image_with_gt(file_info):

            image_relative_path = file_info[0]
            max_conf = file_info[1]
            detections = file_info[2]

            # This should already have been normalized to either '/' or '\'

            image_id = ground_truth_indexed_db.filename_to_id.get(
                image_relative_path, None)
            if image_id is None:
                print("Warning: couldn't find ground truth for image {}".format(
                    image_relative_path))
                return None

            image = ground_truth_indexed_db.image_id_to_image[image_id]
            annotations = ground_truth_indexed_db.image_id_to_annotations[
                image_id]

            gt_status = image['_detection_status']

            gt_presence = bool(gt_status)

            gt_classes = CameraTrapJsonUtils.annotations_to_classnames(
                annotations, ground_truth_indexed_db.cat_id_to_name)
            gt_class_summary = ','.join(gt_classes)

            if gt_status > DetectionStatus.DS_MAX_DEFINITIVE_VALUE:
                print(
                    'Skipping image {}, does not have a definitive ground truth status (status: {}, classes: {})'
                    .format(image_id, gt_status, gt_class_summary))
                return None

            detected = max_conf > options.confidence_threshold

            if gt_presence and detected:
                if '_classification_accuracy' not in image.keys():
                    res = 'tp'
                elif np.isclose(1, image['_classification_accuracy']):
                    res = 'tpc'
                else:
                    res = 'tpi'
            elif not gt_presence and detected:
                res = 'fp'
            elif gt_presence and not detected:
                res = 'fn'
            else:
                res = 'tn'

            display_name = '<b>Result type</b>: {}, <b>Presence</b>: {}, <b>Class</b>: {}, <b>Max conf</b>: {:0.2f}%, <b>Image</b>: {}'.format(
                res.upper(), str(gt_presence), gt_class_summary,
                max_conf * 100, image_relative_path)

            rendered_image_html_info = render_bounding_boxes(
                options.image_base_dir, image_relative_path, display_name,
                detections, res, detection_categories_map,
                classification_categories_map, options)

            image_result = None
            if len(rendered_image_html_info) > 0:
                image_result = [[res, rendered_image_html_info]]
                for gt_class in gt_classes:
                    image_result.append([
                        'class_{}'.format(gt_class), rendered_image_html_info
                    ])

            return image_result

        # ...def render_image_with_gt(file_info)

        start_time = time.time()
        if options.parallelize_rendering:
            if options.parallelize_rendering_n_cores is None:
                pool = ThreadPool()
            else:
                print('Rendering images with {} workers'.format(
                    options.parallelize_rendering_n_cores))
                pool = ThreadPool(options.parallelize_rendering_n_cores)
            rendering_results = list(
                tqdm(pool.imap(render_image_with_gt, files_to_render),
                     total=len(files_to_render)))
        else:
            # file_info = files_to_render[0]
            for file_info in tqdm(files_to_render):
                rendering_results.append(render_image_with_gt(file_info))
        elapsed = time.time() - start_time

        # Map all the rendering results in the list rendering_results into the
        # dictionary images_html
        image_rendered_count = 0
        for rendering_result in rendering_results:
            if rendering_result is None:
                continue
            image_rendered_count += 1
            for assignment in rendering_result:
                images_html[assignment[0]].append(assignment[1])

        # Prepare the individual html image files
        image_counts = prepare_html_subpages(images_html, output_dir)

        print('{} images rendered (of {})'.format(image_rendered_count,
                                                  image_count))

        # Write index.html
        all_tp_count = image_counts['tp'] + image_counts['tpc'] + image_counts[
            'tpi']
        total_count = all_tp_count + image_counts['tn'] + image_counts[
            'fp'] + image_counts['fn']

        classification_detection_results = """&nbsp;&nbsp;&nbsp;&nbsp;<a href="tpc.html">with all correct top-1 predictions (TPC)</a> ({})<br/>
           &nbsp;&nbsp;&nbsp;&nbsp;<a href="tpi.html">with one or more incorrect top-1 prediction (TPI)</a> ({})<br/>
           &nbsp;&nbsp;&nbsp;&nbsp;<a href="tp.html">without classification evaluation</a><sup>*</sup> ({})<br/>""".format(
            image_counts['tpc'], image_counts['tpi'], image_counts['tp'])

        index_page = """<html>
        {}
        <body>
        <h2>Evaluation</h2>

        <h3>Sample images</h3>
        <div style="margin-left:20px;">
        <p>A sample of {} images, annotated with detections above {:.1%} confidence.</p>
        <a href="tp.html">True positives (TP)</a> ({}) ({:0.1%})<br/>
        CLASSIFICATION_PLACEHOLDER_1
        <a href="tn.html">True negatives (TN)</a> ({}) ({:0.1%})<br/>
        <a href="fp.html">False positives (FP)</a> ({}) ({:0.1%})<br/>
        <a href="fn.html">False negatives (FN)</a> ({}) ({:0.1%})<br/>
        CLASSIFICATION_PLACEHOLDER_2
        </div>        
        """.format(style_header, image_count, options.confidence_threshold,
                   all_tp_count, all_tp_count / total_count,
                   image_counts['tn'], image_counts['tn'] / total_count,
                   image_counts['fp'], image_counts['fp'] / total_count,
                   image_counts['fn'], image_counts['fn'] / total_count)

        index_page += """
            <h3>Detection results</h3>
            <div class="contentdiv">
            <p>At a confidence threshold of {:0.1%}, precision={:0.1%}, recall={:0.1%}</p>
            <p><strong>Precision/recall summary for all {} images</strong></p><img src="{}"><br/>
            </div>
            """.format(options.confidence_threshold,
                       precision_at_confidence_threshold,
                       recall_at_confidence_threshold, len(detection_results),
                       pr_figure_relative_filename)

        if len(classifier_accuracies) > 0:
            index_page = index_page.replace('CLASSIFICATION_PLACEHOLDER_1',
                                            classification_detection_results)
            index_page = index_page.replace(
                'CLASSIFICATION_PLACEHOLDER_2',
                """<p><sup>*</sup>We do not evaluate the classification result of images 
                if the classification information is missing, if the image contains
                categories like &lsquo;empty&rsquo; or &lsquo;human&rsquo;, or if the image has multiple 
                classification labels.</p>""")
        else:
            index_page = index_page.replace('CLASSIFICATION_PLACEHOLDER_1', '')
            index_page = index_page.replace('CLASSIFICATION_PLACEHOLDER_2', '')

        if len(classifier_accuracies) > 0:
            index_page += """
                <h3>Classification results</h3>
                <div class="contentdiv">
                <p>Classification accuracy: {:.2%}<br>
                The accuracy is computed only for images with exactly one classification label.
                The accuracy of an image is computed as 1/(number of unique detected top-1 classes),
                i.e. if the model detects multiple boxes with different top-1 classes, then the accuracy
                decreases and the image is put into 'TPI'.</p>
                <p>Confusion matrix:</p>
                <p><img src="{}"></p>
                <div style='font-family:monospace;display:block;'>{}</div>
                </div>
                """.format(np.mean(classifier_accuracies),
                           cm_figure_relative_filename,
                           "<br>".join(cm_str_lines).replace(' ', '&nbsp;'))

        # Show links to each GT class
        #
        # We could do this without classification results; currently we don't.
        if len(classname_to_idx) > 0:

            index_page += '<h3>Images of specific classes</h3><br/><div class="contentdiv">'
            # Add links to all available classes
            for cname in sorted(classname_to_idx.keys()):
                index_page += "<a href='class_{0}.html'>{0}</a> ({1})<br>".format(
                    cname, len(images_html['class_{}'.format(cname)]))
            index_page += "</div>"

        # Close body and html tags
        index_page += "</body></html>"
        output_html_file = os.path.join(output_dir, 'index.html')
        with open(output_html_file, 'w') as f:
            f.write(index_page)

        print('Finished writing html to {}'.format(output_html_file))

    # ...if we have ground truth

    ##%% Otherwise, if we don't have ground truth...

    else:

        ##%% Sample detections/non-detections

        # Accumulate html image structs (in the format expected by write_html_image_lists)
        # for each category
        images_html = collections.defaultdict(lambda: [])

        # Add default entries by accessing them for the first time
        [images_html[res] for res in ['detections', 'non_detections']]
        if options.include_almost_detections:
            images_html['almost_detections']

        # Create output directories
        for res in images_html.keys():
            os.makedirs(os.path.join(output_dir, res), exist_ok=True)

        image_count = len(images_to_visualize)
        has_classification_info = False

        # Each element will be a list of 2-tuples, with elements [collection name,html info struct]
        rendering_results = []

        # Each element will be a three-tuple with elements file,max_conf,detections
        files_to_render = []

        # Assemble the information we need for rendering, so we can parallelize without
        # dealing with Pandas
        # i_row = 0; row = images_to_visualize.iloc[0]
        for _, row in images_to_visualize.iterrows():

            # Filenames should already have been normalized to either '/' or '\'
            files_to_render.append(
                [row['file'], row['max_detection_conf'], row['detections']])

        # Local function for parallelization
        def render_image_no_gt(file_info):

            image_relative_path = file_info[0]
            max_conf = file_info[1]
            detections = file_info[2]

            detection_status = DetectionStatus.DS_UNASSIGNED
            if max_conf >= options.confidence_threshold:
                detection_status = DetectionStatus.DS_POSITIVE
            else:
                if options.include_almost_detections:
                    if max_conf >= options.almost_detection_confidence_threshold:
                        detection_status = DetectionStatus.DS_ALMOST
                    else:
                        detection_status = DetectionStatus.DS_NEGATIVE
                else:
                    detection_status = DetectionStatus.DS_NEGATIVE

            if detection_status == DetectionStatus.DS_POSITIVE:
                res = 'detections'
            elif detection_status == DetectionStatus.DS_NEGATIVE:
                res = 'non_detections'
            else:
                assert detection_status == DetectionStatus.DS_ALMOST
                res = 'almost_detections'

            display_name = '<b>Result type</b>: {}, <b>Image</b>: {}, <b>Max conf</b>: {}'.format(
                res, image_relative_path, max_conf)

            rendering_options = copy.copy(options)
            if detection_status == DetectionStatus.DS_ALMOST:
                rendering_options.confidence_threshold = rendering_options.almost_detection_confidence_threshold
            rendered_image_html_info = render_bounding_boxes(
                options.image_base_dir, image_relative_path, display_name,
                detections, res, detection_categories_map,
                classification_categories_map, rendering_options)

            image_result = None
            if len(rendered_image_html_info) > 0:
                image_result = [[res, rendered_image_html_info]]
                for det in detections:
                    if 'classifications' in det:
                        top1_class = classification_categories_map[
                            det['classifications'][0][0]]
                        image_result.append([
                            'class_{}'.format(top1_class),
                            rendered_image_html_info
                        ])

            return image_result

        # ...def render_image_no_gt(file_info):

        start_time = time.time()
        if options.parallelize_rendering:
            if options.parallelize_rendering_n_cores is None:
                pool = ThreadPool()
            else:
                print('Rendering images with {} workers'.format(
                    options.parallelize_rendering_n_cores))
                pool = ThreadPool(options.parallelize_rendering_n_cores)
            rendering_results = list(
                tqdm(pool.imap(render_image_no_gt, files_to_render),
                     total=len(files_to_render)))
        else:
            for file_info in tqdm(files_to_render):
                rendering_results.append(render_image_no_gt(file_info))
        elapsed = time.time() - start_time

        # Map all the rendering results in the list rendering_results into the
        # dictionary images_html
        image_rendered_count = 0
        for rendering_result in rendering_results:
            if rendering_result is None:
                continue
            image_rendered_count += 1
            for assignment in rendering_result:
                if 'class' in assignment[0]:
                    has_classification_info = True
                images_html[assignment[0]].append(assignment[1])

        # Prepare the individual html image files
        image_counts = prepare_html_subpages(images_html, output_dir)

        print('Rendered {} images (of {}) in {} ({} per image)'.format(
            image_rendered_count, image_count,
            humanfriendly.format_timespan(elapsed),
            humanfriendly.format_timespan(elapsed / image_rendered_count)))

        # Write index.HTML
        total_images = image_counts['detections'] + image_counts[
            'non_detections']
        if options.include_almost_detections:
            total_images += image_counts['almost_detections']
        assert (total_images == image_count)

        almost_detection_string = ''
        if options.include_almost_detections:
            almost_detection_string = ' (&ldquo;almost detection&rdquo; threshold at {:.1%})'.format(
                options.almost_detection_confidence_threshold)

        index_page = """<html>{}<body>
        <h2>Visualization of results</h2>
        <p>A sample of {} images, annotated with detections above {:.1%} confidence{}.</p>
        <h3>Sample images</h3>
        <div class="contentdiv">
        <a href="detections.html">detections</a> ({}, {:.1%})<br/>
        <a href="non_detections.html">non-detections</a> ({}, {:.1%})<br/>""".format(
            style_header, image_count, options.confidence_threshold,
            almost_detection_string, image_counts['detections'],
            image_counts['detections'] / total_images,
            image_counts['non_detections'],
            image_counts['non_detections'] / total_images)

        if options.include_almost_detections:
            index_page += """<a href="almost_detections.html">almost-detections</a> ({}, {:.1%})<br/>""".format(
                image_counts['almost_detections'],
                image_counts['almost_detections'] / total_images)

        index_page += '</div>\n'

        if has_classification_info:
            index_page += "<h3>Images of detected classes</h3>"
            index_page += "<p>The same image might appear under multiple classes if multiple species were detected.</p>\n<div class='contentdiv'>\n"

            # Add links to all available classes
            for cname in sorted(classification_categories_map.values()):
                ccount = len(images_html['class_{}'.format(cname)])
                if ccount > 0:
                    index_page += "<a href='class_{}.html'>{}</a> ({})<br/>\n".format(
                        cname, cname.lower(), ccount)
            index_page += "</div>\n"

        index_page += "</body></html>"
        output_html_file = os.path.join(output_dir, 'index.html')
        with open(output_html_file, 'w') as f:
            f.write(index_page)

        print('Finished writing html to {}'.format(output_html_file))

        # os.startfile(output_html_file)

    # ...if we do/don't have ground truth

    ppresults.output_html_file = output_html_file
    return ppresults
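
The confidence-threshold bucketing used in render_image_no_gt above can be read in isolation: above options.confidence_threshold an image counts as a detection, between the two thresholds (when almost-detections are enabled) it is an almost-detection, and otherwise a non-detection. A minimal sketch of that rule, with made-up threshold values standing in for the option fields:

def bucket_for_max_conf(max_conf, confidence_threshold=0.8,
                        almost_detection_confidence_threshold=0.6,
                        include_almost_detections=True):
    """Assign an image to 'detections', 'almost_detections', or 'non_detections'."""
    if max_conf >= confidence_threshold:
        return 'detections'
    if include_almost_detections and max_conf >= almost_detection_confidence_threshold:
        return 'almost_detections'
    return 'non_detections'

assert bucket_for_max_conf(0.92) == 'detections'
assert bucket_for_max_conf(0.70) == 'almost_detections'
assert bucket_for_max_conf(0.10) == 'non_detections'
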
Exemple #21
0
 def get_cities(self):
     pool = ThreadPool(16)
     list(pool.imap(self.handler_cities, self.__get_states()))
Exemple #22
0
 def get_school_category(self):
     pool = ThreadPool(64)
     list(pool.imap(self.handler_categories, self.__cities))
Exemple #23
0
class StockExchange():
    def __init__(self, max_worker=5):
        self._status = 'close'
        self._expire_at = 0
        self._thread_pools = Pool(max_worker)

    @property
    def market_status(self):
        now = datetime.now()
        if self._expire_at < now.timestamp():
            self._update_market_status(now)
        return self._status

    @property
    def market_am_open(self):
        now = datetime.now()
        if self._expire_at < now.timestamp():
            self._update_market_status(now)
        return self._market_am_open

    @property
    def market_am_close(self):
        now = datetime.now()
        if self._expire_at < now.timestamp():
            self._update_market_status(now)
        return self._market_am_close

    @property
    def market_fm_open(self):
        now = datetime.now()
        if self._expire_at < now.timestamp():
            self._update_market_status(now)
        return self._market_fm_open

    @property
    def market_fm_close(self):
        now = datetime.now()
        if self._expire_at < now.timestamp():
            self._update_market_status(now)
        return self._market_fm_close

    def _update_market_status(self, now):
        hq = self.hq('sh000001')
        hq_date = hq.loc['sh000001', 'date']
        hq_time = hq.loc['sh000001', 'time']
        hq_datetime = datetime.strptime(hq_date + ' ' + hq_time,
                                        '%Y-%m-%d %H:%M:%S')

        self._market_am_open = hq_datetime.replace(hour=9,
                                                   minute=25,
                                                   second=0,
                                                   microsecond=0)
        self._market_am_close = hq_datetime.replace(hour=11,
                                                    minute=30,
                                                    second=0,
                                                    microsecond=0)
        self._market_fm_open = hq_datetime.replace(hour=13,
                                                   minute=0,
                                                   second=0,
                                                   microsecond=0)
        self._market_fm_close = hq_datetime.replace(hour=15,
                                                    minute=0,
                                                    second=0,
                                                    microsecond=0)

        if hq_datetime.date() < now.date():
            self._status = 'close'
            self._expire_at = (self._market_am_open +
                               timedelta(days=1)).timestamp()
        else:
            if hq_datetime < self._market_am_close:
                self._status = 'trading'
                self._expire_at = self._market_am_close.timestamp()
            elif hq_datetime < self._market_fm_open:
                self._status = 'break'
                self._expire_at = self._market_fm_open.timestamp()
            elif hq_datetime < self._market_fm_close:
                self._status = 'trading'
                self._expire_at = self._market_fm_close.timestamp()
            else:
                self._status = 'close'
                self._expire_at = (self._market_am_open +
                                   timedelta(days=1)).timestamp()

        return

    def hq(self, *symbols):
        '''Quote interface -- uses Sina's realtime quote API by default.
           :param symbols: e.g. ['sz150023', 'sz150022', 'sz159915']
           :return: quote data as a DataFrame
        '''
        _symbols = []
        for s in symbols:
            if isinstance(s, (list, set, tuple)):
                _symbols.extend(s)
            else:
                _symbols.append(s)
        symbols = _symbols

        urls = []
        for i in range(0, len(symbols), _MAX_SINA_HQ_LIST):
            url = 'http://hq.sinajs.cn/?rn=%d&list=' % int(time.time())
            urls.append(url + ','.join(symbols[i:i + _MAX_SINA_HQ_LIST]))

        responses = self._thread_pools.imap(requests.get, urls)
        data = list()
        for r in responses:
            lines = r.text.splitlines()
            for line in lines:
                d = line.split('"')[1].split(',')
                # If a line is malformed, fill the row with NaN
                if len(d) != len(_SINA_STOCK_KEYS):
                    d = [np.nan] * len(_SINA_STOCK_KEYS)
                data.append(d)
        df = pd.DataFrame(data,
                          index=symbols,
                          columns=_SINA_STOCK_KEYS,
                          dtype='float')
        df.index.name = 'symbol'
        df['volume'] = df['volume'] // 100
        if 'volume' in _SINA_STOCK_KEYS and 'lasttrade' in _SINA_STOCK_KEYS and 'yclose' in _SINA_STOCK_KEYS:
            df.loc[df.volume == 0, 'lasttrade'] = df['yclose']
        return df

    def bar(self, symbol, start='', end='', ktype='D', adjtype='forward'):
        '''
        Fetch k-line (OHLC bar) data.
        :param symbol: security code, e.g. sz150023, sz000001, sh000001
        :param start: start date, datetime or '2016-01-01'
        :param end: end date, datetime or '2016-03-31'
        :param ktype: bar frequency, D=daily W=weekly M=monthly 5=5min 15=15min 30=30min 60=60min, default D
        :param adjtype: price adjustment, forward=forward-adjusted afterward=backward-adjusted None=unadjusted, default forward
        :return: DataFrame
                 index : date
                 columns : [symbol, open, high, low, close, volume]
        '''

        # Convert start/end to datetime objects
        if isinstance(start, str):
            if start == '':
                start = datetime.now().replace(month=1,
                                               day=1,
                                               hour=0,
                                               minute=0)
            else:
                start = datetime.strptime(start, '%Y-%m-%d')
        if isinstance(end, str):
            if end == '':
                end = datetime.now().replace(hour=0, minute=0, microsecond=0)
            else:
                end = datetime.strptime(end, '%Y-%m-%d')

        # Decide whether the latest realtime quote data is needed
        if (end.date() >= datetime.today().date()) and (self.market_status !=
                                                        'close'):
            hq = self._thread_pools.apply_async(self.hq, args=([symbol]))
            need_update_hq = True
        else:
            need_update_hq = False

        ktype = _KTYPE[ktype.upper()]

        if adjtype:
            adjtype = _ADJTYPE[adjtype.lower()]
        else:
            adjtype = ''

        data = []
        results = []

        for year in range(start.year - 1, end.year + 1):
            kwargs = {
                'year': year,
                'symbol': symbol,
                'ktype': ktype,
                'adjtype': adjtype
            }
            results.append(
                self._thread_pools.apply_async(self._parser_bar, kwds=kwargs))

        for result in results:
            result = result.get()
            data.extend(result)

        data = self._thread_pools.map(lambda x: x[:6], data)

        df = pd.DataFrame(
            data,
            columns=['date', 'open', 'close', 'high', 'low', 'volume'],
            dtype='float')
        df = df.set_index('date')
        df = df.sort_index()
        if need_update_hq:
            hq = hq.get()
            date = hq.loc[symbol, 'date']
            bar_last_close = df['close'].iloc[-1]  # df.ix is removed in modern pandas
            hq_yclose = hq.loc[symbol, 'yclose']
            if bar_last_close != hq_yclose:
                adj = hq_yclose / bar_last_close
                df[['open', 'high', 'low',
                    'close']] = df[['open', 'high', 'low', 'close']] / adj

            df.loc[date, 'open'] = hq.loc[symbol, 'open']
            df.loc[date, 'close'] = hq.loc[symbol, 'lasttrade']
            df.loc[date, 'high'] = hq.loc[symbol, 'high']
            df.loc[date, 'low'] = hq.loc[symbol, 'low']
            df.loc[date, 'volume'] = hq.loc[symbol, 'volume']
        else:
            df = df.loc[df.index <= end.strftime('%Y-%m-%d')]

        df['symbol'] = symbol
        df['yclose'] = df['close'].shift(1)
        df['chg'] = df['close'].pct_change(1) * 100
        df['chg'] = df['chg'].round(2)
        df = df[[
            'symbol', 'open', 'high', 'low', 'close', 'yclose', 'chg', 'volume'
        ]]

        df = df.loc[df.index >= start.strftime('%Y-%m-%d')]
        return df

    @cache(TTLTimer(hours=9))
    @retry(3)
    def _parser_bar(self, year, symbol, ktype, adjtype):
        url = _BAR_URL_TEMPLATE % (adjtype, year, symbol, ktype, year, year,
                                   adjtype, random())
        r = requests.get(url)
        r.raise_for_status()
        d = r.text
        d = d.split('=')[1]
        d = json.loads(d)['data']

        if '%s%s' % (adjtype, ktype) in d[symbol].keys():
            d = d[symbol]['%s%s' % (adjtype, ktype)]
        else:
            d = d[symbol][ktype]

        return d

    def mbar(self, symbol, ktype='1', adjtype='forward'):
        pass

    def tick(self, symbol, date=None):

        params = {'symbol': symbol, 'date': date}
        r = requests.get(url='http://market.finance.sina.com.cn/downxls.php',
                         params=params)

        tick_xls = BytesIO(r.content)
        tick_val = tick_xls.getvalue()
        if tick_val.find(b'alert') != -1 or len(tick_val) < 20:
            df = pd.DataFrame([],
                              columns=[
                                  'date', 'symbol', 'type', 'price', 'change',
                                  'amount'
                              ])
            df = df.set_index('date')
            return df
        else:
            df = pd.read_table(tick_xls,
                               names=_TICK_COLUMNS,
                               skiprows=[0],
                               encoding='GBK')

        df['date'] = df['time'].apply(lambda x: '%s %s' % (date, x))
        df = df.set_index('date')
        # Sina trade-type labels: 买盘=buy (B), 卖盘=sell (S), 中性盘=neutral (M)
        d = {'买盘': 'B', '卖盘': 'S', '中性盘': 'M'}
        df['type'] = df['type'].apply(lambda x: d[x])
        df['symbol'] = symbol
        df = df.sort_index()

        return df[['symbol', 'type', 'price', 'change', 'amount']]
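
Each market_* property above follows the same refresh-on-expiry pattern: serve the cached value until _expire_at passes, then call _update_market_status again. A minimal, self-contained sketch of that caching pattern (hypothetical names, unrelated to the Sina API):

import time

class ExpiringValue:
    """Cache a computed value until a deadline, then recompute on next access."""

    def __init__(self, compute, ttl_seconds):
        self._compute = compute
        self._ttl = ttl_seconds
        self._value = None
        self._expire_at = 0.0

    @property
    def value(self):
        now = time.time()
        if self._expire_at < now:
            self._value = self._compute()
            self._expire_at = now + self._ttl
        return self._value

# Usage: the lambda runs at most once per 60 seconds
status = ExpiringValue(lambda: 'trading', ttl_seconds=60)
print(status.value)
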
Exemple #24
0
def process_images(db_path, output_dir, image_base_dir, options=None):
    """
    Writes images and html to output_dir to visualize the annotations in the json file
    db_path.
    
    db_path can also be a previously-loaded database.
    
    Returns the html filename and the database:
        
    return htmlOutputFile,image_db
    """

    if options is None:
        options = DbVizOptions()

    print(options.__dict__)

    os.makedirs(os.path.join(output_dir, 'rendered_images'), exist_ok=True)
    assert (os.path.isdir(image_base_dir))

    if isinstance(db_path, str):
        assert (os.path.isfile(db_path))
        print('Loading database from {}...'.format(db_path))
        image_db = json.load(open(db_path))
        print('...done')
    elif isinstance(db_path, dict):
        print('Using previously-loaded DB')
        image_db = db_path
    else:
        raise ValueError('db_path must be a filename (str) or a previously-loaded database (dict)')

    annotations = image_db['annotations']
    images = image_db['images']
    categories = image_db['categories']

    # Optionally remove all images without bounding boxes, *before* sampling
    if options.trim_to_images_with_bboxes:

        bHasBbox = [False] * len(annotations)
        for iAnn, ann in enumerate(annotations):
            if 'bbox' in ann:
                assert isinstance(ann['bbox'], list)
                bHasBbox[iAnn] = True
        annotationsWithBboxes = list(compress(annotations, bHasBbox))

        imageIDsWithBboxes = [x['image_id'] for x in annotationsWithBboxes]
        imageIDsWithBboxes = set(imageIDsWithBboxes)

        bImageHasBbox = [False] * len(images)
        for iImage, image in enumerate(images):
            imageID = image['id']
            if imageID in imageIDsWithBboxes:
                bImageHasBbox[iImage] = True
        imagesWithBboxes = list(compress(images, bImageHasBbox))
        images = imagesWithBboxes

    # Optionally remove images with specific labels, *before* sampling
    if options.classes_to_exclude is not None:

        print('Indexing database')
        indexed_db = IndexedJsonDb(image_db)
        bValidClass = [True] * len(images)
        for iImage, image in enumerate(images):
            classes = indexed_db.get_classes_for_image(image)
            for excludedClass in options.classes_to_exclude:
                if excludedClass in classes:
                    bValidClass[iImage] = False
                    break

        imagesWithValidClasses = list(compress(images, bValidClass))
        images = imagesWithValidClasses

    # Put the annotations in a dataframe so we can select all annotations for a given image
    print('Creating data frames')
    df_anno = pd.DataFrame(annotations)
    df_img = pd.DataFrame(images)

    # Construct label map
    label_map = {}
    for cat in categories:
        label_map[int(cat['id'])] = cat['name']

    # Take a sample of images
    if options.num_to_visualize is not None:
        df_img = df_img.sample(n=options.num_to_visualize,
                               random_state=options.random_seed)

    images_html = []

    # Set of dicts representing inputs to render_db_bounding_boxes:
    #
    # bboxes, boxClasses, image_path
    rendering_info = []

    print('Preparing rendering list')
    # iImage = 0
    for iImage in tqdm(range(len(df_img))):

        img_id = df_img.iloc[iImage]['id']
        img_relative_path = df_img.iloc[iImage]['file_name']
        img_path = os.path.join(
            image_base_dir,
            image_filename_to_path(img_relative_path, image_base_dir))

        annos_i = df_anno.loc[df_anno['image_id'] ==
                              img_id, :]  # all annotations on this image

        bboxes = []
        boxClasses = []

        # All the class labels we've seen for this image (with or without bboxes)
        imageCategories = set()

        annotationLevelForImage = ''

        # Iterate over annotations for this image
        # iAnn = 0; anno = annos_i.iloc[iAnn]
        for iAnn, anno in annos_i.iterrows():

            if 'sequence_level_annotation' in anno:
                bSequenceLevelAnnotation = anno['sequence_level_annotation']
                if bSequenceLevelAnnotation:
                    annLevel = 'sequence'
                else:
                    annLevel = 'image'
                if annotationLevelForImage == '':
                    annotationLevelForImage = annLevel
                elif annotationLevelForImage != annLevel:
                    annotationLevelForImage = 'mixed'

            categoryID = anno['category_id']
            categoryName = label_map[categoryID]
            if options.add_search_links:
                categoryName = categoryName.replace('"', '')
                categoryName = '<a href="https://www.bing.com/images/search?q={}">{}</a>'.format(
                    categoryName, categoryName)
            imageCategories.add(categoryName)

            if 'bbox' in anno:
                bbox = anno['bbox']
                if isinstance(bbox, float):
                    assert math.isnan(
                        bbox
                    ), "I shouldn't see a bbox that's neither a box nor NaN"
                    continue
                bboxes.append(bbox)
                boxClasses.append(anno['category_id'])

        imageClasses = ', '.join(imageCategories)

        file_name = '{}_gtbbox.jpg'.format(img_id.lower().split('.jpg')[0])
        file_name = file_name.replace('/', '~')

        rendering_info.append({
            'bboxes': bboxes,
            'boxClasses': boxClasses,
            'img_path': img_path,
            'output_file_name': file_name
        })

        labelLevelString = ''
        if len(annotationLevelForImage) > 0:
            labelLevelString = ' (annotation level: {})'.format(
                annotationLevelForImage)

        # We're adding html for an image before we render it, so it's possible this image will
        # fail to render.  For applications where this script is being used to debug a database
        # (the common case?), this is useful behavior; for other applications, it's annoying.
        #
        # TODO: optionally write html only for images where rendering succeeded
        images_html.append({
            'filename':
            '{}/{}'.format('rendered_images', file_name),
            'title':
            '{}<br/>{}, number of boxes: {}, class labels: {}{}'.format(
                img_relative_path, img_id, len(bboxes), imageClasses,
                labelLevelString),
            'textStyle':
            'font-family:verdana,arial,calibri;font-size:80%;text-align:left;margin-top:20;margin-bottom:5'
        })

    # ...for each image

    def render_image_info(rendering_info):

        img_path = rendering_info['img_path']
        bboxes = rendering_info['bboxes']
        bboxClasses = rendering_info['boxClasses']
        output_file_name = rendering_info['output_file_name']

        if not os.path.exists(img_path):
            print('Image {} cannot be found'.format(img_path))
            return

        try:
            original_image = vis_utils.open_image(img_path)
            original_size = original_image.size
            image = vis_utils.resize_image(original_image, options.viz_size[0],
                                           options.viz_size[1])
        except Exception as e:
            print('Image {} failed to open. Error: {}'.format(img_path, e))
            return

        vis_utils.render_db_bounding_boxes(boxes=bboxes,
                                           classes=bboxClasses,
                                           image=image,
                                           original_size=original_size,
                                           label_map=label_map)
        image.save(
            os.path.join(output_dir, 'rendered_images', output_file_name))

    # ...def render_image_info

    print('Rendering images')
    start_time = time.time()
    if options.parallelize_rendering:
        if options.parallelize_rendering_n_cores is None:
            pool = ThreadPool()
        else:
            print('Rendering images with {} workers'.format(
                options.parallelize_rendering_n_cores))
            pool = ThreadPool(options.parallelize_rendering_n_cores)
        # Force evaluation of the lazy imap iterator so every image is actually
        # rendered, regardless of which branch created the pool
        list(tqdm(pool.imap(render_image_info, rendering_info),
                  total=len(rendering_info)))
    else:
        for file_info in tqdm(rendering_info):
            render_image_info(file_info)
    elapsed = time.time() - start_time

    print('Rendered {} images in {}'.format(
        len(rendering_info), humanfriendly.format_timespan(elapsed)))

    if options.sort_by_filename:
        images_html = sorted(images_html, key=lambda x: x['filename'])

    htmlOutputFile = os.path.join(output_dir, 'index.html')

    htmlOptions = options.htmlOptions
    if isinstance(db_path, str):
        htmlOptions[
            'headerHtml'] = '<h1>Sample annotations from {}</h1>'.format(
                db_path)
    else:
        htmlOptions['headerHtml'] = '<h1>Sample annotations</h1>'
    write_html_image_list(filename=htmlOutputFile,
                          images=images_html,
                          options=htmlOptions)

    print('Visualized {} images, wrote results to {}'.format(
        len(images_html), htmlOutputFile))

    return htmlOutputFile, image_db
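
A hedged usage sketch for process_images; the paths and option values below are placeholders, and it assumes DbVizOptions exposes the fields referenced in the function above:

options = DbVizOptions()
options.num_to_visualize = 100
options.trim_to_images_with_bboxes = True
options.parallelize_rendering = True
options.parallelize_rendering_n_cores = 8

html_file, image_db = process_images(db_path='/data/dataset.json',
                                     output_dir='/tmp/db_preview',
                                     image_base_dir='/data/images',
                                     options=options)
print('Open {} in a browser to review the sample'.format(html_file))
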
Exemple #25
0
class ThreadPoolStrategy(ConcurrentStrategy, _PoolRunnableStrategy,
                         _Resultable):

    _Thread_Pool: ThreadPool = None
    _Thread_List: List[Union[ApplyResult, AsyncResult]] = None

    def __init__(self, pool_size: int):
        super().__init__(pool_size=pool_size)

    def initialization(self,
                       queue_tasks: Optional[Union[_BaseQueueTask,
                                                   _BaseList]] = None,
                       features: Optional[Union[_BaseFeatureAdapterFactory,
                                                _BaseList]] = None,
                       *args,
                       **kwargs) -> None:
        super(ThreadPoolStrategy, self).initialization(queue_tasks=queue_tasks,
                                                       features=features,
                                                       *args,
                                                       **kwargs)

        # Initialize and build the thread pool.
        __pool_initializer: Callable = kwargs.get("pool_initializer", None)
        __pool_initargs: IterableType = kwargs.get("pool_initargs", None)
        self._Thread_Pool = ThreadPool(processes=self.pool_size,
                                       initializer=__pool_initializer,
                                       initargs=__pool_initargs)

    def apply(self,
              tasks_size: int,
              function: Callable,
              args: Tuple = (),
              kwargs: Dict = {}) -> None:
        self.reset_result()
        __process_running_result = None

        try:
            __process_running_result = [
                self._Thread_Pool.apply(func=function, args=args, kwds=kwargs)
                for _ in range(tasks_size)
            ]
            __exception = None
            __process_run_successful = True
        except Exception as e:
            __exception = e
            __process_run_successful = False

        # Save Running result state and Running result value as dict
        self._result_saving(successful=__process_run_successful,
                            result=__process_running_result,
                            exception=__exception)

    def async_apply(self,
                    tasks_size: int,
                    function: Callable,
                    args: Tuple = (),
                    kwargs: Dict = {},
                    callback: Callable = None,
                    error_callback: Callable = None) -> None:
        self.reset_result()
        self._Thread_List = [
            self._Thread_Pool.apply_async(func=function,
                                          args=args,
                                          kwds=kwargs,
                                          callback=callback,
                                          error_callback=error_callback)
            for _ in range(tasks_size)
        ]

        for process in self._Thread_List:
            _process_running_result = None
            _process_run_successful = None
            _exception = None

            try:
                _process_running_result = process.get()
                _process_run_successful = process.successful()
            except Exception as e:
                _exception = e
                _process_run_successful = False

            # Save Running result state and Running result value as dict
            self._result_saving(successful=_process_run_successful,
                                result=_process_running_result,
                                exception=_exception)

    def apply_with_iter(self,
                        functions_iter: List[Callable],
                        args_iter: List[Tuple] = None,
                        kwargs_iter: List[Dict] = None) -> None:
        self.reset_result()
        __process_running_result = None

        if args_iter is None:
            args_iter = [() for _ in functions_iter]

        if kwargs_iter is None:
            kwargs_iter = [{} for _ in functions_iter]

        try:
            __process_running_result = [
                self._Thread_Pool.apply(func=_func, args=_args, kwds=_kwargs)
                for _func, _args, _kwargs in zip(functions_iter, args_iter,
                                                 kwargs_iter)
            ]
            __exception = None
            __process_run_successful = True
        except Exception as e:
            __exception = e
            __process_run_successful = False

        # Save Running result state and Running result value as dict
        self._result_saving(successful=__process_run_successful,
                            result=__process_running_result,
                            exception=__exception)

    def async_apply_with_iter(
            self,
            functions_iter: List[Callable],
            args_iter: List[Tuple] = None,
            kwargs_iter: List[Dict] = None,
            callback_iter: List[Callable] = None,
            error_callback_iter: List[Callable] = None) -> None:
        self.reset_result()

        if args_iter is None:
            args_iter = [() for _ in functions_iter]

        if kwargs_iter is None:
            kwargs_iter = [{} for _ in functions_iter]

        if callback_iter is None:
            callback_iter = [None for _ in functions_iter]

        if error_callback_iter is None:
            error_callback_iter = [None for _ in functions_iter]

        self._Thread_List = [
            self._Thread_Pool.apply_async(func=_func,
                                          args=_args,
                                          kwds=_kwargs,
                                          callback=_callback,
                                          error_callback=_error_callback)
            for _func, _args, _kwargs, _callback, _error_callback in zip(
                functions_iter, args_iter, kwargs_iter, callback_iter,
                error_callback_iter)
        ]

        for process in self._Thread_List:
            _process_running_result = None
            _process_run_successful = None
            _exception = None

            try:
                _process_running_result = process.get()
                _process_run_successful = process.successful()
            except Exception as e:
                _exception = e
                _process_run_successful = False

            # Save Running result state and Running result value as dict
            self._result_saving(successful=_process_run_successful,
                                result=_process_running_result,
                                exception=_exception)

    def map(self,
            function: Callable,
            args_iter: IterableType = (),
            chunksize: int = None) -> None:
        self.reset_result()
        __process_running_result = None

        try:
            __process_running_result = self._Thread_Pool.map(
                func=function, iterable=args_iter, chunksize=chunksize)
            __exception = None
            __process_run_successful = True
        except Exception as e:
            __exception = e
            __process_run_successful = False

        # Save Running result state and Running result value as dict
        for __result in (__process_running_result or []):
            self._result_saving(successful=__process_run_successful,
                                result=__result,
                                exception=None)

    def async_map(self,
                  function: Callable,
                  args_iter: IterableType = (),
                  chunksize: int = None,
                  callback: Callable = None,
                  error_callback: Callable = None) -> None:
        self.reset_result()
        __map_result = self._Thread_Pool.map_async(
            func=function,
            iterable=args_iter,
            chunksize=chunksize,
            callback=callback,
            error_callback=error_callback)
        __process_running_result = __map_result.get()
        __process_run_successful = __map_result.successful()

        # Save Running result state and Running result value as dict
        for __result in (__process_running_result or []):
            self._result_saving(successful=__process_run_successful,
                                result=__result,
                                exception=None)

    def map_by_args(self,
                    function: Callable,
                    args_iter: IterableType[IterableType] = (),
                    chunksize: int = None) -> None:
        self.reset_result()
        __process_running_result = None

        try:
            __process_running_result = self._Thread_Pool.starmap(
                func=function, iterable=args_iter, chunksize=chunksize)
            __exception = None
            __process_run_successful = True
        except Exception as e:
            __exception = e
            __process_run_successful = False

        # Save Running result state and Running result value as dict
        for __result in (__process_running_result or []):
            self._result_saving(successful=__process_run_successful,
                                result=__result,
                                exception=None)

    def async_map_by_args(self,
                          function: Callable,
                          args_iter: IterableType[IterableType] = (),
                          chunksize: int = None,
                          callback: Callable = None,
                          error_callback: Callable = None) -> None:
        self.reset_result()
        __map_result = self._Thread_Pool.starmap_async(
            func=function,
            iterable=args_iter,
            chunksize=chunksize,
            callback=callback,
            error_callback=error_callback)
        __process_running_result = __map_result.get()
        __process_run_successful = __map_result.successful()

        # Save Running result state and Running result value as dict
        for __result in (__process_running_result or []):
            self._result_saving(successful=__process_run_successful,
                                result=__result,
                                exception=None)

    def imap(self,
             function: Callable,
             args_iter: IterableType = (),
             chunksize: int = 1) -> None:
        self.reset_result()
        __process_running_result = None

        try:
            imap_running_result = self._Thread_Pool.imap(func=function,
                                                         iterable=args_iter,
                                                         chunksize=chunksize)
            __process_running_result = [
                result for result in imap_running_result
            ]
            __exception = None
            __process_run_successful = True
        except Exception as e:
            __exception = e
            __process_run_successful = False

        # Save Running result state and Running result value as dict
        for __result in (__process_running_result or []):
            self._result_saving(successful=__process_run_successful,
                                result=__result,
                                exception=None)

    def imap_unordered(self,
                       function: Callable,
                       args_iter: IterableType = (),
                       chunksize: int = 1) -> None:
        self.reset_result()
        __process_running_result = None

        try:
            imap_running_result = self._Thread_Pool.imap_unordered(
                func=function, iterable=args_iter, chunksize=chunksize)
            __process_running_result = [
                result for result in imap_running_result
            ]
            __exception = None
            __process_run_successful = True
        except Exception as e:
            __exception = e
            __process_run_successful = False

        # Save Running result state and Running result value as dict
        for __result in (__process_running_result or []):
            self._result_saving(successful=__process_run_successful,
                                result=__result,
                                exception=None)

    def _result_saving(self, successful: bool, result: List,
                       exception: Exception) -> None:
        _thread_result = {
            "successful": successful,
            "result": result,
            "exception": exception
        }
        # Saving value into list
        self._Thread_Running_Result.append(_thread_result)

    def close(self) -> None:
        self._Thread_Pool.close()
        self._Thread_Pool.join()

    def terminal(self) -> None:
        self._Thread_Pool.terminate()

    def get_result(self) -> List[_ConcurrentResult]:
        return self.result()

    def _saving_process(self) -> List[_ThreadPoolResult]:
        _pool_results = []
        for __result in self._Thread_Running_Result:
            _pool_result = _ThreadPoolResult()
            _pool_result.is_successful = __result["successful"]
            _pool_result.data = __result["result"]
            _pool_results.append(_pool_result)
        return _pool_results
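
A hedged usage sketch of ThreadPoolStrategy, assuming the inherited base classes handle pool_size bookkeeping and result storage the way the methods above rely on:

def square(x):
    return x * x

strategy = ThreadPoolStrategy(pool_size=4)
strategy.initialization()
strategy.map(function=square, args_iter=range(10))
for result in strategy.get_result():
    print(result.is_successful, result.data)
strategy.close()
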
Exemple #26
0
 def get_schools_public(self, iterable):
     pool = ThreadPool(64)
     list(pool.imap(self.handler_schools_public, iterable))
 def test_iter(self):
     yielder = Yielder()
     pool = ThreadPool(processes=10)
     pool.imap(func=print_sample, iterable=iter(yielder.sample_gen, None), chunksize=1)
     pool.close()  # close() must be called before join(), or join() raises ValueError
     pool.join()
def fast_accuracy(vocab, syn0, questions_file, restrict=100000, logger=logging):
    from multiprocessing.pool import ThreadPool
    pool = ThreadPool()

    ok_vocab = nlargest(restrict, vocab.iteritems(),
                        key=lambda (_, item): item.count)
    ok_vocab.sort(key=lambda (_, item): item.index)

    ok_proj_vocab = dict((word, proj_idx)
                         for proj_idx, (word, _) in enumerate(ok_vocab))
    ok_syn0 = syn0[[item.index for _, item in ok_vocab]]

    # normalize
    for i in xrange(ok_syn0.shape[0]):
        ok_syn0[i] /= np.sqrt(np.sum(ok_syn0[i]**2))

    questions = []

    with open(questions_file) as fin:
        cur_section = None
        for line_no, line in enumerate(fin):
            if line.startswith(': '):
                cur_section = line.lstrip(': ').strip()
            else:
                if cur_section is None:
                    raise ValueError('Missing section header')

                try:
                    # TODO assumes vocabulary preprocessing uses lowercase, too...
                    wa, wb, wc, wexpected = [word.lower() for word in line.split()]
                except:
                    logger.info("skipping invalid line #%i in %s" % (line_no, questions))
                    continue

                try:
                    a = ok_proj_vocab[wa]
                    b = ok_proj_vocab[wb]
                    c = ok_proj_vocab[wc]
                    expected = ok_proj_vocab[wexpected]
                except KeyError:
                    logger.debug("skipping line #%i with OOV words: %s" % (line_no, line))
                    continue

                questions.append((cur_section, a, b, c, expected))

    def check(question):
        section, a, b, c, expected = question
        ignore = set([a, b, c])

        mean = np.zeros_like(syn0[0])
        for weight, idx in [(-1, a), (1, b), (1, c)]:
            mean += weight * ok_syn0[idx]
        mean /= np.sqrt(np.sum(mean**2))

        dists = np.dot(ok_syn0, mean)
        correct = False
        for proj_idx in np.argsort(dists)[::-1]:
            if proj_idx not in ignore:
                if proj_idx == expected:
                    correct = True
                break

        return section, correct

    def log_section((section, correct, all_qs)):
        logger.info("%s: %.1f%% (%i/%i)",
                    section, 100. * correct / all_qs,
                    correct, all_qs)

    summary = []
    for section, answers in groupby(pool.imap(check, questions),
                                    key=itemgetter(0)):
        answers = list(answers)
        correct = sum(answer for _, answer in answers)
        all_qs = len(answers)
        summary.append((section, correct, all_qs))
        log_section(summary[-1])

    total_correct = sum(t[1] for t in summary)
    total_all_qs = sum(t[2] for t in summary)
    summary.append(('total', total_correct, total_all_qs))
    log_section(summary[-1])

    return summary
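
The check function above implements the standard word-analogy test: the answer vector is -a + b + c, normalized, and the nearest non-query word should be the expected one. A toy, self-contained illustration with made-up 2-D vectors (not real embeddings):

import numpy as np

# Toy 2-D unit-ish vectors standing in for word embeddings (made-up data)
vectors = {
    'king':  np.array([0.9, 0.1]),
    'man':   np.array([1.0, 0.0]),
    'woman': np.array([0.8, 0.6]),
    'queen': np.array([0.7, 0.7]),
}
words = list(vectors)
mat = np.array([v / np.linalg.norm(v) for v in vectors.values()])

# man : woman :: king : ?   ->   mean = -man + woman + king
a, b, c = words.index('man'), words.index('woman'), words.index('king')
mean = -mat[a] + mat[b] + mat[c]
mean /= np.linalg.norm(mean)

dists = mat.dot(mean)
for idx in np.argsort(dists)[::-1]:
    if idx not in (a, b, c):
        print('predicted:', words[idx])  # prints 'queen' for these toy vectors
        break
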
Exemple #29
0
 def get_all_links_private(self):
     pool = ThreadPool(64)
     list(pool.imap(self.handler_all_links_public, self.__private))
Exemple #30
0
 def get_data_private(self):
     pool = ThreadPool(64)
     list(pool.imap(self.handler_data_private,
                    self.__schools_links_private))
Exemple #31
0
class Pipeline(object):
    """
    A pipeline of multiple processors to process S3 objects.
    """

    def __init__(self, access_key=None, secret_key=None, dry_run=False, threads=None):
        self._pipeline = []
        self.access_key = access_key
        self.secret_key = secret_key
        self.dry_run = dry_run
        self.threads = threads
        self.pool = ThreadPool(threads)
        self.thread_local_buckets = threading.local()

    def append(self, analyser, pattern, ignore_case=True):
        if ignore_case:
            pattern = re.compile(pattern, flags=re.IGNORECASE)
        else:
            pattern = re.compile(pattern)
        self._pipeline.append((pattern, analyser))

    def analyse(self, pattern, ignore_case=True):
        def decorator(func):
            self.append(DecoratorAnalyser(func.__name__, func), pattern, ignore_case)
            return func

        return decorator

    def connect_s3(self):
        if self.access_key is not None and self.secret_key is not None:
            return boto.connect_s3(aws_access_key_id=self.access_key,
                                   aws_secret_access_key=self.secret_key)
        else:
            return boto.connect_s3()

    def get_bucket(self, name):
        if getattr(self.thread_local_buckets, name, None) is None:
            logging.debug('Create new connection to S3 from thread %s', threading.current_thread())
            conn = self.connect_s3()
            bucket = conn.get_bucket(name)
            setattr(self.thread_local_buckets, name, bucket)
        return getattr(self.thread_local_buckets, name)

    def run(self, bucket, prefix='', show_progress=True):
        self.pre_run()

        bucket = self.get_bucket(bucket)
        keys = bucket.list(prefix)
        chunk_size = self.threads if self.threads is not None else cpu_count()
        it = self.pool.imap(self.analyse_key, keys, chunksize=chunk_size)
        if show_progress:
            list(progress.dots(it, label='Analysing bucket "%s"' % bucket.name))
        else:
            list(it)

        self.post_run()

    def pre_run(self):
        for _, analyser in self._pipeline:
            analyser.start()

    def post_run(self):
        for _, analyser in self._pipeline:
            analyser.finish()

    def analyse_key(self, key):
        bucket = self.get_bucket(key.bucket.name)
        for pattern, analyser in self._pipeline:
            if pattern.match(key.key):
                # refresh key metadata, since the previous analyser may have modified it
                key = bucket.get_key(key.key)
                analyser.analyse(key, dry_run=self.dry_run)
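
A hedged usage sketch of the Pipeline above. The bucket name is a placeholder, boto must be able to find AWS credentials, and it assumes DecoratorAnalyser forwards each matching key (and the dry_run flag) to the decorated function:

pipeline = Pipeline(dry_run=True, threads=8)

@pipeline.analyse(r'.*\.log$')
def report_large_logs(key, dry_run=True):
    # Called for every key whose name matches the pattern
    if key.size > 10 * 1024 * 1024:
        print('Large log file: {} ({} bytes)'.format(key.key, key.size))

pipeline.run('my-example-bucket', prefix='2016/', show_progress=True)
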
Exemple #32
0
class Mininote(object):
    """Provides access to the Evernote 'database'."""

    def __init__(self, token, notebook_guid=None):
        """
        :param str token: The Evernote auth token
        :param str notebook_guid: The Evernote notebook GUID or None if not known
        """
        client = EvernoteClient(token=token,
                                consumer_key=EVERNOTE_CONSUMER_KEY,
                                consumer_secret=EVERNOTE_CONSUMER_SECRET,
                                sandbox=DEVELOPMENT_MODE)
        self._token = token
        self._note_store_uri = client.get_user_store().getNoteStoreUrl()
        self._thread_pool = ThreadPool(processes=EVERNOTE_FETCH_THREADS)

        self.notebook_guid = notebook_guid or self._get_create_notebook()

    def _note_store(self):
        """
        :returns: new NoteStore instance
        """
        return Store(self._token, NoteStore.Client, self._note_store_uri)

    def add_note(self, note):
        """
        :param Note note: The mininote Note instance
        """
        logger.debug('add note: {}'.format(note.text))
        self._note_store().createNote(convert_to_enote(note, self.notebook_guid))

    def search(self, string):
        """
        :param str string: The Evernote search query string
        :returns: An iterator to retrieve notes
        """
        def get_page(start):
            result_spec = NotesMetadataResultSpec(includeTitle=True,
                                                  includeCreated=True,
                                                  includeUpdated=True,
                                                  includeContentLength=True)
            return self._note_store().findNotesMetadata(note_filter,
                                                        start,
                                                        EVERNOTE_MAX_PAGE_SIZE,
                                                        result_spec)

        def iter_note_metadata(note_filter):
            i = 0
            while True:
                time0 = time.time()
                page = get_page(i)
                logger.debug('Page fetch time: {}'.format(time.time() - time0))
                for note_metadata in page.notes:
                    yield note_metadata
                i += len(page.notes)
                if i >= page.totalNotes:
                    break

        def fetch_note(note_metadata):
            if note_metadata.contentLength > CONTENT_FETCH_THRESHOLD:
                note = self._note_store().getNote(note_metadata.guid, True, False, False, False)
            else:
                note = None
            return convert_to_mininote(note_metadata, note)

        note_filter = NoteFilter(words=string,
                                 ascending=True,
                                 order=NoteSortOrder.UPDATED,
                                 notebookGuid=self.notebook_guid)
        return self._thread_pool.imap(fetch_note, iter_note_metadata(note_filter))

    def update_note(self, note):
        """
        :param Note note: The mininote Note instance
        """
        logger.debug('update_note: {}'.format(note))
        self._note_store().updateNote(convert_to_enote(note, self.notebook_guid))

    def delete_note(self, note):
        """
        :param Note note: The mininote Note instance
        """
        logger.debug('delete note: {}'.format(note))
        self._note_store().deleteNote(note.guid)

    def _get_create_notebook(self):
        """
        Get or create the Evernote notebook.

        :returns: Notebook guid
        """
        for notebook in self._note_store().listNotebooks():
            if notebook.name == EVERNOTE_NOTEBOOK:
                return notebook.guid
        return self._note_store() \
                   .createNotebook(Notebook(name=EVERNOTE_NOTEBOOK)) \
                   .guid
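
A hedged usage sketch of Mininote.search, which pages through note metadata and fetches note contents in parallel via the thread pool (the token below is a placeholder for a real Evernote developer token):

mn = Mininote(token='YOUR-EVERNOTE-DEV-TOKEN')
for note in mn.search('todo'):
    print(note)
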
Exemple #33
0
def parallel(func, source, chunksize=0, numcpus=multiprocessing.cpu_count()):
    if chunksize:
        source = chunk(source, chunksize)
    p = ThreadPool(numcpus)
    for i in p.imap(func, source):
        yield i
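
A small usage example for parallel above; leaving chunksize at 0 skips the (unshown) chunk() helper entirely, and ThreadPool is assumed to be imported as in the snippet's module:

import time

def slow_double(x):
    time.sleep(0.1)  # simulate I/O-bound work
    return 2 * x

for value in parallel(slow_double, range(8), numcpus=4):
    print(value)
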
class ParallelFileWriter(object):
    def __init__(self, fileobj, compresslevel=9, n_threads=1):
        self.fileobj = fileobj
        self.compresslevel = compresslevel
        self.n_threads = n_threads

        # Initialize file state
        self.size = 0
        self._init_state()
        self._write_header()

        # Parallel initialization
        self.buffers = []
        self.buffer_length = 0

        self.pool = ThreadPool(n_threads)
        self.compress_queue = Queue(maxsize=n_threads)

        self._consumer_thread = threading.Thread(target=self._consumer)
        self._consumer_thread.daemon = True
        self._consumer_thread.start()

    def tell(self):
        return self.size

    def write(self, data):
        if not isinstance(data, bytes):
            data = memoryview(data)
        n = len(data)
        if n > 0:
            self._per_buffer_op(data)
            self.size += n
            self.buffer_length += n
            self.buffers.append(data)
            if self.buffer_length > self._block_size:
                self.compress_queue.put(self.buffers)
                self.buffers = []
                self.buffer_length = 0
        return n

    def _consumer(self):
        with closing(self.pool):
            for buffers in self.pool.imap(self._compress,
                                          iter(self.compress_queue.get, None)):
                for buf in buffers:
                    if len(buf):
                        self.fileobj.write(buf)

    def _compress(self, in_bufs):
        out_bufs = []
        compressor = self._new_compressor()
        for data in in_bufs:
            out_bufs.append(compressor.compress(data))
        out_bufs.append(self._flush_compressor(compressor))
        return out_bufs

    def close(self):
        if self.fileobj is None:
            return

        # Flush any waiting buffers
        if self.buffers:
            self.compress_queue.put(self.buffers)

        # Wait for all work to finish
        self.compress_queue.put(None)
        self._consumer_thread.join()

        # Write the closing bytes
        self._write_footer()

        # Flush fileobj
        self.fileobj.flush()

        # Cache shutdown state
        self.compress_queue = None
        self.pool = None
        self.fileobj = None
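
ParallelFileWriter's consumer thread relies on a sentinel-terminated queue: iter(queue.get, None) yields buffered chunks until None arrives, and ThreadPool.imap keeps the work parallel while preserving output order. A minimal, self-contained sketch of that shutdown pattern, with a trivial transform standing in for compression:

from multiprocessing.pool import ThreadPool
from queue import Queue
import threading

work_queue = Queue(maxsize=4)
results = []

def transform(item):
    return item * item  # stand-in for the real compression step

def consumer():
    with ThreadPool(2) as pool:
        # iter(work_queue.get, None) stops once the None sentinel is dequeued
        for value in pool.imap(transform, iter(work_queue.get, None)):
            results.append(value)

t = threading.Thread(target=consumer)
t.start()
for i in range(10):
    work_queue.put(i)
work_queue.put(None)   # sentinel: tells the consumer no more work is coming
t.join()
print(results)         # squares of 0..9, in submission order
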
class SiteCheckProcessManager(Thread, SiteCheckerController):
    MEM_MINIMUM_REQ = 100

    def __init__(self, job_name: str="", input_Q:multiprocessing.Queue=None, max_procss=4, concurrent_page=1,
                 page_max_level=10, max_page_per_site=1000, output_delegate=None,
                 memory_limit_per_process=100, **kwargs):
        """

        :param job_name:
        :param input_Q:
        :param max_procss:
        :param concurrent_page:
        :param page_max_level:
        :param max_page_per_site:
        :param output_delegate:
        :param memory_limit_per_process: if the value is less than 100 MB, a ValueError is logged
        :param kwargs:
        :return:
        """
        Thread.__init__(self)
        #FeedbackInterface.__init__(**kwargs)
        #super(SiteCheckProcessManager, self).__init__(**kwargs)
        #self.process_queue = multiprocessing.Queue()
        self.name = job_name
        if max_procss <= 0:
            max_procss = 1
        self.max_prcess = max_procss
        if input_Q is None:
            self.inputQueue = multiprocessing.Queue()
        else:
            self.inputQueue = input_Q
        self.outputQueue = multiprocessing.Queue()
        self._whoisQueue = multiprocessing.Queue()
        #self.output_lock = threading.RLock()
        #self.tempList = site_list # if there is a need to add new sites during scripting, add to this list
        self.processPrfix = "Process-"
        self.threadPrfix = "Thread-"
        self.page_max_level = page_max_level
        self.max_page_per_site = max_page_per_site

        if output_delegate is None:
            self.output_delegate = self.default_delegate
        else:
            self.output_delegate = output_delegate # delegate of type f(x:OnSiteLink)
        self.stop_event = multiprocessing.Event()
        self.finished = False
        self.pool = ThreadPool(processes=self.max_prcess)
        #self.pool = multiprocessing.Pool(processes=self.max_prcess)
        self.output_thread = None
        self.job_all = 0
        self.job_done = 0
        self.job_waiting = 0
        self.total_page_done = 0
        self.page_per_sec = 0  # need to do this
        self.average_page_per_site = 0
        self.patch_limit = self.max_prcess
        self.temp_results = []
        self.site_info = []  # collect site info after the job done
        self.db_trash_list = []
        self.concurrent_page = concurrent_page
        self.continue_lock = threading.RLock()
        self.db_trash_lock = threading.RLock()
        self.state_lock = threading.RLock()
        self.temp_result_lock = threading.RLock()
        self.site_info_lock = threading.RLock()
        if memory_limit_per_process < SiteCheckProcessManager.MEM_MINIMUM_REQ:
            ex = ValueError("minimum memory requirement to run the crawler is 100 MB, otherwise too many memory control looping.")
            msg = "error in SiteCheckProcessManager.__init__(), with database: " + job_name
            ErrorLogger.log_error("SiteCheckProcessManager", ex, msg)
        self.memory_limit_per_process = memory_limit_per_process
        self.whois_process = None
        self.whois_queue_process = Process(target=run_queue_server)
        #self.input_iter = SiteInputIter(self.inputQueue, self, self.concurrent_page, self.page_max_level,
        #                                self.max_page_per_site, self.outputQueue, self.process_site_info)
        self.input_iter = SiteInputIter(self.inputQueue, func=site_check_process, external_stop=self.stop_event)

    def _create_all_file_dirs(self):
        try:
            FileHandler.create_file_if_not_exist(get_log_dir())
            FileHandler.create_file_if_not_exist(get_recovery_dir_path())
            FileHandler.create_file_if_not_exist(get_temp_db_dir())
            FileHandler.create_file_if_not_exist(get_task_backup_dir())
            FileHandler.create_file_if_not_exist(get_db_buffer_default_dir())
        except Exception as ex:
            ErrorLogger.log_error("SiteCheckProcessManager", ex, "_create_all_file_dirs()")

    def clear_cache(self):
        try:
            FileHandler.clear_dir(get_log_dir())
            FileHandler.clear_dir(get_recovery_dir_path())
            FileHandler.clear_dir(get_temp_db_dir())
            FileHandler.clear_dir(get_task_backup_dir())
            FileHandler.clear_dir(get_db_buffer_default_dir())
        except Exception as ex:
            ErrorLogger.log_error("SiteCheckProcessManager", ex, "clear_cache()")

    def set_system_limit(self):
        try:
            # Raise this process's open-file limit directly. The previous
            # os.system('sudo -s') / os.system('ulimit -n 204800') calls each ran in
            # a separate child shell and never affected the current process.
            import resource
            soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
            target = 204800 if hard == resource.RLIM_INFINITY else min(204800, hard)
            resource.setrlimit(resource.RLIMIT_NOFILE, (target, hard))
        except Exception as ex:
            print(ex)

    def get_temp_result_count(self):
        #with self.temp_result_lock:
        return len(self.temp_results)

    def get_temp_result_and_clear(self) -> list:
        with self.temp_result_lock:
            copied = self.temp_results.copy()
            self.temp_results.clear()
        return copied

    def default_delegate(self, result):
        with self.temp_result_lock:
            if isinstance(result, OnSiteLink):
                self.temp_results.append(result)  # already an OnSiteLink; store as-is
                #CsvLogger.log_to_file("ExternalSiteTemp", [(result.link, result.response_code), ])
            elif isinstance(result, str):
                self.temp_results.append(result)
            elif isinstance(result, tuple) and len(result) == 2:
                temp = OnSiteLink(result[0], result[1])
                print("new domain:", temp)
                self.temp_results.append(temp)
            else:
                pass

    def get_state(self) -> SiteCheckProcessState:
        print("get state from slave crawler")
        with self.state_lock:
            state = SiteCheckProcessState(self.job_all, self.job_done, self.job_waiting, self.total_page_done,
                                          self.average_page_per_site, self.get_temp_result_count())
        print("get state from slave crawler finished")
        return state

    def get_filter_progress(self):
        if isinstance(self.whois_process, MemoryControlPs):
            state = self.whois_process.get_last_state()
            if isinstance(state, WhoisCheckerState):
                return state.progress_count, state.data_total
            else:
                return 0, 0
        else:
            return 0, 0

    def clear_trash(self):  # run with a thread
        while not self.stop_event.is_set():
            with self.db_trash_lock:
                removed_list = []
                trash_len = len(self.db_trash_list)
                if trash_len > 0:
                    for item in self.db_trash_list:
                        if TempDBInterface.force_clear(item):
                            #print("removed trash:", item)
                            removed_list.append(item)
                    for removed_item in removed_list:
                        self.db_trash_list.remove(removed_item)
                    CsvLogger.log_to_file("job_finished", [(x, str(datetime.datetime.now())) for x in removed_list], get_task_backup_dir())
                    removed_list.clear()
            time.sleep(2)

    def put_to_input_queue(self, data: list):
        if data is not None:
            for item in data:
                self.inputQueue.put(item)
                self.job_all += 1

    def get_site_info_list_and_clear(self):
        with self.site_info_lock:
            copied = self.site_info.copy()
            self.site_info.clear()
        return copied

    def get_site_info_list_count(self):
        return len(self.site_info)

    def process_site_info(self, site_info):
        if site_info is not None:
            with self.site_info_lock:
                PrintLogger.print("finished site info: " + str(site_info.__dict__))
                self.site_info.append(site_info)

    def process_feedback(self, feedback: SiteFeedback):
        self.add_page_done(feedback.page_done)
        if feedback.finished:
            # print("should process feedback!")
            self.site_finished()
            self.process_site_info(feedback.seed_feedback)
            with self.db_trash_lock:
                self.db_trash_list.append(feedback.datasource_ref)
                self.db_trash_list.append(feedback.datasource_ref+".ext.db")

    def add_page_done(self, number_page_done: int):  # make sure it is thread safe
        with self.state_lock:
            self.total_page_done += number_page_done
        time.sleep(0.001)

    def site_finished(self):
        # print("one more site done")
        with self.state_lock:
            self.job_done += 1
            self.average_page_per_site = self.total_page_done/self.job_done
        time.sleep(0.001)

    def set_stop(self):
        self.stop_event.set()

    def can_continue(self):
        return not self.stop_event.is_set()

    def checking_whois(self):
        # size the whois worker pool from the crawler's concurrency, with a floor of 10 workers
        optimal = self.max_prcess * self.concurrent_page / 5
        if optimal < 10:
            worker_number = 10
        else:
            worker_number = int(optimal)
        mem_limit = self.memory_limit_per_process/2
        if mem_limit < 200:
            mem_limit = 200
        self.whois_process = MemoryControlPs(whois_process,
                                      func_kwargs=WhoisChecker.get_input_parameters(self._whoisQueue, self.outputQueue,
                                                                                    self.stop_event, worker_number),
                                      mem_limit=mem_limit, external_stop_event=self.stop_event)
        self.whois_process.start()

    def queue_failure_reset(self):
        manager, self.outputQueue = get_queue_client(QueueManager.MachineSettingCrawler, QueueManager.Method_Whois_Output)
        return self.outputQueue

    def run(self):
        # self.set_system_limit()
        self._create_all_file_dirs()
        self.whois_queue_process.start()
        whois_thread = Thread(target=self.checking_whois)
        trash_clean_thread = Thread(target=self.clear_trash)
        manager, self.outputQueue = get_queue_client(QueueManager.MachineSettingCrawler, QueueManager.Method_Whois_Output)
        # self.output_thread = outputThread(0, self.threadPrfix+"Output", self.stop_event, self.outputQueue,
        #                           delegate=self.output_delegate, failsure_reset_queue=self.queue_failure_reset)
        self.output_thread = outputThread(threadID=0, name=self.threadPrfix+"Output", stop_event=self.stop_event,
                                          inputQ=self.outputQueue, delegate=self.output_delegate,
                                          failsure_reset_queue=self.queue_failure_reset)
        self.output_thread.start()
        trash_clean_thread.start()
        whois_thread.start()
        # self.whois_queue_process.start()
        self.input_iter.func_kwarg = SiteThreadChecker.get_input_parameter(full_link="", # this parameter will be updated in self.input_iter
                                                                           max_page=self.max_page_per_site,
                                                                           max_level=self.page_max_level,
                                                                           output_queue=self._whoisQueue,
                                                                           pool_size=self.concurrent_page)
        self.input_iter.callback = self.process_feedback
        self.input_iter.Memlimit = self.memory_limit_per_process
        try:
            #print("monitor process started: pid: ", os.getpid())
            self.pool.imap(site_check_process_iter, self.input_iter, 1)
            #self.pool.imap_unordered(site_check_process_iter, self.input_iter)
            while self.can_continue():
                time.sleep(0.5)
        except Exception as ex:
            msg = "run(), with database: " + self.name
            ErrorLogger.log_error("SiteCheckProcessManager", ex, msg)
        finally:
            print("terminate miner!")
            self.pool.terminate()
            whois_thread.join()
            self.whois_queue_process.terminate()
            self.temp_results.clear()
            self.site_info.clear()
            self.finished = True
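
# --- Illustrative sketch (not part of the original example) ---
# run() above boils down to: a ThreadPool whose imap() call consumes a lazy,
# queue-backed iterator, so work can keep arriving while the pool drains it.
# The names below (check_site, queued_items, demo) are placeholders, not from this codebase.
from multiprocessing.pool import ThreadPool
from queue import Queue

def check_site(url):
    # stand-in for the per-site worker (site_check_process_iter in the example)
    return url, len(url)

def queued_items(q):
    # stand-in for SiteInputIter: yield items as they arrive, stop on a None sentinel
    while True:
        item = q.get()
        if item is None:
            return
        yield item

def demo():
    work = Queue()
    for u in ("example.com", "example.org"):
        work.put(u)
    work.put(None)
    pool = ThreadPool(processes=4)
    try:
        for result in pool.imap(check_site, queued_items(work), 1):
            print(result)
    finally:
        pool.terminate()
        pool.join()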
Exemple #36
0
def download_sequence(output_folder, mpy_token, sequence, username, c, nb_sequences):
    global _DOWNLOAD_SEQUENCE_SIZE
    global _DOWNLOAD_TOTAL_SIZE

    subfolder_enabled = SUBFOLDER
    
    if DEBUG >= 3:
        print(" Prepare sequence download")
       
    if DEBUG >= 4:
        pprint(sequence)
 
    sequence_name = (
        sequence["properties"]["captured_at"]
        + "_"
        + sequence["properties"]["created_at"]
    )
    if os.name == "nt":
        sequence_name = sequence_name.replace(":", "_")
    sequence_day = sequence_name.split("T")[0]
    sorted_folder = output_folder + "/" + sequence_day

    if subfolder_enabled == 1:
        subfolder = sequence["properties"]["captured_at"]
        if os.name == "nt":
            subfolder = subfolder.replace(":", "_")
        sorted_folder = sorted_folder + "/" + subfolder

    download_list = []
    os.makedirs(sorted_folder, exist_ok=True)

    # First pass over image_keys: work out which images still need downloading
    image_keys = sequence["properties"]["coordinateProperties"]["image_keys"]
    for image_index, image_key in enumerate(image_keys, 1):
        sorted_path = (
            sorted_folder + "/" + sequence_name + "_" + "%04d" % image_index + ".jpg"
        )
        if not os.path.exists(sorted_path):
            download_list.append(image_key)
        elif os.stat(sorted_path).st_size == 0:
            download_list.append(image_key)
    if not download_list:
        if DEBUG >= 2:
            print(" Sequence %r already fully downloaded" % sequence_name)
        return 0, 0

    already_downloaded = len(image_keys) - len(download_list)
    if already_downloaded:
        if DEBUG >= 1:
            print(" Already downloaded: %d/%d" % (already_downloaded, len(image_keys)))

    if DRY_RUN:
        return 1, len(download_list)

    # Download pass: fetch each image whose source URL was returned, retrying until done or out of retries
    sequence_dl_retries = 0
    update_urls = True
    while download_list and sequence_dl_retries < SEQUENCE_DL_MAX_RETRIES:
        if update_urls:
            source_urls = get_source_urls(download_list, mpy_token, username)
            update_urls = False

        sequence_dl_retries += 1

        # show only on a retry
        if sequence_dl_retries > 1 and DEBUG >= 1:
            print("sequence download retries: %s/%s" % (sequence_dl_retries, SEQUENCE_DL_MAX_RETRIES))

        if len(download_list) > len(source_urls):
            print(
                " Missing %d/%d images, will refresh and retry later"
                % (len(download_list) - len(source_urls), len(download_list))
            )
            
            # if no URLs came back at all, wait a little before retrying
            if len(download_list) - len(source_urls) == len(download_list):
                if DEBUG >= 1:
                    print(" Wait a second due long missing source list")
                time.sleep(2)
                
            sequence_dl_retries -= 1
            # refresh list after this pass
            update_urls = True

        pool = ThreadPool(NUM_THREADS)
        pool_args = []
        for image_index, image_key in enumerate(image_keys, 1):
            if image_key in download_list:
                sorted_path = (
                    sorted_folder
                    + "/"
                    + sequence_name
                    + "_"
                    + "%04d" % image_index
                    + ".jpg"
                )
                if image_key in source_urls:
                    source_url = source_urls[image_key]
                    pool_args.append((image_key, sorted_path, source_url))

        if DEBUG >= 3:
            print(" Filling download pool done")
        
        try:
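            # imap yields one result per job, in submission order, as downloads finish;
            # successful keys are pruned from download_list so only failures are retried.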
            for i, image_key in enumerate(pool.imap(download_file, pool_args), 1):
                if image_key:
                    download_list.remove(image_key)
                print(
                    "  Downloading images #%03d out of %03d round: %d" % (i, len(pool_args), sequence_dl_retries),
                    end="\r",
                    flush=True,
                )
        except SSLException as e:
            print(e)
        except DownloadException as e:
            print(e)
        except URLExpireException as e:
            print(e)
            sequence_dl_retries -= 1
            # refresh urls
            update_urls = True
        finally:
            pool.terminate()
            pool.join()
    print(" Done sequence %r (%d/%d) %3.1f MB, camera: %s" % (sequence_name, c, nb_sequences, _DOWNLOAD_SEQUENCE_SIZE/1024/1024, sequence["properties"]["camera_make"]), flush=True)
    _DOWNLOAD_TOTAL_SIZE += _DOWNLOAD_SEQUENCE_SIZE
    _DOWNLOAD_SEQUENCE_SIZE = 0
                    
    return 1, len(source_urls)
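
# --- Illustrative sketch (not part of the original example) ---
# Core shape of the download loop above: imap over (key, path, url) jobs, report
# progress as each result arrives, and always terminate/join the pool afterwards.
# fetch() and download_all() are placeholders; the real worker here is download_file().
from multiprocessing.pool import ThreadPool

def fetch(args):
    key, path, url = args
    # a real worker would stream url into path and return key on success, None on failure
    return key

def download_all(jobs, workers=8):
    pool = ThreadPool(workers)
    remaining = [key for key, _path, _url in jobs]
    try:
        for i, key in enumerate(pool.imap(fetch, jobs), 1):
            if key:
                remaining.remove(key)
            print("  downloaded %d/%d" % (i, len(jobs)), end="\r", flush=True)
    finally:
        pool.terminate()
        pool.join()
    return remaining  # keys still left here would be retried by the caller

# example call:
# download_all([("key1", "out/seq_0001.jpg", "https://example.com/1.jpg")])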