Example #1
def fetch_data_package(url, dir_name):
    rq = requests.get(url)
    if (rq.status_code != 200):
        log.warn("Not authorized %d at %s" % (rq.status_code, url))
        return False
    spec = rq.json()
    # check for update
    dp_filename = os.path.join(dir_name, 'datapackage.json')
    if os.path.isfile(dp_filename):
        with open(dp_filename) as f:
            cached = json.load(f)
            if cached == spec:
                log.debug("No updates")
                return False
    # create a data folder
    data_folder = os.path.join(dir_name, 'data')
    if not os.path.isdir(data_folder):
        os.makedirs(data_folder)
    # download a copy of the datapackage
    download_file(dir_name, url, 'datapackage.json')
    for res in spec['resources']:
        if 'path' in res:
            # paths override urls, for local mirrors
            basepath = "/".join(url.split('/')[:-1]) + '/'
            fn = download_file(data_folder, basepath + res['path'])
        elif 'url' in res:
            # download resource from url
            fn = download_file(data_folder, res['url'])
        else:
            # skip this resource
            log.debug("Skipping: %s" % res)
            continue
        if 'title' in res:
            log.debug('Downloaded: %s - %s' % (res['title'], fn))
    return True
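For orientation, this is the shape of datapackage spec the loop above iterates over; a hedged sketch with made-up paths and URLs (only the 'resources', 'path', 'url' and 'title' keys come from the code):

# Hypothetical spec mirroring the keys read by fetch_data_package above
spec = {
    "resources": [
        {"path": "data/example.csv", "title": "Example CSV"},   # fetched relative to the datapackage URL
        {"url": "https://example.org/data/other.csv"},          # fetched from its own URL
    ]
}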
Example #2
def process_datapackage(pkg_name):
    '''Reads a data package and returns a dict with its metadata. The
    items in the dict are:
        - name
        - title
        - license
        - description
        - sources
        - readme: in HTML, processed with python-markdown from README.md
          (empty if README.md does not exist)
        - datafiles: a list with the contents of the "resources"
          attribute. Each resource also contains the "basename" property,
          which is the resource base filename (without the preceding
          directory)
    '''
    pkg_dir = os.path.join(repo_dir, pkg_name)
    pkg_info = {}
    metadata = json.loads(open(os.path.join(pkg_dir, "datapackage.json")).read())

    # get main attributes
    pkg_info['name'] = pkg_name
    pkg_info['original_name'] = metadata['name']
    pkg_info['title'] = metadata['title']
    pkg_info['license'] = metadata.get('license')
    pkg_info['description'] = metadata['description']
    pkg_info['sources'] = metadata.get('sources')
    # process README
    readme = ""
    readme_path = os.path.join(pkg_dir, "README.md")
    pkg_info['readme_path'] = readme_path
    if not os.path.exists(readme_path):
        log.warn("No README.md file found in the data package.")
    else:
        contents = codecs.open(readme_path, 'r', 'utf-8').read()
        try:
            readme = markdown.markdown(contents, output_format="html5", encoding="UTF-8")
        except UnicodeDecodeError:
            log.critical("README.md has invalid encoding, maybe the datapackage is not UTF-8?")
            raise
    pkg_info['readme'] = readme
    # process resource/datafiles list
    for r in metadata['resources']:
        r['basename'] = os.path.basename(r['path'])
        if r.get('name'):
            title = os.path.basename(r['name'])
        else:
            # no resource name, use capitalised filename
            title = os.path.basename(r['path']).split('.')[0]
            title = title[:1].upper() + title[1:]
        r['title'] = title

    pkg_info['datafiles'] = metadata['resources']

    return pkg_info
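A small standalone sketch of the title fallback used above (the resource dict is made up):

import os

r = {'path': 'data/dados_exemplo.csv'}             # resource without a "name"
title = os.path.basename(r['path']).split('.')[0]
title = title[:1].upper() + title[1:]
# title == 'Dados_exemplo'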
Example #3
def test_output():
    # All of these just need to output without errors.
    from zenlog import log
    log.debug("A quirky message only developers care about")
    log.info("Curious users might want to know this")
    log.warn("Something is wrong and any user should be informed")
    log.warning("Something is wrong and any user should be informed")
    log.error("Serious stuff, this is red for a reason")
    log.critical("OH NO everything is on fire")
    log.c("OH NO everything is on fire")
    log.crit("OH NO everything is on fire")
Example #5
def get_media_from_tag(tag: Tag) -> Media:
    """Create a Media object from one or more <a> tags."""
    media_tags: List[Tag] = tag.select('a')
    images = []
    videos = []
    for media_tag in media_tags:
        url: str = media_tag.get('href')
        if re.search(r'(?:png|jpe?g|gif|resizedimage)$', url):
            images.append(url)
        elif re.search(r'(?:youtube|youtu\.be|ajax%2Fmodvideo)', url):
            videos.append(url)
        else:
            log.warn(f'Unknown media type for url {url}')
    return Media(images, videos)
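A standalone sketch of the URL classification above (the URLs are made up; in the real function they come from the <a> tags):

import re

urls = ['https://example.org/pic.jpeg', 'https://youtu.be/abc123', 'https://example.org/doc.pdf']
images = [u for u in urls if re.search(r'(?:png|jpe?g|gif|resizedimage)$', u)]
videos = [u for u in urls if re.search(r'(?:youtube|youtu\.be|ajax%2Fmodvideo)', u)]
# images == ['https://example.org/pic.jpeg']; videos == ['https://youtu.be/abc123']
# 'doc.pdf' would hit the log.warn branch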
Example #6
    def generate_session_pages(self):
        self.date_data = get_date_dataset()
        self.date_data.reverse()
        if self.fast_run:
            COUNTER = 0
        for leg, sess, num, d, dpub, page_start, page_end in self.date_data:
            dateobj = dateparser.parse(d)
            session = get_session_from_legsessnum(leg, sess, num)
            if not session:
                log.warn(
                    "File for %s-%s-%s is missing from the transcripts dataset!"
                    % (leg, sess, num))
                continue
            target_dir = "%s%d/%02d/%02d" % (self.sessions_path, dateobj.year,
                                             dateobj.month, dateobj.day)
            filename = "%s/index.html" % target_dir
            info = get_session_info(leg, sess, num)
            create_dir(os.path.join(self.output_dir, target_dir))

            if type(session) in (str, unicode):
                # plain-text session
                context = {
                    'session_date': dateobj,
                    'year_number': dateobj.year,
                    'leg': leg,
                    'sess': sess,
                    'num': num,
                    'text': session,
                    'monthnames': MESES,
                    'pdf_url': 'xpto',
                    'page_name': 'sessoes',
                }
                if info:
                    context['session_info'] = info
                self.render_template_into_file('session_plaintext.html',
                                               filename, context)

            elif type(session) in (dict, OrderedDict):
                # use the .json entries as the template context
                session['session_date'] = dateparser.parse(
                    session['session_date'])
                session['monthnames'] = MESES
                session['page_name'] = 'sessoes'
                self.render_template_into_file('session.html', filename,
                                               session)
            if self.fast_run:
                COUNTER += 1
                if COUNTER > self.fast_run_count:
                    break
Example #7
    def _update_all(self):
        default_args = self.get_default_args()
        for manga in self.db.get_all():  # type Manga
            self.log() and log.info('Update %s', manga.url)
            _args = default_args.copy()
            data = json.loads(manga.data)
            data_args = data.get('args', {})
            del data_args['rewrite_exists_archives']
            del data_args['user_agent']
            del data_args['url']

            if not fs.is_dir(
                    fs.path_join(data_args['destination'], data_args['name'])):
                self.log() and log.warn('Destination not exists. Skip')
                continue

            _args.update({  # re-init args
                'url': manga.url,
                **data_args,
            })
            provider = self._get_provider(_args)
            if provider:
                provider = provider()  # type: Provider
                provider.before_provider(_args)
                provider.http.cookies = data.get('cookies')
                provider.http.ua = data.get('browser')
                provider.run(_args)
                provider.after_provider()
                provider.update_db()
                self.global_info.add_info(info)
Example #8
def update_dict_recursively(base_dict, update_dict, copy=True):
    if 0 == len(base_dict):
        return deepcopy(update_dict) if copy else update_dict
    if copy:
        base_dict = deepcopy(base_dict)
    try:
        for k, v in update_dict.items():
            if isinstance(v, Mapping):
                base_dict[k] = update_dict_recursively(base_dict.get(k, {}),
                                                       v,
                                                       copy=False)
            else:
                base_dict[k] = update_dict[k]
    # AttributeError: 'list' object has no attribute 'items' (occurs if update dict is empty (list))
    except AttributeError as ex:
        log.warn(f'Empty dict update added to {base_dict}')
        # raise ex
    return base_dict
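A minimal usage sketch for the helper above (the dicts are illustrative; the deepcopy and Mapping imports mirror what the function body references):

from copy import deepcopy
from collections.abc import Mapping

base = {'a': {'x': 1}, 'b': 2}
update = {'a': {'y': 3}}
merged = update_dict_recursively(base, update)
# merged == {'a': {'x': 1, 'y': 3}, 'b': 2}; base is left untouched because copy=True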
Example #9
def grow_cut(image: np.ndarray, seeds: np.ndarray, max_iter: int = 200, window_size: int = 3
             ) -> Tuple[np.ndarray, np.ndarray, int]:

    if 'uint16' != str(image.dtype):
        raise ValueError(f'"seed_sized_input_image" should be of dtype uint16, but is {image.dtype}')

    cell_changes = np.zeros_like(seeds, dtype=np.uint16)
    image_min_value = np.amin(image)
    image_max_value = np.amax(image)

    if 'int8' != str(seeds.dtype):
        if seeds.dtype in (np.bool, np.uint8):
            seeds = np.copy(seeds)  # Copies seed data
            seeds = seeds.view(np.int8)
            # Ensure that FG object is not on the edge of the image
            bg_label = -1
            seeds[:, (0, -1)] = seeds[(0, -1), :] = bg_label
            print('[SimpleGrowCut] added background labels at image edges')
        else:
            seeds = seeds.astype(np.int8)  # Copies seed data

    strength_map = np.empty_like(seeds, dtype=np.float64)
    strength_map[:] = (0 != seeds).view(np.int8)

    num_computed_iterations, segmentation_volume, *_ = growcut_cython(  # TODO return strength map as well?
        image=image,  # uint16
        labels_arr=seeds,  # int8
        strengths_arr=strength_map,  # float64
        cell_changes=cell_changes,  # uint16  # ref to same data as label_changes_per_cell
        max_iter=np.int32(max_iter),  # int32
        window_size=np.int32(window_size),  # int32
        image_max_distance_value=np.uint16(image_max_value - image_min_value),  # uint16
    )

    if num_computed_iterations == max_iter:
        neutral_label = 0
        log.warn('GrowCut used the given maximum of {} iterations. {} undefined labels left.'.format(
            num_computed_iterations,
            np.count_nonzero(0 == segmentation_volume) if np.any(segmentation_volume == neutral_label) else 'No'
        ))
    elif num_computed_iterations == 0:
        log.warn('GrowCut stopped after zero iterations')

    return segmentation_volume, seeds, num_computed_iterations
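A small sketch of inputs that satisfy the dtype checks above (arrays are purely illustrative; actually running grow_cut additionally requires the project's growcut_cython extension):

import numpy as np

image = (np.random.rand(64, 64) * 1000).astype(np.uint16)     # grow_cut insists on a uint16 image
seeds = np.zeros_like(image, dtype=np.int8)                   # int8 labels: -1 background, 0 neutral, 1 foreground
seeds[0, :] = seeds[-1, :] = seeds[:, 0] = seeds[:, -1] = -1  # background ring at the image border
seeds[30:34, 30:34] = 1                                       # a foreground seed blob
# segmentation, seeds_out, iterations = grow_cut(image, seeds, max_iter=200, window_size=3)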
Example #10
    def verify(self):
        """
        Creates a new model for the user and authenticates it with the
        challenge response method.

        Raises:
            IndexError: If an invalid message is encountered

        """

        # Connect to the socket
        self._connect()

        message = {
            "NewModel": {
                "email": self.email,
                "password": self.password,
                "model_name": self.model_name,
            }
        }

        self._send_message(message)

        while True:
            # Read some data
            data = self._read_message()
            log.debug(f"Received data={data}")

            try:
                variant, data = parse_message(data)

                if variant == "Challenge":
                    self.authenticate_challenge(data)
                elif variant == "AccessToken":
                    self.display_access(data)
                    self.save_access_tokens()
                    break
                else:
                    log.warn(
                        f"Encountered an unexpected message variant={variant}")

            except IndexError:
                log.error(f"Failed to parse a message from data={data}")
Example #11
    def generate_session_pages(self):
        self.date_data = get_date_dataset()
        self.date_data.reverse()
        if self.fast_run:
            COUNTER = 0
        for leg, sess, num, d, dpub, page_start, page_end in self.date_data:
            dateobj = parse_iso_date(d)
            session = get_session_from_legsessnum(leg, sess, num)
            if not session:
                log.warn("File for %s-%s-%s is missing from the transcripts dataset!" % (leg, sess, num))
                continue
            target_dir = "%s%d/%02d/%02d" % (self.sessions_path, dateobj.year, dateobj.month, dateobj.day)
            filename = "%s/index.html" % target_dir
            info = get_session_info(leg, sess, num)
            create_dir(os.path.join(self.output_dir, target_dir))

            if type(session) in (str, unicode):
            # plain-text session
                context = {'date': dateobj,
                           'year_number': dateobj.year,
                           'leg': leg,
                           'sess': sess,
                           'num': num,
                           'text': session,
                           'monthnames': MESES,
                           'pdf_url': 'xpto',
                           'page_name': 'sessoes',
                           }
                if info:
                    context['session_info'] = info
                self.render_template_into_file('session_plaintext.html', filename, context)

            elif type(session) in (dict, OrderedDict):
                # use the .json entries as the template context
                session['date'] = parse_iso_date(session['session_date'])
                session['monthnames'] = MESES
                session['page_name'] = 'sessoes'
                self.render_template_into_file('session.html', filename, session)
            if self.fast_run:
                COUNTER += 1
                if COUNTER > self.fast_run_count:
                    break
Example #12
    def _run_normal(self, _args, urls):
        for url in urls:
            _args['url'] = url
            provider = self._get_provider(_args)  # type: Provider
            if provider:
                provider.before_provider(_args)
                provider.run(_args)
                provider.after_provider()
                provider.update_db()
                self.global_info.add_info(info)
            else:
                self.show_log() and log.warn('Provider not exists')
Example #13
def download_file(output_dir, descriptor):
    descriptor_name = descriptor['name']
    output_path = output_dir / descriptor_name

    if output_path.is_file():
        if descriptor['hash'] == calculate_file_hash(output_path):
            log.info(f'proper file already downloaded ({descriptor_name})')
            return

        log.warn('file exists, but has invalid hash, re-downloading')
        os.remove(output_path)

    req = requests.get(descriptor['url'])
    if 200 != req.status_code:
        raise RuntimeError(
            f'could not download file ({descriptor_name}), try again')

    if descriptor['hash'] != calculate_buffer_hash(req.content):
        raise RuntimeError(
            f'downloaded file ({descriptor_name}) has invalid hash')

    with open(output_path, 'wb') as output_file:
        output_file.write(req.content)
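For reference, a descriptor of the shape this function reads (field names come from the code above; the URL and hash are placeholders, and calculate_file_hash / calculate_buffer_hash are assumed to live in the same module):

from pathlib import Path

descriptor = {
    'name': 'weights.bin',                     # file name inside output_dir
    'url': 'https://example.org/weights.bin',  # download location
    'hash': '<expected hash value>',           # compared against the calculate_*_hash() helpers
}
# download_file(Path('./downloads'), descriptor)   # hypothetical call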
Example #14
def generate(offline=False,
             fetch_only=False,
             output_dir=OUTPUT_DIR,
             theme_dir=os.path.join(THEMES_DIR, 'centraldedados'),
             repo_dir=REPO_DIR,
             config_file=CONFIG_FILE):
    '''Main function that takes care of the whole process.'''
    global env, packages
    # Read the config file
    parser = SafeConfigParser()
    parser.read(config_file)
    # Load the theme and set up Jinja
    theme_name = parser.get('ui', 'theme')
    theme_dir = os.path.join(THEMES_DIR, theme_name)
    template_dir = os.path.join(theme_dir, "templates")
    env = jinja2.Environment(loader=jinja2.FileSystemLoader([template_dir]))

    # Set up the output directory
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    # Set up the dir for storing repositories
    if not os.path.exists(repo_dir):
        log.debug("Directory %s doesn't exist, creating it." % repo_dir)
        os.mkdir(repo_dir)
    # Copy htaccess file
    shutil.copyfile(os.path.join(theme_dir, 'static/htaccess'),
                    os.path.join(output_dir, ".htaccess"))
    # Create static dirs
    # TODO: only update changed files -- right now we regenerate the whole static dir

    # Static CSS files
    css_dir = os.path.join(output_dir, "css")
    if os.path.exists(css_dir):
        shutil.rmtree(css_dir)
    shutil.copytree(os.path.join(theme_dir, "static/css"), css_dir)
    # Static JavaScript files
    js_dir = os.path.join(output_dir, "js")
    if os.path.exists(js_dir):
        shutil.rmtree(js_dir)
    shutil.copytree(os.path.join(theme_dir, "static/js"), js_dir)
    # Theme images
    img_dir = os.path.join(output_dir, "img")
    if os.path.exists(img_dir):
        shutil.rmtree(img_dir)
    shutil.copytree(os.path.join(theme_dir, "static/img"), img_dir)
    # Fonts
    fonts_dir = os.path.join(output_dir, "fonts")
    if os.path.exists(fonts_dir):
        shutil.rmtree(fonts_dir)
    shutil.copytree(os.path.join(theme_dir, "static/fonts"), fonts_dir)

    if not parser.items('repositories'):
        log.critical(
            'No repository data in settings.conf (does it even exist?). Cannot proceed :('
        )
        sys.exit()
    # go through each specified dataset
    for r in parser.items('repositories'):
        name, url = r
        dir_name = os.path.join(repo_dir, name)
        repo = None

        # do we have a local copy?
        if os.path.isdir(dir_name):
            if not os.path.isdir(os.path.join(dir_name, '.git')):
                if url.endswith(".json"):
                    log.info("%s: Data package, refreshing" % name)
                    updated = fetch_data_package(url, dir_name)
                else:
                    log.info('%s: Unsupported repo, skipping update' % name)
                    continue

            elif not offline:
                repo = git.Repo(dir_name)
                origin = repo.remotes.origin
                try:
                    origin.fetch()
                except AssertionError:
                    # usually this fails on the first run, try again
                    origin.fetch()
                except git.exc.GitCommandError:
                    log.critical(
                        "%s: Fetch error, this dataset will be left out." %
                        name)
                    continue
                # see if we have updates
                if not local_and_remote_are_at_same_commit(repo, origin):
                    log.debug(
                        "%s: Repo has new commits, updating local copy." %
                        name)
                    updated = True
                    # connection errors can also happen if fetch succeeds but pull fails
                    try:
                        result = origin.pull()[0]
                    except git.exc.GitCommandError:
                        log.critical(
                            "%s: Pull error, this dataset will be left out." %
                            name)
                        continue
                    if result.flags & result.ERROR:
                        log.error("%s: Pull error, but going ahead." % name)
                        updated = False
                else:
                    log.info("%s: No changes." % name)
                    updated = False
            else:
                log.debug("%s: Offline mode, using cached version." % name)
                # we set updated to True in order to re-generate everything
                # FIXME: See checksum of CSV files to make sure they're new before
                # marking updated as true
                updated = True
                repo = git.Repo(dir_name)
            if fetch_only:
                # if the --fetch-only flag was set, skip to the next dataset
                continue
        else:
            if offline:
                log.warn("%s: No local cache, skipping." % name)
                continue
            else:
                if url.endswith(".git"):
                    # Handle GIT Repository URL
                    log.info("%s: New repo, cloning." % name)
                    try:
                        repo = git.Repo.clone_from(url, dir_name)
                        # For faster checkouts, one file at a time:
                        #repo = git.Repo.clone_from(url, dir_name, n=True, depth=1)
                        #repo.git.checkout("HEAD", "datapackage.json")
                    except git.exc.GitCommandError as inst:
                        log.warn("%s: skipping %s" % (inst, name))
                        continue
                    updated = True

                elif url.endswith(".json"):
                    # Handle Data Package URL
                    log.info("%s: New data package, fetching." % name)
                    updated = fetch_data_package(url, dir_name)
                else:
                    log.warn("Unsupported repository: %s" % url)

        # get datapackage metadata
        try:
            pkg_info = process_datapackage(name, repo_dir, url)
        except ParseException as inst:
            log.warn("%s: skipping %s" % (inst, name))
            continue

        # set last updated time based on last commit, comes in Unix timestamp format so we convert
        import datetime
        if repo is not None:
            d = repo.head.commit.committed_date
        else:
            d = int(time.mktime(time.localtime()))
        last_updated = datetime.datetime.fromtimestamp(
            int(d)).strftime('%Y-%m-%d %H:%M:%S')
        pkg_info['last_updated'] = last_updated
        # add it to the packages list for index page generation after the loop ends
        packages.append(pkg_info)
        # re-generate the dataset HTML pages
        create_dataset_page(pkg_info, output_dir)
        # if repo was updated, copy over CSV/JSON/* and ZIP files to the download dir
        # (we always generate them if offline)
        if updated or offline:
            create_dataset_page(pkg_info, output_dir)
            datafiles = pkg_info['datafiles']
            zipf = zipfile.ZipFile(os.path.join(output_dir, name + '.zip'),
                                   'w')
            for d in datafiles:
                log.info("Copying %s" % d['path'])
                # copy file
                target = os.path.join(output_dir, os.path.basename(d['path']))
                shutil.copyfile(os.path.join(dir_name, d['path']), target)
                # generate JSON version of CSV
                # if target.endswith('.csv'):
                # csv2json(target, target.replace(".csv", ".json"))
                # make zip file
                zipf.write(os.path.join(dir_name, d['path']),
                           d['basename'],
                           compress_type=zipfile.ZIP_DEFLATED)
            if 'readme_path' in pkg_info:
                try:
                    zipf.write(pkg_info['readme_path'], 'README.md')
                except OSError:
                    pass
            zipf.close()

    # HTML index with the list of available packages
    create_index_page(packages, output_dir)
    # Static JSON API of the data packages
    create_api(packages, output_dir, repo_dir)
    # Static pages
    create_static_pages(output_dir)
    # Contact page
    create_contact_page(output_dir, parser.get('credentials', 'contact_email'))

    log.info("All static content is ready inside '%s'." % OUTPUT_DIR)
Example #15
def process_datapackage(pkg_name, repo_dir, repo_url):
    '''Reads a data package and returns a dict with its metadata. The
    items in the dict are:
        - name
        - title
        - license
        - description
        - sources
        - readme: in HTML, processed with python-markdown from README.md
          (empty if README.md does not exist)
        - datafiles: a list with the contents of the "resources"
          attribute. Each resource also contains the "basename" property,
          which is the resource base filename (without the preceding
          directory)
    '''
    pkg_dir = os.path.join(repo_dir, pkg_name)
    pkg_info = {}
    try:
        metadata = json.loads(
            open(os.path.join(pkg_dir, "datapackage.json")).read())
    except IOError:
        raise ParseException("datapackage.json not found")

    # get main attributes
    pkg_info['name'] = pkg_name
    pkg_info['homepage'] = repo_url
    pkg_info['original_name'] = metadata['name']
    pkg_info['title'] = metadata['title']
    pkg_info['license'] = metadata.get('license')
    if pkg_info['license'] and 'title' in pkg_info['license']:
        pkg_info['license'] = pkg_info['license']['title']
    if 'description' not in metadata:
        pkg_info['description'] = ""
    else:
        pkg_info['description'] = metadata['description']
    pkg_info['sources'] = metadata.get('sources') or []
    # process README
    readme = ""
    readme_path = os.path.join(pkg_dir, "README.md")
    if not os.path.exists(readme_path):
        readme_path = os.path.join(pkg_dir, "README.markdown")
    if not os.path.exists(readme_path):
        if len(pkg_info['description']) > 140:
            readme = markdown.markdown(pkg_info['description'],
                                       output_format="html5",
                                       encoding="UTF-8")
            pkg_info['description'] = ""
        else:
            log.warn("No README.md or description found in the data package.")
    else:
        pkg_info['readme_path'] = readme_path
        contents = codecs.open(readme_path, 'r', 'utf-8').read()
        try:
            readme = markdown.markdown(contents,
                                       output_format="html5",
                                       encoding="UTF-8")
        except UnicodeDecodeError:
            raise ParseException(
                "README.md has invalid encoding, maybe the datapackage is not UTF-8?"
            )
    pkg_info['readme'] = readme
    # process resource/datafiles list
    for r in metadata['resources']:
        if not r.get('schema'):
            log.warn("Schema missing in resource, adding blank")
            r['schema'] = {'fields': []}
        if not r.get('path'):
            r['path'] = 'data/%s' % r['url'].split('/')[-1]
        r['basename'] = os.path.basename(r['path'])
        if not r.get('title'):
            if r.get('name'):
                title = os.path.basename(r['name'])
            else:
                # no resource name, use capitalised filename
                title = os.path.basename(r['path']).split('.')[0]
                title = title[:1].upper() + title[1:]
            r['title'] = title

    pkg_info['datafiles'] = metadata['resources']

    return pkg_info
Example #16
def generate(offline, fetch_only):
    '''Main function that takes care of the whole process.'''
    # set up the output directory
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    # set up the dir for storing repositories
    if not os.path.exists(repo_dir):
        log.info("Directory %s doesn't exist, creating it." % repo_dir)
        os.mkdir(repo_dir)
    # create dir for dataset pages
    if not os.path.exists(os.path.join(output_dir, datasets_dir)):
        os.mkdir(os.path.join(output_dir, datasets_dir))
    # create download dir for zip and csv/json/* dataset files
    if not os.path.exists(os.path.join(output_dir, files_dir)):
        os.mkdir(os.path.join(output_dir, files_dir))
    # create static dirs
    # TODO: only update changed files -- right now we regenerate the whole static dir
    css_dir = os.path.join(output_dir, "css")
    js_dir = os.path.join(output_dir, "js")
    img_dir = os.path.join(output_dir, "img")
    fonts_dir = os.path.join(output_dir, "fonts")
    if os.path.exists(css_dir):
        shutil.rmtree(css_dir)
    shutil.copytree("static/css", css_dir)
    if os.path.exists(js_dir):
        shutil.rmtree(js_dir)
    shutil.copytree("static/js", js_dir)
    if os.path.exists(img_dir):
        shutil.rmtree(img_dir)
    shutil.copytree("static/img", img_dir)
    if os.path.exists(fonts_dir):
        shutil.rmtree(fonts_dir)
    shutil.copytree("static/fonts", fonts_dir)

    # read the config file to get the datasets we want to publish
    parser = SafeConfigParser()
    parser.read(config_file)
    packages = []

    if not parser.items('repositories'):
        log.critical('No repository data in settings.conf (does it even exist?). Cannot proceed :(')
        sys.exit()
    # go through each specified dataset
    for r in parser.items('repositories'):
        name, url = r
        dir_name = os.path.join(repo_dir, name)

        # do we have a local copy?
        if os.path.isdir(dir_name):
            if not offline:
                log.info("Checking for changes in repo '%s'..." % name)
                repo = git.Repo(dir_name)
                origin = repo.remotes.origin
                try:
                    origin.fetch()
                except AssertionError:
                    # usually this fails on the first run, try again
                    origin.fetch()
                except git.exc.GitCommandError:
                    log.critical("Fetch error connecting to repository, this dataset will be ignored and not listed in the index!")
                    continue
                # connection errors can also happen if fetch succeeds but pull fails
                try:
                    result = origin.pull()[0]
                except git.exc.GitCommandError:
                    log.critical("Pull error connecting to repository, this dataset will be ignored and not listed in the index!")
                    continue
                # we get specific flags for the results Git gave us
                # and we set the "updated" var in order to signal whether to
                # copy over the new files to the download dir or not
                if result.flags & result.HEAD_UPTODATE:
                    log.info("No new changes in repo '%s'." % name)
                    updated = False
                elif result.flags & result.ERROR:
                    log.error("Error pulling from repo '%s'!" % name)
                    updated = False
                else:
                    # TODO: figure out other git-python flags and return more
                    # informative log output
                    log.info("Repo changed, updating. (returned flags: %d)" % result.flags)
                    updated = True
            else:
                log.info("Offline mode, using cached version of package %s..." % name)
                # we set updated to True in order to re-generate everything
                # FIXME: See checksum of CSV files to make sure they're new before
                # marking updated as true
                updated = True
                repo = git.Repo(dir_name)
            if fetch_only:
                # if the --fetch-only flag was set, skip to the next dataset
                continue
        else:
            if offline:
                log.warn("Package %s specified in settings but no local cache, skipping..." % name)
                continue
            else:
                log.info("We don't have repo '%s', cloning..." % name)
                repo = git.Repo.clone_from(url, dir_name)
                updated = True

        # get datapackage metadata
        pkg_info = process_datapackage(name)
        # set last updated time based on last commit, comes in Unix timestamp format so we convert
        import datetime
        d = repo.head.commit.committed_date
        last_updated = datetime.datetime.fromtimestamp(int(d)).strftime('%Y-%m-%d %H:%M:%S')
        log.debug(last_updated)
        pkg_info['last_updated'] = last_updated
        # add it to the packages list for index page generation after the loop ends
        packages.append(pkg_info)
        # re-generate the dataset HTML pages
        create_dataset_page(pkg_info)
        # if repo was updated, copy over CSV/JSON/* and ZIP files to the download dir
        # (we always generate them if offline)
        if updated or offline:
            create_dataset_page(pkg_info)
            datafiles = pkg_info['datafiles']
            zipf = zipfile.ZipFile(os.path.join(output_dir, files_dir, name + '.zip'), 'w')
            for d in datafiles:
                # copy CSV file
                target = os.path.join(output_dir, files_dir, os.path.basename(d['path']))
                shutil.copyfile(os.path.join(dir_name, d['path']), target)
                # generate JSON version
                csv2json(target, target.replace(".csv", ".json"))
                # make zip file
                zipf.write(os.path.join(dir_name, d['path']), d['basename'], compress_type=zipfile.ZIP_DEFLATED)
            try:
                zipf.write(pkg_info['readme_path'], 'README.md')
            except OSError:
                pass
            zipf.close()

    # generate the HTML index with the list of available packages
    create_index_page(packages)
    # generate the static JSON API of the data packages
    create_api(packages)
Example #17
    def stop(self):
        if self.is_playing:
            log.debug("Killing ffplay PID: {}".format(self.process.pid))
            os.kill(self.process.pid, SIGTERM)
        else:
            log.warn("Player: radio is not playing")
Example #18
def generate(offline=False,
             fetch_only=False,
             output_dir=OUTPUT_DIR,
             theme_dir=os.path.join(THEMES_DIR, 'centraldedados'),
             repo_dir=REPO_DIR,
             config_file=CONFIG_FILE):
    '''Main function that takes care of the whole process.'''
    global env, packages
    # Read the config file
    parser = SafeConfigParser()
    parser.read(config_file)
    # Load the theme and set up Jinja
    theme_name = parser.get('ui', 'theme')
    theme_dir = os.path.join(THEMES_DIR, theme_name)
    template_dir = os.path.join(theme_dir, "templates")
    env = jinja2.Environment(loader=jinja2.FileSystemLoader([template_dir]))

    # Set up the output directory
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    # Set up the dir for storing repositories
    if not os.path.exists(repo_dir):
        log.debug("Directory %s doesn't exist, creating it." % repo_dir)
        os.mkdir(repo_dir)
    # Copy htaccess file
    shutil.copyfile(os.path.join(theme_dir, 'static/htaccess'), os.path.join(output_dir, ".htaccess"))

    # Static CSS files
    css_dir = os.path.join(output_dir, "css")
    if os.path.exists(css_dir):
        shutil.rmtree(css_dir)
    shutil.copytree(os.path.join(theme_dir, "static/css"), css_dir)
    # Static JavaScript files
    js_dir = os.path.join(output_dir, "js")
    if os.path.exists(js_dir):
        shutil.rmtree(js_dir)
    shutil.copytree(os.path.join(theme_dir, "static/js"), js_dir)
    # Theme images
    img_dir = os.path.join(output_dir, "img")
    if os.path.exists(img_dir):
        shutil.rmtree(img_dir)
    shutil.copytree(os.path.join(theme_dir, "static/img"), img_dir)
    # Fonts
    fonts_dir = os.path.join(output_dir, "fonts")
    if os.path.exists(fonts_dir):
        shutil.rmtree(fonts_dir)
    shutil.copytree(os.path.join(theme_dir, "static/fonts"), fonts_dir)

    if not parser.items('repositories'):
        log.critical('No repository data in settings.conf (does it even exist?). Cannot proceed :(')
        sys.exit()
    # go through each specified dataset
    for r in parser.items('repositories'):
        name, url = r
        dir_name = os.path.join(repo_dir, name)
        repo = None

        # do we have a local copy?
        if os.path.isdir(dir_name):
            if not os.path.isdir(os.path.join(dir_name, '.git')):
                if url.endswith(".json"):
                    log.info("%s: Data package, refreshing" % name)
                    updated = fetch_data_package(url, dir_name)
                else:
                    log.info('%s: Unsupported repo, skipping update' % name)
                    continue

            elif not offline:
                repo = git.Repo(dir_name)
                origin = repo.remotes.origin
                try:
                    origin.fetch()
                except AssertionError:
                    # usually this fails on the first run, try again
                    origin.fetch()
                except git.exc.GitCommandError:
                    log.critical("%s: Fetch error, this dataset will be left out." % name)
                    continue
                # see if we have updates
                if not local_and_remote_are_at_same_commit(repo, origin):
                    log.debug("%s: Repo has new commits, updating local copy." % name)
                    updated = True
                    # connection errors can also happen if fetch succeeds but pull fails
                    try:
                        result = origin.pull()[0]
                    except git.exc.GitCommandError:
                        log.critical("%s: Pull error, this dataset will be left out." % name)
                        continue
                    if result.flags & result.ERROR:
                        log.error("%s: Pull error, but going ahead." % name)
                        updated = False
                else:
                    log.info("%s: No changes." % name)
                    updated = False
            else:
                log.debug("%s: Offline mode, using cached version." % name)
                # we set updated to True in order to re-generate everything
                updated = True
                repo = git.Repo(dir_name)
            if fetch_only:
                # if the --fetch-only flag was set, skip to the next dataset
                continue
        else:
            if offline:
                log.warn("%s: No local cache, skipping." % name)
                continue
            else:
                if url.endswith(".git"):
                    # Handle GIT Repository URL
                    log.info("%s: New repo, cloning." % name)
                    try:
                        repo = git.Repo.clone_from(url, dir_name)
                        # For faster checkouts, one file at a time:
                        # repo = git.Repo.clone_from(url, dir_name, n=True, depth=1)
                        # repo.git.checkout("HEAD", "datapackage.json")
                    except git.exc.GitCommandError as inst:
                        log.warn("%s: skipping %s" % (inst, name))
                        continue
                    updated = True

                elif url.endswith(".json"):
                    # Handle Data Package URL
                    log.info("%s: New data package, fetching." % name)
                    updated = fetch_data_package(url, dir_name)
                else:
                    log.warn("Unsupported repository: %s" % url)

        # get datapackage metadata
        try:
            pkg_info = process_datapackage(name, repo_dir, url)
        except ParseException as inst:
            log.warn("%s: skipping %s" % (inst, name))
            continue

        # set last updated time based on last commit, comes in Unix timestamp format so we convert
        import datetime
        if repo is not None:
            d = repo.head.commit.committed_date
        else:
            d = int(time.mktime(time.localtime()))
        last_updated = datetime.datetime.fromtimestamp(int(d)).strftime('%Y-%m-%d %H:%M:%S')
        pkg_info['last_updated'] = last_updated
        # add it to the packages list for index page generation after the loop ends
        packages.append(pkg_info)
        # re-generate the dataset HTML pages
        create_dataset_page(pkg_info, output_dir)
        # if repo was updated, copy over CSV/JSON/* and ZIP files to the download dir
        # (we always generate them if offline)
        if updated or offline:
            create_dataset_page(pkg_info, output_dir)
            datafiles = pkg_info['datafiles']
            zipf = zipfile.ZipFile(os.path.join(output_dir, name + '.zip'), 'w')
            for d in datafiles:
                log.info("Copying %s" % d['path'])
                # copy file
                target = os.path.join(output_dir, os.path.basename(d['path']))
                shutil.copyfile(os.path.join(dir_name, d['path']), target)
                # generate JSON version of CSV
                if target.endswith('.csv'):
                    csv2json(target, target.replace(".csv", ".json"))
                # make zip file
                zipf.write(os.path.join(dir_name, d['path']), d['basename'], compress_type=zipfile.ZIP_DEFLATED)
            if 'readme_path' in pkg_info:
                try:
                    zipf.write(pkg_info['readme_path'], 'README.md')
                except OSError:
                    pass
            zipf.close()

    # HTML index with the list of available packages
    create_index_page(packages, output_dir)
    # Static JSON API of the data packages
    create_api(packages, output_dir, repo_dir)
    # Static pages
    create_static_pages(output_dir)
    # Contact page
    create_contact_page(output_dir, parser.get('credentials', 'contact_email'))

    log.info("All static content is ready inside '%s'." % OUTPUT_DIR)
Example #19
def process_datapackage(pkg_name, repo_dir, repo_url):
    '''Reads a data package and returns a dict with its metadata. The
    items in the dict are:
        - name
        - title
        - license
        - repository
        - version
        - description
        - sources
        - readme: in HTML, processed with python-markdown from README.md
          (empty if README.md does not exist)
        - datafiles: a list with the contents of the "resources"
          attribute. Each resource also contains the "basename" property,
          which is the resource base filename (without the preceding
          directory)
    '''
    pkg_dir = os.path.join(repo_dir, pkg_name)
    pkg_info = {}
    try:
        metadata = json.loads(open(os.path.join(pkg_dir, "datapackage.json")).read())
    except IOError:
        raise ParseException("datapackage.json not found")

    # get main attributes
    pkg_info['name'] = pkg_name
    pkg_info['homepage'] = repo_url
    pkg_info['original_name'] = metadata['name']
    pkg_info['title'] = metadata['title']
    pkg_info['license'] = metadata.get('license')
    pkg_info['version'] = metadata.get('version')
    pkg_info['repository'] = metadata.get('repository')
    pkg_info['homepage'] = metadata.get('homepage')
    if pkg_info['license'] and 'title' in pkg_info['license']:
        pkg_info['license'] = pkg_info['license']['title']
    if 'description' not in metadata:
        pkg_info['description'] = ""
    else:
        pkg_info['description'] = metadata['description']
    pkg_info['sources'] = metadata.get('sources') or []
    # process README
    readme = ""
    readme_path = os.path.join(pkg_dir, "README.md")
    if not os.path.exists(readme_path):
        readme_path = os.path.join(pkg_dir, "README.markdown")
    if not os.path.exists(readme_path):
        if len(pkg_info['description']) > 140:
            readme = markdown.markdown(pkg_info['description'], output_format="html5", encoding="UTF-8")
            pkg_info['description'] = ""
        else:
            log.warn("No README.md or description found in the data package.")
    else:
        pkg_info['readme_path'] = readme_path
        contents = codecs.open(readme_path, 'r', 'utf-8').read()
        try:
            readme = markdown.markdown(contents, output_format="html5", encoding="UTF-8")
        except UnicodeDecodeError:
            raise ParseException("README.md has invalid encoding, maybe the datapackage is not UTF-8?")
    pkg_info['readme'] = readme
    # process resource/datafiles list
    for r in metadata['resources']:
        if not r.get('schema'):
            log.warn("Schema missing in resource, adding blank")
            r['schema'] = {'fields': []}
        if not r.get('path'):
            r['path'] = 'data/%s' % r['url'].split('/')[-1]
        r['basename'] = os.path.basename(r['path'])
        if not r.get('title'):
            if r.get('name'):
                title = os.path.basename(r['name'])
            else:
                # no resource name, use capitalised filename
                title = os.path.basename(r['path']).split('.')[0]
                title = title[:1].upper() + title[1:]
            r['title'] = title

    pkg_info['datafiles'] = metadata['resources']

    return pkg_info
Example #20
def seeds_to_metrics(seeds: Dict[str, Dict[str, Dict[str, list]]],
                     gt_data: Dict[str, tuple],
                     image_ids: list,
                     load_from: str = 'seeds_to_metrics.json'):
    # image_ids == [['feedback_interactive', 'UserName1', '2019-04-10T14:24:44', '2448185', 'data_set', 'DataID1'], ...]

    load_ = Path(__file__).with_name(load_from)
    if load_.exists():
        with load_.open(mode='r') as fp:
            log.info(f'Reading file {load_}')
            metrics_results = json.load(fp)
    else:
        metrics_results = {}
    for prot, v in seeds.items():
        if prot not in metrics_results:
            metrics_results[prot] = {}
        for usr, v_ in v.items():
            if usr not in metrics_results[prot]:
                metrics_results[prot][usr] = {}
            for expe, seeds_list in v_.items():
                if expe not in metrics_results[prot][usr]:
                    image_id = [
                        e[5] for e in image_ids
                        if e[0] == prot and e[1] == usr and e[2] == expe
                    ]
                    if len(image_id) == 0:
                        print(
                            f'[Warning] no ID found for {(prot, usr, expe)} in {image_ids}'
                        )
                        continue
                    image_id = image_id[0]
                    try:
                        img, gt = gt_data[image_id]
                    except KeyError:
                        print(
                            f'[Warning] data ID not found, skipping "{image_id}"'
                        )
                        continue
                    metrics_ = []

                    current_seeds = np.zeros_like(img, dtype=np.int8)
                    current_seeds[(0, -1), :] = -1
                    current_seeds[:, (0, -1)] = -1

                    assert 'uint' in str(
                        img.dtype), f'{image_id}, {img.dtype}, {img.shape}'
                    img = img.astype(np.uint16, copy=False)

                    met = Metrics()
                    met.set_multiple_inputs(
                        (gt, np.zeros_like(current_seeds, dtype=np.int8) - 1))
                    metrics_result = met.get_outcome()
                    metrics_.append(metrics_result)

                    for seeds_ in seeds_list:
                        if len(seeds_['bg']) > 0:
                            prev_w, prev_h = seeds_['bg'][0]
                            for i, (w_, h_) in enumerate(seeds_['bg']):
                                try:
                                    rr, cc = line(prev_h, prev_w, h_, w_)
                                    current_seeds[rr, cc] = -1
                                    prev_w, prev_h = w_, h_
                                except IndexError as ex:
                                    log.warn(ex)
                                    if len(seeds_['bg']) > (i + 1):
                                        prev_w, prev_h = seeds_['bg'][i + 1]
                        if len(seeds_['fg']) > 0:
                            prev_w, prev_h = seeds_['fg'][0]
                            for i, (w_, h_) in enumerate(seeds_['fg']):
                                try:
                                    rr, cc = line(prev_h, prev_w, h_, w_)
                                    current_seeds[rr, cc] = 1
                                    prev_w, prev_h = w_, h_
                                except IndexError as ex:
                                    log.warn(ex)
                                    if len(seeds_['bg']) > (i + 1):
                                        prev_w, prev_h = seeds_['bg'][i + 1]

                        segmentation_mask, *_ = grow_cut(img,
                                                         current_seeds,
                                                         max_iter=2**11,
                                                         window_size=3)
                        met = Metrics()
                        met.set_multiple_inputs((gt, segmentation_mask))
                        metrics_result = met.get_outcome()
                        metrics_.append(metrics_result)

                    metrics_results[prot][usr][expe] = {image_id: metrics_}
    return metrics_results
Example #21
def parse_event_info(event, info):
    # event is a dict, info is a BeautifulSoup object
    if event['type'] == u"Baixa comissão para discussão":
        com_name = info.find('span', id=RE_COMNAME)
        if com_name:
            event['comission_name'] = com_name.text.strip()
    elif event['type'] == u"Publicação":
        url_nodes = info.findAll('a')
        urls = [{'url': node['href'], 'title': node.text.strip('[]')} for node in url_nodes]
        event['references'] = urls
    elif event['type'] in (u"Votação na generalidade", u"Votação Deliberação", u"Votação final global"):
        vote_info = info.find('span', id=RE_VOTEINFO)

        # funky parse loop for understanding how each party voted
        # I really have to refactor this, please pester me if you need it -- rlafuente
        results = {'for': [], 'against': [], 'abstain': []}
        current_vote = None
        for c in vote_info.contents:
            if type(c) == bs4.element.Tag:
                if c.name == "br":
                    continue
                elif c.name == "i":
                    results[current_vote].append(c.text)
                else:
                    log.error("Unrecognized vote tag: %s" % c)
            elif type(c) == bs4.element.NavigableString:
                c = c.strip()
                if c == ",":
                    continue
                if c.startswith(u'Contra:'):
                    current_vote = "against"
                    if not c == u'Contra:':
                        # cases with voters in one line (individual MPs)
                        # ex. "Abstenção: Isabel Oneto (PS)"
                        c = c.replace(u'Contra: ', '').split(', ')
                        for mp in c:
                            if mp:
                                results[current_vote].append(mp.strip(','))
                elif c.startswith(u"A Favor:"):
                    current_vote = "for"
                    if not c == u'A Favor:':
                        c = c.replace(u'A Favor: ', '').split(', ')
                        for mp in c:
                            if mp:
                                results[current_vote].append(mp.strip(','))
                elif c.startswith(u"Abstenção:"):
                    current_vote = "abstain"
                    if not c == u'Abstenção:':
                        c = c.replace(u'Abstenção: ', '').split(', ')
                        for mp in c:
                            if mp:
                                results[current_vote].append(mp.strip(','))
                else:
                    log.warn("Orphan vote string: %s -- saving as voter" % c)
                    c = c.split(', ')
                    for mp in c:
                        if mp:
                            results[current_vote].append(mp)

        event['vote_info'] = results
        pass
    else:
        if info.text.strip():
            event['raw_info'] = info.text.strip()
    return event
Example #22
def predict_log_data(
        input_file_path='./data/cache/correlation_raw_data.json',
        out_dir: Union[Path, str] = './data/results',
        remove_questionnaire_features: bool = True,
        keep_sus_as_feature: bool = False,  # May be used for AttrakDiff-only prediction
        add_features_from_pca: Union[int, float] = 0.1,  # Add N (int or frac) additional PCA features to X_{train|test}
        use_feature_selection: bool = True,
        use_num_features: int = 20,  # Note: only utilized if 'use_feature_selection'
        relative_test_size: float = 0.1,
        random_state: int = 42):

    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    log.info(f'Load data from "./best_features.json"')
    with Path('best_features.json').open('r') as fp:
        # With 128 trees, best 1%
        feature_selection_columns = json.load(fp)
    feature_selection_columns = {
        k: [e.replace('\u03a3', u'Σ') for e in v]
        for k, v in feature_selection_columns.items()
    }

    if use_feature_selection:
        assert use_num_features <= len(feature_selection_columns['PQ'])

    y_labels = ('PQ', 'ATT', 'HQ-I', 'HQ-S', 'HQ', 'SUS')
    questionnaire_labels = [l.lower() for l in y_labels]

    if keep_sus_as_feature:
        questionnaire_labels = [
            l for l in questionnaire_labels if l.upper() != 'SUS'
        ]
        y_labels = [l for l in y_labels if l.upper() != 'SUS']

    attributes, data = get_data(input_file_path)

    pd.set_option('display.max_rows', 500)
    pd.set_option('display.max_columns', 500)
    pd.set_option('display.width', 1000)

    for y_label in y_labels:

        log.info(f'# {y_label.upper()}')

        y_attribute = [y_label.lower() == a.lower() for a in attributes]

        if remove_questionnaire_features:
            x_attributes = [(a.lower() not in questionnaire_labels)
                            for a in attributes]
        else:
            x_attributes = np.invert(y_attribute).tolist()

        X = np.array(data)[:, x_attributes]
        y = np.array(data)[:, y_attribute].ravel()

        if 0 < add_features_from_pca:
            if 1 > add_features_from_pca:
                n_components_pca = int(
                    np.ceil(add_features_from_pca * X.shape[1]))
            else:
                n_components_pca = add_features_from_pca

            if n_components_pca > min(*X.shape):
                log.warn(
                    f'Could not compute PCA with {n_components_pca} components, since input X {X.shape} has '
                    +
                    f'not enough data. Using {min(*X.shape)} components instead.'
                )
                n_components_pca = min(*X.shape)

            X, _ = impute_and_scale(X)
            pca = PCA(n_components=n_components_pca, svd_solver='full')
            pca.fit(X)
            X_pca = pca.transform(X)
            log.debug(f'X_pca has shape {X_pca.shape}')
            X = np.concatenate((X, X_pca), axis=1)

            pca_attributes = [f'PCA_VAL_{i}' for i in range(X_pca.shape[1])]
            attributes_including_pca = [*attributes, *pca_attributes]
            log.info(f'Additional PCA features: {X_pca.shape[1]}')
            x_attributes.extend([True] * X_pca.shape[1])
            del X_pca

        df = pd.DataFrame(
            data=X, columns=np.array(attributes_including_pca)[x_attributes])
        # log.debug(df.head())
        # log.debug(df.describe(include='all'))

        if use_feature_selection:
            # Check if all features (PCA) are present
            feature_selection_columns = {
                k: [v_ for v_ in v if v_ in attributes_including_pca]
                for k, v in feature_selection_columns.items()
            }

            feature_names = feature_selection_columns[
                y_label.upper()][:use_num_features]

            df = df.loc[:, feature_names]
            # log.info(df.head())
            # log.info(df.describe(include='all'))

            log.info(
                f'Features before selection: {np.count_nonzero(x_attributes)}')
            x_attributes = [
                (x_attr and (atrr in feature_names))
                for x_attr, atrr in zip(x_attributes, attributes_including_pca)
            ]
            log.info(
                f'Features after selection: {np.count_nonzero(x_attributes)}')
            log.info(
                str([
                    atrr for x_attr, atrr in zip(x_attributes,
                                                 attributes_including_pca)
                    if (x_attr and (atrr in feature_names))
                ]))
            X = df
            # X = X.as_matrix()

        X, scaler = impute_and_scale(X)

        log.debug(f'Number of train/test splits is {X.shape[0]}')
        kf = KFold(n_splits=X.shape[0],
                   shuffle=False,
                   random_state=random_state)

        log.info(f'Load data from "./best_parameters.json"')
        with Path('best_parameters.json').open(
                'r') as fp:  # With 128 trees, best 1%
            parameters = json.load(fp)[y_label]
        parameters.pop('random_state', '<dummy/>')

        if X.shape[0] < 10 and parameters['min_samples_leaf'] > 1:
            log.warn(
                f'GBRF parameter "min_samples_leaf" is originally set to {parameters["min_samples_leaf"]}, '
                +
                f'however only {X.shape[0]} samples are in the overall input data. '
                +
                'Therefore, "min_samples_leaf" are set to 1 during training.')
            parameters['min_samples_leaf'] = 1

        log.info(f'Best parameters: {parameters}')

        for iteration_num, (train_indices,
                            test_indices) in tqdm(enumerate(kf.split(X=X,
                                                                     y=y))):
            assert test_indices.size == 1
            X_train, X_test, y_train, y_test = X[train_indices], X[
                test_indices], y[train_indices], y[test_indices]

            # Speed improvement
            X_train = np.asfortranarray(X_train, dtype=np.float64)
            y_train = np.ascontiguousarray(y_train, dtype=np.float32)
            X_test = np.asfortranarray(X_test, dtype=np.float64)
            y_test = np.ascontiguousarray(y_test, dtype=np.float32)

            x_feature_labels = feature_names if use_feature_selection else \
                [a for a, l in zip(attributes_including_pca, x_attributes) if l]
            assert len(x_feature_labels) == X_train.shape[1]

            log.info(f'y size: {y.shape} -> y_train size: {y_train.shape}')
            log.info(
                f'y label: {", ".join([a for a, l in zip(attributes_including_pca, y_attribute) if l])}'
            )
            log.info(f'features: {len(x_feature_labels)}')

            # Set seed for reproducibility
            np.random.seed(random_state + iteration_num)
            parameters.update({'random_state': random_state + iteration_num})

            additional_hash = ''
            if use_feature_selection:
                additional_hash = f'-{hash_(feature_selection_columns)}'
            save_file = f'gbrf-label_{y_label.upper()}-PCA_{add_features_from_pca}-' + \
                        f'SUS_{keep_sus_as_feature}-FEATSEL_{use_feature_selection}-' + \
                        f'{test_indices[0]}-{iteration_num}-{hash_(parameters)}-{additional_hash}.pkl'
            save_file = Path(out_dir).joinpath(save_file)
            log.info(f'Current model\'s save file: "{save_file}"')

            if save_file.is_file():
                log.info('Load model from save file')
                try:
                    sf = joblib.load(filename=save_file)
                except AttributeError as ex:
                    log.error(
                        'You probably used another version of sklearn or Python to pickle this.'
                    )
                    raise ex
                model = sf['model']
            else:
                log.info('Fit new model')
                model = GradientBoostingRegressor(**parameters)
                model.fit(X_train, y_train)

                value = {
                    'model': model,
                    'parameters': parameters,
                    'X_train': X_train,
                    'y_train': y_train,
                    'X_test': X_test,
                    'y_test': y_test,
                    'X_feature_labels': x_feature_labels,
                    'y_label': y_label,
                    'additional_parameters': {
                        'relative_test_size': relative_test_size,
                        'remove_quest_features': remove_questionnaire_features,
                        'keep_sus_as_feature': keep_sus_as_feature,
                        'add_features_from_pca': add_features_from_pca,
                        'use_feature_selection': use_feature_selection,
                        'use_num_features': use_num_features,
                        'random_state': random_state + iteration_num,
                        'scaler': scaler,
                    }
                }
                joblib.dump(value=value,
                            filename=save_file,
                            compress=9,
                            protocol=pickle.HIGHEST_PROTOCOL)

            y_pred = model.predict(X_test)

            for score_func in (explained_variance_score, mean_absolute_error,
                               mean_squared_error, median_absolute_error,
                               r2_score):
                log.info(f'{score_func.__name__} {score_func(y_test, y_pred)}')