def fetch_data_package(url, dir_name):
    rq = requests.get(url)
    if rq.status_code != 200:
        log.warn("Request failed with status %d at %s" % (rq.status_code, url))
        return False
    spec = rq.json()
    # check for update
    dp_filename = os.path.join(dir_name, 'datapackage.json')
    if os.path.isfile(dp_filename):
        with open(dp_filename) as f:
            cached = json.load(f)
        if cached == spec:
            log.debug("No updates")
            return False
    # create a data folder
    data_folder = os.path.join(dir_name, 'data')
    if not os.path.isdir(data_folder):
        os.makedirs(data_folder)
    # download a copy of the datapackage
    download_file(dir_name, url, 'datapackage.json')
    for res in spec['resources']:
        if 'path' in res:
            # paths override urls, for local mirrors
            basepath = "/".join(url.split('/')[:-1]) + '/'
            fn = download_file(data_folder, basepath + res['path'])
        elif 'url' in res:
            # download resource from url
            fn = download_file(data_folder, res['url'])
        else:
            # skip this resource
            log.debug("Skipping: %s" % res)
            continue
        if 'title' in res:
            log.debug('Downloaded: %s - %s' % (res['title'], fn))
    return True
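# --- Usage sketch (illustrative, not part of the original module): calls
# fetch_data_package with a hypothetical URL and target directory; assumes the
# requests/os/json imports and the download_file helper used above are in scope.
updated = fetch_data_package('https://example.org/pkg/datapackage.json',
                             'repos/example-pkg')
if updated:
    print('datapackage.json and its resources were (re)downloaded')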
def process_datapackage(pkg_name):
    '''Reads a data package and returns a dict with its metadata. The
    items in the dict are:
        - name
        - title
        - license
        - description
        - sources
        - readme: in HTML, processed with python-markdown from README.md,
          empty if README.md does not exist
        - datafiles: a dict that contains the contents of the "resources"
          attribute. Each resource also contains the "basename" property,
          which is the resource base filename (without preceding directory)
    '''
    pkg_dir = os.path.join(repo_dir, pkg_name)
    pkg_info = {}
    metadata = json.loads(open(os.path.join(pkg_dir, "datapackage.json")).read())

    # get main attributes
    pkg_info['name'] = pkg_name
    pkg_info['original_name'] = metadata['name']
    pkg_info['title'] = metadata['title']
    pkg_info['license'] = metadata.get('license')
    pkg_info['description'] = metadata['description']
    pkg_info['sources'] = metadata.get('sources')

    # process README
    readme = ""
    readme_path = os.path.join(pkg_dir, "README.md")
    pkg_info['readme_path'] = readme_path
    if not os.path.exists(readme_path):
        log.warn("No README.md file found in the data package.")
    else:
        contents = codecs.open(readme_path, 'r', 'utf-8').read()
        try:
            readme = markdown.markdown(contents, output_format="html5", encoding="UTF-8")
        except UnicodeDecodeError:
            log.critical("README.md has invalid encoding, maybe the datapackage is not UTF-8?")
            raise
    pkg_info['readme'] = readme

    # process resource/datafiles list
    for r in metadata['resources']:
        r['basename'] = os.path.basename(r['path'])
        if r.get('name'):
            title = os.path.basename(r['name'])
        else:
            # no resource name, use capitalised filename
            title = os.path.basename(r['path']).split('.')[0]
            title = title[:1].upper() + title[1:]
        r['title'] = title
    pkg_info['datafiles'] = metadata['resources']
    return pkg_info
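# --- A minimal sketch of the datapackage.json layout that process_datapackage
# reads above; field names follow the accesses in the code (metadata['name'],
# ['title'], 'resources' with 'name'/'path'), values are invented for
# illustration.
example_metadata = {
    "name": "example-dataset",
    "title": "Example Dataset",
    "license": "CC0",
    "description": "A tiny example package.",
    "sources": [],
    "resources": [
        {"name": "mytable", "path": "data/mytable.csv"},
    ],
}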
def test_output():
    # All of these just need to output without errors.
    from zenlog import log
    log.debug("A quirky message only developers care about")
    log.info("Curious users might want to know this")
    log.warn("Something is wrong and any user should be informed")
    log.warning("Something is wrong and any user should be informed")
    log.error("Serious stuff, this is red for a reason")
    log.critical("OH NO everything is on fire")
    log.c("OH NO everything is on fire")
    log.crit("OH NO everything is on fire")
def get_media_from_tag(tag: Tag) -> Media:
    """Create a Media object from one or more <a> tags."""
    media_tags: List[Tag] = tag.select('a')
    images = []
    videos = []
    for media_tag in media_tags:
        url: str = media_tag.get('href')
        if re.search(r'(?:png|jpe?g|gif|resizedimage)$', url):
            images.append(url)
        elif re.search(r'(?:youtube|youtu\.be|ajax%2Fmodvideo)', url):
            videos.append(url)
        else:
            log.warn(f'Unknown media type for url {url}')
    return Media(images, videos)
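# --- Usage sketch for get_media_from_tag, assuming the bs4/Media/Tag imports
# used above; the HTML snippet is invented.
from bs4 import BeautifulSoup

html = '<div><a href="photo.jpg">img</a><a href="https://youtu.be/x">vid</a></div>'
container = BeautifulSoup(html, 'html.parser').div
media = get_media_from_tag(container)
# media.images == ['photo.jpg']; media.videos == ['https://youtu.be/x']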
def generate_session_pages(self):
    self.date_data = get_date_dataset()
    self.date_data.reverse()
    if self.fast_run:
        COUNTER = 0
    for leg, sess, num, d, dpub, page_start, page_end in self.date_data:
        dateobj = dateparser.parse(d)
        session = get_session_from_legsessnum(leg, sess, num)
        if not session:
            log.warn("File for %s-%s-%s is missing from the transcripts dataset!" % (leg, sess, num))
            continue
        target_dir = "%s%d/%02d/%02d" % (self.sessions_path, dateobj.year, dateobj.month, dateobj.day)
        filename = "%s/index.html" % target_dir
        info = get_session_info(leg, sess, num)
        create_dir(os.path.join(self.output_dir, target_dir))
        if type(session) in (str, unicode):
            # plain-text session
            context = {
                'session_date': dateobj,
                'year_number': dateobj.year,
                'leg': leg,
                'sess': sess,
                'num': num,
                'text': session,
                'monthnames': MESES,
                'pdf_url': 'xpto',
                'page_name': 'sessoes',
            }
            if info:
                context['session_info'] = info
            self.render_template_into_file('session_plaintext.html', filename, context)
        elif type(session) in (dict, OrderedDict):
            # use the .json entries as the template context
            session['session_date'] = dateparser.parse(session['session_date'])
            session['monthnames'] = MESES
            session['page_name'] = 'sessoes'
            self.render_template_into_file('session.html', filename, session)
        if self.fast_run:
            COUNTER += 1
            if COUNTER > self.fast_run_count:
                break
def _update_all(self):
    default_args = self.get_default_args()
    for manga in self.db.get_all():  # type: Manga
        self.log() and log.info('Update %s', manga.url)
        _args = default_args.copy()
        data = json.loads(manga.data)
        data_args = data.get('args', {})
        del data_args['rewrite_exists_archives']
        del data_args['user_agent']
        del data_args['url']
        if not fs.is_dir(fs.path_join(data_args['destination'], data_args['name'])):
            self.log() and log.warn('Destination does not exist. Skipping')
            continue
        _args.update({
            # re-init args
            'url': manga.url,
            **data_args,
        })
        provider = self._get_provider(_args)
        if provider:
            provider = provider()  # type: Provider
            provider.before_provider(_args)
            provider.http.cookies = data.get('cookies')
            provider.http.ua = data.get('browser')
            provider.run(_args)
            provider.after_provider()
            provider.update_db()
            self.global_info.add_info(info)
def update_dict_recursively(base_dict, update_dict, copy=True):
    if 0 == len(base_dict):
        return deepcopy(update_dict) if copy else update_dict
    if copy:
        base_dict = deepcopy(base_dict)
    try:
        for k, v in update_dict.items():
            if isinstance(v, Mapping):
                base_dict[k] = update_dict_recursively(base_dict.get(k, {}), v, copy=False)
            else:
                base_dict[k] = update_dict[k]
    # AttributeError: 'list' object has no attribute 'items'
    # (occurs if update_dict is not a mapping, e.g. an empty list)
    except AttributeError:
        log.warn(f'Non-mapping update {update_dict!r} ignored for {base_dict}')
        # raise
    return base_dict
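# --- Quick usage sketch for update_dict_recursively; assumes the
# collections.abc.Mapping and copy.deepcopy imports the function relies on.
base = {'a': {'x': 1}, 'b': 2}
update = {'a': {'y': 3}}
merged = update_dict_recursively(base, update)
# merged == {'a': {'x': 1, 'y': 3}, 'b': 2}; base is untouched (copy=True)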
def grow_cut(image: np.ndarray,
             seeds: np.ndarray,
             max_iter: int = 200,
             window_size: int = 3) -> Tuple[np.ndarray, np.ndarray, int]:
    if 'uint16' != str(image.dtype):
        raise ValueError(f'"image" should be of dtype uint16, but is {image.dtype}')
    cell_changes = np.zeros_like(seeds, dtype=np.uint16)
    image_min_value = np.amin(image)
    image_max_value = np.amax(image)
    if 'int8' != str(seeds.dtype):
        if seeds.dtype in (np.bool_, np.uint8):
            seeds = np.copy(seeds)  # copies seed data
            seeds = seeds.view(np.int8)
            # Ensure that the FG object is not on the edge of the image
            bg_label = -1
            seeds[:, (0, -1)] = seeds[(0, -1), :] = bg_label
            print('[SimpleGrowCut] added background labels at image edges')
        else:
            seeds = seeds.astype(np.int8)  # copies seed data
    strength_map = np.empty_like(seeds, dtype=np.float64)
    strength_map[:] = (0 != seeds).view(np.int8)
    num_computed_iterations, segmentation_volume, *_ = growcut_cython(  # TODO return strength map as well?
        image=image,                        # uint16
        labels_arr=seeds,                   # int8
        strengths_arr=strength_map,         # float64
        cell_changes=cell_changes,          # uint16, ref to same data as label_changes_per_cell
        max_iter=np.int32(max_iter),        # int32
        window_size=np.int32(window_size),  # int32
        image_max_distance_value=np.uint16(image_max_value - image_min_value),  # uint16
    )
    if num_computed_iterations == max_iter:
        neutral_label = 0
        log.warn('GrowCut used the given maximum of {} iterations. {} undefined labels left.'.format(
            num_computed_iterations,
            np.count_nonzero(segmentation_volume == neutral_label)
            if np.any(segmentation_volume == neutral_label) else 'No'))
    elif num_computed_iterations == 0:
        log.warn('GrowCut stopped after zero iterations')
    return segmentation_volume, seeds, num_computed_iterations
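# --- Input sketch for grow_cut: a uint16 image and an int8 seed map with
# 1 = foreground, -1 = background, 0 = undecided. Assumes numpy and the
# compiled growcut_cython kernel imported by the module above.
import numpy as np

img = np.random.randint(0, 2**16, size=(64, 64), dtype=np.uint16)
seeds = np.zeros((64, 64), dtype=np.int8)
seeds[30:34, 30:34] = 1   # foreground stroke
seeds[0, :] = -1          # background along one edge
labels, seeds_out, n_iter = grow_cut(img, seeds, max_iter=200, window_size=3)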
def verify(self):
    """
    Creates a new model for the user and authenticates it with the
    challenge-response method.

    Raises:
        IndexError: If an invalid message is encountered
    """
    # Connect to the socket
    self._connect()
    message = {
        "NewModel": {
            "email": self.email,
            "password": self.password,
            "model_name": self.model_name,
        }
    }
    self._send_message(message)
    while True:
        # Read some data
        data = self._read_message()
        log.debug(f"Received data={data}")
        try:
            variant, data = parse_message(data)
            if variant == "Challenge":
                self.authenticate_challenge(data)
            elif variant == "AccessToken":
                self.display_access(data)
                self.save_access_tokens()
                break
            else:
                log.warn(f"Encountered an unexpected message variant={variant}")
        except IndexError:
            log.error(f"Failed to parse a message from data={data}")
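# --- Message-shape sketch for verify() above; the variant names come from the
# code, the payload fields are invented. parse_message is assumed to return a
# (variant, payload) pair, as the loop implies.
new_model_msg = {
    "NewModel": {
        "email": "user@example.org",
        "password": "hunter2",
        "model_name": "my-model",
    }
}
# Expected replies, in order: zero or more "Challenge" messages (answered by
# authenticate_challenge), then one "AccessToken" message that ends the loop.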
def generate_session_pages(self):
    self.date_data = get_date_dataset()
    self.date_data.reverse()
    if self.fast_run:
        COUNTER = 0
    for leg, sess, num, d, dpub, page_start, page_end in self.date_data:
        dateobj = parse_iso_date(d)
        session = get_session_from_legsessnum(leg, sess, num)
        if not session:
            log.warn("File for %s-%s-%s is missing from the transcripts dataset!" % (leg, sess, num))
            continue
        target_dir = "%s%d/%02d/%02d" % (self.sessions_path, dateobj.year, dateobj.month, dateobj.day)
        filename = "%s/index.html" % target_dir
        info = get_session_info(leg, sess, num)
        create_dir(os.path.join(self.output_dir, target_dir))
        if type(session) in (str, unicode):
            # plain-text session
            context = {
                'date': dateobj,
                'year_number': dateobj.year,
                'leg': leg,
                'sess': sess,
                'num': num,
                'text': session,
                'monthnames': MESES,
                'pdf_url': 'xpto',
                'page_name': 'sessoes',
            }
            if info:
                context['session_info'] = info
            self.render_template_into_file('session_plaintext.html', filename, context)
        elif type(session) in (dict, OrderedDict):
            # use the .json entries as the template context
            session['date'] = parse_iso_date(session['session_date'])
            session['monthnames'] = MESES
            session['page_name'] = 'sessoes'
            self.render_template_into_file('session.html', filename, session)
        if self.fast_run:
            COUNTER += 1
            if COUNTER > self.fast_run_count:
                break
def _run_normal(self, _args, urls):
    for url in urls:
        _args['url'] = url
        provider = self._get_provider(_args)  # type: Provider
        if provider:
            provider.before_provider(_args)
            provider.run(_args)
            provider.after_provider()
            provider.update_db()
            self.global_info.add_info(info)
        else:
            self.show_log() and log.warn('Provider does not exist')
def download_file(output_dir, descriptor):
    descriptor_name = descriptor['name']
    output_path = output_dir / descriptor_name
    if output_path.is_file():
        if descriptor['hash'] == calculate_file_hash(output_path):
            log.info(f'proper file already downloaded ({descriptor_name})')
            return
        log.warn('file exists, but has invalid hash, re-downloading')
        os.remove(output_path)
    req = requests.get(descriptor['url'])
    if 200 != req.status_code:
        raise RuntimeError(f'could not download file ({descriptor_name}), try again')
    if descriptor['hash'] != calculate_buffer_hash(req.content):
        raise RuntimeError(f'downloaded file ({descriptor_name}) has invalid hash')
    with open(output_path, 'wb') as output_file:
        output_file.write(req.content)
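# --- Example descriptor for download_file; keys mirror the accesses above,
# URL and hash are placeholders. output_dir must be a pathlib.Path, since the
# function uses the `/` operator and .is_file().
from pathlib import Path

descriptor = {
    'name': 'dataset.csv',
    'url': 'https://example.org/files/dataset.csv',  # placeholder
    'hash': '<hex digest matching calculate_file_hash>',  # placeholder
}
# download_file(Path('downloads'), descriptor)  # target dir must exist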
def generate(offline=False,
             fetch_only=False,
             output_dir=OUTPUT_DIR,
             theme_dir=os.path.join(THEMES_DIR, 'centraldedados'),
             repo_dir=REPO_DIR,
             config_file=CONFIG_FILE):
    '''Main function that takes care of the whole process.'''
    global env, packages

    # Read the config file
    parser = SafeConfigParser()
    parser.read(config_file)
    # Load the theme and set up Jinja
    theme_name = parser.get('ui', 'theme')
    theme_dir = os.path.join(THEMES_DIR, theme_name)
    template_dir = os.path.join(theme_dir, "templates")
    env = jinja2.Environment(loader=jinja2.FileSystemLoader([template_dir]))

    # Set up the output directory
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    # Set up the dir for storing repositories
    if not os.path.exists(repo_dir):
        log.debug("Directory %s doesn't exist, creating it." % repo_dir)
        os.mkdir(repo_dir)
    # Copy htaccess file
    shutil.copyfile(os.path.join(theme_dir, 'static/htaccess'),
                    os.path.join(output_dir, ".htaccess"))

    # Create static dirs
    # TODO: only update changed files -- right now we regenerate the whole static dir
    # Static CSS files
    css_dir = os.path.join(output_dir, "css")
    if os.path.exists(css_dir):
        shutil.rmtree(css_dir)
    shutil.copytree(os.path.join(theme_dir, "static/css"), css_dir)
    # Static JavaScript files
    js_dir = os.path.join(output_dir, "js")
    if os.path.exists(js_dir):
        shutil.rmtree(js_dir)
    shutil.copytree(os.path.join(theme_dir, "static/js"), js_dir)
    # Theme images
    img_dir = os.path.join(output_dir, "img")
    if os.path.exists(img_dir):
        shutil.rmtree(img_dir)
    shutil.copytree(os.path.join(theme_dir, "static/img"), img_dir)
    # Fonts
    fonts_dir = os.path.join(output_dir, "fonts")
    if os.path.exists(fonts_dir):
        shutil.rmtree(fonts_dir)
    shutil.copytree(os.path.join(theme_dir, "static/fonts"), fonts_dir)

    if not parser.items('repositories'):
        log.critical('No repository data in settings.conf (does it even exist?). Cannot proceed :(')
        sys.exit()

    # go through each specified dataset
    for r in parser.items('repositories'):
        name, url = r
        dir_name = os.path.join(repo_dir, name)
        repo = None

        # do we have a local copy?
        if os.path.isdir(dir_name):
            if not os.path.isdir(os.path.join(dir_name, '.git')):
                if url.endswith(".json"):
                    log.info("%s: Data package, refreshing" % name)
                    updated = fetch_data_package(url, dir_name)
                else:
                    log.info('%s: Unsupported repo, skipping update' % name)
                    continue
            elif not offline:
                repo = git.Repo(dir_name)
                origin = repo.remotes.origin
                try:
                    origin.fetch()
                except AssertionError:
                    # usually this fails on the first run, try again
                    origin.fetch()
                except git.exc.GitCommandError:
                    log.critical("%s: Fetch error, this dataset will be left out." % name)
                    continue
                # see if we have updates
                if not local_and_remote_are_at_same_commit(repo, origin):
                    log.debug("%s: Repo has new commits, updating local copy." % name)
                    updated = True
                    # connection errors can also happen if fetch succeeds but pull fails
                    try:
                        result = origin.pull()[0]
                    except git.exc.GitCommandError:
                        log.critical("%s: Pull error, this dataset will be left out." % name)
                        continue
                    if result.flags & result.ERROR:
                        log.error("%s: Pull error, but going ahead." % name)
                        updated = False
                else:
                    log.info("%s: No changes." % name)
                    updated = False
            else:
                log.debug("%s: Offline mode, using cached version." % name)
                # we set updated to True in order to re-generate everything
                # FIXME: check the checksums of the CSV files to make sure they're
                # new before marking updated as true
                updated = True
                repo = git.Repo(dir_name)
            if fetch_only:
                # if the --fetch-only flag was set, skip to the next dataset
                continue
        else:
            if offline:
                log.warn("%s: No local cache, skipping." % name)
                continue
            else:
                if url.endswith(".git"):
                    # Handle Git repository URL
                    log.info("%s: New repo, cloning." % name)
                    try:
                        repo = git.Repo.clone_from(url, dir_name)
                        # For faster checkouts, one file at a time:
                        # repo = git.Repo.clone_from(url, dir_name, n=True, depth=1)
                        # repo.git.checkout("HEAD", "datapackage.json")
                    except git.exc.GitCommandError as inst:
                        log.warn("%s: skipping %s" % (inst, name))
                        continue
                    updated = True
                elif url.endswith(".json"):
                    # Handle data package URL
                    log.info("%s: New data package, fetching." % name)
                    updated = fetch_data_package(url, dir_name)
                else:
                    log.warn("Unsupported repository: %s" % url)

        # get datapackage metadata
        try:
            pkg_info = process_datapackage(name, repo_dir, url)
        except ParseException as inst:
            log.warn("%s: skipping %s" % (inst, name))
            continue

        # set last updated time based on the last commit; it comes as a Unix
        # timestamp, so we convert it
        import datetime
        if repo is not None:
            d = repo.head.commit.committed_date
        else:
            d = int(time.mktime(time.localtime()))
        last_updated = datetime.datetime.fromtimestamp(int(d)).strftime('%Y-%m-%d %H:%M:%S')
        pkg_info['last_updated'] = last_updated

        # add it to the packages list for index page generation after the loop ends
        packages.append(pkg_info)
        # re-generate the dataset HTML pages
        create_dataset_page(pkg_info, output_dir)
        # if the repo was updated, copy over CSV/JSON/* and ZIP files to the
        # download dir (we always generate them if offline)
        if updated or offline:
            create_dataset_page(pkg_info, output_dir)
            datafiles = pkg_info['datafiles']
            zipf = zipfile.ZipFile(os.path.join(output_dir, name + '.zip'), 'w')
            for d in datafiles:
                log.info("Copying %s" % d['path'])
                # copy file
                target = os.path.join(output_dir, os.path.basename(d['path']))
                shutil.copyfile(os.path.join(dir_name, d['path']), target)
                # generate JSON version of CSV
                # if target.endswith('.csv'):
                #     csv2json(target, target.replace(".csv", ".json"))
                # make zip file
                zipf.write(os.path.join(dir_name, d['path']), d['basename'],
                           compress_type=zipfile.ZIP_DEFLATED)
            if 'readme_path' in pkg_info:
                try:
                    zipf.write(pkg_info['readme_path'], 'README.md')
                except OSError:
                    pass
            zipf.close()

    # HTML index with the list of available packages
    create_index_page(packages, output_dir)
    # Static JSON API of the data packages
    create_api(packages, output_dir, repo_dir)
    # Static pages
    create_static_pages(output_dir)
    # Contact page
    create_contact_page(output_dir, parser.get('credentials', 'contact_email'))

    log.info("All static content is ready inside '%s'." % OUTPUT_DIR)
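# --- Sketch of the settings.conf layout that generate() reads through
# SafeConfigParser; section and option names follow the parser.get()/items()
# calls above, values are placeholders.
SETTINGS_EXAMPLE = """
[ui]
theme = centraldedados

[credentials]
contact_email = someone@example.org

[repositories]
my-dataset = https://example.org/my-dataset.git
my-package = https://example.org/pkg/datapackage.json
"""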
def process_datapackage(pkg_name, repo_dir, repo_url):
    '''Reads a data package and returns a dict with its metadata. The
    items in the dict are:
        - name
        - title
        - license
        - description
        - sources
        - readme: in HTML, processed with python-markdown from README.md,
          empty if README.md does not exist
        - datafiles: a dict that contains the contents of the "resources"
          attribute. Each resource also contains the "basename" property,
          which is the resource base filename (without preceding directory)
    '''
    pkg_dir = os.path.join(repo_dir, pkg_name)
    pkg_info = {}
    try:
        metadata = json.loads(open(os.path.join(pkg_dir, "datapackage.json")).read())
    except IOError:
        raise ParseException("datapackage.json not found")

    # get main attributes
    pkg_info['name'] = pkg_name
    pkg_info['homepage'] = repo_url
    pkg_info['original_name'] = metadata['name']
    pkg_info['title'] = metadata['title']
    pkg_info['license'] = metadata.get('license')
    if pkg_info['license'] and 'title' in pkg_info['license']:
        pkg_info['license'] = pkg_info['license']['title']
    if 'description' not in metadata:
        pkg_info['description'] = ""
    else:
        pkg_info['description'] = metadata['description']
    pkg_info['sources'] = metadata.get('sources') or []

    # process README
    readme = ""
    readme_path = os.path.join(pkg_dir, "README.md")
    if not os.path.exists(readme_path):
        readme_path = os.path.join(pkg_dir, "README.markdown")
    if not os.path.exists(readme_path):
        if len(pkg_info['description']) > 140:
            readme = markdown.markdown(pkg_info['description'], output_format="html5", encoding="UTF-8")
            pkg_info['description'] = ""
        else:
            log.warn("No README.md or description found in the data package.")
    else:
        pkg_info['readme_path'] = readme_path
        contents = codecs.open(readme_path, 'r', 'utf-8').read()
        try:
            readme = markdown.markdown(contents, output_format="html5", encoding="UTF-8")
        except UnicodeDecodeError:
            raise ParseException("README.md has invalid encoding, maybe the datapackage is not UTF-8?")
    pkg_info['readme'] = readme

    # process resource/datafiles list
    for r in metadata['resources']:
        if not r.get('schema'):
            log.warn("Schema missing in resource, adding blank")
            r['schema'] = {'fields': []}
        if not r.get('path'):
            r['path'] = 'data/%s' % r['url'].split('/')[-1]
        r['basename'] = os.path.basename(r['path'])
        if not r.get('title'):
            if r.get('name'):
                title = os.path.basename(r['name'])
            else:
                # no resource name, use capitalised filename
                title = os.path.basename(r['path']).split('.')[0]
                title = title[:1].upper() + title[1:]
            r['title'] = title
    pkg_info['datafiles'] = metadata['resources']
    return pkg_info
def generate(offline, fetch_only):
    '''Main function that takes care of the whole process.'''
    # set up the output directory
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    # set up the dir for storing repositories
    if not os.path.exists(repo_dir):
        log.info("Directory %s doesn't exist, creating it." % repo_dir)
        os.mkdir(repo_dir)
    # create dir for dataset pages
    if not os.path.exists(os.path.join(output_dir, datasets_dir)):
        os.mkdir(os.path.join(output_dir, datasets_dir))
    # create download dir for zip and csv/json/* dataset files
    if not os.path.exists(os.path.join(output_dir, files_dir)):
        os.mkdir(os.path.join(output_dir, files_dir))

    # create static dirs
    # TODO: only update changed files -- right now we regenerate the whole static dir
    css_dir = os.path.join(output_dir, "css")
    js_dir = os.path.join(output_dir, "js")
    img_dir = os.path.join(output_dir, "img")
    fonts_dir = os.path.join(output_dir, "fonts")
    if os.path.exists(css_dir):
        shutil.rmtree(css_dir)
    shutil.copytree("static/css", css_dir)
    if os.path.exists(js_dir):
        shutil.rmtree(js_dir)
    shutil.copytree("static/js", js_dir)
    if os.path.exists(img_dir):
        shutil.rmtree(img_dir)
    shutil.copytree("static/img", img_dir)
    if os.path.exists(fonts_dir):
        shutil.rmtree(fonts_dir)
    shutil.copytree("static/fonts", fonts_dir)

    # read the config file to get the datasets we want to publish
    parser = SafeConfigParser()
    parser.read(config_file)
    packages = []

    if not parser.items('repositories'):
        log.critical('No repository data in settings.conf (does it even exist?). Cannot proceed :(')
        sys.exit()

    # go through each specified dataset
    for r in parser.items('repositories'):
        name, url = r
        dir_name = os.path.join(repo_dir, name)

        # do we have a local copy?
        if os.path.isdir(dir_name):
            if not offline:
                log.info("Checking for changes in repo '%s'..." % name)
                repo = git.Repo(dir_name)
                origin = repo.remotes.origin
                try:
                    origin.fetch()
                except AssertionError:
                    # usually this fails on the first run, try again
                    origin.fetch()
                except git.exc.GitCommandError:
                    log.critical("Fetch error connecting to repository, this dataset will be ignored and not listed in the index!")
                    continue
                # connection errors can also happen if fetch succeeds but pull fails
                try:
                    result = origin.pull()[0]
                except git.exc.GitCommandError:
                    log.critical("Pull error connecting to repository, this dataset will be ignored and not listed in the index!")
                    continue
                # we get specific flags for the results Git gave us
                # and we set the "updated" var in order to signal whether to
                # copy over the new files to the download dir or not
                if result.flags & result.HEAD_UPTODATE:
                    log.info("No new changes in repo '%s'." % name)
                    updated = False
                elif result.flags & result.ERROR:
                    log.error("Error pulling from repo '%s'!" % name)
                    updated = False
                else:
                    # TODO: figure out other git-python flags and return more
                    # informative log output
                    log.info("Repo changed, updating. (returned flags: %d)" % result.flags)
                    updated = True
            else:
                log.info("Offline mode, using cached version of package %s..." % name)
                # we set updated to True in order to re-generate everything
                # FIXME: check the checksums of the CSV files to make sure they're
                # new before marking updated as true
                updated = True
                repo = git.Repo(dir_name)
            if fetch_only:
                # if the --fetch-only flag was set, skip to the next dataset
                continue
        else:
            if offline:
                log.warn("Package %s specified in settings but no local cache, skipping..." % name)
                continue
            else:
                log.info("We don't have repo '%s', cloning..." % name)
                repo = git.Repo.clone_from(url, dir_name)
                updated = True

        # get datapackage metadata
        pkg_info = process_datapackage(name)

        # set last updated time based on the last commit; it comes as a Unix
        # timestamp, so we convert it
        import datetime
        d = repo.head.commit.committed_date
        last_updated = datetime.datetime.fromtimestamp(int(d)).strftime('%Y-%m-%d %H:%M:%S')
        log.debug(last_updated)
        pkg_info['last_updated'] = last_updated

        # add it to the packages list for index page generation after the loop ends
        packages.append(pkg_info)
        # re-generate the dataset HTML pages
        create_dataset_page(pkg_info)
        # if the repo was updated, copy over CSV/JSON/* and ZIP files to the
        # download dir (we always generate them if offline)
        if updated or offline:
            create_dataset_page(pkg_info)
            datafiles = pkg_info['datafiles']
            zipf = zipfile.ZipFile(os.path.join(output_dir, files_dir, name + '.zip'), 'w')
            for d in datafiles:
                # copy CSV file
                target = os.path.join(output_dir, files_dir, os.path.basename(d['path']))
                shutil.copyfile(os.path.join(dir_name, d['path']), target)
                # generate JSON version of CSV
                if target.endswith('.csv'):
                    csv2json(target, target.replace(".csv", ".json"))
                # make zip file
                zipf.write(os.path.join(dir_name, d['path']), d['basename'],
                           compress_type=zipfile.ZIP_DEFLATED)
            try:
                zipf.write(pkg_info['readme_path'], 'README.md')
            except OSError:
                pass
            zipf.close()

    # generate the HTML index with the list of available packages
    create_index_page(packages)
    # generate the static JSON API of the data packages
    create_api(packages)
def stop(self):
    if self.is_playing:
        log.debug("Killing ffplay PID: {}".format(self.process.pid))
        os.kill(self.process.pid, SIGTERM)
    else:
        log.warn("Player: radio is not playing")
def generate(offline=False,
             fetch_only=False,
             output_dir=OUTPUT_DIR,
             theme_dir=os.path.join(THEMES_DIR, 'centraldedados'),
             repo_dir=REPO_DIR,
             config_file=CONFIG_FILE):
    '''Main function that takes care of the whole process.'''
    global env, packages

    # Read the config file
    parser = SafeConfigParser()
    parser.read(config_file)
    # Load the theme and set up Jinja
    theme_name = parser.get('ui', 'theme')
    theme_dir = os.path.join(THEMES_DIR, theme_name)
    template_dir = os.path.join(theme_dir, "templates")
    env = jinja2.Environment(loader=jinja2.FileSystemLoader([template_dir]))

    # Set up the output directory
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    # Set up the dir for storing repositories
    if not os.path.exists(repo_dir):
        log.debug("Directory %s doesn't exist, creating it." % repo_dir)
        os.mkdir(repo_dir)
    # Copy htaccess file
    shutil.copyfile(os.path.join(theme_dir, 'static/htaccess'),
                    os.path.join(output_dir, ".htaccess"))

    # Static CSS files
    css_dir = os.path.join(output_dir, "css")
    if os.path.exists(css_dir):
        shutil.rmtree(css_dir)
    shutil.copytree(os.path.join(theme_dir, "static/css"), css_dir)
    # Static JavaScript files
    js_dir = os.path.join(output_dir, "js")
    if os.path.exists(js_dir):
        shutil.rmtree(js_dir)
    shutil.copytree(os.path.join(theme_dir, "static/js"), js_dir)
    # Theme images
    img_dir = os.path.join(output_dir, "img")
    if os.path.exists(img_dir):
        shutil.rmtree(img_dir)
    shutil.copytree(os.path.join(theme_dir, "static/img"), img_dir)
    # Fonts
    fonts_dir = os.path.join(output_dir, "fonts")
    if os.path.exists(fonts_dir):
        shutil.rmtree(fonts_dir)
    shutil.copytree(os.path.join(theme_dir, "static/fonts"), fonts_dir)

    if not parser.items('repositories'):
        log.critical('No repository data in settings.conf (does it even exist?). Cannot proceed :(')
        sys.exit()

    # go through each specified dataset
    for r in parser.items('repositories'):
        name, url = r
        dir_name = os.path.join(repo_dir, name)
        repo = None

        # do we have a local copy?
        if os.path.isdir(dir_name):
            if not os.path.isdir(os.path.join(dir_name, '.git')):
                if url.endswith(".json"):
                    log.info("%s: Data package, refreshing" % name)
                    updated = fetch_data_package(url, dir_name)
                else:
                    log.info('%s: Unsupported repo, skipping update' % name)
                    continue
            elif not offline:
                repo = git.Repo(dir_name)
                origin = repo.remotes.origin
                try:
                    origin.fetch()
                except AssertionError:
                    # usually this fails on the first run, try again
                    origin.fetch()
                except git.exc.GitCommandError:
                    log.critical("%s: Fetch error, this dataset will be left out." % name)
                    continue
                # see if we have updates
                if not local_and_remote_are_at_same_commit(repo, origin):
                    log.debug("%s: Repo has new commits, updating local copy." % name)
                    updated = True
                    # connection errors can also happen if fetch succeeds but pull fails
                    try:
                        result = origin.pull()[0]
                    except git.exc.GitCommandError:
                        log.critical("%s: Pull error, this dataset will be left out." % name)
                        continue
                    if result.flags & result.ERROR:
                        log.error("%s: Pull error, but going ahead." % name)
                        updated = False
                else:
                    log.info("%s: No changes." % name)
                    updated = False
            else:
                log.debug("%s: Offline mode, using cached version." % name)
                # we set updated to True in order to re-generate everything
                updated = True
                repo = git.Repo(dir_name)
            if fetch_only:
                # if the --fetch-only flag was set, skip to the next dataset
                continue
        else:
            if offline:
                log.warn("%s: No local cache, skipping." % name)
                continue
            else:
                if url.endswith(".git"):
                    # Handle Git repository URL
                    log.info("%s: New repo, cloning." % name)
                    try:
                        repo = git.Repo.clone_from(url, dir_name)
                        # For faster checkouts, one file at a time:
                        # repo = git.Repo.clone_from(url, dir_name, n=True, depth=1)
                        # repo.git.checkout("HEAD", "datapackage.json")
                    except git.exc.GitCommandError as inst:
                        log.warn("%s: skipping %s" % (inst, name))
                        continue
                    updated = True
                elif url.endswith(".json"):
                    # Handle data package URL
                    log.info("%s: New data package, fetching." % name)
                    updated = fetch_data_package(url, dir_name)
                else:
                    log.warn("Unsupported repository: %s" % url)

        # get datapackage metadata
        try:
            pkg_info = process_datapackage(name, repo_dir, url)
        except ParseException as inst:
            log.warn("%s: skipping %s" % (inst, name))
            continue

        # set last updated time based on the last commit; it comes as a Unix
        # timestamp, so we convert it
        import datetime
        if repo is not None:
            d = repo.head.commit.committed_date
        else:
            d = int(time.mktime(time.localtime()))
        last_updated = datetime.datetime.fromtimestamp(int(d)).strftime('%Y-%m-%d %H:%M:%S')
        pkg_info['last_updated'] = last_updated

        # add it to the packages list for index page generation after the loop ends
        packages.append(pkg_info)
        # re-generate the dataset HTML pages
        create_dataset_page(pkg_info, output_dir)
        # if the repo was updated, copy over CSV/JSON/* and ZIP files to the
        # download dir (we always generate them if offline)
        if updated or offline:
            create_dataset_page(pkg_info, output_dir)
            datafiles = pkg_info['datafiles']
            zipf = zipfile.ZipFile(os.path.join(output_dir, name + '.zip'), 'w')
            for d in datafiles:
                log.info("Copying %s" % d['path'])
                # copy file
                target = os.path.join(output_dir, os.path.basename(d['path']))
                shutil.copyfile(os.path.join(dir_name, d['path']), target)
                # generate JSON version of CSV
                if target.endswith('.csv'):
                    csv2json(target, target.replace(".csv", ".json"))
                # make zip file
                zipf.write(os.path.join(dir_name, d['path']), d['basename'],
                           compress_type=zipfile.ZIP_DEFLATED)
            if 'readme_path' in pkg_info:
                try:
                    zipf.write(pkg_info['readme_path'], 'README.md')
                except OSError:
                    pass
            zipf.close()

    # HTML index with the list of available packages
    create_index_page(packages, output_dir)
    # Static JSON API of the data packages
    create_api(packages, output_dir, repo_dir)
    # Static pages
    create_static_pages(output_dir)
    # Contact page
    create_contact_page(output_dir, parser.get('credentials', 'contact_email'))

    log.info("All static content is ready inside '%s'." % OUTPUT_DIR)
def process_datapackage(pkg_name, repo_dir, repo_url):
    '''Reads a data package and returns a dict with its metadata. The
    items in the dict are:
        - name
        - title
        - license
        - repository
        - version
        - description
        - sources
        - readme: in HTML, processed with python-markdown from README.md,
          empty if README.md does not exist
        - datafiles: a dict that contains the contents of the "resources"
          attribute. Each resource also contains the "basename" property,
          which is the resource base filename (without preceding directory)
    '''
    pkg_dir = os.path.join(repo_dir, pkg_name)
    pkg_info = {}
    try:
        metadata = json.loads(open(os.path.join(pkg_dir, "datapackage.json")).read())
    except IOError:
        raise ParseException("datapackage.json not found")

    # get main attributes
    pkg_info['name'] = pkg_name
    pkg_info['homepage'] = repo_url
    pkg_info['original_name'] = metadata['name']
    pkg_info['title'] = metadata['title']
    pkg_info['license'] = metadata.get('license')
    pkg_info['version'] = metadata.get('version')
    pkg_info['repository'] = metadata.get('repository')
    pkg_info['homepage'] = metadata.get('homepage')
    if pkg_info['license'] and 'title' in pkg_info['license']:
        pkg_info['license'] = pkg_info['license']['title']
    if 'description' not in metadata:
        pkg_info['description'] = ""
    else:
        pkg_info['description'] = metadata['description']
    pkg_info['sources'] = metadata.get('sources') or []

    # process README
    readme = ""
    readme_path = os.path.join(pkg_dir, "README.md")
    if not os.path.exists(readme_path):
        readme_path = os.path.join(pkg_dir, "README.markdown")
    if not os.path.exists(readme_path):
        if len(pkg_info['description']) > 140:
            readme = markdown.markdown(pkg_info['description'], output_format="html5", encoding="UTF-8")
            pkg_info['description'] = ""
        else:
            log.warn("No README.md or description found in the data package.")
    else:
        pkg_info['readme_path'] = readme_path
        contents = codecs.open(readme_path, 'r', 'utf-8').read()
        try:
            readme = markdown.markdown(contents, output_format="html5", encoding="UTF-8")
        except UnicodeDecodeError:
            raise ParseException("README.md has invalid encoding, maybe the datapackage is not UTF-8?")
    pkg_info['readme'] = readme

    # process resource/datafiles list
    for r in metadata['resources']:
        if not r.get('schema'):
            log.warn("Schema missing in resource, adding blank")
            r['schema'] = {'fields': []}
        if not r.get('path'):
            r['path'] = 'data/%s' % r['url'].split('/')[-1]
        r['basename'] = os.path.basename(r['path'])
        if not r.get('title'):
            if r.get('name'):
                title = os.path.basename(r['name'])
            else:
                # no resource name, use capitalised filename
                title = os.path.basename(r['path']).split('.')[0]
                title = title[:1].upper() + title[1:]
            r['title'] = title
    pkg_info['datafiles'] = metadata['resources']
    return pkg_info
def seeds_to_metrics(seeds: Dict[str, Dict[str, Dict[str, list]]],
                     gt_data: Dict[str, tuple],
                     image_ids: list,
                     load_from: str = 'seeds_to_metrics.json'):
    # image_ids == [['feedback_interactive', 'UserName1', '2019-04-10T14:24:44',
    #                '2448185', 'data_set', 'DataID1'], ...]
    load_ = Path(__file__).with_name(load_from)
    if load_.exists():
        with load_.open(mode='r') as fp:
            log.info(f'Reading file {load_}')
            metrics_results = json.load(fp)
    else:
        metrics_results = {}
    for prot, v in seeds.items():
        if prot not in metrics_results:
            metrics_results[prot] = {}
        for usr, v_ in v.items():
            if usr not in metrics_results[prot]:
                metrics_results[prot][usr] = {}
            for expe, seeds_list in v_.items():
                if expe not in metrics_results[prot][usr]:
                    image_id = [e[5] for e in image_ids
                                if e[0] == prot and e[1] == usr and e[2] == expe]
                    if len(image_id) == 0:
                        print(f'[Warning] no ID found for {(prot, usr, expe)} in {image_ids}')
                        continue
                    image_id = image_id[0]
                    try:
                        img, gt = gt_data[image_id]
                    except KeyError:
                        print(f'[Warning] data ID not found, skipping "{image_id}"')
                        continue
                    metrics_ = []
                    current_seeds = np.zeros_like(img, dtype=np.int8)
                    current_seeds[(0, -1), :] = -1
                    current_seeds[:, (0, -1)] = -1
                    assert 'uint' in str(img.dtype), f'{image_id}, {img.dtype}, {img.shape}'
                    img = img.astype(np.uint16, copy=False)
                    met = Metrics()
                    met.set_multiple_inputs((gt, np.zeros_like(current_seeds, dtype=np.int8) - 1))
                    metrics_result = met.get_outcome()
                    metrics_.append(metrics_result)
                    for seeds_ in seeds_list:
                        if len(seeds_['bg']) > 0:
                            prev_w, prev_h = seeds_['bg'][0]
                            for i, (w_, h_) in enumerate(seeds_['bg']):
                                try:
                                    rr, cc = line(prev_h, prev_w, h_, w_)
                                    current_seeds[rr, cc] = -1
                                    prev_w, prev_h = w_, h_
                                except IndexError as ex:
                                    log.warn(ex)
                                    if len(seeds_['bg']) > (i + 1):
                                        prev_w, prev_h = seeds_['bg'][i + 1]
                        if len(seeds_['fg']) > 0:
                            prev_w, prev_h = seeds_['fg'][0]
                            for i, (w_, h_) in enumerate(seeds_['fg']):
                                try:
                                    rr, cc = line(prev_h, prev_w, h_, w_)
                                    current_seeds[rr, cc] = 1
                                    prev_w, prev_h = w_, h_
                                except IndexError as ex:
                                    log.warn(ex)
                                    # note: reset from the fg stroke here (the
                                    # original indexed 'bg', an apparent typo)
                                    if len(seeds_['fg']) > (i + 1):
                                        prev_w, prev_h = seeds_['fg'][i + 1]
                        segmentation_mask, *_ = grow_cut(img, current_seeds,
                                                         max_iter=2**11, window_size=3)
                        met = Metrics()
                        met.set_multiple_inputs((gt, segmentation_mask))
                        metrics_result = met.get_outcome()
                        metrics_.append(metrics_result)
                    metrics_results[prot][usr][expe] = {image_id: metrics_}
    return metrics_results
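# --- Shape sketch of the `seeds` argument consumed by seeds_to_metrics:
# protocol -> user -> experiment timestamp -> list of stroke dicts with
# 'bg'/'fg' point lists of (w, h) pairs. All names are illustrative.
seeds_example = {
    'feedback_interactive': {
        'UserName1': {
            '2019-04-10T14:24:44': [
                {'bg': [(0, 0), (10, 0)], 'fg': [(32, 30), (33, 31)]},
            ],
        },
    },
}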
def parse_event_info(event, info):
    # event is a dict, info is a BeautifulSoup object
    if event['type'] == u"Baixa comissão para discussão":
        com_name = info.find('span', id=RE_COMNAME)
        if com_name:
            event['comission_name'] = com_name.text.strip()
    elif event['type'] == u"Publicação":
        url_nodes = info.findAll('a')
        urls = [{'url': node['href'], 'title': node.text.strip('[]')} for node in url_nodes]
        event['references'] = urls
    elif event['type'] in (u"Votação na generalidade", u"Votação Deliberação", u"Votação final global"):
        vote_info = info.find('span', id=RE_VOTEINFO)
        # funky parse loop for understanding how each party voted
        # I really have to refactor this, please pester me if you need it -- rlafuente
        results = {'for': [], 'against': [], 'abstain': []}
        current_vote = None
        for c in vote_info.contents:
            if type(c) == bs4.element.Tag:
                if c.name == "br":
                    continue
                elif c.name == "i":
                    results[current_vote].append(c.text)
                else:
                    log.error("Unrecognized vote tag: %s" % c)
            elif type(c) == bs4.element.NavigableString:
                c = c.strip()
                if c == ",":
                    continue
                if c.startswith(u'Contra:'):
                    current_vote = "against"
                    if not c == u'Contra:':
                        # cases with voters in one line (individual MPs)
                        # ex. "Abstenção: Isabel Oneto (PS)"
                        c = c.replace(u'Contra: ', '').split(', ')
                        for mp in c:
                            if mp:
                                results[current_vote].append(mp.strip(','))
                elif c.startswith(u"A Favor:"):
                    current_vote = "for"
                    if not c == u'A Favor:':
                        c = c.replace(u'A Favor: ', '').split(', ')
                        for mp in c:
                            if mp:
                                results[current_vote].append(mp.strip(','))
                elif c.startswith(u"Abstenção:"):
                    current_vote = "abstain"
                    if not c == u'Abstenção:':
                        c = c.replace(u'Abstenção: ', '').split(', ')
                        for mp in c:
                            if mp:
                                results[current_vote].append(mp.strip(','))
                else:
                    log.warn("Orphan vote string: %s -- saving as voter" % c)
                    c = c.split(', ')
                    for mp in c:
                        if mp:
                            results[current_vote].append(mp)
        event['vote_info'] = results
    else:
        if info.text.strip():
            event['raw_info'] = info.text.strip()
    return event
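# --- Illustrative shape of the dict produced by the vote-parsing branch of
# parse_event_info; party and MP names are invented.
example_vote_info = {
    'for': ['PS', 'BE'],
    'against': ['PSD'],
    'abstain': ['Isabel Oneto (PS)'],
}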
def predict_log_data(
        input_file_path='./data/cache/correlation_raw_data.json',
        out_dir: Union[Path, str] = './data/results',
        remove_questionnaire_features: bool = True,
        keep_sus_as_feature: bool = False,  # may be used for AttrakDiff-only prediction
        add_features_from_pca: Union[int, float] = 0.1,  # add N (int or fraction) additional PCA features to X_{train|test}
        use_feature_selection: bool = True,
        use_num_features: int = 20,  # note: only utilized if 'use_feature_selection'
        relative_test_size: float = 0.1,
        random_state: int = 42):
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    log.info('Load data from "./best_features.json"')
    with Path('best_features.json').open('r') as fp:  # with 128 trees, best 1%
        feature_selection_columns = json.load(fp)
    feature_selection_columns = {
        k: [e.replace('\u03a3', u'Σ') for e in v]
        for k, v in feature_selection_columns.items()
    }
    if use_feature_selection:
        assert use_num_features <= len(feature_selection_columns['PQ'])

    y_labels = ('PQ', 'ATT', 'HQ-I', 'HQ-S', 'HQ', 'SUS')
    questionnaire_labels = [l.lower() for l in y_labels]
    if keep_sus_as_feature:
        questionnaire_labels = [l for l in questionnaire_labels if l.upper() != 'SUS']
        y_labels = [l for l in y_labels if l.upper() != 'SUS']

    attributes, data = get_data(input_file_path)
    pd.set_option('display.max_rows', 500)
    pd.set_option('display.max_columns', 500)
    pd.set_option('display.width', 1000)

    for y_label in y_labels:
        log.info(f'# {y_label.upper()}')
        y_attribute = [y_label.lower() == a.lower() for a in attributes]
        if remove_questionnaire_features:
            x_attributes = [(a.lower() not in questionnaire_labels) for a in attributes]
        else:
            x_attributes = np.invert(y_attribute).tolist()
        X = np.array(data)[:, x_attributes]
        y = np.array(data)[:, y_attribute].ravel()

        if 0 < add_features_from_pca:
            if 1 > add_features_from_pca:
                n_components_pca = int(np.ceil(add_features_from_pca * X.shape[1]))
            else:
                n_components_pca = add_features_from_pca
            if n_components_pca > min(*X.shape):
                log.warn(f'Could not compute PCA with {n_components_pca} components, since input X {X.shape} has '
                         f'not enough data. Using {min(*X.shape)} components instead.')
                n_components_pca = min(*X.shape)
            X, _ = impute_and_scale(X)
            pca = PCA(n_components=n_components_pca, svd_solver='full')
            pca.fit(X)
            X_pca = pca.transform(X)
            log.debug(f'X_pca has shape {X_pca.shape}')
            X = np.concatenate((X, X_pca), axis=1)
            pca_attributes = [f'PCA_VAL_{i}' for i in range(X_pca.shape[1])]
            attributes_including_pca = [*attributes, *pca_attributes]
            log.info(f'Additional PCA features: {X_pca.shape[1]}')
            x_attributes.extend([True] * X_pca.shape[1])
            del X_pca
        else:
            # no PCA features requested; keep the original attribute list
            # (the original code only defined this inside the PCA branch)
            attributes_including_pca = list(attributes)

        df = pd.DataFrame(data=X, columns=np.array(attributes_including_pca)[x_attributes])
        # log.debug(df.head())
        # log.debug(df.describe(include='all'))

        if use_feature_selection:
            # check that all (PCA) features are present
            feature_selection_columns = {
                k: [v_ for v_ in v if v_ in attributes_including_pca]
                for k, v in feature_selection_columns.items()
            }
            feature_names = feature_selection_columns[y_label.upper()][:use_num_features]
            df = df.loc[:, feature_names]
            # log.info(df.head())
            # log.info(df.describe(include='all'))
            log.info(f'Features before selection: {np.count_nonzero(x_attributes)}')
            x_attributes = [(x_attr and (atrr in feature_names))
                            for x_attr, atrr in zip(x_attributes, attributes_including_pca)]
            log.info(f'Features after selection: {np.count_nonzero(x_attributes)}')
            log.info(str([atrr for x_attr, atrr in zip(x_attributes, attributes_including_pca)
                          if (x_attr and (atrr in feature_names))]))
        X = df
        # X = X.as_matrix()
        X, scaler = impute_and_scale(X)

        log.debug(f'Number of train/test splits is {X.shape[0]}')
        kf = KFold(n_splits=X.shape[0], shuffle=False, random_state=random_state)

        log.info('Load data from "./best_parameters.json"')
        with Path('best_parameters.json').open('r') as fp:  # with 128 trees, best 1%
            parameters = json.load(fp)[y_label]
        parameters.pop('random_state', '<dummy/>')
        if X.shape[0] < 10 and parameters['min_samples_leaf'] > 1:
            log.warn(f'GBRF parameter "min_samples_leaf" is originally set to {parameters["min_samples_leaf"]}, '
                     f'however only {X.shape[0]} samples are in the overall input data. '
                     'Therefore, "min_samples_leaf" is set to 1 during training.')
            parameters['min_samples_leaf'] = 1
        log.info(f'Best parameters: {parameters}')

        for iteration_num, (train_indices, test_indices) in tqdm(enumerate(kf.split(X=X, y=y))):
            assert test_indices.size == 1
            X_train, X_test = X[train_indices], X[test_indices]
            y_train, y_test = y[train_indices], y[test_indices]
            # speed improvement
            X_train = np.asfortranarray(X_train, dtype=np.float64)
            y_train = np.ascontiguousarray(y_train, dtype=np.float32)
            X_test = np.asfortranarray(X_test, dtype=np.float64)
            y_test = np.ascontiguousarray(y_test, dtype=np.float32)
            x_feature_labels = feature_names if use_feature_selection else \
                [a for a, l in zip(attributes_including_pca, x_attributes) if l]
            assert len(x_feature_labels) == X_train.shape[1]
            log.info(f'y size: {y.shape} -> y_train size: {y_train.shape}')
            log.info(f'y label: {", ".join([a for a, l in zip(attributes_including_pca, y_attribute) if l])}')
            log.info(f'features: {len(x_feature_labels)}')

            # set seed for reproducibility
            np.random.seed(random_state + iteration_num)
            parameters.update({'random_state': random_state + iteration_num})
            additional_hash = ''
            if use_feature_selection:
                additional_hash = f'-{hash_(feature_selection_columns)}'
            save_file = f'gbrf-label_{y_label.upper()}-PCA_{add_features_from_pca}-' + \
                        f'SUS_{keep_sus_as_feature}-FEATSEL_{use_feature_selection}-' + \
                        f'{test_indices[0]}-{iteration_num}-{hash_(parameters)}-{additional_hash}.pkl'
            save_file = Path(out_dir).joinpath(save_file)
            log.info(f'Current model\'s save file: "{save_file}"')
            if save_file.is_file():
                log.info('Load model from save file')
                try:
                    sf = joblib.load(filename=save_file)
                except AttributeError as ex:
                    log.error('You probably used another version of sklearn or Python to pickle this.')
                    raise ex
                model = sf['model']
            else:
                log.info('Fit new model')
                model = GradientBoostingRegressor(**parameters)
                model.fit(X_train, y_train)
                value = {
                    'model': model,
                    'parameters': parameters,
                    'X_train': X_train,
                    'y_train': y_train,
                    'X_test': X_test,
                    'y_test': y_test,
                    'X_feature_labels': x_feature_labels,
                    'y_label': y_label,
                    'additional_parameters': {
                        'relative_test_size': relative_test_size,
                        'remove_quest_features': remove_questionnaire_features,
                        'keep_sus_as_feature': keep_sus_as_feature,
                        'add_features_from_pca': add_features_from_pca,
                        'use_feature_selection': use_feature_selection,
                        'use_num_features': use_num_features,
                        'random_state': random_state + iteration_num,
                        'scaler': scaler,
                    }
                }
                joblib.dump(value=value, filename=save_file, compress=9,
                            protocol=pickle.HIGHEST_PROTOCOL)
            y_pred = model.predict(X_test)
            for score_func in (explained_variance_score, mean_absolute_error,
                               mean_squared_error, median_absolute_error, r2_score):
                log.info(f'{score_func.__name__} {score_func(y_test, y_pred)}')
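# --- Sketch of the best_parameters.json structure loaded above: one dict of
# GradientBoostingRegressor keyword arguments per target label. Keys and
# values here are placeholders, not the tuned values.
BEST_PARAMETERS_EXAMPLE = {
    "PQ": {"n_estimators": 128, "min_samples_leaf": 2, "learning_rate": 0.1},
    "ATT": {"n_estimators": 128, "min_samples_leaf": 1, "learning_rate": 0.05},
}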