def deploy(target, url, force=False):
    """
    Download the instance archive from ``url`` and extract it into ``target``.

    :param target: Directory the zip archive is extracted into.
    :param url: Location of the instance zip archive.
    :param force: Deploy even when the instance folder already exists.
    :raises click.BadParameter: When ``url`` is ``None``.
    """
    if url is None:
        raise click.BadParameter("Please set instance asset url in INSTANCE_URL enviroment variable")

    if instance_path().exists() and not force:
        logger.info("Skipping deployment; Instance folder exists")
        return

    logger.debug("Retrieving instance folder from %s", url)
    try:
        local_filename, headers = urllib.request.urlretrieve(url)
        # Diagnostics go to the debug log instead of being printed on stdout.
        logger.debug("Downloaded %s (headers: %s)", local_filename, headers)
        with ZipFile(local_filename) as instance_zip:
            instance_zip.extractall(target)
    finally:
        # urlretrieve() stores the download in a temporary file; urlcleanup()
        # removes only the temporary files urlretrieve() created itself.
        urllib.request.urlcleanup()
def _instance_path(prefix=number_topics):
    """
    Return the ``lda/<prefix>`` directory below the instance folder.

    The directory (and any missing parents) is created when it does not
    exist yet.

    :param prefix: Value used, as a string, for the last path component.
    """
    lda_dir = instance_path().joinpath("lda", str(prefix))
    lda_dir.mkdir(parents=True, exist_ok=True)
    return lda_dir
def build(target, method: list, dataset_name, limit: int, number_of_topics):
    """
    Build page.

    Loads the dataset, samples at most ``limit`` candidates, measures the
    distances between them, analyses their text answers and writes the
    resulting ``nodes`` and ``links`` json files into ``target``. The
    settings are written to ``app.cfg`` in the instance folder afterwards.

    :param target: Target directory; must support the ``/`` operator.
    :param method: List of methods to use.
    :param dataset_name: Name of the module below ``agora_analytica.data``
        that provides ``load_dataset()``.
    :param limit: Limit processing into N candidates.
    :param number_of_topics: Number of topics; ``-1`` takes the value from
        the settings, falling back to the square root of ``limit``.
    :raises click.BadParameter: When ``limit`` is smaller than 2.
    """
    # Reject an unusable limit before spending time on loading the dataset.
    if limit < 2:
        raise click.BadParameter("Build should include at least 2 candidates.", param_hint="--limit")

    click.echo("Loading dataset ... ", nl=False)
    dataset = importlib.import_module(f".{dataset_name}", "agora_analytica.data")
    df = dataset.load_dataset()
    df = df.sample(min(limit, df.shape[0]))
    click.echo("[DONE]")

    click.echo("Calculating distances ... ", nl=False)
    distances = measure_distances(df, methods=method)
    click.echo("[DONE]")

    click.echo("Analyzing text ... ", nl=False)
    if number_of_topics == -1:
        # Using squareroot seems to provide pretty good default
        number_of_topics = settings.getint("build", "number_of_topics", fallback=np.sqrt(limit))

    # The fallback above is a float, so normalise before storing and printing.
    number_of_topics = int(number_of_topics)
    settings.set("build", "number_of_topics", str(number_of_topics))
    click.echo(f"Topics: {number_of_topics} ", nl=False)

    texts_df = df.text_answers().sort_index()
    visualization = settings.getboolean('build', 'generate_visualization', fallback=debug)
    topics = TextTopics(texts_df, number_topics=number_of_topics, generate_visualization=visualization)

    # Compare every unordered pair of rows once and store the result for
    # both directions of the pair.
    words = {}
    n = texts_df.shape[0]
    for a in range(n):
        a_idx = texts_df.index[a]
        for b in range(a + 1, n):
            b_idx = texts_df.index[b]
            r = topics.compare_rows(texts_df, a_idx, b_idx)
            if r:
                words[(a_idx, b_idx)] = r[0][1]
                words[(b_idx, a_idx)] = r[1][1]
    click.echo("[DONE]")

    click.echo("Generating structures ... ", nl=False)
    # ``np.nan`` instead of ``np.NaN``: the capitalised alias no longer exists
    # in NumPy 2.0.
    data_nodes = [{
        "id": int(idx),
        "name": row.get("name"),
        "party": row.get("party"),
        "image": row.get("image", None),
        "constituency": row.get("vaalipiiri"),
        "number": int(row.get("number", -1))
    } for idx, row in df.replace(np.nan, None).iterrows()]

    data_links = [{
        "source": int(src),
        "source_term": words.get((src, dst), None),
        "distance": float(dist),
        "target_term": words.get((dst, src), None),
        "target": int(dst)
    } for src, dist, dst in distances.values]
    click.echo("[DONE]")

    # Build static pages
    _build_pages(target / "pages")

    click.echo("Writing data ... ", nl=False)
    _write("nodes", data_nodes, target)
    _write("links", data_links, target)

    cfg = instance_path() / "app.cfg"
    with cfg.open('w') as f:
        settings.write(f, space_around_delimiters=True)
    click.echo("[DONE]")
def _write(file, data, target=instance_path()):
    """
    Store ``data`` as json in ``<target>/<file>.json``.

    The output is indented by 4 when the module-level ``debug`` flag is set
    and by 0 otherwise.

    :param file: File name without the ``.json`` extension.
    :param data: Value handed to ``jsonify``.
    :param target: Directory to write into.
    """
    indentation = 4 if debug else 0
    destination = os.path.join(target, f"{file}.json")
    with open(destination, 'w') as handle:
        serialised = jsonify(data, indent=indentation)
        handle.write(serialised)
# Module-level debug flag; the ``cli`` group callback overwrites it with the
# value of the --debug/--no-debug option.
debug = False
# Shared settings object; ``cli`` reads the selected config file into it.
settings = config()


def _write(file, data, target=instance_path()):
    """
    Helper to write data into json file

    :param file: File name without extension; ``.json`` is appended.
    :param data: Data passed to ``jsonify``.
    :param target: Directory the file is written into. The default is
        evaluated once, when the function is defined.
    """
    with open(os.path.join(target, f"{file}.json"), 'w') as f:
        # Indent by 4 in debug mode, by 0 otherwise.
        f.write(jsonify(data, indent=(4 if debug else 0)))


# Root command group: stores the debug flag, configures logging and loads
# the config file before any subcommand runs.
@click.group()
@click.option("--debug/--no-debug", default=debug, help="Show debug output")
@click.option("--config", default=instance_path() / "app.cfg", help="Config file")
def cli(debug, config):
    # The parameter shadows the module-level ``debug``, so the global is
    # updated through globals().
    globals()['debug'] = debug
    logging.basicConfig(level=(logging.DEBUG if debug else logging.INFO))
    settings.read(config)


# ``deploy`` subcommand: --target defaults to the current working directory,
# --url to the INSTANCE_URL environment variable (None when unset).
@cli.command()
@click.option("--target", type=click.Path(file_okay=False), default=Path.cwd(), show_default=True)
@click.option("--url", default=os.environ.get("INSTANCE_URL", None), show_default=True)
def deploy(target, url, force=False):
    # Neither the option nor the environment variable supplied a url.
    if url is None:
        raise click.BadParameter("Please set instance asset url in INSTANCE_URL enviroment variable")
import click

from agora_analytica import instance_path
from agora_analytica.data.utils import generate_names
from agora_analytica.data.interpolation.wikidata import finnish_politicians

import pandas as pd

# Extra attributes to append into image url. By default wikipedia uses 300px wide
# images, so it's good enough for us.
IMAGE_URL_APPEND = "?width=300px"


# FILE defaults to ``nodes.json`` in the instance folder and must be an
# existing regular file.
@click.command()
@click.argument('file', type=click.Path(file_okay=True, dir_okay=False, exists=True), default=instance_path() / "nodes.json")
def cli_obfuscate(file):
    """ Obfuscate contents of node FILE """
    # Opened for reading and writing ("r+").
    with open(file, mode="r+") as fp:
        df = pd.read_json(fp, orient="records")

        # Check for image using name
        images = politician_pictures()
        for idx, row in df.iterrows():
            # Lookup key is the lower-cased, stripped name.
            name = row['name'].lower().strip()
            img = images.get(name, None)
            # Rows without a matching picture get None as image.
            df.loc[idx, "image"] = img + IMAGE_URL_APPEND if img else None

        # Generate fake names
        df = fake_names(df)
def test_instancepath():
    """``instance_path()`` returns a ``Path`` that points to a directory."""
    location = instance_path()
    assert isinstance(location, Path)
    assert location.is_dir()
# Module-level debug flag; the ``cli`` group callback overwrites it with the
# value of the --debug/--no-debug option.
debug = False
# Shared settings object; ``cli`` reads the selected config file into it.
settings = config()


def _write(file, data, target=instance_path()):
    """
    Helper to write data into json file

    :param file: File name without extension; ``.json`` is appended.
    :param data: Data passed to ``jsonify``.
    :param target: Directory the file is written into. The default is
        evaluated once, when the function is defined.
    """
    with open(os.path.join(target, f"{file}.json"), 'w') as f:
        # Indent by 4 in debug mode, by 0 otherwise.
        f.write(jsonify(data, indent=(4 if debug else 0)))


# Root command group: stores the debug flag, configures logging and loads
# the config file before any subcommand runs.
@click.group()
@click.option("--debug/--no-debug", default=debug, help="Show debug output")
@click.option("--config", default=instance_path() / "app.cfg", help="Config file")
def cli(debug, config):
    # The parameter shadows the module-level ``debug``, so the global is
    # updated through globals().
    globals()['debug'] = debug
    logging.basicConfig(level=(logging.DEBUG if debug else logging.INFO))
    settings.read(config)


# Subcommand options: --target defaults to the resolved parent of the
# instance folder, --url to the INSTANCE_URL environment variable (None
# when unset).
@cli.command()
@click.option("--target", type=click.Path(file_okay=False), default=(instance_path() / "..").resolve(), show_default=True)
@click.option("--url", default=os.environ.get("INSTANCE_URL", None), show_default=True)
def build(target, method: list, dataset_name, limit: int, number_of_topics):
    """
    Build page.

    Loads the dataset, selects at most ``limit`` candidates (preferred
    candidates first when configured), measures the distances between them,
    analyses their text answers and writes the resulting ``nodes`` and
    ``links`` json files into ``target``. The settings are written to
    ``app.cfg`` in the instance folder afterwards.

    :param target: Target directory; must support the ``/`` operator.
    :param method: List of methods to use.
    :param dataset_name: Name of the module below ``agora_analytica.data``
        that provides ``load_dataset()``.
    :param limit: Limit processing into N candidates.
    :param number_of_topics: Number of topics; ``-1`` takes the value from
        the settings, falling back to the square root of ``limit``.
    :raises click.BadParameter: When ``limit`` is smaller than 2.
    """
    # Local import: ``DataFrame.append`` no longer exists in pandas 2.0, so
    # the frames are combined with ``pd.concat`` below.
    import pandas as pd

    # Reject an unusable limit before spending time on loading the dataset.
    if limit < 2:
        raise click.BadParameter(
            "Build should include at least 2 candidates.", param_hint="--limit")

    click.echo("Loading dataset ... ", nl=False)
    dataset = importlib.import_module(f".{dataset_name}", "agora_analytica.data")
    df = dataset.load_dataset()

    preferred_list_file = settings.get("build", "preferred_candidates", fallback=None)
    if preferred_list_file:
        with open(preferred_list_file) as fp:
            # Fetch all preferred candidates by row, skipping ones beginning
            # with `#`. Collected into a list while the file is still open.
            preferred_candidates = [
                line for line in map(str.strip, fp)
                if line and not line.startswith("#")
            ]

        # Slice preferred candidates
        preferred_filter = df["name"].isin(preferred_candidates)
        preferred = df[preferred_filter]

        # Fill to a required amount with sampled data
        filler = df[~preferred_filter].sample(
            clamp(df.shape[0] - preferred.shape[0], limit - preferred.shape[0], 0))
        df = pd.concat([preferred, filler])

        del preferred, preferred_filter

    # sample to a correct size
    df = df.sample(min(limit, df.shape[0]))
    click.echo("[DONE]")

    click.echo("Calculating distances ... ", nl=False)
    distances = measure_distances(df, methods=method)
    click.echo("[DONE]")

    click.echo("Analyzing text ... ", nl=False)
    if number_of_topics == -1:
        # Using squareroot seems to provide pretty good default
        number_of_topics = settings.getint("build", "number_of_topics", fallback=np.sqrt(limit))

    # The fallback above is a float, so normalise before storing and printing.
    number_of_topics = int(number_of_topics)
    settings.set("build", "number_of_topics", str(number_of_topics))
    click.echo(f"Topics: {number_of_topics} ", nl=False)

    texts_df = df.text_answers().sort_index()
    visualization = settings.getboolean('build', 'generate_visualization', fallback=debug)
    topics = TextTopics(texts_df, number_topics=number_of_topics, generate_visualization=visualization)

    # Compare every unordered pair of rows once and store the result for
    # both directions; a talking point is looked up once per row.
    words = {}
    n = texts_df.shape[0]
    talkinpoints = {}
    for a in range(n):
        a_idx = texts_df.index[a]
        for b in range(a + 1, n):
            b_idx = texts_df.index[b]
            r = topics.compare_rows(texts_df, a_idx, b_idx)
            if r:
                words[(a_idx, b_idx)] = r[0][1]
                words[(b_idx, a_idx)] = r[1][1]
        talkinpoints[a_idx] = topics.find_talkingpoint(texts_df.loc[a_idx])
    click.echo("[DONE]")

    click.echo("Generating structures ... ", nl=False)
    # ``np.nan`` instead of ``np.NaN``: the capitalised alias no longer exists
    # in NumPy 2.0.
    data_nodes = [{
        "id": int(idx),
        "name": row.get("name"),
        "party": row.get("party"),
        "image": row.get("image", None),
        "constituency": row.get("constituency"),
        "number": int(row.get("number", -1)),
        "talkinpoint": talkinpoints.get(int(idx), None)
    } for idx, row in df.replace(np.nan, None).iterrows()]

    data_links = [{
        "source": int(src),
        "source_term": words.get((src, dst), None),
        "distance": float(dist),
        "target_term": words.get((dst, src), None),
        "target": int(dst)
    } for src, dist, dst in distances.values]
    click.echo("[DONE]")

    # Build static pages
    _build_pages(target / "pages")

    click.echo("Writing data ... ", nl=False)
    _write("nodes", data_nodes, target)
    _write("links", data_links, target)

    cfg = instance_path() / "app.cfg"
    with cfg.open('w') as f:
        settings.write(f, space_around_delimiters=True)
    click.echo("[DONE]")