def convert(xml_data_file, json_data_file=JSON_DATA_FILE):
    """
    Converts the XML file to JSON so that the data can be processed further.
    """
    if not Path(xml_data_file).is_file():
        print(f"File {xml_data_file} does not exist")
        sys.exit(1)

    with open(xml_data_file) as infile:
        data = infile.read()
    dict_data = xmltodict.parse(data)

    with open(json_data_file, "w") as outfile:
        json.dump(dict_data, outfile, indent=4)

    # write provenance information
    prov = Provenance(json_data_file)
    prov.add(
        agents=["schlagwetter"],
        activity="xml_to_json_conversion",
        description="Convert provided XML-file to JSON.",
    )
    prov.add_primary_source(PRIMARY_SOURCE_URL)
    prov.add_sources([xml_data_file])
    prov.save()
def export_datasets():
    """
    Exports standardized datasets with the fields specified in the config.yml.
    """
    config = load_config()
    for dataset_name, fields in config["export"].items():
        export_dataset = {}
        print("exporting dataset <{}> ...".format(dataset_name))

        dataset = get_dataset(".", dataset_name)
        for entry in dataset:
            row = {field: entry[field] for field in fields}
            row["id"] = entry["id"]
            export_dataset[entry["id"]] = row

        out_file = os.path.join(
            config["project"]["export_dir"], "{}.json".format(dataset_name)
        )
        with open(out_file, "w") as outfile:
            json.dump(export_dataset, outfile, indent=4)

        prov = Provenance(out_file, overwrite=True)
        prov.add(
            agents=[PROV_AGENT],
            activity="export_std_dataset",
            description="export standardized fields <{}> from dataset <{}>".format(
                ", ".join(fields), dataset_name
            ),
        )
        prov.add_sources(dataset.source_file())
        prov.save()
def fetch(self):
    """
    Fetches all mobygames datasets and writes the file <data_dir>/mobygames.zip.
    """
    print("fetch mobygames dataset ...")
    with zipfile.ZipFile(self.filepath, "w", zipfile.ZIP_DEFLATED) as zf:
        offset = 0
        while True:
            result = self._api_call(
                SEARCH.format(api_key=self.api_key, title="", offset=offset)
            )
            games = result["games"]
            for game in games:
                game_str = json.dumps(game, indent=4)
                zf.writestr("{}.json".format(game["game_id"]), game_str)

            offset += OFFSET_STEP
            result_number = len(games)
            print("\t current offset: {}".format(offset))
            if result_number == 0:
                break

    prov = Provenance(self.filepath)
    prov.add(
        agents=[PROV_AGENT],
        activity="fetch_mobygames",
        description="Full game datasets from mobygames api",
    )
    prov.add_primary_source("mobygames")
    prov.save()
def save_provenance(self, outfilename):
    """
    Save the provenance.
    """
    prov = Provenance(outfilename, overwrite=True)
    prov.add(
        agents=[PROVIT_AGENT],
        activity=self.PROVIT_ACTIVITY,
        description=self.PROVIT_DESCRIPTION,
    )
    prov.add_sources(self.sources)
    prov.add_primary_source(self.primary_source)
    prov.save()
    return prov
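# Minimal, self-contained sketch of the provenance pattern used throughout this
# file: write a file, then attach provenance metadata to it. It assumes the
# Provenance class comes from the provit package; file names, agent name and
# activity are illustrative and not taken from any of the projects above.
import json
from provit import Provenance


def example_write_with_provenance():
    # hypothetical source and output files
    with open("raw_dataset.json", "w") as f:
        json.dump({"items": []}, f, indent=4)
    out_file = "derived_dataset.json"
    with open(out_file, "w") as f:
        json.dump({"items": []}, f, indent=4)

    prov = Provenance(out_file, overwrite=True)
    prov.add(
        agents=["example_pipeline"],            # hypothetical agent
        activity="example_transformation",
        description="Illustrative provenance entry for a derived dataset.",
    )
    prov.add_sources(["raw_dataset.json"])
    # assumes a "mobygames" primary source is configured, as in the snippets above
    prov.add_primary_source("mobygames")
    prov.save()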
def __init__(self, client_id, video_id, export_dir=EXPORT_DIR):
    comments = []
    cursor = ""
    current_len = 0

    # fetch video metadata
    resp = requests.get(META_URL.format(video_id=video_id, client_id=client_id))
    meta = resp.json()
    title = meta["title"][:20]
    title = re.sub(r"\W+", "", title)
    channel = meta["channel"]["name"]

    print("load twitch chat ... ")
    print("0", end="")

    # page through the chat comments until no "_next" cursor is returned
    while True:
        resp = requests.get(
            COMMENTS_URL.format(
                video_id=video_id,
                client_id=client_id,
                cursor=cursor,  # assumes the placeholder in COMMENTS_URL is named {cursor}
            )
        )
        resp = resp.json()
        comments += resp["comments"]

        print("\r", end="")
        current_len = len(comments)
        print(current_len, end="")

        if "_next" in resp:
            cursor = resp["_next"]
        else:
            break
    print("")

    if not os.path.exists(export_dir):
        os.makedirs(export_dir)

    video_chat = {"meta": meta, "comments": comments}
    out_file = os.path.join(
        export_dir, "{}_{}_{}.json".format(video_id, channel, title)
    )
    with open(out_file, "w") as f:
        json.dump(video_chat, f, indent=4)

    prov = Provenance(out_file)
    prov.add(
        agents=["twitch_chat_downloader"],
        activity="download_twitch_chat",
        description="video chat download for video '{}...' of channel '{}'".format(
            title, channel
        ),
    )
    prov.add_primary_source("twitchtv")
    prov.save()
def build_vis(self, outfilename, template):
    visualization = template.render(
        dataset=repr(json.dumps(self.dataset)),
        years=repr(json.dumps(list(self.years))),
        title=self.title,
    )
    with open(outfilename, "w") as outfile:
        outfile.write(visualization)

    prov = Provenance(outfilename, overwrite=True)
    prov.add(
        agents=[PROVIT_AGENT],
        activity=self.PROVIT_ACTIVITY,
        description=self.PROVIT_DESCRIPTION,
    )
    prov.add_sources([self.games_dataset_path, self.releases_dataset_path])
    prov.add_primary_source("mobygames")
    prov.save()
    return outfilename
def _write_prov(self, outfilename):
    prov = Provenance(outfilename)
    prov.add(
        agents=[PROV_AGENT],
        activity=SAMPLE_PROV_ACTIVITY,
        description=SAMPLE_PROV_DESC,
    )
    prov.save()
def build_wikidata_mapping():
    """
    Fetches all wikidata items with a mobygames company ID.
    Result is saved as JSON to DATASETS_DIR / WIKIDATA_MAPPING_FILENAME.
    """
    sparql = SPARQLWrapper(
        SPARQL_ENDPOINT,
        agent=SPARQL_AGENT,
    )
    sparql.setQuery(SPARQL_QUERY)
    sparql.setReturnFormat(JSON)

    try:
        results = sparql.query().convert()
    except URLError:
        raise RuntimeError(
            "Error while fetching data from wikidata... No file written!"
        )

    dataset = []
    for binding in results["results"]["bindings"]:
        country = None
        if "countryLabel" in binding:
            country = binding["countryLabel"]["value"]

        dataset.append({
            "mobygames_slug": binding["companyId"]["value"],
            "country": country,
            "wkp": binding["item"]["value"].split("/")[-1],
        })

    mapping_filename = Path(DATASETS_DIR) / WIKIDATA_MAPPING_FILENAME
    with open(mapping_filename, "w") as f:
        json.dump(dataset, f, indent=4)

    prov = Provenance(mapping_filename, overwrite=True)
    prov.add(
        agents=[PROV_AGENT],
        activity=WIKIDATA_PROV_ACTIVITY,
        description=WIKIDATA_PROV_DESC,
    )
    prov.add_primary_source("wikidata")
    prov.save()

    return len(dataset), mapping_filename
def path_with_directories(tmp_path_factory):
    test_path = tmp_path_factory.mktemp("directories_with_file")
    existing_dir = test_path / EXISTING_DIRNAME
    non_existing_dir = test_path / NON_EXISTING_DIRNAME
    existing_dir.mkdir()

    # Add noprov file without provenance
    noprov_file = test_path / NOPROV_FILE
    noprov_file.touch()

    # Add data file with provenance
    data_file = test_path / DATA_FILE
    data_file.touch()
    data_file_prov = Provenance(data_file)
    data_file_prov.add(
        agents=[TEST_AGENT], activity="testactivity", description=TEST_ACTIVITY
    )
    data_file_prov.save()

    d_content = {
        str(existing_dir.resolve()): {"comment": "test123"},
        str(non_existing_dir.resolve()): {"comment": "bla"},
    }
    d_expect = [
        {
            "directory": str(existing_dir.resolve()),
            "comment": "test123",
            "exists": True,
        },
        {
            "directory": str(non_existing_dir.resolve()),
            "comment": "bla",
            "exists": False,
        },
    ]

    home.cfg = get_config(test_path)
    with open(home.cfg.directories_file, "w") as dfile:
        yaml.dump(d_content, dfile)

    return test_path, d_expect
def test_directory_update(client, path_with_directories_and_file, test_filenames):
    test_path, _ = path_with_directories_and_file

    # Create file with provenance at original location
    test_file = test_path.joinpath(test_filenames["EXISTING_DIRNAME"], "test.txt")
    test_file.touch()
    p = Provenance(test_file)
    p.add(
        agents=["testagents"],
        activity="test_activity",
        description="test_description",
    )
    p.save()

    # Move to new location
    new_path = Path(test_path.parent) / "new"
    test_path.rename(Path(new_path))

    # Update directory
    data = {"directory": str(new_path / test_filenames["EXISTING_DIRNAME"])}
    result = do_json_post(client, "/directory/update", data)

    prov = result["files"][0]["prov"]
    assert prov["last_activity"].startswith("file moved to new location ->")
def get_prov(filepath, channel_name):
    prov = Provenance(filepath)
    prov_data = prov.tree()
    return {"channel": channel_name, "retrieved_at": prov_data["ended_at"]}
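# Sketch (not part of the original code) showing how get_prov() above could be
# used to build a retrieval overview for several channel files. The directory
# layout and the "<channel>.json" naming convention are assumptions.
from pathlib import Path


def example_retrieval_overview(data_dir="data/channels"):
    return [
        get_prov(str(path), path.stem)
        for path in sorted(Path(data_dir).glob("*.json"))
    ]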
def build(self):
    g = nx.Graph()
    all_games = set()
    games = {}

    self.dataset = get_combined_dataset()
    if not self.gamelist_file:
        self.dataset.set_filter([self.platform], self.countries)
    else:
        gamelist = load_gamelist(self.gamelist_file)
        self.dataset.set_gamelist_filter(gamelist)

    company_list = []
    for company_id, production_roles in self.dataset.filtered_dataset.items():
        company_list += self.company_ids(company_id, production_roles)

    for c in company_list:
        g.add_node(c)

    # add an edge between two companies if they worked on at least one shared game
    for c1, c2 in tqdm(combinations(company_list, 2)):
        if self.roles:
            c1_id = c1.split("__")[0]
            c1_role = c1.split("__")[1]
            c2_id = c2.split("__")[0]
            c2_role = c2.split("__")[1]
        else:
            c1_id = c1
            c1_role = None
            c2_id = c2
            c2_role = None

        if c1 not in games:
            games[c1] = self._filter_games(
                self.dataset.filtered_dataset[c1_id],
                self.countries,
                self.platform,
                c1_role,
            )
        if c2 not in games:
            games[c2] = self._filter_games(
                self.dataset.filtered_dataset[c2_id],
                self.countries,
                self.platform,
                c2_role,
            )

        overlap = games[c1].intersection(games[c2])
        all_games = all_games.union(overlap)
        if len(overlap) > 0:
            g.add_edge(c1, c2, weight=len(overlap))

    # add node information
    for node in g.nodes():
        id_ = node.split("__")[0]
        if self.roles:
            role = node.split("__")[1]

        g.nodes[node]["country"] = self._get_wiki_country(id_)
        if self.roles:
            g.nodes[node]["company_name"] = self.dataset.filtered_dataset[id_][0][
                "company_name"
            ]
            g.nodes[node]["role"] = role
            g.nodes[node]["label"] = (
                self.dataset.filtered_dataset[id_][0]["company_name"]
                + "("
                + role
                + ")"
            )
        else:
            g.nodes[node]["label"] = self.dataset.filtered_dataset[id_][0][
                "company_name"
            ]
        g.nodes[node]["no_of_games"] = len(games[node])

    # build output filename
    out_path = Path(COMPANY_NETWORKS_DIR)
    out_filename = "company_network_"
    if self.gamelist_file:
        project_name = self.gamelist_file.split("/")[-1].replace(".yml", "")
        out_filename += project_name
    else:
        out_filename += self.countries_str(self.countries)
        out_filename += "_" + self.platform_str(self.platform)
    if self.roles:
        out_filename += "_roles"
    if self.publisher:
        out_filename += "_pub"
    out_filename += ".graphml"
    out_file = out_path / out_filename

    nx.write_graphml(g, out_file)

    prov = Provenance(out_file)
    prov.add(
        agents=[PROV_AGENT],
        activity=NETWORK_PROV_ACTIVITY,
        description=NETWORK_PROV_DESC.format(
            platforms=self.platform_str(self.platform),
            countries=self.countries_str(self.countries),
        ),
    )
    prov.save()

    self._write_log(out_file, len(g.nodes), len(g.edges), len(all_games))
    return out_file, len(g.nodes), len(g.edges), len(all_games)
def build_mobygames_companies(unified_api_url=DIGGR_API):
    """
    Builds a reduced local company dataset from the unified api mobygames dataset.

    The dataset is a dictionary with the (mobygames) company id as its key, and a
    list of the production roles for all the games the company was involved in:

        <company_id>: [ <game_1_prod_info_1>, <game_1_prod_info_2>, <game_3_prod_info_1>, ... ]

    The production information contains the following data points:

    Company-specific information:
    * company_name
    * production_role

    Game-specific information:
    * game_id
    * game_slug
    * game_title
    * game_years
    * release_countries
    * platform
    """
    api = DiggrApi(unified_api_url)
    pm = dt.PlatformMapper("mobygames")

    dataset = defaultdict(list)
    for id_ in tqdm(api.mobygames_ids()):
        data = api.entry("mobygames", id_)
        slug = data["raw"]["moby_url"].split("/")[-1]

        for platform in data["raw"]["platforms"]:
            for release in platform["releases"]:
                for company in release["companies"]:
                    dataset[company["company_id"]].append({
                        "company_name": company["company_name"],
                        "game_id": id_,
                        "game_slug": slug,
                        "game_title": data["title"],
                        "game_years": data["years"],
                        "production_role": company["role"],
                        "release_countries": release["countries"],
                        "platform": pm[platform["platform_name"]],
                    })

    mg_companies_filename = Path(DATASETS_DIR) / MOBYGAMES_COMPANIES_FILENAME
    with open(mg_companies_filename, "w") as f:
        json.dump(dict(dataset), f, indent=4)

    prov = Provenance(mg_companies_filename, overwrite=True)
    prov.add(
        agents=[PROV_AGENT],
        activity=COMPANIES_PROV_ACTIVITY,
        description=COMPANIES_PROV_DESC,
    )
    prov.add_primary_source("mobygames")
    prov.add_primary_source("diggr_platform_mapping")
    prov.save()

    return mg_companies_filename
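# Illustrative shape (all values invented) of a single entry in the company
# dataset written above. The keys mirror the dict built in
# build_mobygames_companies(); the exact value types are assumptions.
EXAMPLE_COMPANY_ENTRY = {
    "1234": [
        {
            "company_name": "Example Studio",
            "game_id": 42,
            "game_slug": "example-game",
            "game_title": "Example Game",
            "game_years": [1999],
            "production_role": "Developed by",
            "release_countries": ["Japan"],
            "platform": "PlayStation",
        }
    ]
}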
def __init__(self, channel, archive_filepath):
    super().__init__()

    archive = ZipArchive(archive_filepath)
    self.channel_metadata = archive.get("channel_meta.json")
    channel_id = self.channel_metadata["items"][0]["id"]
    self.channel_title = self.channel_metadata["items"][0]["snippet"]["title"]

    self._current = YoutubeArchiveReader(archive_filepath)

    diff_filepath = os.path.join(
        os.path.dirname(archive_filepath),
        "{}_{}.zip".format(self.channel_title, datetime.now().isoformat()[:19]),
    )
    self._archive = ZipArchive(diff_filepath)

    updated = []
    print("Update youtube channel <{}>".format(self.channel_title))

    uploads = self.channel_metadata["items"][0]["contentDetails"][
        "relatedPlaylists"
    ]["uploads"]

    # get all video ids in uploads playlist
    video_ids = []
    next_page = None
    while True:
        cmd = self.youtube.playlistItems().list(
            playlistId=uploads,
            part="snippet,status",
            maxResults=50,
            pageToken=next_page,
        )
        playlist_page = youtube_api_call(cmd)
        video_ids += [
            x["snippet"]["resourceId"]["videoId"] for x in playlist_page["items"]
        ]
        if "nextPageToken" in playlist_page:
            next_page = playlist_page["nextPageToken"]
        else:
            break

    # re-fetch videos that are new or whose comment count changed since the last run
    for i, video_id in enumerate(video_ids):
        print("{}/{}".format(i, len(video_ids)))
        if video_id in self._current.video_ids:
            video = self._current[video_id]
            comment_count = video.comment_count

            new_meta = self._load_video_metadata(video_id)
            meta = new_meta["items"][0]
            new_comment_count = (
                int(meta["statistics"]["commentCount"])
                if "commentCount" in meta["statistics"]
                else None
            )
            if comment_count == new_comment_count:
                continue

        updated.append(video_id)
        self._fetch_video_metadata(video_id)
        self._fetch_video_comments(video_id)
        self._fetch_video_captions(video_id)

    if len(updated) == 0:
        # nothing changed: discard the empty diff archive
        os.remove(diff_filepath)
    else:
        print(self._current.last_update_file())
        self._archive.add("video_ids.json", updated)

        prov = Provenance(self._archive.filepath)
        prov.add(
            agents=[PROV_AGENT],
            activity="update_channel",
            description="Youtube video/comment update data for channel <{}>".format(
                self.channel_title
            ),
        )
        prov.add_sources([self._current.last_update_file()])
        prov.save()