Example #1
def convert(xml_data_file, json_data_file=JSON_DATA_FILE):
    """
    Converts the XML file to JSON so that it can be processed further.
    """
    if not Path(xml_data_file).is_file():
        print(f"File {xml_data_file} does not exist")
        sys.exit(1)

    with open(xml_data_file) as infile:
        data = infile.read()

    dict_data = xmltodict.parse(data)

    with open(json_data_file, "w") as outfile:
        json.dump(dict_data, outfile, indent=4)

    # write provenance information
    prov = Provenance(json_data_file)
    prov.add(
        agents=["schlagwetter"],
        activity="xml_to_json_conversion",
        description="Convert provided XML-file to JSON.",
    )
    prov.add_primary_source(PRIMARY_SOURCE_URL)
    prov.add_sources([xml_data_file])
    prov.save()
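
All examples on this page share the same provit pattern: create a Provenance object for the output file, describe the activity, attach sources, and save. The sketch below is a minimal, self-contained illustration of that pattern, assuming that Provenance is imported from the provit package and that out.json already exists; the agent, activity and source names are made up.

# Minimal sketch of the provenance pattern shared by the examples on this page.
# Assumes the provit package is installed and out.json was already written;
# agent, activity and source names are hypothetical.
from provit import Provenance

prov = Provenance("out.json")
prov.add(
    agents=["example_agent"],
    activity="example_activity",
    description="Describe how out.json was produced.",
)
prov.add_primary_source("example_source")
prov.add_sources(["input.xml"])
prov.save()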
Example #2
def export_datasets():
    """
    Exports standardized datasets with the fields specified in config.yml.
    """
    config = load_config()

    for dataset_name, fields in config["export"].items():
        export_dataset = {}
        
        print("exporting dataset <{}> ...".format(dataset_name))
        dataset = get_dataset(".", dataset_name)

        for entry in dataset:
            row = { field: entry[field] for field in fields }
            row["id"] = entry["id"]
            export_dataset[entry["id"]] = row

        out_file = os.path.join(config["project"]["export_dir"], "{}.json".format(dataset_name))
        with open(out_file, "w") as outfile:
            json.dump(export_dataset, outfile, indent=4)

        prov = Provenance(out_file, overwrite=True)
        prov.add(
            agents=[ PROV_AGENT ], 
            activity="export_std_dataset",
            description="export standardized fields <{}> from dataset <{}>".format(", ".join(fields), dataset_name)
        )
        prov.add_sources(dataset.source_file())
        prov.save()
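
export_datasets() expects a config.yml with a project.export_dir entry and an export section mapping each dataset name to the list of fields to keep. Below is a hypothetical sketch of the dict that load_config() would return for such a file; the dataset name and field names are made up.

# Hypothetical shape of the loaded config; the real config.yml is project-specific,
# and the dataset name and field names below are made up.
config = {
    "project": {
        "export_dir": "exports",
    },
    "export": {
        "games": ["title", "platform"],
    },
}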
Example #3
    def fetch(self):
        """
        Fetches all mobygames datasets and writes the file <data_dir>/mobygames.zip.
        """
        print("fetch mobygames dataset ...")
        with zipfile.ZipFile(self.filepath, "w", zipfile.ZIP_DEFLATED) as zf:

            offset = 0          
            while True:
            
                result = self._api_call(SEARCH.format(api_key=self.api_key, title="", offset=offset))
                games = result["games"]
                for game in games:
                    game_str = json.dumps(game, indent=4)
                    zf.writestr("{}.json".format(game["game_id"]), game_str)

                offset += OFFSET_STEP
                result_number = len(games)
                print("\t current offset: {}".format(offset))

                if result_number == 0:
                    break
                
        prov = Provenance(self.filepath)
        prov.add(agents=[ PROV_AGENT ], activity="fetch_mobygames", description="Full game datasets from mobygames api")
        prov.add_primary_source("mobygames")
        prov.save()
Example #4
    def save_provenance(self, outfilename):
        """
        Save the provenance.
        """
        prov = Provenance(outfilename, overwrite=True)
        prov.add(
            agents=[PROVIT_AGENT],
            activity=self.PROVIT_ACTIVITY,
            description=self.PROVIT_DESCRIPTION,
        )
        prov.add_sources(self.sources)
        prov.add_primary_source(self.primary_source)
        prov.save()
        return prov
Example #5
    def __init__(self, client_id, video_id, export_dir=EXPORT_DIR):

        comments = []
        cursor = ""
        current_len = 0

        resp = requests.get(
            META_URL.format(video_id=video_id, client_id=client_id))
        meta = resp.json()

        title = meta["title"][:20]
        title = re.sub(r'\W+', '', title)

        channel = meta["channel"]["name"]

        print("load twitch chat ... ")
        print("0", end="")
        while True:
            resp = requests.get(
                COMMENTS_URL.format(video_id=video_id,
                                    client_id=client_id,
                                    cursor=cursor))
            resp = resp.json()

            comments += resp["comments"]
            print("\r", end="")
            current_len = len(comments)
            print(current_len, end="")

            if "_next" in resp:
                cursor = resp["_next"]
            else:
                break

        print("")

        if not os.path.exists(export_dir):
            os.makedirs(export_dir)

        video_chat = {"meta": meta, "comments": comments}

        out_file = os.path.join(
            export_dir, "{}_{}_{}.json".format(video_id, channel, title))
        with open(out_file, "w") as outfile:
            json.dump(video_chat, outfile, indent=4)

        prov = Provenance(out_file)
        prov.add(
            agents=["twitch_chat_downloader"],
            activity="download_twitch_chat",
            description="video chat download for video '{}...' of channel '{}'"
            .format(title, channel))
        prov.add_primary_source("twitchtv")
        prov.save()
Example #6
    def build_vis(self, outfilename, template):

        visualization = template.render(
            dataset=repr(json.dumps(self.dataset)),
            years=repr(json.dumps(list(self.years))),
            title=self.title,
        )

        with open(outfilename, "w") as outfile:
            outfile.write(visualization)

        prov = Provenance(outfilename, overwrite=True)
        prov.add(
            agents=[PROVIT_AGENT],
            activity=self.PROVIT_ACTIVITY,
            description=self.PROVIT_DESCRIPTION,
        )
        prov.add_sources([self.games_dataset_path, self.releases_dataset_path])
        prov.add_primary_source("mobygames")
        prov.save()

        return outfilename
Example #7
    def _write_prov(self, outfilename):
        prov = Provenance(outfilename)
        prov.add(
            agents=[PROV_AGENT],
            activity=SAMPLE_PROV_ACTIVITY,
            description=SAMPLE_PROV_DESC,
        )
        prov.save()
Example #8
def build_wikidata_mapping():
    """
    Fetches all wikidata items with a mobygames company ID.
    Result is saved as JSON to DATASETS_DIR / WIKIDATA_MAPPING_FILENAME.
    """
    sparql = SPARQLWrapper(
        SPARQL_ENDPOINT,
        agent=SPARQL_AGENT,
    )

    sparql.setQuery(SPARQL_QUERY)
    sparql.setReturnFormat(JSON)
    try:
        results = sparql.query().convert()
    except URLError:
        raise RuntimeError(
            "Error while fetching data from wikidata... No file written!")

    dataset = []
    for binding in results["results"]["bindings"]:
        country = None
        if "countryLabel" in binding:
            country = binding["countryLabel"]["value"]

        dataset.append({
            "mobygames_slug": binding["companyId"]["value"],
            "country": country,
            "wkp": binding["item"]["value"].split("/")[-1],
        })

    mapping_filename = Path(DATASETS_DIR) / WIKIDATA_MAPPING_FILENAME
    with open(mapping_filename, "w") as f:
        json.dump(dataset, f, indent=4)

    prov = Provenance(mapping_filename, overwrite=True)
    prov.add(
        agents=[PROV_AGENT],
        activity=WIKIDATA_PROV_ACTIVITY,
        description=WIKIDATA_PROV_DESC,
    )
    prov.add_primary_source("wikidata")
    prov.save()

    return len(dataset), mapping_filename
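
build_wikidata_mapping() reads the bindings item, companyId and countryLabel from the query result, so SPARQL_QUERY (which is not shown here) must select variables with those names. The sketch below is a hypothetical query of that shape; the property IDs and the use of the label service are assumptions, not the project's actual query.

# Hypothetical sketch of SPARQL_QUERY; the real query is defined elsewhere in the project.
# The selected variables match the binding keys read above (?item, ?companyId, ?countryLabel).
# wdt:P4773 ("MobyGames company ID") and wdt:P17 ("country") are assumed property IDs.
SPARQL_QUERY_SKETCH = """
SELECT ?item ?companyId ?countryLabel WHERE {
  ?item wdt:P4773 ?companyId .
  OPTIONAL { ?item wdt:P17 ?country . }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
}
"""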
Example #9
def path_with_directories(tmp_path_factory):
    test_path = tmp_path_factory.mktemp("directories_with_file")
    existing_dir = test_path / EXISTING_DIRNAME
    non_existing_dir = test_path / NON_EXISTING_DIRNAME
    existing_dir.mkdir()

    # Add noprov file without provenance
    noprov_file = test_path / NOPROV_FILE
    noprov_file.touch()

    # Add data file with provenance
    data_file = test_path / DATA_FILE
    data_file.touch()
    data_file_prov = Provenance(data_file)
    data_file_prov.add(agents=[TEST_AGENT],
                       activity="testactivity",
                       description=TEST_ACTIVITY)
    data_file_prov.save()

    d_content = {
        str(existing_dir.resolve()): {
            "comment": "test123"
        },
        str(non_existing_dir.resolve()): {
            "comment": "bla"
        },
    }
    d_expect = [
        {
            "directory": str(existing_dir.resolve()),
            "comment": "test123",
            "exists": True,
        },
        {
            "directory": str(non_existing_dir.resolve()),
            "comment": "bla",
            "exists": False,
        },
    ]
    home.cfg = get_config(test_path)
    with open(home.cfg.directories_file, "w") as dfile:
        yaml.dump(d_content, dfile)
    return test_path, d_expect
Example #10
def test_directory_update(client, path_with_directories_and_file,
                          test_filenames):
    test_path, _ = path_with_directories_and_file

    # Create file with provenance at original location
    test_file = test_path.joinpath(test_filenames["EXISTING_DIRNAME"],
                                   "test.txt")
    test_file.touch()
    p = Provenance(test_file)
    p.add(agents=["testagents"],
          activity="test_activity",
          description="test_description")
    p.save()

    # Move to new location
    new_path = Path(test_path.parent) / "new"
    test_path.rename(Path(new_path))

    # Update directory
    data = {"directory": str(new_path / test_filenames["EXISTING_DIRNAME"])}
    result = do_json_post(client, "/directory/update", data)
    prov = result["files"][0]["prov"]
    assert prov["last_activity"].startswith("file moved to new location ->")
Example #11
def get_prov(filepath, channel_name):

    prov = Provenance(filepath)
    prov_data = prov.tree()
    return {"channel": channel_name, "retrieved_at": prov_data["ended_at"]}
Example #12
    def build(self):

        g = nx.Graph()
        all_games = set()

        games = {}

        self.dataset = get_combined_dataset()
        if not self.gamelist_file:
            self.dataset.set_filter([self.platform], self.countries)
        else:
            gamelist = load_gamelist(self.gamelist_file)
            self.dataset.set_gamelist_filter(gamelist)

        company_list = []
        for company_id, production_roles in self.dataset.filtered_dataset.items():
            company_list += self.company_ids(company_id, production_roles)

        for c in company_list:
            g.add_node(c)

        for c1, c2 in tqdm(combinations(company_list, 2)):
            if self.roles:
                c1_id = c1.split("__")[0]
                c1_role = c1.split("__")[1]
                c2_id = c2.split("__")[0]
                c2_role = c2.split("__")[1]
            else:
                c1_id = c1
                c1_role = None
                c2_id = c2
                c2_role = None

            if c1 not in games:
                games[c1] = self._filter_games(
                    self.dataset.filtered_dataset[c1_id],
                    self.countries,
                    self.platform,
                    c1_role,
                )

            if c2 not in games:
                games[c2] = self._filter_games(
                    self.dataset.filtered_dataset[c2_id],
                    self.countries,
                    self.platform,
                    c2_role,
                )

            overlap = games[c1].intersection(games[c2])
            all_games = all_games.union(overlap)

            if len(overlap) > 0:
                g.add_edge(c1, c2, weight=len(overlap))

        # add node information
        for node in g.nodes():
            id_ = node.split("__")[0]
            if self.roles:
                role = node.split("__")[1]

            g.nodes[node]["country"] = self._get_wiki_country(id_)
            if self.roles:
                g.nodes[node]["company_name"] = self.dataset.filtered_dataset[id_][0][
                    "company_name"
                ]
                g.nodes[node]["role"] = role
                g.nodes[node]["label"] = (
                    self.dataset.filtered_dataset[id_][0]["company_name"]
                    + "("
                    + role
                    + ")"
                )
            else:
                g.nodes[node]["label"] = self.dataset.filtered_dataset[id_][0][
                    "company_name"
                ]
            g.nodes[node]["no_of_games"] = len(games[node])

        out_path = Path(COMPANY_NETWORKS_DIR)
        out_filename = "company_network_"

        if self.gamelist_file:
            project_name = self.gamelist_file.split("/")[-1].replace(".yml", "")
            out_filename += project_name
        else:
            out_filename += self.countries_str(self.countries)
            out_filename += "_" + self.platform_str(self.platform)
        if self.roles:
            out_filename += "_roles"
        if self.publisher:
            out_filename += "_pub"

        out_filename += ".graphml"
        out_file = out_path / out_filename
        nx.write_graphml(g, out_file)

        prov = Provenance(out_file)
        prov.add(
            agents=[PROV_AGENT],
            activity=NETWORK_PROV_ACTIVITY,
            description=NETWORK_PROV_DESC.format(
                platforms=self.platform_str(self.platform),
                countries=self.countries_str(self.countries),
            ),
        )
        prov.save()

        self._write_log(out_file, len(g.nodes), len(g.edges), len(all_games))

        return out_file, len(g.nodes), len(g.edges), len(all_games)
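
Two conventions in build() are easy to miss: when self.roles is set, a node id encodes company id and production role joined by "__", and an edge's weight is the number of games the two companies share. The snippet below illustrates both with made-up values.

# Illustration only; the company ids, roles and weight below are made up.
import networkx as nx

g = nx.Graph()
node_a = "co123__Developer"              # "<company_id>__<role>" when roles are enabled
node_b = "co456__Publisher"
company_id, role = node_a.split("__")    # -> "co123", "Developer"

g.add_node(node_a)
g.add_node(node_b)
g.add_edge(node_a, node_b, weight=3)     # the two companies share three games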
Example #13
def build_mobygames_companies(unified_api_url=DIGGR_API):
    """
    Builds a reduced local company dataset from the unified api mobygames dataset.

    The dataset is a dictionary that maps each (mobygames) company id to a list of
    the production roles for all the games the company was involved in.

    <company_id>: [ <game_1_prod_info_1>, <game_1_prod_info_2>, <game_3_prod_info_1> , ... ]


    Each production information entry contains the following data points:

    Company-specific information:
    * company_name
    * production_role

    Game-specific information:
    * game_id
    * game_slug
    * game_title
    * game_years
    * release_countries
    * platform
    """
    api = DiggrApi(unified_api_url)
    pm = dt.PlatformMapper("mobygames")

    dataset = defaultdict(list)

    for id_ in tqdm(api.mobygames_ids()):
        data = api.entry("mobygames", id_)
        slug = data["raw"]["moby_url"].split("/")[-1]

        for platform in data["raw"]["platforms"]:
            for release in platform["releases"]:
                for company in release["companies"]:
                    dataset[company["company_id"]].append({
                        "company_name": company["company_name"],
                        "game_id": id_,
                        "game_slug": slug,
                        "game_title": data["title"],
                        "game_years": data["years"],
                        "production_role": company["role"],
                        "release_countries": release["countries"],
                        "platform": pm[platform["platform_name"]],
                    })

    mg_companies_filename = Path(DATASETS_DIR) / MOBYGAMES_COMPANIES_FILENAME
    with open(mg_companies_filename, "w") as f:
        json.dump(dict(dataset), f, indent=4)

    prov = Provenance(mg_companies_filename, overwrite=True)
    prov.add(
        agents=[PROV_AGENT],
        activity=COMPANIES_PROV_ACTIVITY,
        description=COMPANIES_PROV_DESC,
    )
    prov.add_primary_source("mobygames")
    prov.add_primary_source("diggr_platform_mapping")
    prov.save()

    return mg_companies_filename
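
The docstring above describes the structure of the resulting dataset; a single, hypothetical entry (all values are made up) might look like this:

# Hypothetical example of one dataset entry; all values are made up.
dataset_example = {
    "co123": [                                   # mobygames company id
        {
            "company_name": "Example Studio",
            "game_id": 42,
            "game_slug": "example-game",
            "game_title": "Example Game",
            "game_years": [1999],
            "production_role": "Developer",
            "release_countries": ["Japan"],
            "platform": "PlayStation",
        },
    ],
}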
Example #14
    def __init__(self, channel, archive_filepath):
        super().__init__()

        archive = ZipArchive(archive_filepath)
        self.channel_metadata = archive.get("channel_meta.json")
        channel_id = self.channel_metadata["items"][0]["id"]
        self.channel_title = self.channel_metadata["items"][0]["snippet"][
            "title"]

        self._current = YoutubeArchiveReader(archive_filepath)

        diff_filepath = os.path.join(
            os.path.dirname(archive_filepath),
            "{}_{}.zip".format(self.channel_title,
                               datetime.now().isoformat()[:19]))
        self._archive = ZipArchive(diff_filepath)

        updated = []
        print("Update youtube channel <{}>".format(self.channel_title))

        uploads = self.channel_metadata["items"][0]["contentDetails"][
            "relatedPlaylists"]["uploads"]
        # get all video ids in uploads playlist
        video_ids = []
        next_page = None
        while True:

            cmd = self.youtube.playlistItems().list(playlistId=uploads,
                                                    part="snippet,status",
                                                    maxResults=50,
                                                    pageToken=next_page)
            playlist_page = youtube_api_call(cmd)

            video_ids += [
                x["snippet"]["resourceId"]["videoId"]
                for x in playlist_page["items"]
            ]

            if "nextPageToken" in playlist_page:
                next_page = playlist_page["nextPageToken"]
            else:
                break

        for i, video_id in enumerate(video_ids):
            print("{}/{}".format(i, len(video_ids)))
            if video_id in self._current.video_ids:
                video = self._current[video_id]
                comment_count = video.comment_count

                new_meta = self._load_video_metadata(video_id)
                meta = new_meta["items"][0]
                new_comment_count = int(
                    meta["statistics"]["commentCount"]
                ) if "commentCount" in meta["statistics"] else None

                if comment_count == new_comment_count:
                    continue

            updated.append(video_id)
            self._fetch_video_metadata(video_id)
            self._fetch_video_comments(video_id)
            self._fetch_video_captions(video_id)

        if len(updated) == 0:
            os.remove(diff_filepath)
        else:
            print(self._current.last_update_file())
            self._archive.add("video_ids.json", updated)
            prov = Provenance(self._archive.filepath)
            prov.add(
                agents=[PROV_AGENT],
                activity="update_channel",
                description="Youtube video/comment update data for channel <{}>"
                .format(self.channel_title))
            prov.add_sources([self._current.last_update_file()])
            prov.save()