def generate_rows(self,
                  dataset_schema=None,
                  dataset_partitioning=None,
                  partition_id=None,
                  records_limit=-1):
    """Yield one row per view-count item for every page listed in the config.

    The ``"pages"`` entry of ``self.config`` is read as newline-separated
    text. On each line the first space-separated token is the project and
    the remainder of the line is the page title. Blank lines are ignored.

    Args:
        dataset_schema: Not used by this method.
        dataset_partitioning: Not used by this method.
        partition_id: Not used by this method.
        records_limit: Maximum number of rows to yield. ``None`` or any
            non-positive value (the default is ``-1``) means "no limit".

    Yields:
        dict: keys ``"project"``, ``"date"``, ``"page"`` and ``"views"``.
    """
    (beg_date, end_date) = dkuwikipedia.get_daterange(self.config)

    # Only a strictly positive limit is enforced; everything else keeps the
    # historical "return everything" behaviour.
    remaining = None
    if records_limit is not None and records_limit > 0:
        remaining = records_limit

    # `or ""` covers a "pages" key that is present but set to None.
    for raw_line in (self.config.get("pages") or "").splitlines():
        project, _, page = raw_line.strip().partition(" ")
        project = project.strip()
        # Extra spaces between project and title must not leak into the title.
        page = page.strip()

        if not project:
            # Blank line (or empty setting): nothing to query.
            continue

        # Parenthesised single-argument form prints the same text on
        # Python 2 and Python 3.
        print("Query for %s : %s" % (project, page))
        resp = dkuwikipedia.query_page(project, page, beg_date, end_date)
        payload = resp.json()
        for entry in payload.get("items", []):
            yield {
                "project": project,
                "date": dkuwikipedia.parse_and_format_yyyymmddhh(
                    entry["timestamp"]),
                "page": entry["article"],
                "views": entry["views"],
            }
            if remaining is not None:
                remaining -= 1
                if remaining <= 0:
                    return
def get_rows():
    """Yield one view-count row per item returned for each listed page.

    Iterates over ``pages_list_dataset.iter_rows()``, reads the ``"project"``
    and ``"page"`` fields of every row, and queries ``dkuwikipedia`` for the
    range ``beg_date``..``end_date``. ``pages_list_dataset``, ``beg_date``,
    ``end_date`` and ``dkuwikipedia`` are module-level names that must be
    defined elsewhere in the file.

    Yields:
        dict: keys ``"project"``, ``"date"``, ``"page"`` and ``"views"``.
    """
    for page_row in pages_list_dataset.iter_rows():
        project = page_row["project"]
        page = page_row["page"]

        # Parenthesised single-argument form prints the same text on
        # Python 2 and Python 3.
        print("Query for %s : %s" % (project, page))
        resp = dkuwikipedia.query_page(project, page, beg_date, end_date)
        payload = resp.json()
        # A distinct name keeps the input row from being shadowed by the
        # response items.
        for entry in payload.get("items", []):
            yield {
                "project": project,
                "date": dkuwikipedia.parse_and_format_yyyymmddhh(
                    entry["timestamp"]),
                "page": entry["article"],
                "views": entry["views"],
            }
Beispiel #3
0
def get_rows():
    """Generate the output rows: one dict per view-count item of each page.

    Every row of ``pages_list_dataset.iter_rows()`` supplies a ``"project"``
    and a ``"page"``; each pair is passed to ``dkuwikipedia.query_page``
    together with ``beg_date`` and ``end_date``. All four of these names are
    expected to exist at module level (they are not defined in this
    function).

    Yields:
        dict: keys ``"project"``, ``"date"``, ``"page"`` and ``"views"``.
    """
    for source_row in pages_list_dataset.iter_rows():
        project = source_row["project"]
        page = source_row["page"]

        # Written as a call with one argument so the line is valid, and
        # prints identically, on both Python 2 and Python 3.
        print("Query for %s : %s" % (project, page))
        resp = dkuwikipedia.query_page(project, page, beg_date, end_date)
        result = resp.json()
        # Use a separate loop variable so the dataset row is not overwritten.
        for view_item in result.get("items", []):
            yield {
                "project": project,
                "date": dkuwikipedia.parse_and_format_yyyymmddhh(
                    view_item["timestamp"]),
                "page": view_item["article"],
                "views": view_item["views"],
            }
    def generate_rows(self, dataset_schema=None, dataset_partitioning=None,
                            partition_id=None, records_limit = -1):
        """Generate rows of page-view counts for the pages named in the config.

        ``self.config["pages"]`` is treated as newline-separated text where
        each line is ``<project> <page title>``: the first space-separated
        token is the project, the rest of the line is the page. Lines that
        are empty after stripping are skipped.

        Args:
            dataset_schema: Not used by this method.
            dataset_partitioning: Not used by this method.
            partition_id: Not used by this method.
            records_limit: Upper bound on the number of rows yielded.
                ``None`` or a non-positive value (default ``-1``) disables
                the bound.

        Yields:
            dict: keys ``"project"``, ``"date"``, ``"page"`` and ``"views"``.
        """
        (beg_date, end_date) = dkuwikipedia.get_daterange(self.config)

        # Enforce the limit only when it is strictly positive, so the default
        # of -1 still returns every row.
        limited = records_limit is not None and records_limit > 0
        emitted = 0

        # `or ""` guards against the "pages" key being present but None.
        for raw_line in (self.config.get("pages") or "").splitlines():
            project, _, page = raw_line.strip().partition(" ")
            project = project.strip()
            # Drop padding left by repeated spaces between project and title.
            page = page.strip()

            if not project:
                # Blank line or empty setting: there is nothing to query.
                continue

            # Single-argument call form: same output on Python 2 and 3.
            print("Query for %s : %s" % (project, page))
            resp = dkuwikipedia.query_page(project, page, beg_date, end_date)
            dic = resp.json()
            for entry in dic.get("items", []):
                yield {
                    "project": project,
                    "date": dkuwikipedia.parse_and_format_yyyymmddhh(
                        entry["timestamp"]),
                    "page": entry["article"],
                    "views": entry["views"],
                }
                emitted += 1
                if limited and emitted >= records_limit:
                    return